feat: add _find_title_in_transcript for single-title re-matching
This commit is contained in:
@@ -963,6 +963,30 @@ class PPTParser:
|
|||||||
logger.warning(f"LLM提取片段失败: {e}")
|
logger.warning(f"LLM提取片段失败: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def _find_title_in_transcript(self, title: str, corrected_segments: List[Dict]) -> Optional[Tuple[float, float]]:
|
||||||
|
"""
|
||||||
|
在转录文本中搜索标题关键词,返回首次匹配的时间段。
|
||||||
|
|
||||||
|
匹配逻辑:子串匹配(transcript text 包含 title 即为匹配)
|
||||||
|
|
||||||
|
Args:
|
||||||
|
title: 知识点标题
|
||||||
|
corrected_segments: corrected_transcript.json 的 segments 列表
|
||||||
|
每个元素格式: {start, end, text}
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
(start, end) 时间戳元组,或 None(匹配不到)
|
||||||
|
"""
|
||||||
|
if not title or not corrected_segments:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# 搜索策略:在所有 segment text 中找包含 title 的 segment
|
||||||
|
for seg in corrected_segments:
|
||||||
|
text = seg.get('text', '')
|
||||||
|
if title in text:
|
||||||
|
return (seg['start'], seg['end'])
|
||||||
|
return None
|
||||||
|
|
||||||
# ==================== 主流程 ====================
|
# ==================== 主流程 ====================
|
||||||
|
|
||||||
def run(self) -> dict:
|
def run(self) -> dict:
|
||||||
|
|||||||
Reference in New Issue
Block a user