feat: add _find_title_in_transcript for single-title re-matching

This commit is contained in:
hmo
2026-05-04 00:17:27 +08:00
parent 086ce358a5
commit 6a5ec9c04f
+24
View File
@@ -963,6 +963,30 @@ class PPTParser:
logger.warning(f"LLM提取片段失败: {e}")
return None
def _find_title_in_transcript(self, title: str, corrected_segments: List[Dict]) -> Optional[Tuple[float, float]]:
    """
    Locate the first transcript segment whose text contains the title.

    Matching is a plain, case-sensitive substring test: a segment matches
    when its ``text`` contains ``title`` (after surrounding whitespace has
    been stripped from the title).

    Args:
        title: Knowledge-point title to search for.
        corrected_segments: The ``segments`` list from
            corrected_transcript.json; each element is expected to be a
            dict shaped like ``{start, end, text}``.

    Returns:
        The ``(start, end)`` timestamps of the first usable matching
        segment, or ``None`` when the title is empty/blank, there are no
        segments, or nothing matches.
    """
    if not title or not corrected_segments:
        return None

    # A title made only of whitespace would otherwise "match" any segment
    # that happens to contain that whitespace.
    needle = title.strip()
    if not needle:
        return None

    for seg in corrected_segments:
        text = seg.get('text')
        # 'text' may be present but null in the JSON; `in` on None raises.
        if not isinstance(text, str) or needle not in text:
            continue

        start, end = seg.get('start'), seg.get('end')
        if start is None or end is None:
            # Matching text but no usable timestamps: keep scanning rather
            # than failing the whole lookup on one malformed segment.
            continue

        return (start, end)

    return None
# ==================== 主流程 ====================
def run(self) -> dict: