From 6a5ec9c04f6538336574d4aeb600be9f4e006bcb Mon Sep 17 00:00:00 2001 From: hmo Date: Mon, 4 May 2026 00:17:27 +0800 Subject: [PATCH] feat: add _find_title_in_transcript for single-title re-matching --- src/core/ppt_parser.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/core/ppt_parser.py b/src/core/ppt_parser.py index 0a927d3..5ea03e1 100644 --- a/src/core/ppt_parser.py +++ b/src/core/ppt_parser.py @@ -963,6 +963,30 @@ class PPTParser: logger.warning(f"LLM提取片段失败: {e}") return None + def _find_title_in_transcript(self, title: str, corrected_segments: List[Dict]) -> Optional[Tuple[float, float]]: + """ + 在转录文本中搜索标题关键词,返回首次匹配的时间段。 + + 匹配逻辑:子串匹配(transcript text 包含 title 即为匹配) + + Args: + title: 知识点标题 + corrected_segments: corrected_transcript.json 的 segments 列表 + 每个元素格式: {start, end, text} + + Returns: + (start, end) 时间戳元组,或 None(匹配不到) + """ + if not title or not corrected_segments: + return None + + # 搜索策略:在所有 segment text 中找包含 title 的 segment + for seg in corrected_segments: + text = seg.get('text', '') + if title in text: + return (seg['start'], seg['end']) + return None + # ==================== 主流程 ==================== def run(self) -> dict: