feat: add _find_title_in_transcript for single-title re-matching

2026-05-04 00:17:27 +08:00
parent 086ce358a5
commit 6a5ec9c04f
1 changed files with 24 additions and 0 deletions
@@ -963,6 +963,30 @@ class PPTParser:
            logger.warning(f"LLM提取片段失败: {e}")
            return None

+    def _find_title_in_transcript(self, title: str, corrected_segments: List[Dict]) -> Optional[Tuple[float, float]]:
+        """
+        在转录文本中搜索标题关键词，返回首次匹配的时间段。
+
+        匹配逻辑：子串匹配（transcript text 包含 title 即为匹配）
+
+        Args:
+            title: 知识点标题
+            corrected_segments: corrected_transcript.json 的 segments 列表
+                              每个元素格式: {start, end, text}
+
+        Returns:
+            (start, end) 时间戳元组，或 None（匹配不到）
+        """
+        if not title or not corrected_segments:
+            return None
+
+        # 搜索策略：在所有 segment text 中找包含 title 的 segment
+        for seg in corrected_segments:
+            text = seg.get('text', '')
+            if title in text:
+                return (seg['start'], seg['end'])
+        return None
+
    # ==================== 主流程 ====================

    def run(self) -> dict: