feat: add _find_title_in_transcript for single-title re-matching

2026-05-04 00:17:27 +08:00
parent 086ce358a5
commit 6a5ec9c04f
1 changed files with 24 additions and 0 deletions
@@ -963,6 +963,30 @@ class PPTParser:
            logger.warning(f"LLM提取片段失败: {e}")
            return None
    def _find_title_in_transcript(self, title: str, corrected_segments: List[Dict]) -> Optional[Tuple[float, float]]:
        """
        在转录文本中搜索标题关键词，返回首次匹配的时间段。
        匹配逻辑：子串匹配（transcript text 包含 title 即为匹配）
        Args:
            title: 知识点标题
            corrected_segments: corrected_transcript.json 的 segments 列表
                              每个元素格式: {start, end, text}
        Returns:
            (start, end) 时间戳元组，或 None（匹配不到）
        """
        if not title or not corrected_segments:
            return None
        # 搜索策略：在所有 segment text 中找包含 title 的 segment
        for seg in corrected_segments:
            text = seg.get('text', '')
            if title in text:
                return (seg['start'], seg['end'])
        return None
    # ==================== 主流程 ====================
    def run(self) -> dict: