feat: add _find_title_in_transcript for single-title re-matching
This commit is contained in:
@@ -963,6 +963,30 @@ class PPTParser:
|
||||
logger.warning(f"LLM提取片段失败: {e}")
|
||||
return None
|
||||
|
||||
def _find_title_in_transcript(self, title: str, corrected_segments: List[Dict]) -> Optional[Tuple[float, float]]:
|
||||
"""
|
||||
在转录文本中搜索标题关键词,返回首次匹配的时间段。
|
||||
|
||||
匹配逻辑:子串匹配(transcript text 包含 title 即为匹配)
|
||||
|
||||
Args:
|
||||
title: 知识点标题
|
||||
corrected_segments: corrected_transcript.json 的 segments 列表
|
||||
每个元素格式: {start, end, text}
|
||||
|
||||
Returns:
|
||||
(start, end) 时间戳元组,或 None(匹配不到)
|
||||
"""
|
||||
if not title or not corrected_segments:
|
||||
return None
|
||||
|
||||
# 搜索策略:在所有 segment text 中找包含 title 的 segment
|
||||
for seg in corrected_segments:
|
||||
text = seg.get('text', '')
|
||||
if title in text:
|
||||
return (seg['start'], seg['end'])
|
||||
return None
|
||||
|
||||
# ==================== 主流程 ====================
|
||||
|
||||
def run(self) -> dict:
|
||||
|
||||
Reference in New Issue
Block a user