skills/piano-lesson-highlight-generator/scripts/extract_terms_from_ppt.py

#!/usr/bin/env python3
"""
从PPT提取知识点，转录视频，自动匹配时间戳，生成配置文件。
用法：python scripts/extract_terms_from_ppt.py <pptx_path> <video_path> <output_config.yaml>

GPU 资源管理：
- 转录前清理残留 Python 进程，释放 GPU 显存
- 转录完成后显式释放模型，避免显存泄漏
"""

import subprocess
import os
import sys
import json
import gc
import re
import yaml
import zipfile
import zhconv


def extract_ppt_text(pptx_path):
    """Extract the text runs of every slide in a PPTX file.

    The PPTX is read as a plain zip archive and ``<a:t>`` text runs are pulled
    out of each ``ppt/slides/slideN.xml`` with a regular expression.

    Args:
        pptx_path: Path (or binary file-like object) of the ``.pptx`` file.

    Returns:
        A list of dicts, one per slide that has meaningful text, in
        presentation order (ascending slide number). Each dict has the keys
        ``slide`` (int slide number, 0 if the name carries none), ``texts``
        (list of stripped text runs longer than one character) and
        ``full_text`` (the runs joined by a single space).
    """
    import html  # local import: only needed for XML entity decoding here

    def slide_number(name):
        match = re.search(r"slide(\d+)", name)
        return int(match.group(1)) if match else 0

    texts_by_slide = []
    with zipfile.ZipFile(pptx_path, "r") as z:
        slide_files = [
            f
            for f in z.namelist()
            if f.startswith("ppt/slides/slide") and f.endswith(".xml")
        ]
        # Sort numerically: a plain string sort puts slide10 before slide2.
        slide_files.sort(key=lambda f: (slide_number(f), f))

        for slide_file in slide_files:
            content = z.read(slide_file).decode("utf-8", errors="replace")
            # Text runs are XML-escaped in the file (&amp;, &lt;, ...); decode
            # them so downstream matching sees the text as shown on the slide.
            runs = (
                html.unescape(raw).strip()
                for raw in re.findall(r"<a:t>([^<]*)</a:t>", content)
            )
            # Single-character runs (bullets, stray punctuation) are noise.
            meaningful = [run for run in runs if len(run) > 1]
            if meaningful:
                texts_by_slide.append(
                    {
                        "slide": slide_number(slide_file),
                        "texts": meaningful,
                        "full_text": " ".join(meaningful),
                    }
                )
    return texts_by_slide


def find_main_knowledge_slide(ppt_texts):
    """Locate the "main knowledge points" slide and extract its points.

    Slides are scanned in order. A slide is a candidate when its text contains
    one of the title keywords. The first candidate that actually yields
    knowledge points wins; a candidate that yields nothing (e.g. one that only
    matches the generic "本课内容" keyword) no longer hides a later slide that
    carries the real list.

    Args:
        ppt_texts: Slide dicts as produced by ``extract_ppt_text``.

    Returns:
        ``(slide, knowledge_points)``. If candidates exist but none yields any
        point, the first candidate is returned with an empty list. If no slide
        matches at all, ``(None, [])``.
    """
    title_keywords = (
        "本课主要知识点",
        "本节课重要知识点",
        "本课知识点",
        "主要知识点",
        "本课内容",
    )

    first_candidate = None
    for slide in ppt_texts:
        text = slide["full_text"]
        if not any(kw in text for kw in title_keywords):
            continue

        knowledge_points = extract_knowledge_points_from_slide(slide)
        if knowledge_points:
            return slide, knowledge_points
        if first_candidate is None:
            first_candidate = slide

    return first_candidate, []


def extract_knowledge_points_from_slide(slide):
    """Parse the knowledge-point list out of a "knowledge points" slide.

    The slide's text runs are joined, the title and category prefixes
    (e.g. "乐理：") are removed, and the remainder is split on list
    punctuation and on the conjunctions "与" / "和".

    Args:
        slide: Slide dict with a ``texts`` list of strings.

    Returns:
        De-duplicated knowledge points (each at least two characters long) in
        order of appearance; an empty list when the slide does not carry one
        of the knowledge-point title keywords.
    """
    knowledge_points = []
    seen = set()

    # Merge all text nodes first, then process the text as a whole.
    full_text = " ".join(slide["texts"])

    # Only slides with a knowledge-point title are parsed.
    if not any(
        kw in full_text
        for kw in ["本课主要知识点", "本节课重要知识点", "本课知识点", "主要知识点"]
    ):
        return knowledge_points

    # Strip the title. The "本课"/"本节课" prefix is optional so that a bare
    # "主要知识点" title (accepted by the check above) is removed as well
    # instead of being reported as a knowledge point.
    full_text = re.sub(r"(本课|本节课)?(重要|主要)?知识点", "", full_text)
    # Strip category prefixes such as "乐理：" or "演奏：".
    full_text = re.sub(r"(乐理|演奏|弹奏|视奏|节奏训练)\s*[：:]\s*", "", full_text)
    # Strip the "的组合" suffix.
    full_text = re.sub(r"的组合", "", full_text)

    # Split on list punctuation and whitespace.
    for part in re.split(r"[、，,;；\s]+", full_text):
        part = part.strip()
        if len(part) < 2:
            continue

        # Split terms joined by "与" / "和", but keep "和" when it is part of
        # the music terms "和弦" (chord) and "和声" (harmony) rather than a
        # conjunction.
        for sub in re.split(r"与|和(?![弦声])", part):
            # Drop book-title marks.
            sub = re.sub(r"[《》]", "", sub.strip())
            if len(sub) >= 2 and sub not in seen:
                seen.add(sub)
                knowledge_points.append(sub)

    return knowledge_points


def find_homework_pages(ppt_texts):
    """Return the slides whose text mentions homework.

    Args:
        ppt_texts: Slide dicts carrying a ``full_text`` string.

    Returns:
        The matching slide dicts, in their original order.
    """
    homework_markers = ("作业", "课后练习", "课后作业", "今天的作业", "布置作业")

    def mentions_homework(page):
        page_text = page["full_text"]
        return any(marker in page_text for marker in homework_markers)

    return [page for page in ppt_texts if mentions_homework(page)]


def transcribe_video(video_path, output_dir):
    """Transcribe the whole video and return timestamped segments.

    The video is cut into 5-minute chunks with ffmpeg, each chunk is
    transcribed with faster-whisper, and segment times are shifted back to
    absolute video time. The result is cached in
    ``<output_dir>/intermediates/full_transcript.json``; when that file
    already exists it is loaded and returned without touching the video.

    GPU resource management: once transcription finishes (or fails) the model
    reference is dropped, the garbage collector is run and, if PyTorch is
    installed, the CUDA cache is emptied.

    Args:
        video_path: Path of the lesson video.
        output_dir: Directory that holds the ``intermediates`` cache folder.

    Returns:
        A list of ``{"start": float, "end": float, "text": str}`` dicts.

    Raises:
        ValueError: If ffprobe does not report a parsable duration.
    """
    print("\n[步骤2] 转录视频...")
    inter_dir = os.path.join(output_dir, "intermediates")
    os.makedirs(inter_dir, exist_ok=True)

    # Reuse a previous transcript if one exists.
    transcript_path = os.path.join(inter_dir, "full_transcript.json")
    if os.path.exists(transcript_path):
        print("  发现已有转录文件，跳过转录")
        with open(transcript_path, "r", encoding="utf-8") as f:
            return json.load(f)

    # Query the video duration. The command is passed as an argument list (no
    # shell) so paths containing quotes or shell metacharacters are safe.
    probe = subprocess.run(
        [
            "ffprobe",
            "-v",
            "error",
            "-show_entries",
            "format=duration",
            "-of",
            "default=noprint_wrappers=1",
            video_path,
        ],
        capture_output=True,
        text=True,
    )
    try:
        # ffprobe prints "duration=<seconds>".
        duration = float(probe.stdout.strip().split("=")[-1])
    except ValueError as err:
        raise ValueError(
            f"ffprobe could not determine the duration of {video_path!r}: "
            f"{probe.stderr.strip()}"
        ) from err
    print(f"  视频时长: {duration:.0f}s ({duration / 60:.1f}分钟)")

    # Transcribe in 5-minute chunks.
    chunk_size = 300
    all_segments = []
    chunk_idx = 0
    offset = 0

    from faster_whisper import WhisperModel

    model_path = "D:/AI/LM-Models/faster-whisper/large-v3"
    model = None
    try:
        model = WhisperModel(model_path, device="cuda", compute_type="float16")
        print("  [INFO] 使用CUDA GPU加速转录")
    except Exception as e:
        print(f"  [WARN] large-v3 加载失败: {e}，尝试 base 模型")
        model = WhisperModel("base", device="cuda", compute_type="float16")
        print("  [INFO] 使用base模型转录")

    try:
        while offset < duration:
            end = min(offset + chunk_size, duration)
            # `end` is a float for the last chunk; print whole minutes.
            print(f"  转录 {offset // 60}min-{int(end // 60)}min...")

            chunk_path = os.path.join(inter_dir, f"chunk_{chunk_idx}.mp4")
            subprocess.run(
                [
                    "ffmpeg",
                    "-hide_banner",
                    "-loglevel",
                    "error",
                    "-y",
                    "-ss",
                    str(offset),
                    "-t",
                    str(end - offset),
                    "-i",
                    video_path,
                    "-c:v",
                    "copy",
                    "-c:a",
                    "copy",
                    chunk_path,
                ]
            )

            try:
                segments, _info = model.transcribe(
                    chunk_path, language="zh", beam_size=5
                )
                # `segments` is consumed here, while the chunk file still
                # exists, so removal never races with lazy decoding.
                for seg in segments:
                    all_segments.append(
                        {
                            "start": offset + seg.start,
                            "end": offset + seg.end,
                            "text": seg.text,
                        }
                    )
            finally:
                # Never leave temporary chunks behind, even on failure.
                if os.path.exists(chunk_path):
                    os.remove(chunk_path)

            offset += chunk_size
            chunk_idx += 1

        # Persist the transcript so later runs can skip transcription.
        with open(transcript_path, "w", encoding="utf-8") as f:
            json.dump(all_segments, f, ensure_ascii=False, indent=2)

        print(f"  转录完成: {len(all_segments)} 个片段")
    finally:
        # Release GPU resources.
        print("  [GPU] 释放模型资源...")
        if model is not None:
            del model
        gc.collect()
        # torch is optional (faster-whisper does not need it); a missing torch
        # must not turn a finished transcription into an ImportError.
        try:
            import torch
        except ImportError:
            torch = None

        if torch is not None and torch.cuda.is_available():
            torch.cuda.empty_cache()
        print("  [GPU] 资源已释放")

    return all_segments


def find_anchor_time(segments, knowledge_points):
    """Locate the time at which teaching of the knowledge points starts.

    Two signals are combined:
    1. the earliest segment containing an introductory guide phrase, and
    2. the earliest segment in which any knowledge point (or one of its
       shortened / 2-4 character sub-terms) first appears.

    Args:
        segments: Transcript segments with ``start``, ``end`` and ``text``.
        knowledge_points: Knowledge-point strings taken from the PPT.

    Returns:
        ``max(guide phrase time, earliest knowledge-point time)`` when any
        knowledge point is found; otherwise the guide phrase time if it is
        greater than 0; otherwise 10% of the video length (0 when there are
        no segments).
    """
    print("\n[步骤3] 定位知识点锚点时间...")

    # Introductory phrases that typically open the teaching part.
    guide_phrases = [
        "本课主要知识点",
        "今天我们要学",
        "这节课我们讲",
        "本节课我们",
        "今天我们学习",
        "这节课主要",
        "本课内容",
        "今天主要",
    ]

    # Convert each segment to Simplified Chinese once; both scans reuse it.
    simplified = [
        (seg["start"], zhconv.convert(seg["text"], "zh-cn")) for seg in segments
    ]

    anchor_candidates = [
        start
        for start, text in simplified
        if any(phrase in text for phrase in guide_phrases)
    ]

    if anchor_candidates:
        anchor_time = min(anchor_candidates)
        print(f"  引导语锚点: {anchor_time:.0f}s ({anchor_time // 60}min)")
    else:
        anchor_time = 0
        print("  [WARN] 未找到引导语锚点")

    # Search terms depend only on the knowledge point, so build them once
    # instead of once per segment.
    search_term_groups = []
    for kw in knowledge_points:
        kw_simple = zhconv.convert(kw, "zh-cn")
        terms = [kw_simple]
        # Also try the keyword without connective characters.
        shorter = re.sub(r"[的与和及]", "", kw_simple)
        if shorter != kw_simple:
            terms.append(shorter)
        terms.extend(re.findall(r"[\u4e00-\u9fff]{2,4}", kw_simple))
        search_term_groups.append(terms)

    # Record when each search term first appears. Per segment and knowledge
    # point, only the first not-yet-recorded matching term is registered.
    kw_first_appearances = {}
    for start, text in simplified:
        for terms in search_term_groups:
            for term in terms:
                if term not in kw_first_appearances and len(term) >= 2 and term in text:
                    kw_first_appearances[term] = start
                    break

    if kw_first_appearances:
        sorted_kws = sorted(kw_first_appearances.items(), key=lambda x: x[1])
        print("  知识点首次出现:")
        for kw, first_seen in sorted_kws[:5]:
            print(f"    {kw}: {first_seen:.0f}s ({first_seen // 60}min)")

        # The teaching anchor is the earliest knowledge-point appearance, but
        # never earlier than the guide phrase. Later clustering in
        # match_knowledge_points handles points spread across the lesson.
        first_kw_time = sorted_kws[0][1]
        final_anchor = max(anchor_time, first_kw_time)
        print(
            f"  教学开始锚点: {final_anchor:.0f}s ({final_anchor // 60}min) (最早知识点: {sorted_kws[0][0]})"
        )
        return final_anchor

    if anchor_time > 0:
        return anchor_time

    print("  [WARN] 未找到明确锚点，使用视频前10%作为排除区")
    return segments[-1]["end"] * 0.1 if segments else 0


def find_homework_anchor(segments, knowledge_anchor):
    """Locate the start of the homework part via density of the word "作业".

    Only segments starting after ``knowledge_anchor`` are considered. They are
    bucketed into 30-second windows; the window with the most occurrences of
    "作业" wins (the earliest-encountered window on ties) and the start of the
    first segment in that window that mentions "作业" is returned.

    Args:
        segments: Transcript segments with ``start``, ``end`` and ``text``.
        knowledge_anchor: Time (seconds) at which teaching starts.

    Returns:
        The homework anchor time in seconds. Falls back to 85% of the video
        length when nothing follows the knowledge anchor or "作业" is never
        mentioned, and to ``knowledge_anchor + 600`` when there are no
        segments at all.
    """
    print("\n[步骤4] 定位作业部分锚点时间...")

    # Only look at what comes after the teaching anchor.
    late_segments = [s for s in segments if s["start"] > knowledge_anchor]
    if not late_segments:
        print("  [WARN] 知识点后无内容，使用视频85%位置")
        return segments[-1]["end"] * 0.85 if segments else knowledge_anchor + 600

    window_size = 30  # seconds per density window
    window_totals = {}  # window start -> number of "作业" occurrences
    window_first_hit = {}  # window start -> start of first segment saying "作业"

    for seg in late_segments:
        count = zhconv.convert(seg["text"], "zh-cn").count("作业")
        if count == 0:
            continue
        window_start = int(seg["start"] // window_size) * window_size
        window_totals[window_start] = window_totals.get(window_start, 0) + count
        # Remember the first hit so no second pass over the transcript is needed.
        window_first_hit.setdefault(window_start, seg["start"])

    if not window_totals:
        print("  [WARN] 未找到'作业'关键词，使用视频85%位置")
        return segments[-1]["end"] * 0.85

    # Densest window; max() keeps the first one encountered on ties.
    best_window_start, best_window_count = max(
        window_totals.items(), key=lambda item: item[1]
    )
    anchor = window_first_hit[best_window_start]
    print(
        f"  作业锚点: {anchor:.0f}s ({anchor // 60}min) [窗口密度: {best_window_count}次]"
    )
    return anchor


def detect_gap_cutoff(segments_in_cluster, max_gap=10):
    """Return the time at which a run of segments should be cut off.

    Walks consecutive segment pairs; at the first pause longer than
    ``max_gap`` seconds, the end of the segment before the pause is returned.

    Args:
        segments_in_cluster: List of dicts with ``start`` and ``end`` times.
        max_gap: Longest tolerated silence between two segments, in seconds.

    Returns:
        The cutoff time: the end of the segment preceding the first long
        pause, the end of the last segment when there is no such pause, or 0
        for an empty list.
    """
    if not segments_in_cluster:
        return 0

    for earlier, later in zip(segments_in_cluster, segments_in_cluster[1:]):
        silence = later["start"] - earlier["end"]
        if silence > max_gap:
            return earlier["end"]

    return segments_in_cluster[-1]["end"]


def find_homework_end(segments, homework_start):
    """Find the time at which the homework explanation ends.

    Resolution order:
    1. start of the first segment at or after ``homework_start`` that contains
       a closing phrase;
    2. the first pause longer than 15 seconds among those segments (only if
       the resulting cutoff lies after ``homework_start``);
    3. 30 seconds before the end of the last segment;
    4. ``homework_start + 60`` when there are no segments at all.

    Args:
        segments: Transcript segments with ``start``, ``end`` and ``text``.
        homework_start: Time (seconds) at which the homework part begins.

    Returns:
        The homework end time in seconds.
    """
    closing_phrases = (
        "今天就到这里",
        "下课",
        "作业讲完",
        "今天的课就上到这里",
        "我们下课",
        "好今天",
        "今天就上到",
        "今天的作业就",
    )

    homework_tail = [s for s in segments if s["start"] >= homework_start]

    for seg in homework_tail:
        simplified = zhconv.convert(seg["text"], "zh-cn")
        if any(phrase in simplified for phrase in closing_phrases):
            return seg["start"]

    if homework_tail:
        pause_cutoff = detect_gap_cutoff(homework_tail, max_gap=15)
        if pause_cutoff > homework_start:
            return pause_cutoff

    if not segments:
        return homework_start + 60
    return segments[-1]["end"] - 30


def match_knowledge_points(segments, knowledge_points, anchor_time, homework_anchor):
    """Match every knowledge point to the video region where it is taught.

    Strategy:
    1. Restrict the transcript to ``[anchor_time - 5, homework_anchor)`` and
       normalise it (Simplified Chinese + common mis-transcription fixes).
    2. For each knowledge point, score every segment's relevance and group
       relevant segments that are less than 90 seconds apart into clusters.
    3. Score each cluster by amount of discussion, penalising clusters that
       look like an overview, a deferral ("we'll cover it later"), a preview
       or an end-of-lesson review.
    4. Keep the best cluster per knowledge point and adjust boundaries of
       overlapping clips.

    Args:
        segments: Transcript segments with ``start``, ``end`` and ``text``.
        knowledge_points: Knowledge-point strings taken from the PPT.
        anchor_time: Time (seconds) at which teaching starts.
        homework_anchor: Time (seconds) at which the homework part starts.

    Returns:
        A list of clip dicts sorted by ``start``. Each has at least ``title``,
        ``keyword``, ``start`` and ``end`` (integer seconds) plus scoring
        diagnostics.
    """
    print("\n[步骤5] 匹配知识点到视频片段...")

    valid_segments = [
        s for s in segments if anchor_time - 5 <= s["start"] < homework_anchor
    ]

    # Common speech-recognition mistakes -> correct music terms.
    term_corrections = {
        "副点": "附点",
        "负点": "附点",
        "付点": "附点",
        "黑剑": "黑键",
        "实质": "时值",
        "演音": "延音",
        "阅历": "乐理",
        "音苻": "音符",
        "调苻": "调号",
        "拍苻": "拍符",
        "谱苻": "谱号",
        "首位": "手位",
        "守位": "手位",
        "只发": "指法",
        "织法": "指法",
        "台指": "抬指",
        "抬纸": "抬指",
        "只撑": "支撑",
        "肢撑": "支撑",
        "反服": "反复",
        "反副": "反复",
        "搞八度": "高八度",
        "搞八渡": "高八度",
        "底八度": "低八度",
        "联音": "连音",
        "连因": "连音",
        "挑音": "跳音",
        "还原记好": "还原记号",
        "缓原记号": "还原记号",
        "节牌": "节拍",
        "节凑": "节奏",
        "分首": "分手",
        "分守": "分手",
        "漫练": "慢练",
        "曼练": "慢练",
        "强若": "强弱",
        "强落": "强弱",
        "言音": "延音",
    }

    # Chinese numeral -> Arabic numeral, used to match e.g. "十六分音符"
    # against "16分音符".
    chinese_to_num = {
        "一": "1",
        "二": "2",
        "三": "3",
        "四": "4",
        "五": "5",
        "六": "6",
        "七": "7",
        "八": "8",
        "九": "9",
        "十": "10",
        "十六": "16",
        "十五": "15",
        "十四": "14",
        "十三": "13",
        "十二": "12",
        "十一": "11",
        "二十": "20",
        "三十": "30",
    }
    # Replace multi-character numerals first. In dict order the single
    # characters would run first and turn "十六" into "106", so it could never
    # match "16".
    numeral_pairs = sorted(
        chinese_to_num.items(), key=lambda pair: len(pair[0]), reverse=True
    )

    # Variant wordings a teacher may use for a knowledge point.
    related_terms = {
        "升降记号": ["升号", "降号", "升记", "降记", "升降", "升半", "降半"],
        "还原记号": ["还原"],
        "附点音符": ["附点"],
        "延音线": ["延音", "同音连线"],
        "双音的支撑": ["双音", "支撑"],
        "婚礼进行曲": ["婚礼"],
        "掀起你的盖头来": ["盖头来", "盖头", "掀起"],
        "十六分音符": ["16分", "十六分"],
        "八分音符": ["8分", "八分"],
    }

    # Generic suffixes that must not be matched on their own.
    generic_suffixes = ["记号", "符号", "音符", "练习", "曲子", "部分"]

    # "We'll talk about it later" wording.
    defer_phrases = [
        "等下再说",
        "等下讲",
        "等一下再说",
        "等一下讲",
        "后面再说",
        "后面讲",
        "稍后再说",
        "稍后讲",
        "先不说",
        "先不讲",
        "先不讲了",
        "待会儿说",
        "待会儿讲",
        "一会儿说",
        "一会儿讲",
    ]
    # Announcing a topic without explaining it.
    preview_phrases = [
        "先说一下",
        "先讲一下",
        "先说",
        "先讲",
        "第一先说",
        "首先说",
        "首先讲",
        "我先说",
        "我先讲",
        "提一下",
        "提到",
    ]
    # Wording typical of actual explanation / demonstration.
    teaching_phrases = [
        "因为",
        "所以",
        "就是",
        "意思是",
        "什么叫",
        "什么意思",
        "为什么",
        "怎么",
        "如何",
        "比如说",
        "例如",
        "比如",
        "像",
        "大家看",
        "看一下",
        "看到",
        "弹",
        "按",
        "练",
        "练习",
        "注意",
        "要",
        "需要",
        "必须",
    ]
    # Wording typical of an end-of-lesson review.
    review_phrases = [
        "刚才",
        "刚刚",
        "今天学",
        "今天讲",
        "回顾",
        "练习一下",
        "复习",
        "我们学",
        "我们讲",
    ]

    # Simplified form of every knowledge point, computed once.
    kw_simple_list = [zhconv.convert(kw, "zh-cn") for kw in knowledge_points]

    def correct_text(text):
        """Apply the mis-transcription corrections to `text`."""
        for wrong, correct in term_corrections.items():
            text = text.replace(wrong, correct)
        return text

    def normalize_numerals(text):
        """Rewrite Chinese numerals in `text` as Arabic numerals."""
        for cn, num in numeral_pairs:
            text = text.replace(cn, num)
        return text

    # Pre-compute the corrected text of every candidate segment.
    enriched_segments = []
    for seg in valid_segments:
        text_corrected = correct_text(zhconv.convert(seg["text"], "zh-cn"))
        enriched_segments.append({**seg, "text_corrected": text_corrected})

    def get_relevance_score(seg_text, keyword_simple):
        """Score how relevant a segment's text is to a knowledge point.

        Returns 3.0 for a full keyword match, 2.0 for a match without
        connective characters, 2.5 for a match after numeral normalisation,
        1.5 for a known variant or the keyword's prefix before a generic
        suffix, 1.0 for a 3-4 character sub-word, and 0.0 otherwise. Checks
        run in that order and the first hit wins.
        """
        # Full keyword match.
        if keyword_simple in seg_text:
            return 3.0

        # Keyword without connective characters such as "的".
        shorter = re.sub(r"[的与和及]", "", keyword_simple)
        if shorter != keyword_simple and len(shorter) >= 3 and shorter in seg_text:
            return 2.0

        # Numeral-normalised match ("十六分音符" <-> "16分音符").
        normalized_keyword = normalize_numerals(keyword_simple)
        if len(normalized_keyword) >= 3 and normalized_keyword in normalize_numerals(
            seg_text
        ):
            return 2.5

        # Known variant wordings.
        for term in related_terms.get(keyword_simple, ()):
            if term in seg_text:
                return 1.5

        # For compound words match the specific prefix only, never the
        # generic suffix (e.g. "还原" of "还原记号", not "记号").
        for suffix in generic_suffixes:
            if keyword_simple.endswith(suffix) and len(keyword_simple) > len(suffix):
                prefix = keyword_simple[: -len(suffix)]
                if len(prefix) >= 2 and prefix in seg_text:
                    return 1.5
                break

        # 3-4 character sub-words (2-character words are too generic).
        for length in (4, 3):
            words = re.findall(r"[\u4e00-\u9fff]{" + str(length) + r"}", keyword_simple)
            for word in words:
                if word in generic_suffixes:
                    continue
                if word in seg_text:
                    return 1.0

        return 0.0

    def find_teaching_regions(keyword_simple, all_segs):
        """Group the segments relevant to a knowledge point into clusters.

        Every segment with a positive relevance score is kept (annotated with
        a ``relevance`` key); kept segments are sorted by start time and
        consecutive ones less than 90 seconds apart share a cluster.

        Returns a list of clusters (lists of segment dicts); empty when no
        segment is relevant.
        """
        if not all_segs:
            return []

        scored_segs = []
        for s in all_segs:
            rel = get_relevance_score(s["text_corrected"], keyword_simple)
            if rel > 0:
                scored_segs.append({**s, "relevance": rel})

        if not scored_segs:
            return []

        scored_segs.sort(key=lambda x: x["start"])
        clusters = []
        current = [scored_segs[0]]
        for s in scored_segs[1:]:
            if s["start"] - current[-1]["end"] < 90:
                current.append(s)
            else:
                clusters.append(current)
                current = [s]
        clusters.append(current)

        return clusters

    def count_segments_with(cluster, phrases):
        """Count the cluster segments whose text contains any of `phrases`."""
        return sum(
            1
            for s in cluster
            if any(phrase in s["text_corrected"] for phrase in phrases)
        )

    def score_cluster(cluster, keyword_simple, homework_anchor):
        """Score a cluster as a candidate "teaching" region.

        The base score grows with the number of relevant segments, their
        average relevance, the number of full keyword hits and the amount of
        text. It is reduced when other knowledge points appear in the same
        segments (overview), when deferral or preview wording dominates, when
        the cluster starts within 300 seconds before the homework anchor, or
        when review wording is present. It is increased with the share of
        segments containing explanation wording.

        A cluster with no explanation wording and at most two segments is
        treated as a mere mention and gets score 0.

        Returns a dict with ``score`` and the diagnostics used to compute it.
        """
        total_count = len(cluster)
        total_text_len = sum(len(s["text_corrected"]) for s in cluster)
        time_span = max(cluster[-1]["end"] - cluster[0]["start"], 1)
        cluster_start = cluster[0]["start"]

        # Segments containing the complete keyword.
        full_count = sum(1 for s in cluster if keyword_simple in s["text_corrected"])

        # Average relevance.
        avg_rel = sum(s.get("relevance", 0) for s in cluster) / max(total_count, 1)

        # Segments that also mention some other knowledge point.
        other_kw_count = sum(
            1
            for s in cluster
            if any(
                other_kw != keyword_simple and other_kw in s["text_corrected"]
                for other_kw in kw_simple_list
            )
        )

        defer_count = count_segments_with(cluster, defer_phrases)
        defer_ratio = defer_count / max(total_count, 1)

        preview_count = count_segments_with(cluster, preview_phrases)
        teaching_count = count_segments_with(cluster, teaching_phrases)

        # Preview wording only counts against a cluster that contains no
        # explanation wording at all.
        if teaching_count == 0 and preview_count > 0:
            preview_ratio = preview_count / max(total_count, 1)
        else:
            preview_ratio = 0

        time_to_homework = homework_anchor - cluster_start

        diagnostics = {
            "total_count": total_count,
            "full_count": full_count,
            "time_span": round(time_span, 1),
            "total_text_len": total_text_len,
            "avg_rel": round(avg_rel, 2),
            "other_kw_count": other_kw_count,
            "time_to_homework": round(time_to_homework, 0),
            "defer_count": defer_count,
            "defer_ratio": round(defer_ratio, 2),
            "teaching_count": teaching_count,
            "preview_count": preview_count,
            "preview_ratio": round(preview_ratio, 2),
        }

        # Overview filter: no explanation wording and very few segments means
        # the point was only mentioned, not taught.
        if teaching_count == 0 and total_count <= 2:
            return {**diagnostics, "score": 0, "has_review_language": False}

        # Scoring formula.
        base_score = total_count * avg_rel
        text_bonus = min(total_text_len / 30, 5.0)
        full_bonus = full_count * 2.0
        isolation_penalty = 1.0 / (1.0 + other_kw_count * 0.5)

        score = (base_score + full_bonus) * text_bonus * isolation_penalty

        # Deferral penalty.
        if defer_ratio > 0.1:
            score *= max(0.1, 1.0 - defer_ratio * 2.0)

        # Preview-only penalty.
        if preview_ratio > 0.2:
            score *= max(0.1, 1.0 - preview_ratio * 2.0)

        # Explanation density bonus.
        teaching_density = teaching_count / max(total_count, 1)
        score *= 1.0 + teaching_density * 2.0

        # Regions close to the homework part are usually a review.
        if time_to_homework < 300:
            score *= max(0.1, time_to_homework / 300)

        # Review wording.
        has_review_language = count_segments_with(cluster, review_phrases) > 0
        if has_review_language:
            score *= 0.3

        return {
            **diagnostics,
            "score": round(score, 2),
            "has_review_language": has_review_language,
        }

    all_candidates = []
    for keyword, keyword_simple in zip(knowledge_points, kw_simple_list):
        clusters = find_teaching_regions(keyword_simple, enriched_segments)

        if not clusters:
            print(f"  [SKIP] '{keyword}' - 转录中未找到")
            all_candidates.append([])
            continue

        # Score every cluster.
        candidates = []
        for cluster in clusters:
            score_info = score_cluster(cluster, keyword_simple, homework_anchor)

            if score_info["score"] == 0:
                continue

            # Cut at the first long pause; clip length is clamped to 30-60s.
            cutoff_time = detect_gap_cutoff(cluster, max_gap=15)
            clip_duration = min(cutoff_time - cluster[0]["start"], 60)
            clip_duration = max(clip_duration, 30)
            clip_end = cluster[0]["start"] + clip_duration

            candidates.append(
                {
                    "title": keyword_simple,
                    "keyword": keyword_simple,
                    "start": int(cluster[0]["start"]),
                    "end": int(clip_end),
                    "density": round(
                        score_info["total_count"] / max(score_info["time_span"], 1), 4
                    ),
                    "score": score_info["score"],
                    "total_count": score_info["total_count"],
                    "full_count": score_info["full_count"],
                    "time_span": score_info["time_span"],
                    "total_text_len": score_info["total_text_len"],
                    "avg_rel": score_info["avg_rel"],
                    "other_kw_count": score_info["other_kw_count"],
                    # Carried along so the summary line below reports real
                    # numbers instead of always falling back to 0.
                    "preview_count": score_info["preview_count"],
                    "teaching_count": score_info["teaching_count"],
                    "preview": cluster[0]["text_corrected"][:60],
                }
            )

        all_candidates.append(candidates)

        if candidates:
            best = max(candidates, key=lambda x: x["score"])
            print(
                f"  [OK] '{keyword_simple}' -> {best['start']}s-{best['end']}s "
                f"(score={best['score']:.1f}, 相关{best['total_count']}次/完整{best['full_count']}次, "
                f"跨度{best['time_span']:.0f}s, 文本{best['total_text_len']}字, "
                f"其他知识点{best['other_kw_count']}次, 预告{best.get('preview_count', 0)}/讲解{best.get('teaching_count', 0)})"
            )
            print(f"       预览: {best['preview']}")
        else:
            print(f"  [SKIP] '{keyword}' - 无有效候选簇")

    # Order clips by video time and adjust boundaries where they overlap.
    print("\n  [步骤6] 顺序约束匹配（按视频时间顺序）...")

    all_best = []
    for candidates in all_candidates:
        if candidates:
            all_best.append(max(candidates, key=lambda x: x["score"]))

    all_best.sort(key=lambda x: x["start"])

    filtered = []
    for clip in all_best:
        overlaps = False
        for existing in filtered:
            if clip["start"] < existing["end"] and clip["end"] > existing["start"]:
                overlaps = True
                # NOTE(review): only the lower-scoring clip's boundary is moved
                # to the midpoint, so the two clips can still overlap between
                # the midpoint and the other clip's unchanged boundary. Only
                # the first overlapping clip is handled. Confirm the intended
                # policy before tightening this.
                mid_point = (existing["end"] + clip["start"]) // 2
                if clip["score"] > existing["score"]:
                    old_end = existing["end"]
                    existing["end"] = mid_point
                    print(
                        f"    [ADJUST] '{existing['title']}' end {old_end}s -> {mid_point}s (让位给 '{clip['title']}')"
                    )
                else:
                    old_start = clip["start"]
                    clip["start"] = mid_point
                    print(
                        f"    [ADJUST] '{clip['title']}' start {old_start}s -> {mid_point}s (让位给 '{existing['title']}')"
                    )
                filtered.append(clip)
                break
        if not overlaps:
            filtered.append(clip)
            print(
                f"    [MATCH] '{clip['title']}' -> {clip['start']}s-{clip['end']}s (score={clip['score']:.1f})"
            )

    filtered.sort(key=lambda x: x["start"])
    return filtered


def match_homework(segments, homework_anchor, video_end):
    """Build the homework clip by locating where the homework talk ends.

    The clip starts at ``homework_anchor``. Its end is decided as follows:
    1. Find the last segment (at or after the anchor) that matches a closing
       pattern. If the first pause longer than 30 seconds after that marker
       lies more than 45 seconds past the marker and more than 10 seconds
       before the video end, cut there; otherwise cut 30 seconds after the
       marker.
    2. Without any marker, cut at the first pause longer than 45 seconds if
       that is more than 30 seconds after the anchor.
    3. Otherwise cut two minutes before the end of the video.

    Args:
        segments: Transcript segments with ``start``, ``end`` and ``text``.
        homework_anchor: Time (seconds) at which the homework part starts.
        video_end: End time (seconds) of the video.

    Returns:
        A clip dict (``title``, ``keyword``, ``start``, ``end``, ``density``,
        ``score``, ``preview``), or ``None`` when there are no homework
        segments or the clip would be shorter than 10 seconds.
    """
    print("\n[步骤7] 匹配作业片段...")

    hw_segments = [s for s in segments if s["start"] >= homework_anchor]

    if not hw_segments:
        print("  [SKIP] 未找到作业片段")
        return None

    # Closing patterns as (regex, label). Within one segment the first
    # matching pattern supplies the label, so more specific ones come first.
    end_patterns = [
        # 1. Explicit end of class.
        (r"下课", "下课"),
        (r"拜拜", "拜拜"),
        (r"再见", "再见"),
        # 2. "作业" followed by a completion expression.
        (r"作业.*就这样", "作业就这样"),
        (r"作业.*就这些", "作业就这些"),
        (r"作业.*就是这些", "作业就是这些"),
        (r"作业.*讲到这里", "作业讲到这里"),
        (r"作业.*讲到这", "作业讲到这"),
        (r"作业.*说完了", "作业说完了"),
        (r"作业.*讲完了", "作业讲完了"),
        (r"作业.*布置完了", "作业布置完了"),
        (r"作业.*就这么多", "作业就这么多"),
        (r"作业.*到这儿", "作业到这儿"),
        (r"作业.*到这里", "作业到这里"),
        (r"作业.*完了", "作业完了"),
        (r"作业.*结束", "作业结束"),
        # 3. Generic closing expressions.
        (r"就到这里", "就到这里"),
        (r"就到这", "就到这"),
        (r"就这样吧", "就这样吧"),
        (r"就这样了", "就这样了"),
        (r"就这些了", "就这些了"),
        (r"就这些", "就这些"),
        (r"说完了", "说完了"),
        (r"讲完了", "讲完了"),
        (r"没什么.*说的", "没什么说的"),
        (r"没什么.*讲", "没什么讲的"),
        (r"没别的", "没别的"),
        (r"今天就到", "今天就到"),
        (r"今天就这样", "今天就这样"),
        (r"那就这样", "那就这样"),
        (r"OK.*那就", "OK那就"),
        # 4. "I'll post it in the group chat".
        (r"发群", "发群"),
        (r"到时候.*发", "到时候发"),
        # 5. Very generic fillers, tried last.
        (r"好那", "好那"),
        (r"好了", "好了"),
    ]

    # Scan in order and remember the LAST segment carrying a closing marker.
    last_end_marker_time = None
    last_pattern_name = ""
    for seg in hw_segments:
        text = zhconv.convert(seg["text"], "zh-cn")
        for pattern, name in end_patterns:
            if re.search(pattern, text):
                last_end_marker_time = seg["start"]
                last_pattern_name = name
                break  # one label per segment

    if last_end_marker_time is not None:
        print(f'  检测到结束标记: "{last_pattern_name}" @ {last_end_marker_time:.0f}s')

    # Long pause after the teacher finished talking about homework.
    gap_cutoff = detect_gap_cutoff(hw_segments, max_gap=45)

    # `is not None` rather than truthiness: a marker at 0s is still a marker.
    if last_end_marker_time is not None:
        # Look for the first long pause after the closing marker.
        after_end = [s for s in hw_segments if s["start"] >= last_end_marker_time]
        gap_after_end = (
            detect_gap_cutoff(after_end, max_gap=30) if len(after_end) > 1 else None
        )
        # Only a pause that is neither right at the marker nor at the very
        # end of the video counts as meaningful.
        is_meaningful_gap = (
            gap_after_end is not None
            and gap_after_end > last_end_marker_time + 45
            and gap_after_end < video_end - 10
        )
        if is_meaningful_gap:
            clip_end = min(gap_after_end, video_end)
            print(
                f'  作业结束: {last_end_marker_time:.0f}s ("{last_pattern_name}")，间隔截断: {clip_end:.0f}s'
            )
        else:
            # Otherwise keep 30 seconds after the closing marker.
            clip_end = min(last_end_marker_time + 30, video_end)
            print(
                f'  作业结束: {last_end_marker_time:.0f}s ("{last_pattern_name}")，+30s兜底: {clip_end:.0f}s'
            )
    elif gap_cutoff > homework_anchor + 30:
        # No closing wording: cut at the long pause.
        clip_end = min(gap_cutoff, video_end)
        print(f"  作业结束: 间隔截断 {clip_end:.0f}s")
    else:
        # Last resort: two minutes before the end of the video.
        clip_end = video_end - 120
        print(f"  作业结束: 兜底到视频末尾前2分钟 {clip_end:.0f}s")

    clip_end = min(clip_end, video_end)
    duration = clip_end - homework_anchor

    if duration < 10:
        print(f"  [SKIP] 作业片段太短: {duration:.0f}s")
        return None

    print(
        f"  [MATCH] '作业' -> {homework_anchor:.0f}s-{clip_end:.0f}s ({duration:.0f}s)"
    )

    return {
        "title": "作业",
        "keyword": "作业",
        "start": int(homework_anchor),
        "end": int(clip_end),
        "density": 0,
        "score": 0,
        "preview": hw_segments[0]["text"][:60],
    }


def generate_config(video_path, clips, output_path):
    """Write the highlight-generation config as YAML and print a summary.

    Args:
        video_path: Path of the source video, stored as ``video_src``.
        clips: Clip dicts; only ``title``, ``start`` and ``end`` are written.
        output_path: Destination of the YAML file. The configured
            ``output_dir`` is an ``output`` folder next to it.
    """
    clip_entries = []
    for clip in clips:
        clip_entries.append(
            {"title": clip["title"], "start": clip["start"], "end": clip["end"]}
        )

    # Mis-transcription fixes handed on to the highlight generator.
    corrections = {
        "黑剑": "黑键",
        "负点": "附点",
        "副点": "附点",
        "实质": "时值",
        "演音": "延音",
        "阅历": "乐理",
        "音苻": "音符",
        "调苻": "调号",
        "拍苻": "拍符",
        "谱苻": "谱号",
        "首位": "手位",
    }

    # Rendering and transcription settings for the highlight generator.
    render_params = {
        "fade_duration": 1,
        "title_duration": 3,
        "title_fontsize": 90,
        "title_color": "FFFF00",
        "subtitle_fontsize": 24,
        "subtitle_color": "FFFFFF",
        "whisper_model": "large",
        "use_fast_whisper": True,
        "whisper_model_path": "D:/AI/LM-Models/faster-whisper/large-v3",
    }

    config_dir = os.path.dirname(output_path)
    config = {
        "video_src": video_path,
        "output_dir": os.path.join(config_dir, "output"),
        "clips": clip_entries,
        "term_corrections": corrections,
        "video_params": render_params,
    }

    with open(output_path, "w", encoding="utf-8") as handle:
        yaml.dump(config, handle, allow_unicode=True, default_flow_style=False)

    total_duration = sum(clip["end"] - clip["start"] for clip in clips)

    print(f"\nOK: 配置文件已生成: {output_path}")
    print(f"   知识点数量: {len(clips)}")
    print(f"   总时长: {total_duration}s ({total_duration / 60:.1f}分钟)")
    for index, clip in enumerate(clips, 1):
        print(f"   {index}. {clip['title']} ({clip['start']}s-{clip['end']}s)")

    print("\n完成！使用以下命令生成精华视频:")
    print("  cd .opencode/skills/piano-lesson-highlight-generator")
    print(f"  python scripts/generate_highlights.py --config {output_path}")


def main():
    """Run the pipeline: PPT -> knowledge points -> transcript -> clips -> config.

    Expects three command-line arguments: the PPTX path, the video path and
    the output YAML path. Exits with status 1 on wrong usage or when no clip
    could be matched.
    """
    if len(sys.argv) < 4:
        print(
            "用法: python extract_terms_from_ppt.py <pptx_path> <video_path> <output_config.yaml>"
        )
        sys.exit(1)

    pptx_path, video_path, output_path = sys.argv[1:4]

    # Step 1: pull the knowledge points out of the PPT.
    print("[步骤1] 从PPT提取知识点...")
    ppt_texts = extract_ppt_text(pptx_path)
    print(f"  提取到 {len(ppt_texts)} 页幻灯片内容")

    _knowledge_slide, knowledge_points = find_main_knowledge_slide(ppt_texts)
    homework_pages = find_homework_pages(ppt_texts)

    if not knowledge_points:
        print("  [WARN] 未找到'本课主要知识点'页面")
    else:
        print(f"  找到 {len(knowledge_points)} 个知识点: {', '.join(knowledge_points)}")

    if homework_pages:
        print(f"  找到 {len(homework_pages)} 个作业页面")

    # Step 2: transcribe the video (intermediates live next to the config).
    output_dir = os.path.dirname(output_path) or "."
    os.makedirs(output_dir, exist_ok=True)
    segments = transcribe_video(video_path, output_dir)

    # Steps 3 and 4: locate the teaching anchor and the homework anchor.
    anchor_time = find_anchor_time(segments, knowledge_points)
    homework_anchor = find_homework_anchor(segments, anchor_time)
    video_end = segments[-1]["end"] if segments else 0

    # Step 5: match knowledge points to clips.
    clips = match_knowledge_points(
        segments, knowledge_points, anchor_time, homework_anchor
    )

    # Step 6: append the homework clip, if one was found.
    homework_clip = match_homework(segments, homework_anchor, video_end)
    if homework_clip:
        clips.append(homework_clip)

    if not clips:
        print("[WARN] 未找到任何匹配的知识点，请检查PPT内容或视频")
        sys.exit(1)

    # Step 7: write the config file.
    generate_config(video_path, clips, output_path)


if __name__ == "__main__":
    main()