Initial commit: lesson-highlights generator

2026-05-03 03:07:22 +08:00
commit 9e62247a60
55 changed files with 6189 additions and 0 deletions
@@ -0,0 +1,233 @@
+# -*- coding: utf-8 -*-
+"""
+字幕纠错模块
+
+包含术语纠正、异常检测、上下文纠错等功能
+"""
+
+import re
+from pypinyin import pinyin, Style
+
+# ============== Direct replacement dictionary ==============
+# Format: "wrong word": "correct word"
+# apply_term_corrections() applies these longest-key-first;
+# ai_context_correct() walks them in insertion order.
+DIRECT_FIXES = {
+    "副点": "附点",
+    "拍苻": "拍符",
+    "演音": "延音",
+    "调苻": "调号",
+    "谱苻": "谱号",
+    "负点": "附点",
+    "阅历": "乐理",
+    "音苻": "音符",
+    "首位": "手位",
+    "黑剑": "黑键",
+    # Added from lesson 1 experience
+    "非联奏": "非连奏",
+    "任谱": "认谱",
+    "实谱": "识谱",
+    "任音": "认音",
+    "传人": "唱人",  # NOTE(review): target looks unusual; confirm it is not meant to be "唱名"
+    "修纸符": "休止符",
+    "修纸": "休止",
+    "修纸整小节": "休止整小节",
+}
+
+# ============== Note-name (solfege) fixes ==============
+# Consumed by apply_song_name_fixes(), which does plain substring replacement.
+# NOTE(review): the single-character keys match anywhere in the text, so
+# they also rewrite those characters inside unrelated words; confirm that
+# this is acceptable for the subtitles being processed.
+SONG_NAME_FIXES = {
+    "Doramifasalasi": "Do Re Mi Fa So La Si",
+    "刀": "do",
+    "锐": "re",
+    "咪": "mi",
+    "发": "fa",
+    "嗦": "so",
+    "啦": "la",
+    "西": "si",
+}
+
+# ============== Anomaly word detection ==============
+# detect_anomalies() reports every entry that occurs as a substring.
+# NOTE(review): "休息" and "时值" are ordinary / correct words, so legitimate
+# uses are flagged as well; confirm this is intended.
+ANOMALY_WORDS = [
+    "羞耻", "休息",  # possibly a mis-transcription of "休止" (rest)
+    "实质", "时值",  # related to "时值" (note duration)
+]
+
+# ============== Music terminology ==============
+# Vocabulary of known-good music terms. No function in this module reads
+# this list.
+MUSIC_TERMS = [
+    "音符", "休止符", "拍子", "节拍", "节奏",
+    "全分", "二分", "四分", "八分", "十六分", "三十二分",
+    "附点", "调号", "谱号", "音名", "唱名",
+    "手型", "手位", "支撑", "放松",
+    "弹奏", "非连奏", "跳奏", "连奏",
+]
+
+# ============== Anomaly patterns ==============
+# (regex pattern, replacement) pairs; apply_anomaly_fixes() runs re.sub()
+# with each pair in list order, so the more specific pattern is listed
+# before the shorter one it contains.
+ANOMALY_PATTERNS = [
+    (r'羞耻', '休止'),
+    (r'实质音符', '时值音符'),
+    (r'实质', '时值'),
+]
+
+
+def apply_term_corrections(text, corrections=None):
+    """
+    Apply dictionary-based term corrections to a subtitle string.
+
+    The built-in DIRECT_FIXES table is merged with ``corrections`` (an entry
+    in ``corrections`` overrides a built-in entry with the same key) and
+    every occurrence of each wrong term is replaced by its correction.
+
+    Args:
+        text: Original text. Falsy values (``None``, ``""``) are returned
+            unchanged.
+        corrections: Optional extra ``{wrong: correct}`` mapping.
+
+    Returns:
+        The corrected text.
+    """
+    if not text:
+        return text
+
+    # Merge into a copy so the module-level table is never mutated.
+    all_fixes = dict(DIRECT_FIXES)
+    if corrections:
+        all_fixes.update(corrections)
+
+    # Longest keys first, so a short key cannot rewrite part of a longer
+    # match before the longer rule has had a chance to apply.
+    sorted_fixes = sorted(
+        all_fixes.items(), key=lambda item: len(item[0]), reverse=True
+    )
+
+    for wrong, correct in sorted_fixes:
+        # An empty key matches between every character, and str.replace
+        # would then scatter `correct` through the whole text; skip it.
+        if not wrong:
+            continue
+        text = text.replace(wrong, correct)
+
+    return text
+
+
+def apply_song_name_fixes(text):
+    """
+    Replace mis-transcribed note (solfege) names using SONG_NAME_FIXES.
+
+    Each key of SONG_NAME_FIXES is replaced wherever it occurs, in the
+    table's insertion order.
+
+    Args:
+        text: Original text. Falsy values (``None``, ``""``) are returned
+            unchanged, matching apply_term_corrections().
+
+    Returns:
+        The corrected text.
+    """
+    # Guard first: without it a None input raises TypeError on the
+    # substring replacement below.
+    if not text:
+        return text
+
+    for wrong, correct in SONG_NAME_FIXES.items():
+        text = text.replace(wrong, correct)
+    return text
+
+
+def apply_anomaly_fixes(text):
+    """
+    Rewrite known anomalous phrases using ANOMALY_PATTERNS.
+
+    Every ``(pattern, replacement)`` pair is applied with ``re.sub`` in
+    list order.
+
+    Args:
+        text: Original text. Falsy values (``None``, ``""``) are returned
+            unchanged, matching apply_term_corrections().
+
+    Returns:
+        The corrected text.
+    """
+    # re.sub() raises TypeError on None, so bail out early.
+    if not text:
+        return text
+
+    for pattern, replacement in ANOMALY_PATTERNS:
+        text = re.sub(pattern, replacement, text)
+    return text
+
+
+def apply_all_corrections(text, extra_corrections=None):
+    """
+    Run every correction stage over a subtitle string.
+
+    Stages, in order: dictionary term corrections, note-name fixes,
+    anomaly-pattern fixes.
+
+    Args:
+        text: Original text. Falsy values (``None``, ``""``) are returned
+            unchanged without running any stage.
+        extra_corrections: Optional extra ``{wrong: correct}`` mapping
+            (from config), forwarded to apply_term_corrections().
+
+    Returns:
+        The corrected text.
+    """
+    # apply_term_corrections() passes falsy input straight through, but the
+    # later stages operate on the string directly and fail on None, so
+    # short-circuit here.
+    if not text:
+        return text
+
+    text = apply_term_corrections(text, extra_corrections)
+    text = apply_song_name_fixes(text)
+    text = apply_anomaly_fixes(text)
+    return text
+
+
+def detect_anomalies(text, knowledge_terms=None):
+    """
+    Collect anomaly markers for a piece of text.
+
+    Args:
+        text: Text to inspect.
+        knowledge_terms: Optional list of knowledge-point terms. When it is
+            given and non-empty, a text longer than 10 characters that
+            contains none of the terms (case-insensitively) is also marked
+            with ``"NO_KNOWLEDGE_TERM"``.
+
+    Returns:
+        List of the ANOMALY_WORDS entries found in ``text``, optionally
+        followed by ``"NO_KNOWLEDGE_TERM"``.
+    """
+    found = [word for word in ANOMALY_WORDS if word in text]
+
+    # Without knowledge terms there is nothing further to check.
+    if not knowledge_terms:
+        return found
+
+    lowered = text.lower()
+    for term in knowledge_terms:
+        if term.lower() in lowered:
+            # At least one knowledge term is present: no extra marker.
+            return found
+
+    if len(text) > 10:
+        found.append("NO_KNOWLEDGE_TERM")
+    return found
+
+
+def get_pinyin(text):
+    """
+    Return the pinyin of ``text`` as one space-separated string.
+
+    Conversion uses pypinyin's ``pinyin()`` with ``Style.TONE3`` and keeps
+    the first reading of each returned item. This is best-effort: if the
+    conversion raises, ``text`` is returned unchanged.
+
+    Args:
+        text: Text to convert.
+
+    Returns:
+        The space-joined pinyin, or ``text`` itself on failure.
+    """
+    try:
+        return ' '.join([p[0] for p in pinyin(text, style=Style.TONE3)])
+    except Exception:
+        # Deliberate fallback, but no longer a bare `except:` so that
+        # KeyboardInterrupt / SystemExit still propagate.
+        return text
+
+
+def pinyin_similarity(word1, word2):
+    """
+    Score how alike two words sound, based on their pinyin strings.
+
+    The pinyin strings from get_pinyin() are compared character by
+    character at equal positions; the score is the number of matching
+    positions divided by the length of the longer string.
+
+    Args:
+        word1: First word.
+        word2: Second word.
+
+    Returns:
+        Similarity score in the range 0-1 (1.0 for identical pinyin).
+    """
+    first = get_pinyin(word1)
+    second = get_pinyin(word2)
+
+    if first == second:
+        return 1.0
+
+    # Count positions at which both pinyin strings hold the same character.
+    matches = 0
+    for left, right in zip(first, second):
+        if left == right:
+            matches += 1
+
+    longest = max(len(first), len(second))
+    if longest > 0:
+        return matches / longest
+    return 0
+
+
+def ai_context_correct(text, clip_title="", all_clips=None):
+    """
+    Apply DIRECT_FIXES entries, gated by pinyin similarity.
+
+    An entry is applied only when its wrong form occurs in ``text`` and the
+    pinyin similarity between the wrong and correct forms exceeds 0.5.
+
+    Args:
+        text: Text to correct.
+        clip_title: Clip title (accepted but not used by this function).
+        all_clips: Info on all clips (accepted but not used by this
+            function).
+
+    Returns:
+        The corrected text.
+    """
+    for wrong, correct in DIRECT_FIXES.items():
+        if wrong not in text:
+            continue
+        # Replace only when the two forms sound sufficiently alike.
+        if pinyin_similarity(wrong, correct) > 0.5:
+            text = text.replace(wrong, correct)
+
+    return text
+
+
+def load_term_corrections_from_config(config):
+    """
+    Build the term-correction dictionary from a config mapping.
+
+    Two baseline rules are always present; entries under the config's
+    ``'term_corrections'`` key are merged on top and override a baseline
+    rule with the same key. The config mapping itself is not modified.
+
+    Args:
+        config: Config dict. May be ``None`` or lack the
+            ``'term_corrections'`` key, and that key may hold ``None``
+            (e.g. an empty section); all are treated as "no extra rules".
+
+    Returns:
+        A new ``{wrong: correct}`` dict.
+    """
+    # Baseline rules that must exist even when the config is empty.
+    merged = {
+        "副点": "附点",
+        "实质": "时值",
+    }
+
+    # `or {}` covers both a missing config and a key present with a null
+    # value, either of which would otherwise raise here.
+    term_corrections = (config or {}).get('term_corrections') or {}
+
+    merged.update(term_corrections)
+    return merged