# -*- coding: utf-8 -*-
"""
Subtitle correction module.

Provides term correction, anomaly detection and context-aware (pinyin based)
correction for transcribed music-lesson subtitles.
"""

import logging
import re
from typing import Dict, List, Optional, Tuple

try:
    from pypinyin import pinyin, Style
except ImportError:
    # pypinyin is only needed by the pinyin helpers (get_pinyin and everything
    # built on it). The dictionary/regex based corrections must stay usable
    # without it, so degrade instead of failing the whole module import.
    pinyin = None
    Style = None
    logging.getLogger(__name__).warning(
        "pypinyin is not installed; pinyin-based similarity falls back to "
        "raw-text comparison"
    )

# ============== Direct replacement dictionary ==============
# Format: "wrong word": "correct word"
DIRECT_FIXES = {
    "副点": "附点",
    "拍苻": "拍符",
    "演音": "延音",
    "调苻": "调号",
    "谱苻": "谱号",
    "负点": "附点",
    "阅历": "乐理",
    "音苻": "音符",
    "首位": "手位",
    "黑剑": "黑键",
    # Added later (lessons learned from lesson1)
    "非联奏": "非连奏",
    "任谱": "认谱",
    "实谱": "识谱",
    "任音": "认音",
    "传人": "唱人",
    "修纸符": "休止符",
    "修纸": "休止",
    "修纸整小节": "休止整小节",
}

# ============== Note (solfege) name fixes ==============
# NOTE(review): the single-character keys below are replaced wherever they
# occur, so ordinary words containing them are rewritten as well. Confirm this
# is intended for the transcripts this module is applied to.
SONG_NAME_FIXES = {
    "Doramifasalasi": "Do Re Mi Fa So La Si",
    "刀": "do",
    "锐": "re",
    "咪": "mi",
    "发": "fa",
    "嗦": "so",
    "啦": "la",
    "西": "si",
}

# ============== Anomaly word detection ==============
# NOTE(review): "时值" is itself a valid term, yet it is reported as an
# anomaly by detect_anomalies -- confirm this is deliberate.
ANOMALY_WORDS = [
    "羞耻", "休息",  # possibly a mis-transcription of "休止"
    "实质", "时值",  # related to "时值" (note duration)
]

# ============== Music terms ==============
MUSIC_TERMS = [
    "音符", "休止符", "拍子", "节拍", "节奏",
    "全分", "二分", "四分", "八分", "十六分", "三十二分",
    "附点", "调号", "谱号", "音名", "唱名",
    "手型", "手位", "支撑", "放松", "弹奏",
    "非连奏", "跳奏", "连奏",
]

# ============== Anomaly patterns ==============
# (regex pattern, replacement) pairs, applied in order.
ANOMALY_PATTERNS = [
    (r'羞耻', '休止'),
    (r'实质音符', '时值音符'),
    (r'实质', '时值'),
]


def _longest_first(fixes: Dict[str, str]) -> List[Tuple[str, str]]:
    """Return the usable (wrong, correct) pairs of *fixes*, longest key first.

    Longer keys go first so that a short key cannot consume part of a longer
    one. Empty or non-string keys are dropped: ``"" in text`` is always true
    and ``text.replace("", x)`` would insert ``x`` between every character.
    """
    usable = [
        (wrong, correct)
        for wrong, correct in fixes.items()
        if isinstance(wrong, str) and wrong
    ]
    # sorted() is stable, so equal-length keys keep their dictionary order.
    return sorted(usable, key=lambda pair: len(pair[0]), reverse=True)


def apply_term_corrections(text, corrections=None):
    """
    Apply dictionary based term corrections.

    Args:
        text: Original text. ``None`` or an empty string is returned as is.
        corrections: Optional extra ``{wrong: correct}`` entries; they
            override entries of ``DIRECT_FIXES`` that share the same key.

    Returns:
        The corrected text.
    """
    if not text:
        return text

    # Merge the built-in dictionary with the caller supplied one.
    all_fixes = dict(DIRECT_FIXES)
    if corrections:
        all_fixes.update(corrections)

    for wrong, correct in _longest_first(all_fixes):
        if wrong in text:
            text = text.replace(wrong, correct)

    return text


def apply_song_name_fixes(text):
    """Apply the note-name replacements from ``SONG_NAME_FIXES``.

    ``None`` or an empty string is returned unchanged.
    """
    if not text:
        return text

    for wrong, correct in SONG_NAME_FIXES.items():
        if wrong in text:
            text = text.replace(wrong, correct)
    return text


def apply_anomaly_fixes(text):
    """Apply the regex replacements from ``ANOMALY_PATTERNS`` in order.

    ``None`` or an empty string is returned unchanged.
    """
    if not text:
        return text

    for pattern, replacement in ANOMALY_PATTERNS:
        text = re.sub(pattern, replacement, text)
    return text


def apply_all_corrections(text, extra_corrections=None):
    """
    Apply every correction rule: terms, then note names, then anomalies.

    Args:
        text: Original text. ``None`` or an empty string is returned as is.
        extra_corrections: Optional extra ``{wrong: correct}`` dictionary
            (typically loaded from the config).

    Returns:
        The corrected text.
    """
    if not text:
        # The later stages use ``in`` / ``re.sub`` and would raise on None.
        return text

    text = apply_term_corrections(text, extra_corrections)
    text = apply_song_name_fixes(text)
    text = apply_anomaly_fixes(text)
    return text


def detect_anomalies(text, knowledge_terms=None):
    """
    Detect anomalies in a piece of text.

    Args:
        text: Text to inspect. ``None`` or an empty string yields no anomalies.
        knowledge_terms: Optional list of knowledge terms. When given and the
            text is longer than 10 characters but contains none of them
            (case-insensitive), ``"NO_KNOWLEDGE_TERM"`` is reported.

    Returns:
        List of anomaly markers: the matched entries of ``ANOMALY_WORDS``,
        optionally followed by ``"NO_KNOWLEDGE_TERM"``.
    """
    if not text:
        return []

    anomalies = [word for word in ANOMALY_WORDS if word in text]

    if knowledge_terms:
        text_lower = text.lower()
        has_knowledge = any(
            term.lower() in text_lower for term in knowledge_terms
        )
        if not has_knowledge and len(text) > 10:
            anomalies.append("NO_KNOWLEDGE_TERM")

    return anomalies


def get_pinyin(text):
    """Return the space separated TONE3 pinyin of *text*.

    Best effort: when pypinyin is unavailable or the conversion fails, the
    input is returned unchanged.
    """
    if pinyin is None:
        return text
    try:
        return ' '.join(p[0] for p in pinyin(text, style=Style.TONE3))
    except Exception:
        # Deliberate fallback, but no longer swallowing KeyboardInterrupt /
        # SystemExit the way the previous bare ``except:`` did.
        return text


def pinyin_similarity(word1, word2):
    """
    Compute the pinyin similarity of two words.

    The pinyin strings are compared position by position; the score is the
    number of matching characters divided by the longer string's length.

    Args:
        word1: First word.
        word2: Second word.

    Returns:
        Similarity score as a float in the range 0-1.
    """
    p1 = get_pinyin(word1)
    p2 = get_pinyin(word2)

    if p1 == p2:
        return 1.0

    # The strings differ, so at least one is non-empty and max_len > 0.
    common = sum(1 for c1, c2 in zip(p1, p2) if c1 == c2)
    max_len = max(len(p1), len(p2))
    return common / max_len


def ai_context_correct(text, clip_title="", all_clips=None):
    """
    Context-aware correction (based on pinyin similarity).

    Replaces a ``DIRECT_FIXES`` key found in the text only when its pinyin
    similarity to the correct word exceeds 0.5.

    Args:
        text: Text to correct. ``None`` or an empty string is returned as is.
        clip_title: Clip title (currently unused).
        all_clips: Information about all clips (currently unused).

    Returns:
        The corrected text.
    """
    if not text:
        return text

    # Same longest-first ordering as apply_term_corrections.
    for wrong, correct in _longest_first(DIRECT_FIXES):
        if wrong in text and pinyin_similarity(wrong, correct) > 0.5:
            text = text.replace(wrong, correct)

    return text


def load_term_corrections_from_config(config):
    """
    Load the term correction dictionary from the configuration.

    Args:
        config: Configuration mapping. May be ``None`` or lack the
            ``'term_corrections'`` key, and the key's value may be ``None``.

    Returns:
        Term correction dictionary: the built-in defaults overridden by the
        configured entries.
    """
    # Baseline rules that must always be present.
    defaults = {
        "副点": "附点",
        "实质": "时值",
    }

    if not config:
        return defaults

    # ``or {}`` covers a key that exists with a null value (empty YAML/JSON
    # entry); dict.update(None) would raise TypeError.
    term_corrections = config.get('term_corrections') or {}
    defaults.update(term_corrections)

    return defaults