# lesson-highlights/src/core/corrections.py

# -*- coding: utf-8 -*-
"""
字幕纠错模块

包含术语纠正、异常检测、上下文纠错等功能
"""

import re
from pypinyin import pinyin, Style

# ============== Direct replacement dictionary ==============
# Format: "wrong word": "correct word"
# Merged with caller-supplied corrections in apply_term_corrections() and
# iterated directly by ai_context_correct().
DIRECT_FIXES = {
    "副点": "附点",
    "拍苻": "拍符",
    "演音": "延音",
    "调苻": "调号",
    "谱苻": "谱号",
    "负点": "附点",
    "阅历": "乐理",
    "音苻": "音符",
    "首位": "手位",
    "黑剑": "黑键",
    # Added later (from experience with lesson1)
    "非联奏": "非连奏",
    "任谱": "认谱",
    "实谱": "识谱",
    "任音": "认音",
    "传人": "唱人",
    "修纸符": "休止符",
    "修纸": "休止",
    "修纸整小节": "休止整小节",
}

# ============== Note-name corrections ==============
# Maps mis-transcribed note names to their romanized form. Applied by
# apply_song_name_fixes() as plain substring replacements, in the order
# the entries are listed here.
# NOTE(review): the single-character keys are replaced wherever they occur,
# including inside unrelated words that contain the same character — confirm
# this is intended.
SONG_NAME_FIXES = {
    "Doramifasalasi": "Do Re Mi Fa So La Si",
    "刀": "do",
    "锐": "re",
    "咪": "mi",
    "发": "fa",
    "嗦": "so",
    "啦": "la",
    "西": "si",
}

# ============== Anomaly word detection ==============
# Words that detect_anomalies() reports whenever they occur in a text.
ANOMALY_WORDS = [
    "羞耻", "休息",  # possibly a mis-transcription of "休止"
    "实质", "时值",  # related to "时值"
]

# ============== Music terminology ==============
# List of music terms. Not referenced by any function in the visible part of
# this module.
MUSIC_TERMS = [
    "音符", "休止符", "拍子", "节拍", "节奏",
    "全分", "二分", "四分", "八分", "十六分", "三十二分",
    "附点", "调号", "谱号", "音名", "唱名",
    "手型", "手位", "支撑", "放松",
    "弹奏", "非连奏", "跳奏", "连奏",
]

# ============== Anomaly patterns ==============
# (regex pattern, replacement) pairs. apply_anomaly_fixes() passes each pair
# to re.sub(), in list order.
ANOMALY_PATTERNS = [
    (r'羞耻', '休止'),
    (r'实质音符', '时值音符'),
    (r'实质', '时值'),
]


def apply_term_corrections(text, corrections=None):
    """
    Apply dictionary-based term corrections to a piece of text.

    The built-in ``DIRECT_FIXES`` table is merged with ``corrections``
    (entries in ``corrections`` override built-in entries with the same key)
    and every wrong term found in ``text`` is replaced by its correct form.

    Args:
        text: Original text. Falsy values (``None``, ``""``) are returned
            unchanged.
        corrections: Optional extra ``{wrong: correct}`` mapping.

    Returns:
        The corrected text.
    """
    if not text:
        return text

    # Merge into a copy so the module-level table is never mutated.
    all_fixes = dict(DIRECT_FIXES)
    if corrections:
        all_fixes.update(corrections)

    # Longest keys first, so a short key cannot rewrite part of a longer key
    # before the longer one has had a chance to match.
    sorted_fixes = sorted(
        all_fixes.items(), key=lambda item: len(item[0]), reverse=True
    )

    for wrong, correct in sorted_fixes:
        # An empty key matches between every character: str.replace("", x)
        # would splice x throughout the text, so such entries are ignored.
        if not wrong:
            continue
        text = text.replace(wrong, correct)

    return text


def apply_song_name_fixes(text):
    """
    Replace mis-transcribed note names using ``SONG_NAME_FIXES``.

    Every key is replaced wherever it occurs as a substring, in the table's
    insertion order.

    Args:
        text: Text to correct. Falsy values (``None``, ``""``) are returned
            unchanged, matching ``apply_term_corrections``.

    Returns:
        The corrected text.
    """
    # Without this guard a None input raises TypeError on the substring test.
    if not text:
        return text

    for wrong, correct in SONG_NAME_FIXES.items():
        text = text.replace(wrong, correct)
    return text


def apply_anomaly_fixes(text):
    """
    Rewrite known anomalous phrases using ``ANOMALY_PATTERNS``.

    Each ``(pattern, replacement)`` pair is applied with ``re.sub`` in list
    order.

    Args:
        text: Text to correct. Falsy values (``None``, ``""``) are returned
            unchanged, matching ``apply_term_corrections``.

    Returns:
        The corrected text.
    """
    # re.sub raises TypeError for None, so bail out before touching it.
    if not text:
        return text

    for pattern, replacement in ANOMALY_PATTERNS:
        text = re.sub(pattern, replacement, text)
    return text


def apply_all_corrections(text, extra_corrections=None):
    """
    Run the full correction pipeline on a piece of text.

    Stages, in order: dictionary term corrections, note-name fixes, then
    anomaly-pattern fixes.

    Args:
        text: Original text. Falsy values (``None``, ``""``) are returned
            unchanged.
        extra_corrections: Optional extra ``{wrong: correct}`` mapping
            (from config) passed to the term-correction stage.

    Returns:
        The corrected text.
    """
    # The first stage passes None through untouched, but the later stages do
    # substring/regex work that raises TypeError on None; stop early instead.
    if not text:
        return text

    text = apply_term_corrections(text, extra_corrections)
    text = apply_song_name_fixes(text)
    text = apply_anomaly_fixes(text)
    return text


def detect_anomalies(text, knowledge_terms=None):
    """
    Detect anomalies in a piece of text.

    Args:
        text: Text to inspect. Falsy values (``None``, ``""``) yield an
            empty result.
        knowledge_terms: Optional list of knowledge-point terms. When given,
            a text longer than 10 characters that contains none of them
            (case-insensitively) is flagged with ``"NO_KNOWLEDGE_TERM"``.

    Returns:
        A list with every entry of ``ANOMALY_WORDS`` found in ``text``,
        followed by ``"NO_KNOWLEDGE_TERM"`` when that check fires.
    """
    # Nothing to inspect; also avoids TypeError on the substring test for None.
    if not text:
        return []

    anomalies = [word for word in ANOMALY_WORDS if word in text]

    if knowledge_terms:
        text_lower = text.lower()
        has_knowledge = any(term.lower() in text_lower for term in knowledge_terms)
        # Only texts longer than 10 characters are flagged.
        if not has_knowledge and len(text) > 10:
            anomalies.append("NO_KNOWLEDGE_TERM")

    return anomalies


def get_pinyin(text):
    """
    Return the pinyin of ``text`` as a single space-separated string.

    Conversion uses pypinyin with ``Style.TONE3`` and keeps the first reading
    returned for each item.

    Args:
        text: Text to convert.

    Returns:
        The pinyin string, or ``text`` unchanged if the conversion fails.
    """
    try:
        return ' '.join(reading[0] for reading in pinyin(text, style=Style.TONE3))
    except Exception:
        # Deliberately best-effort: fall back to the raw text on any
        # conversion error. ``Exception`` (rather than a bare ``except``)
        # lets KeyboardInterrupt and SystemExit propagate.
        return text


def pinyin_similarity(word1, word2):
    """
    Score how alike two words sound, based on their pinyin strings.

    Both words are converted with ``get_pinyin``. Identical pinyin scores
    1.0; otherwise the two strings are compared character by character at
    the same positions and the number of matches is divided by the length of
    the longer string.

    Args:
        word1: First word.
        word2: Second word.

    Returns:
        Similarity score in the range 0-1.
    """
    first, second = get_pinyin(word1), get_pinyin(word2)
    if first == second:
        return 1.0

    longest = max(len(first), len(second))
    if longest <= 0:
        return 0

    # Position-wise comparison: zip stops at the shorter string, so the
    # extra tail of the longer one only counts against the score.
    matches = 0
    for left, right in zip(first, second):
        if left == right:
            matches += 1

    return matches / longest


def ai_context_correct(text, clip_title="", all_clips=None):
    """
    Correct text using ``DIRECT_FIXES``, gated by pinyin similarity.

    For every wrong term of ``DIRECT_FIXES`` that occurs in ``text``, the
    replacement is applied only when the wrong and correct terms have a
    pinyin similarity above 0.5.

    Args:
        text: Text to correct. Falsy values (``None``, ``""``) are returned
            unchanged.
        clip_title: Clip title. Accepted for interface compatibility; not
            used by the current implementation.
        all_clips: Information about all clips. Accepted for interface
            compatibility; not used by the current implementation.

    Returns:
        The corrected text.
    """
    # Avoids TypeError on the substring test when text is None.
    if not text:
        return text

    for wrong, correct in DIRECT_FIXES.items():
        if wrong in text:
            # Only replace when the two terms sound alike.
            similarity = pinyin_similarity(wrong, correct)
            if similarity > 0.5:
                text = text.replace(wrong, correct)

    return text


def load_term_corrections_from_config(config):
    """
    Build the term-correction dictionary from a configuration mapping.

    Args:
        config: Configuration dict. Its ``'term_corrections'`` entry, when
            present, is a ``{wrong: correct}`` mapping. A ``None`` config, a
            missing key, or a key set to ``None`` all mean "no extra
            corrections".

    Returns:
        A new dict holding the baseline rules overlaid with the configured
        ones (configured entries win on key collisions).
    """
    # Baseline rules that are always present unless the config overrides them.
    merged = {
        "副点": "附点",
        "实质": "时值",
    }

    # ``or {}`` covers both a missing config and a key that is present but
    # None, which would otherwise make dict.update() raise TypeError.
    term_corrections = (config or {}).get('term_corrections') or {}

    merged.update(term_corrections)
    return merged