# Source: lesson-highlights/src/core/corrections.py (233 lines, 5.3 KiB, Python)
# -*- coding: utf-8 -*-
"""
字幕纠错模块
包含术语纠正、异常检测、上下文纠错等功能
"""
import re
from pypinyin import pinyin, Style
# ============== Direct replacement dictionary ==============
# Format: "wrong term": "correct term"
# Every key is replaced wherever it occurs as a substring of the subtitle text.
# Some keys are prefixes of others ("修纸" / "修纸符" / "修纸整小节"), so a
# consumer that wants the longer entries to take effect must apply the longest
# keys first.
DIRECT_FIXES = {
    "副点": "附点",
    "拍苻": "拍符",
    "演音": "延音",
    "调苻": "调号",
    "谱苻": "谱号",
    "负点": "附点",
    "阅历": "乐理",
    "音苻": "音符",
    "首位": "手位",
    "黑剑": "黑键",
    # Added later (lessons learned from lesson1)
    "非联奏": "非连奏",
    "任谱": "认谱",
    "实谱": "识谱",
    "任音": "认音",
    "传人": "唱人",
    "修纸符": "休止符",
    "修纸": "休止",
    "修纸整小节": "休止整小节",
}
# ============== Note-name (solfège) fixes ==============
# Maps mis-transcribed note names to their solfège spelling.
#
# NOTE(review): six entries below share the key "" (empty string). In a dict
# literal duplicate keys collapse, so only the last one ("": "la") survives
# and "do"/"re"/"mi"/"fa"/"so" are silently dropped. The keys were presumably
# single characters that got lost somewhere (encoding/copy issue) - TODO
# restore them from the author's original. An empty key is also dangerous for
# any str.replace-based consumer: "ab".replace("", "la") == "laalabla".
SONG_NAME_FIXES = {
    "Doramifasalasi": "Do Re Mi Fa So La Si",
    "": "do",
    "": "re",
    "": "mi",
    "": "fa",
    "": "so",
    "": "la",
    "西": "si",
}
# ============== Anomaly word detection ==============
# Words whose presence in a subtitle is reported by detect_anomalies().
# NOTE(review): "时值" is also the corrected form that ANOMALY_PATTERNS
# produces from "实质", so already-corrected text still matches this list -
# confirm that is intended.
ANOMALY_WORDS = [
    "羞耻", "休息",  # possibly a mis-hearing of "休止" (rest)
    "实质", "时值",  # related to "时值" (note duration)
]
# ============== Music terminology ==============
# Vocabulary of correct music-theory / piano-technique terms.
# Not referenced by any function visible in this section of the file.
MUSIC_TERMS = [
    "音符", "休止符", "拍子", "节拍", "节奏",
    "全分", "二分", "四分", "八分", "十六分", "三十二分",
    "附点", "调号", "谱号", "音名", "唱名",
    "手型", "手位", "支撑", "放松",
    "弹奏", "非连奏", "跳奏", "连奏",
]
# ============== Anomaly patterns ==============
# (regex pattern, replacement) pairs, applied in list order with re.sub by
# apply_anomaly_fixes().
# The second rule is subsumed by the third: replacing "实质" alone already
# turns "实质音符" into "时值音符".
ANOMALY_PATTERNS = [
    (r'羞耻', '休止'),
    (r'实质音符', '时值音符'),
    (r'实质', '时值'),
]
def apply_term_corrections(text, corrections=None):
    """Replace known mis-transcribed terms in ``text``.

    The built-in ``DIRECT_FIXES`` table is copied and then updated with
    ``corrections``, so on a key clash the caller-supplied entry wins.

    Args:
        text: Subtitle text to correct. Falsy values (``None``, ``""``) are
            returned unchanged.
        corrections: Optional extra ``{wrong: correct}`` mapping layered on
            top of ``DIRECT_FIXES``.

    Returns:
        The text with every non-empty ``wrong`` substring replaced by its
        ``correct`` counterpart.
    """
    if not text:
        return text

    # Work on a copy so the module-level table is never mutated.
    all_fixes = dict(DIRECT_FIXES)
    if corrections:
        all_fixes.update(corrections)

    # Longest keys first, so a short key cannot rewrite part of a longer one
    # before the longer one has had its chance to match.
    sorted_fixes = sorted(
        all_fixes.items(), key=lambda item: len(item[0]), reverse=True
    )
    for wrong, correct in sorted_fixes:
        # An empty key (e.g. from a sloppy config entry) must be skipped:
        # str.replace("", x) inserts x between every character of the text.
        if wrong and wrong in text:
            text = text.replace(wrong, correct)
    return text
def apply_song_name_fixes(text, fixes=None):
    """Replace mis-transcribed note names in ``text``.

    Args:
        text: Subtitle text to correct. Falsy values (``None``, ``""``) are
            returned unchanged.
        fixes: Optional ``{wrong: correct}`` mapping to apply instead of the
            module-level ``SONG_NAME_FIXES`` table.

    Returns:
        The text with every non-empty ``wrong`` substring replaced by its
        ``correct`` counterpart.
    """
    if not text:
        return text

    table = SONG_NAME_FIXES if fixes is None else fixes

    # Longest keys first so a short key cannot rewrite part of a longer one.
    # sorted() is stable, so equally long keys keep their table order.
    ordered = sorted(table.items(), key=lambda item: len(item[0]), reverse=True)
    for wrong, correct in ordered:
        # An empty key must never be applied: "" is "in" every string and
        # str.replace("", x) inserts x between every character of the text.
        if wrong and wrong in text:
            text = text.replace(wrong, correct)
    return text
def apply_anomaly_fixes(text):
    """Apply every ``ANOMALY_PATTERNS`` regex substitution to ``text``.

    Args:
        text: Subtitle text to correct. Falsy values (``None``, ``""``) are
            returned unchanged, matching ``apply_term_corrections``;
            ``re.sub`` would raise ``TypeError`` on ``None``.

    Returns:
        The text after all (pattern, replacement) pairs have been applied in
        list order.
    """
    if not text:
        return text
    for pattern, replacement in ANOMALY_PATTERNS:
        text = re.sub(pattern, replacement, text)
    return text
def apply_all_corrections(text, extra_corrections=None):
    """Run the full correction pipeline over ``text``.

    Stages, in order: term corrections, note-name fixes, anomaly-pattern
    fixes.

    Args:
        text: Subtitle text to correct. Falsy values (``None``, ``""``) are
            returned unchanged.
        extra_corrections: Optional extra ``{wrong: correct}`` mapping
            (typically from the config) passed to the term-correction stage.

    Returns:
        The corrected text.
    """
    # Nothing to correct. Returning here also keeps a None subtitle from
    # reaching the later stages, which use `in` / re.sub on the text.
    if not text:
        return text
    text = apply_term_corrections(text, extra_corrections)
    text = apply_song_name_fixes(text)
    text = apply_anomaly_fixes(text)
    return text
def detect_anomalies(text, knowledge_terms=None):
    """Report suspicious content in a piece of subtitle text.

    Args:
        text: Text to inspect. Falsy values (``None``, ``""``) yield an
            empty result.
        knowledge_terms: Optional list of knowledge-point terms. When given,
            text longer than 10 characters that contains none of them
            (case-insensitively) is flagged.

    Returns:
        A list holding every ``ANOMALY_WORDS`` entry found in ``text`` (in
        table order), followed by the marker ``"NO_KNOWLEDGE_TERM"`` when the
        knowledge-term check fails.
    """
    # Empty text cannot contain anything; None would make `in` raise.
    if not text:
        return []

    anomalies = [word for word in ANOMALY_WORDS if word in text]

    if knowledge_terms:
        text_lower = text.lower()
        has_knowledge = any(term.lower() in text_lower for term in knowledge_terms)
        # Short lines are exempt from the knowledge-term check.
        if not has_knowledge and len(text) > 10:
            anomalies.append("NO_KNOWLEDGE_TERM")
    return anomalies
def get_pinyin(text):
    """Return the pinyin transcription of ``text``.

    Takes the first reading pypinyin reports for each item (``Style.TONE3``)
    and joins the readings with single spaces.

    Args:
        text: Text to transcribe.

    Returns:
        The space-joined pinyin string, or ``text`` itself, unchanged, if the
        conversion fails for any reason.
    """
    try:
        return ' '.join(reading[0] for reading in pinyin(text, style=Style.TONE3))
    except Exception:
        # Deliberate best-effort fallback. `Exception` rather than a bare
        # `except:` so KeyboardInterrupt / SystemExit still propagate.
        return text
def pinyin_similarity(word1, word2):
    """Score how alike two words are by comparing their pinyin strings.

    Args:
        word1: First word.
        word2: Second word.

    Returns:
        ``1.0`` when both pinyin strings are equal. Otherwise the number of
        positions at which the two strings hold the same character, divided
        by the length of the longer string.
    """
    first = get_pinyin(word1)
    second = get_pinyin(word2)
    if first == second:
        return 1.0

    # Naive position-by-position comparison; zip stops at the shorter
    # string, so surplus characters only count through the divisor.
    matches = 0
    for left, right in zip(first, second):
        if left == right:
            matches += 1

    longest = max(len(first), len(second))
    if longest > 0:
        return matches / longest
    return 0
def ai_context_correct(text, clip_title="", all_clips=None):
    """Apply ``DIRECT_FIXES`` entries whose two sides sound alike.

    An entry is applied only when its wrong form occurs in ``text`` and the
    pinyin similarity between the wrong and correct forms exceeds 0.5.

    Args:
        text: Text to correct.
        clip_title: Clip title. Accepted but not used by this function.
        all_clips: Information about all clips. Accepted but not used by
            this function.

    Returns:
        The corrected text.
    """
    for wrong, correct in DIRECT_FIXES.items():
        if wrong not in text:
            continue
        # Only trust a fix when the two forms are phonetically close.
        if pinyin_similarity(wrong, correct) > 0.5:
            text = text.replace(wrong, correct)
    return text
def load_term_corrections_from_config(config):
    """Build the term-correction dictionary from a config mapping.

    Args:
        config: Configuration dict. Its ``'term_corrections'`` entry, if
            present, is a ``{wrong: correct}`` mapping. A missing or ``None``
            config, and a missing or empty/``None`` entry, all mean "no extra
            corrections".

    Returns:
        A new dict holding the baseline rules, overridden and extended by
        the configured corrections.
    """
    # `or {}` covers both a missing key and an explicit null value (e.g. a
    # bare `term_corrections:` line in YAML), which dict.update would reject.
    configured = (config or {}).get('term_corrections') or {}

    # Baseline rules that must always be present; config entries may
    # override them.
    merged = {
        "副点": "附点",
        "实质": "时值",
    }
    merged.update(configured)
    return merged