233 lines
5.3 KiB
Python
233 lines
5.3 KiB
Python
# -*- coding: utf-8 -*-
|
|
"""
|
|
Subtitle correction module.

Provides terminology correction, anomaly detection, and context-aware correction.
|
|
"""
|
|
|
|
import re
|
|
from pypinyin import pinyin, Style
|
|
|
|
# ============== Direct replacement dictionary ==============
# Each entry maps a mis-transcribed word to the word that should replace it:
#     "wrong word": "correct word"
DIRECT_FIXES = {
    "副点": "附点",
    "拍苻": "拍符",
    "演音": "延音",
    "调苻": "调号",
    "谱苻": "谱号",
    "负点": "附点",
    "阅历": "乐理",
    "音苻": "音符",
    "首位": "手位",
    "黑剑": "黑键",

    # Added later (from experience with lesson1)
    "非联奏": "非连奏",
    "任谱": "认谱",
    "实谱": "识谱",
    "任音": "认音",
    "传人": "唱人",
    "修纸符": "休止符",
    "修纸": "休止",
    "修纸整小节": "休止整小节",
}
|
|
|
|
# ============== Note-name corrections ==============
# Maps mis-transcribed note names to their intended spelling.
# NOTE(review): the single-character keys below are plain substrings, so a
# substring replacement will also hit them inside unrelated words -- confirm
# this is intended.
SONG_NAME_FIXES = {
    "Doramifasalasi": "Do Re Mi Fa So La Si",
    "刀": "do",
    "锐": "re",
    "咪": "mi",
    "发": "fa",
    "嗦": "so",
    "啦": "la",
    "西": "si",
}
|
|
|
|
# ============== Anomaly word detection ==============
# Words whose presence in a subtitle is reported as an anomaly.
ANOMALY_WORDS = [
    # possibly meant "休止"
    "羞耻",
    "休息",
    # related to "时值"
    "实质",
    "时值",
]
|
|
|
|
# ============== Music terminology ==============
# Reference list of music terms, one thematic group per row.
MUSIC_TERMS = [
    "音符", "休止符", "拍子", "节拍", "节奏",
    "全分", "二分", "四分", "八分", "十六分", "三十二分",
    "附点", "调号", "谱号", "音名", "唱名",
    "手型", "手位", "支撑", "放松",
    "弹奏", "非连奏", "跳奏", "连奏",
]
|
|
|
|
# ============== Anomaly patterns ==============
# (regex pattern, replacement) pairs, listed in the order they are meant to
# be tried; the longer '实质音符' pattern comes before the shorter '实质'.
ANOMALY_PATTERNS = [
    (r'羞耻', '休止'),
    (r'实质音符', '时值音符'),
    (r'实质', '时值'),
]
|
|
|
|
|
|
def apply_term_corrections(text, corrections=None):
    """Replace known mis-transcribed terms in ``text``.

    Args:
        text: Original text. Falsy values (``None``, ``""``) are returned
            unchanged.
        corrections: Optional extra ``wrong -> correct`` entries. They are
            merged over ``DIRECT_FIXES``, so an entry with the same key
            overrides the built-in one.

    Returns:
        The text after all replacements have been applied.
    """
    if not text:
        return text

    # Work on a copy so the module-level dictionary is never mutated.
    fixes = dict(DIRECT_FIXES)
    fixes.update(corrections or {})

    # Longest wrong terms go first so that a short key cannot rewrite part of
    # a longer one before the longer one has had its chance to match.
    for wrong in sorted(fixes, key=len, reverse=True):
        text = text.replace(wrong, fixes[wrong])

    return text
|
|
|
|
|
|
def apply_song_name_fixes(text):
    """Apply the note-name corrections from ``SONG_NAME_FIXES`` to ``text``.

    Args:
        text: Text to correct. Falsy values (``None``, ``""``) are returned
            unchanged instead of raising, matching ``apply_term_corrections``.

    Returns:
        The text with every ``SONG_NAME_FIXES`` key replaced by its value.
    """
    # Nothing to correct; also keeps ``None`` from reaching ``str.replace``.
    if not text:
        return text

    # Replacements run in the mapping's insertion order.
    for wrong, correct in SONG_NAME_FIXES.items():
        text = text.replace(wrong, correct)
    return text
|
|
|
|
|
|
def apply_anomaly_fixes(text):
    """Apply every ``ANOMALY_PATTERNS`` regex substitution to ``text``.

    Args:
        text: Text to correct. Falsy values (``None``, ``""``) are returned
            unchanged instead of raising, matching ``apply_term_corrections``.

    Returns:
        The text after each ``(pattern, replacement)`` pair has been applied
        in list order.
    """
    # ``re.sub`` raises TypeError on None, so bail out early on empty input.
    if not text:
        return text

    for pattern, replacement in ANOMALY_PATTERNS:
        text = re.sub(pattern, replacement, text)
    return text
|
|
|
|
|
|
def apply_all_corrections(text, extra_corrections=None):
    """Run every correction stage on ``text``.

    The stages run in a fixed order: term corrections, then note-name
    corrections, then anomaly-pattern corrections.

    Args:
        text: Original text. Falsy values (``None``, ``""``) are returned
            unchanged.
        extra_corrections: Optional extra correction dictionary (from config),
            forwarded to ``apply_term_corrections``.

    Returns:
        The corrected text.
    """
    # The first stage passes falsy text straight through, but the later
    # stages would raise TypeError on None, so stop here.
    if not text:
        return text

    text = apply_term_corrections(text, extra_corrections)
    text = apply_song_name_fixes(text)
    text = apply_anomaly_fixes(text)
    return text
|
|
|
|
|
|
def detect_anomalies(text, knowledge_terms=None):
    """Collect anomaly markers found in ``text``.

    Args:
        text: Text to inspect.
        knowledge_terms: Optional list of knowledge-point terms. When given,
            the text is also checked, case-insensitively, for containing at
            least one of them.

    Returns:
        A list holding every ``ANOMALY_WORDS`` entry that occurs in the text,
        followed by ``"NO_KNOWLEDGE_TERM"`` when ``knowledge_terms`` is
        non-empty, none of its terms occurs in the text, and the text is
        longer than 10 characters.
    """
    found = [word for word in ANOMALY_WORDS if word in text]

    if not knowledge_terms:
        return found

    lowered = text.lower()
    mentions_knowledge = False
    for term in knowledge_terms:
        if term.lower() in lowered:
            mentions_knowledge = True
            break

    # Short texts are not flagged for lacking a knowledge term.
    if not mentions_knowledge and len(text) > 10:
        found.append("NO_KNOWLEDGE_TERM")

    return found
|
|
|
|
|
|
def get_pinyin(text):
    """Return the pinyin of ``text`` as one space-separated string.

    Uses ``pinyin(text, style=Style.TONE3)`` and joins the first reading of
    each returned item with single spaces.

    Args:
        text: Text to convert.

    Returns:
        The space-joined pinyin string, or ``text`` unchanged if the
        conversion raises.
    """
    try:
        return ' '.join([p[0] for p in pinyin(text, style=Style.TONE3)])
    except Exception:
        # Best-effort fallback. ``Exception`` (not a bare ``except``) lets
        # KeyboardInterrupt and SystemExit propagate.
        return text
|
|
|
|
|
|
def pinyin_similarity(word1, word2):
    """Score how alike the pinyin of two words is.

    Args:
        word1: First word.
        word2: Second word.

    Returns:
        ``1.0`` when both words have the same ``get_pinyin`` string.
        Otherwise the number of positions at which the two pinyin strings
        hold the same character, divided by the length of the longer string.
    """
    first = get_pinyin(word1)
    second = get_pinyin(word2)

    if first == second:
        return 1.0

    longest = max(len(first), len(second))
    if longest <= 0:
        return 0

    # Simple position-by-position comparison; ``zip`` stops at the shorter
    # string, so the extra tail of the longer one only counts as mismatches.
    matches = sum(a == b for a, b in zip(first, second))
    return matches / longest
|
|
|
|
|
|
def ai_context_correct(text, clip_title="", all_clips=None):
    """Apply ``DIRECT_FIXES`` entries whose two sides sound alike.

    A ``wrong -> correct`` entry is applied only when ``wrong`` occurs in the
    text and ``pinyin_similarity(wrong, correct)`` is above 0.5.

    Args:
        text: Text to correct. Falsy values (``None``, ``""``) are returned
            unchanged instead of raising, matching ``apply_term_corrections``.
        clip_title: Clip title. Currently not used by this function.
        all_clips: Information about all clips. Currently not used by this
            function.

    Returns:
        The corrected text.
    """
    # Nothing to correct; also keeps ``None`` from reaching the ``in`` test.
    if not text:
        return text

    for wrong, correct in DIRECT_FIXES.items():
        # Only replace when the wrong term is present and its pinyin is
        # close enough to the correction's.
        if wrong in text and pinyin_similarity(wrong, correct) > 0.5:
            text = text.replace(wrong, correct)

    return text
|
|
|
|
|
|
def load_term_corrections_from_config(config):
    """Build the term-correction dictionary from a config mapping.

    Args:
        config: Config dictionary. Its ``'term_corrections'`` entry, if any,
            supplies ``wrong -> correct`` pairs. A missing or empty config,
            a missing key, and a null/empty value are all treated as
            "no extra rules".

    Returns:
        A new dictionary holding the two baseline rules plus the configured
        ones; configured entries override baseline entries with the same key.
    """
    # Baseline rules that are always present unless the config overrides them.
    merged = {
        "副点": "附点",
        "实质": "时值",
    }

    # ``.get`` without a default yields None for a missing key, and the key
    # may also be present with a null value; both must not reach ``update``.
    overrides = config.get('term_corrections') if config else None
    if overrides:
        merged.update(overrides)

    return merged