Initial commit: lesson-highlights generator
This commit is contained in:
@@ -0,0 +1,233 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
字幕纠错模块
|
||||
|
||||
包含术语纠正、异常检测、上下文纠错等功能
|
||||
"""
|
||||
|
||||
import re
|
||||
from pypinyin import pinyin, Style
|
||||
|
||||
# ============== Direct replacement dictionary ==============
# Format: "wrong term": "correct term"
DIRECT_FIXES = {
    "副点": "附点",
    "拍苻": "拍符",
    "演音": "延音",
    "调苻": "调号",
    "谱苻": "谱号",
    "负点": "附点",
    "阅历": "乐理",
    "音苻": "音符",
    "首位": "手位",
    "黑剑": "黑键",

    # Added later (based on experience from lesson1)
    "非联奏": "非连奏",
    "任谱": "认谱",
    "实谱": "识谱",
    "任音": "认音",
    "传人": "唱人",
    "修纸符": "休止符",
    "修纸": "休止",
    "修纸整小节": "休止整小节",
}
|
||||
|
||||
# ============== Note-name corrections ==============
# Maps a mis-transcribed token to the note name it should read as.
SONG_NAME_FIXES = {
    "Doramifasalasi": "Do Re Mi Fa So La Si",
    "刀": "do",
    "锐": "re",
    "咪": "mi",
    "发": "fa",
    "嗦": "so",
    "啦": "la",
    "西": "si",
}
|
||||
|
||||
# ============== Anomalous-word detection ==============
# Words whose presence in a transcript is reported by detect_anomalies().
ANOMALY_WORDS = [
    # possibly meant "休止"
    "羞耻",
    "休息",
    # related to "时值"
    "实质",
    "时值",
]
|
||||
|
||||
# ============== Music terminology ==============
MUSIC_TERMS = [
    "音符",
    "休止符",
    "拍子",
    "节拍",
    "节奏",
    "全分",
    "二分",
    "四分",
    "八分",
    "十六分",
    "三十二分",
    "附点",
    "调号",
    "谱号",
    "音名",
    "唱名",
    "手型",
    "手位",
    "支撑",
    "放松",
    "弹奏",
    "非连奏",
    "跳奏",
    "连奏",
]
|
||||
|
||||
# ============== Anomaly patterns ==============
# Ordered (regex pattern, replacement) pairs; applied in this order by
# apply_anomaly_fixes().
ANOMALY_PATTERNS = [
    (r'羞耻', '休止'),
    (r'实质音符', '时值音符'),
    (r'实质', '时值'),
]
|
||||
|
||||
|
||||
def apply_term_corrections(text, corrections=None):
    """Replace known mis-transcribed terms in ``text``.

    The module-level ``DIRECT_FIXES`` table is copied and then updated with
    ``corrections``, so an entry in ``corrections`` overrides a built-in
    entry that has the same key.

    Args:
        text: Original text. Falsy values (``None``, ``""``) are returned
            unchanged.
        corrections: Optional extra ``{wrong: correct}`` mapping.

    Returns:
        The text with every occurrence of each wrong term replaced.
    """
    if not text:
        return text

    # Merge into a copy so the module-level table is never mutated.
    all_fixes = dict(DIRECT_FIXES)
    if corrections:
        all_fixes.update(corrections)

    # Longest wrong terms first, so a short key cannot rewrite part of a
    # longer key before the longer one has had a chance to match.
    ordered = sorted(
        all_fixes.items(), key=lambda item: len(item[0]), reverse=True
    )
    for wrong, correct in ordered:
        if not wrong:
            # str.replace("", x) would insert x between every character.
            continue
        text = text.replace(wrong, correct)

    return text
|
||||
|
||||
|
||||
def apply_song_name_fixes(text):
    """Apply the note-name corrections from ``SONG_NAME_FIXES``.

    Every key of ``SONG_NAME_FIXES`` found in ``text`` is replaced by its
    value, in the table's iteration order.

    NOTE(review): several keys are single characters (e.g. "发", "西") and
    are replaced wherever they occur, including inside unrelated words.
    Confirm this is intended.

    Args:
        text: Text to correct. Falsy values (``None``, ``""``) are returned
            unchanged.

    Returns:
        The corrected text.
    """
    if not text:
        return text

    for wrong, correct in SONG_NAME_FIXES.items():
        text = text.replace(wrong, correct)
    return text
|
||||
|
||||
|
||||
def apply_anomaly_fixes(text):
    """Apply the regex substitutions listed in ``ANOMALY_PATTERNS``.

    The ``(pattern, replacement)`` pairs are applied in list order, each on
    the result of the previous one.

    Args:
        text: Text to correct. Falsy values (``None``, ``""``) are returned
            unchanged, because ``re.sub`` would raise ``TypeError`` on
            ``None``.

    Returns:
        The corrected text.
    """
    if not text:
        return text

    for pattern, replacement in ANOMALY_PATTERNS:
        text = re.sub(pattern, replacement, text)
    return text
|
||||
|
||||
|
||||
def apply_all_corrections(text, extra_corrections=None):
    """Run every correction stage on ``text``.

    Stages, in order: ``apply_term_corrections`` (with
    ``extra_corrections``), ``apply_song_name_fixes``, then
    ``apply_anomaly_fixes``.

    Args:
        text: Original text. Falsy values (``None``, ``""``) are returned
            unchanged.
        extra_corrections: Optional extra ``{wrong: correct}`` mapping
            (from config) passed to the term-correction stage.

    Returns:
        The corrected text.
    """
    # The first stage hands falsy input straight back, so without this
    # check a None would flow into the later stages.
    if not text:
        return text

    text = apply_term_corrections(text, extra_corrections)
    text = apply_song_name_fixes(text)
    return apply_anomaly_fixes(text)
|
||||
|
||||
|
||||
def detect_anomalies(text, knowledge_terms=None):
    """Report suspicious content in ``text``.

    Args:
        text: Text to inspect. Falsy values (``None``, ``""``) yield an
            empty result.
        knowledge_terms: Optional list of knowledge-point terms. When given,
            a text longer than 10 characters that contains none of them
            (case-insensitive) is flagged.

    Returns:
        A list holding every entry of ``ANOMALY_WORDS`` found in ``text``,
        followed by the marker ``"NO_KNOWLEDGE_TERM"`` when the
        knowledge-term check fails.
    """
    if not text:
        return []

    anomalies = [word for word in ANOMALY_WORDS if word in text]

    # Only texts longer than 10 characters are expected to mention a term;
    # checking the length first skips the scan for short texts.
    if knowledge_terms and len(text) > 10:
        lowered = text.lower()
        if not any(term.lower() in lowered for term in knowledge_terms):
            anomalies.append("NO_KNOWLEDGE_TERM")

    return anomalies
|
||||
|
||||
|
||||
def get_pinyin(text):
    """Return the pinyin of ``text`` as a space-separated string.

    Calls ``pinyin(text, style=Style.TONE3)`` and joins the first element
    of each returned item with single spaces.

    Args:
        text: Text to convert.

    Returns:
        The joined pinyin string, or ``text`` itself when the conversion
        raises.
    """
    try:
        return ' '.join(reading[0] for reading in pinyin(text, style=Style.TONE3))
    except Exception:
        # Best-effort fallback. Exception (rather than a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        return text
|
||||
|
||||
|
||||
def pinyin_similarity(word1, word2):
    """Score how alike two words are by their pinyin strings.

    Both words are converted with ``get_pinyin``. Identical pinyin scores
    1.0. Otherwise the score is the number of positions at which the two
    pinyin strings hold the same character, divided by the length of the
    longer string.

    Args:
        word1: First word.
        word2: Second word.

    Returns:
        Similarity score between 0 and 1.
    """
    first, second = get_pinyin(word1), get_pinyin(word2)
    if first == second:
        return 1.0

    # Position-by-position comparison; zip stops at the shorter string.
    matches = 0
    for left, right in zip(first, second):
        if left == right:
            matches += 1

    longest = max(len(first), len(second))
    if longest > 0:
        return matches / longest
    return 0
|
||||
|
||||
|
||||
def ai_context_correct(text, clip_title="", all_clips=None):
    """Apply ``DIRECT_FIXES`` entries whose two sides sound alike.

    For each ``wrong -> correct`` pair in ``DIRECT_FIXES`` that occurs in
    ``text``, the replacement is made only when
    ``pinyin_similarity(wrong, correct)`` is greater than 0.5.

    Args:
        text: Text to correct. Falsy values (``None``, ``""``) are returned
            unchanged.
        clip_title: Clip title. Accepted for interface compatibility; not
            used by the current implementation.
        all_clips: Information about all clips. Accepted for interface
            compatibility; not used by the current implementation.

    Returns:
        The corrected text.
    """
    if not text:
        return text

    for wrong, correct in DIRECT_FIXES.items():
        if wrong not in text:
            continue
        # Replace only when the pair is close enough in pinyin.
        if pinyin_similarity(wrong, correct) > 0.5:
            text = text.replace(wrong, correct)

    return text
|
||||
|
||||
|
||||
def load_term_corrections_from_config(config):
    """Build the term-correction dictionary from a config mapping.

    Starts from two built-in rules and overlays the mapping stored under
    the ``'term_corrections'`` key, so config entries override the
    built-ins on key collisions.

    Args:
        config: Configuration dictionary. ``None``, a missing
            ``'term_corrections'`` key, or a key whose value is ``None`` or
            empty all yield just the built-in rules.

    Returns:
        A new ``{wrong: correct}`` dictionary.
    """
    # Baseline rules that must be present even if the config omits them.
    merged = {
        "副点": "附点",
        "实质": "时值",
    }

    # `or {}` covers a key that is present but null (e.g. an empty YAML
    # entry), which dict.update() would otherwise reject with TypeError.
    overrides = (config or {}).get('term_corrections') or {}
    merged.update(overrides)
    return merged
|
||||
Reference in New Issue
Block a user