Initial commit: lesson-highlights generator
This commit is contained in:
+200
@@ -0,0 +1,200 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
LLM调用封装
|
||||
|
||||
统一管理火山方舟API调用,包含重试和错误处理
|
||||
"""
|
||||
|
||||
import os
|
||||
import time
|
||||
import logging
|
||||
from .constants import (
|
||||
DEFAULT_API_HOST, LLM_MODEL, LLM_TIMEOUT,
|
||||
LLM_MAX_RETRIES, LLM_TITLE_TIMEOUT, LLM_VALIDATE_TIMEOUT,
|
||||
get_api_key
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
class LLMClient:
    """Client for an OpenAI-style ``/chat/completions`` HTTP endpoint.

    :meth:`chat` performs the request with retries and error handling; the
    remaining methods build task-specific prompts on top of it. No method
    raises on LLM failure: each one falls back to a safe default instead.
    """

    def __init__(self, api_key=None, api_host=None):
        """Store the credentials and endpoint used by every request.

        Args:
            api_key: API key. When falsy, ``get_api_key()`` is used instead.
            api_host: Base URL of the API. When falsy, ``DEFAULT_API_HOST``
                is used instead.
        """
        # Explicit arguments take priority over the configured defaults.
        self.api_key = api_key or get_api_key()
        self.api_host = api_host or DEFAULT_API_HOST
        if not self.api_key:
            logger.warning("No API key configured - LLM calls will be skipped")

    def chat(self, prompt, max_tokens=500, timeout=LLM_TIMEOUT, *, allow_truncated=True):
        """Send a single-turn chat request and return the reply text.

        Up to ``LLM_MAX_RETRIES`` attempts are made, pausing one second
        between attempts. An HTTP 401 aborts immediately because retrying
        with the same key cannot succeed.

        Args:
            prompt: Text sent as the single ``user`` message.
            max_tokens: Upper bound on the number of generated tokens.
            timeout: Timeout passed to ``requests.post``.
            allow_truncated: When False, a reply whose ``finish_reason`` is
                ``"length"`` (cut off at ``max_tokens``) is treated as a
                failure and ``None`` is returned. Defaults to True, which
                returns the truncated text as before.

        Returns:
            The stripped reply text, or ``None`` when no API key is set,
            the key is rejected, or every attempt fails.
        """
        if not self.api_key:
            logger.info("LLM: No API key, skipping")
            return None

        # Tolerate a host configured with a trailing slash.
        url = f"{str(self.api_host).rstrip('/')}/chat/completions"
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        payload = {
            "model": LLM_MODEL,
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": max_tokens,
        }

        for attempt in range(LLM_MAX_RETRIES):
            try:
                response = requests.post(url, headers=headers, json=payload, timeout=timeout)
                # An invalid key will not become valid on retry.
                if response.status_code == 401:
                    logger.error("LLM: 401 Unauthorized - API key invalid, stopping immediately")
                    return None
                response.raise_for_status()
                result = response.json()

                choices = result.get("choices") or []
                if choices:
                    choice = choices[0]
                    # "message" or "content" may be JSON null; treat that as
                    # an empty reply rather than crashing on None.strip().
                    message = choice.get("message") or {}
                    content = (message.get("content") or "").strip()
                    if content:
                        if choice.get("finish_reason") == "length":
                            logger.warning("LLM: Reply truncated at max_tokens=%s", max_tokens)
                            if not allow_truncated:
                                # A larger budget is the only cure; retrying
                                # the same request would truncate again.
                                return None
                        return content
                    logger.warning("LLM: Empty content (attempt %d)", attempt + 1)
                else:
                    logger.warning("LLM: No choices in response (attempt %d)", attempt + 1)
            except requests.exceptions.Timeout:
                logger.warning("LLM: Timeout (attempt %d/%d)", attempt + 1, LLM_MAX_RETRIES)
            except Exception as e:
                # Deliberate best-effort boundary: any failure (HTTP error,
                # malformed JSON, unexpected payload shape) is logged and
                # retried instead of propagating to the caller.
                logger.error("LLM: Error - %s", e)

            # Pause before every retry, whatever the failure was, so an
            # empty response does not trigger an immediate re-request.
            if attempt < LLM_MAX_RETRIES - 1:
                time.sleep(1)

        return None

    def correct_title(self, transcript_text, original_title, all_titles=None):
        """Ask the LLM to correct a title extracted from slides.

        Args:
            transcript_text: Subtitle text; only the first 500 characters
                are included in the prompt.
            original_title: Title as extracted.
            all_titles: Optional list of all titles in the lesson; at most
                the first 20 are included in the prompt.

        Returns:
            The LLM's reply, or ``original_title`` when the call fails.
        """
        titles_str = ", ".join(all_titles[:20]) if all_titles else "无"

        prompt = f"""你是一个钢琴教学视频的标题验证专家。

PPT提取的标题:{original_title}

视频字幕内容:{transcript_text[:500] if transcript_text else "无"}

本节课所有标题:{titles_str}

【重要规则】
- 只有当你有90%以上把握认为原标题错误时,才输出纠正后的标题
- 如果原标题基本正确,即使不完美,也必须输出原标题
- 绝对不能输出与原标题完全不同概念的词
- 如果不确定,输出原标题

请直接输出标题,不要添加任何解释。"""

        result = self.chat(prompt, max_tokens=50, timeout=LLM_TITLE_TIMEOUT)
        return result if result else original_title

    def validate_content(self, transcript_text, title):
        """Ask the LLM whether the subtitle text is related to the title.

        Args:
            transcript_text: Subtitle text; only the first 300 characters
                are included in the prompt.
            title: Title the content is checked against.

        Returns:
            A ``(is_valid, reason)`` tuple. ``is_valid`` is False only when
            the reply says the content is unrelated. A failed call yields
            ``(True, "error")`` and an undecided reply ``(True, "uncertain")``,
            so content is kept unless the LLM explicitly rejects it.
        """
        prompt = f"""判断视频字幕内容是否与标题相关。

标题:{title}

字幕内容:{transcript_text[:300] if transcript_text else "无"}

判断标准:
- 内容讨论的主题与标题概念相关 = 相关
- 内容与标题无关(如广告、闲聊、无关话题)= 无关
- 无法判断 = 不确定

请直接输出:相关/无关/不确定"""

        result = self.chat(prompt, max_tokens=20, timeout=LLM_VALIDATE_TIMEOUT)
        if not result:
            return True, "error"

        # "不相关" is a negation that contains "相关" but not "无关"; without
        # this check it would fall through and be reported as valid.
        if "无关" in result or "不相关" in result:
            return False, result
        if "不确定" in result:
            return True, "uncertain"
        return True, result

    def full_text_correction(self, text, clip_title, knowledge_terms=None):
        """Ask the LLM to fix speech-recognition errors in a subtitle text.

        Args:
            text: Original subtitle text.
            clip_title: Title of the clip, given to the LLM as context.
            knowledge_terms: Optional list of knowledge-point terms; at most
                the first 20 are included in the prompt.

        Returns:
            The corrected text, or ``text`` unchanged when it is empty, the
            call fails, or the reply was cut off at the token limit.
        """
        # Nothing to correct; do not let the model invent a subtitle.
        if not text or not text.strip():
            return text

        knowledge_str = ", ".join(knowledge_terms[:20]) if knowledge_terms else "无"

        prompt = f"""你是一个钢琴教学视频的字幕纠错专家。

原始字幕:{text}

本节课片段标题:{clip_title}
本节课知识点:{knowledge_str}

请进行字幕纠错:
1. 修复语音识别错误(如"羞耻"→"休止","副点"→"附点","负点"→"附点")
2. 修复同音字错误
3. 保留原文的专业术语和表达方式
4. 不要改变原文的语气和意思

请直接输出纠错后的字幕,不要添加任何解释。"""

        # The reply is roughly as long as the input, so a fixed 500-token
        # budget truncates long subtitles. Scale the budget with the input
        # (a rough two-tokens-per-character bound) and reject replies that
        # were still cut off, so a partial correction never replaces the
        # full original text. If the budget is rejected by the API, the
        # failed call falls back to the original text as well.
        budget = max(500, 2 * len(text))
        result = self.chat(
            prompt,
            max_tokens=budget,
            timeout=LLM_TIMEOUT,
            allow_truncated=False,
        )
        return result if result else text
|
||||
|
||||
|
||||
# Module-wide LLM client instance, created on first request.
_llm_client = None


def get_llm_client():
    """Return the shared LLMClient, constructing it on the first call."""
    global _llm_client
    if _llm_client is not None:
        return _llm_client
    _llm_client = LLMClient()
    return _llm_client
|
||||
Reference in New Issue
Block a user