Initial commit: lesson-highlights generator

2026-05-03 03:07:22 +08:00
commit 9e62247a60
55 changed files with 6189 additions and 0 deletions
@@ -0,0 +1,200 @@
+# -*- coding: utf-8 -*-
+"""
+LLM call wrapper.
+
+Centralizes calls to the Volcengine Ark API, including retries and error handling.
+"""
+
+import os
+import time
+import logging
+from .constants import (
+    DEFAULT_API_HOST, LLM_MODEL, LLM_TIMEOUT,
+    LLM_MAX_RETRIES, LLM_TITLE_TIMEOUT, LLM_VALIDATE_TIMEOUT,
+    get_api_key
+)
+
+logger = logging.getLogger(__name__)
+
+import requests
+
+
+class LLMClient:
+    """LLM客户端封装"""
+
+    def __init__(self, api_key=None, api_host=None):
+        # 优先使用传入的参数，其次使用环境变量
+        self.api_key = api_key or get_api_key()
+        self.api_host = api_host or DEFAULT_API_HOST
+        if not self.api_key:
+            logger.warning("No API key configured - LLM calls will be skipped")
+
+    def chat(self, prompt, max_tokens=500, timeout=LLM_TIMEOUT):
+        """
+        发送聊天请求到LLM
+
+        Args:
+            prompt: 提示词
+            max_tokens: 最大token数
+            timeout: 超时时间
+
+        Returns:
+            LLM回复文本，失败返回None
+        """
+        if not self.api_key:
+            logger.info("LLM: No API key, skipping")
+            return None
+
+        url = f"{self.api_host}/chat/completions"
+        headers = {
+            "Authorization": f"Bearer {self.api_key}",
+            "Content-Type": "application/json"
+        }
+        payload = {
+            "model": LLM_MODEL,
+            "messages": [{"role": "user", "content": prompt}],
+            "max_tokens": max_tokens
+        }
+
+        for attempt in range(LLM_MAX_RETRIES):
+            try:
+                response = requests.post(url, headers=headers, json=payload, timeout=timeout)
+                # 401错误立即停止，不重试
+                if response.status_code == 401:
+                    logger.error(f"LLM: 401 Unauthorized - API key invalid, stopping immediately")
+                    return None
+                response.raise_for_status()
+                result = response.json()
+
+                choices = result.get("choices", [])
+                if not choices:
+                    logger.warning(f"LLM: No choices in response (attempt {attempt+1})")
+                    continue
+
+                content = choices[0].get("message", {}).get("content", "").strip()
+                if content:
+                    return content
+
+                logger.warning(f"LLM: Empty content (attempt {attempt+1})")
+
+            except requests.exceptions.Timeout:
+                logger.warning(f"LLM: Timeout (attempt {attempt+1}/{LLM_MAX_RETRIES})")
+                if attempt < LLM_MAX_RETRIES - 1:
+                    time.sleep(1)
+            except Exception as e:
+                logger.error(f"LLM: Error - {e}")
+                if attempt < LLM_MAX_RETRIES - 1:
+                    time.sleep(1)
+
+        return None
+
+    def correct_title(self, transcript_text, original_title, all_titles=None):
+        """
+        使用LLM纠正标题
+
+        Args:
+            transcript_text: 字幕文本
+            original_title: 原始标题
+            all_titles: 所有标题列表
+
+        Returns:
+            纠正后的标题
+        """
+        titles_str = ", ".join(all_titles[:20]) if all_titles else "无"
+
+        prompt = f"""你是一个钢琴教学视频的标题验证专家。
+
+PPT提取的标题：{original_title}
+
+视频字幕内容：{transcript_text[:500] if transcript_text else "无"}
+
+本节课所有标题：{titles_str}
+
+【重要规则】
+- 只有当你有90%以上把握认为原标题错误时，才输出纠正后的标题
+- 如果原标题基本正确，即使不完美，也必须输出原标题
+- 绝对不能输出与原标题完全不同概念的词
+- 如果不确定，输出原标题
+
+请直接输出标题，不要添加任何解释。"""
+
+        result = self.chat(prompt, max_tokens=50, timeout=LLM_TITLE_TIMEOUT)
+        return result if result else original_title
+
+    def validate_content(self, transcript_text, title):
+        """
+        使用LLM验证内容是否与标题相关
+
+        Args:
+            transcript_text: 字幕文本
+            title: 标题
+
+        Returns:
+            (is_valid: bool, reason: str)
+        """
+        prompt = f"""判断视频字幕内容是否与标题相关。
+
+标题：{title}
+
+字幕内容：{transcript_text[:300] if transcript_text else "无"}
+
+判断标准：
+- 内容讨论的主题与标题概念相关 = 相关
+- 内容与标题无关（如广告、闲聊、无关话题）= 无关
+- 无法判断 = 不确定
+
+请直接输出：相关/无关/不确定"""
+
+        result = self.chat(prompt, max_tokens=20, timeout=LLM_VALIDATE_TIMEOUT)
+        if not result:
+            return True, "error"
+
+        if "无关" in result:
+            return False, result
+        elif "不确定" in result:
+            return True, "uncertain"
+        return True, result
+
+    def full_text_correction(self, text, clip_title, knowledge_terms=None):
+        """
+        使用LLM进行全文字幕纠错
+
+        Args:
+            text: 原始字幕
+            clip_title: 片段标题
+            knowledge_terms: 知识点列表
+
+        Returns:
+            纠错后的字幕
+        """
+        knowledge_str = ", ".join(knowledge_terms[:20]) if knowledge_terms else "无"
+
+        prompt = f"""你是一个钢琴教学视频的字幕纠错专家。
+
+原始字幕：{text}
+
+本节课片段标题：{clip_title}
+本节课知识点：{knowledge_str}
+
+请进行字幕纠错：
+1. 修复语音识别错误（如"羞耻"→"休止"，"副点"→"附点"，"负点"→"附点"）
+2. 修复同音字错误
+3. 保留原文的专业术语和表达方式
+4. 不要改变原文的语气和意思
+
+请直接输出纠错后的字幕，不要添加任何解释。"""
+
+        result = self.chat(prompt, max_tokens=500, timeout=LLM_TIMEOUT)
+        return result if result else text
+
+
+# 全局LLM客户端实例
+_llm_client = None
+
+
+def get_llm_client():
+    """获取LLM客户端单例"""
+    global _llm_client
+    if _llm_client is None:
+        _llm_client = LLMClient()
+    return _llm_client