新增16个AI技能：包含图像生成、视频剪辑、数据分析、智能查询等功能模块

2026-02-13 20:18:38 +08:00
parent 456cd45de4
commit 8200a17176
154 changed files with 14585 additions and 1 deletions
--- a/.opencode/skills/image-service/scripts/image_to_text.py
+++ b/.opencode/skills/image-service/scripts/image_to_text.py
@@ -0,0 +1,287 @@
+#!/usr/bin/env python3
+"""
+图生文脚本 (Image-to-Text) - 视觉识别
+使用 Qwen2.5-VL 模型分析图片内容并生成文字描述
+
+Author: 翟星人
+"""
+
+import httpx
+import base64
+import json
+import os
+from typing import Dict, Any, Optional, Union, List
+from pathlib import Path
+
+
+class ImageToTextAnalyzer:
+    """Image-to-text analyzer (visual recognition).
+
+    Sends a text prompt together with an image to
+    ``{base_url}/chat/completions`` and returns the text content of the
+    first choice found in the JSON response.
+    """
+
+    # Predefined analysis modes: mode name -> prompt text sent to the model.
+    ANALYSIS_MODES = {
+        "describe": "请详细描述这张图片的内容，包括：人物、场景、物品、颜色、布局等所有细节。",
+        "ocr": "请仔细识别这张图片中的所有文字内容，按照文字在图片中的位置顺序输出。如果是中文，请保持原文输出。",
+        "chart": "请分析这张图表的内容，包括：图表类型、数据趋势、关键数据点、标题标签、以及数据的结论或洞察。",
+        "fashion": "请分析这张图片中人物的穿搭，包括：服装款式、颜色搭配、配饰、整体风格等。",
+        "product": "请分析这张产品图片，包括：产品类型、外观特征、功能特点、品牌信息等。",
+        "scene": "请描述这张图片的场景，包括：地点、环境、氛围、时间（白天/夜晚）等。"
+    }
+
+    def __init__(self, config: Optional[Dict[str, str]] = None):
+        """Initialize the analyzer.
+
+        Args:
+            config: Mapping with ``api_key``, ``base_url`` and ``model``.
+                The upper-case ``VISION_*`` / ``IMAGE_*`` names are accepted
+                as fallbacks. When omitted, the configuration is read from
+                the settings file and environment variables.
+
+        Raises:
+            ValueError: If no API key or no base URL could be determined.
+        """
+        if config is None:
+            config = self._load_config()
+
+        self.api_key = config.get('api_key') or config.get('VISION_API_KEY') or config.get('IMAGE_API_KEY')
+        self.base_url = config.get('base_url') or config.get('VISION_API_BASE_URL') or config.get('IMAGE_API_BASE_URL')
+        self.model = config.get('model') or config.get('VISION_MODEL') or 'qwen2.5-vl-72b-instruct'
+
+        if not self.api_key or not self.base_url:
+            raise ValueError("缺少必要的 API 配置：api_key 和 base_url")
+
+    def _load_config(self) -> Dict[str, str]:
+        """Load configuration from the settings file and environment variables.
+
+        ``config/settings.json`` (located one directory above this script's
+        directory) is read first: a non-empty ``vision_api`` section is used
+        if present, otherwise ``image_api`` supplies the key and base URL.
+        Environment variables, when set, override the values from the file.
+
+        Returns:
+            Dict with the keys ``api_key``, ``base_url`` and ``model``
+            (values may be ``None`` when nothing is configured).
+        """
+        config = {}
+
+        # Try the settings file first.
+        config_path = Path(__file__).parent.parent / 'config' / 'settings.json'
+        if config_path.exists():
+            with open(config_path, 'r', encoding='utf-8') as f:
+                settings = json.load(f)
+            # Prefer the dedicated vision_api section.
+            vision_config = settings.get('vision_api', {})
+            if vision_config:
+                config['api_key'] = vision_config.get('key')
+                config['base_url'] = vision_config.get('base_url')
+                config['model'] = vision_config.get('model')
+            else:
+                # Fall back to the image_api section (it carries no model).
+                api_config = settings.get('image_api', {})
+                config['api_key'] = api_config.get('key')
+                config['base_url'] = api_config.get('base_url')
+
+        # Environment variables take precedence over the file.
+        config['api_key'] = os.getenv('VISION_API_KEY', os.getenv('IMAGE_API_KEY', config.get('api_key')))
+        config['base_url'] = os.getenv('VISION_API_BASE_URL', os.getenv('IMAGE_API_BASE_URL', config.get('base_url')))
+        config['model'] = os.getenv('VISION_MODEL', config.get('model', 'qwen2.5-vl-72b-instruct'))
+
+        return config
+
+    @staticmethod
+    def image_to_base64(image_path: str) -> str:
+        """Encode an image file as a base64 data URL.
+
+        Args:
+            image_path: Path of the image file.
+
+        Returns:
+            ``data:<mime>;base64,<payload>``. The MIME type is chosen from the
+            file suffix; unknown suffixes are labelled ``image/png``.
+
+        Raises:
+            FileNotFoundError: If ``image_path`` does not exist.
+        """
+        path = Path(image_path)
+        if not path.exists():
+            raise FileNotFoundError(f"图片文件不存在: {image_path}")
+
+        # Map the (case-insensitive) suffix to a MIME type.
+        mime_types = {
+            '.jpg': 'image/jpeg',
+            '.jpeg': 'image/jpeg',
+            '.png': 'image/png',
+            '.gif': 'image/gif',
+            '.webp': 'image/webp'
+        }
+        mime_type = mime_types.get(path.suffix.lower(), 'image/png')
+
+        b64_str = base64.b64encode(path.read_bytes()).decode('utf-8')
+        return f"data:{mime_type};base64,{b64_str}"
+
+    def _resolve_image_url(self, image: Union[str, bytes, Path]) -> str:
+        """Turn an image argument into the URL placed in the request payload.
+
+        Args:
+            image: A ``Path``, a string (file path, ``http`` URL, data URL or
+                bare base64) or raw image bytes.
+
+        Returns:
+            The URL / data URL to send.
+
+        Raises:
+            OSError: If a file cannot be found or read.
+            ValueError: If a string is neither an existing file, a URL,
+                a data URL, nor decodable base64.
+        """
+        if isinstance(image, Path):
+            return self.image_to_base64(str(image))
+
+        if not isinstance(image, str):
+            # Raw bytes: the real format is unknown here, so label it PNG.
+            return f"data:image/png;base64,{base64.b64encode(image).decode('utf-8')}"
+
+        if os.path.isfile(image):
+            return self.image_to_base64(image)
+        if image.startswith(('data:', 'http')):
+            return image
+
+        # Anything else has to be bare base64. Verify that before sending so
+        # that a mistyped file path is reported instead of being posted to
+        # the API as bogus image data. Whitespace and missing padding are
+        # tolerated during the check; the string itself is sent unchanged.
+        compact = "".join(image.split())
+        try:
+            if not compact:
+                raise ValueError("empty input")
+            base64.b64decode(compact + "=" * (-len(compact) % 4), validate=True)
+        except ValueError as exc:
+            raise ValueError(
+                f"既不是存在的图片文件、URL，也不是有效的 base64 数据: {image[:80]}"
+            ) from exc
+        return f"data:image/png;base64,{image}"
+
+    def analyze(
+        self,
+        image: Union[str, bytes],
+        prompt: Optional[str] = None,
+        mode: str = "describe",
+        max_tokens: int = 2000,
+        temperature: float = 0.7
+    ) -> Dict[str, Any]:
+        """Analyze an image and return the generated text.
+
+        Args:
+            image: Image file path (``str`` or ``Path``), URL, data URL,
+                bare base64 string, or raw image bytes.
+            prompt: Custom prompt; when given, ``mode`` is only echoed back
+                in the result and does not select the prompt.
+            mode: One of the ``ANALYSIS_MODES`` keys. Unknown modes fall back
+                to the ``describe`` prompt.
+            max_tokens: Value sent as ``max_tokens`` in the request.
+            temperature: Value sent as ``temperature`` in the request.
+
+        Returns:
+            On success: ``{"success": True, "content", "mode", "usage"}``.
+            On failure: ``{"success": False, "error", "detail"}``.
+        """
+        # Pick the prompt: an explicit prompt wins over the mode preset.
+        if prompt is None:
+            prompt = self.ANALYSIS_MODES.get(mode, self.ANALYSIS_MODES["describe"])
+
+        # Resolve the image; report unusable input in the same dict shape
+        # used for request failures.
+        try:
+            image_url = self._resolve_image_url(image)
+        except (OSError, ValueError) as e:
+            return {
+                "success": False,
+                "error": "图片输入无效",
+                "detail": str(e)
+            }
+
+        # Build the request body.
+        payload = {
+            "model": self.model,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": prompt
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": image_url
+                            }
+                        }
+                    ]
+                }
+            ],
+            "max_tokens": max_tokens,
+            "temperature": temperature
+        }
+
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {self.api_key}"
+        }
+
+        # Strip trailing slashes so a base URL such as ".../v1/" does not
+        # produce ".../v1//chat/completions".
+        endpoint = f"{self.base_url.rstrip('/')}/chat/completions"
+
+        try:
+            with httpx.Client(timeout=120.0) as client:
+                response = client.post(
+                    endpoint,
+                    headers=headers,
+                    json=payload
+                )
+                response.raise_for_status()
+                result = response.json()
+
+                # Extract the text of the first choice.
+                content = result.get("choices", [{}])[0].get("message", {}).get("content", "")
+
+                return {
+                    "success": True,
+                    "content": content,
+                    "mode": mode,
+                    "usage": result.get("usage", {})
+                }
+
+        except httpx.HTTPStatusError as e:
+            return {
+                "success": False,
+                "error": f"HTTP 错误: {e.response.status_code}",
+                "detail": str(e)
+            }
+        except Exception as e:
+            # Boundary handler: any other failure (network, malformed
+            # response, ...) is reported to the caller instead of raised.
+            return {
+                "success": False,
+                "error": "分析失败",
+                "detail": str(e)
+            }
+
+    def describe(self, image: Union[str, bytes]) -> Dict[str, Any]:
+        """Run ``analyze`` with the general description prompt."""
+        return self.analyze(image, mode="describe")
+
+    def ocr(self, image: Union[str, bytes]) -> Dict[str, Any]:
+        """Run ``analyze`` with the text-recognition (OCR) prompt."""
+        return self.analyze(image, mode="ocr")
+
+    def analyze_chart(self, image: Union[str, bytes]) -> Dict[str, Any]:
+        """Run ``analyze`` with the chart-analysis prompt."""
+        return self.analyze(image, mode="chart")
+
+    def analyze_fashion(self, image: Union[str, bytes]) -> Dict[str, Any]:
+        """Run ``analyze`` with the outfit-analysis prompt."""
+        return self.analyze(image, mode="fashion")
+
+    def analyze_product(self, image: Union[str, bytes]) -> Dict[str, Any]:
+        """Run ``analyze`` with the product-analysis prompt."""
+        return self.analyze(image, mode="product")
+
+    def analyze_scene(self, image: Union[str, bytes]) -> Dict[str, Any]:
+        """Run ``analyze`` with the scene-description prompt."""
+        return self.analyze(image, mode="scene")
+
+    def batch_analyze(
+        self,
+        images: List[str],
+        mode: str = "describe"
+    ) -> List[Dict[str, Any]]:
+        """Analyze several images one after another.
+
+        Args:
+            images: Image inputs, each handled as in ``analyze``.
+            mode: Analysis mode applied to every image.
+
+        Returns:
+            One result dict per input, in input order, each extended with an
+            ``"image"`` key holding the corresponding input value.
+        """
+        results = []
+        for image in images:
+            result = self.analyze(image, mode=mode)
+            result["image"] = image
+            results.append(result)
+        return results
+
+
+def main():
+    """Command-line entry point: analyze one image and print the result.
+
+    Parses the image argument plus the optional mode, prompt and token
+    limit, runs a single analysis and prints either the generated text with
+    token usage or the error details. Exits with status 1 when the analysis
+    fails so that calling scripts can detect the failure.
+    """
+    import argparse
+    import sys
+
+    parser = argparse.ArgumentParser(description='图生文分析工具（视觉识别）')
+    parser.add_argument('image', help='输入图片路径')
+    # Derive the choices from the analyzer so the two cannot drift apart.
+    parser.add_argument('-m', '--mode', default='describe',
+                       choices=list(ImageToTextAnalyzer.ANALYSIS_MODES),
+                       help='分析模式')
+    parser.add_argument('-p', '--prompt', help='自定义分析提示词')
+    parser.add_argument('--max-tokens', type=int, default=2000, help='最大输出 token 数')
+
+    args = parser.parse_args()
+
+    analyzer = ImageToTextAnalyzer()
+    result = analyzer.analyze(
+        image=args.image,
+        prompt=args.prompt,
+        mode=args.mode,
+        max_tokens=args.max_tokens
+    )
+
+    if not result["success"]:
+        print(f"分析失败: {result['error']}")
+        print(f"详情: {result.get('detail', 'N/A')}")
+        sys.exit(1)
+
+    # The response may carry "usage": null; treat that like a missing entry.
+    usage = result.get('usage') or {}
+    print(f"\n=== 分析结果 ({result['mode']}) ===\n")
+    print(result["content"])
+    print("\n=== Token 使用 ===")
+    print(f"输入: {usage.get('prompt_tokens', 'N/A')}")
+    print(f"输出: {usage.get('completion_tokens', 'N/A')}")
+
+
+# Run the command-line interface only when this file is executed as a script,
+# not when it is imported as a module.
+if __name__ == "__main__":
+    main()