# skills/image-service/scripts/vision_analyzer_simple.py

#!/usr/bin/env python3
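"""
Vision analyzer - simplified version that avoids Unicode and configuration issues.

Sends an image together with a text prompt to a remote vision model and prints
the model's answer.

NOTE(review): this header originally described the backend as an Alibaba Cloud
vision model, but the code reads VOLCENGINE_* settings and defaults to the
"doubao-seed-code" model - confirm which provider is actually intended.
"""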

import base64
import json
import os
import sys
from pathlib import Path
from typing import Dict, Any, Optional
import httpx


class VisionAnalyzer:
    """Analyze an image by sending it, with a text prompt, to a remote vision model.

    Connection settings are read from ``load_config_simple.load_config()``:

    * ``VOLCENGINE_API_KEY`` - bearer token; falls back to the environment
      variable of the same name when the config has no (or an empty) value.
    * ``VOLCENGINE_BASE_URL`` - API base URL.
    * ``VISION_MODEL`` - model name, ``"doubao-seed-code"`` by default.

    NOTE(review): the default base URL points at Volcengine Ark, while the
    request path and payload built in ``analyze`` look like the DashScope
    multimodal-generation format - confirm the two actually match.
    """

    # Prompt text sent to the model for each predefined analysis mode.
    ANALYSIS_MODES = {
        "describe": "请详细描述这张图片的内容，包括：人物、场景、物品、颜色、布局等所有细节。",
        "ocr": "请仔细识别这张图片中的所有文字内容，按照文字在图片中的位置顺序输出。如果是中文，请保持原文输出。",
        "chart": "请分析这张图表的内容，包括：图表类型、数据趋势、关键数据点、标题标签、以及数据的结论或洞察。",
        "fashion": "请分析这张图片中人物的穿搭，包括：服装款式、颜色搭配、配饰、整体风格等。",
        "product": "请分析这张产品图片，包括：产品类型、外观特征、功能特点、品牌信息等。",
        "scene": "请描述这张图片的场景，包括：地点、环境、氛围、时间（白天/夜晚）等。",
        "custom": "用户自定义问题",
    }

    def __init__(self):
        """Load configuration and resolve the API key, base URL and model.

        Raises:
            ValueError: if no API key is found in the config or environment.
        """
        config = self._load_config()

        # Credentials must come from configuration or the environment, never
        # from a literal in source control. A key that was ever committed
        # here should be considered leaked and rotated.
        self.api_key = config.get("VOLCENGINE_API_KEY") or os.environ.get(
            "VOLCENGINE_API_KEY"
        )
        self.base_url = config.get(
            "VOLCENGINE_BASE_URL", "https://ark.cn-beijing.volces.com/api/coding/v3"
        )
        self.model = config.get("VISION_MODEL", "doubao-seed-code")

        if not self.api_key:
            raise ValueError("API key is required")

    def _load_config(self) -> Dict[str, str]:
        """Return the settings produced by ``load_config_simple.load_config()``.

        The directory containing this script is added to ``sys.path`` so the
        sibling ``load_config_simple`` module can be imported.
        """
        script_dir = str(Path(__file__).parent)
        # Avoid growing sys.path with a duplicate entry on every instantiation.
        if script_dir not in sys.path:
            sys.path.append(script_dir)
        from load_config_simple import load_config

        return load_config()

    def encode_image(self, image_path: Path) -> str:
        """Read the file at *image_path* and return its bytes as a base64 string."""
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

    def analyze(
        self, image_path: str, mode: str = "describe", custom_query: Optional[str] = None
    ) -> str:
        """Send the image and a prompt to the model and return its answer.

        Args:
            image_path: Path of the image file to analyze.
            mode: Key into ``ANALYSIS_MODES``; unknown keys fall back to the
                ``"describe"`` prompt.
            custom_query: Prompt used verbatim when ``mode`` is ``"custom"``.

        Returns:
            ``result["output"]["choices"][0]["message"]["content"]`` from the
            JSON response.

        Raises:
            FileNotFoundError: if *image_path* does not exist.
            Exception: any error raised while sending the request or reading
                the response is printed and then re-raised unchanged.
        """
        image_path = Path(image_path)
        if not image_path.exists():
            raise FileNotFoundError(f"Image file not found: {image_path}")

        # Pick the prompt. NOTE: mode "custom" with no query falls through to
        # the placeholder text stored under ANALYSIS_MODES["custom"].
        if mode == "custom" and custom_query:
            prompt = custom_query
        else:
            prompt = self.ANALYSIS_MODES.get(mode, self.ANALYSIS_MODES["describe"])

        # The image travels inline in the JSON body as base64 text.
        image_base64 = self.encode_image(image_path)

        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }

        payload = {
            "model": self.model,
            "input": {
                "messages": [
                    {
                        "role": "user",
                        "content": [{"image": image_base64}, {"text": prompt}],
                    }
                ]
            },
            "parameters": {"max_tokens": 2000},
        }

        try:
            response = httpx.post(
                f"{self.base_url}/services/aigc/multimodal-generation/generation",
                headers=headers,
                json=payload,
                timeout=120.0,
            )
            # Turn HTTP 4xx/5xx responses into exceptions.
            response.raise_for_status()

            result = response.json()
            return result["output"]["choices"][0]["message"]["content"]

        except Exception as e:
            # Report the failure, then let the caller decide how to handle it.
            print(f"API request failed: {e}")
            raise


def main():
    """Command-line entry point.

    Usage: ``vision_analyzer_simple.py <image_path> [-m mode] [-q query]``

    ``-m`` selects an analysis mode; ``-q`` supplies a custom prompt and
    switches the mode to ``"custom"``. Unrecognized arguments are ignored.
    The analysis result is printed on success. On a missing argument, a flag
    without a value, or any analysis error, a message is printed and the
    process exits with status 1.
    """
    usage = "Usage: python vision_analyzer_simple.py <image_path> [-m mode] [-q query]"

    args = sys.argv[1:]
    if not args:
        print(usage)
        sys.exit(1)

    image_path = args[0]
    mode = "describe"
    custom_query = None

    # Hand-rolled option parsing: flags are processed left to right, so a
    # later -m overrides the "custom" mode set by an earlier -q.
    i = 1
    while i < len(args):
        flag = args[i]
        if flag in ("-m", "-q"):
            # A trailing flag with no value would otherwise raise IndexError.
            if i + 1 >= len(args):
                print(f"Error: option {flag} requires a value")
                print(usage)
                sys.exit(1)
            value = args[i + 1]
            if flag == "-m":
                mode = value
            else:
                custom_query = value
                mode = "custom"
            i += 2
        else:
            i += 1

    try:
        analyzer = VisionAnalyzer()
        result = analyzer.analyze(image_path, mode, custom_query)
        print(result)
    except Exception as e:
        # Top-level boundary: report any failure and signal it via exit code.
        print(f"Error: {e}")
        sys.exit(1)


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()