skills/vision-analyze.py

#!/usr/bin/env python3
"""
标准化视觉分析脚本 - DashScope OpenAI兼容模式
使用正确的API配置，避免Google Vision API依赖
"""

import base64
import json
import httpx
import os
from pathlib import Path

# Resolve API settings. Precedence for each value:
#   1. config/settings.json ("vision_api" section) next to this file
#   2. environment variables (VOLCENGINE_API_KEY / VOLCENGINE_BASE_URL)
#   3. built-in defaults
# Empty or null config values count as "not set", so a settings.json without a
# key no longer hides a key supplied through the environment.
CONFIG_PATH = Path(__file__).parent / "config" / "settings.json"
_FALLBACK_BASE_URL = "https://ark.cn-beijing.volces.com/api/coding/v3"
_FALLBACK_MODEL = "doubao-seed-code"

_vision_settings = {}
if CONFIG_PATH.exists():
    with open(CONFIG_PATH, "r", encoding="utf-8") as f:
        config = json.load(f)
    # A missing or explicitly null "vision_api" section is treated as empty.
    _vision_settings = config.get("vision_api") or {}

API_KEY = _vision_settings.get("key") or os.getenv("VOLCENGINE_API_KEY")
BASE_URL = (
    _vision_settings.get("base_url")
    or os.getenv("VOLCENGINE_BASE_URL")
    or _FALLBACK_BASE_URL
)
DEFAULT_MODEL = _vision_settings.get("model") or _FALLBACK_MODEL

# Fail fast at import time: nothing in this module works without credentials.
if not API_KEY:
    raise ValueError(
        "No API Key found. Please configure in config/settings.json or set VOLCENGINE_API_KEY"
    )


def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes as a base64 string."""
    with open(image_path, mode="rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")


def analyze_image(image_path, prompt="请识别这张图片中的所有文字内容", model=None):
    """Send an image plus a text prompt to the vision model and return its reply.

    The image is read from disk, base64-encoded, and embedded as a data URL in
    a chat-completions request posted to ``{BASE_URL}/chat/completions``.

    Args:
        image_path: Path of the image file to analyze.
        prompt: Text instruction sent alongside the image.
        model: Model name; ``DEFAULT_MODEL`` is used when this is falsy.

    Returns:
        The ``content`` of the first choice's message, or ``""`` when the
        ``choices`` / ``message`` / ``content`` keys are absent.

    Raises:
        FileNotFoundError: If ``image_path`` does not exist.
        RuntimeError: If the request fails or the response cannot be
            processed; the underlying exception is chained as ``__cause__``.
    """
    import mimetypes

    if not os.path.exists(image_path):
        raise FileNotFoundError(f"Image file not found: {image_path}")

    image_base64 = encode_image(image_path)
    model = model or DEFAULT_MODEL

    # Label the data URL with the file's real media type instead of always
    # claiming JPEG; fall back to JPEG when the extension is unknown or does
    # not map to an image type.
    mime_type, _ = mimetypes.guess_type(str(image_path))
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    headers = {"Authorization": f"Bearer {API_KEY}", "Content-Type": "application/json"}

    payload = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:{mime_type};base64,{image_base64}"},
                    },
                    {"type": "text", "text": prompt},
                ],
            }
        ],
        "max_tokens": 2000,
    }

    try:
        response = httpx.post(
            f"{BASE_URL}/chat/completions", headers=headers, json=payload, timeout=120.0
        )
        response.raise_for_status()
        result = response.json()
        return result.get("choices", [{}])[0].get("message", {}).get("content", "")
    except Exception as e:
        # Callers only need to handle RuntimeError; keep the original error
        # attached explicitly for debugging.
        raise RuntimeError(f"Vision analysis failed: {e}") from e


def main():
    """Command-line entry point.

    Expects ``<image_path> [prompt]`` on the command line. Prints the analysis
    result to stdout. Exits with status 1 when the image path is missing
    (usage goes to stdout) or when the analysis fails (error goes to stderr).
    """
    import sys

    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python vision-analyze.py <image_path> [prompt]")
        sys.exit(1)

    target = cli_args[0]
    if len(cli_args) > 1:
        question = cli_args[1]
    else:
        question = "请识别这张图片中的所有文字内容"

    try:
        print(analyze_image(target, question))
    except Exception as err:
        print(f"Error: {err}", file=sys.stderr)
        sys.exit(1)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()