#!/usr/bin/env python3
"""Standardized vision-analysis script for an OpenAI-compatible chat API.

Sends a local image plus a text prompt to ``{BASE_URL}/chat/completions`` and
prints the model's reply. Using this endpoint avoids a dependency on the
Google Vision API.

Configuration is read from ``config/settings.json`` next to this file (the
``vision_api`` section: ``key``, ``base_url``, ``model``). When that file is
absent, ``VOLCENGINE_API_KEY`` and ``VOLCENGINE_BASE_URL`` are used instead.
Importing this module raises ``ValueError`` when no API key can be found.

NOTE(review): the original header described this as "DashScope" mode, but the
built-in defaults point at the Volcengine Ark endpoint -- confirm which
provider is intended.
"""

import base64
import json
import mimetypes
import os
import sys
from pathlib import Path
from typing import Optional

import httpx

# Built-in defaults used when neither the config file nor the environment
# provides a value.
_DEFAULT_BASE_URL = "https://ark.cn-beijing.volces.com/api/coding/v3"
_DEFAULT_MODEL_NAME = "doubao-seed-code"

# Read settings from the skill's config file.
CONFIG_PATH = Path(__file__).parent / "config" / "settings.json"

if CONFIG_PATH.exists():
    with open(CONFIG_PATH, "r", encoding="utf-8") as f:
        config = json.load(f)
    # ``or {}`` tolerates a ``"vision_api": null`` entry.
    _vision_cfg = config.get("vision_api") or {}
    # The error message below advertises VOLCENGINE_API_KEY as an alternative,
    # so honour it when the config file exists but carries no key.
    API_KEY = _vision_cfg.get("key") or os.getenv("VOLCENGINE_API_KEY")
    BASE_URL = _vision_cfg.get("base_url", _DEFAULT_BASE_URL)
    DEFAULT_MODEL = _vision_cfg.get("model", _DEFAULT_MODEL_NAME)
else:
    # Fall back to environment variables.
    API_KEY = os.getenv("VOLCENGINE_API_KEY")
    BASE_URL = os.getenv("VOLCENGINE_BASE_URL", _DEFAULT_BASE_URL)
    DEFAULT_MODEL = _DEFAULT_MODEL_NAME

if not API_KEY:
    raise ValueError(
        "No API Key found. Please configure in config/settings.json or set VOLCENGINE_API_KEY"
    )


def encode_image(image_path):
    """Return the contents of the file at *image_path* as a base64 string."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


def _guess_image_mime(image_path):
    """Guess the image MIME type from the file name, defaulting to JPEG."""
    mime_type, _ = mimetypes.guess_type(str(image_path))
    if mime_type and mime_type.startswith("image/"):
        return mime_type
    # Unknown or non-image extension: keep the historical default.
    return "image/jpeg"


def analyze_image(
    image_path,
    prompt: str = "请识别这张图片中的所有文字内容",
    model: Optional[str] = None,
):
    """Send an image and a prompt to the vision model and return its reply.

    Args:
        image_path: Path of the image file to analyze.
        prompt: Text instruction sent alongside the image.
        model: Model name; ``DEFAULT_MODEL`` is used when falsy.

    Returns:
        The ``content`` of the first choice's message, or ``""`` when the
        response carries no choices, message, or content.

    Raises:
        FileNotFoundError: If *image_path* does not exist.
        RuntimeError: If the HTTP request fails or the response cannot be
            parsed; the original exception is chained as the cause.
    """
    if not os.path.exists(image_path):
        raise FileNotFoundError(f"Image file not found: {image_path}")

    image_base64 = encode_image(image_path)
    # Label the data URL with the real image type instead of always claiming
    # JPEG, so PNG/GIF/WebP inputs are described correctly.
    mime_type = _guess_image_mime(image_path)
    model = model or DEFAULT_MODEL

    headers = {"Authorization": f"Bearer {API_KEY}", "Content-Type": "application/json"}
    payload = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:{mime_type};base64,{image_base64}"},
                    },
                    {"type": "text", "text": prompt},
                ],
            }
        ],
        "max_tokens": 2000,
    }

    # Avoid a double slash when the configured base URL ends with "/".
    url = f"{BASE_URL.rstrip('/')}/chat/completions"

    # Every failure in the request/parse path is reported as RuntimeError;
    # the broad catch is deliberate so callers have one type to handle.
    try:
        response = httpx.post(url, headers=headers, json=payload, timeout=120.0)
        response.raise_for_status()
        result = response.json()
        # Treat an empty or null ``choices`` like a missing one instead of
        # failing with an opaque "list index out of range".
        choices = result.get("choices") or [{}]
        return choices[0].get("message", {}).get("content", "")
    except Exception as e:
        raise RuntimeError(f"Vision analysis failed: {e}") from e


def main():
    """Command-line entry point: ``vision-analyze.py <image_path> [prompt]``."""
    if len(sys.argv) < 2:
        print("Usage: python vision-analyze.py <image_path> [prompt]")
        sys.exit(1)

    image_path = sys.argv[1]
    prompt = sys.argv[2] if len(sys.argv) > 2 else "请识别这张图片中的所有文字内容"

    try:
        result = analyze_image(image_path, prompt)
        print(result)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()