#!/usr/bin/env python3
"""
Standalone Vision Analyzer - Simplified version for agent-vision-awareness skill

This is a self-contained version of the vision analyzer that doesn't depend on
the image-service skill structure, making it easier to integrate directly.
"""

import base64
import json
import mimetypes
import os
import sys
from pathlib import Path
from typing import Any, Dict, Optional, Union

try:
    import httpx
except ImportError:  # Reported lazily by StandaloneVisionAnalyzer.analyze().
    httpx = None


class StandaloneVisionAnalyzer:
    """Standalone vision analyzer using direct API calls."""

    # Predefined analysis modes: mode name -> prompt sent along with the image.
    # The "custom" entry is only a placeholder; its prompt comes from the caller.
    ANALYSIS_MODES = {
        "describe": "请详细描述这张图片的内容,包括:人物、场景、物品、颜色、布局等所有细节。",
        "ocr": "请仔细识别这张图片中的所有文字内容,按照文字在图片中的位置顺序输出。如果是中文,请保持原文输出。",
        "chart": "请分析这张图表的内容,包括:图表类型、数据趋势、关键数据点、标题标签、以及数据的结论或洞察。",
        "fashion": "请分析这张图片中人物的穿搭,包括:服装款式、颜色搭配、配饰、整体风格等。",
        "product": "请分析这张产品图片,包括:产品类型、外观特征、功能特点、品牌信息等。",
        "scene": "请描述这张图片的场景,包括:地点、环境、氛围、时间(白天/夜晚)等。",
        "custom": "用户自定义问题",
    }

    # SECURITY(review): a credential is committed in source. It is kept only so
    # that existing zero-configuration callers keep working; rotate this key and
    # require VOLCENGINE_API_KEY / an explicit config instead.
    DEFAULT_API_KEY = "b0359bed-09f2-49e2-a53c-32ba057412e3"
    DEFAULT_BASE_URL = "https://ark.cn-beijing.volces.com/api/coding/v3"
    DEFAULT_MODEL = "doubao-seed-code"

    # MIME type used when the file extension is unknown or not an image type.
    FALLBACK_MIME_TYPE = "image/png"

    # Request tuning knobs; override on a subclass or instance if needed.
    REQUEST_TIMEOUT = 30.0
    MAX_TOKENS = 2000

    def __init__(self, config: Optional[Dict[str, Optional[str]]] = None):
        """
        Initialize the analyzer.

        Args:
            config: Configuration dictionary with api_key, base_url, model.
                When None, values are read from environment variables.
                Missing or empty entries fall back to the class defaults.

        Raises:
            ValueError: If no api_key or base_url could be resolved.
        """
        if config is None:
            config = self._load_config()

        self.api_key = (
            config.get("api_key")
            or config.get("VOLCENGINE_API_KEY")
            or self.DEFAULT_API_KEY
        )
        self.base_url = config.get("base_url") or self.DEFAULT_BASE_URL
        self.model = config.get("model") or self.DEFAULT_MODEL

        # With the built-in defaults above this cannot currently trigger; it
        # becomes the real guard once the embedded default key is removed.
        if not self.api_key or not self.base_url:
            raise ValueError("Missing required API configuration: api_key and base_url")

    def _load_config(self) -> Dict[str, Optional[str]]:
        """Load configuration from environment variables.

        Returns:
            A dict with the keys api_key, base_url and model; a value is None
            when the corresponding environment variable is not set.
        """
        environ = os.environ
        return {
            "api_key": environ.get("VOLCENGINE_API_KEY")
            or environ.get("DASHSCOPE_API_KEY"),
            "base_url": environ.get("VISION_API_BASE_URL"),
            "model": environ.get("VISION_MODEL"),
        }

    def encode_image(self, image_path: Union[str, Path]) -> str:
        """Return the file's raw bytes encoded as a base64 ASCII string."""
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

    @staticmethod
    def _guess_mime_type(image_path: Union[str, Path]) -> str:
        """Guess the image MIME type from the file name, defaulting to PNG."""
        mime_type, _ = mimetypes.guess_type(Path(image_path).name)
        if mime_type and mime_type.startswith("image/"):
            return mime_type
        return StandaloneVisionAnalyzer.FALLBACK_MIME_TYPE

    def _chat_completions_url(self) -> str:
        """Build the chat-completions endpoint URL from base_url."""
        # Strip a trailing slash so "https://host/v1/" does not yield "//chat".
        return f"{self.base_url.rstrip('/')}/chat/completions"

    def _build_payload(
        self, image_path: Union[str, Path], question: str
    ) -> Dict[str, Any]:
        """Build the JSON request body: the prompt plus the image as a data URL."""
        mime_type = self._guess_mime_type(image_path)
        base64_image = self.encode_image(image_path)
        return {
            "model": self.model,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": question},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{base64_image}"
                            },
                        },
                    ],
                }
            ],
            "max_tokens": self.MAX_TOKENS,
        }

    def analyze(self, image_path: Union[str, Path], question: str) -> str:
        """
        Analyze image content.

        Args:
            image_path: Path to the image file
            question: Question/prompt for analysis

        Returns:
            Analysis result text

        Raises:
            FileNotFoundError: If the image file does not exist.
            ValueError: If the API answers 404 (wrong base_url) or 401 (bad key).
            RuntimeError: If httpx is not installed, or for any other request,
                transport or response-format failure.
        """
        image_path = Path(image_path)
        if not image_path.exists():
            raise FileNotFoundError(f"Image not found: {image_path}")

        if httpx is None:
            raise RuntimeError(
                "Analysis failed: the 'httpx' package is required but not installed"
            )

        payload = self._build_payload(image_path, question)
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }

        try:
            with httpx.Client(timeout=self.REQUEST_TIMEOUT) as client:
                response = client.post(
                    self._chat_completions_url(), headers=headers, json=payload
                )
                response.raise_for_status()
                result = response.json()
        except httpx.HTTPStatusError as e:
            status_code = e.response.status_code
            if status_code == 404:
                raise ValueError(
                    f"API endpoint not found, check base_url: {self.base_url}"
                ) from e
            if status_code == 401:
                raise ValueError("Invalid or expired API key") from e
            raise RuntimeError(f"API request failed: {e}") from e
        except Exception as e:
            raise RuntimeError(f"Analysis failed: {e}") from e

        try:
            return result["choices"][0]["message"]["content"]
        except (KeyError, IndexError, TypeError) as e:
            raise RuntimeError(
                f"Analysis failed: unexpected response format ({e!r})"
            ) from e

    def analyze_with_mode(
        self,
        image_path: Union[str, Path],
        mode: str = "describe",
        custom_question: Optional[str] = None,
    ) -> str:
        """
        Analyze image with predefined mode.

        Args:
            image_path: Path to the image file
            mode: Analysis mode (describe, ocr, chart, fashion, product, scene, custom)
            custom_question: Custom question for custom mode; ignored by the
                other modes

        Returns:
            Analysis result text

        Raises:
            ValueError: If the mode is unknown, or mode is "custom" and no
                custom_question was given.
        """
        if mode not in self.ANALYSIS_MODES:
            raise ValueError(
                f"Unsupported mode: {mode}, available: {list(self.ANALYSIS_MODES.keys())}"
            )

        if mode == "custom":
            if not custom_question:
                raise ValueError("Custom mode requires custom_question parameter")
            question = custom_question
        else:
            question = self.ANALYSIS_MODES[mode]

        return self.analyze(image_path, question)


def main():
    """Command line interface.

    Prints the analysis (or writes it to --output) and exits with status 1 on
    any error, reporting the message on stderr.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Standalone Vision Analyzer")
    parser.add_argument("image", help="Image path")
    parser.add_argument(
        "--mode",
        "-m",
        # Derived from the class so the CLI cannot drift from the supported modes.
        choices=list(StandaloneVisionAnalyzer.ANALYSIS_MODES),
        default="describe",
        help="Analysis mode",
    )
    parser.add_argument("--question", "-q", help="Custom question for custom mode")
    parser.add_argument("--output", "-o", help="Output file")

    args = parser.parse_args()

    image_path = Path(args.image)
    if not image_path.exists():
        print(f"Error: Image not found: {image_path}", file=sys.stderr)
        sys.exit(1)

    try:
        analyzer = StandaloneVisionAnalyzer()

        if args.mode == "custom" and not args.question:
            print("Error: Custom mode requires --question parameter", file=sys.stderr)
            sys.exit(1)

        # The question is ignored by every mode except "custom".
        result = analyzer.analyze_with_mode(image_path, args.mode, args.question)

        if args.output:
            with open(args.output, "w", encoding="utf-8") as f:
                f.write(result)
            print(f"Result saved to: {args.output}")
        else:
            print("Analysis Result:")
            print("-" * 50)
            print(result)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()