# skills/agent-vision-awareness/scripts/standalone_vision_analyzer.py

#!/usr/bin/env python3
"""
Standalone Vision Analyzer - Simplified version for agent-vision-awareness skill

This is a self-contained version of the vision analyzer that doesn't depend on
the image-service skill structure, making it easier to integrate directly.
"""

import base64
import json
import mimetypes
import os
import sys
from pathlib import Path
from typing import Dict, Any, Optional

import httpx


class StandaloneVisionAnalyzer:
    """Standalone vision analyzer using direct API calls.

    Reads a local image, embeds it as a base64 data URL next to a text
    prompt, POSTs the pair to ``{base_url}/chat/completions`` and returns the
    text content of the first choice in the response.

    Settings come from the ``config`` mapping passed to the constructor or,
    when none is passed, from environment variables (see ``_load_config``).
    No credential is embedded in the source: a missing API key raises
    ``ValueError``.
    """

    # Predefined analysis modes: mode name -> prompt sent to the model.
    # The "custom" entry is only a placeholder; its prompt is supplied by the
    # caller through ``custom_question``.
    ANALYSIS_MODES = {
        "describe": "请详细描述这张图片的内容，包括：人物、场景、物品、颜色、布局等所有细节。",
        "ocr": "请仔细识别这张图片中的所有文字内容，按照文字在图片中的位置顺序输出。如果是中文，请保持原文输出。",
        "chart": "请分析这张图表的内容，包括：图表类型、数据趋势、关键数据点、标题标签、以及数据的结论或洞察。",
        "fashion": "请分析这张图片中人物的穿搭，包括：服装款式、颜色搭配、配饰、整体风格等。",
        "product": "请分析这张产品图片，包括：产品类型、外观特征、功能特点、品牌信息等。",
        "scene": "请描述这张图片的场景，包括：地点、环境、氛围、时间（白天/夜晚）等。",
        "custom": "用户自定义问题",
    }

    # Defaults used when the configuration leaves a value unset.
    DEFAULT_BASE_URL = "https://ark.cn-beijing.volces.com/api/coding/v3"
    DEFAULT_MODEL = "doubao-seed-code"

    # Request tuning knobs; override on a subclass or instance if needed.
    REQUEST_TIMEOUT = 30.0
    MAX_TOKENS = 2000

    # MIME type used when the file extension does not identify an image type.
    FALLBACK_MIME_TYPE = "image/png"

    def __init__(self, config: Optional[Dict[str, Optional[str]]] = None):
        """
        Initialize the analyzer.

        Args:
            config: Configuration dictionary with api_key (or
                VOLCENGINE_API_KEY), base_url and model. When omitted, the
                values are read from environment variables.

        Raises:
            ValueError: If no API key (or no base URL) is configured.
        """
        if config is None:
            config = self._load_config()

        # The key must come from the caller or the environment; there is
        # deliberately no built-in fallback credential.
        self.api_key = config.get("api_key") or config.get("VOLCENGINE_API_KEY")
        self.base_url = config.get("base_url") or self.DEFAULT_BASE_URL
        self.model = config.get("model") or self.DEFAULT_MODEL

        if not self.api_key or not self.base_url:
            raise ValueError("Missing required API configuration: api_key and base_url")

    def _load_config(self) -> Dict[str, Optional[str]]:
        """Load configuration from environment variables.

        Returns:
            A dict with keys ``api_key`` (from VOLCENGINE_API_KEY, falling
            back to DASHSCOPE_API_KEY), ``base_url`` (VISION_API_BASE_URL)
            and ``model`` (VISION_MODEL). Unset variables yield ``None``.
        """
        return {
            "api_key": os.environ.get("VOLCENGINE_API_KEY")
            or os.environ.get("DASHSCOPE_API_KEY"),
            "base_url": os.environ.get("VISION_API_BASE_URL"),
            "model": os.environ.get("VISION_MODEL"),
        }

    def encode_image(self, image_path: Path) -> str:
        """Return the file's raw bytes encoded as a base64 ASCII string."""
        return base64.b64encode(Path(image_path).read_bytes()).decode("utf-8")

    @classmethod
    def _guess_mime_type(cls, image_path: Path) -> str:
        """Guess the image MIME type from the file extension.

        Falls back to ``FALLBACK_MIME_TYPE`` when the extension is unknown or
        maps to a non-image type.
        """
        guessed, _encoding = mimetypes.guess_type(str(image_path))
        if guessed and guessed.startswith("image/"):
            return guessed
        return cls.FALLBACK_MIME_TYPE

    def _build_payload(self, image_path: Path, question: str) -> Dict[str, Any]:
        """Build the JSON request body for one image + one text prompt."""
        mime_type = self._guess_mime_type(image_path)
        base64_image = self.encode_image(image_path)
        return {
            "model": self.model,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": question},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{base64_image}"
                            },
                        },
                    ],
                }
            ],
            "max_tokens": self.MAX_TOKENS,
        }

    @staticmethod
    def _extract_content(result: Any) -> str:
        """Pull the first choice's message content out of a decoded response.

        Raises:
            RuntimeError: If the response lacks the expected
                ``choices[0].message.content`` structure.
        """
        try:
            return result["choices"][0]["message"]["content"]
        except (KeyError, IndexError, TypeError) as e:
            raise RuntimeError(
                f"Analysis failed: unexpected API response format ({e!r})"
            ) from e

    def analyze(self, image_path: Path, question: str) -> str:
        """
        Analyze image content.

        Args:
            image_path: Path to the image file (a ``str`` is accepted too)
            question: Question/prompt for analysis

        Returns:
            Analysis result text

        Raises:
            FileNotFoundError: If ``image_path`` does not exist.
            ValueError: If the API answers 404 (bad base_url) or 401 (bad key).
            RuntimeError: For any other HTTP, transport or response-format
                failure.
        """
        image_path = Path(image_path)
        if not image_path.exists():
            raise FileNotFoundError(f"Image not found: {image_path}")

        # Built outside the try block so file-read errors surface unchanged
        # instead of being reported as API failures.
        payload = self._build_payload(image_path, question)

        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        # Tolerate a trailing slash in the configured base URL.
        url = f"{self.base_url.rstrip('/')}/chat/completions"

        try:
            with httpx.Client(timeout=self.REQUEST_TIMEOUT) as client:
                response = client.post(url, headers=headers, json=payload)
                response.raise_for_status()
                result = response.json()
        except httpx.HTTPStatusError as e:
            status = e.response.status_code
            if status == 404:
                raise ValueError(
                    f"API endpoint not found, check base_url: {self.base_url}"
                ) from e
            if status == 401:
                raise ValueError("Invalid or expired API key") from e
            raise RuntimeError(f"API request failed: {e}") from e
        except Exception as e:
            # Boundary handler: callers rely on RuntimeError for every other
            # transport or decoding failure.
            raise RuntimeError(f"Analysis failed: {e}") from e

        # Parsed after the try block so its RuntimeError is not re-wrapped.
        return self._extract_content(result)

    def analyze_with_mode(
        self,
        image_path: Path,
        mode: str = "describe",
        custom_question: Optional[str] = None,
    ) -> str:
        """
        Analyze image with predefined mode.

        Args:
            image_path: Path to the image file
            mode: Analysis mode (describe, ocr, chart, fashion, product, scene, custom)
            custom_question: Custom question for custom mode

        Returns:
            Analysis result text

        Raises:
            ValueError: If ``mode`` is unknown, or ``mode`` is "custom" and
                no ``custom_question`` is given.
        """
        if mode not in self.ANALYSIS_MODES:
            raise ValueError(
                f"Unsupported mode: {mode}, available: {list(self.ANALYSIS_MODES.keys())}"
            )

        if mode == "custom":
            if not custom_question:
                raise ValueError("Custom mode requires custom_question parameter")
            question = custom_question
        else:
            question = self.ANALYSIS_MODES[mode]

        return self.analyze(image_path, question)


def main():
    """Parse CLI arguments, run one image analysis, then print or save it.

    Exits with status 1 (message on stderr) when the image is missing, when
    custom mode is selected without ``--question``, or when analysis fails.
    """
    import argparse

    cli = argparse.ArgumentParser(description="Standalone Vision Analyzer")
    cli.add_argument("image", help="Image path")
    cli.add_argument(
        "--mode",
        "-m",
        choices=["describe", "ocr", "chart", "fashion", "product", "scene", "custom"],
        default="describe",
        help="Analysis mode",
    )
    cli.add_argument("--question", "-q", help="Custom question for custom mode")
    cli.add_argument("--output", "-o", help="Output file")
    opts = cli.parse_args()

    # Fail fast on a bad path, before any analyzer is constructed.
    target = Path(opts.image)
    if not target.exists():
        print(f"Error: Image not found: {target}", file=sys.stderr)
        sys.exit(1)

    try:
        analyzer = StandaloneVisionAnalyzer()

        is_custom = opts.mode == "custom"
        if is_custom and not opts.question:
            print("Error: Custom mode requires --question parameter", file=sys.stderr)
            sys.exit(1)

        # --question is only forwarded in custom mode; otherwise the mode's
        # predefined prompt is used.
        text = analyzer.analyze_with_mode(
            target, opts.mode, opts.question if is_custom else None
        )

        if not opts.output:
            print("Analysis Result:")
            print("-" * 50)
            print(text)
        else:
            with open(opts.output, "w", encoding="utf-8") as sink:
                sink.write(text)
            print(f"Result saved to: {opts.output}")

    except Exception as err:
        # Top-level boundary: report any failure and signal it via exit code.
        print(f"Error: {err}", file=sys.stderr)
        sys.exit(1)


# Run the command line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()