# skills/agent-vision-awareness/scripts/integrate_vision.py

#!/usr/bin/env python3
"""
Integration script for agent-vision-awareness skill

This script demonstrates the complete workflow:
1. Detect visual content in user input
2. Extract image paths
3. Analyze images using the vision analyzer
4. Return structured results

This replaces the problematic custom agent delegation approach.
"""

import os
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional


def process_user_input(
    user_input: str, user_request: str = "", config: Optional[Dict[str, str]] = None
) -> Dict[str, Any]:
    """
    Process user input for visual content and return analysis results.

    The workflow is: detect visual content, extract image paths, choose an
    analysis mode from keywords in the request/input, then analyze each image.

    Args:
        user_input: The user's input text that may contain visual content references
        user_request: The original user request/context (optional)
        config: Configuration passed as-is to the vision analyzer (optional)

    Returns:
        On the normal path, a dictionary with the keys ``status`` ("success"),
        ``confidence``, ``detected_items``, ``analysis_results`` (one entry per
        successfully analyzed image with ``image_path``, ``mode`` and
        ``result``), ``errors`` (one message per failure) and, when processing
        stops early, ``message``.

        If anything outside the per-image analysis raises, a dictionary with
        ``status`` ("error"), ``error`` and ``message`` is returned instead;
        this function does not propagate exceptions to the caller.
    """
    try:
        (
            VisionContentDetector,
            DetectionConfidence,
            StandaloneVisionAnalyzer,
        ) = _load_vision_components()

        # Initialize components
        detector = VisionContentDetector()
        analyzer = StandaloneVisionAnalyzer(config)

        # Step 1: Detect visual content
        confidence, detected_items = detector.detect_visual_content(user_input)
        result: Dict[str, Any] = {
            "status": "success",
            "confidence": confidence.value,
            "detected_items": detected_items,
            "analysis_results": [],
            "errors": [],
        }

        if confidence == DetectionConfidence.NONE:
            result["message"] = "No visual content detected"
            return result

        # Step 2: Extract image paths
        image_paths = detector.extract_image_paths(user_input)
        if not image_paths:
            result["message"] = "Visual content detected but no valid image paths found"
            result["errors"].append("No valid image paths found")
            return result

        # Step 3: Determine analysis mode from both the request and the input
        mode = _select_analysis_mode(user_request + " " + user_input)

        # Step 4: Analyze each image; one failing image must not abort the rest
        for image_path in image_paths:
            try:
                # Resolve relative paths against the current working directory
                if not os.path.isabs(image_path):
                    image_path = os.path.join(os.getcwd(), image_path)

                analysis_result = analyzer.analyze_with_mode(Path(image_path), mode)

                result["analysis_results"].append(
                    {"image_path": image_path, "mode": mode, "result": analysis_result}
                )

            except Exception as e:
                error_msg = f"Failed to analyze {image_path}: {str(e)}"
                result["errors"].append(error_msg)
                print(f"Error: {error_msg}", file=sys.stderr)

        return result

    except Exception as e:
        # Top-level boundary: report the failure in the result instead of raising
        return {
            "status": "error",
            "error": str(e),
            "message": f"Processing failed: {str(e)}",
        }


# Ordered (mode, keywords) table. The first mode with a keyword occurring in
# the text wins, so earlier entries take priority over later ones.
_MODE_KEYWORDS = (
    ("ocr", ("text", "文字", "ocr", "read", "识别")),
    ("chart", ("chart", "graph", "plot", "图表", "数据", "趋势")),
    ("fashion", ("fashion", "服装", "穿搭", "style")),
    ("product", ("product", "产品", "商品", "item")),
    ("scene", ("scene", "场景", "环境", "location", "place")),
)

# Mode used when no keyword matches.
_DEFAULT_MODE = "describe"


def _select_analysis_mode(text: str) -> str:
    """Return the first mode whose keyword is a substring of the lowercased text."""
    lowered = text.lower()
    for mode, keywords in _MODE_KEYWORDS:
        if any(keyword in lowered for keyword in keywords):
            return mode
    return _DEFAULT_MODE


def _load_vision_components():
    """Import and return (VisionContentDetector, DetectionConfidence, StandaloneVisionAnalyzer)."""
    if __package__:
        # Imported as part of a package: use explicit relative imports.
        from .vision_detector import VisionContentDetector, DetectionConfidence
        from .standalone_vision_analyzer import StandaloneVisionAnalyzer
    else:
        # Executed as a script or imported as a top-level module: relative
        # imports would fail with "no known parent package", so import the
        # sibling modules by their plain names instead.
        from vision_detector import VisionContentDetector, DetectionConfidence
        from standalone_vision_analyzer import StandaloneVisionAnalyzer

    return VisionContentDetector, DetectionConfidence, StandaloneVisionAnalyzer


def main(argv: Optional[List[str]] = None) -> None:
    """
    Command line interface for testing.

    Parses the command line, builds the analyzer configuration from either
    ``--api-key`` or ``--config-file`` (the two are mutually exclusive), runs
    ``process_user_input`` and prints the JSON result or writes it to the file
    given with ``--output``.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]`` (optional);
            the default ``None`` keeps the normal command-line behavior.
    """
    import argparse
    import json

    parser = argparse.ArgumentParser(description="Process visual content in user input")
    parser.add_argument("input", help="User input containing visual content references")
    parser.add_argument("--request", "-r", help="Original user request/context")
    # An API key and a config file are alternative ways to configure the analyzer
    credentials = parser.add_mutually_exclusive_group()
    credentials.add_argument("--api-key", help="API key for vision service")
    credentials.add_argument("--config-file", help="Configuration file path")
    parser.add_argument("--output", "-o", help="Output file for results")

    args = parser.parse_args(argv)

    # Build configuration
    config = {}
    if args.api_key:
        # With only a key given, use the built-in endpoint and model
        config = {
            "api_key": args.api_key,
            "base_url": "https://ark.cn-beijing.volces.com/api/coding/v3",
            "model": "doubao-seed-code",
        }
    elif args.config_file:
        with open(args.config_file, "r", encoding="utf-8") as f:
            config = json.load(f)

    # Process the input
    result = process_user_input(args.input, args.request or "", config)

    # Output results (ensure_ascii=False keeps non-ASCII text readable)
    output = json.dumps(result, indent=2, ensure_ascii=False)

    if args.output:
        with open(args.output, "w", encoding="utf-8") as f:
            f.write(output)
        print(f"Results saved to: {args.output}")
    else:
        print(output)

# Run the command line interface only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()