# skills/agent-vision-awareness/scripts/vision_processor.py

#!/usr/bin/env python3
"""
Vision Processor - Integrates with image-service vision analyzer for agent-vision-awareness skill

This script provides the actual implementation that replaces the problematic
custom agent delegation approach described in the outdated documentation.
"""

import os
import re
import sys
from enum import Enum
from pathlib import Path
from typing import Dict, Any, Optional, List

# Make the image-service scripts importable so their VisionAnalyzer can be
# reused here. The directory is resolved relative to this file:
# <this file's directory>/../../image-service/scripts
sys.path.append(str(Path(__file__).parent.parent.parent / "image-service" / "scripts"))

try:
    from vision_analyzer import VisionAnalyzer
except ImportError:
    # Fallback: try a copy of vision_analyzer located next to this module.
    # The relative form can only succeed when this file is imported as part
    # of a package.
    try:
        from .vision_analyzer import VisionAnalyzer
    except ImportError:
        # Neither location worked: fail at import time with an actionable message.
        raise ImportError(
            "Cannot find VisionAnalyzer. Please ensure image-service is properly installed."
        )


class AnalysisMode(Enum):
    """Analysis modes that can be requested for an image.

    Each member's string value is the mode name handed to
    ``VisionAnalyzer.analyze_with_mode`` by ``VisionProcessor``.
    """

    # Fallback mode used when no mode-specific keyword is found in the request.
    DESCRIBE = "describe"
    # Chosen for text-reading requests (keywords such as "text", "ocr", "read").
    OCR = "ocr"
    # Chosen for chart-related requests (keywords such as "chart", "graph", "plot").
    CHART = "chart"
    # Chosen for clothing-related requests (keywords such as "fashion", "style").
    FASHION = "fashion"
    # Chosen for product-related requests (keywords such as "product", "item").
    PRODUCT = "product"
    # Chosen for setting-related requests (keywords such as "scene", "location").
    SCENE = "scene"
    # Free-form mode: the caller's own question is sent along with the image.
    CUSTOM = "custom"


class VisionProcessor:
    """Detect image references in user text and analyze each referenced image.

    Two collaborators are combined:

    * ``VisionAnalyzer`` performs the image analysis itself.
    * ``VisionContentDetector`` (from the ``vision_detector`` module) finds
      image references in free text. It is imported and instantiated lazily,
      on the first call to :meth:`process_visual_content`.
    """

    # Ordered (mode, pattern) pairs. The first pattern found in the lowercased
    # request text wins, so the order encodes priority (OCR before CHART, ...).
    #
    # English keywords match only at the start of a word, i.e. when not
    # preceded by an ASCII letter. Plain substring search mis-routed ordinary
    # requests: "photograph" contains "graph", "already" contains "read",
    # "context" contains "text" and "replace" contains "place". Inflected
    # forms such as "charts" or "reading" still match, and so do keywords
    # after "_", "-", "/" or digits (common in file names).
    # Chinese keywords keep substring matching because Chinese text has no
    # word separators.
    _MODE_PATTERNS = (
        (AnalysisMode.OCR, re.compile(r"(?<![a-z])(?:text|ocr|read)|文字|识别")),
        (
            AnalysisMode.CHART,
            re.compile(r"(?<![a-z])(?:chart|graph|plot)|图表|数据|趋势"),
        ),
        (AnalysisMode.FASHION, re.compile(r"(?<![a-z])(?:fashion|style)|服装|穿搭")),
        (AnalysisMode.PRODUCT, re.compile(r"(?<![a-z])(?:product|item)|产品|商品")),
        (
            AnalysisMode.SCENE,
            re.compile(r"(?<![a-z])(?:scene|location|place)|场景|环境"),
        ),
    )

    def __init__(self, config: Optional[Dict[str, str]] = None):
        """
        Initialize the vision processor.

        Args:
            config: Configuration dictionary passed through to VisionAnalyzer
                unchanged (may be None).
        """
        self.analyzer = VisionAnalyzer(config)
        self.detector = None  # Created on first use in process_visual_content

    @staticmethod
    def _load_detector_api():
        """
        Import and return ``(VisionContentDetector, DetectionConfidence)``.

        The relative import is tried first so that package usage is unchanged.
        When this file is executed directly as a script there is no parent
        package and the relative import raises ImportError; the absolute
        import then picks up ``vision_detector`` from ``sys.path``.

        Raises:
            ImportError: If ``vision_detector`` cannot be imported either way.
        """
        try:
            from .vision_detector import VisionContentDetector, DetectionConfidence
        except ImportError:
            from vision_detector import VisionContentDetector, DetectionConfidence
        return VisionContentDetector, DetectionConfidence

    def process_visual_content(
        self, user_input: str, user_request: str = ""
    ) -> Dict[str, Any]:
        """
        Process visual content in user input.

        Args:
            user_input: The user's input text that may contain visual content references
            user_request: The original user request/context (None is treated as "")

        Returns:
            Dictionary with the keys:
              * "confidence": value of the detector's confidence enum
              * "detected_items": items reported by the detector
              * "analysis_results": one dict per analyzed image with
                "image_path", "analysis_mode" and "result"
              * "errors": human-readable messages for images that were skipped
                or failed
              * "message": only present when processing stopped early

        Raises:
            ImportError: If the vision_detector module cannot be imported.
        """
        VisionContentDetector, DetectionConfidence = self._load_detector_api()

        # Tolerate callers that pass None instead of omitting the argument.
        user_request = user_request or ""

        # Initialize detector if not already done
        if self.detector is None:
            self.detector = VisionContentDetector()

        # Detect visual content
        confidence, detected_items = self.detector.detect_visual_content(user_input)
        result: Dict[str, Any] = {
            "confidence": confidence.value,
            "detected_items": detected_items,
            "analysis_results": [],
            "errors": [],
        }

        if confidence == DetectionConfidence.NONE:
            result["message"] = "No visual content detected"
            return result

        # Extract image paths
        image_paths = self.detector.extract_image_paths(user_input)
        if not image_paths:
            result["message"] = "Visual content detected but no valid image paths found"
            result["errors"].append("No valid image paths found")
            return result

        # One mode is chosen for the whole request and applied to every image.
        analysis_mode = self._determine_analysis_mode(user_request, user_input)

        for image_path in image_paths:
            try:
                # Remote images are not downloaded; report and move on.
                if image_path.startswith(("http://", "https://")):
                    result["errors"].append(
                        f"URL handling not implemented: {image_path}"
                    )
                    continue

                # Relative paths are resolved against the current working directory.
                if not os.path.isabs(image_path):
                    image_path = os.path.join(os.getcwd(), image_path)

                if analysis_mode == AnalysisMode.CUSTOM:
                    # NOTE: _determine_analysis_mode never returns CUSTOM, so
                    # this branch only runs if that method is overridden.
                    # The user request is forwarded as the custom question.
                    analysis_result = self.analyzer.analyze_with_mode(
                        Path(image_path),
                        "custom",
                        user_request or "Please analyze this image.",
                    )
                else:
                    analysis_result = self.analyzer.analyze_with_mode(
                        Path(image_path), analysis_mode.value
                    )

                result["analysis_results"].append(
                    {
                        "image_path": image_path,
                        "analysis_mode": analysis_mode.value,
                        "result": analysis_result,
                    }
                )

            except Exception as e:
                # Deliberately broad: one failing image must not abort the
                # remaining ones. The failure is recorded and echoed to stderr.
                error_msg = f"Failed to analyze {image_path}: {str(e)}"
                result["errors"].append(error_msg)
                print(f"Error: {error_msg}", file=sys.stderr)

        return result

    def _determine_analysis_mode(
        self, user_request: str, user_input: str
    ) -> AnalysisMode:
        """
        Determine the appropriate analysis mode based on user context.

        The request and the input are joined, lowercased and searched for
        mode keywords in priority order (see ``_MODE_PATTERNS``). Because the
        input is searched too, keywords inside file names also count.

        Args:
            user_request: The user's original request (None is treated as "")
            user_input: The full input containing visual content references
                (None is treated as "")

        Returns:
            The first matching AnalysisMode, or AnalysisMode.DESCRIBE when no
            keyword is found. AnalysisMode.CUSTOM is never returned.
        """
        combined_text = f"{user_request or ''} {user_input or ''}".lower()

        for mode, pattern in self._MODE_PATTERNS:
            if pattern.search(combined_text):
                return mode
        return AnalysisMode.DESCRIBE


def main():
    """Command line entry point, intended for manual testing.

    Parses the positional input text plus the optional request and output
    options, runs a VisionProcessor over them, and emits the result as
    pretty-printed JSON, either to the output file or to stdout. Any failure
    is reported on stderr and the process exits with status 1.
    """
    import argparse
    import json

    cli = argparse.ArgumentParser(description="Process visual content in user input")
    cli.add_argument("input", help="User input containing visual content references")
    cli.add_argument("--request", "-r", help="Original user request/context")
    cli.add_argument("--output", "-o", help="Output file for results")
    options = cli.parse_args()

    try:
        report = VisionProcessor().process_visual_content(
            options.input, options.request or ""
        )
        # ensure_ascii=False keeps non-ASCII text readable in the output.
        rendered = json.dumps(report, indent=2, ensure_ascii=False)

        if not options.output:
            print(rendered)
            return

        with open(options.output, "w", encoding="utf-8") as handle:
            handle.write(rendered)
        print(f"Results saved to: {options.output}")

    except Exception as exc:
        print(f"Error: {exc}", file=sys.stderr)
        sys.exit(1)


# Run the command line interface only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()