#!/usr/bin/env python3
"""
Vision Processor - integrates with the image-service vision analyzer for the
agent-vision-awareness skill.

This script provides the actual implementation that replaces the problematic
custom agent delegation approach described in the outdated documentation.
"""

import argparse
import json
import os
import sys
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

# Add the image-service scripts to the path to reuse the vision analyzer.
sys.path.append(str(Path(__file__).parent.parent.parent / "image-service" / "scripts"))

try:
    from vision_analyzer import VisionAnalyzer
except ImportError:
    # Fallback: vision_analyzer may have been copied next to this module
    # inside the same package.
    try:
        from .vision_analyzer import VisionAnalyzer
    except ImportError as err:
        # Chain the original failure so the real cause stays in the traceback.
        raise ImportError(
            "Cannot find VisionAnalyzer. Please ensure image-service is properly installed."
        ) from err


class AnalysisMode(Enum):
    """Available analysis modes."""

    DESCRIBE = "describe"
    OCR = "ocr"
    CHART = "chart"
    FASHION = "fashion"
    PRODUCT = "product"
    SCENE = "scene"
    CUSTOM = "custom"


# Ordered (mode, keywords) table: the first mode with a keyword that occurs as
# a substring of the lower-cased request/input text wins. Order is significant.
# NOTE(review): matching is plain substring containment, so e.g. "text" also
# matches "context" -- confirm whether word-level matching is wanted.
_MODE_KEYWORDS: Tuple[Tuple[AnalysisMode, Tuple[str, ...]], ...] = (
    (AnalysisMode.OCR, ("text", "文字", "ocr", "read", "识别")),
    (AnalysisMode.CHART, ("chart", "graph", "plot", "图表", "数据", "趋势")),
    (AnalysisMode.FASHION, ("fashion", "服装", "穿搭", "style")),
    (AnalysisMode.PRODUCT, ("product", "产品", "商品", "item")),
    (AnalysisMode.SCENE, ("scene", "场景", "环境", "location", "place")),
)


def _load_detector_api():
    """
    Import the sibling ``vision_detector`` module on demand.

    A relative import only works when this module is loaded as part of a
    package. When the file is executed directly as a script (or imported as a
    top-level module) ``__package__`` is empty, so an absolute import is used
    instead.

    Returns:
        Tuple of (VisionContentDetector, DetectionConfidence).

    Raises:
        ImportError: If ``vision_detector`` cannot be imported.
    """
    if __package__:
        from .vision_detector import DetectionConfidence, VisionContentDetector
    else:
        from vision_detector import DetectionConfidence, VisionContentDetector
    return VisionContentDetector, DetectionConfidence


class VisionProcessor:
    """Main vision processing class that integrates detection and analysis."""

    def __init__(self, config: Optional[Dict[str, str]] = None):
        """
        Initialize the vision processor.

        Args:
            config: Configuration dictionary passed through to VisionAnalyzer.
        """
        self.analyzer = VisionAnalyzer(config)
        self.detector = None  # Created lazily on first use.

    def process_visual_content(
        self, user_input: str, user_request: str = ""
    ) -> Dict[str, Any]:
        """
        Process visual content in user input.

        Args:
            user_input: The user's input text that may contain visual content
                references.
            user_request: The original user request/context.

        Returns:
            Dictionary with the keys ``confidence``, ``detected_items``,
            ``analysis_results`` and ``errors``; a ``message`` key is added
            when nothing was detected or no image paths could be extracted.

        Raises:
            ImportError: If the ``vision_detector`` module is unavailable.
        """
        detector_cls, confidence_levels = _load_detector_api()

        # Initialize the detector if not already done.
        if self.detector is None:
            self.detector = detector_cls()

        confidence, detected_items = self.detector.detect_visual_content(user_input)

        result: Dict[str, Any] = {
            "confidence": confidence.value,
            "detected_items": detected_items,
            "analysis_results": [],
            "errors": [],
        }

        if confidence == confidence_levels.NONE:
            result["message"] = "No visual content detected"
            return result

        image_paths = self.detector.extract_image_paths(user_input)
        if not image_paths:
            result["message"] = "Visual content detected but no valid image paths found"
            result["errors"].append("No valid image paths found")
            return result

        # One mode is chosen up front and applied to every image.
        analysis_mode = self._determine_analysis_mode(user_request, user_input)

        for image_path in image_paths:
            try:
                # URLs would need to be downloaded to a temp file first; that
                # is not implemented, so they are reported and skipped.
                if image_path.startswith(("http://", "https://")):
                    result["errors"].append(
                        f"URL handling not implemented: {image_path}"
                    )
                    continue

                # Resolve relative paths against the current working directory.
                if not os.path.isabs(image_path):
                    image_path = os.path.join(os.getcwd(), image_path)

                result["analysis_results"].append(
                    self._analyze_image(image_path, analysis_mode, user_request)
                )
            except Exception as e:
                # Best effort: one failing image must not abort the others.
                error_msg = f"Failed to analyze {image_path}: {str(e)}"
                result["errors"].append(error_msg)
                print(f"Error: {error_msg}", file=sys.stderr)

        return result

    def _analyze_image(
        self, image_path: str, analysis_mode: AnalysisMode, user_request: str
    ) -> Dict[str, Any]:
        """Run the analyzer on one local image and build its result entry."""
        return {
            "image_path": image_path,
            "analysis_mode": analysis_mode.value,
            "result": self._run_analyzer(image_path, analysis_mode, user_request),
        }

    def _run_analyzer(
        self, image_path: str, analysis_mode: AnalysisMode, user_request: str
    ) -> Any:
        """Call ``analyze_with_mode`` with the arguments the mode requires."""
        if analysis_mode == AnalysisMode.CUSTOM:
            # Use the user request as the custom question.
            return self.analyzer.analyze_with_mode(
                Path(image_path),
                "custom",
                user_request or "Please analyze this image.",
            )
        return self.analyzer.analyze_with_mode(Path(image_path), analysis_mode.value)

    def _determine_analysis_mode(
        self, user_request: str, user_input: str
    ) -> AnalysisMode:
        """
        Determine the appropriate analysis mode based on user context.

        The request and the input are joined and lower-cased, then checked
        against the keyword table in priority order (OCR, chart, fashion,
        product, scene).

        Args:
            user_request: The user's original request.
            user_input: The full input containing visual content references.

        Returns:
            The first AnalysisMode whose keywords occur in the text, or
            AnalysisMode.DESCRIBE when none match.
        """
        combined_text = " ".join((user_request or "", user_input or "")).lower()

        for mode, keywords in _MODE_KEYWORDS:
            if any(keyword in combined_text for keyword in keywords):
                return mode
        return AnalysisMode.DESCRIBE


def main():
    """Command line interface for testing.

    Prints the JSON result to stdout, or writes it to the ``--output`` file.
    On any failure the error is printed to stderr and the process exits with
    status 1.
    """
    parser = argparse.ArgumentParser(description="Process visual content in user input")
    parser.add_argument("input", help="User input containing visual content references")
    parser.add_argument("--request", "-r", help="Original user request/context")
    parser.add_argument("--output", "-o", help="Output file for results")

    args = parser.parse_args()

    try:
        processor = VisionProcessor()
        result = processor.process_visual_content(args.input, args.request or "")

        output = json.dumps(result, indent=2, ensure_ascii=False)

        if args.output:
            with open(args.output, "w", encoding="utf-8") as f:
                f.write(output)
            print(f"Results saved to: {args.output}")
        else:
            print(output)

    except Exception as e:
        # Top-level CLI boundary: report and signal failure via exit status.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()