#!/usr/bin/env python3
"""
Integration script for agent-vision-awareness skill

This script demonstrates the complete workflow:
1. Detect visual content in user input
2. Extract image paths
3. Analyze images using the vision analyzer
4. Return structured results

This replaces the problematic custom agent delegation approach.
"""

import argparse
import json
import os
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

# Endpoint and model used when only ``--api-key`` is given on the command line.
DEFAULT_BASE_URL = "https://ark.cn-beijing.volces.com/api/coding/v3"
DEFAULT_MODEL = "doubao-seed-code"

# Analysis modes in priority order: the first entry with a keyword that occurs
# as a substring of the lower-cased request text wins.
_MODE_KEYWORDS: Tuple[Tuple[str, Tuple[str, ...]], ...] = (
    ("ocr", ("text", "文字", "ocr", "read", "识别")),
    ("chart", ("chart", "graph", "plot", "图表", "数据", "趋势")),
    ("fashion", ("fashion", "服装", "穿搭", "style")),
    ("product", ("product", "产品", "商品", "item")),
    ("scene", ("scene", "场景", "环境", "location", "place")),
)

# Mode used when no keyword matches.
_DEFAULT_MODE = "describe"


def _load_vision_components() -> Tuple[Any, Any, Any]:
    """Import the detector and analyzer classes from the sibling modules.

    Relative imports only work when this file is loaded as part of a package.
    When it is executed directly as a script, ``__package__`` is empty and a
    relative import raises ImportError, so the sibling modules are imported
    by their plain names instead.

    Returns:
        Tuple ``(VisionContentDetector, DetectionConfidence,
        StandaloneVisionAnalyzer)``.

    Raises:
        ImportError: If the sibling modules cannot be imported.
    """
    if __package__:
        from .vision_detector import DetectionConfidence, VisionContentDetector
        from .standalone_vision_analyzer import StandaloneVisionAnalyzer
    else:
        from vision_detector import DetectionConfidence, VisionContentDetector
        from standalone_vision_analyzer import StandaloneVisionAnalyzer
    return VisionContentDetector, DetectionConfidence, StandaloneVisionAnalyzer


def _select_analysis_mode(combined_text: str) -> str:
    """Return the first mode whose keyword occurs in *combined_text*.

    Matching is a plain substring test, so *combined_text* is expected to be
    lower-cased already. Falls back to ``"describe"`` when nothing matches.
    """
    for mode, keywords in _MODE_KEYWORDS:
        if any(keyword in combined_text for keyword in keywords):
            return mode
    return _DEFAULT_MODE


def process_user_input(
    user_input: str,
    user_request: str = "",
    config: Optional[Dict[str, str]] = None,
) -> Dict[str, Any]:
    """
    Process user input for visual content and return analysis results.

    Args:
        user_input: The user's input text that may contain visual content references
        user_request: The original user request/context (optional)
        config: Configuration for the vision analyzer (optional)

    Returns:
        Dictionary containing detection results and analysis. On the normal
        path it has the keys ``status`` ("success"), ``confidence``,
        ``detected_items``, ``analysis_results`` and ``errors``, plus
        ``message`` when nothing could be analyzed. Failures on individual
        images are appended to ``errors`` and echoed to stderr without
        aborting the remaining images. If anything outside the per-image loop
        raises (including a failed import of the detector/analyzer), the
        result is ``{"status": "error", "error": ..., "message": ...}``.
    """
    try:
        (
            VisionContentDetector,
            DetectionConfidence,
            StandaloneVisionAnalyzer,
        ) = _load_vision_components()

        # Initialize components
        detector = VisionContentDetector()
        analyzer = StandaloneVisionAnalyzer(config)

        # Step 1: Detect visual content
        confidence, detected_items = detector.detect_visual_content(user_input)

        result: Dict[str, Any] = {
            "status": "success",
            "confidence": confidence.value,
            "detected_items": detected_items,
            "analysis_results": [],
            "errors": [],
        }

        if confidence == DetectionConfidence.NONE:
            result["message"] = "No visual content detected"
            return result

        # Step 2: Extract image paths
        image_paths = detector.extract_image_paths(user_input)
        if not image_paths:
            result["message"] = "Visual content detected but no valid image paths found"
            result["errors"].append("No valid image paths found")
            return result

        # Step 3: Determine analysis mode from both the request and the input.
        # A missing request (None) is treated the same as an empty one.
        combined_text = ((user_request or "") + " " + user_input).lower()
        mode = _select_analysis_mode(combined_text)

        # Step 4: Analyze each image
        for image_path in image_paths:
            try:
                # Relative paths are resolved against the current directory.
                if not os.path.isabs(image_path):
                    image_path = os.path.join(os.getcwd(), image_path)

                analysis_result = analyzer.analyze_with_mode(Path(image_path), mode)
                result["analysis_results"].append(
                    {"image_path": image_path, "mode": mode, "result": analysis_result}
                )
            except Exception as e:
                # Best effort: record the failure and continue with the rest.
                error_msg = f"Failed to analyze {image_path}: {str(e)}"
                result["errors"].append(error_msg)
                print(f"Error: {error_msg}", file=sys.stderr)

        return result

    except Exception as e:
        # Top-level boundary: callers always receive a dictionary.
        return {
            "status": "error",
            "error": str(e),
            "message": f"Processing failed: {str(e)}",
        }


def main(argv: Optional[List[str]] = None) -> None:
    """Command line interface for testing.

    Args:
        argv: Argument list to parse; ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description="Process visual content in user input")
    parser.add_argument("input", help="User input containing visual content references")
    parser.add_argument("--request", "-r", help="Original user request/context")

    # The analyzer configuration comes from an API key or a config file,
    # never both.
    config_source = parser.add_mutually_exclusive_group()
    config_source.add_argument("--api-key", help="API key for vision service")
    config_source.add_argument("--config-file", help="Configuration file path")

    parser.add_argument("--output", "-o", help="Output file for results")

    args = parser.parse_args(argv)

    # Build configuration
    config: Dict[str, str] = {}
    if args.api_key:
        config["api_key"] = args.api_key
        config["base_url"] = DEFAULT_BASE_URL
        config["model"] = DEFAULT_MODEL
    elif args.config_file:
        with open(args.config_file, "r", encoding="utf-8") as f:
            config = json.load(f)

    # Process the input
    result = process_user_input(args.input, args.request or "", config)

    # Output results
    output = json.dumps(result, indent=2, ensure_ascii=False)
    if args.output:
        with open(args.output, "w", encoding="utf-8") as f:
            f.write(output)
        print(f"Results saved to: {args.output}")
    else:
        print(output)


if __name__ == "__main__":
    main()