Initial commit: skills library

- 70 skills with code and documentation - Add .gitignore (ignore __pycache__, output/, temp/, venv/) - Clean up test intermediates and caches
2026-04-26 19:27:40 +08:00
commit 04db423416
861 changed files with 210414 additions and 0 deletions
@@ -0,0 +1,215 @@
+#!/usr/bin/env python3
+"""
+Vision Processor - Integrates with image-service vision analyzer for agent-vision-awareness skill
+
+This script provides the actual implementation that replaces the problematic
+custom agent delegation approach described in the outdated documentation.
+"""
+
+import sys
+import os
+from pathlib import Path
+from typing import Dict, Any, Optional, List
+from enum import Enum
+
+# Add the image-service scripts to path to reuse the vision analyzer
+sys.path.append(str(Path(__file__).parent.parent.parent / "image-service" / "scripts"))
+
+try:
+    from vision_analyzer import VisionAnalyzer
+except ImportError:
+    # Fallback: try to import from current directory if vision_analyzer is copied here
+    try:
+        from .vision_analyzer import VisionAnalyzer
+    except ImportError:
+        raise ImportError(
+            "Cannot find VisionAnalyzer. Please ensure image-service is properly installed."
+        )
+
+
+class AnalysisMode(Enum):
+    """Available analysis modes.
+
+    Each member's string value is what gets passed to
+    ``VisionAnalyzer.analyze_with_mode`` as the mode argument.
+    """
+
+    # Fallback mode used when no keyword in the request selects another mode.
+    DESCRIBE = "describe"
+    # Selected for text-reading requests (e.g. "text", "ocr", "read").
+    OCR = "ocr"
+    # Selected for chart/graph/plot requests.
+    CHART = "chart"
+    # Selected for fashion/style requests.
+    FASHION = "fashion"
+    # Selected for product/item requests.
+    PRODUCT = "product"
+    # Selected for scene/location/place requests.
+    SCENE = "scene"
+    # Free-form mode: the user's request is forwarded as the question.
+    CUSTOM = "custom"
+
+
+class VisionProcessor:
+    """Main vision processing class that integrates detection and analysis."""
+
+    def __init__(self, config: Optional[Dict[str, str]] = None):
+        """
+        Initialize the vision processor.
+
+        Args:
+            config: Configuration dictionary for the VisionAnalyzer
+        """
+        self.analyzer = VisionAnalyzer(config)
+        self.detector = None  # Will be created when needed
+
+    def process_visual_content(
+        self, user_input: str, user_request: str = ""
+    ) -> Dict[str, Any]:
+        """
+        Process visual content in user input.
+
+        Args:
+            user_input: The user's input text that may contain visual content references
+            user_request: The original user request/context
+
+        Returns:
+            Dictionary containing analysis results and metadata
+        """
+        from .vision_detector import VisionContentDetector, DetectionConfidence
+
+        # Initialize detector if not already done
+        if self.detector is None:
+            self.detector = VisionContentDetector()
+
+        # Detect visual content
+        confidence, detected_items = self.detector.detect_visual_content(user_input)
+        result = {
+            "confidence": confidence.value,
+            "detected_items": detected_items,
+            "analysis_results": [],
+            "errors": [],
+        }
+
+        if confidence == DetectionConfidence.NONE:
+            result["message"] = "No visual content detected"
+            return result
+
+        # Extract image paths
+        image_paths = self.detector.extract_image_paths(user_input)
+        if not image_paths:
+            result["message"] = "Visual content detected but no valid image paths found"
+            result["errors"].append("No valid image paths found")
+            return result
+
+        # Determine analysis mode based on user request
+        analysis_mode = self._determine_analysis_mode(user_request, user_input)
+
+        # Process each image
+        for image_path in image_paths:
+            try:
+                # Handle URLs by downloading first (simplified - in practice would need download logic)
+                if image_path.startswith(("http://", "https://")):
+                    # In a real implementation, you'd download the URL to a temp file
+                    # For now, we'll assume local paths only
+                    result["errors"].append(
+                        f"URL handling not implemented: {image_path}"
+                    )
+                    continue
+
+                # Ensure path is absolute
+                if not os.path.isabs(image_path):
+                    # Try to resolve relative to current working directory
+                    image_path = os.path.join(os.getcwd(), image_path)
+
+                # Analyze the image
+                if analysis_mode == AnalysisMode.CUSTOM:
+                    # Use the user request as the custom question
+                    analysis_result = self.analyzer.analyze_with_mode(
+                        Path(image_path),
+                        "custom",
+                        user_request or "Please analyze this image.",
+                    )
+                else:
+                    analysis_result = self.analyzer.analyze_with_mode(
+                        Path(image_path), analysis_mode.value
+                    )
+
+                result["analysis_results"].append(
+                    {
+                        "image_path": image_path,
+                        "analysis_mode": analysis_mode.value,
+                        "result": analysis_result,
+                    }
+                )
+
+            except Exception as e:
+                error_msg = f"Failed to analyze {image_path}: {str(e)}"
+                result["errors"].append(error_msg)
+                print(f"Error: {error_msg}", file=sys.stderr)
+
+        return result
+
+    def _determine_analysis_mode(
+        self, user_request: str, user_input: str
+    ) -> AnalysisMode:
+        """
+        Determine the appropriate analysis mode based on user context.
+
+        Args:
+            user_request: The user's original request
+            user_input: The full input containing visual content references
+
+        Returns:
+            AnalysisMode enum value
+        """
+        combined_text = (user_request + " " + user_input).lower()
+
+        # Check for specific keywords to determine mode
+        if any(
+            word in combined_text for word in ["text", "文字", "ocr", "read", "识别"]
+        ):
+            return AnalysisMode.OCR
+        elif any(
+            word in combined_text
+            for word in ["chart", "graph", "plot", "图表", "数据", "趋势"]
+        ):
+            return AnalysisMode.CHART
+        elif any(
+            word in combined_text
+            for word in ["fashion", "服装", "穿搭", "style", "style"]
+        ):
+            return AnalysisMode.FASHION
+        elif any(word in combined_text for word in ["product", "产品", "商品", "item"]):
+            return AnalysisMode.PRODUCT
+        elif any(
+            word in combined_text
+            for word in ["scene", "场景", "环境", "location", "place"]
+        ):
+            return AnalysisMode.SCENE
+        else:
+            return AnalysisMode.DESCRIBE
+
+
+def main():
+    """Command line interface for testing."""
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Process visual content in user input")
+    parser.add_argument("input", help="User input containing visual content references")
+    parser.add_argument("--request", "-r", help="Original user request/context")
+    parser.add_argument("--output", "-o", help="Output file for results")
+
+    args = parser.parse_args()
+
+    try:
+        processor = VisionProcessor()
+        result = processor.process_visual_content(args.input, args.request or "")
+
+        import json
+
+        output = json.dumps(result, indent=2, ensure_ascii=False)
+
+        if args.output:
+            with open(args.output, "w", encoding="utf-8") as f:
+                f.write(output)
+            print(f"Results saved to: {args.output}")
+        else:
+            print(output)
+
+    except Exception as e:
+        print(f"Error: {e}", file=sys.stderr)
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()