Initial commit: skills library

- 70 skills with code and documentation - Add .gitignore (ignore __pycache__, output/, temp/, venv/) - Clean up test intermediates and caches
2026-04-26 19:27:40 +08:00
commit 04db423416
861 changed files with 210414 additions and 0 deletions
@@ -0,0 +1,167 @@
+#!/usr/bin/env python3
+"""
+Integration script for agent-vision-awareness skill
+
+This script demonstrates the complete workflow:
+1. Detect visual content in user input
+2. Extract image paths
+3. Analyze images using the vision analyzer
+4. Return structured results
+
+This replaces the problematic custom agent delegation approach.
+"""
+
+import os
+import sys
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+
+def process_user_input(
+    user_input: str, user_request: str = "", config: Dict[str, str] = None
+) -> Dict[str, Any]:
+    """
+    Process user input for visual content and return analysis results.
+
+    Args:
+        user_input: The user's input text that may contain visual content references
+        user_request: The original user request/context (optional)
+        config: Configuration for the vision analyzer (optional)
+
+    Returns:
+        Dictionary containing detection results and analysis
+    """
+    try:
+        # Import the detector and analyzer
+        from .vision_detector import VisionContentDetector, DetectionConfidence
+        from .standalone_vision_analyzer import StandaloneVisionAnalyzer
+
+        # Initialize components
+        detector = VisionContentDetector()
+        analyzer = StandaloneVisionAnalyzer(config)
+
+        # Step 1: Detect visual content
+        confidence, detected_items = detector.detect_visual_content(user_input)
+        result = {
+            "status": "success",
+            "confidence": confidence.value,
+            "detected_items": detected_items,
+            "analysis_results": [],
+            "errors": [],
+        }
+
+        if confidence == DetectionConfidence.NONE:
+            result["message"] = "No visual content detected"
+            return result
+
+        # Step 2: Extract image paths
+        image_paths = detector.extract_image_paths(user_input)
+        if not image_paths:
+            result["message"] = "Visual content detected but no valid image paths found"
+            result["errors"].append("No valid image paths found")
+            return result
+
+        # Step 3: Determine analysis mode
+        combined_text = (user_request + " " + user_input).lower()
+        if any(
+            word in combined_text for word in ["text", "文字", "ocr", "read", "识别"]
+        ):
+            mode = "ocr"
+        elif any(
+            word in combined_text
+            for word in ["chart", "graph", "plot", "图表", "数据", "趋势"]
+        ):
+            mode = "chart"
+        elif any(
+            word in combined_text for word in ["fashion", "服装", "穿搭", "style"]
+        ):
+            mode = "fashion"
+        elif any(word in combined_text for word in ["product", "产品", "商品", "item"]):
+            mode = "product"
+        elif any(
+            word in combined_text
+            for word in ["scene", "场景", "环境", "location", "place"]
+        ):
+            mode = "scene"
+        else:
+            mode = "describe"
+
+        # Step 4: Analyze each image
+        for image_path in image_paths:
+            try:
+                # Handle relative paths
+                if not os.path.isabs(image_path):
+                    image_path = os.path.join(os.getcwd(), image_path)
+
+                # Analyze the image
+                if mode == "custom":
+                    analysis_result = analyzer.analyze_with_mode(
+                        Path(image_path),
+                        "custom",
+                        user_request or "Please analyze this image.",
+                    )
+                else:
+                    analysis_result = analyzer.analyze_with_mode(Path(image_path), mode)
+
+                result["analysis_results"].append(
+                    {"image_path": image_path, "mode": mode, "result": analysis_result}
+                )
+
+            except Exception as e:
+                error_msg = f"Failed to analyze {image_path}: {str(e)}"
+                result["errors"].append(error_msg)
+                print(f"Error: {error_msg}", file=sys.stderr)
+
+        return result
+
+    except Exception as e:
+        return {
+            "status": "error",
+            "error": str(e),
+            "message": f"Processing failed: {str(e)}",
+        }
+
+
+def main():
+    """Command line interface for testing."""
+    import argparse
+    import json
+
+    parser = argparse.ArgumentParser(description="Process visual content in user input")
+    parser.add_argument("input", help="User input containing visual content references")
+    parser.add_argument("--request", "-r", help="Original user request/context")
+    parser.addiction_group = parser.add_mutually_exclusive_group()
+    parser.addiction_group.add_argument("--api-key", help="API key for vision service")
+    parser.addiction_group.add_argument("--config-file", help="Configuration file path")
+    parser.add_argument("--output", "-o", help="Output file for results")
+
+    args = parser.parse_args()
+
+    # Build configuration
+    config = {}
+    if args.api_key:
+        config["api_key"] = args.api_key
+        config["base_url"] = "https://ark.cn-beijing.volces.com/api/coding/v3"
+        config["model"] = "doubao-seed-code"
+    elif args.config_file:
+        import json
+
+        with open(args.config_file, "r", encoding="utf-8") as f:
+            config = json.load(f)
+
+    # Process the input
+    result = process_user_input(args.input, args.request or "", config)
+
+    # Output results
+    output = json.dumps(result, indent=2, ensure_ascii=False)
+
+    if args.output:
+        with open(args.output, "w", encoding="utf-8") as f:
+            f.write(output)
+        print(f"Results saved to: {args.output}")
+    else:
+        print(output)
+
+
+# Run the command line interface only when this file is executed directly,
+# not when it is imported as a module.
+if __name__ == "__main__":
+    main()