Initial commit: skills library

- 70 skills with code and documentation - Add .gitignore (ignore __pycache__, output/, temp/, venv/) - Clean up test intermediates and caches
2026-04-26 19:27:40 +08:00
commit 04db423416
861 changed files with 210414 additions and 0 deletions
@@ -0,0 +1,4 @@
+#!/usr/bin/env python3
+"""Example script - delete if not needed."""
+
+# Emit a fixed greeting; the script reads no input and does nothing else.
+print("Hello from skill!")
@@ -0,0 +1,167 @@
+#!/usr/bin/env python3
+"""
+Integration script for agent-vision-awareness skill
+
+This script demonstrates the complete workflow:
+1. Detect visual content in user input
+2. Extract image paths
+3. Analyze images using the vision analyzer
+4. Return structured results
+
+This replaces the problematic custom agent delegation approach.
+"""
+
+import sys
+import os
+from pathlib import Path
+from typing import Dict, Any, List
+
+
+# Ordered (mode, trigger keywords) table. The first mode owning a keyword that
+# occurs in the lower-cased request text wins, so the order encodes priority.
+_MODE_KEYWORDS = (
+    ("ocr", ("text", "文字", "ocr", "read", "识别")),
+    ("chart", ("chart", "graph", "plot", "图表", "数据", "趋势")),
+    ("fashion", ("fashion", "服装", "穿搭", "style")),
+    ("product", ("product", "产品", "商品", "item")),
+    ("scene", ("scene", "场景", "环境", "location", "place")),
+)
+
+
+def _select_analysis_mode(text: str) -> str:
+    """Return the first mode whose keyword is a substring of *text*, else "describe"."""
+    lowered = text.lower()
+    for mode, keywords in _MODE_KEYWORDS:
+        if any(keyword in lowered for keyword in keywords):
+            return mode
+    return "describe"
+
+
+def process_user_input(
+    user_input: str, user_request: str = "", config: Dict[str, str] = None
+) -> Dict[str, Any]:
+    """
+    Process user input for visual content and return analysis results.
+
+    Args:
+        user_input: The user's input text that may contain visual content references
+        user_request: The original user request/context (optional)
+        config: Configuration for the vision analyzer (optional)
+
+    Returns:
+        On success a dictionary with the keys ``status`` ("success"),
+        ``confidence``, ``detected_items``, ``analysis_results`` (one entry per
+        analyzed image), ``errors`` (per-image failure messages) and, when
+        processing stops early, ``message``.  If setup or detection raises, a
+        dictionary with ``status`` ("error"), ``error`` and ``message`` is
+        returned instead; this function never raises.
+    """
+    try:
+        # Import the detector and analyzer.  The relative form only works when
+        # this file is imported as part of a package; when it is executed as a
+        # script (see the __main__ guard) there is no parent package, so fall
+        # back to importing the sibling modules by their plain names.
+        try:
+            from .vision_detector import VisionContentDetector, DetectionConfidence
+            from .standalone_vision_analyzer import StandaloneVisionAnalyzer
+        except ImportError:
+            from vision_detector import VisionContentDetector, DetectionConfidence
+            from standalone_vision_analyzer import StandaloneVisionAnalyzer
+
+        # Initialize components
+        detector = VisionContentDetector()
+        analyzer = StandaloneVisionAnalyzer(config)
+
+        # Step 1: Detect visual content
+        confidence, detected_items = detector.detect_visual_content(user_input)
+        result = {
+            "status": "success",
+            "confidence": confidence.value,
+            "detected_items": detected_items,
+            "analysis_results": [],
+            "errors": [],
+        }
+
+        if confidence == DetectionConfidence.NONE:
+            result["message"] = "No visual content detected"
+            return result
+
+        # Step 2: Extract image paths
+        image_paths = detector.extract_image_paths(user_input)
+        if not image_paths:
+            result["message"] = "Visual content detected but no valid image paths found"
+            result["errors"].append("No valid image paths found")
+            return result
+
+        # Step 3: Determine analysis mode from the request plus the input text.
+        mode = _select_analysis_mode(user_request + " " + user_input)
+
+        # Step 4: Analyze each image; one failure must not abort the others.
+        for image_path in image_paths:
+            try:
+                # The analyzer is handed a filesystem Path, so a remote URL
+                # cannot be analyzed here.  Report that directly instead of
+                # prefixing the working directory onto the URL.
+                if image_path.lower().startswith(("http://", "https://")):
+                    raise ValueError(
+                        "remote image URLs are not supported; "
+                        "download the image and pass a local path"
+                    )
+
+                # Handle relative paths
+                if not os.path.isabs(image_path):
+                    image_path = os.path.join(os.getcwd(), image_path)
+
+                analysis_result = analyzer.analyze_with_mode(Path(image_path), mode)
+
+                result["analysis_results"].append(
+                    {"image_path": image_path, "mode": mode, "result": analysis_result}
+                )
+
+            except Exception as e:
+                error_msg = f"Failed to analyze {image_path}: {e}"
+                result["errors"].append(error_msg)
+                print(f"Error: {error_msg}", file=sys.stderr)
+
+        return result
+
+    except Exception as e:
+        # Top-level boundary: convert any setup/detection failure into a
+        # structured error so callers always receive a dictionary.
+        return {
+            "status": "error",
+            "error": str(e),
+            "message": f"Processing failed: {e}",
+        }
+
+
+def main():
+    """Command line interface for testing.
+
+    Parses the command line, builds the analyzer configuration from either
+    ``--api-key`` or ``--config-file`` (the two are mutually exclusive), runs
+    ``process_user_input`` and prints the JSON result or writes it to the file
+    given with ``--output``.
+    """
+    import argparse
+    import json
+
+    parser = argparse.ArgumentParser(description="Process visual content in user input")
+    parser.add_argument("input", help="User input containing visual content references")
+    parser.add_argument("--request", "-r", help="Original user request/context")
+    # Keep the group in a local variable rather than attaching an ad-hoc
+    # attribute to the parser object.
+    credentials = parser.add_mutually_exclusive_group()
+    credentials.add_argument("--api-key", help="API key for vision service")
+    credentials.add_argument("--config-file", help="Configuration file path")
+    parser.add_argument("--output", "-o", help="Output file for results")
+
+    args = parser.parse_args()
+
+    # Build configuration
+    config = {}
+    if args.api_key:
+        config["api_key"] = args.api_key
+        config["base_url"] = "https://ark.cn-beijing.volces.com/api/coding/v3"
+        config["model"] = "doubao-seed-code"
+    elif args.config_file:
+        with open(args.config_file, "r", encoding="utf-8") as f:
+            config = json.load(f)
+
+    # Process the input
+    result = process_user_input(args.input, args.request or "", config)
+
+    # Output results; ensure_ascii=False keeps non-ASCII text readable.
+    output = json.dumps(result, indent=2, ensure_ascii=False)
+
+    if args.output:
+        with open(args.output, "w", encoding="utf-8") as f:
+            f.write(output)
+        print(f"Results saved to: {args.output}")
+    else:
+        print(output)
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,216 @@
+#!/usr/bin/env python3
+"""
+Standalone Vision Analyzer - Simplified version for agent-vision-awareness skill
+
+This is a self-contained version of the vision analyzer that doesn't depend on
+the image-service skill structure, making it easier to integrate directly.
+"""
+
+import base64
+import json
+import os
+import sys
+from pathlib import Path
+from typing import Dict, Any, Optional
+import httpx
+
+
+class StandaloneVisionAnalyzer:
+    """Standalone vision analyzer using direct API calls.
+
+    Sends a base64-encoded local image plus a text prompt to a
+    ``/chat/completions`` endpoint and returns the text of the first choice.
+    """
+
+    # Predefined analysis modes
+    ANALYSIS_MODES = {
+        "describe": "请详细描述这张图片的内容，包括：人物、场景、物品、颜色、布局等所有细节。",
+        "ocr": "请仔细识别这张图片中的所有文字内容，按照文字在图片中的位置顺序输出。如果是中文，请保持原文输出。",
+        "chart": "请分析这张图表的内容，包括：图表类型、数据趋势、关键数据点、标题标签、以及数据的结论或洞察。",
+        "fashion": "请分析这张图片中人物的穿搭，包括：服装款式、颜色搭配、配饰、整体风格等。",
+        "product": "请分析这张产品图片，包括：产品类型、外观特征、功能特点、品牌信息等。",
+        "scene": "请描述这张图片的场景，包括：地点、环境、氛围、时间（白天/夜晚）等。",
+        "custom": "用户自定义问题",
+    }
+
+    # Defaults used when neither the config nor the environment sets a value.
+    DEFAULT_BASE_URL = "https://ark.cn-beijing.volces.com/api/coding/v3"
+    DEFAULT_MODEL = "doubao-seed-code"
+
+    def __init__(self, config: Optional[Dict[str, str]] = None):
+        """
+        Initialize the analyzer.
+
+        Values are resolved per key: an explicit non-empty entry in *config*
+        wins, otherwise the environment is consulted (see ``_load_config``),
+        otherwise the class default is used for ``base_url`` and ``model``.
+        No credential is embedded in the source: an API key must be supplied
+        by the caller or the environment.
+
+        Args:
+            config: Configuration dictionary with api_key, base_url, model.
+                ``VOLCENGINE_API_KEY`` is accepted as an alternative key name
+                for the API key.
+
+        Raises:
+            ValueError: If no API key (or no base URL) can be resolved.
+        """
+        settings = self._load_config()
+        if config:
+            # Empty/None entries in the caller's config must not mask values
+            # that the environment provides.
+            settings.update({key: value for key, value in config.items() if value})
+
+        self.api_key = settings.get("api_key") or settings.get("VOLCENGINE_API_KEY")
+        self.base_url = settings.get("base_url") or self.DEFAULT_BASE_URL
+        self.model = settings.get("model") or self.DEFAULT_MODEL
+
+        if not self.api_key or not self.base_url:
+            raise ValueError("Missing required API configuration: api_key and base_url")
+
+    def _load_config(self) -> Dict[str, Optional[str]]:
+        """Load configuration from environment variables.
+
+        Returns:
+            Dictionary with the keys ``api_key`` (from VOLCENGINE_API_KEY, else
+            DASHSCOPE_API_KEY), ``base_url`` (VISION_API_BASE_URL) and
+            ``model`` (VISION_MODEL); a value is None when its variable is unset.
+        """
+        return {
+            "api_key": os.environ.get("VOLCENGINE_API_KEY")
+            or os.environ.get("DASHSCOPE_API_KEY"),
+            "base_url": os.environ.get("VISION_API_BASE_URL"),
+            "model": os.environ.get("VISION_MODEL"),
+        }
+
+    def encode_image(self, image_path: Path) -> str:
+        """Return the file's bytes encoded as a base64 ASCII string."""
+        with open(image_path, "rb") as image_file:
+            return base64.b64encode(image_file.read()).decode("utf-8")
+
+    def analyze(self, image_path: Path, question: str) -> str:
+        """
+        Analyze image content.
+
+        Args:
+            image_path: Path to the image file
+            question: Question/prompt for analysis
+
+        Returns:
+            Analysis result text
+
+        Raises:
+            FileNotFoundError: If *image_path* does not exist.
+            ValueError: If the API answers 404 (wrong base_url) or 401 (bad key).
+            RuntimeError: For any other HTTP error or failure during the call.
+        """
+        # Local import: mimetypes is not among this module's top-level imports.
+        import mimetypes
+
+        if not image_path.exists():
+            raise FileNotFoundError(f"Image not found: {image_path}")
+
+        base64_image = self.encode_image(image_path)
+
+        # Label the data URL with the file's real image type; keep the former
+        # fixed "image/png" only as the fallback for unknown extensions.
+        mime_type, _ = mimetypes.guess_type(image_path.name)
+        if not mime_type or not mime_type.startswith("image/"):
+            mime_type = "image/png"
+
+        headers = {
+            "Authorization": f"Bearer {self.api_key}",
+            "Content-Type": "application/json",
+        }
+
+        payload = {
+            "model": self.model,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": question},
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:{mime_type};base64,{base64_image}"
+                            },
+                        },
+                    ],
+                }
+            ],
+            "max_tokens": 2000,
+        }
+
+        try:
+            with httpx.Client(timeout=30.0) as client:
+                response = client.post(
+                    f"{self.base_url}/chat/completions", headers=headers, json=payload
+                )
+                response.raise_for_status()
+                result = response.json()
+                return result["choices"][0]["message"]["content"]
+        except httpx.HTTPStatusError as e:
+            # Chain the original error so the HTTP details stay in the traceback.
+            status = e.response.status_code
+            if status == 404:
+                raise ValueError(
+                    f"API endpoint not found, check base_url: {self.base_url}"
+                ) from e
+            if status == 401:
+                raise ValueError("Invalid or expired API key") from e
+            raise RuntimeError(f"API request failed: {e}") from e
+        except Exception as e:
+            raise RuntimeError(f"Analysis failed: {e}") from e
+
+    def analyze_with_mode(
+        self,
+        image_path: Path,
+        mode: str = "describe",
+        custom_question: Optional[str] = None,
+    ) -> str:
+        """
+        Analyze image with predefined mode.
+
+        Args:
+            image_path: Path to the image file
+            mode: Analysis mode (describe, ocr, chart, fashion, product, scene, custom)
+            custom_question: Custom question for custom mode
+
+        Returns:
+            Analysis result text
+
+        Raises:
+            ValueError: If *mode* is unknown, or *mode* is "custom" and no
+                custom_question was given.
+        """
+        if mode not in self.ANALYSIS_MODES:
+            raise ValueError(
+                f"Unsupported mode: {mode}, available: {list(self.ANALYSIS_MODES.keys())}"
+            )
+
+        if mode == "custom":
+            if not custom_question:
+                raise ValueError("Custom mode requires custom_question parameter")
+            question = custom_question
+        else:
+            question = self.ANALYSIS_MODES[mode]
+
+        return self.analyze(image_path, question)
+
+
+def main():
+    """Command line entry point: analyze one image and print or save the text.
+
+    Exits with status 1 when the image is missing, when custom mode is chosen
+    without --question, or when the analysis raises.
+    """
+    import argparse
+
+    cli = argparse.ArgumentParser(description="Standalone Vision Analyzer")
+    cli.add_argument("image", help="Image path")
+    cli.add_argument(
+        "--mode",
+        "-m",
+        choices=["describe", "ocr", "chart", "fashion", "product", "scene", "custom"],
+        default="describe",
+        help="Analysis mode",
+    )
+    cli.add_argument("--question", "-q", help="Custom question for custom mode")
+    cli.add_argument("--output", "-o", help="Output file")
+    options = cli.parse_args()
+
+    target = Path(options.image)
+    if not target.exists():
+        print(f"Error: Image not found: {target}", file=sys.stderr)
+        sys.exit(1)
+
+    try:
+        analyzer = StandaloneVisionAnalyzer()
+
+        wants_custom = options.mode == "custom"
+        if wants_custom and not options.question:
+            print("Error: Custom mode requires --question parameter", file=sys.stderr)
+            # SystemExit is not an Exception subclass, so the handler below
+            # does not intercept it.
+            sys.exit(1)
+
+        if wants_custom:
+            text = analyzer.analyze_with_mode(target, "custom", options.question)
+        else:
+            text = analyzer.analyze_with_mode(target, options.mode)
+
+        if not options.output:
+            print("Analysis Result:")
+            print("-" * 50)
+            print(text)
+        else:
+            with open(options.output, "w", encoding="utf-8") as sink:
+                sink.write(text)
+            print(f"Result saved to: {options.output}")
+
+    except Exception as exc:
+        print(f"Error: {exc}", file=sys.stderr)
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,106 @@
+#!/usr/bin/env python3
+"""
+Test script for agent-vision-awareness skill
+
+This script tests the vision detection and processing capabilities.
+"""
+
+import os
+import sys
+from pathlib import Path
+
+
+def test_detection():
+    """Test visual content detection.
+
+    Runs the detector over a fixed table of (input, expected confidence)
+    pairs, printing one report per case.
+
+    Returns:
+        True only if every case produced its expected confidence.
+    """
+    print("Testing visual content detection...")
+
+    # The relative import needs a parent package; fall back to the plain
+    # module name so the file also works when executed directly as a script.
+    try:
+        from .vision_detector import VisionContentDetector, DetectionConfidence
+    except ImportError:
+        from vision_detector import VisionContentDetector, DetectionConfidence
+
+    detector = VisionContentDetector()
+
+    test_cases = [
+        ("帮我分析这个截图 error.png", DetectionConfidence.HIGH),
+        ("描述这张图片的内容", DetectionConfidence.LOW),
+        ("根据架构图 design/architecture.png 生成部署方案", DetectionConfidence.HIGH),
+        ("写一个 Python 脚本", DetectionConfidence.NONE),
+        ("![diagram](flow.png) 显示什么？", DetectionConfidence.HIGH),
+    ]
+
+    all_passed = True
+    for test_input, expected_confidence in test_cases:
+        confidence, items = detector.detect_visual_content(test_input)
+        matched = confidence == expected_confidence
+        # Remember any mismatch so the caller (and the exit status) sees it.
+        all_passed = all_passed and matched
+        status = "✅" if matched else "❌"
+        print(f"{status} Input: {test_input}")
+        print(f"   Expected: {expected_confidence.value}, Got: {confidence.value}")
+        if items:
+            print(f"   Detected: {items}")
+        print()
+
+    return all_passed
+
+
+def test_integration():
+    """Test integration with vision analyzer (if API key available).
+
+    Returns:
+        True if ``process_user_input`` reports status "success"; False if the
+        call fails, raises, or no API key is set in the environment (the test
+        is skipped in that case).
+    """
+    print("Testing vision integration...")
+
+    # Check if API key is available
+    api_key = os.environ.get("VOLCENGINE_API_KEY") or os.environ.get(
+        "DASHSCOPE_API_KEY"
+    )
+    if not api_key:
+        print("⚠️  No API key found. Skipping integration test.")
+        print(
+            "   Set VOLCENGINE_API_KEY or DASHSCOPE_API_KEY environment variable to test."
+        )
+        return False
+
+    try:
+        # The relative import needs a parent package; fall back to the plain
+        # module name so the file also works when executed directly as a script.
+        try:
+            from .integrate_vision import process_user_input
+        except ImportError:
+            from integrate_vision import process_user_input
+
+        # Test with a simple request (won't actually process image without file)
+        result = process_user_input(
+            "测试视觉处理",
+            "这是一个测试",
+            config={
+                "api_key": api_key,
+                "base_url": "https://ark.cn-beijing.volces.com/api/coding/v3",
+                "model": "doubao-seed-code",
+            },
+        )
+
+        if result["status"] == "success":
+            print("✅ Integration test passed (configuration valid)")
+            return True
+
+        print(f"❌ Integration test failed: {result.get('error', 'Unknown error')}")
+        return False
+
+    except Exception as e:
+        print(f"❌ Integration test failed: {e}")
+        return False
+
+
+def main():
+    """Run both test suites and report the combined outcome.
+
+    Returns:
+        The AND of the two suite results.
+    """
+    rule = "=" * 50
+    print("🧪 Testing Agent Vision Awareness Skill")
+    print(rule)
+
+    # Both suites always run; the results are combined with "&", which does
+    # not short-circuit.
+    detection_ok = test_detection()
+    integration_ok = test_integration()
+    success = True
+    success = success & detection_ok & integration_ok
+
+    print(rule)
+    print("✅ All tests passed!" if success else "⚠️  Some tests failed or were skipped.")
+
+    return success
+
+
+if __name__ == "__main__":
+    success = main()
+    # Exit status 0 on overall success, 1 otherwise.
+    sys.exit(1 if not success else 0)
@@ -0,0 +1,336 @@
+#!/usr/bin/env python3
+"""
+Vision Content Detector - Detects visual content in user input for agent-vision-awareness skill
+
+This script implements the detection logic described in the skill documentation,
+but integrates with the actual working vision processing implementation
+using direct API calls rather than custom agent delegation.
+"""
+
+import re
+from pathlib import Path
+from typing import List, Dict, Tuple, Optional
+from enum import Enum
+
+
+class DetectionConfidence(Enum):
+    """Coarse confidence that a user message refers to visual content."""
+
+    HIGH = "high"
+    MEDIUM = "medium"
+    LOW = "low"
+    NONE = "none"
+
+
+class VisionContentDetector:
+    """Detects visual content in user input based on various patterns.
+
+    Signals checked: image file names, Markdown image syntax, base64 image
+    data URLs, visual keywords (Chinese, English, technical) and image URLs.
+    """
+
+    # Image file extensions (matched case-insensitively)
+    IMAGE_EXTENSIONS = [
+        ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".webp", ".svg", ".ico",
+        ".tiff", ".tif", ".heic", ".heif", ".raw", ".psd", ".ai", ".eps",
+    ]
+
+    # Document files with potential visual content
+    DOCUMENT_EXTENSIONS = [".pdf", ".ppt", ".pptx", ".vsdx", ".drawio"]
+
+    # Chinese visual keywords
+    CHINESE_KEYWORDS = {
+        "high": [
+            "图片", "图像", "照片", "截图", "图表", "图示", "图形", "影像", "画面",
+        ],
+        "medium": [
+            "流程图", "架构图", "时序图", "ER 图", "思维导图", "柱状图", "饼图",
+            "折线图", "设计图", "原型图", "线框图", "界面", "UI", "UX", "表格",
+            "表单", "清单", "列表",
+        ],
+        "low": ["显示", "展示", "呈现", "可视化", "看图", "读图"],
+    }
+
+    # English visual keywords
+    ENGLISH_KEYWORDS = {
+        "high": [
+            "image", "photo", "picture", "screenshot", "snapshot", "capture",
+            "diagram", "chart", "graph", "plot", "figure",
+        ],
+        "medium": [
+            "flowchart", "architecture", "sequence diagram", "ER diagram",
+            "mind map", "bar chart", "pie chart", "line graph", "design",
+            "mockup", "wireframe", "interface", "UI", "UX", "layout", "table",
+            "form", "list", "grid",
+        ],
+        "low": ["show", "display", "visualize", "view", "look at", "see"],
+    }
+
+    # Technical visual keywords
+    TECHNICAL_KEYWORDS = [
+        "schema", "model", "blueprint", "spec", "technical drawing",
+        "dashboard", "widget", "panel", "visualization", "map", "heatmap",
+        "scatter plot", "histogram", "infographic", "poster", "banner",
+        "thumbnail",
+    ]
+
+    # Any http(s) URL; used to blank URLs out before searching for file paths
+    # so that fragments of a URL are not reported as local paths.
+    _URL_PATTERN = re.compile(r"https?://\S+", re.IGNORECASE)
+
+    # http(s) URL ending in a common image extension.  The lookahead stops
+    # "jpg" from matching as a prefix of "jpeg" and truncating the URL.
+    _URL_IMAGE_PATTERN = re.compile(
+        r"https?://[^\s]+?\.(?:png|jpg|jpeg|gif|bmp|webp)(?![A-Za-z0-9])",
+        re.IGNORECASE,
+    )
+
+    def __init__(self):
+        """Initialize the detector with compiled regex patterns."""
+        self._compile_patterns()
+
+    def _compile_patterns(self):
+        """Compile the instance regex patterns once, at construction time."""
+
+        def alternation(extensions):
+            # Extensions are stored with their leading dot.  Strip it: the
+            # patterns below already contain the literal "\." separator, and
+            # keeping both would require a double dot ("name..png").
+            return "|".join(re.escape(ext.lstrip(".")) for ext in extensions)
+
+        # File name/path ending in an image extension.  The trailing lookahead
+        # requires the extension to end there, so "photo.jpeg" is not cut down
+        # to "photo.jpg".
+        ext_pattern = alternation(self.IMAGE_EXTENSIONS)
+        self.file_ext_pattern = re.compile(
+            rf"[\w\-./]+?\.(?:{ext_pattern})(?![A-Za-z0-9])", re.IGNORECASE
+        )
+
+        # Markdown image syntax: ![alt](url)
+        self.markdown_img_pattern = re.compile(r"!\[([^\]]*)\]\(([^\)]+)\)")
+
+        # Base64 image data; the single group captures the image format.
+        self.base64_img_pattern = re.compile(
+            r"data:image\/(png|jpeg|gif|webp);base64,[A-Za-z0-9+/=]+"
+        )
+
+        # High-priority keyword followed (anywhere later) by a file reference
+        keyword_pattern = "|".join(
+            re.escape(k)
+            for k in self.CHINESE_KEYWORDS["high"] + self.ENGLISH_KEYWORDS["high"]
+        )
+        ext_pattern_short = alternation(self.IMAGE_EXTENSIONS[:7])  # Common ones
+        self.keyword_file_pattern = re.compile(
+            rf"({keyword_pattern}).*?[\w\-./]+\.(?:{ext_pattern_short})(?![A-Za-z0-9])",
+            re.IGNORECASE,
+        )
+
+    def _find_file_paths(self, user_input: str) -> List[str]:
+        """Return image file names/paths in *user_input*, ignoring URLs."""
+        without_urls = self._URL_PATTERN.sub(" ", user_input)
+        return self.file_ext_pattern.findall(without_urls)
+
+    def detect_visual_content(
+        self, user_input: str
+    ) -> Tuple[DetectionConfidence, List[str]]:
+        """
+        Detect visual content in user input and return confidence level and detected items.
+
+        Args:
+            user_input: The user's input text
+
+        Returns:
+            Tuple of (confidence_level, detected_items).  The level is derived
+            from the strongest single signal: >= 0.9 is HIGH, >= 0.6 is MEDIUM,
+            anything lower is LOW, and no signal at all is NONE.
+        """
+        detected_items = []
+        confidence_scores = []
+
+        # Check 1: File extensions
+        file_matches = self._find_file_paths(user_input)
+        if file_matches:
+            detected_items.extend(file_matches)
+            confidence_scores.append(0.9)  # High confidence
+
+        # Check 2: Markdown image syntax
+        markdown_matches = self.markdown_img_pattern.findall(user_input)
+        if markdown_matches:
+            detected_items.extend(f"{alt}:{url}" for alt, url in markdown_matches)
+            confidence_scores.append(0.9)  # High confidence
+
+        # Check 3: Base64 image data
+        base64_matches = self.base64_img_pattern.findall(user_input)
+        if base64_matches:
+            detected_items.extend(f"base64:{fmt}" for fmt in base64_matches)
+            confidence_scores.append(0.9)  # High confidence
+
+        # Check 4: Visual keywords (contribute a score but no detected item)
+        keyword_confidence = self._check_keywords(user_input)
+        if keyword_confidence > 0:
+            confidence_scores.append(keyword_confidence)
+
+        # Check 5: URL images
+        url_images = self._detect_url_images(user_input)
+        if url_images:
+            detected_items.extend(url_images)
+            confidence_scores.append(0.8)  # Medium-high confidence
+
+        # Determine overall confidence
+        if not confidence_scores:
+            return DetectionConfidence.NONE, []
+
+        max_confidence = max(confidence_scores)
+        if max_confidence >= 0.9:
+            return DetectionConfidence.HIGH, detected_items
+        if max_confidence >= 0.6:
+            return DetectionConfidence.MEDIUM, detected_items
+        return DetectionConfidence.LOW, detected_items
+
+    @staticmethod
+    def _contains_keyword(keyword: str, text_lower: str) -> bool:
+        """Return True if *keyword* occurs in the already lower-cased text.
+
+        Keywords written in lower case (and CJK keywords) are matched as plain
+        substrings.  Keywords containing upper-case letters ("UI", "UX",
+        "ER diagram") are compared case-insensitively but must not be glued to
+        other ASCII letters or digits, so "UI" does not fire on "build".
+        """
+        lowered = keyword.lower()
+        if lowered == keyword:
+            return keyword in text_lower
+        pattern = rf"(?<![a-z0-9]){re.escape(lowered)}(?![a-z0-9])"
+        return re.search(pattern, text_lower) is not None
+
+    def _check_keywords(self, user_input: str) -> float:
+        """Check for visual keywords and return confidence score.
+
+        Returns 0.8 for a high-priority keyword, 0.6 for a medium-priority or
+        technical keyword, 0.4 for a low-priority keyword and 0.0 otherwise.
+        """
+        input_lower = user_input.lower()
+
+        # Tiers are tried in this order; the first tier with a hit decides.
+        tiers = (
+            (self.CHINESE_KEYWORDS["high"] + self.ENGLISH_KEYWORDS["high"], 0.8),
+            (self.CHINESE_KEYWORDS["medium"] + self.ENGLISH_KEYWORDS["medium"], 0.6),
+            (self.TECHNICAL_KEYWORDS, 0.6),
+            (self.CHINESE_KEYWORDS["low"] + self.ENGLISH_KEYWORDS["low"], 0.4),
+        )
+        for keywords, score in tiers:
+            if any(self._contains_keyword(k, input_lower) for k in keywords):
+                return score
+
+        return 0.0
+
+    def _detect_url_images(self, user_input: str) -> List[str]:
+        """Detect image URLs in the input."""
+        return self._URL_IMAGE_PATTERN.findall(user_input)
+
+    def extract_image_paths(self, user_input: str) -> List[str]:
+        """
+        Extract actual image paths/URLs from user input.
+
+        Returns:
+            List of image paths or URLs, de-duplicated in first-seen order
+        """
+        image_paths = []
+
+        # File paths with extensions
+        image_paths.extend(self._find_file_paths(user_input))
+
+        # Markdown image URLs
+        image_paths.extend(
+            url for _alt, url in self.markdown_img_pattern.findall(user_input)
+        )
+
+        # Direct URLs
+        image_paths.extend(self._detect_url_images(user_input))
+
+        # Remove duplicates while preserving order (dicts keep insertion order)
+        return list(dict.fromkeys(image_paths))
+
+
+def main():
+    """Command line entry point for trying the detector on one input string.
+
+    With --extract-paths only the extracted paths are printed, one per line;
+    otherwise the confidence and any detected items are reported.
+    """
+    import argparse
+    import sys
+
+    cli = argparse.ArgumentParser(description="Detect visual content in user input")
+    cli.add_argument("input", help="User input to analyze")
+    cli.add_argument(
+        "--extract-paths",
+        action="store_true",
+        help="Extract and return image paths only",
+    )
+    options = cli.parse_args()
+
+    detector = VisionContentDetector()
+
+    if options.extract_paths:
+        for found in detector.extract_image_paths(options.input):
+            print(found)
+        return
+
+    level, detected = detector.detect_visual_content(options.input)
+    print(f"Confidence: {level.value}")
+    if not detected:
+        print("No visual content detected")
+        return
+
+    print("Detected items:")
+    for entry in detected:
+        print(f"  - {entry}")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,82 @@
+# -*- coding: utf-8 -*-
+import sys
+# Replace characters the console encoding cannot represent instead of raising.
+sys.stdout.reconfigure(errors='replace')
+sys.stderr.reconfigure(errors='replace')
+import os
+# NOTE(review): this is set after the interpreter has started, so it presumably
+# only matters for child processes that inherit the environment - confirm.
+os.environ['PYTHONIOENCODING'] = 'utf-8'
+import base64
+import time
+import tempfile
+from pathlib import Path
+from openai import OpenAI
+
+# Shared temporary directory; created at import time if it does not exist.
+# NOTE(review): hard-coded, machine-specific Windows path.
+TEMP_DIR = r'D:\F\NewI\opencode\daily-workspace\temp'
+os.makedirs(TEMP_DIR, exist_ok=True)
+
+# Read the Volcengine Ark API key and base URL from the OpenCode config file.
+# NOTE(review): hard-coded, user-specific path; importing this module raises if
+# the file is missing or lacks the provider.volcengine.options entries.
+CONFIG_PATH = r'C:\Users\hmo\.config\opencode\config.json'
+import json
+with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
+    config = json.load(f)
+API_KEY = config['provider']['volcengine']['options']['apiKey']
+BASE_URL = config['provider']['volcengine']['options']['baseURL']
+
+# Module-level client and model name used by analyze_image().
+client = OpenAI(base_url=BASE_URL, api_key=API_KEY)
+MODEL = 'doubao-seed-2.0-pro'
+
+def analyze_image(image_path_or_url, prompt="详细描述这张图片的内容"):
+    """
+    Analyze an image; accepts a local path or an http/https URL.
+
+    A URL is passed to the API unchanged.  A local file is read and embedded
+    as a base64 data URL whose MIME type is derived from the file name.
+
+    :param image_path_or_url: local image path or http(s) URL
+    :param prompt: text prompt sent together with the image
+    :return: the model's answer, or an error message string when the file is
+        missing or the request fails (this function does not raise)
+    """
+    # Local import: mimetypes is not among this module's top-level imports.
+    import mimetypes
+
+    try:
+        if image_path_or_url.lower().startswith(('http://', 'https://')):
+            # Remote image: hand the URL to the API as-is.
+            image_url = image_path_or_url
+        else:
+            # Local image
+            image_path = Path(image_path_or_url)
+            if not image_path.exists():
+                return f"错误：图片不存在 {image_path}"
+
+            # Use the registered MIME type (".jpg" -> "image/jpeg") instead of
+            # pasting the raw suffix, which produced invalid types such as
+            # "image/jpg", "image/PNG" or "image/" for a file without suffix.
+            mime_type, _ = mimetypes.guess_type(image_path.name)
+            if not mime_type or not mime_type.startswith("image/"):
+                suffix = image_path.suffix.lstrip('.').lower()
+                mime_type = f"image/{suffix or 'png'}"
+
+            # Embed the file as base64
+            with open(image_path, 'rb') as f:
+                image_base64 = base64.b64encode(f.read()).decode('utf-8')
+            image_url = f"data:{mime_type};base64,{image_base64}"
+
+        # Call the API
+        response = client.chat.completions.create(
+            model=MODEL,
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": prompt},
+                        {"type": "image_url", "image_url": {"url": image_url}}
+                    ]
+                }
+            ],
+            max_tokens=1000
+        )
+        return response.choices[0].message.content
+
+    except Exception as e:
+        # Best-effort CLI helper: report the failure as text rather than raise.
+        return f"图片识别失败：{type(e).__name__}: {str(e)}"
+
+if __name__ == "__main__":
+    # Usage: <image path or URL> [prompt]; the prompt defaults to a generic
+    # "describe this image" request.
+    cli_args = sys.argv[1:]
+    if not cli_args:
+        print("用法：python vision_direct.py <图片路径/URL> [提示词]")
+        sys.exit(1)
+
+    image_path = cli_args[0]
+    prompt = "详细描述这张图片的内容" if len(cli_args) < 2 else cli_args[1]
+
+    result = analyze_image(image_path, prompt)
+    print(result)
+
+    # Also keep a copy of the answer in a timestamped file under TEMP_DIR.
+    output_file = os.path.join(TEMP_DIR, f"vision_result_{int(time.time())}.txt")
+    with open(output_file, 'w', encoding='utf-8') as f:
+        f.write(result)
@@ -0,0 +1,215 @@
+#!/usr/bin/env python3
+"""
+Vision Processor - Integrates with image-service vision analyzer for agent-vision-awareness skill
+
+This script provides the actual implementation that replaces the problematic
+custom agent delegation approach described in the outdated documentation.
+"""
+
+import sys
+import os
+from pathlib import Path
+from typing import Dict, Any, Optional, List
+from enum import Enum
+
+# Make the image-service scripts importable so their vision analyzer can be
+# reused: the directory three `.parent` steps up from this file, joined with
+# image-service/scripts, is appended to the module search path.
+sys.path.append(str(Path(__file__).parent.parent.parent / "image-service" / "scripts"))
+
+try:
+    # Preferred: absolute import found via sys.path (including the entry above).
+    from vision_analyzer import VisionAnalyzer
+except ImportError:
+    # Fallback: a vision_analyzer module located in this module's own package.
+    # NOTE: a relative import only resolves when this file is imported as part
+    # of a package; when the file is run as a plain script it raises
+    # ImportError as well and the error below is reported.
+    try:
+        from .vision_analyzer import VisionAnalyzer
+    except ImportError:
+        # Neither location worked: fail at import time with an actionable hint.
+        raise ImportError(
+            "Cannot find VisionAnalyzer. Please ensure image-service is properly installed."
+        )
+
+
+class AnalysisMode(Enum):
+    """Analysis modes that can be requested from the vision analyzer.
+
+    Each member's value is the mode string handed to ``analyze_with_mode``.
+    """
+
+    # General description; the fallback when no keyword selects another mode.
+    DESCRIBE = "describe"
+    # Text extraction.
+    OCR = "ocr"
+    # Charts, graphs and plots.
+    CHART = "chart"
+    # Clothing and style.
+    FASHION = "fashion"
+    # Products and items.
+    PRODUCT = "product"
+    # Scenes, places and environments.
+    SCENE = "scene"
+    # Free-form question supplied by the caller.
+    CUSTOM = "custom"
+
+
+class VisionProcessor:
+    """Main vision processing class that integrates detection and analysis."""
+
+    def __init__(self, config: Optional[Dict[str, str]] = None):
+        """
+        Initialize the vision processor.
+
+        Args:
+            config: Configuration dictionary for the VisionAnalyzer
+        """
+        self.analyzer = VisionAnalyzer(config)
+        self.detector = None  # Will be created when needed
+
+    def process_visual_content(
+        self, user_input: str, user_request: str = ""
+    ) -> Dict[str, Any]:
+        """
+        Process visual content in user input.
+
+        Args:
+            user_input: The user's input text that may contain visual content references
+            user_request: The original user request/context
+
+        Returns:
+            Dictionary containing analysis results and metadata
+        """
+        from .vision_detector import VisionContentDetector, DetectionConfidence
+
+        # Initialize detector if not already done
+        if self.detector is None:
+            self.detector = VisionContentDetector()
+
+        # Detect visual content
+        confidence, detected_items = self.detector.detect_visual_content(user_input)
+        result = {
+            "confidence": confidence.value,
+            "detected_items": detected_items,
+            "analysis_results": [],
+            "errors": [],
+        }
+
+        if confidence == DetectionConfidence.NONE:
+            result["message"] = "No visual content detected"
+            return result
+
+        # Extract image paths
+        image_paths = self.detector.extract_image_paths(user_input)
+        if not image_paths:
+            result["message"] = "Visual content detected but no valid image paths found"
+            result["errors"].append("No valid image paths found")
+            return result
+
+        # Determine analysis mode based on user request
+        analysis_mode = self._determine_analysis_mode(user_request, user_input)
+
+        # Process each image
+        for image_path in image_paths:
+            try:
+                # Handle URLs by downloading first (simplified - in practice would need download logic)
+                if image_path.startswith(("http://", "https://")):
+                    # In a real implementation, you'd download the URL to a temp file
+                    # For now, we'll assume local paths only
+                    result["errors"].append(
+                        f"URL handling not implemented: {image_path}"
+                    )
+                    continue
+
+                # Ensure path is absolute
+                if not os.path.isabs(image_path):
+                    # Try to resolve relative to current working directory
+                    image_path = os.path.join(os.getcwd(), image_path)
+
+                # Analyze the image
+                if analysis_mode == AnalysisMode.CUSTOM:
+                    # Use the user request as the custom question
+                    analysis_result = self.analyzer.analyze_with_mode(
+                        Path(image_path),
+                        "custom",
+                        user_request or "Please analyze this image.",
+                    )
+                else:
+                    analysis_result = self.analyzer.analyze_with_mode(
+                        Path(image_path), analysis_mode.value
+                    )
+
+                result["analysis_results"].append(
+                    {
+                        "image_path": image_path,
+                        "analysis_mode": analysis_mode.value,
+                        "result": analysis_result,
+                    }
+                )
+
+            except Exception as e:
+                error_msg = f"Failed to analyze {image_path}: {str(e)}"
+                result["errors"].append(error_msg)
+                print(f"Error: {error_msg}", file=sys.stderr)
+
+        return result
+
+    def _determine_analysis_mode(
+        self, user_request: str, user_input: str
+    ) -> AnalysisMode:
+        """
+        Determine the appropriate analysis mode based on user context.
+
+        Args:
+            user_request: The user's original request
+            user_input: The full input containing visual content references
+
+        Returns:
+            AnalysisMode enum value
+        """
+        combined_text = (user_request + " " + user_input).lower()
+
+        # Check for specific keywords to determine mode
+        if any(
+            word in combined_text for word in ["text", "文字", "ocr", "read", "识别"]
+        ):
+            return AnalysisMode.OCR
+        elif any(
+            word in combined_text
+            for word in ["chart", "graph", "plot", "图表", "数据", "趋势"]
+        ):
+            return AnalysisMode.CHART
+        elif any(
+            word in combined_text
+            for word in ["fashion", "服装", "穿搭", "style", "style"]
+        ):
+            return AnalysisMode.FASHION
+        elif any(word in combined_text for word in ["product", "产品", "商品", "item"]):
+            return AnalysisMode.PRODUCT
+        elif any(
+            word in combined_text
+            for word in ["scene", "场景", "环境", "location", "place"]
+        ):
+            return AnalysisMode.SCENE
+        else:
+            return AnalysisMode.DESCRIBE
+
+
+def main():
+    """Command line interface for testing."""
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Process visual content in user input")
+    parser.add_argument("input", help="User input containing visual content references")
+    parser.add_argument("--request", "-r", help="Original user request/context")
+    parser.add_argument("--output", "-o", help="Output file for results")
+
+    args = parser.parse_args()
+
+    try:
+        processor = VisionProcessor()
+        result = processor.process_visual_content(args.input, args.request or "")
+
+        import json
+
+        output = json.dumps(result, indent=2, ensure_ascii=False)
+
+        if args.output:
+            with open(args.output, "w", encoding="utf-8") as f:
+                f.write(output)
+            print(f"Results saved to: {args.output}")
+        else:
+            print(output)
+
+    except Exception as e:
+        print(f"Error: {e}", file=sys.stderr)
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()