Initial commit: skills library

- 70 skills with code and documentation - Add .gitignore (ignore __pycache__, output/, temp/, venv/) - Clean up test intermediates and caches
2026-04-26 19:27:40 +08:00
commit 04db423416
861 changed files with 210414 additions and 0 deletions
@@ -0,0 +1,336 @@
+#!/usr/bin/env python3
+"""
+Vision Content Detector - Detects visual content in user input for agent-vision-awareness skill
+
+This script implements the detection logic described in the skill documentation,
+but integrates with the actual working vision processing implementation
+using direct API calls rather than custom agent delegation.
+"""
+
+import re
+from pathlib import Path
+from typing import List, Dict, Tuple, Optional
+from enum import Enum
+
+
+class DetectionConfidence(Enum):
+    """Confidence level returned by ``VisionContentDetector.detect_visual_content``."""
+
+    # Strongest signal scored at least 0.9 in detect_visual_content.
+    HIGH = "high"
+    # Strongest signal scored at least 0.6 but below 0.9.
+    MEDIUM = "medium"
+    # A signal was found, but its score is below 0.6.
+    LOW = "low"
+    # No signal of any kind was found.
+    NONE = "none"
+
+
+class VisionContentDetector:
+    """Detects visual content in user input based on various patterns."""
+
+    # Image file extensions (case-insensitive)
+    IMAGE_EXTENSIONS = [
+        ".png",
+        ".jpg",
+        ".jpeg",
+        ".gif",
+        ".bmp",
+        ".webp",
+        ".svg",
+        ".ico",
+        ".tiff",
+        ".tif",
+        ".heic",
+        ".heif",
+        ".raw",
+        ".psd",
+        ".ai",
+        ".eps",
+    ]
+
+    # Document files with potential visual content
+    DOCUMENT_EXTENSIONS = [".pdf", ".ppt", ".pptx", ".vsdx", ".drawio"]
+
+    # Chinese visual keywords
+    CHINESE_KEYWORDS = {
+        "high": [
+            "图片",
+            "图像",
+            "照片",
+            "截图",
+            "图表",
+            "图示",
+            "图形",
+            "影像",
+            "画面",
+        ],
+        "medium": [
+            "流程图",
+            "架构图",
+            "时序图",
+            "ER 图",
+            "思维导图",
+            "柱状图",
+            "饼图",
+            "折线图",
+            "设计图",
+            "原型图",
+            "线框图",
+            "界面",
+            "UI",
+            "UX",
+            "表格",
+            "表单",
+            "清单",
+            "列表",
+        ],
+        "low": ["显示", "展示", "呈现", "可视化", "看图", "读图"],
+    }
+
+    # English visual keywords
+    ENGLISH_KEYWORDS = {
+        "high": [
+            "image",
+            "photo",
+            "picture",
+            "screenshot",
+            "snapshot",
+            "capture",
+            "diagram",
+            "chart",
+            "graph",
+            "plot",
+            "figure",
+        ],
+        "medium": [
+            "flowchart",
+            "architecture",
+            "sequence diagram",
+            "ER diagram",
+            "mind map",
+            "bar chart",
+            "pie chart",
+            "line graph",
+            "design",
+            "mockup",
+            "wireframe",
+            "interface",
+            "UI",
+            "UX",
+            "layout",
+            "table",
+            "form",
+            "list",
+            "grid",
+        ],
+        "low": ["show", "display", "visualize", "view", "look at", "see"],
+    }
+
+    # Technical visual keywords
+    TECHNICAL_KEYWORDS = [
+        "schema",
+        "model",
+        "blueprint",
+        "spec",
+        "technical drawing",
+        "dashboard",
+        "widget",
+        "panel",
+        "visualization",
+        "map",
+        "heatmap",
+        "scatter plot",
+        "histogram",
+        "infographic",
+        "poster",
+        "banner",
+        "thumbnail",
+    ]
+
+    def __init__(self):
+        """Initialize the detector with compiled regex patterns."""
+        self._compile_patterns()
+
+    def _compile_patterns(self):
+        """Compile regex patterns for performance."""
+        # File extension pattern
+        ext_pattern = "|".join(re.escape(ext) for ext in self.IMAGE_EXTENSIONS)
+        self.file_ext_pattern = re.compile(
+            rf"[\w\-\.\/]+?\.(?:{ext_pattern})", re.IGNORECASE
+        )
+
+        # Markdown image syntax
+        self.markdown_img_pattern = re.compile(r"!\[([^\]]*)\]\(([^\)]+)\)")
+
+        # Base64 image data
+        self.base64_img_pattern = re.compile(
+            r"data:image\/(png|jpeg|gif|webp);base64,[A-Za-z0-9+/=]+"
+        )
+
+        # Keyword + file reference
+        keyword_pattern = "|".join(
+            [
+                re.escape(k)
+                for k in self.CHINESE_KEYWORDS["high"] + self.ENGLISH_KEYWORDS["high"]
+            ]
+        )
+        ext_pattern_short = "|".join(
+            re.escape(ext) for ext in self.IMAGE_EXTENSIONS[:7]
+        )  # Common ones
+        self.keyword_file_pattern = re.compile(
+            rf"({keyword_pattern}).*?[\w\-\.\/]+\.(?:{ext_pattern_short})",
+            re.IGNORECASE,
+        )
+
+    def detect_visual_content(
+        self, user_input: str
+    ) -> Tuple[DetectionConfidence, List[str]]:
+        """
+        Detect visual content in user input and return confidence level and detected items.
+
+        Args:
+            user_input: The user's input text
+
+        Returns:
+            Tuple of (confidence_level, detected_items)
+        """
+        detected_items = []
+        confidence_scores = []
+
+        # Check 1: File extensions
+        file_matches = self.file_ext_pattern.findall(user_input)
+        if file_matches:
+            detected_items.extend(file_matches)
+            confidence_scores.append(0.9)  # High confidence
+
+        # Check 2: Markdown image syntax
+        markdown_matches = self.markdown_img_pattern.findall(user_input)
+        if markdown_matches:
+            detected_items.extend([f"{alt}:{url}" for alt, url in markdown_matches])
+            confidence_scores.append(0.9)  # High confidence
+
+        # Check 3: Base64 image data
+        base64_matches = self.base64_img_pattern.findall(user_input)
+        if base64_matches:
+            detected_items.extend([f"base64:{fmt}" for fmt in base64_matches])
+            confidence_scores.append(0.9)  # High confidence
+
+        # Check 4: Visual keywords
+        keyword_confidence = self._check_keywords(user_input)
+        if keyword_confidence > 0:
+            confidence_scores.append(keyword_confidence)
+
+        # Check 5: URL images
+        url_images = self._detect_url_images(user_input)
+        if url_images:
+            detected_items.extend(url_images)
+            confidence_scores.append(0.8)  # Medium-high confidence
+
+        # Determine overall confidence
+        if not confidence_scores:
+            return DetectionConfidence.NONE, []
+
+        max_confidence = max(confidence_scores)
+        if max_confidence >= 0.9:
+            return DetectionConfidence.HIGH, detected_items
+        elif max_confidence >= 0.6:
+            return DetectionConfidence.MEDIUM, detected_items
+        else:
+            return DetectionConfidence.LOW, detected_items
+
+    def _check_keywords(self, user_input: str) -> float:
+        """Check for visual keywords and return confidence score."""
+        input_lower = user_input.lower()
+
+        # Check high priority keywords
+        for keyword in self.CHINESE_KEYWORDS["high"] + self.ENGLISH_KEYWORDS["high"]:
+            if keyword in input_lower:
+                return 0.8
+
+        # Check medium priority keywords
+        for keyword in (
+            self.CHINESE_KEYWORDS["medium"] + self.ENGLISH_KEYWORDS["medium"]
+        ):
+            if keyword in input_lower:
+                return 0.6
+
+        # Check technical keywords
+        for keyword in self.TECHNICAL_KEYWORDS:
+            if keyword.lower() in input_lower:
+                return 0.6
+
+        # Check low priority keywords
+        for keyword in self.CHINESE_KEYWORDS["low"] + self.ENGLISH_KEYWORDS["low"]:
+            if keyword in input_lower:
+                return 0.4
+
+        return 0.0
+
+    def _detect_url_images(self, user_input: str) -> List[str]:
+        """Detect image URLs in the input."""
+        url_pattern = re.compile(
+            r"https?://[^\s]+?\.(?:png|jpg|jpeg|gif|bmp|webp)", re.IGNORECASE
+        )
+        return url_pattern.findall(user_input)
+
+    def extract_image_paths(self, user_input: str) -> List[str]:
+        """
+        Extract actual image paths/URLs from user input.
+
+        Returns:
+            List of image paths or URLs
+        """
+        image_paths = []
+
+        # File paths with extensions
+        file_matches = self.file_ext_pattern.findall(user_input)
+        image_paths.extend(file_matches)
+
+        # Markdown image URLs
+        markdown_matches = self.markdown_img_pattern.findall(user_input)
+        image_paths.extend([url for alt, url in markdown_matches])
+
+        # Direct URLs
+        url_images = self._detect_url_images(user_input)
+        image_paths.extend(url_images)
+
+        # Remove duplicates while preserving order
+        seen = set()
+        unique_paths = []
+        for path in image_paths:
+            if path not in seen:
+                unique_paths.append(path)
+                seen.add(path)
+
+        return unique_paths
+
+
+def main():
+    """Command line interface for testing."""
+    import argparse
+    import sys
+
+    parser = argparse.ArgumentParser(description="Detect visual content in user input")
+    parser.add_argument("input", help="User input to analyze")
+    parser.add_argument(
+        "--extract-paths",
+        action="store_true",
+        help="Extract and return image paths only",
+    )
+
+    args = parser.parse_args()
+
+    detector = VisionContentDetector()
+
+    if args.extract_paths:
+        paths = detector.extract_image_paths(args.input)
+        for path in paths:
+            print(path)
+    else:
+        confidence, items = detector.detect_visual_content(args.input)
+        print(f"Confidence: {confidence.value}")
+        if items:
+            print("Detected items:")
+            for item in items:
+                print(f"  - {item}")
+        else:
+            print("No visual content detected")
+
+
+# Run the CLI only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()