# skills/agent-vision-awareness/scripts/vision_detector.py

#!/usr/bin/env python3
"""
Vision Content Detector - Detects visual content in user input for agent-vision-awareness skill

This script implements the detection logic described in the skill documentation,
but integrates with the actual working vision processing implementation
using direct API calls rather than custom agent delegation.
"""

import re
from pathlib import Path
from typing import List, Dict, Tuple, Optional
from enum import Enum


class DetectionConfidence(Enum):
    """Overall confidence that a piece of user input refers to visual content.

    Returned as the first element of the tuple produced by
    ``VisionContentDetector.detect_visual_content``. The member values are the
    lowercase strings printed by the command line interface.
    """

    # Best detection score is at least 0.9.
    HIGH = "high"
    # Best detection score is at least 0.6 but below 0.9.
    MEDIUM = "medium"
    # Something was detected, but its best score is below 0.6.
    LOW = "low"
    # No detection check produced a score.
    NONE = "none"


class VisionContentDetector:
    """Detect references to visual content in free-form user input.

    Five independent checks are combined: local image file paths, Markdown
    image syntax, base64 ``data:image`` URIs, visual keywords (Chinese,
    English and technical terms) and http(s) image URLs. Each check that
    fires contributes a score; the highest score decides the overall
    ``DetectionConfidence``.
    """

    # Image file extensions (matched case-insensitively).
    # NOTE: ".ai" also matches bare host names such as "example.ai" when they
    # appear outside an http(s) URL.
    IMAGE_EXTENSIONS = [
        ".png",
        ".jpg",
        ".jpeg",
        ".gif",
        ".bmp",
        ".webp",
        ".svg",
        ".ico",
        ".tiff",
        ".tif",
        ".heic",
        ".heif",
        ".raw",
        ".psd",
        ".ai",
        ".eps",
    ]

    # Document files with potential visual content
    DOCUMENT_EXTENSIONS = [".pdf", ".ppt", ".pptx", ".vsdx", ".drawio"]

    # Chinese visual keywords
    CHINESE_KEYWORDS = {
        "high": [
            "图片",
            "图像",
            "照片",
            "截图",
            "图表",
            "图示",
            "图形",
            "影像",
            "画面",
        ],
        "medium": [
            "流程图",
            "架构图",
            "时序图",
            "ER 图",
            "思维导图",
            "柱状图",
            "饼图",
            "折线图",
            "设计图",
            "原型图",
            "线框图",
            "界面",
            "UI",
            "UX",
            "表格",
            "表单",
            "清单",
            "列表",
        ],
        "low": ["显示", "展示", "呈现", "可视化", "看图", "读图"],
    }

    # English visual keywords
    ENGLISH_KEYWORDS = {
        "high": [
            "image",
            "photo",
            "picture",
            "screenshot",
            "snapshot",
            "capture",
            "diagram",
            "chart",
            "graph",
            "plot",
            "figure",
        ],
        "medium": [
            "flowchart",
            "architecture",
            "sequence diagram",
            "ER diagram",
            "mind map",
            "bar chart",
            "pie chart",
            "line graph",
            "design",
            "mockup",
            "wireframe",
            "interface",
            "UI",
            "UX",
            "layout",
            "table",
            "form",
            "list",
            "grid",
        ],
        "low": ["show", "display", "visualize", "view", "look at", "see"],
    }

    # Technical visual keywords
    TECHNICAL_KEYWORDS = [
        "schema",
        "model",
        "blueprint",
        "spec",
        "technical drawing",
        "dashboard",
        "widget",
        "panel",
        "visualization",
        "map",
        "heatmap",
        "scatter plot",
        "histogram",
        "infographic",
        "poster",
        "banner",
        "thumbnail",
    ]

    # Any http(s) URL. Used to mask URLs before scanning for local file paths
    # so that fragments such as "//host/pic.png" are not reported as files.
    _URL_PATTERN = re.compile(r"https?://\S+", re.IGNORECASE)

    # http(s) URL up to and including the first common image extension. The
    # trailing lookahead stops ".gif" from matching inside "my.gifts.com".
    _URL_IMAGE_PATTERN = re.compile(
        r"https?://[^\s]+?\.(?:png|jpg|jpeg|gif|bmp|webp)(?![A-Za-z0-9])",
        re.IGNORECASE,
    )

    def __init__(self):
        """Initialize the detector with compiled regex patterns."""
        self._compile_patterns()

    @staticmethod
    def _build_keyword_matcher(keywords):
        """Split *keywords* into a whole-word regex and plain substrings.

        ASCII keywords are compiled into one regex that must be delimited by
        non-letters and may carry a plural "s", so "form" no longer fires on
        "information". Non-ASCII (Chinese or mixed) keywords have no word
        delimiters and stay plain substrings. Everything is lowercased
        because matching runs against lowercased input.

        Returns:
            ``(pattern_or_None, substrings)``.
        """
        ascii_words = [k.lower() for k in keywords if k.isascii()]
        substrings = tuple(k.lower() for k in keywords if not k.isascii())
        pattern = None
        if ascii_words:
            # Longest first so multi-word phrases win over their own parts.
            ordered = sorted(ascii_words, key=len, reverse=True)
            alternation = "|".join(re.escape(word) for word in ordered)
            pattern = re.compile(rf"(?<![a-z])(?:{alternation})s?(?![a-z])")
        return pattern, substrings

    def _compile_patterns(self):
        """Compile regex patterns for performance."""
        # The extension must not be followed by another ASCII letter or digit,
        # otherwise ".ai" would match inside "www.airbnb.com".
        ext_end = r"(?![A-Za-z0-9])"

        # IMAGE_EXTENSIONS entries already carry the leading dot and the regex
        # supplies its own "\.", so strip the dot here. Keeping both required
        # a double dot ("name..png") and no ordinary file name ever matched.
        all_exts = "|".join(
            re.escape(ext.lstrip(".")) for ext in self.IMAGE_EXTENSIONS
        )
        self.file_ext_pattern = re.compile(
            rf"[\w\-\.\/]+?\.(?:{all_exts}){ext_end}", re.IGNORECASE
        )

        # Markdown image syntax: group 1 is the alt text, group 2 the target
        self.markdown_img_pattern = re.compile(r"!\[([^\]]*)\]\(([^\)]+)\)")

        # Base64 image data: group 1 is the image format
        self.base64_img_pattern = re.compile(
            r"data:image\/(png|jpeg|gif|webp);base64,[A-Za-z0-9+/=]+"
        )

        # High-priority keyword followed (anywhere later) by a file reference
        high_keywords = self.CHINESE_KEYWORDS["high"] + self.ENGLISH_KEYWORDS["high"]
        keyword_pattern = "|".join(re.escape(k) for k in high_keywords)
        common_exts = "|".join(
            re.escape(ext.lstrip(".")) for ext in self.IMAGE_EXTENSIONS[:7]
        )  # Common ones
        self.keyword_file_pattern = re.compile(
            rf"({keyword_pattern}).*?[\w\-\.\/]+\.(?:{common_exts}){ext_end}",
            re.IGNORECASE,
        )

        # Keyword tiers in priority order: (score, word regex, substrings)
        tiers = [
            (0.8, high_keywords),
            (0.6, self.CHINESE_KEYWORDS["medium"] + self.ENGLISH_KEYWORDS["medium"]),
            (0.6, self.TECHNICAL_KEYWORDS),
            (0.4, self.CHINESE_KEYWORDS["low"] + self.ENGLISH_KEYWORDS["low"]),
        ]
        self._keyword_tiers = []
        for score, keywords in tiers:
            pattern, substrings = self._build_keyword_matcher(keywords)
            self._keyword_tiers.append((score, pattern, substrings))

    def _find_file_paths(self, user_input: str) -> List[str]:
        """Return image file paths found outside of http(s) URLs."""
        masked = self._URL_PATTERN.sub(" ", user_input)
        return self.file_ext_pattern.findall(masked)

    def detect_visual_content(
        self, user_input: str
    ) -> "Tuple[DetectionConfidence, List[str]]":
        """
        Detect visual content in user input and return confidence level and detected items.

        Args:
            user_input: The user's input text

        Returns:
            Tuple of (confidence_level, detected_items). Keyword hits raise
            the confidence but add nothing to ``detected_items``, so the list
            can be empty even when the confidence is not NONE.
        """
        if not user_input:
            return DetectionConfidence.NONE, []

        detected_items: List[str] = []
        confidence_scores: List[float] = []

        # Check 1: File extensions
        file_matches = self._find_file_paths(user_input)
        if file_matches:
            detected_items.extend(file_matches)
            confidence_scores.append(0.9)  # High confidence

        # Check 2: Markdown image syntax
        markdown_matches = self.markdown_img_pattern.findall(user_input)
        if markdown_matches:
            detected_items.extend(f"{alt}:{url}" for alt, url in markdown_matches)
            confidence_scores.append(0.9)  # High confidence

        # Check 3: Base64 image data
        base64_matches = self.base64_img_pattern.findall(user_input)
        if base64_matches:
            detected_items.extend(f"base64:{fmt}" for fmt in base64_matches)
            confidence_scores.append(0.9)  # High confidence

        # Check 4: Visual keywords
        keyword_confidence = self._check_keywords(user_input)
        if keyword_confidence > 0:
            confidence_scores.append(keyword_confidence)

        # Check 5: URL images
        url_images = self._detect_url_images(user_input)
        if url_images:
            detected_items.extend(url_images)
            confidence_scores.append(0.8)  # Medium-high confidence

        # Determine overall confidence
        if not confidence_scores:
            return DetectionConfidence.NONE, []

        max_confidence = max(confidence_scores)
        if max_confidence >= 0.9:
            return DetectionConfidence.HIGH, detected_items
        if max_confidence >= 0.6:
            return DetectionConfidence.MEDIUM, detected_items
        return DetectionConfidence.LOW, detected_items

    def _check_keywords(self, user_input: str) -> float:
        """Check for visual keywords and return confidence score.

        Matching is case-insensitive. Tiers are tried in priority order and
        the first tier with a hit decides the score: 0.8 for high-priority
        keywords, 0.6 for medium-priority and technical keywords, 0.4 for
        low-priority keywords, 0.0 when nothing matches.
        """
        if not user_input:
            return 0.0

        input_lower = user_input.lower()
        for score, word_pattern, substrings in self._keyword_tiers:
            if word_pattern is not None and word_pattern.search(input_lower):
                return score
            if any(fragment in input_lower for fragment in substrings):
                return score
        return 0.0

    def _detect_url_images(self, user_input: str) -> List[str]:
        """Detect image URLs in the input.

        Each result is cut right after the image extension, so query strings
        and fragments are not included.
        """
        if not user_input:
            return []
        return self._URL_IMAGE_PATTERN.findall(user_input)

    def extract_image_paths(self, user_input: str) -> List[str]:
        """
        Extract actual image paths/URLs from user input.

        Returns:
            List of image paths or URLs, de-duplicated in order of first
            appearance.
        """
        if not user_input:
            return []

        # File paths with extensions
        image_paths = list(self._find_file_paths(user_input))

        # Markdown image targets
        for _alt, target in self.markdown_img_pattern.findall(user_input):
            # Drop an optional Markdown title: ![alt](path "title")
            parts = target.split(maxsplit=1)
            if parts:
                image_paths.append(parts[0])

        # Direct URLs
        image_paths.extend(self._detect_url_images(user_input))

        # Remove duplicates while preserving order
        return list(dict.fromkeys(image_paths))


def main(argv: Optional[List[str]] = None) -> None:
    """Command line interface for testing.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``. Leave as
            ``None`` for normal command line use.

    Prints either the extracted image paths, one per line (with
    ``--extract-paths``), or the detection confidence followed by the
    detected items.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Detect visual content in user input")
    parser.add_argument("input", help="User input to analyze")
    parser.add_argument(
        "--extract-paths",
        action="store_true",
        help="Extract and return image paths only",
    )

    args = parser.parse_args(argv)

    detector = VisionContentDetector()

    if args.extract_paths:
        for path in detector.extract_image_paths(args.input):
            print(path)
        return

    confidence, items = detector.detect_visual_content(args.input)
    print(f"Confidence: {confidence.value}")
    if items:
        print("Detected items:")
        for item in items:
            print(f"  - {item}")
    elif confidence is DetectionConfidence.NONE:
        print("No visual content detected")
    else:
        # A non-NONE confidence with an empty item list must not be reported
        # as "nothing detected", which would contradict the line above.
        print("No specific items (files, URLs, embedded images) identified")

# Run the command line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()