#!/usr/bin/env python3
"""
Vision Content Detector - Detects visual content in user input for agent-vision-awareness skill

This script implements the detection logic described in the skill documentation,
but integrates with the actual working vision processing implementation using
direct API calls rather than custom agent delegation.
"""

import re
from pathlib import Path
from typing import List, Dict, Tuple, Optional
from enum import Enum


class DetectionConfidence(Enum):
    """Overall confidence that the user input refers to visual content."""

    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"
    NONE = "none"


class VisionContentDetector:
    """Detects visual content in user input based on various patterns."""

    # Image file extensions (case-insensitive). Entries include the leading dot.
    IMAGE_EXTENSIONS = [
        ".png",
        ".jpg",
        ".jpeg",
        ".gif",
        ".bmp",
        ".webp",
        ".svg",
        ".ico",
        ".tiff",
        ".tif",
        ".heic",
        ".heif",
        ".raw",
        ".psd",
        ".ai",
        ".eps",
    ]

    # Document files with potential visual content
    DOCUMENT_EXTENSIONS = [".pdf", ".ppt", ".pptx", ".vsdx", ".drawio"]

    # Chinese visual keywords
    CHINESE_KEYWORDS = {
        "high": [
            "图片",
            "图像",
            "照片",
            "截图",
            "图表",
            "图示",
            "图形",
            "影像",
            "画面",
        ],
        "medium": [
            "流程图",
            "架构图",
            "时序图",
            "ER 图",
            "思维导图",
            "柱状图",
            "饼图",
            "折线图",
            "设计图",
            "原型图",
            "线框图",
            "界面",
            "UI",
            "UX",
            "表格",
            "表单",
            "清单",
            "列表",
        ],
        "low": ["显示", "展示", "呈现", "可视化", "看图", "读图"],
    }

    # English visual keywords
    ENGLISH_KEYWORDS = {
        "high": [
            "image",
            "photo",
            "picture",
            "screenshot",
            "snapshot",
            "capture",
            "diagram",
            "chart",
            "graph",
            "plot",
            "figure",
        ],
        "medium": [
            "flowchart",
            "architecture",
            "sequence diagram",
            "ER diagram",
            "mind map",
            "bar chart",
            "pie chart",
            "line graph",
            "design",
            "mockup",
            "wireframe",
            "interface",
            "UI",
            "UX",
            "layout",
            "table",
            "form",
            "list",
            "grid",
        ],
        "low": ["show", "display", "visualize", "view", "look at", "see"],
    }

    # Technical visual keywords
    TECHNICAL_KEYWORDS = [
        "schema",
        "model",
        "blueprint",
        "spec",
        "technical drawing",
        "dashboard",
        "widget",
        "panel",
        "visualization",
        "map",
        "heatmap",
        "scatter plot",
        "histogram",
        "infographic",
        "poster",
        "banner",
        "thumbnail",
    ]

    def __init__(self) -> None:
        """Initialize the detector with compiled regex patterns."""
        self._compile_patterns()

    def _compile_patterns(self) -> None:
        """Compile the regex patterns once so every detection call reuses them."""
        # IMAGE_EXTENSIONS entries already start with ".", and the patterns
        # below supply their own `\.`; strip the dot so "photo.png" matches
        # (keeping it would require two consecutive dots, e.g. "photo..png").
        ext_pattern = "|".join(
            re.escape(ext.lstrip(".")) for ext in self.IMAGE_EXTENSIONS
        )
        # `(?!\w)` stops short extensions from matching inside longer words
        # (".ai" in "www.airbnb.com", ".png" in "file.pngx").
        self.file_ext_pattern = re.compile(
            rf"[\w\-\.\/]+?\.(?:{ext_pattern})(?!\w)", re.IGNORECASE
        )

        # Markdown image syntax: ![alt](url)
        self.markdown_img_pattern = re.compile(r"!\[([^\]]*)\]\(([^\)]+)\)")

        # Base64 image data
        self.base64_img_pattern = re.compile(
            r"data:image\/(png|jpeg|gif|webp);base64,[A-Za-z0-9+/=]+"
        )

        # Direct http(s) links that end in a common image extension
        self.url_img_pattern = re.compile(
            r"https?://[^\s]+?\.(?:png|jpg|jpeg|gif|bmp|webp)", re.IGNORECASE
        )

        # Keyword + file reference (high-priority keyword followed by a file)
        keyword_pattern = "|".join(
            re.escape(k)
            for k in self.CHINESE_KEYWORDS["high"] + self.ENGLISH_KEYWORDS["high"]
        )
        ext_pattern_short = "|".join(
            re.escape(ext.lstrip(".")) for ext in self.IMAGE_EXTENSIONS[:7]
        )  # Common ones
        self.keyword_file_pattern = re.compile(
            rf"({keyword_pattern}).*?[\w\-\.\/]+\.(?:{ext_pattern_short})(?!\w)",
            re.IGNORECASE,
        )

    def _find_file_references(self, user_input: str) -> List[str]:
        """Return file-like image references that are not part of an image URL.

        The file pattern cannot consume the ":" of a URL scheme, so on
        "https://host/a.png" it would report the fragment "//host/a.png".
        Such fragments are dropped here; the URL itself is reported by
        ``_detect_url_images``.
        """
        url_spans = [m.span() for m in self.url_img_pattern.finditer(user_input)]
        return [
            m.group(0)
            for m in self.file_ext_pattern.finditer(user_input)
            if not any(
                start <= m.start() and m.end() <= end for start, end in url_spans
            )
        ]

    def detect_visual_content(
        self, user_input: str
    ) -> Tuple[DetectionConfidence, List[str]]:
        """
        Detect visual content in user input and return confidence level and detected items.

        Each check that fires contributes a score; the highest score decides
        the returned level (>= 0.9 HIGH, >= 0.6 MEDIUM, otherwise LOW).

        Args:
            user_input: The user's input text

        Returns:
            Tuple of (confidence_level, detected_items). Keyword hits raise
            the confidence but add nothing to detected_items.
        """
        detected_items = []
        confidence_scores = []

        # Check 1: File extensions
        file_matches = self._find_file_references(user_input)
        if file_matches:
            detected_items.extend(file_matches)
            confidence_scores.append(0.9)  # High confidence

        # Check 2: Markdown image syntax
        markdown_matches = self.markdown_img_pattern.findall(user_input)
        if markdown_matches:
            detected_items.extend(f"{alt}:{url}" for alt, url in markdown_matches)
            confidence_scores.append(0.9)  # High confidence

        # Check 3: Base64 image data
        base64_matches = self.base64_img_pattern.findall(user_input)
        if base64_matches:
            detected_items.extend(f"base64:{fmt}" for fmt in base64_matches)
            confidence_scores.append(0.9)  # High confidence

        # Check 4: Visual keywords
        keyword_confidence = self._check_keywords(user_input)
        if keyword_confidence > 0:
            confidence_scores.append(keyword_confidence)

        # Check 5: URL images
        url_images = self._detect_url_images(user_input)
        if url_images:
            detected_items.extend(url_images)
            confidence_scores.append(0.8)  # Medium-high confidence

        # Determine overall confidence
        if not confidence_scores:
            return DetectionConfidence.NONE, []

        max_confidence = max(confidence_scores)
        if max_confidence >= 0.9:
            return DetectionConfidence.HIGH, detected_items
        elif max_confidence >= 0.6:
            return DetectionConfidence.MEDIUM, detected_items
        else:
            return DetectionConfidence.LOW, detected_items

    @staticmethod
    def _keyword_matches(keyword: str, input_lower: str) -> bool:
        """Return True if *keyword* occurs in the already-lowercased input.

        All-lowercase keywords (and Chinese ones) are plain substring matches.
        Keywords containing uppercase letters ("UI", "UX", "ER diagram",
        "ER 图") are lowercased and matched as whole ASCII tokens, so that
        "ui" does not fire inside words such as "build" or "quick".
        """
        needle = keyword.lower()
        if needle == keyword:
            return needle in input_lower
        # ASCII-only lookarounds: adjacent CJK characters still count as a
        # boundary (e.g. "看UI设计").
        pattern = rf"(?<![a-z0-9]){re.escape(needle)}(?![a-z0-9])"
        return re.search(pattern, input_lower) is not None

    def _check_keywords(self, user_input: str) -> float:
        """Check for visual keywords and return confidence score.

        Tiers are tried from strongest to weakest and the first tier with a
        hit decides the score: high 0.8, medium 0.6, technical 0.6, low 0.4.
        Returns 0.0 when no keyword is present.
        """
        input_lower = user_input.lower()
        tiers = (
            (0.8, self.CHINESE_KEYWORDS["high"] + self.ENGLISH_KEYWORDS["high"]),
            (0.6, self.CHINESE_KEYWORDS["medium"] + self.ENGLISH_KEYWORDS["medium"]),
            (0.6, self.TECHNICAL_KEYWORDS),
            (0.4, self.CHINESE_KEYWORDS["low"] + self.ENGLISH_KEYWORDS["low"]),
        )
        for score, keywords in tiers:
            if any(self._keyword_matches(k, input_lower) for k in keywords):
                return score
        return 0.0

    def _detect_url_images(self, user_input: str) -> List[str]:
        """Detect image URLs in the input."""
        return self.url_img_pattern.findall(user_input)

    def extract_image_paths(self, user_input: str) -> List[str]:
        """
        Extract actual image paths/URLs from user input.

        Collects file-like references, Markdown image targets and direct
        image URLs, in that order.

        Returns:
            List of image paths or URLs, de-duplicated with first-seen order kept
        """
        image_paths = []

        # File paths with extensions
        image_paths.extend(self._find_file_references(user_input))

        # Markdown image URLs
        markdown_matches = self.markdown_img_pattern.findall(user_input)
        image_paths.extend(url for _alt, url in markdown_matches)

        # Direct URLs
        image_paths.extend(self._detect_url_images(user_input))

        # Remove duplicates while preserving order (dicts keep insertion order)
        return list(dict.fromkeys(image_paths))


def main() -> None:
    """Command line interface for testing."""
    import argparse

    parser = argparse.ArgumentParser(description="Detect visual content in user input")
    parser.add_argument("input", help="User input to analyze")
    parser.add_argument(
        "--extract-paths",
        action="store_true",
        help="Extract and return image paths only",
    )

    args = parser.parse_args()

    detector = VisionContentDetector()

    if args.extract_paths:
        paths = detector.extract_image_paths(args.input)
        for path in paths:
            print(path)
    else:
        confidence, items = detector.detect_visual_content(args.input)
        print(f"Confidence: {confidence.value}")
        if items:
            print("Detected items:")
            for item in items:
                print(f"  - {item}")
        else:
            print("No visual content detected")


if __name__ == "__main__":
    main()