04db423416
- 70 skills with code and documentation - Add .gitignore (ignore __pycache__, output/, temp/, venv/) - Clean up test intermediates and caches
337 lines
9.4 KiB
Python
337 lines
9.4 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Vision Content Detector - Detects visual content in user input for agent-vision-awareness skill
|
|
|
|
This script implements the detection logic described in the skill documentation,
|
|
but integrates with the actual working vision processing implementation
|
|
using direct API calls rather than custom agent delegation.
|
|
"""
|
|
|
|
import re
|
|
from pathlib import Path
|
|
from typing import List, Dict, Tuple, Optional
|
|
from enum import Enum
|
|
|
|
|
|
class DetectionConfidence(Enum):
    """Confidence tier reported by ``VisionContentDetector.detect_visual_content``.

    Every member's value is the lowercase spelling of its name, which is what
    the command line interface prints.
    """

    # Strongest signal found scored at least 0.9.
    HIGH = "high"

    # Strongest signal found scored at least 0.6 but below 0.9.
    MEDIUM = "medium"

    # Some signal was found, but its score was below 0.6.
    LOW = "low"

    # No check produced a signal at all.
    NONE = "none"
|
|
|
|
|
|
class VisionContentDetector:
    """Detects visual content in user input based on various patterns.

    Five independent checks are run against the text: image file names,
    Markdown image syntax, base64 ``data:image/...`` URIs, visual keywords
    (Chinese, English and technical vocabulary) and ``http(s)`` image URLs.
    ``detect_visual_content`` folds the results into a confidence tier, while
    ``extract_image_paths`` returns only the paths/URLs that were found.
    """

    # Image file extensions (case-insensitive). Stored WITH the leading dot;
    # the dot is stripped again when the regex alternation is built.
    IMAGE_EXTENSIONS = [
        ".png",
        ".jpg",
        ".jpeg",
        ".gif",
        ".bmp",
        ".webp",
        ".svg",
        ".ico",
        ".tiff",
        ".tif",
        ".heic",
        ".heif",
        ".raw",
        ".psd",
        ".ai",
        ".eps",
    ]

    # Document files with potential visual content
    # (not consulted by any method of this class).
    DOCUMENT_EXTENSIONS = [".pdf", ".ppt", ".pptx", ".vsdx", ".drawio"]

    # Chinese visual keywords
    CHINESE_KEYWORDS = {
        "high": [
            "图片",
            "图像",
            "照片",
            "截图",
            "图表",
            "图示",
            "图形",
            "影像",
            "画面",
        ],
        "medium": [
            "流程图",
            "架构图",
            "时序图",
            "ER 图",
            "思维导图",
            "柱状图",
            "饼图",
            "折线图",
            "设计图",
            "原型图",
            "线框图",
            "界面",
            "UI",
            "UX",
            "表格",
            "表单",
            "清单",
            "列表",
        ],
        "low": ["显示", "展示", "呈现", "可视化", "看图", "读图"],
    }

    # English visual keywords
    ENGLISH_KEYWORDS = {
        "high": [
            "image",
            "photo",
            "picture",
            "screenshot",
            "snapshot",
            "capture",
            "diagram",
            "chart",
            "graph",
            "plot",
            "figure",
        ],
        "medium": [
            "flowchart",
            "architecture",
            "sequence diagram",
            "ER diagram",
            "mind map",
            "bar chart",
            "pie chart",
            "line graph",
            "design",
            "mockup",
            "wireframe",
            "interface",
            "UI",
            "UX",
            "layout",
            "table",
            "form",
            "list",
            "grid",
        ],
        "low": ["show", "display", "visualize", "view", "look at", "see"],
    }

    # Technical visual keywords
    TECHNICAL_KEYWORDS = [
        "schema",
        "model",
        "blueprint",
        "spec",
        "technical drawing",
        "dashboard",
        "widget",
        "panel",
        "visualization",
        "map",
        "heatmap",
        "scatter plot",
        "histogram",
        "infographic",
        "poster",
        "banner",
        "thumbnail",
    ]

    # Lookahead appended after an extension so that ".ai" does not match the
    # start of ".airplane". ASCII-only on purpose: CJK text commonly follows a
    # file name without a space and must not block the match.
    _EXT_END = r"(?![A-Za-z0-9_])"

    # http(s) URL ending in a common raster image extension. Compiled once at
    # class creation instead of on every call.
    _URL_IMAGE_PATTERN = re.compile(
        r"https?://[^\s]+?\.(?:png|jpg|jpeg|gif|bmp|webp)" + _EXT_END,
        re.IGNORECASE,
    )

    def __init__(self):
        """Initialize the detector with compiled regex patterns."""
        self._compile_patterns()

    @staticmethod
    def _extension_alternation(extensions):
        """Return a regex alternation of *extensions* without their leading dots."""
        return "|".join(re.escape(ext.lstrip(".")) for ext in extensions)

    def _compile_patterns(self):
        """Compile the instance-level regex patterns once, for performance."""
        # File extension pattern. The literal "\." below supplies the dot, so
        # the alternation must NOT contain it again (otherwise only names such
        # as "photo..png" would match).
        ext_pattern = self._extension_alternation(self.IMAGE_EXTENSIONS)
        self.file_ext_pattern = re.compile(
            rf"[\w\-\.\/]+?\.(?:{ext_pattern}){self._EXT_END}", re.IGNORECASE
        )

        # Markdown image syntax: group 1 is the alt text, group 2 the target.
        self.markdown_img_pattern = re.compile(r"!\[([^\]]*)\]\(([^\)]+)\)")

        # Base64 image data; group 1 is the MIME subtype (png, jpeg, svg+xml, ...).
        self.base64_img_pattern = re.compile(
            r"data:image\/([A-Za-z0-9.+-]+);base64,[A-Za-z0-9+/=]+"
        )

        # Keyword + file reference: a high-priority keyword followed later by
        # a file name with one of the first seven (most common) extensions.
        # No method of this class uses it; it is kept as a public attribute.
        keyword_pattern = "|".join(
            re.escape(k)
            for k in self.CHINESE_KEYWORDS["high"] + self.ENGLISH_KEYWORDS["high"]
        )
        ext_pattern_short = self._extension_alternation(self.IMAGE_EXTENSIONS[:7])
        self.keyword_file_pattern = re.compile(
            rf"({keyword_pattern}).*?[\w\-\.\/]+\.(?:{ext_pattern_short}){self._EXT_END}",
            re.IGNORECASE,
        )

    def _find_file_paths(self, user_input: str) -> List[str]:
        """Return image file names/paths in *user_input*, excluding image URLs.

        Image URLs are blanked out first; otherwise the path part of
        ``https://host/a.png`` would be reported again as ``//host/a.png``.
        """
        without_urls = self._URL_IMAGE_PATTERN.sub(" ", user_input)
        return self.file_ext_pattern.findall(without_urls)

    def detect_visual_content(
        self, user_input: str
    ) -> "Tuple[DetectionConfidence, List[str]]":
        """
        Detect visual content in user input and return confidence level and detected items.

        Args:
            user_input: The user's input text

        Returns:
            Tuple of (confidence_level, detected_items). The level is derived
            from the highest score of any check: >= 0.9 is HIGH, >= 0.6 is
            MEDIUM, anything lower is LOW, and NONE when no check fired.
            Keyword hits contribute a score but add no entry to
            ``detected_items``, so the list may be empty for a non-NONE level.
        """
        detected_items = []
        confidence_scores = []

        # Check 1: File extensions
        file_matches = self._find_file_paths(user_input)
        if file_matches:
            detected_items.extend(file_matches)
            confidence_scores.append(0.9)  # High confidence

        # Check 2: Markdown image syntax
        markdown_matches = self.markdown_img_pattern.findall(user_input)
        if markdown_matches:
            detected_items.extend(f"{alt}:{url}" for alt, url in markdown_matches)
            confidence_scores.append(0.9)  # High confidence

        # Check 3: Base64 image data
        base64_matches = self.base64_img_pattern.findall(user_input)
        if base64_matches:
            detected_items.extend(f"base64:{fmt}" for fmt in base64_matches)
            confidence_scores.append(0.9)  # High confidence

        # Check 4: Visual keywords
        keyword_confidence = self._check_keywords(user_input)
        if keyword_confidence > 0:
            confidence_scores.append(keyword_confidence)

        # Check 5: URL images
        url_images = self._detect_url_images(user_input)
        if url_images:
            detected_items.extend(url_images)
            confidence_scores.append(0.8)  # Medium-high confidence

        # Determine overall confidence
        if not confidence_scores:
            return DetectionConfidence.NONE, []

        max_confidence = max(confidence_scores)
        if max_confidence >= 0.9:
            return DetectionConfidence.HIGH, detected_items
        if max_confidence >= 0.6:
            return DetectionConfidence.MEDIUM, detected_items
        return DetectionConfidence.LOW, detected_items

    @staticmethod
    def _keyword_in(keyword: str, text_lower: str) -> bool:
        """Return True when *keyword* occurs in the already-lowercased text.

        All-lowercase keywords keep plain substring matching. Keywords that
        contain capitals ("UI", "UX", "ER diagram", "ER 图") are compared
        case-insensitively and must stand alone as a token, so that "UI" does
        not fire on words like "build" or "quick".
        """
        needle = keyword.lower()
        if needle == keyword:
            return needle in text_lower
        token = rf"(?<![a-z0-9]){re.escape(needle)}(?![a-z0-9])"
        return re.search(token, text_lower) is not None

    def _check_keywords(self, user_input: str) -> float:
        """Check for visual keywords and return a confidence score.

        Tiers are tried from strongest to weakest and the first tier with a
        hit decides the score: high 0.8, medium 0.6, technical 0.6, low 0.4.
        Returns 0.0 when nothing matches.
        """
        input_lower = user_input.lower()

        tiers = (
            (self.CHINESE_KEYWORDS["high"] + self.ENGLISH_KEYWORDS["high"], 0.8),
            (self.CHINESE_KEYWORDS["medium"] + self.ENGLISH_KEYWORDS["medium"], 0.6),
            (self.TECHNICAL_KEYWORDS, 0.6),
            (self.CHINESE_KEYWORDS["low"] + self.ENGLISH_KEYWORDS["low"], 0.4),
        )
        for keywords, score in tiers:
            if any(self._keyword_in(keyword, input_lower) for keyword in keywords):
                return score

        return 0.0

    def _detect_url_images(self, user_input: str) -> List[str]:
        """Detect image URLs in the input.

        Returns each ``http``/``https`` URL up to and including its image
        extension; a trailing query string is not part of the result.
        """
        return self._URL_IMAGE_PATTERN.findall(user_input)

    def extract_image_paths(self, user_input: str) -> List[str]:
        """
        Extract actual image paths/URLs from user input.

        Returns:
            List of image paths or URLs, in order of discovery (file paths,
            then Markdown image targets, then direct URLs), without duplicates.
        """
        image_paths = []

        # File paths with extensions
        image_paths.extend(self._find_file_paths(user_input))

        # Markdown image URLs
        image_paths.extend(
            url for _alt, url in self.markdown_img_pattern.findall(user_input)
        )

        # Direct URLs
        image_paths.extend(self._detect_url_images(user_input))

        # Remove duplicates while preserving order (dicts keep insertion order).
        return list(dict.fromkeys(image_paths))
|
|
|
|
|
|
def main():
    """Run the detector from the command line (intended for manual testing).

    Takes one positional argument, the text to analyze. With
    ``--extract-paths`` only the extracted image paths are printed, one per
    line; otherwise the confidence level and any detected items are printed.
    """
    import argparse
    import sys

    cli = argparse.ArgumentParser(description="Detect visual content in user input")
    cli.add_argument("input", help="User input to analyze")
    cli.add_argument(
        "--extract-paths",
        action="store_true",
        help="Extract and return image paths only",
    )
    options = cli.parse_args()

    detector = VisionContentDetector()

    # Path-only mode: print the paths and stop.
    if options.extract_paths:
        for image_path in detector.extract_image_paths(options.input):
            print(image_path)
        return

    level, found = detector.detect_visual_content(options.input)
    print(f"Confidence: {level.value}")

    if not found:
        print("No visual content detected")
        return

    print("Detected items:")
    for entry in found:
        print(f" - {entry}")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|