Files
hmo 04db423416 Initial commit: skills library
- 70 skills with code and documentation
- Add .gitignore (ignore __pycache__, output/, temp/, venv/)
- Clean up test intermediates and caches
2026-04-26 19:27:40 +08:00

337 lines
9.4 KiB
Python

#!/usr/bin/env python3
"""
Vision Content Detector - Detects visual content in user input for agent-vision-awareness skill
This script implements the detection logic described in the skill documentation,
but integrates with the actual working vision processing implementation
using direct API calls rather than custom agent delegation.
"""
import re
from pathlib import Path
from typing import List, Dict, Tuple, Optional
from enum import Enum
class DetectionConfidence(Enum):
    """Overall confidence that a piece of user input refers to visual content.

    Members are ordered from strongest to weakest signal; ``NONE`` means no
    signal was found at all. Each member's value is its lowercase name, which
    is the string printed by the command line interface.
    """

    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"
    NONE = "none"
class VisionContentDetector:
    """Detect references to visual content in free-form user input.

    Several independent signals are combined: image file names, Markdown
    image syntax, inline base64 image data, image URLs and visual keywords
    (Chinese, English and technical). Each signal contributes a numeric
    score; the highest score decides the reported confidence level.
    """

    # Image file extensions, each with its leading dot (matched
    # case-insensitively).
    IMAGE_EXTENSIONS = [
        ".png",
        ".jpg",
        ".jpeg",
        ".gif",
        ".bmp",
        ".webp",
        ".svg",
        ".ico",
        ".tiff",
        ".tif",
        ".heic",
        ".heif",
        ".raw",
        ".psd",
        ".ai",
        ".eps",
    ]

    # Document files with potential visual content. Not consulted by any
    # method of this class; kept for callers that read it.
    DOCUMENT_EXTENSIONS = [".pdf", ".ppt", ".pptx", ".vsdx", ".drawio"]

    # Chinese visual keywords, grouped by priority tier.
    CHINESE_KEYWORDS = {
        "high": [
            "图片",
            "图像",
            "照片",
            "截图",
            "图表",
            "图示",
            "图形",
            "影像",
            "画面",
        ],
        "medium": [
            "流程图",
            "架构图",
            "时序图",
            "ER 图",
            "思维导图",
            "柱状图",
            "饼图",
            "折线图",
            "设计图",
            "原型图",
            "线框图",
            "界面",
            "UI",
            "UX",
            "表格",
            "表单",
            "清单",
            "列表",
        ],
        "low": ["显示", "展示", "呈现", "可视化", "看图", "读图"],
    }

    # English visual keywords, grouped by priority tier.
    ENGLISH_KEYWORDS = {
        "high": [
            "image",
            "photo",
            "picture",
            "screenshot",
            "snapshot",
            "capture",
            "diagram",
            "chart",
            "graph",
            "plot",
            "figure",
        ],
        "medium": [
            "flowchart",
            "architecture",
            "sequence diagram",
            "ER diagram",
            "mind map",
            "bar chart",
            "pie chart",
            "line graph",
            "design",
            "mockup",
            "wireframe",
            "interface",
            "UI",
            "UX",
            "layout",
            "table",
            "form",
            "list",
            "grid",
        ],
        "low": ["show", "display", "visualize", "view", "look at", "see"],
    }

    # Technical visual keywords (scored like the medium tier).
    TECHNICAL_KEYWORDS = [
        "schema",
        "model",
        "blueprint",
        "spec",
        "technical drawing",
        "dashboard",
        "widget",
        "panel",
        "visualization",
        "map",
        "heatmap",
        "scatter plot",
        "histogram",
        "infographic",
        "poster",
        "banner",
        "thumbnail",
    ]

    # Image URLs. Compiled once at class level so that _detect_url_images
    # does not rebuild the pattern on every call.
    _URL_IMAGE_PATTERN = re.compile(
        r"https?://[^\s]+?\.(?:png|jpg|jpeg|gif|bmp|webp)", re.IGNORECASE
    )

    def __init__(self):
        """Initialize the detector with compiled regex patterns."""
        self._compile_patterns()

    def _compile_patterns(self):
        """Compile the regex patterns used by the detection methods.

        Sets ``file_ext_pattern``, ``markdown_img_pattern``,
        ``base64_img_pattern`` and ``keyword_file_pattern`` on the instance.
        """
        # IMAGE_EXTENSIONS entries already start with ".", while the patterns
        # below supply the literal dot themselves. Strip it here; otherwise
        # the regex demands two consecutive dots ("name..png") and never
        # matches an ordinary file name.
        ext_pattern = "|".join(
            re.escape(ext.lstrip(".")) for ext in self.IMAGE_EXTENSIONS
        )
        # The optional URL branch keeps "https://host/a.png" in one piece
        # instead of yielding the fragment "//host/a.png". The trailing
        # lookahead stops ".ai" from matching inside "report.air"; it is
        # ASCII-only so that CJK text directly after a file name still works.
        self.file_ext_pattern = re.compile(
            rf"(?:https?://\S+?|[\w\-./]+?)\.(?:{ext_pattern})(?![a-z0-9])",
            re.IGNORECASE,
        )

        # Markdown image syntax: ![alt](url)
        self.markdown_img_pattern = re.compile(r"!\[([^\]]*)\]\(([^\)]+)\)")

        # Inline base64 image data; the group captures the image format.
        self.base64_img_pattern = re.compile(
            r"data:image\/(png|jpeg|gif|webp);base64,[A-Za-z0-9+/=]+"
        )

        # High-priority keyword followed (anywhere later) by a file reference.
        keyword_pattern = "|".join(
            re.escape(k)
            for k in self.CHINESE_KEYWORDS["high"] + self.ENGLISH_KEYWORDS["high"]
        )
        # Only the first seven (most common) extensions are used here.
        ext_pattern_short = "|".join(
            re.escape(ext.lstrip(".")) for ext in self.IMAGE_EXTENSIONS[:7]
        )
        self.keyword_file_pattern = re.compile(
            rf"({keyword_pattern}).*?[\w\-./]+\.(?:{ext_pattern_short})(?![a-z0-9])",
            re.IGNORECASE,
        )

    def detect_visual_content(
        self, user_input: str
    ) -> "Tuple[DetectionConfidence, List[str]]":
        """Detect visual content and report a confidence level.

        Args:
            user_input: The user's input text.

        Returns:
            Tuple of ``(confidence_level, detected_items)``. The level is
            HIGH when the best signal scores at least 0.9, MEDIUM from 0.6,
            LOW below that, and NONE (with an empty list) when nothing
            matched. Keyword matches raise the confidence but add no items,
            so the list can be empty for a non-NONE level. Items are
            de-duplicated, keeping first-seen order.
        """
        detected_items: List[str] = []
        confidence_scores: List[float] = []

        # Check 1: file names / paths with an image extension.
        file_matches = self.file_ext_pattern.findall(user_input)
        if file_matches:
            detected_items.extend(file_matches)
            confidence_scores.append(0.9)

        # Check 2: Markdown image syntax.
        markdown_matches = self.markdown_img_pattern.findall(user_input)
        if markdown_matches:
            detected_items.extend(f"{alt}:{url}" for alt, url in markdown_matches)
            confidence_scores.append(0.9)

        # Check 3: inline base64 image data.
        base64_matches = self.base64_img_pattern.findall(user_input)
        if base64_matches:
            detected_items.extend(f"base64:{fmt}" for fmt in base64_matches)
            confidence_scores.append(0.9)

        # Check 4: visual keywords (score only, no item).
        keyword_confidence = self._check_keywords(user_input)
        if keyword_confidence > 0:
            confidence_scores.append(keyword_confidence)

        # Check 5: image URLs.
        url_images = self._detect_url_images(user_input)
        if url_images:
            detected_items.extend(url_images)
            confidence_scores.append(0.8)

        if not confidence_scores:
            return DetectionConfidence.NONE, []

        # A URL is found by both check 1 and check 5; report it once.
        unique_items = list(dict.fromkeys(detected_items))

        max_confidence = max(confidence_scores)
        if max_confidence >= 0.9:
            return DetectionConfidence.HIGH, unique_items
        if max_confidence >= 0.6:
            return DetectionConfidence.MEDIUM, unique_items
        return DetectionConfidence.LOW, unique_items

    @staticmethod
    def _keyword_in(keyword: str, text_lower: str) -> bool:
        """Return True if *keyword* occurs in the already lower-cased text.

        Keywords written in lowercase are matched as plain substrings.
        Keywords that contain uppercase letters ("UI", "UX", "ER diagram")
        are treated as acronyms: compared case-insensitively and only as
        whole tokens, so that "ui" inside "build" does not count.
        """
        needle = keyword.lower()
        if needle == keyword:
            return needle in text_lower
        # ASCII-only boundaries: \b would treat adjacent CJK characters as
        # word characters and miss input such as "这个UI设计".
        pattern = rf"(?<![a-z0-9]){re.escape(needle)}(?![a-z0-9])"
        return re.search(pattern, text_lower) is not None

    def _check_keywords(self, user_input: str) -> float:
        """Check for visual keywords and return a confidence score.

        Returns:
            0.8 for a high-priority keyword, 0.6 for a medium-priority or
            technical keyword, 0.4 for a low-priority keyword, otherwise 0.0.
            Tiers are tried in that order and the first hit wins.
        """
        input_lower = user_input.lower()
        tiers = (
            (self.CHINESE_KEYWORDS["high"] + self.ENGLISH_KEYWORDS["high"], 0.8),
            (self.CHINESE_KEYWORDS["medium"] + self.ENGLISH_KEYWORDS["medium"], 0.6),
            (self.TECHNICAL_KEYWORDS, 0.6),
            (self.CHINESE_KEYWORDS["low"] + self.ENGLISH_KEYWORDS["low"], 0.4),
        )
        for keywords, score in tiers:
            if any(self._keyword_in(keyword, input_lower) for keyword in keywords):
                return score
        return 0.0

    def _detect_url_images(self, user_input: str) -> List[str]:
        """Return every http(s) URL in the input that ends in an image extension."""
        return self._URL_IMAGE_PATTERN.findall(user_input)

    def extract_image_paths(self, user_input: str) -> List[str]:
        """Extract image paths/URLs from user input.

        Args:
            user_input: The user's input text.

        Returns:
            List of image paths or URLs, without duplicates, in the order
            found: file references first, then Markdown image targets, then
            plain image URLs.
        """
        image_paths: List[str] = []
        # File paths with extensions
        image_paths.extend(self.file_ext_pattern.findall(user_input))
        # Markdown image URLs
        image_paths.extend(
            url for _alt, url in self.markdown_img_pattern.findall(user_input)
        )
        # Direct URLs
        image_paths.extend(self._detect_url_images(user_input))
        # dict keys keep insertion order, so this de-duplicates stably.
        return list(dict.fromkeys(image_paths))
def main():
    """Command line interface for testing.

    Takes the text to analyze as a positional argument. With
    ``--extract-paths`` it prints one extracted image path per line;
    otherwise it prints the confidence level followed by the detected items.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Detect visual content in user input")
    parser.add_argument("input", help="User input to analyze")
    parser.add_argument(
        "--extract-paths",
        action="store_true",
        help="Extract and return image paths only",
    )
    args = parser.parse_args()

    detector = VisionContentDetector()

    if args.extract_paths:
        for path in detector.extract_image_paths(args.input):
            print(path)
        return

    confidence, items = detector.detect_visual_content(args.input)
    print(f"Confidence: {confidence.value}")
    if items:
        print("Detected items:")
        for item in items:
            print(f" - {item}")
    elif confidence is DetectionConfidence.NONE:
        print("No visual content detected")
    else:
        # Keyword-only hits raise the confidence without producing items, so
        # "no visual content" would contradict the line printed above.
        print("Visual keywords matched; no specific files or URLs detected")


if __name__ == "__main__":
    main()