Initial commit: skills library
- 70 skills with code and documentation - Add .gitignore (ignore __pycache__, output/, temp/, venv/) - Clean up test intermediates and caches
This commit is contained in:
@@ -0,0 +1,336 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Vision Content Detector - Detects visual content in user input for agent-vision-awareness skill
|
||||
|
||||
This script implements the detection logic described in the skill documentation,
|
||||
but integrates with the actual working vision processing implementation
|
||||
using direct API calls rather than custom agent delegation.
|
||||
"""
|
||||
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Tuple, Optional
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class DetectionConfidence(Enum):
    """Overall confidence that a piece of user input refers to visual content."""

    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"
    NONE = "none"


class VisionContentDetector:
    """Detects visual content in user input based on various patterns.

    Detection combines five independent checks: image file references,
    Markdown image syntax, inline base64 image data, visual keywords
    (Chinese, English and technical) and image URLs. Each check that fires
    contributes a numeric score; the highest score decides the
    :class:`DetectionConfidence` returned by :meth:`detect_visual_content`.
    """

    # Image file extensions (matched case-insensitively). Each entry carries
    # its leading dot; the dot is stripped when the regexes are built.
    IMAGE_EXTENSIONS = [
        ".png",
        ".jpg",
        ".jpeg",
        ".gif",
        ".bmp",
        ".webp",
        ".svg",
        ".ico",
        ".tiff",
        ".tif",
        ".heic",
        ".heif",
        ".raw",
        ".psd",
        ".ai",
        ".eps",
    ]

    # Document files with potential visual content
    DOCUMENT_EXTENSIONS = [".pdf", ".ppt", ".pptx", ".vsdx", ".drawio"]

    # Chinese visual keywords
    CHINESE_KEYWORDS = {
        "high": [
            "图片",
            "图像",
            "照片",
            "截图",
            "图表",
            "图示",
            "图形",
            "影像",
            "画面",
        ],
        "medium": [
            "流程图",
            "架构图",
            "时序图",
            "ER 图",
            "思维导图",
            "柱状图",
            "饼图",
            "折线图",
            "设计图",
            "原型图",
            "线框图",
            "界面",
            "UI",
            "UX",
            "表格",
            "表单",
            "清单",
            "列表",
        ],
        "low": ["显示", "展示", "呈现", "可视化", "看图", "读图"],
    }

    # English visual keywords
    ENGLISH_KEYWORDS = {
        "high": [
            "image",
            "photo",
            "picture",
            "screenshot",
            "snapshot",
            "capture",
            "diagram",
            "chart",
            "graph",
            "plot",
            "figure",
        ],
        "medium": [
            "flowchart",
            "architecture",
            "sequence diagram",
            "ER diagram",
            "mind map",
            "bar chart",
            "pie chart",
            "line graph",
            "design",
            "mockup",
            "wireframe",
            "interface",
            "UI",
            "UX",
            "layout",
            "table",
            "form",
            "list",
            "grid",
        ],
        "low": ["show", "display", "visualize", "view", "look at", "see"],
    }

    # Technical visual keywords
    TECHNICAL_KEYWORDS = [
        "schema",
        "model",
        "blueprint",
        "spec",
        "technical drawing",
        "dashboard",
        "widget",
        "panel",
        "visualization",
        "map",
        "heatmap",
        "scatter plot",
        "histogram",
        "infographic",
        "poster",
        "banner",
        "thumbnail",
    ]

    # An extension must not be followed by another ASCII word character,
    # otherwise "foo.aiml" would be reported as "foo.ai". The class is
    # ASCII-only on purpose: CJK text often follows a file name with no
    # separating space ("photo.png的内容") and must not block the match.
    _EXT_BOUNDARY = r"(?![A-Za-z0-9_])"

    # Image URLs. Compiled once at class creation instead of on every call.
    _URL_IMAGE_PATTERN = re.compile(
        r"https?://[^\s]+?\.(?:png|jpg|jpeg|gif|bmp|webp)" + _EXT_BOUNDARY,
        re.IGNORECASE,
    )

    def __init__(self):
        """Initialize the detector with compiled regex patterns."""
        self._compile_patterns()

    def _compile_patterns(self):
        """Compile the instance-level regex patterns once for reuse."""
        # File extension pattern. IMAGE_EXTENSIONS entries already start with
        # ".", so the dot is stripped here; the pattern supplies the single
        # literal dot itself. (Keeping both required "name..png" to match.)
        ext_pattern = "|".join(
            re.escape(ext.lstrip(".")) for ext in self.IMAGE_EXTENSIONS
        )
        self.file_ext_pattern = re.compile(
            rf"[\w\-./]+?\.(?:{ext_pattern}){self._EXT_BOUNDARY}", re.IGNORECASE
        )

        # Markdown image syntax: ![alt](url)
        self.markdown_img_pattern = re.compile(r"!\[([^\]]*)\]\(([^\)]+)\)")

        # Base64 image data; the capture group is the image format
        self.base64_img_pattern = re.compile(
            r"data:image\/(png|jpeg|gif|webp);base64,[A-Za-z0-9+/=]+"
        )

        # Keyword + file reference (high-priority keyword followed, on the
        # same line, by a file with one of the most common extensions)
        keyword_pattern = "|".join(
            re.escape(k)
            for k in self.CHINESE_KEYWORDS["high"] + self.ENGLISH_KEYWORDS["high"]
        )
        ext_pattern_short = "|".join(
            re.escape(ext.lstrip(".")) for ext in self.IMAGE_EXTENSIONS[:7]
        )  # Common ones
        self.keyword_file_pattern = re.compile(
            rf"({keyword_pattern}).*?[\w\-./]+\.(?:{ext_pattern_short})"
            rf"{self._EXT_BOUNDARY}",
            re.IGNORECASE,
        )

    @staticmethod
    def _contains_keyword(text_lower: str, keyword: str) -> bool:
        """Return True if *keyword* occurs in the already-lowercased text.

        Matching is case-insensitive. All-caps ASCII keywords ("UI", "UX")
        are treated as acronyms and must appear as a whole word, because a
        plain substring test would fire on words such as "build" or "quick".
        Every other keyword keeps substring matching.
        """
        needle = keyword.lower()
        if keyword.isascii() and keyword.isupper():
            acronym = rf"(?<![a-z0-9]){re.escape(needle)}(?![a-z0-9])"
            return re.search(acronym, text_lower) is not None
        return needle in text_lower

    def _find_file_references(self, user_input: str) -> List[str]:
        """Return image file references, skipping fragments of image URLs.

        The file pattern cannot match across "://", so for an image URL it
        would otherwise report the tail ("//host/a.png") as a separate path
        next to the full URL found by the URL check.
        """
        url_spans = [
            m.span() for m in self._URL_IMAGE_PATTERN.finditer(user_input)
        ]
        references = []
        for match in self.file_ext_pattern.finditer(user_input):
            inside_url = any(
                start <= match.start() and match.end() <= end
                for start, end in url_spans
            )
            if not inside_url:
                references.append(match.group(0))
        return references

    def detect_visual_content(
        self, user_input: str
    ) -> Tuple[DetectionConfidence, List[str]]:
        """
        Detect visual content in user input and return confidence level and detected items.

        Args:
            user_input: The user's input text. Empty or ``None`` input yields
                ``(DetectionConfidence.NONE, [])``.

        Returns:
            Tuple of (confidence_level, detected_items). The confidence is
            HIGH when the best score is >= 0.9, MEDIUM when >= 0.6 and LOW
            otherwise. ``detected_items`` may be empty when only keywords
            matched, since keywords contribute a score but no item.
        """
        if not user_input:
            return DetectionConfidence.NONE, []

        detected_items = []
        confidence_scores = []

        # Check 1: File extensions
        file_matches = self._find_file_references(user_input)
        if file_matches:
            detected_items.extend(file_matches)
            confidence_scores.append(0.9)  # High confidence

        # Check 2: Markdown image syntax
        markdown_matches = self.markdown_img_pattern.findall(user_input)
        if markdown_matches:
            detected_items.extend(f"{alt}:{url}" for alt, url in markdown_matches)
            confidence_scores.append(0.9)  # High confidence

        # Check 3: Base64 image data
        base64_matches = self.base64_img_pattern.findall(user_input)
        if base64_matches:
            detected_items.extend(f"base64:{fmt}" for fmt in base64_matches)
            confidence_scores.append(0.9)  # High confidence

        # Check 4: Visual keywords
        keyword_confidence = self._check_keywords(user_input)
        if keyword_confidence > 0:
            confidence_scores.append(keyword_confidence)

        # Check 5: URL images
        url_images = self._detect_url_images(user_input)
        if url_images:
            detected_items.extend(url_images)
            confidence_scores.append(0.8)  # Medium-high confidence

        # Determine overall confidence
        if not confidence_scores:
            return DetectionConfidence.NONE, []

        max_confidence = max(confidence_scores)
        if max_confidence >= 0.9:
            return DetectionConfidence.HIGH, detected_items
        if max_confidence >= 0.6:
            return DetectionConfidence.MEDIUM, detected_items
        return DetectionConfidence.LOW, detected_items

    def _check_keywords(self, user_input: str) -> float:
        """Check for visual keywords and return a confidence score.

        Returns 0.8 for a high-priority keyword, 0.6 for a medium-priority or
        technical keyword, 0.4 for a low-priority keyword and 0.0 when no
        keyword is present. Tiers are tried from strongest to weakest.
        """
        input_lower = (user_input or "").lower()

        tiers = (
            (0.8, self.CHINESE_KEYWORDS["high"] + self.ENGLISH_KEYWORDS["high"]),
            (0.6, self.CHINESE_KEYWORDS["medium"] + self.ENGLISH_KEYWORDS["medium"]),
            (0.6, self.TECHNICAL_KEYWORDS),
            (0.4, self.CHINESE_KEYWORDS["low"] + self.ENGLISH_KEYWORDS["low"]),
        )
        for score, keywords in tiers:
            if any(self._contains_keyword(input_lower, kw) for kw in keywords):
                return score

        return 0.0

    def _detect_url_images(self, user_input: str) -> List[str]:
        """Detect http(s) image URLs in the input."""
        return self._URL_IMAGE_PATTERN.findall(user_input or "")

    def extract_image_paths(self, user_input: str) -> List[str]:
        """
        Extract actual image paths/URLs from user input.

        Args:
            user_input: The user's input text. Empty or ``None`` input yields
                an empty list.

        Returns:
            List of image paths or URLs, de-duplicated, in order of first
            appearance per source: file references, then Markdown image
            targets, then direct image URLs.
        """
        if not user_input:
            return []

        image_paths = []

        # File paths with extensions
        image_paths.extend(self._find_file_references(user_input))

        # Markdown image URLs
        image_paths.extend(
            url for _alt, url in self.markdown_img_pattern.findall(user_input)
        )

        # Direct URLs
        image_paths.extend(self._detect_url_images(user_input))

        # Remove duplicates while preserving order (dicts keep insertion order)
        return list(dict.fromkeys(image_paths))
|
||||
|
||||
|
||||
def main():
    """Command line interface for testing.

    Takes the text to analyze as a positional argument. With
    ``--extract-paths`` it prints one extracted image path/URL per line;
    otherwise it prints the detection confidence followed by the detected
    items, if any.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Detect visual content in user input")
    parser.add_argument("input", help="User input to analyze")
    parser.add_argument(
        "--extract-paths",
        action="store_true",
        help="Extract and return image paths only",
    )

    args = parser.parse_args()

    detector = VisionContentDetector()

    if args.extract_paths:
        for path in detector.extract_image_paths(args.input):
            print(path)
        return

    confidence, items = detector.detect_visual_content(args.input)
    print(f"Confidence: {confidence.value}")
    if items:
        print("Detected items:")
        for item in items:
            print(f" - {item}")
    elif confidence is DetectionConfidence.NONE:
        print("No visual content detected")
    else:
        # Keywords raise the confidence without contributing an item, so an
        # empty item list does not mean nothing was detected.
        print("No specific items detected (keyword match only)")
|
||||
|
||||
|
||||
# Run the command line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user