Initial commit: skills library

- 70 skills with code and documentation
- Add .gitignore (ignore __pycache__, output/, temp/, venv/)
- Clean up test intermediates and caches
This commit is contained in:
hmo
2026-04-26 19:27:40 +08:00
commit 04db423416
861 changed files with 210414 additions and 0 deletions
@@ -0,0 +1,4 @@
#!/usr/bin/env python3
"""Example script - delete if not needed."""
# Placeholder body: running the script just prints a fixed greeting.
print("Hello from skill!")
@@ -0,0 +1,167 @@
#!/usr/bin/env python3
"""
Integration script for agent-vision-awareness skill
This script demonstrates the complete workflow:
1. Detect visual content in user input
2. Extract image paths
3. Analyze images using the vision analyzer
4. Return structured results
This replaces the problematic custom agent delegation approach.
"""
import sys
import os
from pathlib import Path
from typing import Dict, Any, List
def _select_analysis_mode(combined_text: str) -> str:
    """Return the analysis mode whose keyword first occurs in *combined_text*.

    The table is checked in priority order and matching is a plain substring
    test on already lower-cased text; "describe" is the default.
    """
    mode_keywords = (
        ("ocr", ("text", "文字", "ocr", "read", "识别")),
        ("chart", ("chart", "graph", "plot", "图表", "数据", "趋势")),
        ("fashion", ("fashion", "服装", "穿搭", "style")),
        ("product", ("product", "产品", "商品", "item")),
        ("scene", ("scene", "场景", "环境", "location", "place")),
    )
    for mode, keywords in mode_keywords:
        if any(word in combined_text for word in keywords):
            return mode
    return "describe"


def process_user_input(
    user_input: str, user_request: str = "", config: Dict[str, str] = None
) -> Dict[str, Any]:
    """
    Process user input for visual content and return analysis results.

    Args:
        user_input: The user's input text that may contain visual content references
        user_request: The original user request/context (optional)
        config: Configuration for the vision analyzer (optional)

    Returns:
        Dictionary containing detection results and analysis. On success it
        has the keys "status", "confidence", "detected_items",
        "analysis_results" and "errors" (plus "message" when nothing was
        analyzed). Any unexpected failure is reported as
        {"status": "error", "error": ..., "message": ...} instead of raising.
    """
    try:
        # The relative form only works when this module is loaded as part of
        # a package; fall back to the absolute form so the file also works
        # when executed directly as a script (it has a __main__ entry point).
        try:
            from .vision_detector import VisionContentDetector, DetectionConfidence
            from .standalone_vision_analyzer import StandaloneVisionAnalyzer
        except ImportError:
            from vision_detector import VisionContentDetector, DetectionConfidence
            from standalone_vision_analyzer import StandaloneVisionAnalyzer

        # Initialize components
        detector = VisionContentDetector()
        analyzer = StandaloneVisionAnalyzer(config)

        # Step 1: Detect visual content
        confidence, detected_items = detector.detect_visual_content(user_input)
        result = {
            "status": "success",
            "confidence": confidence.value,
            "detected_items": detected_items,
            "analysis_results": [],
            "errors": [],
        }
        if confidence == DetectionConfidence.NONE:
            result["message"] = "No visual content detected"
            return result

        # Step 2: Extract image paths
        image_paths = detector.extract_image_paths(user_input)
        if not image_paths:
            result["message"] = "Visual content detected but no valid image paths found"
            result["errors"].append("No valid image paths found")
            return result

        # Step 3: Determine analysis mode from the request and the input text
        mode = _select_analysis_mode((user_request + " " + user_input).lower())

        # Step 4: Analyze each image
        for image_path in image_paths:
            try:
                # The analyzer reads local files only. Joining a URL onto the
                # working directory would just produce a misleading
                # "file not found", so report it explicitly instead.
                if image_path.startswith(("http://", "https://")):
                    result["errors"].append(
                        f"URL handling not implemented: {image_path}"
                    )
                    continue

                # Handle relative paths
                if not os.path.isabs(image_path):
                    image_path = os.path.join(os.getcwd(), image_path)

                analysis_result = analyzer.analyze_with_mode(Path(image_path), mode)
                result["analysis_results"].append(
                    {"image_path": image_path, "mode": mode, "result": analysis_result}
                )
            except Exception as e:
                # One bad image must not abort the remaining ones.
                error_msg = f"Failed to analyze {image_path}: {str(e)}"
                result["errors"].append(error_msg)
                print(f"Error: {error_msg}", file=sys.stderr)
        return result
    except Exception as e:
        return {
            "status": "error",
            "error": str(e),
            "message": f"Processing failed: {str(e)}",
        }
def main():
    """Command line interface for testing.

    Parses the command line, builds an optional analyzer configuration from
    either --api-key or --config-file (mutually exclusive), runs
    process_user_input and prints the JSON result or writes it to --output.
    """
    import argparse
    import json

    parser = argparse.ArgumentParser(description="Process visual content in user input")
    parser.add_argument("input", help="User input containing visual content references")
    parser.add_argument("--request", "-r", help="Original user request/context")
    # Keep the group in a local variable rather than attaching an ad-hoc
    # attribute to the parser object.
    credentials = parser.add_mutually_exclusive_group()
    credentials.add_argument("--api-key", help="API key for vision service")
    credentials.add_argument("--config-file", help="Configuration file path")
    parser.add_argument("--output", "-o", help="Output file for results")
    args = parser.parse_args()

    # Build configuration. None (rather than an empty dict) is passed when no
    # option is given so the analyzer falls back to its environment-based
    # configuration instead of treating {} as an explicit, empty config.
    config = None
    if args.api_key:
        config = {
            "api_key": args.api_key,
            "base_url": "https://ark.cn-beijing.volces.com/api/coding/v3",
            "model": "doubao-seed-code",
        }
    elif args.config_file:
        with open(args.config_file, "r", encoding="utf-8") as f:
            config = json.load(f)

    # Process the input
    result = process_user_input(args.input, args.request or "", config)

    # Output results
    output = json.dumps(result, indent=2, ensure_ascii=False)
    if args.output:
        with open(args.output, "w", encoding="utf-8") as f:
            f.write(output)
        print(f"Results saved to: {args.output}")
    else:
        print(output)


if __name__ == "__main__":
    main()
@@ -0,0 +1,216 @@
#!/usr/bin/env python3
"""
Standalone Vision Analyzer - Simplified version for agent-vision-awareness skill
This is a self-contained version of the vision analyzer that doesn't depend on
the image-service skill structure, making it easier to integrate directly.
"""
import base64
import json
import os
import sys
from pathlib import Path
from typing import Dict, Any, Optional
import httpx
class StandaloneVisionAnalyzer:
    """Standalone vision analyzer using direct API calls.

    Sends a local image (base64-encoded as a data URL) together with a text
    prompt to an OpenAI-style ``/chat/completions`` endpoint and returns the
    text of the first choice.
    """

    # Predefined analysis modes (prompt text sent to the model per mode)
    ANALYSIS_MODES = {
        "describe": "请详细描述这张图片的内容,包括:人物、场景、物品、颜色、布局等所有细节。",
        "ocr": "请仔细识别这张图片中的所有文字内容,按照文字在图片中的位置顺序输出。如果是中文,请保持原文输出。",
        "chart": "请分析这张图表的内容,包括:图表类型、数据趋势、关键数据点、标题标签、以及数据的结论或洞察。",
        "fashion": "请分析这张图片中人物的穿搭,包括:服装款式、颜色搭配、配饰、整体风格等。",
        "product": "请分析这张产品图片,包括:产品类型、外观特征、功能特点、品牌信息等。",
        "scene": "请描述这张图片的场景,包括:地点、环境、氛围、时间(白天/夜晚)等。",
        "custom": "用户自定义问题",
    }

    # Non-secret defaults used when neither the config nor the environment
    # provides a value.
    DEFAULT_BASE_URL = "https://ark.cn-beijing.volces.com/api/coding/v3"
    DEFAULT_MODEL = "doubao-seed-code"

    def __init__(self, config: Optional[Dict[str, str]] = None):
        """
        Initialize the analyzer.

        Explicit ``config`` values win; anything missing is taken from the
        environment (see ``_load_config``); ``base_url`` and ``model`` finally
        fall back to built-in defaults. There is deliberately no built-in API
        key: credentials must come from the caller or the environment.

        Args:
            config: Configuration dictionary with api_key, base_url, model

        Raises:
            ValueError: If no API key (or base URL) can be determined.
        """
        env_config = self._load_config()
        config = config or {}
        self.api_key = (
            config.get("api_key")
            or config.get("VOLCENGINE_API_KEY")
            or env_config.get("api_key")
        )
        self.base_url = (
            config.get("base_url")
            or env_config.get("base_url")
            or self.DEFAULT_BASE_URL
        )
        self.model = (
            config.get("model") or env_config.get("model") or self.DEFAULT_MODEL
        )
        if not self.api_key or not self.base_url:
            raise ValueError("Missing required API configuration: api_key and base_url")

    def _load_config(self) -> Dict[str, str]:
        """Load configuration from environment variables.

        Reads VOLCENGINE_API_KEY (or DASHSCOPE_API_KEY), VISION_API_BASE_URL
        and VISION_MODEL; unset variables yield None values.
        """
        return {
            "api_key": os.environ.get("VOLCENGINE_API_KEY")
            or os.environ.get("DASHSCOPE_API_KEY"),
            "base_url": os.environ.get("VISION_API_BASE_URL"),
            "model": os.environ.get("VISION_MODEL"),
        }

    def encode_image(self, image_path: Path) -> str:
        """Encode image to base64."""
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

    @staticmethod
    def _guess_mime_type(image_path: Path) -> str:
        """Return the image MIME type for *image_path*, defaulting to image/png."""
        import mimetypes

        mime_type = mimetypes.guess_type(str(image_path))[0]
        if mime_type and mime_type.startswith("image/"):
            return mime_type
        return "image/png"

    def analyze(self, image_path: Path, question: str) -> str:
        """
        Analyze image content.

        Args:
            image_path: Path to the image file
            question: Question/prompt for analysis

        Returns:
            Analysis result text

        Raises:
            FileNotFoundError: If the image does not exist.
            ValueError: On HTTP 404 (bad base_url) or 401 (bad API key).
            RuntimeError: On any other request or response-parsing failure.
        """
        if not image_path.exists():
            raise FileNotFoundError(f"Image not found: {image_path}")
        base64_image = self.encode_image(image_path)
        # Label the payload with the file's real type instead of always
        # claiming PNG.
        mime_type = self._guess_mime_type(image_path)
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        payload = {
            "model": self.model,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": question},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{base64_image}"
                            },
                        },
                    ],
                }
            ],
            "max_tokens": 2000,
        }
        try:
            with httpx.Client(timeout=30.0) as client:
                response = client.post(
                    f"{self.base_url}/chat/completions", headers=headers, json=payload
                )
                response.raise_for_status()
                result = response.json()
                return result["choices"][0]["message"]["content"]
        except httpx.HTTPStatusError as e:
            # Translate the two most common misconfigurations into actionable
            # messages; chain the cause so the HTTP details are not lost.
            if e.response.status_code == 404:
                raise ValueError(
                    f"API endpoint not found, check base_url: {self.base_url}"
                ) from e
            elif e.response.status_code == 401:
                raise ValueError("Invalid or expired API key") from e
            else:
                raise RuntimeError(f"API request failed: {e}") from e
        except Exception as e:
            raise RuntimeError(f"Analysis failed: {e}") from e

    def analyze_with_mode(
        self,
        image_path: Path,
        mode: str = "describe",
        custom_question: Optional[str] = None,
    ) -> str:
        """
        Analyze image with predefined mode.

        Args:
            image_path: Path to the image file
            mode: Analysis mode (describe, ocr, chart, fashion, product, scene, custom)
            custom_question: Custom question for custom mode

        Returns:
            Analysis result text

        Raises:
            ValueError: If the mode is unknown, or mode is "custom" and no
                custom_question is given.
        """
        if mode not in self.ANALYSIS_MODES:
            raise ValueError(
                f"Unsupported mode: {mode}, available: {list(self.ANALYSIS_MODES.keys())}"
            )
        if mode == "custom":
            if not custom_question:
                raise ValueError("Custom mode requires custom_question parameter")
            question = custom_question
        else:
            question = self.ANALYSIS_MODES[mode]
        return self.analyze(image_path, question)
def main():
    """Command line interface.

    Analyzes one image with the selected mode and prints the result or
    writes it to the --output file. Exits with status 1 on any error.
    """
    import argparse

    cli = argparse.ArgumentParser(description="Standalone Vision Analyzer")
    cli.add_argument("image", help="Image path")
    cli.add_argument(
        "--mode",
        "-m",
        choices=["describe", "ocr", "chart", "fashion", "product", "scene", "custom"],
        default="describe",
        help="Analysis mode",
    )
    cli.add_argument("--question", "-q", help="Custom question for custom mode")
    cli.add_argument("--output", "-o", help="Output file")
    options = cli.parse_args()

    target = Path(options.image)
    if not target.exists():
        print(f"Error: Image not found: {target}", file=sys.stderr)
        sys.exit(1)

    try:
        analyzer = StandaloneVisionAnalyzer()
        is_custom = options.mode == "custom"
        if is_custom and not options.question:
            print("Error: Custom mode requires --question parameter", file=sys.stderr)
            sys.exit(1)
        # The question is only forwarded for custom mode; predefined modes
        # use their built-in prompt.
        question = options.question if is_custom else None
        result = analyzer.analyze_with_mode(target, options.mode, question)

        if not options.output:
            print("Analysis Result:", "-" * 50, result, sep="\n")
        else:
            with open(options.output, "w", encoding="utf-8") as sink:
                sink.write(result)
            print(f"Result saved to: {options.output}")
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
@@ -0,0 +1,106 @@
#!/usr/bin/env python3
"""
Test script for agent-vision-awareness skill
This script tests the vision detection and processing capabilities.
"""
import os
import sys
from pathlib import Path
def test_detection():
    """Test visual content detection.

    Runs the detector over a fixed set of inputs, prints one report per
    case, and returns True only if every case produced the expected
    confidence level.
    """
    print("Testing visual content detection...")
    # The relative form only works inside a package; fall back to the
    # absolute form so this file can also be run directly as a script.
    try:
        from .vision_detector import VisionContentDetector, DetectionConfidence
    except ImportError:
        from vision_detector import VisionContentDetector, DetectionConfidence

    detector = VisionContentDetector()
    test_cases = [
        ("帮我分析这个截图 error.png", DetectionConfidence.HIGH),
        # NOTE(review): the detector lists "图片" as a high-priority keyword
        # (score 0.8, i.e. MEDIUM), so this LOW expectation looks inconsistent
        # with it - confirm which side is intended.
        ("描述这张图片的内容", DetectionConfidence.LOW),
        ("根据架构图 design/architecture.png 生成部署方案", DetectionConfidence.HIGH),
        ("写一个 Python 脚本", DetectionConfidence.NONE),
        ("![diagram](flow.png) 显示什么?", DetectionConfidence.HIGH),
    ]

    all_passed = True
    for test_input, expected_confidence in test_cases:
        confidence, items = detector.detect_visual_content(test_input)
        passed = confidence == expected_confidence
        all_passed = all_passed and passed
        status = "✅" if passed else "❌"
        print(f"{status} Input: {test_input}")
        print(f" Expected: {expected_confidence.value}, Got: {confidence.value}")
        if items:
            print(f" Detected: {items}")
        print()
    # Report the real outcome so a mismatch fails the run.
    return all_passed
def test_integration():
    """Test integration with vision analyzer (if API key available).

    Returns False both when the test fails and when it is skipped because
    neither VOLCENGINE_API_KEY nor DASHSCOPE_API_KEY is set.
    """
    print("Testing vision integration...")
    # Check if API key is available
    api_key = os.environ.get("VOLCENGINE_API_KEY") or os.environ.get(
        "DASHSCOPE_API_KEY"
    )
    if not api_key:
        print("⚠️ No API key found. Skipping integration test.")
        print(
            " Set VOLCENGINE_API_KEY or DASHSCOPE_API_KEY environment variable to test."
        )
        return False

    try:
        # The relative form only works inside a package; fall back to the
        # absolute form so this file can also be run directly as a script.
        try:
            from .integrate_vision import process_user_input
        except ImportError:
            from integrate_vision import process_user_input

        # Test with a simple request (won't actually process image without file)
        result = process_user_input(
            "测试视觉处理",
            "这是一个测试",
            config={
                "api_key": api_key,
                "base_url": "https://ark.cn-beijing.volces.com/api/coding/v3",
                "model": "doubao-seed-code",
            },
        )
        if result["status"] == "success":
            print("✅ Integration test passed (configuration valid)")
            return True
        else:
            print(f"❌ Integration test failed: {result.get('error', 'Unknown error')}")
            return False
    except Exception as e:
        print(f"❌ Integration test failed: {e}")
        return False
def main():
    """Run all tests and return True only if every one of them succeeded."""
    separator = "=" * 50
    print("🧪 Testing Agent Vision Awareness Skill")
    print(separator)

    # Both suites always run (no short-circuit): detection first, then the
    # integration test, which reports False when it is skipped.
    outcomes = [test_detection(), test_integration()]
    success = all(outcomes)

    print(separator)
    if not success:
        print("⚠️ Some tests failed or were skipped.")
    else:
        print("✅ All tests passed!")
    return success


if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)
@@ -0,0 +1,336 @@
#!/usr/bin/env python3
"""
Vision Content Detector - Detects visual content in user input for agent-vision-awareness skill
This script implements the detection logic described in the skill documentation,
but integrates with the actual working vision processing implementation
using direct API calls rather than custom agent delegation.
"""
import re
from pathlib import Path
from typing import List, Dict, Tuple, Optional
from enum import Enum
class DetectionConfidence(Enum):
    """Overall confidence that the user input refers to visual content."""

    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"
    NONE = "none"


class VisionContentDetector:
    """Detects visual content in user input based on various patterns."""

    # Image file extensions (case-insensitive)
    IMAGE_EXTENSIONS = [
        ".png",
        ".jpg",
        ".jpeg",
        ".gif",
        ".bmp",
        ".webp",
        ".svg",
        ".ico",
        ".tiff",
        ".tif",
        ".heic",
        ".heif",
        ".raw",
        ".psd",
        ".ai",
        ".eps",
    ]
    # Document files with potential visual content
    DOCUMENT_EXTENSIONS = [".pdf", ".ppt", ".pptx", ".vsdx", ".drawio"]
    # Chinese visual keywords
    CHINESE_KEYWORDS = {
        "high": [
            "图片",
            "图像",
            "照片",
            "截图",
            "图表",
            "图示",
            "图形",
            "影像",
            "画面",
        ],
        "medium": [
            "流程图",
            "架构图",
            "时序图",
            "ER 图",
            "思维导图",
            "柱状图",
            "饼图",
            "折线图",
            "设计图",
            "原型图",
            "线框图",
            "界面",
            "UI",
            "UX",
            "表格",
            "表单",
            "清单",
            "列表",
        ],
        "low": ["显示", "展示", "呈现", "可视化", "看图", "读图"],
    }
    # English visual keywords
    ENGLISH_KEYWORDS = {
        "high": [
            "image",
            "photo",
            "picture",
            "screenshot",
            "snapshot",
            "capture",
            "diagram",
            "chart",
            "graph",
            "plot",
            "figure",
        ],
        "medium": [
            "flowchart",
            "architecture",
            "sequence diagram",
            "ER diagram",
            "mind map",
            "bar chart",
            "pie chart",
            "line graph",
            "design",
            "mockup",
            "wireframe",
            "interface",
            "UI",
            "UX",
            "layout",
            "table",
            "form",
            "list",
            "grid",
        ],
        "low": ["show", "display", "visualize", "view", "look at", "see"],
    }
    # Technical visual keywords
    TECHNICAL_KEYWORDS = [
        "schema",
        "model",
        "blueprint",
        "spec",
        "technical drawing",
        "dashboard",
        "widget",
        "panel",
        "visualization",
        "map",
        "heatmap",
        "scatter plot",
        "histogram",
        "infographic",
        "poster",
        "banner",
        "thumbnail",
    ]

    # Direct links to common raster images; compiled once for the class.
    _URL_IMAGE_PATTERN = re.compile(
        r"https?://[^\s]+?\.(?:png|jpg|jpeg|gif|bmp|webp)", re.IGNORECASE
    )
    # Any http(s) URL; used to blank URLs out before local-path matching.
    _ANY_URL_PATTERN = re.compile(r"https?://\S+", re.IGNORECASE)

    def __init__(self):
        """Initialize the detector with compiled regex patterns."""
        self._compile_patterns()

    def _compile_patterns(self):
        """Compile regex patterns for performance."""
        # File extension pattern. IMAGE_EXTENSIONS entries already carry
        # their leading dot, so strip it here: the pattern supplies the
        # single literal dot itself. The trailing lookahead stops a short
        # extension from matching inside a longer ASCII suffix.
        ext_pattern = "|".join(
            re.escape(ext.lstrip(".")) for ext in self.IMAGE_EXTENSIONS
        )
        self.file_ext_pattern = re.compile(
            rf"[\w\-\.\/]+?\.(?:{ext_pattern})(?![A-Za-z0-9])", re.IGNORECASE
        )
        # Markdown image syntax
        self.markdown_img_pattern = re.compile(r"!\[([^\]]*)\]\(([^\)]+)\)")
        # Base64 image data
        self.base64_img_pattern = re.compile(
            r"data:image\/(png|jpeg|gif|webp);base64,[A-Za-z0-9+/=]+"
        )
        # Keyword + file reference
        keyword_pattern = "|".join(
            [
                re.escape(k)
                for k in self.CHINESE_KEYWORDS["high"] + self.ENGLISH_KEYWORDS["high"]
            ]
        )
        ext_pattern_short = "|".join(
            re.escape(ext.lstrip(".")) for ext in self.IMAGE_EXTENSIONS[:7]
        )  # Common ones
        self.keyword_file_pattern = re.compile(
            rf"({keyword_pattern}).*?[\w\-\.\/]+\.(?:{ext_pattern_short})",
            re.IGNORECASE,
        )

    def _find_local_files(self, user_input: str) -> List[str]:
        """Return image file references in the input, ignoring anything inside a URL."""
        # Without this, the tail of "https://host/a.png" would also be
        # reported as the bogus local path "//host/a.png".
        without_urls = self._ANY_URL_PATTERN.sub(" ", user_input)
        return self.file_ext_pattern.findall(without_urls)

    def detect_visual_content(
        self, user_input: str
    ) -> Tuple[DetectionConfidence, List[str]]:
        """
        Detect visual content in user input and return confidence level and detected items.

        Args:
            user_input: The user's input text

        Returns:
            Tuple of (confidence_level, detected_items). The level is derived
            from the highest score of any check: >= 0.9 is HIGH, >= 0.6 is
            MEDIUM, anything lower is LOW, and no hit at all is NONE.
        """
        detected_items = []
        confidence_scores = []

        # Check 1: File extensions
        file_matches = self._find_local_files(user_input)
        if file_matches:
            detected_items.extend(file_matches)
            confidence_scores.append(0.9)  # High confidence

        # Check 2: Markdown image syntax
        markdown_matches = self.markdown_img_pattern.findall(user_input)
        if markdown_matches:
            detected_items.extend([f"{alt}:{url}" for alt, url in markdown_matches])
            confidence_scores.append(0.9)  # High confidence

        # Check 3: Base64 image data
        base64_matches = self.base64_img_pattern.findall(user_input)
        if base64_matches:
            detected_items.extend([f"base64:{fmt}" for fmt in base64_matches])
            confidence_scores.append(0.9)  # High confidence

        # Check 4: Visual keywords (contribute a score but no detected item)
        keyword_confidence = self._check_keywords(user_input)
        if keyword_confidence > 0:
            confidence_scores.append(keyword_confidence)

        # Check 5: URL images
        url_images = self._detect_url_images(user_input)
        if url_images:
            detected_items.extend(url_images)
            confidence_scores.append(0.8)  # Medium-high confidence

        # Determine overall confidence
        if not confidence_scores:
            return DetectionConfidence.NONE, []
        max_confidence = max(confidence_scores)
        if max_confidence >= 0.9:
            return DetectionConfidence.HIGH, detected_items
        elif max_confidence >= 0.6:
            return DetectionConfidence.MEDIUM, detected_items
        else:
            return DetectionConfidence.LOW, detected_items

    def _check_keywords(self, user_input: str) -> float:
        """Check for visual keywords and return confidence score.

        Matching is a plain substring test against the lower-cased input.
        """
        input_lower = user_input.lower()
        # NOTE(review): keywords from the CHINESE/ENGLISH tables are compared
        # as written, so upper-case entries ("UI", "UX", "ER 图",
        # "ER diagram") can never match the lower-cased input. Simply
        # lower-casing them would make "ui"/"ux" hit words like "build" or
        # "linux"; fixing this properly needs word-boundary matching.
        # Check high priority keywords
        for keyword in self.CHINESE_KEYWORDS["high"] + self.ENGLISH_KEYWORDS["high"]:
            if keyword in input_lower:
                return 0.8
        # Check medium priority keywords
        for keyword in (
            self.CHINESE_KEYWORDS["medium"] + self.ENGLISH_KEYWORDS["medium"]
        ):
            if keyword in input_lower:
                return 0.6
        # Check technical keywords
        for keyword in self.TECHNICAL_KEYWORDS:
            if keyword.lower() in input_lower:
                return 0.6
        # Check low priority keywords
        for keyword in self.CHINESE_KEYWORDS["low"] + self.ENGLISH_KEYWORDS["low"]:
            if keyword in input_lower:
                return 0.4
        return 0.0

    def _detect_url_images(self, user_input: str) -> List[str]:
        """Detect image URLs in the input."""
        return self._URL_IMAGE_PATTERN.findall(user_input)

    def extract_image_paths(self, user_input: str) -> List[str]:
        """
        Extract actual image paths/URLs from user input.

        Returns:
            List of image paths or URLs, de-duplicated in order of first
            appearance (local files, then markdown targets, then direct URLs)
        """
        image_paths = []
        # File paths with extensions
        image_paths.extend(self._find_local_files(user_input))
        # Markdown image URLs
        markdown_matches = self.markdown_img_pattern.findall(user_input)
        image_paths.extend(url for _alt, url in markdown_matches)
        # Direct URLs
        image_paths.extend(self._detect_url_images(user_input))
        # Remove duplicates while preserving order
        return list(dict.fromkeys(image_paths))
def main():
    """Command line interface for testing.

    Prints either the extracted image paths (with --extract-paths) or the
    detection confidence followed by the detected items.
    """
    import argparse
    import sys

    cli = argparse.ArgumentParser(description="Detect visual content in user input")
    cli.add_argument("input", help="User input to analyze")
    cli.add_argument(
        "--extract-paths",
        action="store_true",
        help="Extract and return image paths only",
    )
    options = cli.parse_args()
    detector = VisionContentDetector()

    # Path-only mode: one path per line, nothing else.
    if options.extract_paths:
        for found_path in detector.extract_image_paths(options.input):
            print(found_path)
        return

    level, detected = detector.detect_visual_content(options.input)
    print(f"Confidence: {level.value}")
    if not detected:
        print("No visual content detected")
        return
    print("Detected items:")
    for entry in detected:
        print(f" - {entry}")


if __name__ == "__main__":
    main()
@@ -0,0 +1,82 @@
# -*- coding: utf-8 -*-
"""Direct image analysis through an OpenAI-compatible chat completions client."""
import sys

# Substitute unencodable characters instead of raising when printing.
for _stream in (sys.stdout, sys.stderr):
    _stream.reconfigure(errors='replace')

import os

os.environ['PYTHONIOENCODING'] = 'utf-8'

import base64
import json
import time
import tempfile
from pathlib import Path

from openai import OpenAI

# Shared temporary directory (created on import if missing).
# NOTE(review): absolute, machine-specific Windows path.
TEMP_DIR = r'D:\F\NewI\opencode\daily-workspace\temp'
os.makedirs(TEMP_DIR, exist_ok=True)

# Read the Volcengine Ark API key and base URL from the OpenCode config file.
# NOTE(review): absolute, user-specific path; import fails if it is absent.
CONFIG_PATH = r'C:\Users\hmo\.config\opencode\config.json'
with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
    config = json.load(f)

_volcengine_options = config['provider']['volcengine']['options']
API_KEY = _volcengine_options['apiKey']
BASE_URL = _volcengine_options['baseURL']

client = OpenAI(base_url=BASE_URL, api_key=API_KEY)
MODEL = 'doubao-seed-2.0-pro'
def analyze_image(image_path_or_url, prompt="详细描述这张图片的内容"):
    """
    Analyze an image; accepts a local path or an http/https URL.

    Local files are embedded as a base64 data URL; URLs are passed through
    unchanged. Failures are reported in the returned string rather than
    raised.

    :param image_path_or_url: image path or URL
    :param prompt: analysis prompt sent with the image
    :return: the model's answer, or an error message string
    """
    import mimetypes

    try:
        # Remote image: hand the URL to the API as-is.
        if image_path_or_url.lower().startswith(('http://', 'https://')):
            image_url = image_path_or_url
        else:
            # Local image: embed it as a data URL.
            image_path = Path(image_path_or_url)
            if not image_path.exists():
                return f"错误:图片不存在 {image_path}"
            # Use the registered MIME type (".jpg" is "image/jpeg", not
            # "image/jpg"); fall back to the lower-cased suffix, and to PNG
            # when the file has no suffix at all.
            mime_type = mimetypes.guess_type(str(image_path))[0]
            if not (mime_type and mime_type.startswith('image/')):
                suffix = image_path.suffix.lstrip('.').lower()
                mime_type = f"image/{suffix or 'png'}"
            with open(image_path, 'rb') as f:
                image_base64 = base64.b64encode(f.read()).decode('utf-8')
            image_url = f"data:{mime_type};base64,{image_base64}"

        # Call the API
        response = client.chat.completions.create(
            model=MODEL,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {"type": "image_url", "image_url": {"url": image_url}}
                    ]
                }
            ],
            max_tokens=1000
        )
        return response.choices[0].message.content
    except Exception as e:
        # Callers expect a string in every case, so report instead of raising.
        return f"图片识别失败:{type(e).__name__}: {str(e)}"
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    if not cli_args:
        print("用法:python vision_direct.py <图片路径/URL> [提示词]")
        sys.exit(1)

    # First argument is the image, the optional second one the prompt.
    source = cli_args[0]
    question = cli_args[1] if len(cli_args) > 1 else "详细描述这张图片的内容"
    analysis = analyze_image(source, question)
    print(analysis)

    # Also keep a timestamped copy of the result in the temp directory.
    result_name = f"vision_result_{int(time.time())}.txt"
    with open(os.path.join(TEMP_DIR, result_name), 'w', encoding='utf-8') as out:
        out.write(analysis)
@@ -0,0 +1,215 @@
#!/usr/bin/env python3
"""
Vision Processor - Integrates with image-service vision analyzer for agent-vision-awareness skill
This script provides the actual implementation that replaces the problematic
custom agent delegation approach described in the outdated documentation.
"""
import sys
import os
from pathlib import Path
from typing import Dict, Any, Optional, List
from enum import Enum
# Add the image-service scripts to path to reuse the vision analyzer.
# The directory is resolved relative to this file:
# <three parents up>/image-service/scripts.
sys.path.append(str(Path(__file__).parent.parent.parent / "image-service" / "scripts"))
try:
    from vision_analyzer import VisionAnalyzer
except ImportError:
    # Fallback: try to import from current directory if vision_analyzer is copied here
    # NOTE(review): the relative form needs this module to be loaded as part
    # of a package; when the file is run directly as a script it raises
    # ImportError as well and ends in the error below.
    try:
        from .vision_analyzer import VisionAnalyzer
    except ImportError:
        # Neither location provided the analyzer: fail at import time with a
        # message that names the missing dependency.
        raise ImportError(
            "Cannot find VisionAnalyzer. Please ensure image-service is properly installed."
        )
class AnalysisMode(Enum):
    """Available analysis modes."""

    DESCRIBE = "describe"
    OCR = "ocr"
    CHART = "chart"
    FASHION = "fashion"
    PRODUCT = "product"
    SCENE = "scene"
    CUSTOM = "custom"


class VisionProcessor:
    """Main vision processing class that integrates detection and analysis."""

    def __init__(self, config: Optional[Dict[str, str]] = None):
        """
        Initialize the vision processor.

        Args:
            config: Configuration dictionary for the VisionAnalyzer
        """
        self.analyzer = VisionAnalyzer(config)
        self.detector = None  # Will be created when needed

    def process_visual_content(
        self, user_input: str, user_request: str = ""
    ) -> Dict[str, Any]:
        """
        Process visual content in user input.

        Args:
            user_input: The user's input text that may contain visual content references
            user_request: The original user request/context

        Returns:
            Dictionary containing analysis results and metadata: the keys
            "confidence", "detected_items", "analysis_results" and "errors",
            plus "message" when nothing was analyzed. Per-image failures are
            collected in "errors" rather than raised.
        """
        # The relative form only works when this module is part of a package;
        # fall back to the absolute form so the file also works when run
        # directly as a script, which its VisionAnalyzer import supports.
        try:
            from .vision_detector import VisionContentDetector, DetectionConfidence
        except ImportError:
            from vision_detector import VisionContentDetector, DetectionConfidence

        # Initialize detector if not already done
        if self.detector is None:
            self.detector = VisionContentDetector()

        # Detect visual content
        confidence, detected_items = self.detector.detect_visual_content(user_input)
        result = {
            "confidence": confidence.value,
            "detected_items": detected_items,
            "analysis_results": [],
            "errors": [],
        }
        if confidence == DetectionConfidence.NONE:
            result["message"] = "No visual content detected"
            return result

        # Extract image paths
        image_paths = self.detector.extract_image_paths(user_input)
        if not image_paths:
            result["message"] = "Visual content detected but no valid image paths found"
            result["errors"].append("No valid image paths found")
            return result

        # Determine analysis mode based on user request
        analysis_mode = self._determine_analysis_mode(user_request, user_input)

        # Process each image
        for image_path in image_paths:
            try:
                # URLs would need to be downloaded to a temp file first; that
                # is not implemented, so only local paths are analyzed.
                if image_path.startswith(("http://", "https://")):
                    result["errors"].append(
                        f"URL handling not implemented: {image_path}"
                    )
                    continue

                # Ensure path is absolute (relative paths are resolved
                # against the current working directory)
                if not os.path.isabs(image_path):
                    image_path = os.path.join(os.getcwd(), image_path)

                # Analyze the image
                if analysis_mode == AnalysisMode.CUSTOM:
                    # Use the user request as the custom question. The
                    # built-in mode selection never returns CUSTOM; this
                    # branch serves overrides of _determine_analysis_mode.
                    analysis_result = self.analyzer.analyze_with_mode(
                        Path(image_path),
                        "custom",
                        user_request or "Please analyze this image.",
                    )
                else:
                    analysis_result = self.analyzer.analyze_with_mode(
                        Path(image_path), analysis_mode.value
                    )
                result["analysis_results"].append(
                    {
                        "image_path": image_path,
                        "analysis_mode": analysis_mode.value,
                        "result": analysis_result,
                    }
                )
            except Exception as e:
                # One bad image must not abort the remaining ones.
                error_msg = f"Failed to analyze {image_path}: {str(e)}"
                result["errors"].append(error_msg)
                print(f"Error: {error_msg}", file=sys.stderr)
        return result

    def _determine_analysis_mode(
        self, user_request: str, user_input: str
    ) -> AnalysisMode:
        """
        Determine the appropriate analysis mode based on user context.

        The request and the input are concatenated and lower-cased; the first
        mode (in priority order) with a keyword occurring as a substring
        wins, and DESCRIBE is the default.

        Args:
            user_request: The user's original request
            user_input: The full input containing visual content references

        Returns:
            AnalysisMode enum value
        """
        combined_text = (user_request + " " + user_input).lower()
        # Priority-ordered keyword table; the first mode with a hit wins.
        mode_keywords = (
            (AnalysisMode.OCR, ("text", "文字", "ocr", "read", "识别")),
            (AnalysisMode.CHART, ("chart", "graph", "plot", "图表", "数据", "趋势")),
            (AnalysisMode.FASHION, ("fashion", "服装", "穿搭", "style")),
            (AnalysisMode.PRODUCT, ("product", "产品", "商品", "item")),
            (AnalysisMode.SCENE, ("scene", "场景", "环境", "location", "place")),
        )
        for mode, keywords in mode_keywords:
            if any(word in combined_text for word in keywords):
                return mode
        return AnalysisMode.DESCRIBE
def main():
    """Command line interface for testing.

    Runs the processor on the given input and prints the JSON result or
    writes it to the --output file; exits with status 1 on any error.
    """
    import argparse

    cli = argparse.ArgumentParser(description="Process visual content in user input")
    cli.add_argument("input", help="User input containing visual content references")
    cli.add_argument("--request", "-r", help="Original user request/context")
    cli.add_argument("--output", "-o", help="Output file for results")
    options = cli.parse_args()

    try:
        import json

        outcome = VisionProcessor().process_visual_content(
            options.input, options.request or ""
        )
        rendered = json.dumps(outcome, indent=2, ensure_ascii=False)
        if not options.output:
            print(rendered)
        else:
            with open(options.output, "w", encoding="utf-8") as sink:
                sink.write(rendered)
            print(f"Results saved to: {options.output}")
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()