Initial commit: skills library
- 70 skills with code and documentation
- Add .gitignore (ignore __pycache__, output/, temp/, venv/)
- Clean up test intermediates and caches
This commit is contained in:
@@ -0,0 +1,4 @@
|
||||
#!/usr/bin/env python3
"""Placeholder skill script; remove it if the skill does not need one."""

_GREETING = "Hello from skill!"
print(_GREETING)
|
||||
@@ -0,0 +1,167 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Integration script for agent-vision-awareness skill
|
||||
|
||||
This script demonstrates the complete workflow:
|
||||
1. Detect visual content in user input
|
||||
2. Extract image paths
|
||||
3. Analyze images using the vision analyzer
|
||||
4. Return structured results
|
||||
|
||||
This replaces the problematic custom agent delegation approach.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, List
|
||||
|
||||
|
||||
def process_user_input(
    user_input: str, user_request: str = "", config: Dict[str, str] = None
) -> Dict[str, Any]:
    """
    Process user input for visual content and return analysis results.

    Workflow: detect visual content, extract image paths, choose an analysis
    mode from keywords in the request/input, then analyze every local image.

    Args:
        user_input: The user's input text that may contain visual content references
        user_request: The original user request/context (optional)
        config: Configuration for the vision analyzer (optional)

    Returns:
        On the normal path, a dictionary with "status" ("success"),
        "confidence", "detected_items", "analysis_results" and "errors",
        plus "message" when nothing was analyzed. Per-image failures are
        collected in "errors". If setup or detection itself raises, a
        dictionary with "status" ("error"), "error" and "message".
    """
    # (keywords, mode) pairs in priority order; the first group with a
    # keyword contained in the combined text decides the mode.
    mode_keywords = (
        (("text", "文字", "ocr", "read", "识别"), "ocr"),
        (("chart", "graph", "plot", "图表", "数据", "趋势"), "chart"),
        (("fashion", "服装", "穿搭", "style"), "fashion"),
        (("product", "产品", "商品", "item"), "product"),
        (("scene", "场景", "环境", "location", "place"), "scene"),
    )

    try:
        # Import the detector and analyzer
        try:
            from .vision_detector import VisionContentDetector, DetectionConfidence
            from .standalone_vision_analyzer import StandaloneVisionAnalyzer
        except ImportError:
            # Relative imports fail when this file is run directly as a
            # script (no parent package); fall back to sibling modules.
            from vision_detector import VisionContentDetector, DetectionConfidence
            from standalone_vision_analyzer import StandaloneVisionAnalyzer

        # Initialize components
        detector = VisionContentDetector()
        analyzer = StandaloneVisionAnalyzer(config)

        # Step 1: Detect visual content
        confidence, detected_items = detector.detect_visual_content(user_input)
        result = {
            "status": "success",
            "confidence": confidence.value,
            "detected_items": detected_items,
            "analysis_results": [],
            "errors": [],
        }

        if confidence == DetectionConfidence.NONE:
            result["message"] = "No visual content detected"
            return result

        # Step 2: Extract image paths
        image_paths = detector.extract_image_paths(user_input)
        if not image_paths:
            result["message"] = "Visual content detected but no valid image paths found"
            result["errors"].append("No valid image paths found")
            return result

        # Step 3: Determine analysis mode
        combined_text = (user_request + " " + user_input).lower()
        mode = next(
            (
                name
                for words, name in mode_keywords
                if any(word in combined_text for word in words)
            ),
            "describe",
        )

        # Step 4: Analyze each image
        for image_path in image_paths:
            try:
                # The analyzer reads local files only. A URL must not be
                # joined onto the working directory as if it were a path.
                if image_path.lower().startswith(("http://", "https://")):
                    result["errors"].append(
                        f"URL handling not implemented: {image_path}"
                    )
                    continue

                # Handle relative paths
                if not os.path.isabs(image_path):
                    image_path = os.path.join(os.getcwd(), image_path)

                analysis_result = analyzer.analyze_with_mode(Path(image_path), mode)
                result["analysis_results"].append(
                    {"image_path": image_path, "mode": mode, "result": analysis_result}
                )

            except Exception as e:
                # One bad image must not abort the remaining ones.
                error_msg = f"Failed to analyze {image_path}: {str(e)}"
                result["errors"].append(error_msg)
                print(f"Error: {error_msg}", file=sys.stderr)

        return result

    except Exception as e:
        return {
            "status": "error",
            "error": str(e),
            "message": f"Processing failed: {str(e)}",
        }
|
||||
|
||||
|
||||
def main():
    """Command line interface for testing.

    Builds the analyzer configuration from --api-key or --config-file
    (mutually exclusive), runs process_user_input and prints the JSON
    result or writes it to --output.
    """
    import argparse
    import json

    parser = argparse.ArgumentParser(description="Process visual content in user input")
    parser.add_argument("input", help="User input containing visual content references")
    parser.add_argument("--request", "-r", help="Original user request/context")
    # The two credential sources are alternatives, so only one may be given.
    credentials = parser.add_mutually_exclusive_group()
    credentials.add_argument("--api-key", help="API key for vision service")
    credentials.add_argument("--config-file", help="Configuration file path")
    parser.add_argument("--output", "-o", help="Output file for results")

    args = parser.parse_args()

    # Build configuration
    config = {}
    if args.api_key:
        config["api_key"] = args.api_key
        config["base_url"] = "https://ark.cn-beijing.volces.com/api/coding/v3"
        config["model"] = "doubao-seed-code"
    elif args.config_file:
        with open(args.config_file, "r", encoding="utf-8") as f:
            config = json.load(f)

    # Pass None rather than an empty dict so the analyzer falls back to its
    # environment-variable configuration when no option was supplied.
    result = process_user_input(args.input, args.request or "", config or None)

    # Output results
    output = json.dumps(result, indent=2, ensure_ascii=False)

    if args.output:
        with open(args.output, "w", encoding="utf-8") as f:
            f.write(output)
        print(f"Results saved to: {args.output}")
    else:
        print(output)


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,216 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Standalone Vision Analyzer - Simplified version for agent-vision-awareness skill
|
||||
|
||||
This is a self-contained version of the vision analyzer that doesn't depend on
|
||||
the image-service skill structure, making it easier to integrate directly.
|
||||
"""
|
||||
|
||||
import base64
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, Optional
|
||||
import httpx
|
||||
|
||||
|
||||
class StandaloneVisionAnalyzer:
    """Standalone vision analyzer using direct API calls.

    Sends a base64-encoded image plus a text prompt to the
    ``{base_url}/chat/completions`` endpoint and returns the text of the
    first choice in the response.
    """

    # Predefined analysis modes, mapped to the prompt sent with the image
    ANALYSIS_MODES = {
        "describe": "请详细描述这张图片的内容,包括:人物、场景、物品、颜色、布局等所有细节。",
        "ocr": "请仔细识别这张图片中的所有文字内容,按照文字在图片中的位置顺序输出。如果是中文,请保持原文输出。",
        "chart": "请分析这张图表的内容,包括:图表类型、数据趋势、关键数据点、标题标签、以及数据的结论或洞察。",
        "fashion": "请分析这张图片中人物的穿搭,包括:服装款式、颜色搭配、配饰、整体风格等。",
        "product": "请分析这张产品图片,包括:产品类型、外观特征、功能特点、品牌信息等。",
        "scene": "请描述这张图片的场景,包括:地点、环境、氛围、时间(白天/夜晚)等。",
        "custom": "用户自定义问题",
    }

    # Used when neither the config nor the environment supplies a value.
    DEFAULT_BASE_URL = "https://ark.cn-beijing.volces.com/api/coding/v3"
    DEFAULT_MODEL = "doubao-seed-code"

    def __init__(self, config: Optional[Dict[str, str]] = None):
        """
        Initialize the analyzer.

        Args:
            config: Configuration dictionary with api_key, base_url, model.
                When None or empty, values are read from environment
                variables instead.

        Raises:
            ValueError: If no API key is configured.
        """
        if not config:
            config = self._load_config()

        # No built-in credential: a key committed to source is a leaked
        # secret, and a fallback would make the check below unreachable.
        self.api_key = config.get("api_key") or config.get("VOLCENGINE_API_KEY")
        self.base_url = config.get("base_url") or self.DEFAULT_BASE_URL
        self.model = config.get("model") or self.DEFAULT_MODEL

        if not self.api_key or not self.base_url:
            raise ValueError("Missing required API configuration: api_key and base_url")

    def _load_config(self) -> Dict[str, str]:
        """Load configuration from environment variables.

        Reads VOLCENGINE_API_KEY (or DASHSCOPE_API_KEY), VISION_API_BASE_URL
        and VISION_MODEL; unset variables yield None values.
        """
        return {
            "api_key": os.environ.get("VOLCENGINE_API_KEY")
            or os.environ.get("DASHSCOPE_API_KEY"),
            "base_url": os.environ.get("VISION_API_BASE_URL"),
            "model": os.environ.get("VISION_MODEL"),
        }

    def encode_image(self, image_path: Path) -> str:
        """Return the file's bytes encoded as a base64 text string."""
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

    @staticmethod
    def _image_mime_type(image_path: Path) -> str:
        """Guess the image MIME type from the file name, defaulting to PNG."""
        import mimetypes

        guessed, _ = mimetypes.guess_type(str(image_path))
        if guessed and guessed.startswith("image/"):
            return guessed
        # Unknown or non-image extension: keep the historical default.
        return "image/png"

    def analyze(self, image_path: Path, question: str) -> str:
        """
        Analyze image content.

        Args:
            image_path: Path to the image file
            question: Question/prompt for analysis

        Returns:
            Analysis result text

        Raises:
            FileNotFoundError: If image_path does not exist.
            ValueError: If the API answers 404 (bad base_url) or 401 (bad key).
            RuntimeError: For any other request or response failure.
        """
        if not image_path.exists():
            raise FileNotFoundError(f"Image not found: {image_path}")

        base64_image = self.encode_image(image_path)
        mime_type = self._image_mime_type(image_path)

        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }

        payload = {
            "model": self.model,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": question},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{base64_image}"
                            },
                        },
                    ],
                }
            ],
            "max_tokens": 2000,
        }

        try:
            with httpx.Client(timeout=30.0) as client:
                response = client.post(
                    f"{self.base_url}/chat/completions", headers=headers, json=payload
                )
                response.raise_for_status()
                result = response.json()
                return result["choices"][0]["message"]["content"]
        except httpx.HTTPStatusError as e:
            # Translate the common misconfiguration statuses into clear
            # errors, keeping the original exception as the cause.
            status_code = e.response.status_code
            if status_code == 404:
                raise ValueError(
                    f"API endpoint not found, check base_url: {self.base_url}"
                ) from e
            if status_code == 401:
                raise ValueError("Invalid or expired API key") from e
            raise RuntimeError(f"API request failed: {e}") from e
        except Exception as e:
            raise RuntimeError(f"Analysis failed: {e}") from e

    def analyze_with_mode(
        self,
        image_path: Path,
        mode: str = "describe",
        custom_question: Optional[str] = None,
    ) -> str:
        """
        Analyze image with predefined mode.

        Args:
            image_path: Path to the image file
            mode: Analysis mode (describe, ocr, chart, fashion, product, scene, custom)
            custom_question: Custom question for custom mode

        Returns:
            Analysis result text

        Raises:
            ValueError: If mode is unknown, or mode is "custom" and no
                custom_question was given.
        """
        if mode not in self.ANALYSIS_MODES:
            raise ValueError(
                f"Unsupported mode: {mode}, available: {list(self.ANALYSIS_MODES.keys())}"
            )

        if mode == "custom":
            if not custom_question:
                raise ValueError("Custom mode requires custom_question parameter")
            question = custom_question
        else:
            question = self.ANALYSIS_MODES[mode]

        return self.analyze(image_path, question)
|
||||
|
||||
|
||||
def main():
    """CLI entry point: analyze a single image and print or save the result.

    Exits with status 1 when the image is missing, when custom mode is
    chosen without --question, or when the analysis raises.
    """
    import argparse

    cli = argparse.ArgumentParser(description="Standalone Vision Analyzer")
    cli.add_argument("image", help="Image path")
    cli.add_argument(
        "--mode",
        "-m",
        choices=["describe", "ocr", "chart", "fashion", "product", "scene", "custom"],
        default="describe",
        help="Analysis mode",
    )
    cli.add_argument("--question", "-q", help="Custom question for custom mode")
    cli.add_argument("--output", "-o", help="Output file")
    options = cli.parse_args()

    target = Path(options.image)
    if not target.exists():
        print(f"Error: Image not found: {target}", file=sys.stderr)
        sys.exit(1)

    try:
        analyzer = StandaloneVisionAnalyzer()

        is_custom = options.mode == "custom"
        if is_custom and not options.question:
            print("Error: Custom mode requires --question parameter", file=sys.stderr)
            sys.exit(1)

        # Only custom mode forwards the question as a third argument.
        extra_args = (options.question,) if is_custom else ()
        result = analyzer.analyze_with_mode(target, options.mode, *extra_args)

        if options.output:
            with open(options.output, "w", encoding="utf-8") as out_file:
                out_file.write(result)
            print(f"Result saved to: {options.output}")
        else:
            print("Analysis Result:")
            print("-" * 50)
            print(result)

    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,106 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test script for agent-vision-awareness skill
|
||||
|
||||
This script tests the vision detection and processing capabilities.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def test_detection():
    """Test visual content detection.

    Returns:
        True when every case produces the expected confidence, False when
        any case differs or the detector module cannot be imported.
    """
    print("Testing visual content detection...")

    try:
        try:
            from .vision_detector import VisionContentDetector, DetectionConfidence
        except ImportError:
            # No parent package when this file is run directly as a script.
            from vision_detector import VisionContentDetector, DetectionConfidence
    except ImportError as e:
        print(f"❌ Detection test failed: {e}")
        return False

    detector = VisionContentDetector()

    # NOTE(review): confirm these expectations against the detector's
    # keyword scoring and thresholds.
    test_cases = [
        ("帮我分析这个截图 error.png", DetectionConfidence.HIGH),
        ("描述这张图片的内容", DetectionConfidence.LOW),
        ("根据架构图 design/architecture.png 生成部署方案", DetectionConfidence.HIGH),
        ("写一个 Python 脚本", DetectionConfidence.NONE),
        # NOTE(review): this input previously contained only the question
        # text, which cannot reach HIGH; a Markdown image reference is
        # assumed to have been lost. Confirm the intended path.
        ("![screenshot](screenshot.png) 显示什么?", DetectionConfidence.HIGH),
    ]

    all_passed = True
    for test_input, expected_confidence in test_cases:
        confidence, items = detector.detect_visual_content(test_input)
        passed = confidence == expected_confidence
        all_passed = all_passed and passed
        status = "✅" if passed else "❌"
        print(f"{status} Input: {test_input}")
        print(f" Expected: {expected_confidence.value}, Got: {confidence.value}")
        if items:
            print(f" Detected: {items}")
        print()

    # Report the real outcome so the caller's summary and exit code reflect
    # mismatches instead of always claiming success.
    return all_passed
|
||||
|
||||
|
||||
def test_integration():
    """Test integration with vision analyzer (if API key available).

    Returns:
        True when process_user_input reports "success" with the configured
        key; False when no key is set (test skipped) or anything fails.
    """
    print("Testing vision integration...")

    # Check if API key is available
    api_key = os.environ.get("VOLCENGINE_API_KEY") or os.environ.get(
        "DASHSCOPE_API_KEY"
    )
    if not api_key:
        print("⚠️ No API key found. Skipping integration test.")
        print(
            " Set VOLCENGINE_API_KEY or DASHSCOPE_API_KEY environment variable to test."
        )
        return False

    try:
        try:
            from .integrate_vision import process_user_input
        except ImportError:
            # No parent package when this file is run directly as a script.
            from integrate_vision import process_user_input

        # Test with a simple request (won't actually process image without file)
        result = process_user_input(
            "测试视觉处理",
            "这是一个测试",
            config={
                "api_key": api_key,
                "base_url": "https://ark.cn-beijing.volces.com/api/coding/v3",
                "model": "doubao-seed-code",
            },
        )

        if result["status"] == "success":
            print("✅ Integration test passed (configuration valid)")
            return True

        print(f"❌ Integration test failed: {result.get('error', 'Unknown error')}")
        return False

    except Exception as e:
        print(f"❌ Integration test failed: {e}")
        return False
|
||||
|
||||
|
||||
def main():
    """Run the detection and integration tests and print a summary.

    Returns:
        True only when both tests returned True.
    """
    divider = "=" * 50
    print("🧪 Testing Agent Vision Awareness Skill")
    print(divider)

    # Build the list first so both tests always run, detection first.
    outcomes = [test_detection(), test_integration()]
    success = all(outcomes)

    print(divider)
    summary = (
        "✅ All tests passed!" if success else "⚠️ Some tests failed or were skipped."
    )
    print(summary)

    return success


if __name__ == "__main__":
    exit_code = 0 if main() else 1
    sys.exit(exit_code)
|
||||
@@ -0,0 +1,336 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Vision Content Detector - Detects visual content in user input for agent-vision-awareness skill
|
||||
|
||||
This script implements the detection logic described in the skill documentation,
|
||||
but integrates with the actual working vision processing implementation
|
||||
using direct API calls rather than custom agent delegation.
|
||||
"""
|
||||
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Tuple, Optional
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class DetectionConfidence(Enum):
    """Confidence that the user input refers to visual content."""

    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"
    NONE = "none"


class VisionContentDetector:
    """Detects visual content in user input based on various patterns.

    Signals: image file paths, Markdown image syntax, base64 image data,
    image URLs, and visual keywords (Chinese, English, technical).
    """

    # Image file extensions (matched case-insensitively); each entry keeps
    # its leading dot
    IMAGE_EXTENSIONS = [
        ".png",
        ".jpg",
        ".jpeg",
        ".gif",
        ".bmp",
        ".webp",
        ".svg",
        ".ico",
        ".tiff",
        ".tif",
        ".heic",
        ".heif",
        ".raw",
        ".psd",
        ".ai",
        ".eps",
    ]

    # Document files with potential visual content
    DOCUMENT_EXTENSIONS = [".pdf", ".ppt", ".pptx", ".vsdx", ".drawio"]

    # Chinese visual keywords
    CHINESE_KEYWORDS = {
        "high": [
            "图片",
            "图像",
            "照片",
            "截图",
            "图表",
            "图示",
            "图形",
            "影像",
            "画面",
        ],
        "medium": [
            "流程图",
            "架构图",
            "时序图",
            "ER 图",
            "思维导图",
            "柱状图",
            "饼图",
            "折线图",
            "设计图",
            "原型图",
            "线框图",
            "界面",
            "UI",
            "UX",
            "表格",
            "表单",
            "清单",
            "列表",
        ],
        "low": ["显示", "展示", "呈现", "可视化", "看图", "读图"],
    }

    # English visual keywords
    ENGLISH_KEYWORDS = {
        "high": [
            "image",
            "photo",
            "picture",
            "screenshot",
            "snapshot",
            "capture",
            "diagram",
            "chart",
            "graph",
            "plot",
            "figure",
        ],
        "medium": [
            "flowchart",
            "architecture",
            "sequence diagram",
            "ER diagram",
            "mind map",
            "bar chart",
            "pie chart",
            "line graph",
            "design",
            "mockup",
            "wireframe",
            "interface",
            "UI",
            "UX",
            "layout",
            "table",
            "form",
            "list",
            "grid",
        ],
        "low": ["show", "display", "visualize", "view", "look at", "see"],
    }

    # Technical visual keywords
    TECHNICAL_KEYWORDS = [
        "schema",
        "model",
        "blueprint",
        "spec",
        "technical drawing",
        "dashboard",
        "widget",
        "panel",
        "visualization",
        "map",
        "heatmap",
        "scatter plot",
        "histogram",
        "infographic",
        "poster",
        "banner",
        "thumbnail",
    ]

    def __init__(self):
        """Initialize the detector with compiled regex patterns."""
        self._compile_patterns()

    def _compile_patterns(self):
        """Compile the regex patterns once so detection calls can reuse them."""
        # IMAGE_EXTENSIONS entries already start with ".", and the pattern
        # supplies its own "\.". Strip the dot here, otherwise a path would
        # need two consecutive dots ("name..png") to match.
        ext_pattern = "|".join(
            re.escape(ext.lstrip(".")) for ext in self.IMAGE_EXTENSIONS
        )
        # The trailing lookahead stops a longer extension from being cut
        # short (e.g. "notes.air" must not match as "notes.ai").
        self.file_ext_pattern = re.compile(
            rf"[\w\-\.\/]+?\.(?:{ext_pattern})(?![A-Za-z0-9])", re.IGNORECASE
        )

        # Markdown image syntax: ![alt](url)
        self.markdown_img_pattern = re.compile(r"!\[([^\]]*)\]\(([^\)]+)\)")

        # Base64 image data
        self.base64_img_pattern = re.compile(
            r"data:image\/(png|jpeg|gif|webp);base64,[A-Za-z0-9+/=]+"
        )

        # Keyword + file reference
        keyword_pattern = "|".join(
            re.escape(k)
            for k in self.CHINESE_KEYWORDS["high"] + self.ENGLISH_KEYWORDS["high"]
        )
        ext_pattern_short = "|".join(
            re.escape(ext.lstrip(".")) for ext in self.IMAGE_EXTENSIONS[:7]
        )  # Common ones
        self.keyword_file_pattern = re.compile(
            rf"({keyword_pattern}).*?[\w\-\.\/]+\.(?:{ext_pattern_short})",
            re.IGNORECASE,
        )

        # Image URLs
        self._url_img_pattern = re.compile(
            r"https?://[^\s]+?\.(?:png|jpg|jpeg|gif|bmp|webp)", re.IGNORECASE
        )
        # Any http(s) URL; used to mask URLs before searching for file paths.
        self._any_url_pattern = re.compile(r"https?://\S+", re.IGNORECASE)

    def _find_file_paths(self, user_input: str) -> List[str]:
        """Return image file paths in the input, ignoring anything inside URLs."""
        # ":" is not a path character, so without masking the file pattern
        # would report the tail of a URL ("//host/a.png") as a local path.
        without_urls = self._any_url_pattern.sub(" ", user_input)
        return self.file_ext_pattern.findall(without_urls)

    def detect_visual_content(
        self, user_input: str
    ) -> Tuple[DetectionConfidence, List[str]]:
        """
        Detect visual content in user input and return confidence level and detected items.

        The strongest signal wins: a score of 0.9 or more maps to HIGH,
        0.6 or more to MEDIUM, anything lower to LOW, and no signal at all
        to NONE.

        Args:
            user_input: The user's input text

        Returns:
            Tuple of (confidence_level, detected_items)
        """
        detected_items = []
        confidence_scores = []

        # Check 1: File extensions
        file_matches = self._find_file_paths(user_input)
        if file_matches:
            detected_items.extend(file_matches)
            confidence_scores.append(0.9)  # High confidence

        # Check 2: Markdown image syntax
        markdown_matches = self.markdown_img_pattern.findall(user_input)
        if markdown_matches:
            detected_items.extend([f"{alt}:{url}" for alt, url in markdown_matches])
            confidence_scores.append(0.9)  # High confidence

        # Check 3: Base64 image data (findall yields the captured format)
        base64_matches = self.base64_img_pattern.findall(user_input)
        if base64_matches:
            detected_items.extend([f"base64:{fmt}" for fmt in base64_matches])
            confidence_scores.append(0.9)  # High confidence

        # Check 4: Visual keywords
        keyword_confidence = self._check_keywords(user_input)
        if keyword_confidence > 0:
            confidence_scores.append(keyword_confidence)

        # Check 5: URL images
        url_images = self._detect_url_images(user_input)
        if url_images:
            detected_items.extend(url_images)
            confidence_scores.append(0.8)  # Medium-high confidence

        # Determine overall confidence
        if not confidence_scores:
            return DetectionConfidence.NONE, []

        max_confidence = max(confidence_scores)
        if max_confidence >= 0.9:
            return DetectionConfidence.HIGH, detected_items
        elif max_confidence >= 0.6:
            return DetectionConfidence.MEDIUM, detected_items
        else:
            return DetectionConfidence.LOW, detected_items

    @staticmethod
    def _contains_keyword(keyword: str, text_lower: str) -> bool:
        """Return True if keyword occurs in the already-lowercased text."""
        needle = keyword.lower()
        if len(needle) <= 2 and needle.isascii() and needle.isalnum():
            # Short acronyms ("ui", "ux") must stand alone, otherwise they
            # would match inside ordinary words such as "build" or "linux".
            standalone = rf"(?<![a-z0-9]){re.escape(needle)}(?![a-z0-9])"
            return re.search(standalone, text_lower) is not None
        return needle in text_lower

    def _check_keywords(self, user_input: str) -> float:
        """Check for visual keywords and return confidence score.

        Returns 0.8 for a high-priority keyword, 0.6 for a medium-priority
        or technical keyword, 0.4 for a low-priority keyword, else 0.0.
        """
        input_lower = user_input.lower()

        # Tiers in priority order. Keywords are lowercased before comparison
        # so that entries such as "UI" or "ER diagram" can match at all.
        tiers = (
            (self.CHINESE_KEYWORDS["high"] + self.ENGLISH_KEYWORDS["high"], 0.8),
            (self.CHINESE_KEYWORDS["medium"] + self.ENGLISH_KEYWORDS["medium"], 0.6),
            (self.TECHNICAL_KEYWORDS, 0.6),
            (self.CHINESE_KEYWORDS["low"] + self.ENGLISH_KEYWORDS["low"], 0.4),
        )
        for keywords, score in tiers:
            if any(self._contains_keyword(k, input_lower) for k in keywords):
                return score

        return 0.0

    def _detect_url_images(self, user_input: str) -> List[str]:
        """Detect image URLs in the input."""
        return self._url_img_pattern.findall(user_input)

    def extract_image_paths(self, user_input: str) -> List[str]:
        """
        Extract actual image paths/URLs from user input.

        Returns:
            List of image paths or URLs, de-duplicated, in the order
            file paths, Markdown image targets, then direct URLs
        """
        image_paths = []

        # File paths with extensions
        image_paths.extend(self._find_file_paths(user_input))

        # Markdown image URLs
        markdown_matches = self.markdown_img_pattern.findall(user_input)
        image_paths.extend(url for _alt, url in markdown_matches)

        # Direct URLs
        image_paths.extend(self._detect_url_images(user_input))

        # Remove duplicates while preserving order
        return list(dict.fromkeys(image_paths))
|
||||
|
||||
|
||||
def main():
    """CLI for manual checks: print the detection result or extracted paths."""
    import argparse
    import sys

    cli = argparse.ArgumentParser(description="Detect visual content in user input")
    cli.add_argument("input", help="User input to analyze")
    cli.add_argument(
        "--extract-paths",
        action="store_true",
        help="Extract and return image paths only",
    )
    options = cli.parse_args()

    detector = VisionContentDetector()

    # Path-only mode: one path per line, nothing else.
    if options.extract_paths:
        for found in detector.extract_image_paths(options.input):
            print(found)
        return

    confidence, items = detector.detect_visual_content(options.input)
    print(f"Confidence: {confidence.value}")
    if not items:
        print("No visual content detected")
        return

    print("Detected items:")
    for item in items:
        print(f" - {item}")


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,82 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import base64
import json
import os
import sys
import tempfile
import time
from pathlib import Path

from openai import OpenAI

# Replace characters the console cannot encode instead of raising.
sys.stdout.reconfigure(errors='replace')
sys.stderr.reconfigure(errors='replace')
# NOTE(review): presumably aimed at child processes; confirm it is needed.
os.environ['PYTHONIOENCODING'] = 'utf-8'

# Shared temp directory for saved results.
# NOTE(review): machine-specific absolute path.
TEMP_DIR = r'D:\F\NewI\opencode\daily-workspace\temp'
os.makedirs(TEMP_DIR, exist_ok=True)

# Read the Volcengine Ark API key and base URL from the OpenCode config.
# NOTE(review): user-specific absolute path; import fails if it is missing.
CONFIG_PATH = r'C:\Users\hmo\.config\opencode\config.json'
with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
    config = json.load(f)
_volcengine_options = config['provider']['volcengine']['options']
API_KEY = _volcengine_options['apiKey']
BASE_URL = _volcengine_options['baseURL']

client = OpenAI(base_url=BASE_URL, api_key=API_KEY)
MODEL = 'doubao-seed-2.0-pro'
|
||||
|
||||
def analyze_image(image_path_or_url, prompt="详细描述这张图片的内容"):
    """
    Analyze an image given as a local path or an http/https URL.

    Local files are sent inline as a base64 data URL; URLs are passed to
    the API unchanged.

    :param image_path_or_url: image path or URL
    :param prompt: analysis prompt
    :return: the analysis text, or an error message string when the file
        does not exist or the request fails (this function does not raise)
    """
    import mimetypes

    try:
        # Handle URLs
        if image_path_or_url.lower().startswith(('http://', 'https://')):
            image_url = image_path_or_url
        else:
            # Handle local paths
            image_path = Path(image_path_or_url)
            if not image_path.exists():
                return f"错误:图片不存在 {image_path}"

            # Derive a real MIME type: the raw suffix gives invalid values
            # such as "image/jpg", "image/PNG", or "image/" for no suffix.
            mime_type, _ = mimetypes.guess_type(image_path.name)
            if not mime_type or not mime_type.startswith("image/"):
                # Unknown to mimetypes: fall back to the lowercased suffix.
                mime_type = f"image/{image_path.suffix.lstrip('.').lower()}"

            # Convert to base64
            with open(image_path, 'rb') as f:
                image_base64 = base64.b64encode(f.read()).decode('utf-8')
            image_url = f"data:{mime_type};base64,{image_base64}"

        # Call the API
        response = client.chat.completions.create(
            model=MODEL,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {"type": "image_url", "image_url": {"url": image_url}}
                    ]
                }
            ],
            max_tokens=1000
        )
        return response.choices[0].message.content

    except Exception as e:
        # Deliberate best effort: callers print the returned string.
        return f"图片识别失败:{type(e).__name__}: {str(e)}"
|
||||
|
||||
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    if not cli_args:
        print("用法:python vision_direct.py <图片路径/URL> [提示词]")
        sys.exit(1)

    # The first argument is the image; an optional second overrides the prompt.
    image_path = cli_args[0]
    prompt = cli_args[1] if len(cli_args) > 1 else "详细描述这张图片的内容"

    result = analyze_image(image_path, prompt)
    print(result)

    # Also save the result to a timestamped file in the temp directory.
    output_file = os.path.join(TEMP_DIR, f"vision_result_{int(time.time())}.txt")
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(result)
|
||||
@@ -0,0 +1,215 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Vision Processor - Integrates with image-service vision analyzer for agent-vision-awareness skill
|
||||
|
||||
This script provides the actual implementation that replaces the problematic
|
||||
custom agent delegation approach described in the outdated documentation.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, Optional, List
|
||||
from enum import Enum
|
||||
|
||||
# Add the image-service scripts to path to reuse the vision analyzer
|
||||
sys.path.append(str(Path(__file__).parent.parent.parent / "image-service" / "scripts"))
|
||||
|
||||
try:
|
||||
from vision_analyzer import VisionAnalyzer
|
||||
except ImportError:
|
||||
# Fallback: try to import from current directory if vision_analyzer is copied here
|
||||
try:
|
||||
from .vision_analyzer import VisionAnalyzer
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"Cannot find VisionAnalyzer. Please ensure image-service is properly installed."
|
||||
)
|
||||
|
||||
|
||||
class AnalysisMode(Enum):
    """Available analysis modes; each value is the mode name passed to the analyzer."""

    DESCRIBE = "describe"
    OCR = "ocr"
    CHART = "chart"
    FASHION = "fashion"
    PRODUCT = "product"
    SCENE = "scene"
    CUSTOM = "custom"


class VisionProcessor:
    """Main vision processing class that integrates detection and analysis."""

    # (mode, keywords) pairs in priority order; the first mode with a keyword
    # contained in the lowercased request/input text is selected.
    _MODE_KEYWORDS = (
        (AnalysisMode.OCR, ("text", "文字", "ocr", "read", "识别")),
        (AnalysisMode.CHART, ("chart", "graph", "plot", "图表", "数据", "趋势")),
        (AnalysisMode.FASHION, ("fashion", "服装", "穿搭", "style")),
        (AnalysisMode.PRODUCT, ("product", "产品", "商品", "item")),
        (AnalysisMode.SCENE, ("scene", "场景", "环境", "location", "place")),
    )

    def __init__(self, config: Optional[Dict[str, str]] = None):
        """
        Initialize the vision processor.

        Args:
            config: Configuration dictionary for the VisionAnalyzer
        """
        self.analyzer = VisionAnalyzer(config)
        self.detector = None  # Will be created when needed

    def process_visual_content(
        self, user_input: str, user_request: str = ""
    ) -> Dict[str, Any]:
        """
        Process visual content in user input.

        Args:
            user_input: The user's input text that may contain visual content references
            user_request: The original user request/context

        Returns:
            Dictionary with "confidence", "detected_items",
            "analysis_results" and "errors", plus "message" when nothing
            was analyzed. Per-image failures are collected in "errors".

        Raises:
            ImportError: If the vision_detector module cannot be imported.
        """
        try:
            from .vision_detector import VisionContentDetector, DetectionConfidence
        except ImportError:
            # This module is also used without a parent package (it extends
            # sys.path and has a CLI), where relative imports cannot work.
            from vision_detector import VisionContentDetector, DetectionConfidence

        # Initialize detector if not already done
        if self.detector is None:
            self.detector = VisionContentDetector()

        # Detect visual content
        confidence, detected_items = self.detector.detect_visual_content(user_input)
        result = {
            "confidence": confidence.value,
            "detected_items": detected_items,
            "analysis_results": [],
            "errors": [],
        }

        if confidence == DetectionConfidence.NONE:
            result["message"] = "No visual content detected"
            return result

        # Extract image paths
        image_paths = self.detector.extract_image_paths(user_input)
        if not image_paths:
            result["message"] = "Visual content detected but no valid image paths found"
            result["errors"].append("No valid image paths found")
            return result

        # Determine analysis mode based on user request
        analysis_mode = self._determine_analysis_mode(user_request, user_input)

        # Process each image
        for image_path in image_paths:
            try:
                # URLs would need to be downloaded to a temp file first;
                # that is not implemented, so report and skip them.
                if image_path.startswith(("http://", "https://")):
                    result["errors"].append(
                        f"URL handling not implemented: {image_path}"
                    )
                    continue

                # Resolve relative paths against the current working directory
                if not os.path.isabs(image_path):
                    image_path = os.path.join(os.getcwd(), image_path)

                # Analyze the image
                if analysis_mode == AnalysisMode.CUSTOM:
                    # Use the user request as the custom question
                    analysis_result = self.analyzer.analyze_with_mode(
                        Path(image_path),
                        "custom",
                        user_request or "Please analyze this image.",
                    )
                else:
                    analysis_result = self.analyzer.analyze_with_mode(
                        Path(image_path), analysis_mode.value
                    )

                result["analysis_results"].append(
                    {
                        "image_path": image_path,
                        "analysis_mode": analysis_mode.value,
                        "result": analysis_result,
                    }
                )

            except Exception as e:
                # One bad image must not abort the remaining ones.
                error_msg = f"Failed to analyze {image_path}: {str(e)}"
                result["errors"].append(error_msg)
                print(f"Error: {error_msg}", file=sys.stderr)

        return result

    def _determine_analysis_mode(
        self, user_request: str, user_input: str
    ) -> AnalysisMode:
        """
        Determine the appropriate analysis mode based on user context.

        Args:
            user_request: The user's original request
            user_input: The full input containing visual content references

        Returns:
            The first mode in priority order (OCR, CHART, FASHION, PRODUCT,
            SCENE) with a keyword contained in the combined lowercased
            text, or AnalysisMode.DESCRIBE when none matches
        """
        combined_text = (user_request + " " + user_input).lower()

        for mode, keywords in self._MODE_KEYWORDS:
            if any(word in combined_text for word in keywords):
                return mode
        return AnalysisMode.DESCRIBE
|
||||
|
||||
|
||||
def main():
    """CLI for manual checks: process one input and print or save the JSON result.

    Exits with status 1 if processing raises.
    """
    import argparse
    import json

    cli = argparse.ArgumentParser(description="Process visual content in user input")
    cli.add_argument("input", help="User input containing visual content references")
    cli.add_argument("--request", "-r", help="Original user request/context")
    cli.add_argument("--output", "-o", help="Output file for results")
    options = cli.parse_args()

    try:
        processor = VisionProcessor()
        outcome = processor.process_visual_content(
            options.input, options.request or ""
        )
        rendered = json.dumps(outcome, indent=2, ensure_ascii=False)

        if options.output:
            with open(options.output, "w", encoding="utf-8") as out_file:
                out_file.write(rendered)
            print(f"Results saved to: {options.output}")
        else:
            print(rendered)

    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user