Initial commit: skills library
- 70 skills with code and documentation
- Add .gitignore (ignore __pycache__, output/, temp/, venv/)
- Clean up test intermediates and caches
This commit is contained in:
@@ -0,0 +1,215 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Vision Processor - Integrates with image-service vision analyzer for agent-vision-awareness skill
|
||||
|
||||
This script provides the actual implementation that replaces the problematic
|
||||
custom agent delegation approach described in the outdated documentation.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, Optional, List
|
||||
from enum import Enum
|
||||
|
||||
# Make the image-service scripts directory importable so its vision analyzer
# can be reused here. The directory is resolved relative to this file:
# <this file's dir>/../../image-service/scripts.
sys.path.append(str(Path(__file__).parent.parent.parent / "image-service" / "scripts"))

try:
    # Preferred: the analyzer module found via the path entry added above
    # (or anywhere else already on sys.path).
    from vision_analyzer import VisionAnalyzer
except ImportError:
    # Fallback: a copy of vision_analyzer living next to this module, which is
    # only reachable through a package-relative import.
    try:
        from .vision_analyzer import VisionAnalyzer
    except ImportError as exc:
        # Chain explicitly so the traceback shows the underlying import
        # failure as the direct cause of this friendlier error.
        raise ImportError(
            "Cannot find VisionAnalyzer. Please ensure image-service is properly installed."
        ) from exc
|
||||
|
||||
|
||||
class AnalysisMode(Enum):
    """Closed set of analysis modes the processor can request.

    Each member's string value is the mode name handed to the analyzer's
    ``analyze_with_mode`` call and echoed back under the ``"analysis_mode"``
    key of every per-image result.
    """

    # General-purpose description; the fallback when no keyword rule matches.
    DESCRIBE = "describe"
    # Keyword-selected modes (see VisionProcessor._determine_analysis_mode).
    OCR = "ocr"
    CHART = "chart"
    FASHION = "fashion"
    PRODUCT = "product"
    SCENE = "scene"
    # Free-form mode: the user's request is forwarded as the question.
    CUSTOM = "custom"
|
||||
|
||||
|
||||
class VisionProcessor:
    """Detect image references in user text and run them through the analyzer.

    Two collaborators are wired together: a ``VisionContentDetector`` (imported
    and created lazily on first use) that finds visual-content references in
    the input, and a ``VisionAnalyzer`` that analyzes each referenced image.
    """

    # Ordered keyword rules for ``_determine_analysis_mode``. The first rule
    # with any keyword occurring as a substring of the lower-cased text wins,
    # so the order below is the priority order.
    # NOTE(review): matching is plain substring matching, so e.g. "text" also
    # matches inside "context" -- confirm whether that is intended.
    _MODE_KEYWORDS = (
        (AnalysisMode.OCR, ("text", "文字", "ocr", "read", "识别")),
        (AnalysisMode.CHART, ("chart", "graph", "plot", "图表", "数据", "趋势")),
        (AnalysisMode.FASHION, ("fashion", "服装", "穿搭", "style")),
        (AnalysisMode.PRODUCT, ("product", "产品", "商品", "item")),
        (AnalysisMode.SCENE, ("scene", "场景", "环境", "location", "place")),
    )

    def __init__(self, config: Optional[Dict[str, str]] = None):
        """
        Initialize the vision processor.

        Args:
            config: Configuration dictionary passed straight through to
                ``VisionAnalyzer``; ``None`` is forwarded as-is.
        """
        self.analyzer = VisionAnalyzer(config)
        self.detector = None  # Created lazily by process_visual_content

    @staticmethod
    def _import_detector():
        """Return ``(VisionContentDetector, DetectionConfidence)``.

        The package-relative import works when this module is loaded as part
        of a package. When the file is executed directly (as ``main`` does)
        there is no parent package and the relative import raises ImportError,
        so fall back to importing ``vision_detector`` as a top-level module.
        """
        try:
            from .vision_detector import VisionContentDetector, DetectionConfidence
        except ImportError:
            from vision_detector import VisionContentDetector, DetectionConfidence
        return VisionContentDetector, DetectionConfidence

    def process_visual_content(
        self, user_input: str, user_request: str = ""
    ) -> Dict[str, Any]:
        """
        Process visual content in user input.

        Args:
            user_input: The user's input text that may contain visual content
                references.
            user_request: The original user request/context; used to pick the
                analysis mode and, in custom mode, as the question.

        Returns:
            Dictionary with the keys ``"confidence"``, ``"detected_items"``,
            ``"analysis_results"`` (one entry per analyzed image) and
            ``"errors"`` (one message per failed or skipped image). A
            ``"message"`` key is added when processing stops early because
            nothing was detected or no image paths could be extracted.
        """
        detector_cls, confidence_levels = self._import_detector()

        # Initialize detector if not already done
        if self.detector is None:
            self.detector = detector_cls()

        # Detect visual content
        confidence, detected_items = self.detector.detect_visual_content(user_input)
        result: Dict[str, Any] = {
            "confidence": confidence.value,
            "detected_items": detected_items,
            "analysis_results": [],
            "errors": [],
        }

        if confidence == confidence_levels.NONE:
            result["message"] = "No visual content detected"
            return result

        # Extract image paths
        image_paths = self.detector.extract_image_paths(user_input)
        if not image_paths:
            result["message"] = "Visual content detected but no valid image paths found"
            result["errors"].append("No valid image paths found")
            return result

        # One mode is chosen up front and applied to every image.
        analysis_mode = self._determine_analysis_mode(user_request, user_input)

        for image_path in image_paths:
            # Best effort per image: a failure is recorded and reported on
            # stderr, and the remaining images are still processed.
            try:
                # Remote images are not downloaded yet; record and skip them.
                if image_path.startswith(("http://", "https://")):
                    result["errors"].append(
                        f"URL handling not implemented: {image_path}"
                    )
                    continue

                # Resolve relative paths against the current working directory.
                if not os.path.isabs(image_path):
                    image_path = os.path.join(os.getcwd(), image_path)

                analysis_result = self._analyze_image(
                    image_path, analysis_mode, user_request
                )
                result["analysis_results"].append(
                    {
                        "image_path": image_path,
                        "analysis_mode": analysis_mode.value,
                        "result": analysis_result,
                    }
                )
            except Exception as e:
                error_msg = f"Failed to analyze {image_path}: {str(e)}"
                result["errors"].append(error_msg)
                print(f"Error: {error_msg}", file=sys.stderr)

        return result

    def _analyze_image(
        self, image_path: str, analysis_mode: AnalysisMode, user_request: str
    ) -> Any:
        """Run the analyzer on one local image and return whatever it returns.

        NOTE(review): ``_determine_analysis_mode`` never returns
        ``AnalysisMode.CUSTOM``, so the custom branch is not reached through
        ``process_visual_content`` as the code stands -- confirm the intent.
        """
        if analysis_mode == AnalysisMode.CUSTOM:
            # Use the user request as the custom question, with a generic
            # prompt when no request was given.
            return self.analyzer.analyze_with_mode(
                Path(image_path),
                "custom",
                user_request or "Please analyze this image.",
            )
        return self.analyzer.analyze_with_mode(Path(image_path), analysis_mode.value)

    def _determine_analysis_mode(
        self, user_request: str, user_input: str
    ) -> AnalysisMode:
        """
        Determine the appropriate analysis mode based on user context.

        The request and the input are joined and lower-cased, then checked
        against the ordered keyword rules; the first matching rule decides.

        Args:
            user_request: The user's original request (``None`` is treated as
                empty).
            user_input: The full input containing visual content references
                (``None`` is treated as empty).

        Returns:
            The mode of the first matching rule, or ``AnalysisMode.DESCRIBE``
            when no keyword occurs in the text.
        """
        combined_text = f"{user_request or ''} {user_input or ''}".lower()

        for mode, keywords in self._MODE_KEYWORDS:
            if any(keyword in combined_text for keyword in keywords):
                return mode
        return AnalysisMode.DESCRIBE
|
||||
|
||||
|
||||
def main():
    """Command-line entry point, intended for manual testing.

    Parses the positional ``input`` plus the optional ``--request``/``-r`` and
    ``--output``/``-o`` arguments, runs a ``VisionProcessor`` over the input and
    emits the result as indented JSON, either to the output file or to stdout.
    Any exception raised while processing or writing is reported on stderr and
    the process exits with status 1.
    """
    import argparse

    cli = argparse.ArgumentParser(description="Process visual content in user input")
    cli.add_argument("input", help="User input containing visual content references")
    cli.add_argument("--request", "-r", help="Original user request/context")
    cli.add_argument("--output", "-o", help="Output file for results")
    options = cli.parse_args()

    try:
        import json

        # A missing --request is passed on as an empty string.
        report = VisionProcessor().process_visual_content(
            options.input, options.request or ""
        )
        # ensure_ascii=False keeps non-ASCII text readable in the output.
        rendered = json.dumps(report, indent=2, ensure_ascii=False)

        if not options.output:
            print(rendered)
        else:
            with open(options.output, "w", encoding="utf-8") as handle:
                handle.write(rendered)
            print(f"Results saved to: {options.output}")
    except Exception as exc:
        print(f"Error: {exc}", file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
|
||||
# Run the command-line interface only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user