04db423416
- 70 skills with code and documentation - Add .gitignore (ignore __pycache__, output/, temp/, venv/) - Clean up test intermediates and caches
216 lines
7.2 KiB
Python
216 lines
7.2 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Vision Processor - Integrates with image-service vision analyzer for agent-vision-awareness skill
|
|
|
|
This script provides the actual implementation that replaces the problematic
|
|
custom agent delegation approach described in the outdated documentation.
|
|
"""
|
|
|
|
import sys
|
|
import os
|
|
from pathlib import Path
|
|
from typing import Dict, Any, Optional, List
|
|
from enum import Enum
|
|
|
|
# Reuse the vision analyzer from image-service: append
# <third parent directory of this file>/image-service/scripts to the
# module search path so that `vision_analyzer` can be imported from there.
sys.path.append(
    str(Path(__file__).parent.parent.parent / "image-service" / "scripts")
)

try:
    from vision_analyzer import VisionAnalyzer
except ImportError:
    # Fallback: a copy of vision_analyzer may have been placed next to this
    # file, in which case it is importable relative to the current package.
    try:
        from .vision_analyzer import VisionAnalyzer
    except ImportError:
        # Neither location worked; fail at import time with an actionable message.
        raise ImportError(
            "Cannot find VisionAnalyzer. Please ensure image-service is properly installed."
        )
|
|
|
|
|
|
class AnalysisMode(Enum):
    """Analysis modes the vision processor can request.

    Each member's value is the mode string that gets handed to
    ``VisionAnalyzer.analyze_with_mode``.
    """

    # General-purpose description; used when no keyword selects another mode.
    DESCRIBE = "describe"

    # Content-specific modes, selected from keywords in the user's text.
    OCR = "ocr"
    CHART = "chart"
    FASHION = "fashion"
    PRODUCT = "product"
    SCENE = "scene"

    # Free-form mode: the analyzer is additionally given a question string.
    CUSTOM = "custom"
|
|
|
|
|
|
class VisionProcessor:
    """Detect image references in user text and analyze them.

    Detection is delegated to ``VisionContentDetector`` from the
    ``vision_detector`` module (created lazily on first use); analysis is
    delegated to the ``VisionAnalyzer`` instance built in ``__init__``.
    """

    # Ordered keyword table used by _determine_analysis_mode: the first entry
    # with any keyword occurring as a substring of the lower-cased text wins,
    # so the order here is the priority order.
    _MODE_KEYWORDS = (
        (AnalysisMode.OCR, ("text", "文字", "ocr", "read", "识别")),
        (AnalysisMode.CHART, ("chart", "graph", "plot", "图表", "数据", "趋势")),
        (AnalysisMode.FASHION, ("fashion", "服装", "穿搭", "style")),
        (AnalysisMode.PRODUCT, ("product", "产品", "商品", "item")),
        (AnalysisMode.SCENE, ("scene", "场景", "环境", "location", "place")),
    )

    def __init__(self, config: Optional[Dict[str, str]] = None):
        """
        Initialize the vision processor.

        Args:
            config: Configuration dictionary passed straight through to
                ``VisionAnalyzer``; ``None`` is forwarded as-is.
        """
        self.analyzer = VisionAnalyzer(config)
        self.detector = None  # Created on the first process_visual_content call

    @staticmethod
    def _load_detector_api():
        """Import and return ``(VisionContentDetector, DetectionConfidence)``.

        The relative import works when this module is part of a package. When
        the file is executed directly as a script there is no parent package
        and the relative import raises ImportError, so fall back to a plain
        top-level import of ``vision_detector``.
        """
        try:
            from .vision_detector import VisionContentDetector, DetectionConfidence
        except ImportError:
            from vision_detector import VisionContentDetector, DetectionConfidence
        return VisionContentDetector, DetectionConfidence

    def process_visual_content(
        self, user_input: str, user_request: str = ""
    ) -> Dict[str, Any]:
        """
        Process visual content in user input.

        Args:
            user_input: The user's input text that may contain visual content references
            user_request: The original user request/context; also used as the
                question when the analysis mode is CUSTOM

        Returns:
            Dictionary with the keys ``confidence`` (the detector's confidence
            value), ``detected_items``, ``analysis_results`` (one entry per
            successfully analyzed image, holding ``image_path``,
            ``analysis_mode`` and ``result``) and ``errors`` (list of
            messages). A ``message`` key is added only on the two early exits:
            nothing detected, or no valid image paths found.

        Raises:
            ImportError: If the ``vision_detector`` module cannot be imported.
        """
        VisionContentDetector, DetectionConfidence = self._load_detector_api()

        # Initialize detector if not already done
        if self.detector is None:
            self.detector = VisionContentDetector()

        # Detect visual content
        confidence, detected_items = self.detector.detect_visual_content(user_input)
        result: Dict[str, Any] = {
            "confidence": confidence.value,
            "detected_items": detected_items,
            "analysis_results": [],
            "errors": [],
        }

        if confidence == DetectionConfidence.NONE:
            result["message"] = "No visual content detected"
            return result

        # Extract image paths
        image_paths = self.detector.extract_image_paths(user_input)
        if not image_paths:
            result["message"] = "Visual content detected but no valid image paths found"
            result["errors"].append("No valid image paths found")
            return result

        # Determine analysis mode based on user request
        analysis_mode = self._determine_analysis_mode(user_request, user_input)

        # Process each image; a failure on one image is recorded and does not
        # stop the remaining images from being analyzed.
        for image_path in image_paths:
            try:
                # URLs are not downloaded; they are reported as errors and skipped.
                if image_path.startswith(("http://", "https://")):
                    result["errors"].append(
                        f"URL handling not implemented: {image_path}"
                    )
                    continue

                # Resolve relative paths against the current working directory
                if not os.path.isabs(image_path):
                    image_path = os.path.join(os.getcwd(), image_path)

                if analysis_mode == AnalysisMode.CUSTOM:
                    # Use the user request as the custom question.
                    # NOTE: _determine_analysis_mode never returns CUSTOM at
                    # present, so this branch is currently not reached.
                    analysis_result = self.analyzer.analyze_with_mode(
                        Path(image_path),
                        "custom",
                        user_request or "Please analyze this image.",
                    )
                else:
                    analysis_result = self.analyzer.analyze_with_mode(
                        Path(image_path), analysis_mode.value
                    )

                result["analysis_results"].append(
                    {
                        "image_path": image_path,
                        "analysis_mode": analysis_mode.value,
                        "result": analysis_result,
                    }
                )

            except Exception as e:
                error_msg = f"Failed to analyze {image_path}: {str(e)}"
                result["errors"].append(error_msg)
                print(f"Error: {error_msg}", file=sys.stderr)

        return result

    def _determine_analysis_mode(
        self, user_request: str, user_input: str
    ) -> AnalysisMode:
        """
        Determine the appropriate analysis mode based on user context.

        The request and input are concatenated and lower-cased, then checked
        against the keyword table in priority order. Matching is by plain
        substring, so e.g. "context" matches the "text" keyword.

        Args:
            user_request: The user's original request
            user_input: The full input containing visual content references

        Returns:
            The first AnalysisMode whose keywords match, or
            AnalysisMode.DESCRIBE when none do.
        """
        combined_text = (user_request + " " + user_input).lower()

        for mode, keywords in VisionProcessor._MODE_KEYWORDS:
            if any(keyword in combined_text for keyword in keywords):
                return mode
        return AnalysisMode.DESCRIBE
|
|
|
|
|
|
def main():
    """Command-line entry point for manual testing.

    Parses the input text (plus optional request context and output path),
    runs it through a default-configured VisionProcessor and emits the result
    as indented JSON, either to the output file or to stdout. Any exception
    raised while processing or writing is printed to stderr and the process
    exits with status 1.
    """
    import argparse
    import json

    cli = argparse.ArgumentParser(description="Process visual content in user input")
    cli.add_argument("input", help="User input containing visual content references")
    cli.add_argument("--request", "-r", help="Original user request/context")
    cli.add_argument("--output", "-o", help="Output file for results")
    options = cli.parse_args()

    try:
        # A missing --request is passed on as an empty string.
        report = VisionProcessor().process_visual_content(
            options.input, options.request or ""
        )
        # ensure_ascii=False keeps non-ASCII text readable in the output.
        rendered = json.dumps(report, indent=2, ensure_ascii=False)

        if not options.output:
            print(rendered)
        else:
            with open(options.output, "w", encoding="utf-8") as handle:
                handle.write(rendered)
            print(f"Results saved to: {options.output}")

    except Exception as exc:
        print(f"Error: {exc}", file=sys.stderr)
        sys.exit(1)
|
|
|
|
|
|
# Run the command-line interface only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|