Files
skills/agent-vision-awareness/scripts/vision_processor.py
T
hmo 04db423416 Initial commit: skills library
- 70 skills with code and documentation
- Add .gitignore (ignore __pycache__, output/, temp/, venv/)
- Clean up test intermediates and caches
2026-04-26 19:27:40 +08:00

216 lines
7.2 KiB
Python

#!/usr/bin/env python3
"""
Vision Processor - integrates with the image-service vision analyzer for the
agent-vision-awareness skill.

This script provides the actual implementation, replacing the problematic
custom agent delegation approach described in the outdated documentation.
"""
import sys
import os
from pathlib import Path
from typing import Dict, Any, Optional, List
from enum import Enum
# Reuse the analyzer that ships with the sibling image-service skill by putting
# its scripts directory (three levels up, then image-service/scripts) on sys.path.
sys.path.append(
    str(Path(__file__).parent.parent.parent.joinpath("image-service", "scripts"))
)

try:
    from vision_analyzer import VisionAnalyzer
except ImportError:
    # Second attempt: a copy of vision_analyzer placed next to this module,
    # reachable through a package-relative import.
    try:
        from .vision_analyzer import VisionAnalyzer
    except ImportError:
        raise ImportError(
            "Cannot find VisionAnalyzer. Please ensure image-service is properly installed."
        )
class AnalysisMode(Enum):
    """Analysis modes; each value is the mode string handed to the analyzer."""

    # General description; the default when no keyword selects another mode.
    DESCRIBE = "describe"
    # Text extraction.
    OCR = "ocr"
    # Charts, graphs and plots.
    CHART = "chart"
    # Clothing and styling.
    FASHION = "fashion"
    # Products and items.
    PRODUCT = "product"
    # Scenes and locations.
    SCENE = "scene"
    # Free-form question supplied by the caller.
    CUSTOM = "custom"
class VisionProcessor:
    """Detect image references in user text and analyze them with VisionAnalyzer.

    The analyzer is built in ``__init__``; the content detector is created
    lazily on the first call to ``process_visual_content``.
    """

    # Ordered keyword table: the first mode with a matching keyword wins.
    # Modes are stored by their ``AnalysisMode`` value string.
    _MODE_KEYWORDS = (
        ("ocr", ("text", "文字", "ocr", "read", "识别")),
        ("chart", ("chart", "graph", "plot", "图表", "数据", "趋势")),
        ("fashion", ("fashion", "服装", "穿搭", "style")),
        ("product", ("product", "产品", "商品", "item")),
        ("scene", ("scene", "场景", "环境", "location", "place")),
    )

    def __init__(self, config: Optional[Dict[str, str]] = None):
        """
        Initialize the vision processor.

        Args:
            config: Configuration dictionary passed straight to VisionAnalyzer.
        """
        self.analyzer = VisionAnalyzer(config)
        self.detector = None  # Created on first use in process_visual_content

    @staticmethod
    def _load_detector_types():
        """Return ``(VisionContentDetector, DetectionConfidence)``.

        The package-relative import is tried first. When this file is executed
        directly as a script there is no parent package and that import raises
        ImportError, so an absolute import is used as the fallback.
        """
        try:
            from .vision_detector import VisionContentDetector, DetectionConfidence
        except ImportError:
            from vision_detector import VisionContentDetector, DetectionConfidence
        return VisionContentDetector, DetectionConfidence

    def process_visual_content(
        self, user_input: str, user_request: str = ""
    ) -> Dict[str, Any]:
        """
        Process visual content referenced in user input.

        Args:
            user_input: The user's input text that may reference images.
            user_request: The original user request/context; also used as the
                question when the analysis mode is CUSTOM.

        Returns:
            Dictionary with the keys ``confidence``, ``detected_items``,
            ``analysis_results`` and ``errors``, plus ``message`` when
            processing stops before any image is analyzed.

        Raises:
            ImportError: If the vision_detector module cannot be imported.
        """
        VisionContentDetector, DetectionConfidence = self._load_detector_types()

        if self.detector is None:
            self.detector = VisionContentDetector()

        confidence, detected_items = self.detector.detect_visual_content(user_input)
        result: Dict[str, Any] = {
            "confidence": confidence.value,
            "detected_items": detected_items,
            "analysis_results": [],
            "errors": [],
        }

        if confidence == DetectionConfidence.NONE:
            result["message"] = "No visual content detected"
            return result

        image_paths = self.detector.extract_image_paths(user_input)
        if not image_paths:
            result["message"] = "Visual content detected but no valid image paths found"
            result["errors"].append("No valid image paths found")
            return result

        analysis_mode = self._determine_analysis_mode(user_request, user_input)

        for image_path in image_paths:
            # Best effort per image: one failure is recorded and the loop goes on.
            try:
                if image_path.startswith(("http://", "https://")):
                    # Downloading remote images is not implemented; report and skip.
                    result["errors"].append(
                        f"URL handling not implemented: {image_path}"
                    )
                    continue

                if not os.path.isabs(image_path):
                    # Relative paths are resolved against the current working directory.
                    image_path = os.path.join(os.getcwd(), image_path)

                if analysis_mode == AnalysisMode.CUSTOM:
                    # NOTE(review): _determine_analysis_mode never returns CUSTOM,
                    # so this branch is currently unreachable from this method.
                    analysis_result = self.analyzer.analyze_with_mode(
                        Path(image_path),
                        "custom",
                        user_request or "Please analyze this image.",
                    )
                else:
                    analysis_result = self.analyzer.analyze_with_mode(
                        Path(image_path), analysis_mode.value
                    )

                result["analysis_results"].append(
                    {
                        "image_path": image_path,
                        "analysis_mode": analysis_mode.value,
                        "result": analysis_result,
                    }
                )
            except Exception as e:
                error_msg = f"Failed to analyze {image_path}: {e}"
                result["errors"].append(error_msg)
                print(f"Error: {error_msg}", file=sys.stderr)

        return result

    @staticmethod
    def _has_keyword(text: str, keyword: str) -> bool:
        """Return True when ``keyword`` occurs in the already-lowercased ``text``.

        ASCII keywords must start a word (not be preceded by an ASCII letter or
        digit), so "text" matches "texts" but not "context". Other keywords
        (the Chinese ones) use a plain substring test because Chinese text has
        no spaces between words.
        """
        import re  # local import: re is not among this file's top-level imports

        first = keyword[0]
        if first.isalnum() and ord(first) < 128:
            return re.search(r"(?<![a-z0-9])" + re.escape(keyword), text) is not None
        return keyword in text

    @classmethod
    def _match_mode_value(cls, text: str) -> Optional[str]:
        """Return the value of the first mode whose keyword occurs in ``text``.

        Matching is case-insensitive. Returns None when no keyword matches.
        """
        lowered = text.lower()
        for mode_value, keywords in cls._MODE_KEYWORDS:
            if any(cls._has_keyword(lowered, keyword) for keyword in keywords):
                return mode_value
        return None

    def _determine_analysis_mode(
        self, user_request: str, user_input: str
    ) -> "AnalysisMode":
        """
        Determine the analysis mode from keywords in the user's text.

        Args:
            user_request: The user's original request (None is treated as "").
            user_input: The full input containing visual content references
                (None is treated as "").

        Returns:
            The first AnalysisMode whose keyword matches, or
            AnalysisMode.DESCRIBE when none does.
        """
        combined_text = f"{user_request or ''} {user_input or ''}"
        mode_value = self._match_mode_value(combined_text)
        if mode_value is None:
            return AnalysisMode.DESCRIBE
        return AnalysisMode(mode_value)
def main():
    """Run the processor from the command line and emit the result as JSON.

    The JSON goes to the file given with --output when present, otherwise to
    stdout. Any exception is reported on stderr and the process exits with
    status 1.
    """
    import argparse
    import json

    cli = argparse.ArgumentParser(description="Process visual content in user input")
    cli.add_argument("input", help="User input containing visual content references")
    cli.add_argument("--request", "-r", help="Original user request/context")
    cli.add_argument("--output", "-o", help="Output file for results")
    options = cli.parse_args()

    try:
        outcome = VisionProcessor().process_visual_content(
            options.input, options.request or ""
        )
        rendered = json.dumps(outcome, indent=2, ensure_ascii=False)
        if not options.output:
            print(rendered)
        else:
            with open(options.output, "w", encoding="utf-8") as handle:
                handle.write(rendered)
            print(f"Results saved to: {options.output}")
    except Exception as exc:
        print(f"Error: {exc}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()