04db423416
- 70 skills with code and documentation - Add .gitignore (ignore __pycache__, output/, temp/, venv/) - Clean up test intermediates and caches
168 lines
5.5 KiB
Python
168 lines
5.5 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Integration script for agent-vision-awareness skill
|
|
|
|
This script demonstrates the complete workflow:
|
|
1. Detect visual content in user input
|
|
2. Extract image paths
|
|
3. Analyze images using the vision analyzer
|
|
4. Return structured results
|
|
|
|
This replaces the problematic custom agent delegation approach.
|
|
"""
|
|
|
|
import sys
|
|
import os
|
|
from pathlib import Path
|
|
from typing import Dict, Any, List
|
|
|
|
|
|
# Keyword groups in priority order: the first group with a keyword occurring
# in the request text decides the analysis mode.
# NOTE: matching is plain substring containment, so e.g. "read" also matches
# inside "already" and "text" inside "context".
_MODE_KEYWORDS = (
    ("ocr", ("text", "文字", "ocr", "read", "识别")),
    ("chart", ("chart", "graph", "plot", "图表", "数据", "趋势")),
    ("fashion", ("fashion", "服装", "穿搭", "style")),
    ("product", ("product", "产品", "商品", "item")),
    ("scene", ("scene", "场景", "环境", "location", "place")),
)


def _select_analysis_mode(combined_text: str) -> str:
    """Return the analysis mode for lower-cased text, or "describe" if no keyword matches."""
    for mode, keywords in _MODE_KEYWORDS:
        if any(keyword in combined_text for keyword in keywords):
            return mode
    return "describe"


def _load_vision_components():
    """Import the detector and analyzer as package members or as sibling modules."""
    try:
        from .vision_detector import VisionContentDetector, DetectionConfidence
        from .standalone_vision_analyzer import StandaloneVisionAnalyzer
    except ImportError:
        # Relative imports fail when this file is executed directly as a
        # script (no parent package); fall back to top-level module names.
        from vision_detector import VisionContentDetector, DetectionConfidence
        from standalone_vision_analyzer import StandaloneVisionAnalyzer
    return VisionContentDetector, DetectionConfidence, StandaloneVisionAnalyzer


def process_user_input(
    user_input: str, user_request: str = "", config: Dict[str, str] = None
) -> Dict[str, Any]:
    """
    Process user input for visual content and return analysis results.

    Args:
        user_input: The user's input text that may contain visual content references
        user_request: The original user request/context (optional); it is combined
            with ``user_input`` when choosing the analysis mode
        config: Configuration passed to the vision analyzer (optional)

    Returns:
        On the normal path, a dictionary with the keys ``status`` ("success"),
        ``confidence``, ``detected_items``, ``analysis_results`` (one entry per
        analyzed image with ``image_path``, ``mode`` and ``result``) and
        ``errors`` (one message per image that failed), plus ``message`` when
        nothing could be analyzed. Per-image failures are recorded in
        ``errors`` and echoed to stderr; they do not change ``status``.

        If anything outside the per-image loop raises (including a failed
        import of the detector/analyzer), a dictionary with ``status``
        ("error"), ``error`` and ``message`` is returned instead.
    """
    try:
        # Import lazily so that an import failure is reported as an error
        # result instead of breaking the import of this module.
        VisionContentDetector, DetectionConfidence, StandaloneVisionAnalyzer = (
            _load_vision_components()
        )

        # Initialize components
        detector = VisionContentDetector()
        analyzer = StandaloneVisionAnalyzer(config)

        # Step 1: Detect visual content
        confidence, detected_items = detector.detect_visual_content(user_input)
        result = {
            "status": "success",
            "confidence": confidence.value,
            "detected_items": detected_items,
            "analysis_results": [],
            "errors": [],
        }

        if confidence == DetectionConfidence.NONE:
            result["message"] = "No visual content detected"
            return result

        # Step 2: Extract image paths
        image_paths = detector.extract_image_paths(user_input)
        if not image_paths:
            result["message"] = "Visual content detected but no valid image paths found"
            result["errors"].append("No valid image paths found")
            return result

        # Step 3: Determine analysis mode from both the request and the input
        mode = _select_analysis_mode((user_request + " " + user_input).lower())

        # Step 4: Analyze each image; one failing image must not abort the rest
        for image_path in image_paths:
            try:
                # Resolve relative paths against the current working directory
                if not os.path.isabs(image_path):
                    image_path = os.path.join(os.getcwd(), image_path)

                analysis_result = analyzer.analyze_with_mode(Path(image_path), mode)

                result["analysis_results"].append(
                    {"image_path": image_path, "mode": mode, "result": analysis_result}
                )

            except Exception as e:
                error_msg = f"Failed to analyze {image_path}: {str(e)}"
                result["errors"].append(error_msg)
                print(f"Error: {error_msg}", file=sys.stderr)

        return result

    except Exception as e:
        # Top-level boundary: report any setup/detection failure as data.
        return {
            "status": "error",
            "error": str(e),
            "message": f"Processing failed: {str(e)}",
        }
|
|
|
|
|
|
def main(argv=None):
    """Command line interface for testing.

    Parses the arguments, builds the analyzer configuration, runs
    ``process_user_input`` and emits the result as indented JSON, either to
    the file given with ``--output`` or to stdout.

    Args:
        argv: Argument list to parse instead of the process arguments
            (optional); ``None`` makes argparse read ``sys.argv[1:]``.
    """
    import argparse
    import json

    parser = argparse.ArgumentParser(description="Process visual content in user input")
    parser.add_argument("input", help="User input containing visual content references")
    parser.add_argument("--request", "-r", help="Original user request/context")

    # An API key and a configuration file are alternative ways to configure
    # the analyzer, so only one of them may be given.
    config_source = parser.add_mutually_exclusive_group()
    config_source.add_argument("--api-key", help="API key for vision service")
    config_source.add_argument("--config-file", help="Configuration file path")

    parser.add_argument("--output", "-o", help="Output file for results")

    args = parser.parse_args(argv)

    # Build configuration
    config = {}
    if args.api_key:
        # A bare API key is paired with the built-in endpoint and model.
        config["api_key"] = args.api_key
        config["base_url"] = "https://ark.cn-beijing.volces.com/api/coding/v3"
        config["model"] = "doubao-seed-code"
    elif args.config_file:
        with open(args.config_file, "r", encoding="utf-8") as f:
            config = json.load(f)

    # Process the input
    result = process_user_input(args.input, args.request or "", config)

    # Output results; ensure_ascii=False keeps non-ASCII text readable
    output = json.dumps(result, indent=2, ensure_ascii=False)

    if args.output:
        with open(args.output, "w", encoding="utf-8") as f:
            f.write(output)
        print(f"Results saved to: {args.output}")
    else:
        print(output)
|
|
|
|
|
|
# Run the command line interface only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|