Files
skills/agent-vision-awareness/scripts/integrate_vision.py
T
hmo 04db423416 Initial commit: skills library
- 70 skills with code and documentation
- Add .gitignore (ignore __pycache__, output/, temp/, venv/)
- Clean up test intermediates and caches
2026-04-26 19:27:40 +08:00

168 lines
5.5 KiB
Python

#!/usr/bin/env python3
"""
Integration script for agent-vision-awareness skill
This script demonstrates the complete workflow:
1. Detect visual content in user input
2. Extract image paths
3. Analyze images using the vision analyzer
4. Return structured results
This replaces the problematic custom agent delegation approach.
"""
import os
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional
# Ordered (mode, keywords) table. The first mode with a keyword that occurs as
# a substring of the lower-cased request text wins, so earlier rows take
# priority over later ones.
_MODE_KEYWORDS = (
    ("ocr", ("text", "文字", "ocr", "read", "识别")),
    ("chart", ("chart", "graph", "plot", "图表", "数据", "趋势")),
    ("fashion", ("fashion", "服装", "穿搭", "style")),
    ("product", ("product", "产品", "商品", "item")),
    ("scene", ("scene", "场景", "环境", "location", "place")),
)
# Mode used when no keyword matches.
_DEFAULT_MODE = "describe"


def _load_vision_components():
    """Import and return (VisionContentDetector, DetectionConfidence, StandaloneVisionAnalyzer).

    The relative import is tried first so package usage keeps working. When
    this file is executed directly as a script there is no parent package and
    a relative import raises ImportError, so the sibling modules are then
    imported by their plain names instead.

    Raises:
        ImportError: If neither import style succeeds; the message carries
            both failures so the real cause is not hidden.
    """
    try:
        from .vision_detector import VisionContentDetector, DetectionConfidence
        from .standalone_vision_analyzer import StandaloneVisionAnalyzer
    except ImportError as relative_error:
        try:
            from vision_detector import VisionContentDetector, DetectionConfidence
            from standalone_vision_analyzer import StandaloneVisionAnalyzer
        except ImportError as absolute_error:
            raise ImportError(
                f"{absolute_error} (relative import also failed: {relative_error})"
            ) from absolute_error
    return VisionContentDetector, DetectionConfidence, StandaloneVisionAnalyzer


def _select_analysis_mode(text: str) -> str:
    """Return the analysis mode implied by the keywords found in *text*."""
    lowered = text.lower()
    for mode, keywords in _MODE_KEYWORDS:
        if any(keyword in lowered for keyword in keywords):
            return mode
    return _DEFAULT_MODE


def process_user_input(
    user_input: str, user_request: str = "", config: Optional[Dict[str, str]] = None
) -> Dict[str, Any]:
    """
    Process user input for visual content and return analysis results.

    Args:
        user_input: The user's input text that may contain visual content references
        user_request: The original user request/context (optional)
        config: Configuration for the vision analyzer (optional); passed
            unchanged to StandaloneVisionAnalyzer

    Returns:
        Dictionary containing detection results and analysis. On the normal
        path it has the keys "status" ("success"), "confidence",
        "detected_items", "analysis_results" and "errors", plus "message"
        when processing stops early. Per-image failures are collected in
        "errors" and do not abort the remaining images. If anything outside
        the per-image loop fails, the dictionary instead has "status"
        ("error"), "error" and "message".
    """
    try:
        # Import the detector and analyzer lazily so that an import failure
        # is reported through the returned error dictionary.
        (
            VisionContentDetector,
            DetectionConfidence,
            StandaloneVisionAnalyzer,
        ) = _load_vision_components()

        # Initialize components
        detector = VisionContentDetector()
        analyzer = StandaloneVisionAnalyzer(config)

        # Step 1: Detect visual content
        confidence, detected_items = detector.detect_visual_content(user_input)
        result = {
            "status": "success",
            "confidence": confidence.value,
            "detected_items": detected_items,
            "analysis_results": [],
            "errors": [],
        }
        if confidence == DetectionConfidence.NONE:
            result["message"] = "No visual content detected"
            return result

        # Step 2: Extract image paths
        image_paths = detector.extract_image_paths(user_input)
        if not image_paths:
            result["message"] = "Visual content detected but no valid image paths found"
            result["errors"].append("No valid image paths found")
            return result

        # Step 3: Determine analysis mode from both the request and the input.
        # NOTE: no keyword maps to a "custom" mode, so the former
        # `mode == "custom"` call variant (which passed a prompt as a third
        # argument) could never run and has been dropped.
        mode = _select_analysis_mode(user_request + " " + user_input)

        # Step 4: Analyze each image
        for image_path in image_paths:
            try:
                # Relative paths are resolved against the current working
                # directory; the absolute form is what gets reported.
                if not os.path.isabs(image_path):
                    image_path = os.path.join(os.getcwd(), image_path)
                analysis_result = analyzer.analyze_with_mode(Path(image_path), mode)
                result["analysis_results"].append(
                    {"image_path": image_path, "mode": mode, "result": analysis_result}
                )
            except Exception as e:
                # Best effort: record the failure and continue with the next image.
                error_msg = f"Failed to analyze {image_path}: {str(e)}"
                result["errors"].append(error_msg)
                print(f"Error: {error_msg}", file=sys.stderr)
        return result
    except Exception as e:
        # Top-level boundary: callers always receive a dictionary.
        return {
            "status": "error",
            "error": str(e),
            "message": f"Processing failed: {str(e)}",
        }
def main(argv: Optional[List[str]] = None) -> None:
    """Command line interface for testing.

    Parses the command line, builds the analyzer configuration from either
    ``--api-key`` or ``--config-file`` (the two are mutually exclusive), runs
    ``process_user_input`` and emits the result as indented JSON, either to
    the ``--output`` file or to standard output.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is the behavior when run as a script.
    """
    import argparse
    import json

    parser = argparse.ArgumentParser(description="Process visual content in user input")
    parser.add_argument("input", help="User input containing visual content references")
    parser.add_argument("--request", "-r", help="Original user request/context")
    # An inline API key and a configuration file are alternative ways to
    # supply the analyzer configuration, so at most one may be given.
    config_source = parser.add_mutually_exclusive_group()
    config_source.add_argument("--api-key", help="API key for vision service")
    config_source.add_argument("--config-file", help="Configuration file path")
    parser.add_argument("--output", "-o", help="Output file for results")
    args = parser.parse_args(argv)

    # Build configuration
    config = {}
    if args.api_key:
        # With only an API key given, the endpoint and model are hard-coded.
        config["api_key"] = args.api_key
        config["base_url"] = "https://ark.cn-beijing.volces.com/api/coding/v3"
        config["model"] = "doubao-seed-code"
    elif args.config_file:
        with open(args.config_file, "r", encoding="utf-8") as f:
            config = json.load(f)

    # Process the input
    result = process_user_input(args.input, args.request or "", config)

    # Output results; ensure_ascii=False keeps non-ASCII text readable.
    output = json.dumps(result, indent=2, ensure_ascii=False)
    if args.output:
        with open(args.output, "w", encoding="utf-8") as f:
            f.write(output)
        print(f"Results saved to: {args.output}")
    else:
        print(output)
# Run the command line interface only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()