Initial commit: skills library
- 70 skills with code and documentation
- Add .gitignore (ignore __pycache__, output/, temp/, venv/)
- Clean up test intermediates and caches
This commit is contained in:
@@ -0,0 +1,167 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Integration script for agent-vision-awareness skill
|
||||
|
||||
This script demonstrates the complete workflow:
|
||||
1. Detect visual content in user input
|
||||
2. Extract image paths
|
||||
3. Analyze images using the vision analyzer
|
||||
4. Return structured results
|
||||
|
||||
This replaces the problematic custom agent delegation approach.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, List
|
||||
|
||||
|
||||
# Ordered (mode, keywords) table. The first entry with a keyword occurring in
# the text wins, so the order of the rows encodes priority (OCR before chart,
# chart before fashion, and so on).
_MODE_KEYWORDS = (
    ("ocr", ("text", "文字", "ocr", "read", "识别")),
    ("chart", ("chart", "graph", "plot", "图表", "数据", "趋势")),
    ("fashion", ("fashion", "服装", "穿搭", "style")),
    ("product", ("product", "产品", "商品", "item")),
    ("scene", ("scene", "场景", "环境", "location", "place")),
)


def _select_analysis_mode(text: str) -> str:
    """Return the analysis mode implied by the keywords found in *text*.

    Matching is a case-insensitive substring test, so a keyword also matches
    inside a longer word (for example "text" inside "context"). Falls back to
    "describe" when no keyword is present.
    """
    lowered = text.lower()
    for mode, keywords in _MODE_KEYWORDS:
        if any(keyword in lowered for keyword in keywords):
            return mode
    return "describe"


def _load_vision_components():
    """Import and return (VisionContentDetector, DetectionConfidence, StandaloneVisionAnalyzer).

    The package-relative import is tried first. When this module has no
    parent package (it was executed directly as a script), the relative
    import cannot work, so the sibling modules are imported by their
    top-level names instead.
    """
    try:
        from .vision_detector import VisionContentDetector, DetectionConfidence
        from .standalone_vision_analyzer import StandaloneVisionAnalyzer
    except ImportError:
        if __package__:
            # Inside a real package the failure is genuine (e.g. a missing
            # dependency of the imported module); do not mask it.
            raise
        from vision_detector import VisionContentDetector, DetectionConfidence
        from standalone_vision_analyzer import StandaloneVisionAnalyzer
    return VisionContentDetector, DetectionConfidence, StandaloneVisionAnalyzer


def process_user_input(
    user_input: str, user_request: str = "", config: Dict[str, str] = None
) -> Dict[str, Any]:
    """
    Process user input for visual content and return analysis results.

    Args:
        user_input: The user's input text that may contain visual content references
        user_request: The original user request/context (optional); ``None`` is
            treated like an empty string
        config: Configuration for the vision analyzer (optional); passed
            unchanged to ``StandaloneVisionAnalyzer``

    Returns:
        Dictionary containing detection results and analysis. On the normal
        path it has the keys ``status`` ("success"), ``confidence``,
        ``detected_items``, ``analysis_results`` and ``errors``, plus
        ``message`` when nothing could be analyzed. A failure on a single
        image is appended to ``errors`` (and printed to stderr) without
        changing ``status``. Any other exception, including a failed import
        of the detector/analyzer modules, yields
        ``{"status": "error", "error": ..., "message": ...}`` instead of
        propagating.
    """
    try:
        (
            VisionContentDetector,
            DetectionConfidence,
            StandaloneVisionAnalyzer,
        ) = _load_vision_components()

        # Initialize components
        detector = VisionContentDetector()
        analyzer = StandaloneVisionAnalyzer(config)

        # Step 1: Detect visual content
        confidence, detected_items = detector.detect_visual_content(user_input)
        result = {
            "status": "success",
            "confidence": confidence.value,
            "detected_items": detected_items,
            "analysis_results": [],
            "errors": [],
        }

        if confidence == DetectionConfidence.NONE:
            result["message"] = "No visual content detected"
            return result

        # Step 2: Extract image paths
        image_paths = detector.extract_image_paths(user_input)
        if not image_paths:
            result["message"] = "Visual content detected but no valid image paths found"
            result["errors"].append("No valid image paths found")
            return result

        # Step 3: Determine analysis mode from both the request and the input
        mode = _select_analysis_mode((user_request or "") + " " + user_input)

        # Step 4: Analyze each image
        for image_path in image_paths:
            try:
                # Resolve relative paths against the current working directory
                if not os.path.isabs(image_path):
                    image_path = os.path.join(os.getcwd(), image_path)

                analysis_result = analyzer.analyze_with_mode(Path(image_path), mode)
                result["analysis_results"].append(
                    {"image_path": image_path, "mode": mode, "result": analysis_result}
                )
            except Exception as e:
                # Best effort: one bad image must not abort the remaining ones.
                error_msg = f"Failed to analyze {image_path}: {str(e)}"
                result["errors"].append(error_msg)
                print(f"Error: {error_msg}", file=sys.stderr)

        return result

    except Exception as e:
        # Top-level boundary: callers always receive a dictionary.
        return {
            "status": "error",
            "error": str(e),
            "message": f"Processing failed: {str(e)}",
        }
|
||||
|
||||
|
||||
def main():
    """Command line interface for testing.

    Parses the command line, builds the analyzer configuration from either
    ``--api-key`` or ``--config-file`` (the two options are mutually
    exclusive), runs ``process_user_input`` and then prints the JSON result
    or writes it to the ``--output`` file.

    A config file that cannot be read or is not valid JSON is reported
    through ``parser.error``, which prints a usage message and exits.
    """
    import argparse
    import json

    parser = argparse.ArgumentParser(description="Process visual content in user input")
    parser.add_argument("input", help="User input containing visual content references")
    parser.add_argument("--request", "-r", help="Original user request/context")
    # Keep the group in a local variable instead of attaching it to the parser.
    credentials_group = parser.add_mutually_exclusive_group()
    credentials_group.add_argument("--api-key", help="API key for vision service")
    credentials_group.add_argument("--config-file", help="Configuration file path")
    parser.add_argument("--output", "-o", help="Output file for results")

    args = parser.parse_args()

    # Build configuration
    config = {}
    if args.api_key:
        # An API key alone implies the default endpoint and model below.
        config = {
            "api_key": args.api_key,
            "base_url": "https://ark.cn-beijing.volces.com/api/coding/v3",
            "model": "doubao-seed-code",
        }
    elif args.config_file:
        try:
            with open(args.config_file, "r", encoding="utf-8") as f:
                config = json.load(f)
        except (OSError, json.JSONDecodeError) as exc:
            parser.error(f"Cannot load config file {args.config_file}: {exc}")

    # Process the input
    result = process_user_input(args.input, args.request or "", config)

    # Output results; ensure_ascii=False keeps non-ASCII text readable
    output = json.dumps(result, indent=2, ensure_ascii=False)

    if args.output:
        with open(args.output, "w", encoding="utf-8") as f:
            f.write(output)
        print(f"Results saved to: {args.output}")
    else:
        print(output)
|
||||
|
||||
|
||||
# Run the command line interface only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user