Files
hmo 04db423416 Initial commit: skills library
- 70 skills with code and documentation
- Add .gitignore (ignore __pycache__, output/, temp/, venv/)
- Clean up test intermediates and caches
2026-04-26 19:27:40 +08:00

182 lines
5.8 KiB
Python

#!/usr/bin/env python3
"""
修复版 vision_analyzer.py - 使用火山方舟(VolcEngine)视觉模型
"""
import base64
import json
import mimetypes
import os
import sys
from pathlib import Path
from typing import Dict, Any, Optional

import httpx
class VisionAnalyzer:
    """Image analyzer backed by a Volcengine Ark (doubao) vision model.

    The analyzer base64-encodes a local image file, embeds it as a data URL
    in an OpenAI-compatible ``chat/completions`` request, and returns the
    text content of the first choice in the response.
    """

    # Built-in prompts keyed by analysis mode.  These strings are sent to the
    # model verbatim.  The "custom" entry is only a placeholder: it is what
    # gets sent when mode="custom" is requested without a custom_query.
    ANALYSIS_MODES = {
        "describe": "请详细描述这张图片的内容,包括:人物、场景、物品、颜色、布局等所有细节。",
        "ocr": "请仔细识别这张图片中的所有文字内容,按照文字在图片中的位置顺序输出。如果是中文,请保持原文输出。",
        "chart": "请分析这张图表的内容,包括:图表类型、数据趋势、关键数据点、标题标签、以及数据的结论或洞察。",
        "fashion": "请分析这张图片中人物的穿搭,包括:服装款式、颜色搭配,配饰、整体风格等。",
        "product": "请分析这张产品图片,包括:产品类型、外观特征、功能特点,品牌信息等。",
        "scene": "请描述这张图片的场景,包括:地点、环境、氛围、时间(白天/夜晚)等。",
        "custom": "用户自定义问题",
    }

    # Volcengine Ark "Coding Plan" endpoint, used when no base_url is given.
    DEFAULT_BASE_URL = "https://ark.cn-beijing.volces.com/api/coding/v3"
    # Default model name.  NOTE(review): the original author's comment says
    # this is the only vision-capable model on the Coding Plan - confirm.
    DEFAULT_MODEL = "doubao-seed-code"
    # MIME type used when the file extension does not identify an image type.
    FALLBACK_MIME_TYPE = "image/png"
    MAX_TOKENS = 2000
    REQUEST_TIMEOUT = 120.0  # seconds

    def __init__(self, config: Optional[Dict[str, str]] = None):
        """Resolve the API key, base URL and model name.

        Args:
            config: Optional settings mapping.  The API key is read from
                ``api_key``, ``VOLCENGINE_API_KEY`` or ``DASHSCOPE_API_KEY``
                (first non-empty wins); ``base_url`` and ``model`` override
                the class defaults.  When ``None``, settings come from
                :meth:`_load_config`.

        Raises:
            ValueError: If no API key can be resolved.
        """
        if config is None:
            config = self._load_config()

        # Credentials must come from the caller, the environment or the
        # config file.  No key is embedded in the source, so a missing key is
        # reported instead of silently using a shared secret.
        self.api_key = (
            config.get("api_key")
            or config.get("VOLCENGINE_API_KEY")
            or config.get("DASHSCOPE_API_KEY")  # fallback key name
        )
        if not self.api_key:
            raise ValueError("缺少必要的 API 配置:api_key")

        self.base_url = config.get("base_url") or self.DEFAULT_BASE_URL
        self.model = config.get("model") or self.DEFAULT_MODEL

    def _load_config(self) -> Dict[str, str]:
        """Load settings from environment variables, then the config file.

        Environment variables read: ``VOLCENGINE_API_KEY`` (falling back to
        ``DASHSCOPE_API_KEY``), ``VISION_API_BASE_URL`` and ``VISION_MODEL``.
        If no API key is found there, the package-local ``load_config``
        helper is consulted; values already supplied by the environment are
        kept and only the missing ones are filled in.

        Returns:
            A dict with the keys ``api_key``, ``base_url`` and ``model``;
            any of the values may be ``None``.
        """
        config = {
            "api_key": os.environ.get("VOLCENGINE_API_KEY")
            or os.environ.get("DASHSCOPE_API_KEY"),
            "base_url": os.environ.get("VISION_API_BASE_URL"),
            "model": os.environ.get("VISION_MODEL"),
        }

        if not config["api_key"]:
            try:
                from .load_config import load_config
            except ImportError:
                # Helper not importable (e.g. the file is run as a plain
                # script): rely on the environment values alone.
                return config
            cfg = load_config()
            config["api_key"] = cfg.get("IMAGE_API_KEY")
            # Do not clobber base_url/model that the environment provided.
            config["base_url"] = config["base_url"] or cfg.get("IMAGE_API_BASE_URL")
            config["model"] = config["model"] or cfg.get("VISION_MODEL")
        return config

    def encode_image(self, image_path: Path) -> str:
        """Return the contents of ``image_path`` as a base64 text string."""
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

    def _guess_mime_type(self, image_path: Path) -> str:
        """Return the image MIME type implied by the file extension.

        Falls back to ``FALLBACK_MIME_TYPE`` when the extension is unknown
        or does not map to an ``image/*`` type.
        """
        mime_type, _ = mimetypes.guess_type(image_path.name)
        if mime_type and mime_type.startswith("image/"):
            return mime_type
        return self.FALLBACK_MIME_TYPE

    def _resolve_prompt(self, mode: str, custom_query: Optional[str]) -> str:
        """Pick the prompt text for ``mode``.

        ``custom_query`` is used only when ``mode`` is ``"custom"`` and the
        query is non-empty; unknown modes fall back to the "describe" prompt.
        """
        if mode == "custom" and custom_query:
            return custom_query
        return self.ANALYSIS_MODES.get(mode, self.ANALYSIS_MODES["describe"])

    def _build_payload(self, image_path: Path, prompt: str) -> Dict[str, Any]:
        """Build the OpenAI-compatible chat request body for one image."""
        image_base64 = self.encode_image(image_path)
        # Label the data URL with the real image type rather than always
        # claiming PNG.
        mime_type = self._guess_mime_type(image_path)
        return {
            "model": self.model,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{image_base64}"
                            },
                        },
                        {"type": "text", "text": prompt},
                    ],
                }
            ],
            "max_tokens": self.MAX_TOKENS,
        }

    def analyze(
        self, image_path: str, mode: str = "describe", custom_query: Optional[str] = None
    ) -> str:
        """Send an image to the vision model and return its text answer.

        Args:
            image_path: Path of the image file to analyze.
            mode: A key of ``ANALYSIS_MODES``; unknown values are treated as
                ``"describe"``.
            custom_query: Prompt to use when ``mode`` is ``"custom"``.

        Returns:
            The ``content`` of the first choice's message in the response.

        Raises:
            FileNotFoundError: If ``image_path`` does not exist.
            Exception: Any error raised while sending the request or reading
                the response is reported on stderr and re-raised unchanged.
        """
        image_path = Path(image_path)
        if not image_path.exists():
            raise FileNotFoundError(f"Image file not found: {image_path}")

        prompt = self._resolve_prompt(mode, custom_query)
        payload = self._build_payload(image_path, prompt)
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        # Tolerate a configured base URL that ends with "/".
        url = f"{self.base_url.rstrip('/')}/chat/completions"

        try:
            response = httpx.post(
                url,
                headers=headers,
                json=payload,
                timeout=self.REQUEST_TIMEOUT,
            )
            response.raise_for_status()
            result = response.json()
            return result["choices"][0]["message"]["content"]
        except Exception as e:
            # Diagnostics go to stderr so they never mix with the analysis
            # result that callers print on stdout.
            print(f"API request failed: {e}", file=sys.stderr)
            raise
def _parse_cli_args(args):
    """Split ``<image_path> [-m mode] [-q query]`` into its three parts.

    Args:
        args: Command-line arguments without the program name; must contain
            at least the image path.

    Returns:
        A ``(image_path, mode, custom_query)`` tuple.  ``mode`` defaults to
        ``"describe"``; a ``-q`` option also switches ``mode`` to ``"custom"``.

    Raises:
        ValueError: If ``-m`` or ``-q`` is the last argument and therefore
            has no value.
    """
    image_path = args[0]
    mode = "describe"
    custom_query = None

    i = 1
    while i < len(args):
        flag = args[i]
        if flag not in ("-m", "-q"):
            # Unrecognised arguments are skipped.
            i += 1
            continue
        if i + 1 >= len(args):
            raise ValueError(f"option {flag} requires a value")
        if flag == "-m":
            mode = args[i + 1]
        else:
            custom_query = args[i + 1]
            mode = "custom"
        i += 2
    return image_path, mode, custom_query


def main(argv: Optional[list] = None):
    """Command-line entry point: analyze one image and print the result.

    Args:
        argv: Arguments without the program name.  Defaults to
            ``sys.argv[1:]`` so existing ``main()`` callers are unaffected.

    Exits with status 1 when the image path is missing, when an option lacks
    its value, or when the analysis raises; error messages go to stderr.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    usage = "Usage: python vision_analyzer_fixed.py <image_path> [-m mode] [-q query]"
    if not args:
        print(usage)
        sys.exit(1)

    try:
        image_path, mode, custom_query = _parse_cli_args(args)
    except ValueError as e:
        # A dangling -m/-q used to surface as an uncaught IndexError.
        print(f"Error: {e}", file=sys.stderr)
        print(usage, file=sys.stderr)
        sys.exit(1)

    try:
        analyzer = VisionAnalyzer()
        result = analyzer.analyze(image_path, mode, custom_query)
        print(result)
    except Exception as e:
        # Top-level boundary: report any failure and signal it via exit code.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()