04db423416
- 70 skills with code and documentation - Add .gitignore (ignore __pycache__, output/, temp/, venv/) - Clean up test intermediates and caches
148 lines
4.6 KiB
Python
148 lines
4.6 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
视觉分析器 - 简化版本,避免Unicode和配置问题
|
|
使用阿里云视觉模型分析图片内容
|
|
"""
|
|
|
|
import base64
|
|
import json
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import Dict, Any, Optional
|
|
import httpx
|
|
|
|
|
|
class VisionAnalyzer:
    """Send a local image together with a text prompt to a vision model API.

    Settings are read from the project's ``load_config_simple.load_config()``
    using the keys ``VOLCENGINE_API_KEY``, ``VOLCENGINE_BASE_URL`` and
    ``VISION_MODEL``. The API key may alternatively be supplied through the
    ``VOLCENGINE_API_KEY`` environment variable.

    NOTE(review): the default base URL targets Volcengine Ark, while the
    request path and payload built in ``analyze`` look like the DashScope
    multimodal-generation schema -- confirm which service is intended.
    """

    # Predefined analysis modes: mode name -> prompt text sent with the image.
    # "custom" is only a placeholder label; the real prompt for that mode is
    # the caller-supplied ``custom_query``.
    ANALYSIS_MODES = {
        "describe": "请详细描述这张图片的内容,包括:人物、场景、物品、颜色、布局等所有细节。",
        "ocr": "请仔细识别这张图片中的所有文字内容,按照文字在图片中的位置顺序输出。如果是中文,请保持原文输出。",
        "chart": "请分析这张图表的内容,包括:图表类型、数据趋势、关键数据点、标题标签、以及数据的结论或洞察。",
        "fashion": "请分析这张图片中人物的穿搭,包括:服装款式、颜色搭配、配饰、整体风格等。",
        "product": "请分析这张产品图片,包括:产品类型、外观特征、功能特点、品牌信息等。",
        "scene": "请描述这张图片的场景,包括:地点、环境、氛围、时间(白天/夜晚)等。",
        "custom": "用户自定义问题",
    }

    def __init__(self):
        """Load configuration and validate that an API key is available.

        Raises:
            ValueError: if no API key is found in the config file or in the
                ``VOLCENGINE_API_KEY`` environment variable.
        """
        config = self._load_config()

        # Secrets must come from configuration or the environment; there is
        # deliberately no built-in fallback key in the source code.
        self.api_key = config.get("VOLCENGINE_API_KEY") or os.environ.get(
            "VOLCENGINE_API_KEY", ""
        )
        self.base_url = config.get(
            "VOLCENGINE_BASE_URL", "https://ark.cn-beijing.volces.com/api/coding/v3"
        )
        self.model = config.get("VISION_MODEL", "doubao-seed-code")

        if not self.api_key:
            raise ValueError("API key is required")

    def _load_config(self) -> Dict[str, str]:
        """Return the settings provided by the sibling ``load_config_simple`` module."""
        # The loader lives next to this file; make that directory importable,
        # but only add it once so repeated instantiation does not grow sys.path.
        module_dir = str(Path(__file__).parent)
        if module_dir not in sys.path:
            sys.path.append(module_dir)
        from load_config_simple import load_config

        return load_config()

    def encode_image(self, image_path: Path) -> str:
        """Return the raw bytes of ``image_path`` as a base64 ASCII string."""
        return base64.b64encode(Path(image_path).read_bytes()).decode("utf-8")

    def _resolve_prompt(self, mode: str, custom_query: Optional[str]) -> str:
        """Pick the prompt text for ``mode``.

        Unknown modes fall back to the "describe" prompt. The "custom" mode
        requires a non-empty ``custom_query``; otherwise the placeholder label
        stored in ``ANALYSIS_MODES`` would be sent to the model as the prompt.
        """
        if mode == "custom":
            if not custom_query:
                raise ValueError("custom mode requires a non-empty custom_query")
            return custom_query
        return self.ANALYSIS_MODES.get(mode, self.ANALYSIS_MODES["describe"])

    def analyze(
        self, image_path: str, mode: str = "describe", custom_query: Optional[str] = None
    ) -> str:
        """Analyze an image and return the model's answer.

        Args:
            image_path: Path of the image file to send.
            mode: One of the keys of ``ANALYSIS_MODES``; unknown values are
                treated as "describe".
            custom_query: Prompt text used when ``mode`` is "custom".

        Returns:
            The ``output.choices[0].message.content`` field of the JSON reply.

        Raises:
            FileNotFoundError: if ``image_path`` does not exist.
            ValueError: if ``mode`` is "custom" but no query was given.
            Exception: any error raised while sending the request or reading
                the reply is reported on stdout and re-raised unchanged.
        """
        image_path = Path(image_path)
        if not image_path.exists():
            raise FileNotFoundError(f"Image file not found: {image_path}")

        prompt = self._resolve_prompt(mode, custom_query)
        image_base64 = self.encode_image(image_path)

        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }

        payload = {
            "model": self.model,
            "input": {
                "messages": [
                    {
                        "role": "user",
                        "content": [{"image": image_base64}, {"text": prompt}],
                    }
                ]
            },
            "parameters": {"max_tokens": 2000},
        }

        try:
            # Generous timeout: the request carries the whole image inline.
            response = httpx.post(
                f"{self.base_url}/services/aigc/multimodal-generation/generation",
                headers=headers,
                json=payload,
                timeout=120.0,
            )
            response.raise_for_status()

            result = response.json()
            return result["output"]["choices"][0]["message"]["content"]

        except Exception as e:
            print(f"API request failed: {e}")
            raise
|
|
|
|
|
|
def main():
    """Command-line entry point.

    Expected arguments: ``<image_path> [-m mode] [-q query]``. ``-q`` supplies
    a custom prompt and switches the mode to "custom". Arguments that are not
    recognised are ignored. The analysis result is printed on success; on any
    failure an ``Error: ...`` line is printed and the process exits with
    status 1.
    """
    usage = "Usage: python vision_analyzer_simple.py <image_path> [-m mode] [-q query]"
    argv = sys.argv

    if len(argv) < 2:
        print(usage)
        sys.exit(1)

    image_path = argv[1]
    mode = "describe"
    custom_query = None

    # Walk the remaining arguments; -m and -q each consume the next argument.
    i = 2
    while i < len(argv):
        flag = argv[i]
        if flag in ("-m", "-q"):
            # A trailing flag without its value used to crash with IndexError;
            # report it as a usage error instead.
            if i + 1 >= len(argv):
                print(f"Error: option {flag} requires a value")
                print(usage)
                sys.exit(1)
            value = argv[i + 1]
            if flag == "-m":
                mode = value
            else:
                custom_query = value
                mode = "custom"
            i += 2
        else:
            i += 1

    try:
        analyzer = VisionAnalyzer()
        result = analyzer.analyze(image_path, mode, custom_query)
        print(result)
    except Exception as e:
        print(f"Error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()
|