新增16个AI技能:包含图像生成、视频剪辑、数据分析、智能查询等功能模块

This commit is contained in:
hmo
2026-02-13 20:18:38 +08:00
parent 456cd45de4
commit 8200a17176
154 changed files with 14585 additions and 1 deletions

View File

@@ -0,0 +1,287 @@
#!/usr/bin/env python3
"""
图生文脚本 (Image-to-Text) - 视觉识别
使用 Qwen2.5-VL 模型分析图片内容并生成文字描述
Author: 翟星人
"""
import httpx
import base64
import json
import os
from typing import Dict, Any, Optional, Union, List
from pathlib import Path
class ImageToTextAnalyzer:
"""图生文分析器 - 视觉识别"""
# 预定义的分析模式
ANALYSIS_MODES = {
"describe": "请详细描述这张图片的内容,包括:人物、场景、物品、颜色、布局等所有细节。",
"ocr": "请仔细识别这张图片中的所有文字内容,按照文字在图片中的位置顺序输出。如果是中文,请保持原文输出。",
"chart": "请分析这张图表的内容,包括:图表类型、数据趋势、关键数据点、标题标签、以及数据的结论或洞察。",
"fashion": "请分析这张图片中人物的穿搭,包括:服装款式、颜色搭配、配饰、整体风格等。",
"product": "请分析这张产品图片,包括:产品类型、外观特征、功能特点、品牌信息等。",
"scene": "请描述这张图片的场景,包括:地点、环境、氛围、时间(白天/夜晚)等。"
}
def __init__(self, config: Optional[Dict[str, str]] = None):
"""
初始化分析器
Args:
config: 配置字典,包含 api_key, base_url, model
如果不传则从环境变量或配置文件读取
"""
if config is None:
config = self._load_config()
self.api_key = config.get('api_key') or config.get('VISION_API_KEY') or config.get('IMAGE_API_KEY')
self.base_url = config.get('base_url') or config.get('VISION_API_BASE_URL') or config.get('IMAGE_API_BASE_URL')
self.model = config.get('model') or config.get('VISION_MODEL') or 'qwen2.5-vl-72b-instruct'
if not self.api_key or not self.base_url:
raise ValueError("缺少必要的 API 配置api_key 和 base_url")
def _load_config(self) -> Dict[str, str]:
"""从配置文件或环境变量加载配置"""
config = {}
# 尝试从配置文件加载
config_path = Path(__file__).parent.parent / 'config' / 'settings.json'
if config_path.exists():
with open(config_path, 'r', encoding='utf-8') as f:
settings = json.load(f)
# 优先使用 vision_api 配置
vision_config = settings.get('vision_api', {})
if vision_config:
config['api_key'] = vision_config.get('key')
config['base_url'] = vision_config.get('base_url')
config['model'] = vision_config.get('model')
else:
# 回退到 image_api 配置
api_config = settings.get('image_api', {})
config['api_key'] = api_config.get('key')
config['base_url'] = api_config.get('base_url')
# 环境变量优先级更高
config['api_key'] = os.getenv('VISION_API_KEY', os.getenv('IMAGE_API_KEY', config.get('api_key')))
config['base_url'] = os.getenv('VISION_API_BASE_URL', os.getenv('IMAGE_API_BASE_URL', config.get('base_url')))
config['model'] = os.getenv('VISION_MODEL', config.get('model', 'qwen2.5-vl-72b-instruct'))
return config
@staticmethod
def image_to_base64(image_path: str) -> str:
"""
将图片文件转换为 base64 编码(带 data URL 前缀)
Args:
image_path: 图片文件路径
Returns:
base64 编码字符串(含 data URL 前缀)
"""
path = Path(image_path)
if not path.exists():
raise FileNotFoundError(f"图片文件不存在: {image_path}")
# 获取 MIME 类型
suffix = path.suffix.lower()
mime_types = {
'.jpg': 'image/jpeg',
'.jpeg': 'image/jpeg',
'.png': 'image/png',
'.gif': 'image/gif',
'.webp': 'image/webp'
}
mime_type = mime_types.get(suffix, 'image/png')
with open(image_path, 'rb') as f:
b64_str = base64.b64encode(f.read()).decode('utf-8')
return f"data:{mime_type};base64,{b64_str}"
def analyze(
self,
image: Union[str, bytes],
prompt: Optional[str] = None,
mode: str = "describe",
max_tokens: int = 2000,
temperature: float = 0.7
) -> Dict[str, Any]:
"""
分析图片并生成文字描述
Args:
image: 图片路径、URL 或 base64 字符串
prompt: 自定义分析提示词(如果提供则忽略 mode
mode: 分析模式 (describe/ocr/chart/fashion/product/scene)
max_tokens: 最大输出 token 数
temperature: 温度参数
Returns:
包含分析结果的字典
"""
# 确定使用的提示词
if prompt is None:
prompt = self.ANALYSIS_MODES.get(mode, self.ANALYSIS_MODES["describe"])
# 处理图片输入
if isinstance(image, str):
if os.path.isfile(image):
image_url = self.image_to_base64(image)
elif image.startswith('data:') or image.startswith('http'):
image_url = image
else:
# 假设是纯 base64 字符串
image_url = f"data:image/png;base64,{image}"
else:
image_url = f"data:image/png;base64,{base64.b64encode(image).decode('utf-8')}"
# 构建请求
payload = {
"model": self.model,
"messages": [
{
"role": "user",
"content": [
{
"type": "text",
"text": prompt
},
{
"type": "image_url",
"image_url": {
"url": image_url
}
}
]
}
],
"max_tokens": max_tokens,
"temperature": temperature
}
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {self.api_key}"
}
try:
with httpx.Client(timeout=120.0) as client:
response = client.post(
f"{self.base_url}/chat/completions",
headers=headers,
json=payload
)
response.raise_for_status()
result = response.json()
# 提取文本内容
content = result.get("choices", [{}])[0].get("message", {}).get("content", "")
return {
"success": True,
"content": content,
"mode": mode,
"usage": result.get("usage", {})
}
except httpx.HTTPStatusError as e:
return {
"success": False,
"error": f"HTTP 错误: {e.response.status_code}",
"detail": str(e)
}
except Exception as e:
return {
"success": False,
"error": "分析失败",
"detail": str(e)
}
def describe(self, image: Union[str, bytes]) -> Dict[str, Any]:
"""通用图片描述"""
return self.analyze(image, mode="describe")
def ocr(self, image: Union[str, bytes]) -> Dict[str, Any]:
"""文字识别 (OCR)"""
return self.analyze(image, mode="ocr")
def analyze_chart(self, image: Union[str, bytes]) -> Dict[str, Any]:
"""图表分析"""
return self.analyze(image, mode="chart")
def analyze_fashion(self, image: Union[str, bytes]) -> Dict[str, Any]:
"""穿搭分析"""
return self.analyze(image, mode="fashion")
def analyze_product(self, image: Union[str, bytes]) -> Dict[str, Any]:
"""产品分析"""
return self.analyze(image, mode="product")
def analyze_scene(self, image: Union[str, bytes]) -> Dict[str, Any]:
"""场景分析"""
return self.analyze(image, mode="scene")
def batch_analyze(
self,
images: List[str],
mode: str = "describe"
) -> List[Dict[str, Any]]:
"""
批量分析多张图片
Args:
images: 图片路径列表
mode: 分析模式
Returns:
分析结果列表
"""
results = []
for image in images:
result = self.analyze(image, mode=mode)
result["image"] = image
results.append(result)
return results
def main():
"""命令行入口"""
import argparse
parser = argparse.ArgumentParser(description='图生文分析工具(视觉识别)')
parser.add_argument('image', help='输入图片路径')
parser.add_argument('-m', '--mode', default='describe',
choices=['describe', 'ocr', 'chart', 'fashion', 'product', 'scene'],
help='分析模式')
parser.add_argument('-p', '--prompt', help='自定义分析提示词')
parser.add_argument('--max-tokens', type=int, default=2000, help='最大输出 token 数')
args = parser.parse_args()
analyzer = ImageToTextAnalyzer()
result = analyzer.analyze(
image=args.image,
prompt=args.prompt,
mode=args.mode,
max_tokens=args.max_tokens
)
if result["success"]:
print(f"\n=== 分析结果 ({result['mode']}) ===\n")
print(result["content"])
print(f"\n=== Token 使用 ===")
print(f"输入: {result['usage'].get('prompt_tokens', 'N/A')}")
print(f"输出: {result['usage'].get('completion_tokens', 'N/A')}")
else:
print(f"分析失败: {result['error']}")
print(f"详情: {result.get('detail', 'N/A')}")
if __name__ == "__main__":
main()