Initial commit to git.yoin

Author: hmo
Date: 2026-02-11 22:02:47 +08:00
Commit: cf10ab6473
153 changed files with 14581 additions and 0 deletions


@@ -0,0 +1,221 @@
#!/usr/bin/env python3
"""
场景拆分器 - 将口播文本拆分成细镜头
基于时间戳对齐图片和字幕
"""
import json
import re
import argparse
from pathlib import Path
from typing import List, Dict
def split_by_sentence_timestamps(timestamps: List[Dict]) -> List[Dict]:
"""
直接使用 TTS 的 SentenceBoundary 时间戳作为镜头分割
Args:
timestamps: TTS 输出的时间戳(包含 sentence 类型)
Returns:
每个镜头的信息text, start, end, duration
"""
shots = []
for ts in timestamps:
if ts.get("type") == "sentence":
shots.append({
"text": ts["text"],
"start": ts["start"],
"end": ts["end"],
"duration": round(ts["end"] - ts["start"], 2)
})
if not shots and timestamps:
total_start = timestamps[0]["start"]
total_end = timestamps[-1]["end"]
full_text = "".join(ts["text"] for ts in timestamps)
shots.append({
"text": full_text,
"start": total_start,
"end": total_end,
"duration": round(total_end - total_start, 2)
})
return shots
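# A minimal sketch of the expected shape (hypothetical data, not from the
# repo): entries tagged "sentence" map one-to-one onto shots; word-level
# entries (no "type" key) are ignored.
#   >>> demo = [
#   ...     {"text": "第一", "start": 0.0, "end": 0.6},
#   ...     {"type": "sentence", "text": "第一句。", "start": 0.0, "end": 2.4},
#   ...     {"type": "sentence", "text": "第二句。", "start": 2.4, "end": 5.1},
#   ... ]
#   >>> split_by_sentence_timestamps(demo)
#   [{'text': '第一句。', 'start': 0.0, 'end': 2.4, 'duration': 2.4},
#    {'text': '第二句。', 'start': 2.4, 'end': 5.1, 'duration': 2.7}]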
def split_long_shots(shots: List[Dict], max_duration: float = 6.0) -> List[Dict]:
"""
将过长的镜头按标点符号进一步拆分
Args:
shots: 镜头列表
max_duration: 最大镜头时长
Returns:
拆分后的镜头列表
"""
result = []
for shot in shots:
if shot["duration"] <= max_duration:
result.append(shot)
continue
text = shot["text"]
splits = re.split(r'([,。!?,!?])', text)
sub_texts = []
current = ""
for i, part in enumerate(splits):
current += part
if i % 2 == 1 and current.strip():
sub_texts.append(current.strip())
current = ""
if current.strip():
sub_texts.append(current.strip())
if len(sub_texts) <= 1:
result.append(shot)
continue
total_chars = sum(len(t) for t in sub_texts)
current_time = shot["start"]
for sub_text in sub_texts:
ratio = len(sub_text) / total_chars
sub_duration = shot["duration"] * ratio
result.append({
"text": sub_text,
"start": round(current_time, 2),
"end": round(current_time + sub_duration, 2),
"duration": round(sub_duration, 2)
})
current_time += sub_duration
return result
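# Sketch of the proportional re-timing (hypothetical numbers): a 10 s shot
# whose text splits into 7 + 3 characters gets 7/10 and 3/10 of the duration.
# Note that the resulting pieces are not re-checked against max_duration.
#   >>> shot = {"text": "这一句比较长,短句。", "start": 0.0, "end": 10.0, "duration": 10.0}
#   >>> split_long_shots([shot], max_duration=6.0)
#   [{'text': '这一句比较长,', 'start': 0.0, 'end': 7.0, 'duration': 7.0},
#    {'text': '短句。', 'start': 7.0, 'end': 10.0, 'duration': 3.0}]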
def merge_short_shots(shots: List[Dict], min_duration: float = 2.5) -> List[Dict]:
"""合并过短的镜头"""
if not shots:
return shots
merged = []
current = shots[0].copy()
for shot in shots[1:]:
if current["duration"] < min_duration:
current["text"] += shot["text"]
current["end"] = shot["end"]
current["duration"] = round(current["end"] - current["start"], 2)
else:
merged.append(current)
current = shot.copy()
merged.append(current)
return merged
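# Sketch (hypothetical numbers): a shot under min_duration absorbs its
# successor; the final shot is kept even if it is still short.
#   >>> a = {"text": "短。", "start": 0.0, "end": 1.0, "duration": 1.0}
#   >>> b = {"text": "后面一句。", "start": 1.0, "end": 4.0, "duration": 3.0}
#   >>> merge_short_shots([a, b], min_duration=2.5)
#   [{'text': '短。后面一句。', 'start': 0.0, 'end': 4.0, 'duration': 4.0}]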
def generate_shot_prompts(shots: List[Dict], style: str, context: str = "") -> List[Dict]:
"""
为每个镜头生成图片提示词
Args:
shots: 镜头列表
style: 画风描述
context: 上下文(如角色描述)
Returns:
带有图片提示词的镜头列表
"""
for i, shot in enumerate(shots):
shot["image_prompt"] = f"{style}{context},画面:{shot['text']}。禁止出现任何文字"
shot["index"] = i + 1
return shots
def generate_srt(shots: List[Dict], output_path: str):
"""生成 SRT 字幕文件"""
def format_time(seconds: float) -> str:
hours = int(seconds // 3600)
minutes = int((seconds % 3600) // 60)
secs = int(seconds % 60)
millis = int((seconds % 1) * 1000)
return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
with open(output_path, "w", encoding="utf-8") as f:
for i, shot in enumerate(shots, 1):
f.write(f"{i}\n")
f.write(f"{format_time(shot['start'])} --> {format_time(shot['end'])}\n")
f.write(f"{shot['text']}\n\n")
print(f" ✓ 字幕: {output_path}")
def process_scene(text: str, timestamps_path: str, style: str, context: str = "", output_dir: str = ".") -> Dict:
"""
处理单个场景,输出镜头配置
Args:
text: 场景口播文本
timestamps_path: TTS 时间戳 JSON 文件
style: 画风
context: 上下文
output_dir: 输出目录
Returns:
场景配置字典
"""
with open(timestamps_path, "r", encoding="utf-8") as f:
timestamps = json.load(f)
shots = split_by_sentence_timestamps(timestamps)
shots = split_long_shots(shots, max_duration=6.0)
shots = merge_short_shots(shots, min_duration=2.5)
shots = generate_shot_prompts(shots, style, context)
output_path = Path(output_dir)
output_path.mkdir(parents=True, exist_ok=True)
srt_path = output_path / "subtitles.srt"
generate_srt(shots, str(srt_path))
config_path = output_path / "shots.json"
with open(config_path, "w", encoding="utf-8") as f:
json.dump(shots, f, ensure_ascii=False, indent=2)
print(f" ✓ 镜头配置: {config_path}")
return {"shots": shots, "srt_path": str(srt_path)}
def main():
    parser = argparse.ArgumentParser(description='Scene splitter')
    parser.add_argument('--text', type=str, required=True, help='narration text')
    parser.add_argument('--timestamps', type=str, required=True, help='TTS timestamp JSON file')
    parser.add_argument('--style', type=str, default='', help='art-style description')
    parser.add_argument('--context', type=str, default='', help='extra context (characters etc.)')
    parser.add_argument('--output-dir', type=str, default='.', help='output directory')
args = parser.parse_args()
result = process_scene(
text=args.text,
timestamps_path=args.timestamps,
style=args.style,
context=args.context,
output_dir=args.output_dir
)
print(f"\n拆分完成,共 {len(result['shots'])} 个镜头:")
for shot in result["shots"]:
print(f" [{shot['duration']:.1f}s] {shot['text']}")
if __name__ == "__main__":
main()
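# Example invocation (file names are illustrative, not from the repo):
#   python scene_splitter.py \
#       --text "口播文本" \
#       --timestamps temp/audio/000.json \
#       --style "扁平插画风," \
#       --output-dir temp/scene_000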


@@ -0,0 +1,123 @@
#!/usr/bin/env python3
"""
TTS 语音生成器 - 使用 edge-tts
支持时间戳输出,用于字幕同步和镜头切换
"""
import asyncio
import argparse
import os
import json
import yaml
import edge_tts
async def generate_tts(text: str, voice: str, output_path: str, rate: str = "+0%", pitch: str = "+0Hz", with_timestamps: bool = False):
"""生成单条语音,可选输出时间戳"""
communicate = edge_tts.Communicate(text, voice, rate=rate, pitch=pitch)
if with_timestamps:
timestamps = []
audio_chunks = []
async for chunk in communicate.stream():
chunk_type = chunk.get("type", "")
if chunk_type == "audio":
audio_chunks.append(chunk.get("data", b""))
elif chunk_type == "WordBoundary":
timestamps.append({
"text": chunk.get("text", ""),
"start": chunk.get("offset", 0) / 10000000,
"end": (chunk.get("offset", 0) + chunk.get("duration", 0)) / 10000000
})
elif chunk_type == "SentenceBoundary":
timestamps.append({
"text": chunk.get("text", ""),
"start": chunk.get("offset", 0) / 10000000,
"end": (chunk.get("offset", 0) + chunk.get("duration", 0)) / 10000000,
"type": "sentence"
})
with open(output_path, "wb") as f:
for data in audio_chunks:
f.write(data)
ts_path = output_path.rsplit(".", 1)[0] + ".json"
with open(ts_path, "w", encoding="utf-8") as f:
json.dump(timestamps, f, ensure_ascii=False, indent=2)
print(f" ✓ 生成: {output_path} + 时间戳")
return timestamps
else:
await communicate.save(output_path)
print(f" ✓ 生成: {output_path}")
return None
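# Minimal usage sketch (hypothetical paths): writes out.mp3 plus a sibling
# out.json containing the word/sentence timestamps.
#   >>> asyncio.run(generate_tts("你好,世界。", "zh-CN-YunxiNeural",
#   ...                          "out.mp3", with_timestamps=True))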
async def generate_batch(config_path: str, output_dir: str):
"""批量生成语音"""
with open(config_path, 'r', encoding='utf-8') as f:
config = yaml.safe_load(f)
os.makedirs(output_dir, exist_ok=True)
voice_config = config.get('voice', {})
voice_name = voice_config.get('name', 'zh-CN-YunxiNeural')
rate = voice_config.get('rate', '+0%')
pitch = voice_config.get('pitch', '+0Hz')
scenes = config.get('scenes', [])
tasks = []
for i, scene in enumerate(scenes):
text = scene.get('text', '')
if not text:
continue
output_path = os.path.join(output_dir, f"{i:03d}.mp3")
tasks.append(generate_tts(text, voice_name, output_path, rate, pitch))
print(f"开始生成 {len(tasks)} 条语音...")
await asyncio.gather(*tasks)
print(f"✓ 完成!语音文件保存在: {output_dir}")
async def list_voices():
"""列出所有可用音色"""
voices = await edge_tts.list_voices()
zh_voices = [v for v in voices if v['Locale'].startswith('zh')]
print("\n中文可用音色:")
print("-" * 60)
for v in zh_voices:
gender = "" if v['Gender'] == 'Male' else ""
print(f"{gender} {v['ShortName']:<30} {v['Locale']}")
print("-" * 60)
print(f"{len(zh_voices)} 个中文音色")
def main():
    parser = argparse.ArgumentParser(description='Edge-TTS speech generator')
    parser.add_argument('--text', type=str, help='text to synthesize')
    parser.add_argument('--voice', type=str, default='zh-CN-YunxiNeural', help='voice name')
    parser.add_argument('--rate', type=str, default='+0%', help='speaking-rate adjustment')
    parser.add_argument('--pitch', type=str, default='+0Hz', help='pitch adjustment')
    parser.add_argument('--output', type=str, help='output file path')
    parser.add_argument('--timestamps', action='store_true', help='also write a timestamp JSON file')
    parser.add_argument('--config', type=str, help='config file path (batch mode)')
    parser.add_argument('--output-dir', type=str, default='temp/audio', help='batch output directory')
    parser.add_argument('--list-voices', action='store_true', help='list available voices')
args = parser.parse_args()
if args.list_voices:
asyncio.run(list_voices())
elif args.config:
asyncio.run(generate_batch(args.config, args.output_dir))
elif args.text and args.output:
asyncio.run(generate_tts(args.text, args.voice, args.output, args.rate, args.pitch, args.timestamps))
else:
parser.print_help()
if __name__ == '__main__':
main()
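# Example invocations (script name is illustrative):
#   python tts.py --list-voices
#   python tts.py --text "你好" --output hello.mp3 --timestamps
#   python tts.py --config config.yaml --output-dir temp/audio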


@@ -0,0 +1,530 @@
#!/usr/bin/env python3
"""
视频生成器 - 图片+音频合成视频
支持淡入淡出转场、自动拼接片尾、添加BGM
用法:
python video_maker.py config.yaml
python video_maker.py config.yaml --no-outro # 不加片尾
python video_maker.py config.yaml --no-bgm # 不加BGM
"""
import argparse
import os
import shutil
import subprocess
import sys
import yaml
from pathlib import Path
SCRIPT_DIR = Path(__file__).parent
SKILL_DIR = SCRIPT_DIR.parent
ASSETS_DIR = SKILL_DIR / "assets"
BGM_DEFAULT = ASSETS_DIR / "bgm_technology.mp3"
BGM_EPIC = ASSETS_DIR / "bgm_epic.mp3"
VALID_ASPECT_RATIOS = [
"1:1", "2:3", "3:2", "3:4", "4:3", "4:5", "5:4", "9:16", "16:9", "21:9"
]
RATIO_TO_SIZE = {
"1:1": (1024, 1024),
"2:3": (832, 1248),
"3:2": (1248, 832),
"3:4": (1080, 1440),
"4:3": (1440, 1080),
"4:5": (864, 1080),
"5:4": (1080, 864),
"9:16": (1080, 1920),
"16:9": (1920, 1080),
"21:9": (1536, 672),
}
def get_outro_path(ratio):
"""根据比例获取片尾路径,优先精确匹配,否则按方向匹配,最后兜底"""
ratio_file = ASSETS_DIR / f"outro_{ratio.replace(':', 'x')}.mp4"
if ratio_file.exists():
return ratio_file
w, h = RATIO_TO_SIZE.get(ratio, (1920, 1080))
if h > w:
candidates = ["outro_9x16.mp4", "outro_3x4.mp4"]
elif w > h:
candidates = ["outro.mp4", "outro_3x4.mp4"]
else:
candidates = ["outro_1x1.mp4", "outro.mp4"]
for name in candidates:
fallback = ASSETS_DIR / name
if fallback.exists():
return fallback
return ASSETS_DIR / "outro.mp4"
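# Resolution sketch: get_outro_path("9:16") returns assets/outro_9x16.mp4 if it
# exists, otherwise walks the portrait candidates (outro_9x16.mp4, then
# outro_3x4.mp4), and finally defaults to assets/outro.mp4 even if missing;
# append_outro checks for that case.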
def run_cmd(cmd, desc=""):
"""执行命令并返回结果"""
if desc:
print(f" {desc}...")
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
print(f"错误: {result.stderr[-1000:]}")
sys.exit(1)
return result
def get_duration(file_path):
"""获取音视频时长"""
result = subprocess.run([
'ffprobe', '-v', 'error', '-show_entries', 'format=duration',
'-of', 'csv=p=0', str(file_path)
], capture_output=True, text=True)
return float(result.stdout.strip())
def generate_video_with_transitions(images, durations, output_path, fade_duration=0.5, ratio="16:9"):
"""生成带转场的视频"""
print(f"\n[1/4] 生成主视频 ({len(images)}张图片, {fade_duration}秒转场)")
width, height = RATIO_TO_SIZE.get(ratio, (1920, 1080))
    # Hold every image except the last for fade_duration longer, so each
    # image's audible time survives the overlap consumed by its crossfade
    display_durations = []
for i, dur in enumerate(durations):
if i < len(durations) - 1:
display_durations.append(dur + fade_duration)
else:
display_durations.append(dur)
inputs = []
for img, dur in zip(images, display_durations):
inputs.extend(['-loop', '1', '-t', str(dur), '-i', str(img)])
filter_parts = []
for i in range(len(images)):
filter_parts.append(
f"[{i}:v]scale={width}:{height}:force_original_aspect_ratio=decrease,"
f"pad={width}:{height}:(ow-iw)/2:(oh-ih)/2,setsar=1,fps=30[v{i}];"
)
offset = 0
for i in range(len(images) - 1):
if i == 0:
offset = display_durations[0] - fade_duration
filter_parts.append(
f"[v0][v1]xfade=transition=fade:duration={fade_duration}:offset={offset}[xf1];"
)
else:
offset += display_durations[i] - fade_duration
filter_parts.append(
f"[xf{i}][v{i+1}]xfade=transition=fade:duration={fade_duration}:offset={offset}[xf{i+1}];"
)
last_xf = f"xf{len(images)-1}"
filter_complex = ''.join(filter_parts).rstrip(';')
cmd = ['ffmpeg', '-y'] + inputs + [
'-filter_complex', filter_complex,
'-map', f'[{last_xf}]',
'-c:v', 'libx264', '-preset', 'fast', '-crf', '20', '-pix_fmt', 'yuv420p',
str(output_path)
]
run_cmd(cmd, f"合成{len(images)}张图片")
print(f" ✓ 主视频: {get_duration(output_path):.1f}")
def merge_audio(audio_files, output_path):
"""合并音频文件"""
print(f"\n[2/4] 合并音频 ({len(audio_files)}个文件)")
concat_file = output_path.parent / "audio_concat.txt"
with open(concat_file, 'w') as f:
for audio in audio_files:
f.write(f"file '{audio.absolute()}'\n")
cmd = [
'ffmpeg', '-y', '-f', 'concat', '-safe', '0', '-i', str(concat_file),
'-af', 'aresample=44100', '-c:a', 'aac', '-b:a', '192k', str(output_path)
]
run_cmd(cmd, "合并音频")
concat_file.unlink()
print(f" ✓ 音频: {get_duration(output_path):.1f}")
def combine_video_audio(video_path, audio_path, output_path):
"""合并视频和音频"""
cmd = [
'ffmpeg', '-y', '-i', str(video_path), '-i', str(audio_path),
'-c:v', 'copy', '-c:a', 'copy', '-shortest', str(output_path)
]
run_cmd(cmd, "合并视频音频")
def append_outro(video_path, output_path, fade_duration=0.5, ratio="16:9"):
"""拼接片尾,自动缩放片尾到主视频分辨率"""
print(f"\n[3/4] 拼接片尾")
outro_file = get_outro_path(ratio)
if not outro_file.exists():
print(f" ⚠ 片尾文件不存在: {outro_file}")
return video_path
width, height = RATIO_TO_SIZE.get(ratio, (1920, 1080))
outro_ready = output_path.parent / "outro_ready.mp4"
cmd = [
'ffmpeg', '-y', '-i', str(outro_file),
'-vf', f'scale={width}:{height}:force_original_aspect_ratio=decrease,pad={width}:{height}:(ow-iw)/2:(oh-ih)/2,setsar=1',
'-c:v', 'libx264', '-preset', 'fast', '-crf', '20',
'-c:a', 'aac', '-ar', '44100', str(outro_ready)
]
run_cmd(cmd, "准备片尾")
video_duration = get_duration(video_path)
fade_start = video_duration - fade_duration
cmd = [
'ffmpeg', '-y', '-i', str(video_path), '-i', str(outro_ready),
'-filter_complex',
f"[0:v]fade=t=out:st={fade_start}:d={fade_duration}[v0];"
f"[1:v]fade=t=in:st=0:d={fade_duration}[v1];"
f"[v0][v1]concat=n=2:v=1:a=0[vout];"
f"[0:a][1:a]concat=n=2:v=0:a=1[aout]",
'-map', '[vout]', '-map', '[aout]',
'-c:v', 'libx264', '-preset', 'fast', '-crf', '20',
'-c:a', 'aac', '-b:a', '192k', str(output_path)
]
run_cmd(cmd, "拼接片尾")
outro_ready.unlink()
print(f" ✓ 含片尾: {get_duration(output_path):.1f}")
return output_path
def burn_subtitles(video_path, srt_path, output_path, ratio="16:9"):
"""烧录字幕到视频:底部居中固定位置"""
print(f"\n[字幕] 烧录字幕")
if not Path(srt_path).exists():
print(f" ⚠ 字幕文件不存在: {srt_path}")
return video_path
width, height = RATIO_TO_SIZE.get(ratio, (1920, 1080))
    # Font size: height / 25 (about 43 px for 16:9, about 77 px for 9:16)
font_size = max(36, int(height / 25))
margin_bottom = int(height / 15)
ass_path = Path(srt_path).with_suffix('.ass')
srt_to_ass(srt_path, ass_path, width, height, font_size, margin_bottom)
ass_escaped = str(ass_path).replace(":", r"\:").replace("'", r"\'")
cmd = [
'ffmpeg', '-y', '-i', str(video_path),
'-vf', f"ass='{ass_escaped}'",
'-c:v', 'libx264', '-preset', 'fast', '-crf', '20',
'-c:a', 'copy', str(output_path)
]
run_cmd(cmd, "烧录字幕")
print(f" ✓ 含字幕: {get_duration(output_path):.1f}")
return output_path
def srt_to_ass(srt_path, ass_path, width, height, font_size, margin_bottom):
"""将 SRT 转换为 ASS 格式,固定底部居中,自动换行"""
import re
with open(srt_path, 'r', encoding='utf-8') as f:
srt_content = f.read()
    # Characters-per-line table (fixed per output width)
    CHARS_PER_LINE_MAP = {
        1024: 20,  # 1:1
        832: 14,   # 2:3
        1248: 32,  # 3:2
        1080: 16,  # 3:4, 5:4, 9:16 (16 chars for all 1080-wide outputs)
        1440: 28,  # 4:3
        864: 17,   # 4:5
        1920: 38,  # 16:9
        1536: 48,  # 21:9
    }
    # Look up the table; fall back to a formula for unknown widths
    MAX_CHARS = CHARS_PER_LINE_MAP.get(width)
    if MAX_CHARS is None:
        # Fallback: estimate from width and font size
        MAX_CHARS = max(12, int(width / (font_size * 1.2)))
    # Punctuation that must not start a line (note: currently unused, since
    # punctuation is replaced with spaces before wrapping below)
    PUNCTUATION = ',。、:;?!,.:;?!)】」》\'\"'
def find_break_point(text, max_pos):
"""找到合适的断点位置,优先在空格处断开"""
if max_pos >= len(text):
return len(text)
# 从max_pos往前找空格断点
for i in range(max_pos, max(max_pos // 2, 1), -1):
if text[i] == ' ':
return i
# 没找到空格就直接断
return max_pos
def wrap_text_2lines(text):
"""换行严格2行返回单个2行字幕块"""
text = text.strip()
if len(text) <= MAX_CHARS:
return text + r'\N '
        # Break point for the first line
        break1 = find_break_point(text, MAX_CHARS)
        line1 = text[:break1].strip()
        line2 = text[break1:].strip()
        # Cap the second line's length as well (any overflow is dropped here;
        # split_long_text pre-sizes its blocks so this rarely triggers)
        if len(line2) > MAX_CHARS:
            break2 = find_break_point(line2, MAX_CHARS)
            line2 = line2[:break2].strip()
        return line1 + r'\N' + line2
def split_long_text(text, start_sec, end_sec):
"""长文本拆成多条字幕每条严格2行时间均分"""
text = text.strip()
# 先模拟换行,计算实际需要几条字幕
blocks = []
remaining = text
while remaining:
            # First line
if len(remaining) <= MAX_CHARS:
blocks.append(remaining)
break
break1 = find_break_point(remaining, MAX_CHARS)
line1 = remaining[:break1].strip()
rest = remaining[break1:].strip()
            # Second line
if len(rest) <= MAX_CHARS:
blocks.append(line1 + ' ' + rest)
break
break2 = find_break_point(rest, MAX_CHARS)
line2 = rest[:break2].strip()
blocks.append(line1 + ' ' + line2)
remaining = rest[break2:].strip()
        # Divide the time evenly across the blocks
duration = end_sec - start_sec
time_per_block = duration / len(blocks)
result = []
for i, block in enumerate(blocks):
block_start = start_sec + i * time_per_block
block_end = start_sec + (i + 1) * time_per_block
result.append((block, block_start, block_end))
return result
ass_header = f"""[Script Info]
Title: Subtitles
ScriptType: v4.00+
PlayResX: {width}
PlayResY: {height}
WrapStyle: 0
[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: Default,PingFang SC,{font_size},&H00FFFFFF,&H000000FF,&H00000000,&H80000000,0,0,0,0,100,100,0,0,1,2,1,2,10,10,{margin_bottom},1
[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
"""
    def sec_to_ass_time(sec):
        """Convert seconds to ASS time format (H:MM:SS.cc)."""
h = int(sec // 3600)
m = int((sec % 3600) // 60)
s = int(sec % 60)
cs = int((sec % 1) * 100)
return f"{h}:{m:02d}:{s:02d}.{cs:02d}"
events = []
blocks = re.split(r'\n\n+', srt_content.strip())
for block in blocks:
lines = block.strip().split('\n')
if len(lines) >= 3:
time_line = lines[1]
text = ' '.join(lines[2:]).replace('\n', ' ')
            # Replace punctuation with spaces so wrapping can split on them
            text = re.sub(r'[,。、:;?!,.:;?!“”‘’「」『』【】()()《》]', ' ', text)
            # Collapse runs of whitespace
            text = re.sub(r'\s+', ' ', text).strip()
match = re.match(r'(\d{2}):(\d{2}):(\d{2}),(\d{3}) --> (\d{2}):(\d{2}):(\d{2}),(\d{3})', time_line)
if match:
sh, sm, ss, sms = match.groups()[:4]
eh, em, es, ems = match.groups()[4:]
start_sec = int(sh) * 3600 + int(sm) * 60 + int(ss) + int(sms) / 1000
end_sec = int(eh) * 3600 + int(em) * 60 + int(es) + int(ems) / 1000
                # Split long text into multiple subtitle entries
sub_blocks = split_long_text(text, start_sec, end_sec)
for sub_text, sub_start, sub_end in sub_blocks:
formatted_text = wrap_text_2lines(sub_text)
start = sec_to_ass_time(sub_start)
end = sec_to_ass_time(sub_end)
events.append(f"Dialogue: 0,{start},{end},Default,,0,0,0,,{formatted_text}")
with open(ass_path, 'w', encoding='utf-8') as f:
f.write(ass_header + '\n'.join(events))
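# Example of an emitted event (illustrative): a short cue on a 16:9 render
# (MAX_CHARS = 38) stays on one line, padded with an empty second line:
#   Dialogue: 0,0:00:00.00,0:00:02.40,Default,,0,0,0,,第一句\N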
def add_bgm(video_path, output_path, volume=0.08, bgm_path=None):
"""添加背景音乐"""
print(f"\n[4/4] 添加BGM")
if bgm_path is None:
bgm_path = BGM_DEFAULT
bgm_path = Path(bgm_path)
if not bgm_path.exists():
print(f" ⚠ BGM文件不存在: {bgm_path}")
return video_path
cmd = [
'ffmpeg', '-y', '-i', str(video_path),
'-stream_loop', '-1', '-i', str(bgm_path),
'-filter_complex',
f"[1:a]volume={volume}[bgm];[0:a][bgm]amix=inputs=2:duration=first[aout]",
'-map', '0:v', '-map', '[aout]',
'-c:v', 'copy', '-c:a', 'aac', '-b:a', '192k', str(output_path)
]
run_cmd(cmd, "添加BGM")
print(f" ✓ 最终视频: {get_duration(output_path):.1f}")
return output_path
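# Note: by default amix rescales its inputs to avoid clipping, which also
# lowers the narration level when the BGM is mixed in. Recent ffmpeg builds
# accept amix=inputs=2:duration=first:normalize=0 to keep levels unchanged;
# check `ffmpeg -h filter=amix` before relying on it.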
def main():
    parser = argparse.ArgumentParser(description='Video maker')
    parser.add_argument('config', help='config file path (YAML)')
    parser.add_argument('--no-outro', action='store_true', help='skip the outro')
    parser.add_argument('--no-bgm', action='store_true', help='skip the BGM')
    parser.add_argument('--fade', type=float, default=0.5, help='transition duration (seconds)')
    parser.add_argument('--bgm-volume', type=float, default=0.08, help='BGM volume')
    parser.add_argument('--bgm', type=str, default=None, help="custom BGM path, or 'epic' for the built-in epic track")
    parser.add_argument('--ratio', type=str, default='16:9',
                        help=f'aspect ratio, one of: {", ".join(VALID_ASPECT_RATIOS)}')
    parser.add_argument('--srt', type=str, default=None, help='subtitle file path (SRT format)')
args = parser.parse_args()
config_path = Path(args.config)
if not config_path.exists():
print(f"配置文件不存在: {config_path}")
sys.exit(1)
with open(config_path) as f:
config = yaml.safe_load(f)
work_dir = config_path.parent
output_path = work_dir / config.get('output', 'output.mp4')
if args.ratio == '16:9' and 'ratio' in config:
args.ratio = config['ratio']
if 'bgm_volume' in config and args.bgm_volume == 0.08:
args.bgm_volume = config['bgm_volume']
if args.ratio not in VALID_ASPECT_RATIOS:
print(f"错误: 不支持的比例 '{args.ratio}'")
print(f"支持的比例: {', '.join(VALID_ASPECT_RATIOS)}")
sys.exit(1)
scenes = config.get('scenes', [])
if not scenes:
print("配置文件中没有 scenes")
sys.exit(1)
images = []
durations = []
audio_files = []
for scene in scenes:
audio = work_dir / scene['audio']
if not audio.exists():
print(f"音频不存在: {audio}")
sys.exit(1)
audio_files.append(audio)
if 'images' in scene:
for img_cfg in scene['images']:
img = work_dir / img_cfg['file']
if not img.exists():
print(f"图片不存在: {img}")
sys.exit(1)
images.append(img)
durations.append(img_cfg['duration'])
else:
img = work_dir / scene['image']
if not img.exists():
print(f"图片不存在: {img}")
sys.exit(1)
images.append(img)
durations.append(get_duration(audio))
total_audio_duration = sum(get_duration(af) for af in audio_files)
total_image_duration = sum(durations)
if total_image_duration < total_audio_duration:
gap = total_audio_duration - total_image_duration + 0.5
durations[-1] += gap
print(f"\n⚠ 图片时长({total_image_duration:.1f}s) < 音频时长({total_audio_duration:.1f}s)")
print(f" 自动拉伸最后一张图片 +{gap:.1f}s")
print(f"\n{'='*50}")
print(f"视频生成器")
print(f"{'='*50}")
print(f"场景数: {len(scenes)}")
print(f"音频时长: {total_audio_duration:.1f}")
print(f"视频时长: {sum(durations):.1f}")
print(f"转场: {args.fade}秒 淡入淡出")
print(f"片尾: {'' if not args.no_outro else ''}")
print(f"BGM: {'' if not args.no_bgm else ''}")
temp_dir = work_dir / "temp"
temp_dir.mkdir(exist_ok=True)
video_only = temp_dir / "video_only.mp4"
generate_video_with_transitions(images, durations, video_only, args.fade, args.ratio)
audio_merged = temp_dir / "audio_merged.m4a"
merge_audio(audio_files, audio_merged)
video_with_audio = temp_dir / "video_with_audio.mp4"
combine_video_audio(video_only, audio_merged, video_with_audio)
current_video = video_with_audio
if args.srt:
srt_path = work_dir / args.srt if not Path(args.srt).is_absolute() else Path(args.srt)
video_with_subs = temp_dir / "video_with_subs.mp4"
current_video = burn_subtitles(current_video, srt_path, video_with_subs, args.ratio)
if not args.no_outro:
video_with_outro = temp_dir / "video_with_outro.mp4"
current_video = append_outro(current_video, video_with_outro, args.fade, args.ratio)
if not args.no_bgm:
bgm_path = None
if args.bgm:
if args.bgm == 'epic':
bgm_path = BGM_EPIC
else:
bgm_path = Path(args.bgm)
add_bgm(current_video, output_path, args.bgm_volume, bgm_path)
else:
        # No BGM: just copy the current cut to the final path (portable, no shelling out to cp)
        shutil.copy(str(current_video), str(output_path))
print(f"\n{'='*50}")
print(f"✅ 完成: {output_path}")
print(f"{'='*50}\n")
if __name__ == "__main__":
main()