opencode-skills/video-creator/scripts/video_maker.py

#!/usr/bin/env python3
"""
视频生成器 - 图片+音频合成视频
支持：淡入淡出转场、自动拼接片尾、添加BGM

用法:
    python video_maker.py config.yaml
    python video_maker.py config.yaml --no-outro  # 不加片尾
    python video_maker.py config.yaml --no-bgm    # 不加BGM
"""
import argparse
import os
import subprocess
import sys
import yaml
from pathlib import Path

SCRIPT_DIR = Path(__file__).parent
SKILL_DIR = SCRIPT_DIR.parent
ASSETS_DIR = SKILL_DIR / "assets"
BGM_DEFAULT = ASSETS_DIR / "bgm_technology.mp3"
BGM_EPIC = ASSETS_DIR / "bgm_epic.mp3"

VALID_ASPECT_RATIOS = [
    "1:1", "2:3", "3:2", "3:4", "4:3", "4:5", "5:4", "9:16", "16:9", "21:9"
]

RATIO_TO_SIZE = {
    "1:1": (1024, 1024),
    "2:3": (832, 1248),
    "3:2": (1248, 832),
    "3:4": (1080, 1440),
    "4:3": (1440, 1080),
    "4:5": (864, 1080),
    "5:4": (1080, 864),
    "9:16": (1080, 1920),
    "16:9": (1920, 1080),
    "21:9": (1536, 672),
}

def get_outro_path(ratio):
    """根据比例获取片尾路径，优先精确匹配，否则按方向匹配，最后兜底"""
    ratio_file = ASSETS_DIR / f"outro_{ratio.replace(':', 'x')}.mp4"
    if ratio_file.exists():
        return ratio_file

    w, h = RATIO_TO_SIZE.get(ratio, (1920, 1080))
    if h > w:
        candidates = ["outro_9x16.mp4", "outro_3x4.mp4"]
    elif w > h:
        candidates = ["outro.mp4", "outro_3x4.mp4"]
    else:
        candidates = ["outro_1x1.mp4", "outro.mp4"]

    for name in candidates:
        fallback = ASSETS_DIR / name
        if fallback.exists():
            return fallback

    return ASSETS_DIR / "outro.mp4"


def run_cmd(cmd, desc=""):
    """执行命令并返回结果"""
    if desc:
        print(f"  {desc}...")
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        print(f"错误: {result.stderr[-1000:]}")
        sys.exit(1)
    return result


def get_duration(file_path):
    """获取音视频时长"""
    result = subprocess.run([
        'ffprobe', '-v', 'error', '-show_entries', 'format=duration',
        '-of', 'csv=p=0', str(file_path)
    ], capture_output=True, text=True)
    return float(result.stdout.strip())


def generate_video_with_transitions(images, durations, output_path, fade_duration=0.5, ratio="16:9"):
    """生成带转场的视频"""
    print(f"\n[1/4] 生成主视频 ({len(images)}张图片, {fade_duration}秒转场)")

    width, height = RATIO_TO_SIZE.get(ratio, (1920, 1080))

    display_durations = []
    for i, dur in enumerate(durations):
        if i < len(durations) - 1:
            display_durations.append(dur + fade_duration)
        else:
            display_durations.append(dur)

    inputs = []
    for img, dur in zip(images, display_durations):
        inputs.extend(['-loop', '1', '-t', str(dur), '-i', str(img)])

    filter_parts = []
    for i in range(len(images)):
        filter_parts.append(
            f"[{i}:v]scale={width}:{height}:force_original_aspect_ratio=decrease,"
            f"pad={width}:{height}:(ow-iw)/2:(oh-ih)/2,setsar=1,fps=30[v{i}];"
        )

    offset = 0
    for i in range(len(images) - 1):
        if i == 0:
            offset = display_durations[0] - fade_duration
            filter_parts.append(
                f"[v0][v1]xfade=transition=fade:duration={fade_duration}:offset={offset}[xf1];"
            )
        else:
            offset += display_durations[i] - fade_duration
            filter_parts.append(
                f"[xf{i}][v{i+1}]xfade=transition=fade:duration={fade_duration}:offset={offset}[xf{i+1}];"
            )

    last_xf = f"xf{len(images)-1}"
    filter_complex = ''.join(filter_parts).rstrip(';')

    cmd = ['ffmpeg', '-y'] + inputs + [
        '-filter_complex', filter_complex,
        '-map', f'[{last_xf}]',
        '-c:v', 'libx264', '-preset', 'fast', '-crf', '20', '-pix_fmt', 'yuv420p',
        str(output_path)
    ]

    run_cmd(cmd, f"合成{len(images)}张图片")
    print(f"  ✓ 主视频: {get_duration(output_path):.1f}秒")


def merge_audio(audio_files, output_path):
    """合并音频文件"""
    print(f"\n[2/4] 合并音频 ({len(audio_files)}个文件)")

    concat_file = output_path.parent / "audio_concat.txt"
    with open(concat_file, 'w') as f:
        for audio in audio_files:
            f.write(f"file '{audio.absolute()}'\n")

    cmd = [
        'ffmpeg', '-y', '-f', 'concat', '-safe', '0', '-i', str(concat_file),
        '-af', 'aresample=44100', '-c:a', 'aac', '-b:a', '192k', str(output_path)
    ]
    run_cmd(cmd, "合并音频")
    concat_file.unlink()
    print(f"  ✓ 音频: {get_duration(output_path):.1f}秒")


def combine_video_audio(video_path, audio_path, output_path):
    """合并视频和音频"""
    cmd = [
        'ffmpeg', '-y', '-i', str(video_path), '-i', str(audio_path),
        '-c:v', 'copy', '-c:a', 'copy', '-shortest', str(output_path)
    ]
    run_cmd(cmd, "合并视频音频")


def append_outro(video_path, output_path, fade_duration=0.5, ratio="16:9"):
    """拼接片尾，自动缩放片尾到主视频分辨率"""
    print(f"\n[3/4] 拼接片尾")

    outro_file = get_outro_path(ratio)
    if not outro_file.exists():
        print(f"  ⚠ 片尾文件不存在: {outro_file}")
        return video_path

    width, height = RATIO_TO_SIZE.get(ratio, (1920, 1080))

    outro_ready = output_path.parent / "outro_ready.mp4"
    cmd = [
        'ffmpeg', '-y', '-i', str(outro_file),
        '-vf', f'scale={width}:{height}:force_original_aspect_ratio=decrease,pad={width}:{height}:(ow-iw)/2:(oh-ih)/2,setsar=1',
        '-c:v', 'libx264', '-preset', 'fast', '-crf', '20',
        '-c:a', 'aac', '-ar', '44100', str(outro_ready)
    ]
    run_cmd(cmd, "准备片尾")

    video_duration = get_duration(video_path)
    fade_start = video_duration - fade_duration

    cmd = [
        'ffmpeg', '-y', '-i', str(video_path), '-i', str(outro_ready),
        '-filter_complex',
        f"[0:v]fade=t=out:st={fade_start}:d={fade_duration}[v0];"
        f"[1:v]fade=t=in:st=0:d={fade_duration}[v1];"
        f"[v0][v1]concat=n=2:v=1:a=0[vout];"
        f"[0:a][1:a]concat=n=2:v=0:a=1[aout]",
        '-map', '[vout]', '-map', '[aout]',
        '-c:v', 'libx264', '-preset', 'fast', '-crf', '20',
        '-c:a', 'aac', '-b:a', '192k', str(output_path)
    ]
    run_cmd(cmd, "拼接片尾")
    outro_ready.unlink()
    print(f"  ✓ 含片尾: {get_duration(output_path):.1f}秒")
    return output_path


def burn_subtitles(video_path, srt_path, output_path, ratio="16:9"):
    """烧录字幕到视频：底部居中固定位置"""
    print(f"\n[字幕] 烧录字幕")

    if not Path(srt_path).exists():
        print(f"  ⚠ 字幕文件不存在: {srt_path}")
        return video_path

    width, height = RATIO_TO_SIZE.get(ratio, (1920, 1080))
    # 字体大小：高度/25，16:9时约43px，9:16时约77px
    font_size = max(36, int(height / 25))
    margin_bottom = int(height / 15)

    ass_path = Path(srt_path).with_suffix('.ass')
    srt_to_ass(srt_path, ass_path, width, height, font_size, margin_bottom)

    ass_escaped = str(ass_path).replace(":", r"\:").replace("'", r"\'")

    cmd = [
        'ffmpeg', '-y', '-i', str(video_path),
        '-vf', f"ass='{ass_escaped}'",
        '-c:v', 'libx264', '-preset', 'fast', '-crf', '20',
        '-c:a', 'copy', str(output_path)
    ]
    run_cmd(cmd, "烧录字幕")
    print(f"  ✓ 含字幕: {get_duration(output_path):.1f}秒")
    return output_path


def srt_to_ass(srt_path, ass_path, width, height, font_size, margin_bottom):
    """将 SRT 转换为 ASS 格式，固定底部居中，自动换行"""
    import re

    with open(srt_path, 'r', encoding='utf-8') as f:
        srt_content = f.read()

    # 每行字数规则表（按分辨率宽度固定）
    CHARS_PER_LINE_MAP = {
        1024: 20,  # 1:1
        832: 14,   # 2:3
        1248: 32,  # 3:2
        1080: 16,  # 3:4, 4:5, 5:4, 9:16 (竖版统一16字)
        1440: 28,  # 4:3
        864: 17,   # 4:5
        1920: 38,  # 16:9
        1536: 48,  # 21:9
    }
    # 查表，找不到则按公式计算
    MAX_CHARS = CHARS_PER_LINE_MAP.get(width)
    if MAX_CHARS is None:
        # 兜底：按宽度和字体大小估算
        MAX_CHARS = max(12, int(width / (font_size * 1.2)))

    # 标点符号（不能放行首）
    PUNCTUATION = '，。、：；？！,.:;?!）】」》\'\"'

    def find_break_point(text, max_pos):
        """找到合适的断点位置，优先在空格处断开"""
        if max_pos >= len(text):
            return len(text)

        # 从max_pos往前找空格断点
        for i in range(max_pos, max(max_pos // 2, 1), -1):
            if text[i] == ' ':
                return i
        # 没找到空格就直接断
        return max_pos

    def wrap_text_2lines(text):
        """换行：严格2行，返回单个2行字幕块"""
        text = text.strip()
        if len(text) <= MAX_CHARS:
            return text + r'\N '

        # 找第一行断点
        break1 = find_break_point(text, MAX_CHARS)
        line1 = text[:break1].strip()
        line2 = text[break1:].strip()

        # 第二行也限制长度
        if len(line2) > MAX_CHARS:
            break2 = find_break_point(line2, MAX_CHARS)
            line2 = line2[:break2].strip()

        return line1 + r'\N' + line2

    def split_long_text(text, start_sec, end_sec):
        """长文本拆成多条字幕，每条严格2行，时间均分"""
        text = text.strip()

        # 先模拟换行，计算实际需要几条字幕
        blocks = []
        remaining = text
        while remaining:
            # 第一行
            if len(remaining) <= MAX_CHARS:
                blocks.append(remaining)
                break
            break1 = find_break_point(remaining, MAX_CHARS)
            line1 = remaining[:break1].strip()
            rest = remaining[break1:].strip()

            # 第二行
            if len(rest) <= MAX_CHARS:
                blocks.append(line1 + ' ' + rest)
                break
            break2 = find_break_point(rest, MAX_CHARS)
            line2 = rest[:break2].strip()
            blocks.append(line1 + ' ' + line2)
            remaining = rest[break2:].strip()

        # 时间均分
        duration = end_sec - start_sec
        time_per_block = duration / len(blocks)

        result = []
        for i, block in enumerate(blocks):
            block_start = start_sec + i * time_per_block
            block_end = start_sec + (i + 1) * time_per_block
            result.append((block, block_start, block_end))

        return result

    ass_header = f"""[Script Info]
Title: Subtitles
ScriptType: v4.00+
PlayResX: {width}
PlayResY: {height}
WrapStyle: 0

[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: Default,PingFang SC,{font_size},&H00FFFFFF,&H000000FF,&H00000000,&H80000000,0,0,0,0,100,100,0,0,1,2,1,2,10,10,{margin_bottom},1

[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
"""

    def sec_to_ass_time(sec):
        """秒数转ASS时间格式"""
        h = int(sec // 3600)
        m = int((sec % 3600) // 60)
        s = int(sec % 60)
        cs = int((sec % 1) * 100)
        return f"{h}:{m:02d}:{s:02d}.{cs:02d}"

    events = []
    blocks = re.split(r'\n\n+', srt_content.strip())

    for block in blocks:
        lines = block.strip().split('\n')
        if len(lines) >= 3:
            time_line = lines[1]
            text = ' '.join(lines[2:]).replace('\n', ' ')
            # 标点符号替换为空格，便于换行分割
            text = re.sub(r'[，。、：；？！,.:;?!""''「」『』【】（）()《》]', ' ', text)
            # 合并多个空格为一个
            text = re.sub(r'\s+', ' ', text).strip()

            match = re.match(r'(\d{2}):(\d{2}):(\d{2}),(\d{3}) --> (\d{2}):(\d{2}):(\d{2}),(\d{3})', time_line)
            if match:
                sh, sm, ss, sms = match.groups()[:4]
                eh, em, es, ems = match.groups()[4:]
                start_sec = int(sh) * 3600 + int(sm) * 60 + int(ss) + int(sms) / 1000
                end_sec = int(eh) * 3600 + int(em) * 60 + int(es) + int(ems) / 1000

                # 长文本拆成多条字幕
                sub_blocks = split_long_text(text, start_sec, end_sec)
                for sub_text, sub_start, sub_end in sub_blocks:
                    formatted_text = wrap_text_2lines(sub_text)
                    start = sec_to_ass_time(sub_start)
                    end = sec_to_ass_time(sub_end)
                    events.append(f"Dialogue: 0,{start},{end},Default,,0,0,0,,{formatted_text}")

    with open(ass_path, 'w', encoding='utf-8') as f:
        f.write(ass_header + '\n'.join(events))


def add_bgm(video_path, output_path, volume=0.08, bgm_path=None):
    """添加背景音乐"""
    print(f"\n[4/4] 添加BGM")

    if bgm_path is None:
        bgm_path = BGM_DEFAULT
    bgm_path = Path(bgm_path)

    if not bgm_path.exists():
        print(f"  ⚠ BGM文件不存在: {bgm_path}")
        return video_path

    cmd = [
        'ffmpeg', '-y', '-i', str(video_path),
        '-stream_loop', '-1', '-i', str(bgm_path),
        '-filter_complex',
        f"[1:a]volume={volume}[bgm];[0:a][bgm]amix=inputs=2:duration=first[aout]",
        '-map', '0:v', '-map', '[aout]',
        '-c:v', 'copy', '-c:a', 'aac', '-b:a', '192k', str(output_path)
    ]
    run_cmd(cmd, "添加BGM")
    print(f"  ✓ 最终视频: {get_duration(output_path):.1f}秒")
    return output_path


def main():
    parser = argparse.ArgumentParser(description='视频生成器')
    parser.add_argument('config', help='配置文件路径 (YAML)')
    parser.add_argument('--no-outro', action='store_true', help='不添加片尾')
    parser.add_argument('--no-bgm', action='store_true', help='不添加BGM')
    parser.add_argument('--fade', type=float, default=0.5, help='转场时长(秒)')
    parser.add_argument('--bgm-volume', type=float, default=0.08, help='BGM音量')
    parser.add_argument('--bgm', type=str, default=None, help='自定义BGM路径，可选: epic')
    parser.add_argument('--ratio', type=str, default='16:9',
                        help=f'视频比例，支持: {", ".join(VALID_ASPECT_RATIOS)}')
    parser.add_argument('--srt', type=str, default=None, help='字幕文件路径(SRT格式)')
    args = parser.parse_args()

    config_path = Path(args.config)
    if not config_path.exists():
        print(f"配置文件不存在: {config_path}")
        sys.exit(1)

    with open(config_path) as f:
        config = yaml.safe_load(f)

    work_dir = config_path.parent
    output_path = work_dir / config.get('output', 'output.mp4')

    if args.ratio == '16:9' and 'ratio' in config:
        args.ratio = config['ratio']

    if 'bgm_volume' in config and args.bgm_volume == 0.08:
        args.bgm_volume = config['bgm_volume']

    if args.ratio not in VALID_ASPECT_RATIOS:
        print(f"错误: 不支持的比例 '{args.ratio}'")
        print(f"支持的比例: {', '.join(VALID_ASPECT_RATIOS)}")
        sys.exit(1)

    scenes = config.get('scenes', [])
    if not scenes:
        print("配置文件中没有 scenes")
        sys.exit(1)

    images = []
    durations = []
    audio_files = []

    for scene in scenes:
        audio = work_dir / scene['audio']
        if not audio.exists():
            print(f"音频不存在: {audio}")
            sys.exit(1)
        audio_files.append(audio)

        if 'images' in scene:
            for img_cfg in scene['images']:
                img = work_dir / img_cfg['file']
                if not img.exists():
                    print(f"图片不存在: {img}")
                    sys.exit(1)
                images.append(img)
                durations.append(img_cfg['duration'])
        else:
            img = work_dir / scene['image']
            if not img.exists():
                print(f"图片不存在: {img}")
                sys.exit(1)
            images.append(img)
            durations.append(get_duration(audio))

    total_audio_duration = sum(get_duration(af) for af in audio_files)
    total_image_duration = sum(durations)

    if total_image_duration < total_audio_duration:
        gap = total_audio_duration - total_image_duration + 0.5
        durations[-1] += gap
        print(f"\n⚠ 图片时长({total_image_duration:.1f}s) < 音频时长({total_audio_duration:.1f}s)")
        print(f"  自动拉伸最后一张图片 +{gap:.1f}s")

    print(f"\n{'='*50}")
    print(f"视频生成器")
    print(f"{'='*50}")
    print(f"场景数: {len(scenes)}")
    print(f"音频时长: {total_audio_duration:.1f}秒")
    print(f"视频时长: {sum(durations):.1f}秒")
    print(f"转场: {args.fade}秒 淡入淡出")
    print(f"片尾: {'是' if not args.no_outro else '否'}")
    print(f"BGM: {'是' if not args.no_bgm else '否'}")

    temp_dir = work_dir / "temp"
    temp_dir.mkdir(exist_ok=True)

    video_only = temp_dir / "video_only.mp4"
    generate_video_with_transitions(images, durations, video_only, args.fade, args.ratio)

    audio_merged = temp_dir / "audio_merged.m4a"
    merge_audio(audio_files, audio_merged)

    video_with_audio = temp_dir / "video_with_audio.mp4"
    combine_video_audio(video_only, audio_merged, video_with_audio)

    current_video = video_with_audio

    if args.srt:
        srt_path = work_dir / args.srt if not Path(args.srt).is_absolute() else Path(args.srt)
        video_with_subs = temp_dir / "video_with_subs.mp4"
        current_video = burn_subtitles(current_video, srt_path, video_with_subs, args.ratio)

    if not args.no_outro:
        video_with_outro = temp_dir / "video_with_outro.mp4"
        current_video = append_outro(current_video, video_with_outro, args.fade, args.ratio)

    if not args.no_bgm:
        bgm_path = None
        if args.bgm:
            if args.bgm == 'epic':
                bgm_path = BGM_EPIC
            else:
                bgm_path = Path(args.bgm)
        add_bgm(current_video, output_path, args.bgm_volume, bgm_path)
    else:
        subprocess.run(['cp', str(current_video), str(output_path)])

    print(f"\n{'='*50}")
    print(f"✅ 完成: {output_path}")
    print(f"{'='*50}\n")


if __name__ == "__main__":
    main()