# opencode-skills/video-creator/scripts/scene_splitter.py

#!/usr/bin/env python3
"""
场景拆分器 - 将口播文本拆分成细镜头
基于时间戳对齐图片和字幕
"""

import json
import re
import argparse
from pathlib import Path
from typing import List, Dict


def split_by_sentence_timestamps(timestamps: List[Dict]) -> List[Dict]:
    """
    Build one shot per sentence-level entry in the TTS timestamps.

    Entries whose ``"type"`` is ``"sentence"`` each become a shot. When no
    such entry exists but the list is non-empty, every entry is collapsed
    into a single shot spanning from the first entry's start to the last
    entry's end, with all texts concatenated.

    Args:
        timestamps: Timestamp entries; each one is read for ``"text"``,
            ``"start"`` and ``"end"``, and optionally ``"type"``.

    Returns:
        A list of shot dicts with the keys ``text``, ``start``, ``end`` and
        ``duration`` (``end - start`` rounded to two decimals). Empty when
        ``timestamps`` is empty.
    """
    def make_shot(text, begin, finish):
        # Single place that defines the shot layout and duration rounding.
        return {
            "text": text,
            "start": begin,
            "end": finish,
            "duration": round(finish - begin, 2),
        }

    sentence_shots = [
        make_shot(entry["text"], entry["start"], entry["end"])
        for entry in timestamps
        if entry.get("type") == "sentence"
    ]

    # Nothing to fall back on if sentences were found or there is no input.
    if sentence_shots or not timestamps:
        return sentence_shots

    # Fallback: treat the whole timestamp list as one continuous shot.
    first_start = timestamps[0]["start"]
    last_end = timestamps[-1]["end"]
    joined_text = "".join(entry["text"] for entry in timestamps)
    return [make_shot(joined_text, first_start, last_end)]


def split_long_shots(shots: List[Dict], max_duration: float = 6.0) -> List[Dict]:
    """
    Split shots longer than ``max_duration`` at punctuation marks.

    A long shot's text is cut after each run of the punctuation characters
    ``，。！？,!?``. Consecutive marks (e.g. ``？！``) count as one delimiter, and
    a fragment is only closed when real text precedes the punctuation, so no
    sub-shot consists of punctuation alone. The parent's duration is divided
    among the fragments in proportion to their character counts.

    Shots within the limit, and long shots that yield at most one fragment,
    are passed through unchanged (the same dict object). Sub-shots are not
    re-checked against ``max_duration``.

    Args:
        shots: Shot dicts with ``text``, ``start``, ``end`` and ``duration``.
        max_duration: Longest duration a shot may have before it is split.

    Returns:
        A new list of shots; sub-shots carry ``text``, ``start``, ``end`` and
        ``duration``, each rounded to two decimals.
    """
    result = []

    for shot in shots:
        if shot["duration"] <= max_duration:
            result.append(shot)
            continue

        # The capture group keeps delimiters in the output: even indices are
        # text, odd indices are punctuation runs. Whitespace sitting between
        # punctuation marks is absorbed into the run.
        pieces = re.split(r'((?:\s*[，。！？,!?])+)', shot["text"])

        fragments = []
        pending = ""
        for idx, piece in enumerate(pieces):
            pending += piece
            # Close the fragment after a punctuation run, but only if text
            # came before it; leading punctuation stays attached to the
            # text that follows instead of becoming its own fragment.
            if idx % 2 == 1 and pieces[idx - 1].strip():
                fragments.append(pending.strip())
                pending = ""
        if pending.strip():
            fragments.append(pending.strip())

        if len(fragments) <= 1:
            # Nothing to split on; keep the long shot as it is.
            result.append(shot)
            continue

        total_chars = sum(len(fragment) for fragment in fragments)
        cursor = shot["start"]

        for fragment in fragments:
            # Allocate time proportionally to the fragment's share of characters.
            share = len(fragment) / total_chars
            fragment_duration = shot["duration"] * share
            result.append({
                "text": fragment,
                "start": round(cursor, 2),
                "end": round(cursor + fragment_duration, 2),
                "duration": round(fragment_duration, 2)
            })
            cursor += fragment_duration

    return result


def merge_short_shots(shots: List[Dict], min_duration: float = 2.5) -> List[Dict]:
    """
    Merge shots shorter than ``min_duration`` into their neighbours.

    A short shot absorbs the shots that follow it until it reaches
    ``min_duration``. A short shot at the very end has nothing after it, so
    it is folded back into the previous merged shot instead of being left
    as a too-short tail. A lone short shot is returned as is.

    Args:
        shots: Shot dicts with ``text``, ``start``, ``end`` and ``duration``.
        min_duration: Shots below this duration are merged.

    Returns:
        A new list of merged shot dicts (copies; the input dicts are not
        modified). An empty input is returned unchanged.
    """
    if not shots:
        return shots

    merged = []
    current = shots[0].copy()

    for shot in shots[1:]:
        if current["duration"] < min_duration:
            # Still too short: extend the current shot over the next one.
            current["text"] += shot["text"]
            current["end"] = shot["end"]
            current["duration"] = round(current["end"] - current["start"], 2)
        else:
            merged.append(current)
            current = shot.copy()

    if merged and current["duration"] < min_duration:
        # Trailing short shot: no successor to absorb, so extend the
        # previous shot (already a copy) to cover it.
        previous = merged[-1]
        previous["text"] += current["text"]
        previous["end"] = current["end"]
        previous["duration"] = round(previous["end"] - previous["start"], 2)
    else:
        merged.append(current)

    return merged


def generate_shot_prompts(shots: List[Dict], style: str, context: str = "") -> List[Dict]:
    """
    Attach an image prompt and a 1-based index to every shot.

    The prompt is built as ``style，context，画面：<shot text>。禁止出现任何文字``.
    An empty ``style`` or ``context`` is left out together with its
    separator, so the default ``context=""`` no longer yields a stray
    ``，，`` in the prompt.

    Args:
        shots: Shot dicts; each must have a ``"text"`` key.
        style: Art-style description placed at the start of the prompt.
        context: Extra context (e.g. character description) placed after
            the style.

    Returns:
        The same list object; each shot dict is updated in place with
        ``image_prompt`` and ``index``.
    """
    # Only non-empty descriptors contribute a leading clause.
    prefix_parts = [part for part in (style, context) if part]

    for index, shot in enumerate(shots, 1):
        scene_clause = f"画面：{shot['text']}。禁止出现任何文字"
        shot["image_prompt"] = "，".join(prefix_parts + [scene_clause])
        shot["index"] = index

    return shots


def generate_srt(shots: List[Dict], output_path: str):
    """
    Write the shots to ``output_path`` as an SRT subtitle file.

    Each shot becomes one numbered cue (starting at 1) with its ``start``
    and ``end`` formatted as ``HH:MM:SS,mmm`` followed by its ``text``.
    A confirmation line is printed when the file has been written.

    Args:
        shots: Shot dicts with ``text``, ``start`` and ``end`` (seconds).
        output_path: Destination file path; written as UTF-8.
    """
    def format_time(seconds: float) -> str:
        # Convert to whole milliseconds first. Truncating the fractional
        # part directly loses a millisecond to float error (2.3 s would be
        # written as ",299"), and rounding here also carries correctly
        # into the seconds field.
        total_millis = max(0, round(seconds * 1000))  # SRT has no negative times
        hours, remainder = divmod(total_millis, 3_600_000)
        minutes, remainder = divmod(remainder, 60_000)
        secs, millis = divmod(remainder, 1000)
        return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

    with open(output_path, "w", encoding="utf-8") as f:
        for i, shot in enumerate(shots, 1):
            f.write(f"{i}\n")
            f.write(f"{format_time(shot['start'])} --> {format_time(shot['end'])}\n")
            f.write(f"{shot['text']}\n\n")

    print(f"  ✓ 字幕: {output_path}")


def process_scene(text: str, timestamps_path: str, style: str, context: str = "", output_dir: str = ".") -> Dict:
    """
    Run the full shot pipeline for one scene and write its output files.

    Loads the timestamp JSON, derives shots from it, splits shots longer
    than 6.0 s, merges shots shorter than 2.5 s, and attaches image prompts.
    Then writes ``subtitles.srt`` and ``shots.json`` into ``output_dir``
    (created if missing).

    Args:
        text: Narration text of the scene. Accepted for the interface but
            not read by this function; shot text comes from the timestamps.
        timestamps_path: Path to the TTS timestamp JSON file.
        style: Art-style description forwarded to the prompt generator.
        context: Extra prompt context forwarded to the prompt generator.
        output_dir: Directory that receives the generated files.

    Returns:
        A dict with ``shots`` (the final shot list) and ``srt_path`` (path
        of the written subtitle file, as a string).
    """
    with open(timestamps_path, "r", encoding="utf-8") as ts_file:
        raw_timestamps = json.load(ts_file)

    # Pipeline: sentence shots -> split long -> merge short -> add prompts.
    sentence_shots = split_by_sentence_timestamps(raw_timestamps)
    shots = generate_shot_prompts(
        merge_short_shots(
            split_long_shots(sentence_shots, max_duration=6.0),
            min_duration=2.5,
        ),
        style,
        context,
    )

    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    srt_path = target_dir / "subtitles.srt"
    config_path = target_dir / "shots.json"

    generate_srt(shots, str(srt_path))

    with open(config_path, "w", encoding="utf-8") as config_file:
        json.dump(shots, config_file, ensure_ascii=False, indent=2)
    print(f"  ✓ 镜头配置: {config_path}")

    return {"shots": shots, "srt_path": str(srt_path)}


def main():
    """
    Command-line entry point.

    Parses ``--text``, ``--timestamps`` (both required), ``--style``,
    ``--context`` and ``--output-dir``, runs ``process_scene`` with them,
    and prints a summary line followed by each shot's duration and text.
    """
    cli = argparse.ArgumentParser(description='场景拆分器')

    # (flag, help) for the mandatory options.
    required_options = (
        ('--text', '口播文本'),
        ('--timestamps', 'TTS时间戳JSON文件'),
    )
    for flag, help_text in required_options:
        cli.add_argument(flag, type=str, required=True, help=help_text)

    # (flag, default, help) for the optional ones.
    optional_options = (
        ('--style', '', '画风描述'),
        ('--context', '', '上下文（角色等）'),
        ('--output-dir', '.', '输出目录'),
    )
    for flag, default_value, help_text in optional_options:
        cli.add_argument(flag, type=str, default=default_value, help=help_text)

    options = cli.parse_args()

    outcome = process_scene(
        text=options.text,
        timestamps_path=options.timestamps,
        style=options.style,
        context=options.context,
        output_dir=options.output_dir,
    )

    final_shots = outcome["shots"]
    print(f"\n拆分完成，共 {len(final_shots)} 个镜头：")
    for shot in final_shots:
        print(f"  [{shot['duration']:.1f}s] {shot['text']}")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()