#!/usr/bin/env python3 """ 场景拆分器 - 将口播文本拆分成细镜头 基于时间戳对齐图片和字幕 """ import json import re import argparse from pathlib import Path from typing import List, Dict def split_by_sentence_timestamps(timestamps: List[Dict]) -> List[Dict]: """ 直接使用 TTS 的 SentenceBoundary 时间戳作为镜头分割 Args: timestamps: TTS 输出的时间戳(包含 sentence 类型) Returns: 每个镜头的信息:text, start, end, duration """ shots = [] for ts in timestamps: if ts.get("type") == "sentence": shots.append({ "text": ts["text"], "start": ts["start"], "end": ts["end"], "duration": round(ts["end"] - ts["start"], 2) }) if not shots and timestamps: total_start = timestamps[0]["start"] total_end = timestamps[-1]["end"] full_text = "".join(ts["text"] for ts in timestamps) shots.append({ "text": full_text, "start": total_start, "end": total_end, "duration": round(total_end - total_start, 2) }) return shots def split_long_shots(shots: List[Dict], max_duration: float = 6.0) -> List[Dict]: """ 将过长的镜头按标点符号进一步拆分 Args: shots: 镜头列表 max_duration: 最大镜头时长 Returns: 拆分后的镜头列表 """ result = [] for shot in shots: if shot["duration"] <= max_duration: result.append(shot) continue text = shot["text"] splits = re.split(r'([,。!?,!?])', text) sub_texts = [] current = "" for i, part in enumerate(splits): current += part if i % 2 == 1 and current.strip(): sub_texts.append(current.strip()) current = "" if current.strip(): sub_texts.append(current.strip()) if len(sub_texts) <= 1: result.append(shot) continue total_chars = sum(len(t) for t in sub_texts) current_time = shot["start"] for sub_text in sub_texts: ratio = len(sub_text) / total_chars sub_duration = shot["duration"] * ratio result.append({ "text": sub_text, "start": round(current_time, 2), "end": round(current_time + sub_duration, 2), "duration": round(sub_duration, 2) }) current_time += sub_duration return result def merge_short_shots(shots: List[Dict], min_duration: float = 2.5) -> List[Dict]: """合并过短的镜头""" if not shots: return shots merged = [] current = shots[0].copy() for shot in shots[1:]: if current["duration"] < min_duration: current["text"] += shot["text"] current["end"] = shot["end"] current["duration"] = round(current["end"] - current["start"], 2) else: merged.append(current) current = shot.copy() merged.append(current) return merged def generate_shot_prompts(shots: List[Dict], style: str, context: str = "") -> List[Dict]: """ 为每个镜头生成图片提示词 Args: shots: 镜头列表 style: 画风描述 context: 上下文(如角色描述) Returns: 带有图片提示词的镜头列表 """ for i, shot in enumerate(shots): shot["image_prompt"] = f"{style},{context},画面:{shot['text']}。禁止出现任何文字" shot["index"] = i + 1 return shots def generate_srt(shots: List[Dict], output_path: str): """生成 SRT 字幕文件""" def format_time(seconds: float) -> str: hours = int(seconds // 3600) minutes = int((seconds % 3600) // 60) secs = int(seconds % 60) millis = int((seconds % 1) * 1000) return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}" with open(output_path, "w", encoding="utf-8") as f: for i, shot in enumerate(shots, 1): f.write(f"{i}\n") f.write(f"{format_time(shot['start'])} --> {format_time(shot['end'])}\n") f.write(f"{shot['text']}\n\n") print(f" ✓ 字幕: {output_path}") def process_scene(text: str, timestamps_path: str, style: str, context: str = "", output_dir: str = ".") -> Dict: """ 处理单个场景,输出镜头配置 Args: text: 场景口播文本 timestamps_path: TTS 时间戳 JSON 文件 style: 画风 context: 上下文 output_dir: 输出目录 Returns: 场景配置字典 """ with open(timestamps_path, "r", encoding="utf-8") as f: timestamps = json.load(f) shots = split_by_sentence_timestamps(timestamps) shots = split_long_shots(shots, max_duration=6.0) shots = merge_short_shots(shots, min_duration=2.5) shots = generate_shot_prompts(shots, style, context) output_path = Path(output_dir) output_path.mkdir(parents=True, exist_ok=True) srt_path = output_path / "subtitles.srt" generate_srt(shots, str(srt_path)) config_path = output_path / "shots.json" with open(config_path, "w", encoding="utf-8") as f: json.dump(shots, f, ensure_ascii=False, indent=2) print(f" ✓ 镜头配置: {config_path}") return {"shots": shots, "srt_path": str(srt_path)} def main(): parser = argparse.ArgumentParser(description='场景拆分器') parser.add_argument('--text', type=str, required=True, help='口播文本') parser.add_argument('--timestamps', type=str, required=True, help='TTS时间戳JSON文件') parser.add_argument('--style', type=str, default='', help='画风描述') parser.add_argument('--context', type=str, default='', help='上下文(角色等)') parser.add_argument('--output-dir', type=str, default='.', help='输出目录') args = parser.parse_args() result = process_scene( text=args.text, timestamps_path=args.timestamps, style=args.style, context=args.context, output_dir=args.output_dir ) print(f"\n拆分完成,共 {len(result['shots'])} 个镜头:") for shot in result["shots"]: print(f" [{shot['duration']:.1f}s] {shot['text']}") if __name__ == "__main__": main()