Initial commit to git.yoin

This commit is contained in:
hmo
2026-02-11 22:02:47 +08:00
commit cf10ab6473
153 changed files with 14581 additions and 0 deletions

View File

@@ -0,0 +1,221 @@
#!/usr/bin/env python3
"""
Scene splitter - splits narration (voice-over) text into fine-grained shots.
Aligns images and subtitles based on TTS timestamps.
"""
import json
import re
import argparse
from pathlib import Path
from typing import List, Dict
def split_by_sentence_timestamps(timestamps: List[Dict]) -> List[Dict]:
    """Turn TTS SentenceBoundary timestamps directly into shot boundaries.

    Args:
        timestamps: Timestamp entries emitted by the TTS engine; entries
            with ``type == "sentence"`` become one shot each.

    Returns:
        A list of shot dicts with keys ``text``, ``start``, ``end`` and
        ``duration``. If no sentence-level entries exist but the list is
        non-empty, the whole input collapses into a single shot.
    """
    shots = [
        {
            "text": ts["text"],
            "start": ts["start"],
            "end": ts["end"],
            "duration": round(ts["end"] - ts["start"], 2),
        }
        for ts in timestamps
        if ts.get("type") == "sentence"
    ]
    if shots or not timestamps:
        return shots

    # Fallback: no sentence boundaries at all -- treat everything as one shot.
    first_start = timestamps[0]["start"]
    last_end = timestamps[-1]["end"]
    combined = "".join(entry["text"] for entry in timestamps)
    return [
        {
            "text": combined,
            "start": first_start,
            "end": last_end,
            "duration": round(last_end - first_start, 2),
        }
    ]
def split_long_shots(shots: List[Dict], max_duration: float = 6.0) -> List[Dict]:
    """Split shots longer than *max_duration* at punctuation marks.

    Sub-shot timing is allocated proportionally to character count.
    Boundaries are chained (each sub-shot starts exactly where the previous
    one ended) and the final sub-shot is pinned to the parent shot's original
    end time, so rounding can never introduce gaps or overlaps between shots.

    Args:
        shots: Shot dicts with "text", "start", "end", "duration".
        max_duration: Maximum allowed shot duration in seconds.

    Returns:
        A new list where each over-long shot is replaced by its sub-shots.
    """
    result = []
    for shot in shots:
        if shot["duration"] <= max_duration:
            result.append(shot)
            continue

        # Split on CJK/ASCII sentence punctuation; the capturing group keeps
        # each delimiter so it can be re-attached to the preceding fragment.
        splits = re.split(r'([,。!?,!?])', shot["text"])
        sub_texts = []
        current = ""
        for i, part in enumerate(splits):
            current += part
            if i % 2 == 1 and current.strip():  # odd indices are delimiters
                sub_texts.append(current.strip())
                current = ""
        if current.strip():
            sub_texts.append(current.strip())

        if len(sub_texts) <= 1:
            # No usable split point; keep the long shot intact.
            result.append(shot)
            continue

        total_chars = sum(len(t) for t in sub_texts)
        prev_end = shot["start"]
        for idx, sub_text in enumerate(sub_texts):
            if idx == len(sub_texts) - 1:
                # Bug fix: pin the final boundary to the original end time;
                # independent rounding used to drift the last end away from it.
                sub_end = shot["end"]
            else:
                ratio = len(sub_text) / total_chars
                sub_end = round(prev_end + shot["duration"] * ratio, 2)
            result.append({
                "text": sub_text,
                "start": round(prev_end, 2),
                "end": sub_end,
                "duration": round(sub_end - prev_end, 2),
            })
            prev_end = sub_end
    return result
def merge_short_shots(shots: List[Dict], min_duration: float = 2.5) -> List[Dict]:
    """Merge shots shorter than *min_duration* into a neighbour.

    A short shot absorbs the following shot (forward merge). Bug fix: if the
    final shot is still shorter than *min_duration* after the sweep, it is
    now folded backward into the last kept shot instead of being emitted
    short (unless it is the only shot).

    Args:
        shots: Shot dicts with "text", "start", "end", "duration".
        min_duration: Minimum acceptable shot duration in seconds.

    Returns:
        A new list of merged shot dicts (input dicts are not mutated).
    """
    if not shots:
        return shots
    merged = []
    current = shots[0].copy()
    for shot in shots[1:]:
        if current["duration"] < min_duration:
            # Forward merge: absorb the next shot into the short one.
            current["text"] += shot["text"]
            current["end"] = shot["end"]
            current["duration"] = round(current["end"] - current["start"], 2)
        else:
            merged.append(current)
            current = shot.copy()
    if current["duration"] < min_duration and merged:
        # Backward merge for a short tail shot (entries in `merged` are
        # copies, so mutating in place is safe).
        last = merged[-1]
        last["text"] += current["text"]
        last["end"] = current["end"]
        last["duration"] = round(last["end"] - last["start"], 2)
    else:
        merged.append(current)
    return merged
def generate_shot_prompts(shots: List[Dict], style: str, context: str = "") -> List[Dict]:
    """Attach an image prompt and a 1-based index to every shot.

    Args:
        shots: Shot list; each dict gains "image_prompt" and "index" keys
            in place.
        style: Art-style description prepended to every prompt.
        context: Extra context (e.g. character descriptions).

    Returns:
        The same list, mutated with the new keys.
    """
    for idx, shot in enumerate(shots, start=1):
        shot["index"] = idx
        shot["image_prompt"] = f"{style}{context},画面:{shot['text']}。禁止出现任何文字"
    return shots
def generate_srt(shots: List[Dict], output_path: str):
    """Write the shots out as an SRT subtitle file at *output_path*."""

    def format_time(seconds: float) -> str:
        # SRT timestamp: HH:MM:SS,mmm (comma-separated milliseconds).
        h = int(seconds // 3600)
        m = int(seconds % 3600 // 60)
        s = int(seconds % 60)
        ms = int(seconds % 1 * 1000)
        return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

    entries = [
        f"{idx}\n"
        f"{format_time(shot['start'])} --> {format_time(shot['end'])}\n"
        f"{shot['text']}\n\n"
        for idx, shot in enumerate(shots, 1)
    ]
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("".join(entries))
    print(f" ✓ 字幕: {output_path}")
def process_scene(text: str, timestamps_path: str, style: str, context: str = "", output_dir: str = ".") -> Dict:
    """Process a single scene and emit its shot configuration files.

    Args:
        text: Narration text of the scene (kept for interface compatibility;
            the shot texts come from the timestamp file).
        timestamps_path: Path to the TTS timestamp JSON file.
        style: Art-style description for image prompts.
        context: Extra context (e.g. character descriptions).
        output_dir: Directory receiving ``subtitles.srt`` and ``shots.json``.

    Returns:
        Dict with the shot list ("shots") and the subtitle path ("srt_path").
    """
    timestamps = json.loads(Path(timestamps_path).read_text(encoding="utf-8"))

    # Pipeline: sentence boundaries -> split long -> merge short -> prompts.
    shots = generate_shot_prompts(
        merge_short_shots(
            split_long_shots(
                split_by_sentence_timestamps(timestamps),
                max_duration=6.0,
            ),
            min_duration=2.5,
        ),
        style,
        context,
    )

    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    srt_path = out_dir / "subtitles.srt"
    generate_srt(shots, str(srt_path))

    config_path = out_dir / "shots.json"
    with config_path.open("w", encoding="utf-8") as f:
        json.dump(shots, f, ensure_ascii=False, indent=2)
    print(f" ✓ 镜头配置: {config_path}")

    return {"shots": shots, "srt_path": str(srt_path)}
def main():
    """CLI entry point: parse arguments, run the splitter, print a summary."""
    parser = argparse.ArgumentParser(description='场景拆分器')
    # (flag, options) pairs keep the argument table compact and ordered.
    for flag, opts in (
        ('--text', dict(type=str, required=True, help='口播文本')),
        ('--timestamps', dict(type=str, required=True, help='TTS时间戳JSON文件')),
        ('--style', dict(type=str, default='', help='画风描述')),
        ('--context', dict(type=str, default='', help='上下文(角色等)')),
        ('--output-dir', dict(type=str, default='.', help='输出目录')),
    ):
        parser.add_argument(flag, **opts)
    args = parser.parse_args()

    result = process_scene(
        text=args.text,
        timestamps_path=args.timestamps,
        style=args.style,
        context=args.context,
        output_dir=args.output_dir,
    )

    shots = result["shots"]
    print(f"\n拆分完成,共 {len(shots)} 个镜头:")
    for shot in shots:
        print(f" [{shot['duration']:.1f}s] {shot['text']}")
# Run the CLI only when executed as a script (not on import).
if __name__ == "__main__":
    main()