Initial commit to git.yoin

This commit is contained in:
hmo
2026-02-11 22:02:47 +08:00
commit cf10ab6473
153 changed files with 14581 additions and 0 deletions

View File

@@ -0,0 +1,221 @@
#!/usr/bin/env python3
"""
Scene splitter - splits narration (voice-over) text into fine-grained shots.
Aligns images and subtitles based on TTS timestamps.
"""
import json
import re
import argparse
from pathlib import Path
from typing import List, Dict
def split_by_sentence_timestamps(timestamps: List[Dict]) -> List[Dict]:
    """Turn TTS SentenceBoundary timestamps directly into shot boundaries.

    Args:
        timestamps: Timestamp entries emitted by the TTS engine; entries
            with ``type == "sentence"`` become one shot each.

    Returns:
        A list of shot dicts with keys ``text``, ``start``, ``end`` and
        ``duration``. If no sentence-level entries exist but the list is
        non-empty, the whole input collapses into a single shot.
    """
    shots = [
        {
            "text": ts["text"],
            "start": ts["start"],
            "end": ts["end"],
            "duration": round(ts["end"] - ts["start"], 2),
        }
        for ts in timestamps
        if ts.get("type") == "sentence"
    ]
    if shots or not timestamps:
        return shots

    # Fallback: no sentence boundaries at all -- treat everything as one shot.
    first_start = timestamps[0]["start"]
    last_end = timestamps[-1]["end"]
    combined = "".join(entry["text"] for entry in timestamps)
    return [
        {
            "text": combined,
            "start": first_start,
            "end": last_end,
            "duration": round(last_end - first_start, 2),
        }
    ]
def split_long_shots(shots: List[Dict], max_duration: float = 6.0) -> List[Dict]:
    """Split shots longer than *max_duration* at punctuation marks.

    Sub-shot timing is allocated proportionally to character count.
    Boundaries are chained (each sub-shot starts exactly where the previous
    one ended) and the final sub-shot is pinned to the parent shot's original
    end time, so rounding can never introduce gaps or overlaps between shots.

    Args:
        shots: Shot dicts with "text", "start", "end", "duration".
        max_duration: Maximum allowed shot duration in seconds.

    Returns:
        A new list where each over-long shot is replaced by its sub-shots.
    """
    result = []
    for shot in shots:
        if shot["duration"] <= max_duration:
            result.append(shot)
            continue

        # Split on CJK/ASCII sentence punctuation; the capturing group keeps
        # each delimiter so it can be re-attached to the preceding fragment.
        splits = re.split(r'([,。!?,!?])', shot["text"])
        sub_texts = []
        current = ""
        for i, part in enumerate(splits):
            current += part
            if i % 2 == 1 and current.strip():  # odd indices are delimiters
                sub_texts.append(current.strip())
                current = ""
        if current.strip():
            sub_texts.append(current.strip())

        if len(sub_texts) <= 1:
            # No usable split point; keep the long shot intact.
            result.append(shot)
            continue

        total_chars = sum(len(t) for t in sub_texts)
        prev_end = shot["start"]
        for idx, sub_text in enumerate(sub_texts):
            if idx == len(sub_texts) - 1:
                # Bug fix: pin the final boundary to the original end time;
                # independent rounding used to drift the last end away from it.
                sub_end = shot["end"]
            else:
                ratio = len(sub_text) / total_chars
                sub_end = round(prev_end + shot["duration"] * ratio, 2)
            result.append({
                "text": sub_text,
                "start": round(prev_end, 2),
                "end": sub_end,
                "duration": round(sub_end - prev_end, 2),
            })
            prev_end = sub_end
    return result
def merge_short_shots(shots: List[Dict], min_duration: float = 2.5) -> List[Dict]:
    """Merge shots shorter than *min_duration* into a neighbour.

    A short shot absorbs the following shot (forward merge). Bug fix: if the
    final shot is still shorter than *min_duration* after the sweep, it is
    now folded backward into the last kept shot instead of being emitted
    short (unless it is the only shot).

    Args:
        shots: Shot dicts with "text", "start", "end", "duration".
        min_duration: Minimum acceptable shot duration in seconds.

    Returns:
        A new list of merged shot dicts (input dicts are not mutated).
    """
    if not shots:
        return shots
    merged = []
    current = shots[0].copy()
    for shot in shots[1:]:
        if current["duration"] < min_duration:
            # Forward merge: absorb the next shot into the short one.
            current["text"] += shot["text"]
            current["end"] = shot["end"]
            current["duration"] = round(current["end"] - current["start"], 2)
        else:
            merged.append(current)
            current = shot.copy()
    if current["duration"] < min_duration and merged:
        # Backward merge for a short tail shot (entries in `merged` are
        # copies, so mutating in place is safe).
        last = merged[-1]
        last["text"] += current["text"]
        last["end"] = current["end"]
        last["duration"] = round(last["end"] - last["start"], 2)
    else:
        merged.append(current)
    return merged
def generate_shot_prompts(shots: List[Dict], style: str, context: str = "") -> List[Dict]:
    """Attach an image prompt and a 1-based index to every shot.

    Args:
        shots: Shot list; each dict gains "image_prompt" and "index" keys
            in place.
        style: Art-style description prepended to every prompt.
        context: Extra context (e.g. character descriptions).

    Returns:
        The same list, mutated with the new keys.
    """
    for idx, shot in enumerate(shots, start=1):
        shot["index"] = idx
        shot["image_prompt"] = f"{style}{context},画面:{shot['text']}。禁止出现任何文字"
    return shots
def generate_srt(shots: List[Dict], output_path: str):
    """Write the shots out as an SRT subtitle file at *output_path*."""

    def format_time(seconds: float) -> str:
        # SRT timestamp: HH:MM:SS,mmm (comma-separated milliseconds).
        h = int(seconds // 3600)
        m = int(seconds % 3600 // 60)
        s = int(seconds % 60)
        ms = int(seconds % 1 * 1000)
        return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

    entries = [
        f"{idx}\n"
        f"{format_time(shot['start'])} --> {format_time(shot['end'])}\n"
        f"{shot['text']}\n\n"
        for idx, shot in enumerate(shots, 1)
    ]
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("".join(entries))
    print(f" ✓ 字幕: {output_path}")
def process_scene(text: str, timestamps_path: str, style: str, context: str = "", output_dir: str = ".") -> Dict:
    """Process a single scene and emit its shot configuration files.

    Args:
        text: Narration text of the scene (kept for interface compatibility;
            the shot texts come from the timestamp file).
        timestamps_path: Path to the TTS timestamp JSON file.
        style: Art-style description for image prompts.
        context: Extra context (e.g. character descriptions).
        output_dir: Directory receiving ``subtitles.srt`` and ``shots.json``.

    Returns:
        Dict with the shot list ("shots") and the subtitle path ("srt_path").
    """
    timestamps = json.loads(Path(timestamps_path).read_text(encoding="utf-8"))

    # Pipeline: sentence boundaries -> split long -> merge short -> prompts.
    shots = generate_shot_prompts(
        merge_short_shots(
            split_long_shots(
                split_by_sentence_timestamps(timestamps),
                max_duration=6.0,
            ),
            min_duration=2.5,
        ),
        style,
        context,
    )

    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    srt_path = out_dir / "subtitles.srt"
    generate_srt(shots, str(srt_path))

    config_path = out_dir / "shots.json"
    with config_path.open("w", encoding="utf-8") as f:
        json.dump(shots, f, ensure_ascii=False, indent=2)
    print(f" ✓ 镜头配置: {config_path}")

    return {"shots": shots, "srt_path": str(srt_path)}
def main():
    """CLI entry point: parse arguments, run the splitter, print a summary."""
    parser = argparse.ArgumentParser(description='场景拆分器')
    # (flag, options) pairs keep the argument table compact and ordered.
    for flag, opts in (
        ('--text', dict(type=str, required=True, help='口播文本')),
        ('--timestamps', dict(type=str, required=True, help='TTS时间戳JSON文件')),
        ('--style', dict(type=str, default='', help='画风描述')),
        ('--context', dict(type=str, default='', help='上下文(角色等)')),
        ('--output-dir', dict(type=str, default='.', help='输出目录')),
    ):
        parser.add_argument(flag, **opts)
    args = parser.parse_args()

    result = process_scene(
        text=args.text,
        timestamps_path=args.timestamps,
        style=args.style,
        context=args.context,
        output_dir=args.output_dir,
    )

    shots = result["shots"]
    print(f"\n拆分完成,共 {len(shots)} 个镜头:")
    for shot in shots:
        print(f" [{shot['duration']:.1f}s] {shot['text']}")
# Run the CLI only when executed as a script (not on import).
if __name__ == "__main__":
    main()