# skills/audio-generator/scripts/generate_audio.py

#!/usr/bin/env python3
"""
Audio Generator Skill
文本转音频生成工具

功能：
- 支持Markdown和纯文本两种格式
- 自动分割长文本为章节
- 使用edge-tts生成高质量中文语音
- 支持批量生成和增量更新

依赖：
- pip install edge-tts

使用：
    python scripts/generate_audio.py <input_file> [options]

示例：
    # 生成纯文本音频
    python scripts/generate_audio.py text.txt --format plain --output-dir ./audio

    # 生成Markdown音频（按标题分割）
    python scripts/generate_audio.py doc.md --format markdown --output-dir ./audio

    # 只生成特定章节
    python scripts/generate_audio.py text.txt --chapters "第一章,第二章"
"""

import sys
import io

# Force UTF-8 on the standard streams so Chinese text prints correctly on
# consoles whose default code page is not UTF-8. reconfigure() changes the
# encoding in place and keeps the stream's existing line-buffering setting, so
# progress lines still appear as they are printed; replacing the stream with a
# fresh TextIOWrapper would drop line buffering and delay output.
for _stream in (sys.stdout, sys.stderr):
    # Streams that were replaced (or are None) may lack reconfigure(); skip them.
    if hasattr(_stream, "reconfigure"):
        _stream.reconfigure(encoding="utf-8")

import asyncio
import argparse
import edge_tts
import os
from pathlib import Path

# Default configuration
DEFAULT_VOICE = "zh-CN-XiaoxiaoNeural"  # Xiaoxiao voice; suited to long-form narration
DEFAULT_FORMAT = "plain"
DEFAULT_OUTPUT_DIR = "./audio_output"
MIN_SECTION_LENGTH = 200  # minimum section length, in characters


def clean_filename(title: str, max_length: int = 30) -> str:
    """Turn a section title into a string usable as part of a file name.

    Each of the characters ``/ \\ : ? " < > | * #`` is replaced with ``-``.
    The result is then cut to ``max_length`` characters and surrounding
    whitespace is trimmed.
    """
    substitutions = str.maketrans({ch: "-" for ch in '/\\:?"<>|*#'})
    sanitized = title.translate(substitutions)
    truncated = sanitized[:max_length]
    return truncated.strip()


def _heading_title(line: str) -> str:
    """Extract the title text from a Markdown heading line such as ``## Title``."""
    title = line.lstrip("#").strip()
    # An optional closing sequence ("## Title ##") is dropped, but only when
    # the trailing hashes are separated from the text by whitespace (or the
    # title consists solely of hashes). A hash that belongs to the text,
    # as in "C#", is kept.
    without_closing = title.rstrip("#")
    if without_closing != title and (
        not without_closing or without_closing[-1].isspace()
    ):
        title = without_closing.rstrip()
    return title


def split_by_markdown(content: str) -> list:
    """Split Markdown text into sections at heading lines.

    A line opens a new section when it starts with ``##`` and is longer than
    two characters, so level-1 headings (single ``#``) do not split. The
    heading line itself is kept as the first line of its section. Text that
    precedes the first heading is collected under the title ``未命名章节``.

    Args:
        content: The Markdown document as one string.

    Returns:
        A list of ``(title, text)`` tuples in document order.
    """
    sections = []
    current_section = []
    current_title = "未命名章节"

    for line in content.split("\n"):
        if line.startswith("##") and len(line) > 2:
            # Close the section collected so far before starting the new one.
            if current_section:
                sections.append((current_title, "\n".join(current_section)))
            # Strip only the heading markers; hashes inside the title stay.
            current_title = _heading_title(line)
            current_section = [line]
        else:
            current_section.append(line)

    if current_section:
        sections.append((current_title, "\n".join(current_section)))

    return sections


def split_by_separator(content: str, separator: str = "===") -> list:
    """Split text into titled sections using divider lines and 【title】 markers.

    Three kinds of lines are recognised:

    * A divider: starts with ``separator`` and is longer than 10 characters.
      It closes the section collected so far (if any) and is not kept.
    * A title marker: starts with ``【`` and ends with ``】``. If the text
      collected so far is longer than 100 characters it is closed under the
      previous title; shorter text is carried over into the new title. The
      marker line itself is not kept.
    * Anything else is added to the section being collected.

    The title in effect before any marker is seen is ``开场``.

    Returns:
        A list of ``(title, text)`` tuples in document order.
    """
    bracket_remover = str.maketrans("", "", "【】")
    result = []
    buffer = []
    title = "开场"

    for row in content.split("\n"):
        if row.startswith(separator) and len(row) > 10:
            # Divider line: emit whatever has been gathered, drop the divider.
            if buffer:
                result.append((title, "\n".join(buffer)))
                buffer = []
            continue

        if not (row.startswith("【") and row.endswith("】")):
            # Ordinary content line.
            buffer.append(row)
            continue

        # Title marker: only a sufficiently long pending section is closed.
        pending_text = "\n".join(buffer)
        if len(pending_text) > 100:
            result.append((title, pending_text))
            buffer = []
        title = row.translate(bracket_remover).strip()

    if buffer:
        result.append((title, "\n".join(buffer)))

    return result


def split_plain_text(content: str, chunk_size: int = 2000) -> list:
    """Group the lines of plain text into numbered parts of roughly equal size.

    Lines are accumulated until the summed line lengths (newlines not
    counted) reach ``chunk_size``; the accumulated lines then form one part.
    Lines are never broken, so a part may exceed ``chunk_size``. Any
    remaining lines form the last part.

    Returns:
        A list of ``(title, text)`` tuples titled ``第1部分``, ``第2部分``, ...
    """
    parts = []
    pending = []
    pending_chars = 0

    for row in content.split("\n"):
        pending.append(row)
        pending_chars += len(row)
        if pending_chars < chunk_size:
            continue
        # Threshold reached: the part number is one more than the parts so far.
        parts.append((f"第{len(parts) + 1}部分", "\n".join(pending)))
        pending, pending_chars = [], 0

    if pending:
        parts.append((f"第{len(parts) + 1}部分", "\n".join(pending)))

    return parts


async def generate_audio_file(text: str, output_path: str, voice: str) -> bool:
    """Synthesize ``text`` with edge-tts and save the audio to ``output_path``.

    Any exception raised during synthesis or saving is caught; the first 80
    characters of its message are printed and ``False`` is returned.

    Returns:
        ``True`` when the file was saved, ``False`` otherwise.
    """
    try:
        await edge_tts.Communicate(text, voice).save(output_path)
    except Exception as err:
        reason = str(err)[:80]
        print(f"  生成失败: {reason}")
        return False
    return True


async def generate_audio_files(
    sections: list, output_dir: str, voice: str, filter_chapters=None
) -> tuple:
    """Generate one MP3 per section, sequentially, and report progress.

    Sections whose stripped text is shorter than ``MIN_SECTION_LENGTH`` are
    skipped, as are sections whose title is not in ``filter_chapters`` when
    that filter is non-empty. File names are prefixed with the section's
    1-based position in ``sections``, so skipped sections leave gaps in the
    numbering.

    Args:
        sections: ``(title, text)`` tuples.
        output_dir: Directory for the MP3 files; created if missing.
        voice: Voice name passed through to ``generate_audio_file``.
        filter_chapters: Optional collection of titles to generate.

    Returns:
        ``(generated, total_chars)``: the number of files written and the
        summed stripped length of their source text.
    """
    os.makedirs(output_dir, exist_ok=True)

    section_count = len(sections)
    done_count = 0
    spoken_chars = 0

    for number, (title, content) in enumerate(sections, start=1):
        char_count = len(content.strip())

        # Too short to be worth synthesizing.
        if char_count < MIN_SECTION_LENGTH:
            continue
        # Not among the explicitly requested chapters.
        if filter_chapters and title not in filter_chapters:
            continue

        file_name = f"{number:02d}_{clean_filename(title)}.mp3"
        target = os.path.join(output_dir, file_name)

        print(f"[{number}/{section_count}] {title[:40]}...")
        print(f"    字数: {char_count}")

        if await generate_audio_file(content, target, voice):
            done_count += 1
            spoken_chars += char_count
            megabytes = os.path.getsize(target) / 1024 / 1024
            print(f"    ✓ 完成 [{megabytes:.1f}MB]")
        else:
            print("    ✗ 失败")
        print()

    return done_count, spoken_chars


def main():
    """Command-line entry point: read a text file and generate audio for it.

    Parses the arguments, reads the input file, splits it into sections
    according to ``--format``, generates one audio file per section and
    prints a summary. Exits with status 1 when the input path is not an
    existing regular file.
    """
    parser = argparse.ArgumentParser(
        description="文本转音频生成工具",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
示例:
  %(prog)s input.txt                           # 默认生成纯文本音频
  %(prog)s doc.md --format markdown            # Markdown格式
  %(prog)s text.txt --voice zh-CN-YunxiNeural  # 使用男声
  %(prog)s text.txt --output-dir ./my_audio    # 指定输出目录
        """,
    )

    parser.add_argument("input_file", help="输入文本文件路径")
    parser.add_argument(
        "--format",
        choices=["plain", "markdown", "separator"],
        default=DEFAULT_FORMAT,
        help="文本格式 (默认: plain)",
    )
    parser.add_argument(
        "--output-dir",
        default=DEFAULT_OUTPUT_DIR,
        help=f"输出目录 (默认: {DEFAULT_OUTPUT_DIR})",
    )
    parser.add_argument(
        "--voice", default=DEFAULT_VOICE, help=f"语音模型 (默认: {DEFAULT_VOICE})"
    )
    parser.add_argument("--chapters", help="只生成指定章节，用逗号分隔")

    args = parser.parse_args()

    # Require a regular file: a directory would pass a plain existence check
    # and then fail inside open() with a traceback.
    if not os.path.isfile(args.input_file):
        print(f"错误: 找不到文件 {args.input_file}")
        sys.exit(1)

    # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading BOM. With
    # plain "utf-8" the BOM stays on the first line and prevents a heading or
    # 【title】 marker there from being recognised.
    print(f"正在读取: {args.input_file}")
    with open(args.input_file, "r", encoding="utf-8-sig") as f:
        content = f.read()

    print(f"文件大小: {len(content)} 字符")
    print()

    # Split into sections according to the selected format.
    if args.format == "markdown":
        sections = split_by_markdown(content)
    elif args.format == "separator":
        sections = split_by_separator(content)
    else:
        sections = split_plain_text(content)

    print(f"共识别 {len(sections)} 个章节/段落")
    print(f"使用语音: {args.voice}")
    print(f"输出目录: {args.output_dir}")
    print()

    # Optional chapter filter; an empty list means "generate everything".
    filter_chapters = []
    if args.chapters:
        filter_chapters = [c.strip() for c in args.chapters.split(",")]
        print(f"只生成章节: {', '.join(filter_chapters)}")
        print()

    print("开始生成音频文件...")
    print("=" * 60)

    generated, total_chars = asyncio.run(
        generate_audio_files(sections, args.output_dir, args.voice, filter_chapters)
    )

    print("=" * 60)
    print("音频生成完成！")
    print(f"共生成 {generated} 个音频文件")
    print(f"总字数: {total_chars}")
    # Duration estimate uses a fixed rate of 250 characters per minute.
    print(f"预计总时长: {total_chars // 250} 分钟")
    print(f"保存位置: {args.output_dir}")
    print("=" * 60)


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()