# skills/doc-to-tables/scripts/doc_to_tables.py

#!/usr/bin/env python3
"""
DOC to Tables - Convert Word documents to structured Markdown and HTML tables

Usage:
    python doc_to_tables.py <input.docx> <output_prefix> [options]

Options:
    --three-col-widths WIDTHS    Three-column table widths (default: "20,60,20")
    --two-col-widths WIDTHS      Two-column table widths (default: "70,30")
    --process-teacher-awards     Process and merge teacher awards
    --no-headers                 Generate HTML without table headers
    --help                       Show this help message
"""

import argparse
import html
import os
import re
import subprocess
import sys
from typing import List, Dict, Tuple


def convert_docx_to_markdown(docx_path: str, md_path: str) -> None:
    """Convert a DOCX file to Markdown by running the ``pandoc`` executable.

    Args:
        docx_path: Path of the Word document passed to pandoc as input.
        md_path: Path pandoc is asked to write the Markdown output to.

    On failure a message is printed and the process exits with status 1.
    Failure means either that pandoc could not be started at all (for
    example it is not installed or not on PATH) or that it returned a
    non-zero exit code.
    """
    command = ["pandoc", "--track-changes=all", docx_path, "-o", md_path]
    try:
        subprocess.run(command, check=True)
    except OSError as e:
        # subprocess.run raises OSError (typically FileNotFoundError) when the
        # executable itself cannot be launched; report it like any other
        # conversion failure instead of letting a traceback escape.
        print(f"✗ Error converting DOCX to Markdown: could not run pandoc ({e})")
        sys.exit(1)
    except subprocess.CalledProcessError as e:
        print(f"✗ Error converting DOCX to Markdown: {e}")
        sys.exit(1)
    print(f"✓ Converted {docx_path} to {md_path}")


# One cell of a pipe-table delimiter row: dashes with optional alignment colons.
_SEPARATOR_CELL_RE = re.compile(r":?-{3,}:?")


def _split_table_row(line: str) -> List[str]:
    """Split one pipe-table line into whitespace-stripped cell strings.

    Only the single outer pipe on each side is removed, so an empty first or
    last cell is kept as ``""`` and the remaining columns do not shift.
    """
    row = line.strip()
    if row.startswith("|"):
        row = row[1:]
    if row.endswith("|"):
        row = row[:-1]
    return [cell.strip() for cell in row.split("|")]


def _is_separator_row(cells: List[str]) -> bool:
    """Return True when every cell looks like ``---``, ``:---``, ``---:`` or ``:---:``."""
    return all(_SEPARATOR_CELL_RE.fullmatch(cell) for cell in cells)


def parse_markdown_content(md_content: str) -> List[Dict]:
    """Parse Markdown text into a list of section dictionaries.

    A new section starts at every line whose stripped text begins with
    ``####``. The title is the first ``**bold**`` span on that line, or the
    line with its surrounding ``#`` and spaces removed when there is none.

    Within a section:
      * every pipe-table row with at least three cells becomes an entry
        ``{"student", "award", "teacher"}`` taken from the first three cells
        (delimiter rows such as ``|---|:---:|`` are skipped);
      * every other line containing both ``**`` and ``:`` is stored, stripped,
        in ``teacher_awards``.

    Table rows and award lines that appear before the first section header
    are ignored.

    Args:
        md_content: Full Markdown text.

    Returns:
        A list of dicts with keys ``title`` (str), ``entries`` (list of dict)
        and ``teacher_awards`` (list of str), in document order.
    """
    sections: List[Dict] = []
    current_section = None

    for line in md_content.split("\n"):
        stripped = line.strip()

        if stripped.startswith("####"):
            # Close the section being built before opening the next one.
            if current_section:
                sections.append(current_section)
            title_match = re.search(r"\*\*(.*?)\*\*", line)
            title = title_match.group(1) if title_match else line.strip("# ").strip()
            current_section = {"title": title, "entries": [], "teacher_awards": []}
        elif stripped.startswith("|"):
            cells = _split_table_row(stripped)
            if _is_separator_row(cells):
                continue
            if len(cells) >= 3 and current_section:
                current_section["entries"].append(
                    {"student": cells[0], "award": cells[1], "teacher": cells[2]}
                )
        elif "**" in line and ":" in line and current_section:
            current_section["teacher_awards"].append(stripped)

    if current_section:
        sections.append(current_section)

    return sections


def generate_markdown_tables(sections: List[Dict]) -> str:
    """Render parsed sections as Markdown text.

    Each section contributes a ``#### **title**`` heading followed by a blank
    line. If the section has entries, a three-column pipe table (fixed header
    row, delimiter row, one row per entry) and a blank line follow. Every
    teacher-award line is then emitted, each followed by a blank line.

    Args:
        sections: Dicts with ``title``, ``entries`` (dicts holding
            ``student``, ``award``, ``teacher``) and ``teacher_awards``.

    Returns:
        All output lines joined with ``"\\n"``; an empty string for no sections.
    """
    table_head = ["|获奖学生|组别和奖项|指导老师|", "|---|---|---|"]
    out_lines: List[str] = []

    for sec in sections:
        out_lines += [f"#### **{sec['title']}**", ""]

        rows = sec["entries"]
        if rows:
            out_lines += table_head
            out_lines += [
                f"|{row['student']}|{row['award']}|{row['teacher']}|" for row in rows
            ]
            out_lines.append("")

        for award_line in sec["teacher_awards"]:
            out_lines += [award_line, ""]

    return "\n".join(out_lines)


def _parse_widths(spec: str, count: int) -> List[int]:
    """Parse a comma-separated width spec such as ``"20,60,20"`` into ints.

    Returns the first ``count`` values; any extra values are ignored.

    Raises:
        ValueError: if a value is not an integer or fewer than ``count``
            values are given.
    """
    try:
        widths = [int(part) for part in spec.split(",")]
    except ValueError as exc:
        raise ValueError(
            f"column widths must be comma-separated integers, got {spec!r}"
        ) from exc
    if len(widths) < count:
        raise ValueError(
            f"expected {count} column widths, got {len(widths)}: {spec!r}"
        )
    return widths[:count]


def _table_css(class_name: str, widths: List[int]) -> List[str]:
    """Return the stylesheet lines for one table class and its column widths."""
    css = [
        f"        .{class_name} {{",
        "            width: 100%;",
        "            border-collapse: collapse;",
        "            margin: 10px 0 5px 0;",
        "            table-layout: fixed;",
        "        }",
        f"        .{class_name} td {{",
        "            border: 1px solid #333;",
        "            padding: 8px;",
        "            text-align: center;",
        "            vertical-align: middle;",
        "            font-size: 14px;",
        "        }",
        f"        .{class_name} tr:nth-child(even) {{",
        "            background-color: #f9f9f9;",
        "        }",
    ]
    css.extend(
        f"        .{class_name} col:nth-child({index}) {{ width: {width}%; }}"
        for index, width in enumerate(widths, start=1)
    )
    return css


def _render_table(class_name: str, column_count: int, rows: List[List[str]]) -> List[str]:
    """Return the HTML lines of one table; cell text is HTML-escaped here."""
    parts = [
        f'<table class="{class_name}">',
        "    <colgroup>",
        "        " + "<col>" * column_count,
        "    </colgroup>",
        "    <tbody>",
    ]
    for row in rows:
        cells = "".join(
            f"<td>{html.escape(str(cell), quote=False)}</td>" for cell in row
        )
        parts.append(f"        <tr>{cells}</tr>")
    parts.extend(["    </tbody>", "</table>"])
    return parts


def generate_html_tables(
    sections: List[Dict],
    three_col_widths: str = "20,60,20",
    two_col_widths: str = "70,30",
    no_headers: bool = False,
) -> str:
    """Render parsed sections as a complete HTML document.

    Each section becomes an ``<h4>`` heading, an optional table of its
    entries, and one ``<p class="teacher-award">`` per teacher-award line.
    A section is rendered as a three-column table (student, award, teacher)
    when at least one entry has a non-empty ``teacher`` value; otherwise as
    a two-column table (award, student).

    All text taken from ``sections`` is HTML-escaped, so characters such as
    ``<`` and ``&`` in the source document cannot break the markup.

    Args:
        sections: Dicts with ``title``, ``entries`` (dicts holding
            ``student``, ``award``, ``teacher``) and ``teacher_awards``.
        three_col_widths: Comma-separated percentage widths for the
            three-column table.
        two_col_widths: Comma-separated percentage widths for the two-column
            table.
        no_headers: Accepted for interface compatibility. This function never
            emits table header rows, so the flag currently has no effect.

    Returns:
        The HTML document as a single string with lines joined by ``"\\n"``.

    Raises:
        ValueError: if a width spec contains a non-integer value or too few
            values.
    """
    three_widths = _parse_widths(three_col_widths, 3)
    two_widths = _parse_widths(two_col_widths, 2)

    html_parts = [
        "<!DOCTYPE html>",
        '<html lang="zh-CN">',
        "<head>",
        '    <meta charset="UTF-8">',
        '    <meta name="viewport" content="width=device-width, initial-scale=1.0">',
        "    <title>Structured Tables</title>",
        "    <style>",
        "        body {",
        '            font-family: "Microsoft YaHei", "SimHei", "Arial", sans-serif;',
        "            margin: 20px;",
        "            line-height: 1.6;",
        "        }",
        "        h4 {",
        "            text-align: center;",
        "            color: #333;",
        "            margin: 30px 0 15px 0;",
        "            font-size: 18px;",
        "            border-bottom: 2px solid #333;",
        "            padding-bottom: 8px;",
        "        }",
        *_table_css("table-3col", three_widths),
        *_table_css("table-2col", two_widths),
        "        .teacher-award {",
        "            font-size: 12px;",
        "            margin: 0 0 10px 0;",
        "            text-align: center;",
        "            color: #666;",
        "        }",
        "        .subtitle {",
        "            text-align: center;",
        "            margin: 10px 0;",
        "            font-weight: bold;",
        "            color: #333;",
        "        }",
        "    </style>",
        "</head>",
        "<body>",
    ]

    for section in sections:
        title = html.escape(str(section["title"]), quote=False)
        html_parts.append(f"<h4>{title}</h4>")

        entries = section["entries"]
        if entries:
            # Any non-empty teacher value switches the whole section to the
            # three-column layout.
            if any(entry.get("teacher") for entry in entries):
                rows = [
                    [entry["student"], entry["award"], entry.get("teacher") or ""]
                    for entry in entries
                ]
                html_parts.extend(_render_table("table-3col", 3, rows))
            else:
                # The two-column layout puts the award first, then the student.
                rows = [[entry["award"], entry["student"]] for entry in entries]
                html_parts.extend(_render_table("table-2col", 2, rows))

        for award in section["teacher_awards"]:
            award_text = html.escape(str(award), quote=False)
            html_parts.append(f'<p class="teacher-award">{award_text}</p>')

    html_parts.extend(["</body>", "</html>"])
    return "\n".join(html_parts)


def main():
    """Command-line entry point.

    Parses the arguments, converts the input DOCX to a temporary Markdown
    file with pandoc, parses that Markdown into sections, and writes
    ``<output_prefix>_md.md`` and ``<output_prefix>_html.html``.

    Exits with status 1 when the input file does not exist or the column
    width options are invalid. The temporary Markdown file is removed even
    when a step fails.

    Note: ``--process-teacher-awards`` is parsed but not read anywhere in
    this function.
    """
    parser = argparse.ArgumentParser(
        description="Convert Word documents to structured tables"
    )
    parser.add_argument("input_docx", help="Input DOCX file path")
    parser.add_argument("output_prefix", help="Output file prefix")
    parser.add_argument(
        "--three-col-widths",
        default="20,60,20",
        help="Three-column table widths (default: 20,60,20)",
    )
    parser.add_argument(
        "--two-col-widths",
        default="70,30",
        help="Two-column table widths (default: 70,30)",
    )
    parser.add_argument(
        "--process-teacher-awards",
        action="store_true",
        help="Process and merge teacher awards",
    )
    parser.add_argument(
        "--no-headers", action="store_true", help="Generate HTML without table headers"
    )
    # No explicit "--help" argument: ArgumentParser registers -h/--help itself,
    # and adding it a second time raises ArgumentError (conflicting option).

    args = parser.parse_args()

    # Validate input file
    if not os.path.exists(args.input_docx):
        print(f"✗ Input file not found: {args.input_docx}")
        sys.exit(1)

    # Create output directory if needed
    output_dir = os.path.dirname(args.output_prefix) or "."
    os.makedirs(output_dir, exist_ok=True)

    # Steps 1-2: convert DOCX to a temporary Markdown file and read it back.
    temp_md = args.output_prefix + "_temp.md"
    try:
        convert_docx_to_markdown(args.input_docx, temp_md)
        with open(temp_md, "r", encoding="utf-8") as f:
            md_content = f.read()
    finally:
        # Also runs on SystemExit or an exception, so the temp file never leaks.
        if os.path.exists(temp_md):
            os.remove(temp_md)

    sections = parse_markdown_content(md_content)

    # Render both outputs before writing, so an invalid width option does not
    # leave a half-finished set of output files behind.
    structured_md = generate_markdown_tables(sections)
    try:
        html_content = generate_html_tables(
            sections, args.three_col_widths, args.two_col_widths, args.no_headers
        )
    except (ValueError, IndexError) as e:
        print(f"✗ Invalid column widths: {e}")
        sys.exit(1)

    # Step 3: Write structured Markdown
    md_output = args.output_prefix + "_md.md"
    with open(md_output, "w", encoding="utf-8") as f:
        f.write(structured_md)
    print(f"✓ Generated structured Markdown: {md_output}")

    # Step 4: Write HTML
    html_output = args.output_prefix + "_html.html"
    with open(html_output, "w", encoding="utf-8") as f:
        f.write(html_content)
    print(f"✓ Generated HTML tables: {html_output}")

    print("✓ Process completed successfully!")


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()