Files
skills/doc-to-tables/scripts/doc_to_tables.py
T
hmo 04db423416 Initial commit: skills library
- 70 skills with code and documentation
- Add .gitignore (ignore __pycache__, output/, temp/, venv/)
- Clean up test intermediates and caches
2026-04-26 19:27:40 +08:00

295 lines
10 KiB
Python

#!/usr/bin/env python3
"""
DOC to Tables - Convert Word documents to structured Markdown and HTML tables
Usage:
python doc_to_tables.py <input.docx> <output_prefix> [options]
Options:
--three-col-widths WIDTHS Three-column table widths (default: "20,60,20")
--two-col-widths WIDTHS Two-column table widths (default: "70,30")
--process-teacher-awards Process and merge teacher awards
--no-headers Generate HTML without table headers
--help Show this help message
"""
import argparse
import os
import re
import subprocess
import sys
from typing import List, Dict, Tuple
def convert_docx_to_markdown(docx_path: str, md_path: str) -> None:
    """Convert a DOCX file to Markdown by shelling out to ``pandoc``.

    Runs ``pandoc --track-changes=all <docx_path> -o <md_path>`` and prints a
    one-line status message.

    Args:
        docx_path: Path of the Word document to convert.
        md_path: Path that pandoc is asked to write the Markdown output to.

    Raises:
        SystemExit: With exit code 1 when pandoc cannot be launched (for
            example, it is not installed) or when it exits with a non-zero
            status.
    """
    command = ["pandoc", "--track-changes=all", docx_path, "-o", md_path]
    try:
        subprocess.run(command, check=True)
    except OSError as e:
        # Raised when the executable itself cannot be started (missing from
        # PATH, not executable, ...). Report it like any other conversion
        # failure instead of leaking a traceback.
        print(
            "✗ Error converting DOCX to Markdown: "
            f"could not run pandoc ({e}). Is pandoc installed and on PATH?"
        )
        sys.exit(1)
    except subprocess.CalledProcessError as e:
        print(f"✗ Error converting DOCX to Markdown: {e}")
        sys.exit(1)
    print(f"✓ Converted {docx_path} to {md_path}")
# One cell of a pipe-table delimiter row: dashes with optional alignment colons.
_SEPARATOR_CELL_RE = re.compile(r":?-{3,}:?")


def _split_table_row(row: str) -> List[str]:
    """Split one pipe-table row into stripped cells, keeping empty cells."""
    inner = row.strip()
    # Remove only the single outer pipe on each side. Stripping every leading
    # and trailing "|" would also discard empty first/last cells and shift or
    # shorten the row.
    if inner.startswith("|"):
        inner = inner[1:]
    if inner.endswith("|"):
        inner = inner[:-1]
    return [cell.strip() for cell in inner.split("|")]


def parse_markdown_content(md_content: str) -> List[Dict]:
    """Parse Markdown text into a list of sections with table entries.

    A line whose stripped form starts with ``####`` opens a new section. Its
    title is the first ``**bold**`` span on the line, or the line with
    surrounding ``#`` and spaces removed when there is no bold span.

    Inside a section, every pipe-table row with at least three cells becomes an
    entry (``student``, ``award``, ``teacher`` taken from the first three
    cells). Delimiter rows such as ``|---|---|`` or ``|:---|:---:|---:|`` and
    rows whose cells are all empty are skipped. Any other line containing both
    ``**`` and ``:`` is stored verbatim (stripped) as a teacher award.

    Table rows and award lines that appear before the first section header are
    ignored.

    Args:
        md_content: Markdown text to parse.

    Returns:
        A list of dicts with keys ``title`` (str), ``entries`` (list of dicts
        with ``student``/``award``/``teacher``) and ``teacher_awards`` (list of
        str), in document order.
    """
    sections: List[Dict] = []
    current_section = None

    for line in md_content.split("\n"):
        stripped = line.strip()
        # Lines starting with "|---" are never treated as rows; like any other
        # non-row line they fall through to the teacher-award check below.
        is_table_row = stripped.startswith("|") and not stripped.startswith("|---")

        if stripped.startswith("####"):
            if current_section is not None:
                sections.append(current_section)
            title_match = re.search(r"\*\*(.*?)\*\*", line)
            title = title_match.group(1) if title_match else line.strip("# ").strip()
            current_section = {"title": title, "entries": [], "teacher_awards": []}
        elif is_table_row:
            cells = _split_table_row(stripped)
            if all(_SEPARATOR_CELL_RE.fullmatch(cell) for cell in cells):
                # Aligned or spaced delimiter row, e.g. "|:---|" or "| --- |".
                continue
            if len(cells) < 3 or not any(cells):
                # Too few columns, or a completely blank row.
                continue
            if current_section is not None:
                current_section["entries"].append(
                    {"student": cells[0], "award": cells[1], "teacher": cells[2]}
                )
        elif "**" in line and ":" in line and current_section is not None:
            current_section["teacher_awards"].append(stripped)

    if current_section is not None:
        sections.append(current_section)
    return sections
def generate_markdown_tables(sections: List[Dict]) -> str:
    """Render parsed sections as Markdown headings, tables and award lines.

    For each section this emits a ``#### **title**`` heading followed by a
    blank line. When the section has entries, a three-column pipe table
    (fixed header row, delimiter row, one row per entry) and a trailing blank
    line follow. Each teacher-award line is then emitted followed by a blank
    line.

    Args:
        sections: Section dicts with ``title``, ``entries`` (dicts with
            ``student``, ``award``, ``teacher``) and ``teacher_awards``.

    Returns:
        The generated lines joined with newlines; an empty string when
        ``sections`` is empty.
    """
    table_head = ("|获奖学生|组别和奖项|指导老师|", "|---|---|---|")
    out_lines: List[str] = []

    for sec in sections:
        out_lines += [f"#### **{sec['title']}**", ""]

        rows = sec["entries"]
        if rows:
            out_lines.extend(table_head)
            out_lines.extend(
                f"|{row['student']}|{row['award']}|{row['teacher']}|" for row in rows
            )
            out_lines.append("")

        for note in sec["teacher_awards"]:
            out_lines += [note, ""]

    return "\n".join(out_lines)
def _parse_column_widths(spec: str, expected: int, name: str) -> List[int]:
    """Parse a comma-separated width spec into at least ``expected`` integers."""
    try:
        widths = [int(part) for part in spec.split(",")]
    except ValueError:
        raise ValueError(
            f"{name} must be comma-separated integers, got {spec!r}"
        ) from None
    if len(widths) < expected:
        raise ValueError(
            f"{name} needs {expected} comma-separated values, got {spec!r}"
        )
    return widths


def generate_html_tables(
    sections: List[Dict],
    three_col_widths: str = "20,60,20",
    two_col_widths: str = "70,30",
    no_headers: bool = False,
) -> str:
    """Render parsed sections as a standalone HTML document.

    Each section becomes an ``<h4>`` title. A section with entries gets one
    table: three columns (student, award, teacher) when any entry has a
    non-empty ``teacher``, otherwise two columns (award, student). Each
    teacher-award line is emitted as a ``<p class="teacher-award">``.

    All text taken from ``sections`` is HTML-escaped before insertion so that
    characters such as ``&`` and ``<`` cannot corrupt the markup.

    Args:
        sections: Section dicts with ``title``, ``entries`` (dicts with
            ``student``, ``award``, ``teacher``) and ``teacher_awards``.
        three_col_widths: Comma-separated integer percentages for the three
            columns of the three-column layout; values beyond the third are
            ignored.
        two_col_widths: Comma-separated integer percentages for the two
            columns of the two-column layout; values beyond the second are
            ignored.
        no_headers: Accepted for interface compatibility. This function does
            not read it; the generated tables never contain header rows.

    Returns:
        The complete HTML document as a single newline-joined string.

    Raises:
        ValueError: If a width spec contains a non-integer value or has fewer
            values than the layout needs.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    def text(value) -> str:
        # Values only land in element content, so quotes need no escaping.
        return escape(str(value), quote=False)

    three_widths = _parse_column_widths(three_col_widths, 3, "three_col_widths")
    two_widths = _parse_column_widths(two_col_widths, 2, "two_col_widths")

    html_parts = [
        "<!DOCTYPE html>",
        '<html lang="zh-CN">',
        "<head>",
        ' <meta charset="UTF-8">',
        ' <meta name="viewport" content="width=device-width, initial-scale=1.0">',
        " <title>Structured Tables</title>",
        " <style>",
        " body {",
        ' font-family: "Microsoft YaHei", "SimHei", "Arial", sans-serif;',
        " margin: 20px;",
        " line-height: 1.6;",
        " }",
        " h4 {",
        " text-align: center;",
        " color: #333;",
        " margin: 30px 0 15px 0;",
        " font-size: 18px;",
        " border-bottom: 2px solid #333;",
        " padding-bottom: 8px;",
        " }",
        " .table-3col {",
        " width: 100%;",
        " border-collapse: collapse;",
        " margin: 10px 0 5px 0;",
        " table-layout: fixed;",
        " }",
        " .table-3col td {",
        " border: 1px solid #333;",
        " padding: 8px;",
        " text-align: center;",
        " vertical-align: middle;",
        " font-size: 14px;",
        " }",
        " .table-3col tr:nth-child(even) {",
        " background-color: #f9f9f9;",
        " }",
        f" .table-3col col:nth-child(1) {{ width: {three_widths[0]}%; }}",
        f" .table-3col col:nth-child(2) {{ width: {three_widths[1]}%; }}",
        f" .table-3col col:nth-child(3) {{ width: {three_widths[2]}%; }}",
        " .table-2col {",
        " width: 100%;",
        " border-collapse: collapse;",
        " margin: 10px 0 5px 0;",
        " table-layout: fixed;",
        " }",
        " .table-2col td {",
        " border: 1px solid #333;",
        " padding: 8px;",
        " text-align: center;",
        " vertical-align: middle;",
        " font-size: 14px;",
        " }",
        " .table-2col tr:nth-child(even) {",
        " background-color: #f9f9f9;",
        " }",
        f" .table-2col col:nth-child(1) {{ width: {two_widths[0]}%; }}",
        f" .table-2col col:nth-child(2) {{ width: {two_widths[1]}%; }}",
        " .teacher-award {",
        " font-size: 12px;",
        " margin: 0 0 10px 0;",
        " text-align: center;",
        " color: #666;",
        " }",
        " .subtitle {",
        " text-align: center;",
        " margin: 10px 0;",
        " font-weight: bold;",
        " color: #333;",
        " }",
        " </style>",
        "</head>",
        "<body>",
    ]

    for section in sections:
        html_parts.append(f"<h4>{text(section['title'])}</h4>")

        entries = section["entries"]
        if entries:
            # Three columns as soon as any row names a teacher; otherwise the
            # teacher column is dropped and award/student are shown.
            is_three_col = any(entry.get("teacher") for entry in entries)
            if is_three_col:
                html_parts.append('<table class="table-3col">')
                html_parts.append(" <colgroup>")
                html_parts.append(" <col><col><col>")
                html_parts.append(" </colgroup>")
                html_parts.append(" <tbody>")
                for entry in entries:
                    html_parts.append(
                        f" <tr><td>{text(entry['student'])}</td>"
                        f"<td>{text(entry['award'])}</td>"
                        f"<td>{text(entry['teacher'])}</td></tr>"
                    )
            else:
                html_parts.append('<table class="table-2col">')
                html_parts.append(" <colgroup>")
                html_parts.append(" <col><col>")
                html_parts.append(" </colgroup>")
                html_parts.append(" <tbody>")
                for entry in entries:
                    html_parts.append(
                        f" <tr><td>{text(entry['award'])}</td>"
                        f"<td>{text(entry['student'])}</td></tr>"
                    )
            html_parts.append(" </tbody>")
            html_parts.append("</table>")

        for award in section["teacher_awards"]:
            html_parts.append(f'<p class="teacher-award">{text(award)}</p>')

    html_parts.extend(["</body>", "</html>"])
    return "\n".join(html_parts)
def main(argv=None):
    """Command-line entry point: DOCX -> structured Markdown and HTML files.

    Steps: validate the input path, create the output directory, convert the
    DOCX to a temporary Markdown file, parse it, then write
    ``<output_prefix>_md.md`` and ``<output_prefix>_html.html``. The temporary
    file ``<output_prefix>_temp.md`` is removed afterwards, including when a
    later step fails.

    ``--process-teacher-awards`` is parsed but not used by this function.
    ``--no-headers`` is forwarded to ``generate_html_tables``.

    Args:
        argv: Optional list of command-line arguments (without the program
            name). ``None`` makes argparse read ``sys.argv[1:]``.

    Raises:
        SystemExit: On ``-h/--help``, on argument errors, and with exit code 1
            when the input file does not exist.
    """
    parser = argparse.ArgumentParser(
        description="Convert Word documents to structured tables"
    )
    parser.add_argument("input_docx", help="Input DOCX file path")
    parser.add_argument("output_prefix", help="Output file prefix")
    parser.add_argument(
        "--three-col-widths",
        default="20,60,20",
        help="Three-column table widths (default: 20,60,20)",
    )
    parser.add_argument(
        "--two-col-widths",
        default="70,30",
        help="Two-column table widths (default: 70,30)",
    )
    parser.add_argument(
        "--process-teacher-awards",
        action="store_true",
        help="Process and merge teacher awards",
    )
    parser.add_argument(
        "--no-headers", action="store_true", help="Generate HTML without table headers"
    )
    # No explicit "--help" argument: ArgumentParser registers -h/--help itself,
    # and adding it again raises argparse.ArgumentError (conflicting option
    # string) before any argument is parsed.
    args = parser.parse_args(argv)

    # Validate input file
    if not os.path.exists(args.input_docx):
        print(f"✗ Input file not found: {args.input_docx}")
        sys.exit(1)

    # Create output directory if needed
    output_dir = os.path.dirname(args.output_prefix) or "."
    os.makedirs(output_dir, exist_ok=True)

    temp_md = args.output_prefix + "_temp.md"
    try:
        # Step 1: Convert DOCX to Markdown
        convert_docx_to_markdown(args.input_docx, temp_md)

        # Step 2: Read and parse Markdown
        with open(temp_md, "r", encoding="utf-8") as f:
            md_content = f.read()
        sections = parse_markdown_content(md_content)

        # Step 3: Generate structured Markdown
        structured_md = generate_markdown_tables(sections)
        md_output = args.output_prefix + "_md.md"
        with open(md_output, "w", encoding="utf-8") as f:
            f.write(structured_md)
        print(f"✓ Generated structured Markdown: {md_output}")

        # Step 4: Generate HTML
        html_content = generate_html_tables(
            sections, args.three_col_widths, args.two_col_widths, args.no_headers
        )
        html_output = args.output_prefix + "_html.html"
        with open(html_output, "w", encoding="utf-8") as f:
            f.write(html_content)
        print(f"✓ Generated HTML tables: {html_output}")
    finally:
        # Clean up the temp file even when a step above failed; it may not
        # exist if the conversion itself did not succeed.
        if os.path.exists(temp_md):
            os.remove(temp_md)

    print("✓ Process completed successfully!")
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()