04db423416
- 70 skills with code and documentation - Add .gitignore (ignore __pycache__, output/, temp/, venv/) - Clean up test intermediates and caches
295 lines
10 KiB
Python
295 lines
10 KiB
Python
#!/usr/bin/env python3
|
|
"""
DOC to Tables - Convert Word documents to structured Markdown and HTML tables

Usage:
    python doc_to_tables.py <input.docx> <output_prefix> [options]

Options:
    --three-col-widths WIDTHS   Three-column table widths (default: "20,60,20")
    --two-col-widths WIDTHS     Two-column table widths (default: "70,30")
    --process-teacher-awards    Process and merge teacher awards
    --no-headers                Generate HTML without table headers
    --help                      Show this help message
"""
|
|
|
|
import argparse
import html
import os
import re
import subprocess
import sys
from typing import List, Dict, Tuple
|
|
|
|
|
|
def convert_docx_to_markdown(docx_path: str, md_path: str) -> None:
    """Convert a DOCX file to Markdown by shelling out to pandoc.

    Runs ``pandoc --track-changes=all <docx_path> -o <md_path>`` and prints a
    confirmation line on success.

    Args:
        docx_path: Path of the Word document to read.
        md_path: Path of the Markdown file pandoc should write.

    Raises:
        SystemExit: With exit code 1 when pandoc is not installed or when it
            returns a non-zero status.
    """
    command = ["pandoc", "--track-changes=all", docx_path, "-o", md_path]
    try:
        subprocess.run(command, check=True)
    except FileNotFoundError:
        # Raised when the pandoc executable itself cannot be launched; report
        # it like any other conversion failure instead of dumping a traceback.
        print("✗ Error converting DOCX to Markdown: pandoc executable not found")
        sys.exit(1)
    except subprocess.CalledProcessError as e:
        print(f"✗ Error converting DOCX to Markdown: {e}")
        sys.exit(1)
    print(f"✓ Converted {docx_path} to {md_path}")
|
|
|
|
|
|
def parse_markdown_content(md_content: str) -> List[Dict]:
    """Parse Markdown text into a list of sections with table rows and notes.

    A line starting with ``####`` opens a new section; its title is the first
    ``**bold**`` span on the line, or the line without its ``#`` markers when
    there is none. Pipe-table rows with at least three cells become entries
    of the current section (first three cells -> student, award, teacher).
    Any other line containing both ``**`` and ``:`` is stored verbatim
    (stripped) as a teacher award. Rows and notes that appear before the
    first section header are ignored.

    Args:
        md_content: Full Markdown document text.

    Returns:
        A list of dicts with keys ``"title"`` (str), ``"entries"`` (list of
        ``{"student", "award", "teacher"}`` dicts) and ``"teacher_awards"``
        (list of str), in document order.
    """
    sections: List[Dict] = []
    current_section = None

    for line in md_content.split("\n"):
        stripped = line.strip()

        if stripped.startswith("####"):
            if current_section:
                sections.append(current_section)
            title_match = re.search(r"\*\*(.*?)\*\*", stripped)
            title = title_match.group(1) if title_match else stripped.strip("# ").strip()
            current_section = {"title": title, "entries": [], "teacher_awards": []}

        elif stripped.startswith("|"):
            # Drop only the single outer pipe on each side so that empty
            # leading/trailing cells keep their column position.
            row = stripped[1:]
            if row.endswith("|"):
                row = row[:-1]
            cells = [cell.strip() for cell in row.split("|")]

            # Skip header separators such as |---|---| or | :---: | --- |.
            if all(re.fullmatch(r":?-+:?", cell) for cell in cells):
                continue
            # Skip rows that carry no text at all.
            if not any(cells):
                continue

            if len(cells) >= 3 and current_section:
                current_section["entries"].append(
                    {"student": cells[0], "award": cells[1], "teacher": cells[2]}
                )

        elif "**" in line and ":" in line and current_section:
            current_section["teacher_awards"].append(stripped)

    if current_section:
        sections.append(current_section)

    return sections
|
|
|
|
|
|
def generate_markdown_tables(sections: List[Dict]) -> str:
    """Render parsed sections back into Markdown with pipe tables.

    Each section becomes a ``#### **title**`` heading followed by a blank
    line. Sections with entries get a three-column table with a fixed header
    row, then a blank line. Every teacher-award line is emitted as-is and is
    followed by a blank line.

    Args:
        sections: Dicts with ``"title"``, ``"entries"`` (dicts holding
            ``"student"``, ``"award"``, ``"teacher"``) and ``"teacher_awards"``.

    Returns:
        The Markdown document as one newline-joined string.
    """
    out_lines = []

    for sec in sections:
        # Heading plus the blank line that separates it from the body.
        out_lines.extend([f"#### **{sec['title']}**", ""])

        rows = sec["entries"]
        if rows:
            out_lines += ["|获奖学生|组别和奖项|指导老师|", "|---|---|---|"]
            out_lines += [
                f"|{row['student']}|{row['award']}|{row['teacher']}|" for row in rows
            ]
            out_lines.append("")

        # Each note is its own paragraph.
        for note in sec["teacher_awards"]:
            out_lines += [note, ""]

    return "\n".join(out_lines)
|
|
|
|
|
|
def _parse_column_widths(spec: str, expected: int, option: str) -> List[int]:
    """Parse a comma-separated percentage list and return its first ``expected`` ints."""
    try:
        widths = [int(part) for part in spec.split(",")]
    except ValueError:
        raise ValueError(
            f"{option} must be comma-separated integers, got {spec!r}"
        ) from None
    if len(widths) < expected:
        raise ValueError(
            f"{option} needs {expected} comma-separated values, got {spec!r}"
        )
    # Extra values are ignored.
    return widths[:expected]


def _table_css(css_class: str, widths: List[int]) -> List[str]:
    """Return the stylesheet lines for one table class and its column widths."""
    rules = [
        f" .{css_class} {{",
        " width: 100%;",
        " border-collapse: collapse;",
        " margin: 10px 0 5px 0;",
        " table-layout: fixed;",
        " }",
        f" .{css_class} td {{",
        " border: 1px solid #333;",
        " padding: 8px;",
        " text-align: center;",
        " vertical-align: middle;",
        " font-size: 14px;",
        " }",
        f" .{css_class} tr:nth-child(even) {{",
        " background-color: #f9f9f9;",
        " }",
    ]
    rules.extend(
        f" .{css_class} col:nth-child({index}) {{ width: {width}%; }}"
        for index, width in enumerate(widths, start=1)
    )
    return rules


def _render_table(css_class: str, column_count: int, rows: List[Tuple]) -> List[str]:
    """Return the HTML lines of one table, escaping every cell value."""
    lines = [
        f'<table class="{css_class}">',
        " <colgroup>",
        " " + "<col>" * column_count,
        " </colgroup>",
        " <tbody>",
    ]
    for row in rows:
        cells = "".join(
            f"<td>{html.escape(str(value), quote=False)}</td>" for value in row
        )
        lines.append(f" <tr>{cells}</tr>")
    lines.extend([" </tbody>", "</table>"])
    return lines


def generate_html_tables(
    sections: List[Dict],
    three_col_widths: str = "20,60,20",
    two_col_widths: str = "70,30",
    no_headers: bool = False,
) -> str:
    """Render parsed sections as a standalone HTML page of tables.

    Each section produces an ``<h4>`` title, then (when it has entries) one
    table, then one ``<p class="teacher-award">`` per teacher-award line. A
    section is rendered as a three-column table (student, award, teacher) if
    any entry has a non-empty ``"teacher"``; otherwise as a two-column table
    (award, student). All document text is HTML-escaped before insertion.

    Args:
        sections: Dicts with ``"title"``, ``"entries"`` and ``"teacher_awards"``.
        three_col_widths: Comma-separated column percentages for the
            three-column table, e.g. ``"20,60,20"``.
        two_col_widths: Comma-separated column percentages for the two-column
            table, e.g. ``"70,30"``.
        no_headers: Accepted for interface compatibility.
            NOTE(review): this flag is not read anywhere below; no header row
            is emitted regardless of its value.

    Returns:
        The complete HTML document as one newline-joined string.

    Raises:
        ValueError: If a width list is not made of integers or has fewer
            values than the table has columns.
    """
    three_widths = _parse_column_widths(three_col_widths, 3, "three_col_widths")
    two_widths = _parse_column_widths(two_col_widths, 2, "two_col_widths")

    html_parts = [
        "<!DOCTYPE html>",
        '<html lang="zh-CN">',
        "<head>",
        ' <meta charset="UTF-8">',
        ' <meta name="viewport" content="width=device-width, initial-scale=1.0">',
        " <title>Structured Tables</title>",
        " <style>",
        " body {",
        ' font-family: "Microsoft YaHei", "SimHei", "Arial", sans-serif;',
        " margin: 20px;",
        " line-height: 1.6;",
        " }",
        " h4 {",
        " text-align: center;",
        " color: #333;",
        " margin: 30px 0 15px 0;",
        " font-size: 18px;",
        " border-bottom: 2px solid #333;",
        " padding-bottom: 8px;",
        " }",
        *_table_css("table-3col", three_widths),
        *_table_css("table-2col", two_widths),
        " .teacher-award {",
        " font-size: 12px;",
        " margin: 0 0 10px 0;",
        " text-align: center;",
        " color: #666;",
        " }",
        " .subtitle {",
        " text-align: center;",
        " margin: 10px 0;",
        " font-weight: bold;",
        " color: #333;",
        " }",
        " </style>",
        "</head>",
        "<body>",
    ]

    for section in sections:
        title = html.escape(str(section["title"]), quote=False)
        html_parts.append(f"<h4>{title}</h4>")

        entries = section["entries"]
        if entries:
            # A section with no teacher on any row uses the 2-column layout.
            if any(entry.get("teacher") for entry in entries):
                rows = [
                    (entry["student"], entry["award"], entry.get("teacher", ""))
                    for entry in entries
                ]
                html_parts.extend(_render_table("table-3col", 3, rows))
            else:
                rows = [(entry["award"], entry["student"]) for entry in entries]
                html_parts.extend(_render_table("table-2col", 2, rows))

        for award in section["teacher_awards"]:
            text = html.escape(str(award), quote=False)
            html_parts.append(f'<p class="teacher-award">{text}</p>')

    html_parts.extend(["</body>", "</html>"])
    return "\n".join(html_parts)
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point: DOCX -> structured Markdown and HTML files.

    Parses the command line, converts the input document to a temporary
    Markdown file with pandoc, parses it into sections, and writes
    ``<output_prefix>_md.md`` and ``<output_prefix>_html.html``. The temporary
    ``<output_prefix>_temp.md`` file is removed even if a later step fails.

    Raises:
        SystemExit: With exit code 1 when the input file does not exist (and
            on argparse usage errors or ``--help``, as raised by argparse).
    """
    # argparse registers -h/--help by itself; adding "--help" a second time
    # raises argparse.ArgumentError, so the default is disabled and the help
    # option is declared once with this script's wording.
    parser = argparse.ArgumentParser(
        description="Convert Word documents to structured tables",
        add_help=False,
    )
    parser.add_argument("input_docx", help="Input DOCX file path")
    parser.add_argument("output_prefix", help="Output file prefix")
    parser.add_argument(
        "--three-col-widths",
        default="20,60,20",
        help="Three-column table widths (default: 20,60,20)",
    )
    parser.add_argument(
        "--two-col-widths",
        default="70,30",
        help="Two-column table widths (default: 70,30)",
    )
    # NOTE(review): this flag is parsed but not consulted anywhere below.
    parser.add_argument(
        "--process-teacher-awards",
        action="store_true",
        help="Process and merge teacher awards",
    )
    parser.add_argument(
        "--no-headers", action="store_true", help="Generate HTML without table headers"
    )
    parser.add_argument(
        "-h", "--help", action="help", help="Show this help message"
    )

    args = parser.parse_args()

    # Validate input file
    if not os.path.exists(args.input_docx):
        print(f"✗ Input file not found: {args.input_docx}")
        sys.exit(1)

    # Create output directory if needed
    output_dir = os.path.dirname(args.output_prefix) or "."
    os.makedirs(output_dir, exist_ok=True)

    temp_md = args.output_prefix + "_temp.md"
    try:
        # Step 1: Convert DOCX to Markdown
        convert_docx_to_markdown(args.input_docx, temp_md)

        # Step 2: Read and parse Markdown
        with open(temp_md, "r", encoding="utf-8") as f:
            md_content = f.read()
        sections = parse_markdown_content(md_content)

        # Step 3: Generate structured Markdown
        structured_md = generate_markdown_tables(sections)
        md_output = args.output_prefix + "_md.md"
        with open(md_output, "w", encoding="utf-8") as f:
            f.write(structured_md)
        print(f"✓ Generated structured Markdown: {md_output}")

        # Step 4: Generate HTML
        html_content = generate_html_tables(
            sections, args.three_col_widths, args.two_col_widths, args.no_headers
        )
        html_output = args.output_prefix + "_html.html"
        with open(html_output, "w", encoding="utf-8") as f:
            f.write(html_content)
        print(f"✓ Generated HTML tables: {html_output}")
    finally:
        # Clean up the temp file on success and on failure alike; it may not
        # exist if the conversion itself failed.
        if os.path.exists(temp_md):
            os.remove(temp_md)

    print("✓ Process completed successfully!")
|
|
|
|
|
|
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|