#!/usr/bin/env python3
"""
DOC to Tables - Convert Word documents to structured Markdown and HTML tables

Usage:
    python doc_to_tables.py <input_docx> <output_prefix> [options]

Options:
    --three-col-widths WIDTHS  Three-column table widths (default: "20,60,20")
    --two-col-widths WIDTHS    Two-column table widths (default: "70,30")
    --process-teacher-awards   Process and merge teacher awards
    --no-headers               Generate HTML without table headers
    --help                     Show this help message
"""

import argparse
import contextlib
import html
import os
import re
import subprocess
import sys
from typing import Dict, List, Tuple

# Column captions, shared by the Markdown and HTML writers.
_THREE_COL_HEADERS: Tuple[str, ...] = ("获奖学生", "组别和奖项", "指导老师")
# The two-column HTML layout lists the award first, then the student.
_TWO_COL_HEADERS: Tuple[str, ...] = ("组别和奖项", "获奖学生")

# One cell of a pipe-table delimiter row, e.g. "---", ":---" or ":---:".
_SEPARATOR_CELL = re.compile(r":?-+:?")
# First bold span on a line; used to pull the title out of a header line.
_BOLD_TEXT = re.compile(r"\*\*(.*?)\*\*")

# Minimal stylesheet embedded in the generated page.
_PAGE_STYLE = (
    "body { font-family: sans-serif; margin: 1.5em; } "
    "table { border-collapse: collapse; table-layout: fixed; width: 100%; "
    "margin-bottom: 1em; } "
    "th, td { border: 1px solid #ccc; padding: 6px 10px; text-align: left; "
    "word-wrap: break-word; } "
    ".teacher-award { margin: 0.5em 0; }"
)


def convert_docx_to_markdown(docx_path: str, md_path: str) -> None:
    """Convert DOCX to Markdown using pandoc.

    Runs ``pandoc --track-changes=all`` and writes the result to *md_path*.
    On any failure (pandoc missing, or pandoc exiting non-zero) an error is
    printed and the process exits with status 1.
    """
    try:
        subprocess.run(
            ["pandoc", "--track-changes=all", docx_path, "-o", md_path], check=True
        )
    except FileNotFoundError as e:
        # Raised when the pandoc executable itself cannot be launched.
        print(f"✗ Error converting DOCX to Markdown: {e}")
        sys.exit(1)
    except subprocess.CalledProcessError as e:
        print(f"✗ Error converting DOCX to Markdown: {e}")
        sys.exit(1)
    print(f"✓ Converted {docx_path} to {md_path}")


def _split_table_row(line: str) -> List[str]:
    """Split one pipe-table row into stripped cell strings.

    Only the single outer ``|`` on each side is removed, so empty leading or
    trailing cells keep their position instead of being swallowed.
    """
    row = line.strip()
    if row.startswith("|"):
        row = row[1:]
    if row.endswith("|"):
        row = row[:-1]
    return [cell.strip() for cell in row.split("|")]


def parse_markdown_content(md_content: str) -> List[Dict]:
    """Parse markdown content and extract structured data.

    A line starting with ``####`` (which also covers ``######``) opens a new
    section; its title is the first ``**bold**`` span, or the line with the
    ``#`` markers removed. Pipe-table rows with at least three cells become
    entries (``student``, ``award``, ``teacher``) of the current section;
    extra cells are ignored. Any other line containing both ``**`` and ``:``
    is stored verbatim (stripped) as a teacher award of the current section.

    Returns a list of dicts with keys ``title``, ``entries`` and
    ``teacher_awards``. Rows and award lines that appear before the first
    section header are discarded.
    """
    sections: List[Dict] = []
    current_section = None

    for line in md_content.split("\n"):
        stripped = line.strip()

        if stripped.startswith("####"):
            if current_section is not None:
                sections.append(current_section)
            title_match = _BOLD_TEXT.search(stripped)
            title = title_match.group(1) if title_match else stripped.strip("# ").strip()
            current_section = {"title": title, "entries": [], "teacher_awards": []}

        elif stripped.startswith("|") and not stripped.startswith("|---"):
            cells = _split_table_row(stripped)
            # Too few columns, or a completely empty row (e.g. the blank
            # header of a headerless table): nothing to record.
            if len(cells) < 3 or not any(cells):
                continue
            # Delimiter rows with alignment colons (|:---|:---:|) are markup,
            # not data.
            if all(_SEPARATOR_CELL.fullmatch(cell) for cell in cells):
                continue
            if current_section is not None:
                current_section["entries"].append(
                    {"student": cells[0], "award": cells[1], "teacher": cells[2]}
                )

        elif "**" in line and ":" in line and current_section is not None:
            current_section["teacher_awards"].append(stripped)

    if current_section is not None:
        sections.append(current_section)

    return sections


def generate_markdown_tables(sections: List[Dict]) -> str:
    """Generate structured markdown with proper table formatting.

    Each section is rendered as a ``#### **title**`` header, followed by a
    three-column pipe table when the section has entries, followed by its
    teacher-award lines, each separated by a blank line.
    """
    md_output: List[str] = []

    for section in sections:
        md_output.append(f"#### **{section['title']}**")
        md_output.append("")

        if section["entries"]:
            md_output.append("|" + "|".join(_THREE_COL_HEADERS) + "|")
            md_output.append("|---|---|---|")
            for entry in section["entries"]:
                md_output.append(
                    f"|{entry['student']}|{entry['award']}|{entry['teacher']}|"
                )
            md_output.append("")

        for award in section["teacher_awards"]:
            md_output.append(award)
            md_output.append("")

    return "\n".join(md_output)


def _parse_widths(spec: str, expected: int) -> List[int]:
    """Parse a comma-separated list of integer percentages.

    Raises:
        ValueError: if a value is not an integer or the number of values is
            not *expected*.
    """
    try:
        widths = [int(part) for part in spec.split(",")]
    except ValueError:
        raise ValueError(
            f"Invalid column widths {spec!r}: expected comma-separated integers"
        ) from None
    if len(widths) != expected:
        raise ValueError(
            f"Invalid column widths {spec!r}: expected {expected} values, "
            f"got {len(widths)}"
        )
    return widths


def _render_table(
    css_class: str,
    widths: List[int],
    headers: Tuple[str, ...],
    rows: List[Tuple[str, ...]],
    no_headers: bool,
) -> List[str]:
    """Return the HTML lines of one table; all cell text is HTML-escaped."""
    lines = [f'<table class="{css_class}">', "  <colgroup>"]
    lines.extend(f'    <col style="width: {width}%">' for width in widths)
    lines.append("  </colgroup>")

    if not no_headers:
        header_cells = "".join(f"<th>{html.escape(text)}</th>" for text in headers)
        lines.extend(["  <thead>", f"    <tr>{header_cells}</tr>", "  </thead>"])

    lines.append("  <tbody>")
    for row in rows:
        cells = "".join(f"<td>{html.escape(str(text))}</td>" for text in row)
        lines.append(f"    <tr>{cells}</tr>")
    lines.extend(["  </tbody>", "</table>"])
    return lines


def generate_html_tables(
    sections: List[Dict],
    three_col_widths: str = "20,60,20",
    two_col_widths: str = "70,30",
    no_headers: bool = False,
) -> str:
    """Generate professional HTML with responsive tables.

    Args:
        sections: Section dicts as produced by ``parse_markdown_content``.
        three_col_widths: Percent widths "a,b,c" for student/award/teacher
            tables.
        two_col_widths: Percent widths "a,b" for award/student tables.
        no_headers: When true, tables are emitted without a header row.

    Returns:
        A complete HTML document as a single string.

    Raises:
        ValueError: if a width specification is not a comma-separated list of
            integers with the expected number of values.

    A section is rendered as a three-column table when at least one of its
    entries names a teacher, otherwise as a two-column (award, student)
    table. Titles, cells and teacher-award lines are HTML-escaped.
    """
    three_widths = _parse_widths(three_col_widths, 3)
    two_widths = _parse_widths(two_col_widths, 2)

    html_parts = [
        "<!DOCTYPE html>",
        '<html lang="zh-CN">',
        "<head>",
        '    <meta charset="UTF-8">',
        '    <meta name="viewport" content="width=device-width, initial-scale=1.0">',
        "    <title>Structured Tables</title>",
        f"    <style>{_PAGE_STYLE}</style>",
        "</head>",
        "<body>",
    ]

    for section in sections:
        html_parts.append(f"<h2>{html.escape(str(section['title']))}</h2>")

        entries = section["entries"]
        if entries:
            is_three_col = any(entry.get("teacher") for entry in entries)
            if is_three_col:
                rows = [
                    (entry["student"], entry["award"], entry.get("teacher", ""))
                    for entry in entries
                ]
                html_parts.extend(
                    _render_table(
                        "three-col", three_widths, _THREE_COL_HEADERS, rows, no_headers
                    )
                )
            else:
                rows = [(entry["award"], entry["student"]) for entry in entries]
                html_parts.extend(
                    _render_table(
                        "two-col", two_widths, _TWO_COL_HEADERS, rows, no_headers
                    )
                )

        for award in section["teacher_awards"]:
            html_parts.append(f'<p class="teacher-award">{html.escape(str(award))}</p>')

    html_parts.extend(["</body>", "</html>"])

    return "\n".join(html_parts)


def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the script."""
    # add_help=False: the help option is registered explicitly below so it
    # keeps the script's own wording. Registering "--help" on a parser that
    # already has its default help option raises a conflicting-option error.
    parser = argparse.ArgumentParser(
        description="Convert Word documents to structured tables", add_help=False
    )
    parser.add_argument("input_docx", help="Input DOCX file path")
    parser.add_argument("output_prefix", help="Output file prefix")
    parser.add_argument(
        "--three-col-widths",
        default="20,60,20",
        help="Three-column table widths (default: 20,60,20)",
    )
    parser.add_argument(
        "--two-col-widths",
        default="70,30",
        help="Two-column table widths (default: 70,30)",
    )
    # NOTE(review): this flag is accepted but nothing in this file reads it;
    # the intended "merge" behaviour is not visible here - confirm and wire up.
    parser.add_argument(
        "--process-teacher-awards",
        action="store_true",
        help="Process and merge teacher awards",
    )
    parser.add_argument(
        "--no-headers", action="store_true", help="Generate HTML without table headers"
    )
    parser.add_argument("-h", "--help", action="help", help="Show this help message")
    return parser


def main() -> None:
    """Run the DOCX -> Markdown -> structured Markdown/HTML pipeline."""
    parser = build_parser()
    args = parser.parse_args()

    # Reject malformed width options before doing any work.
    try:
        _parse_widths(args.three_col_widths, 3)
        _parse_widths(args.two_col_widths, 2)
    except ValueError as err:
        parser.error(str(err))

    # Validate input file
    if not os.path.exists(args.input_docx):
        print(f"✗ Input file not found: {args.input_docx}")
        sys.exit(1)

    # Create output directory if needed
    output_dir = os.path.dirname(args.output_prefix) or "."
    os.makedirs(output_dir, exist_ok=True)

    # Steps 1-2: convert DOCX to Markdown, then read it back. The temp file
    # is removed even when conversion or reading fails.
    temp_md = args.output_prefix + "_temp.md"
    try:
        convert_docx_to_markdown(args.input_docx, temp_md)
        with open(temp_md, "r", encoding="utf-8") as f:
            md_content = f.read()
    finally:
        with contextlib.suppress(FileNotFoundError):
            os.remove(temp_md)

    sections = parse_markdown_content(md_content)

    # Step 3: Generate structured Markdown
    structured_md = generate_markdown_tables(sections)
    md_output = args.output_prefix + "_md.md"
    with open(md_output, "w", encoding="utf-8") as f:
        f.write(structured_md)
    print(f"✓ Generated structured Markdown: {md_output}")

    # Step 4: Generate HTML
    html_content = generate_html_tables(
        sections, args.three_col_widths, args.two_col_widths, args.no_headers
    )
    html_output = args.output_prefix + "_html.html"
    with open(html_output, "w", encoding="utf-8") as f:
        f.write(html_content)
    print(f"✓ Generated HTML tables: {html_output}")

    print("✓ Process completed successfully!")


if __name__ == "__main__":
    main()