#!/usr/bin/env python3
"""
Word document formatting script.

Author: 翟星人

Features:
1. Center the title; black, bold font
2. Insert a table of contents after the date
3. Start each level-1 heading on a new page
4. Solid table borders; tables are not split across pages
5. Center the date
6. Image captions in small, centered text
7. Bold level 1/2/3 headings
8. Left-align the appendix references
"""
import re
import sys

from docx import Document
from docx.shared import Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.table import WD_TABLE_ALIGNMENT
from docx.oxml.ns import qn
from docx.oxml import OxmlElement

# Font size in points applied to the runs of a heading, keyed by heading level.
_HEADING_SIZES_PT = {1: 16, 2: 14, 3: 12}

# Prefixes that mark a paragraph as an image caption.
# NOTE(review): the original tested the half-width prefix "图:" twice; the
# second test was presumably the full-width colon variant, so both are
# accepted here. Confirm against real documents.
_CAPTION_PREFIXES = ("上图", "图:", "图：")

# Border edges written into w:tblBorders, in schema order.
_BORDER_EDGES = ("top", "left", "bottom", "right", "insideH", "insideV")

# Children of w:tblPr that must come AFTER w:tblBorders per the schema.
_TBL_BORDERS_SUCCESSORS = (
    "w:shd",
    "w:tblLayout",
    "w:tblCellMar",
    "w:tblLook",
    "w:tblCaption",
    "w:tblDescription",
    "w:tblPrChange",
)

# Paragraph texts treated as a textual horizontal rule.
_TEXT_RULES = ("---", "***", "___")


def add_page_break_before(paragraph):
    """Make *paragraph* start on a new page.

    Uses python-docx's paragraph formatting API so that the
    ``w:pageBreakBefore`` element lands at its schema-correct position inside
    ``w:pPr`` and is not duplicated when called more than once.
    """
    paragraph.paragraph_format.page_break_before = True


def set_table_border(table):
    """Give *table* single solid black borders on all outer and inner edges.

    Any ``w:tblBorders`` element already present is replaced rather than
    duplicated.
    """
    tbl = table._tbl
    tblPr = tbl.tblPr
    created = tblPr is None
    if created:
        tblPr = OxmlElement('w:tblPr')

    # Drop previous border definitions so repeated calls stay idempotent.
    for stale in tblPr.findall(qn('w:tblBorders')):
        tblPr.remove(stale)

    borders = OxmlElement('w:tblBorders')
    for edge in _BORDER_EDGES:
        border = OxmlElement(f'w:{edge}')
        border.set(qn('w:val'), 'single')
        border.set(qn('w:sz'), '4')
        border.set(qn('w:space'), '0')
        border.set(qn('w:color'), '000000')
        borders.append(border)

    # Insert before the first sibling that must follow tblBorders; append
    # when there is none.
    for tag in _TBL_BORDERS_SUCCESSORS:
        successor = tblPr.find(qn(tag))
        if successor is not None:
            successor.addprevious(borders)
            break
    else:
        tblPr.append(borders)

    if created:
        tbl.insert(0, tblPr)


def keep_paragraph_together(paragraph):
    """Set "keep with next" and "keep lines together" on *paragraph*."""
    fmt = paragraph.paragraph_format
    fmt.keep_with_next = True
    fmt.keep_together = True


def keep_table_together(table):
    """Set the keep-together flags on every paragraph in every cell of *table*.

    ``row.cells`` can yield the same merged cell more than once; setting the
    flags through ``paragraph_format`` is idempotent, so repeats are harmless.
    """
    for row in table.rows:
        for cell in row.cells:
            for paragraph in cell.paragraphs:
                keep_paragraph_together(paragraph)


def set_heading_style(paragraph, level=1):
    """Style every run of a heading paragraph as black and bold.

    Args:
        paragraph: The heading paragraph whose runs are restyled.
        level: Heading level. Levels 1, 2 and 3 also set the font size to
            16, 14 and 12 pt respectively; other levels leave the size
            untouched.
    """
    size_pt = _HEADING_SIZES_PT.get(level)
    for run in paragraph.runs:
        run.font.color.rgb = RGBColor(0, 0, 0)
        run.font.bold = True
        if size_pt is not None:
            run.font.size = Pt(size_pt)


def set_caption_style(paragraph):
    """Style an image caption: centered, 9 pt, dark grey text."""
    paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
    for run in paragraph.runs:
        run.font.size = Pt(9)
        run.font.color.rgb = RGBColor(80, 80, 80)


def is_image_caption(text, prev_has_image):
    """Return True when *text* looks like the caption of a preceding image.

    Args:
        text: Paragraph text to classify.
        prev_has_image: Whether the previous paragraph contained an image.

    Returns:
        True only if the previous paragraph had an image, *text* is non-empty
        and shorter than 100 characters, and it starts with one of the
        recognised caption prefixes.
    """
    if not (prev_has_image and text):
        return False
    # Only short paragraphs opening with a known prefix count as captions.
    return len(text) < 100 and text.startswith(_CAPTION_PREFIXES)


def paragraph_has_image(paragraph):
    """Return True if any run of *paragraph* contains a drawing or pict element."""
    return any(
        run._element.xpath('.//w:drawing') or run._element.xpath('.//w:pict')
        for run in paragraph.runs
    )


def is_horizontal_rule(paragraph):
    """Return True if *paragraph* is a horizontal rule (textual or drawn).

    A textual rule is ``---``, ``***``, ``___`` or any non-empty run of
    hyphens. A drawn rule, as produced by pandoc, is an empty-text paragraph
    holding a line/rect shape but no picture.
    """
    text = paragraph.text.strip()

    # Textual rule.
    if text in _TEXT_RULES or (text and set(text) == {'-'}):
        return True
    if text:
        return False

    # Drawn rule: empty text plus a line/rect shape, excluding real pictures.
    xml_str = paragraph._p.xml
    has_drawing = 'w:pict' in xml_str or 'w:drawing' in xml_str
    has_line = 'v:line' in xml_str or 'v:rect' in xml_str
    # NOTE(review): the original source is truncated right after the 'v:rect'
    # test. At least one further shape marker and the picture-exclusion check
    # were lost. The exclusion below is reconstructed from the surviving
    # comment ("contains line or rect, empty text, but no image") and must be
    # checked against the original implementation.
    has_picture = any(
        marker in xml_str for marker in ('pic:pic', 'a:blip', 'v:imagedata')
    )
    return has_drawing and has_line and not has_picture


# NOTE(review): the original source is truncated between is_horizontal_rule
# and the command-line entry point. format_docx(), and any other helpers that
# lived there, are not visible and must be restored from the original file
# before this script can run.

if __name__ == "__main__":
    # NOTE(review): the argument-count test and the start of the usage text
    # were lost in truncation; only the trailing " [output.docx]" survived.
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <input.docx> [output.docx]")
        sys.exit(1)

    input_file = sys.argv[1]
    output_file = sys.argv[2] if len(sys.argv) > 2 else None
    format_docx(input_file, output_file)