Files

333 lines
11 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Word 文档格式化脚本
Author: 翟星人
功能:
1. 标题居中,黑色字体,加粗
2. 在日期后插入目录
3. 一级标题前分页
4. 表格实线边框,不跨页断开
5. 日期居中
6. 图片说明小字居中
7. 1/2/3级标题加粗
8. 附录参考文献左对齐
"""
import sys
import re
from docx import Document
from docx.shared import Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.table import WD_TABLE_ALIGNMENT
from docx.oxml.ns import qn
from docx.oxml import OxmlElement
def add_page_break_before(paragraph):
    """Make *paragraph* start on a new page.

    The property is set through python-docx's ``ParagraphFormat`` API rather
    than by inserting a raw ``w:pageBreakBefore`` element at index 0 of
    ``w:pPr``. Inserting at index 0 places the element ahead of ``w:pStyle``,
    which is not the child order the WordprocessingML schema defines, and
    calling the function twice on the same paragraph stacked duplicate
    elements. Assigning the property is idempotent and lets the library pick
    the insertion point.

    Args:
        paragraph: A python-docx ``Paragraph``.
    """
    paragraph.paragraph_format.page_break_before = True
def set_table_border(table):
    """Give *table* thin, solid, black borders on every edge and grid line.

    Any ``w:tblBorders`` element already present is replaced, so running the
    formatter again on its own output does not accumulate duplicates. The new
    element is placed before the table-property children that the
    WordprocessingML schema orders after it (``w:shd``, ``w:tblLayout``,
    ``w:tblCellMar``, ``w:tblLook``, ...) instead of always being appended
    last.

    Args:
        table: A python-docx ``Table``.
    """
    tbl = table._tbl
    tblPr = tbl.tblPr
    if tblPr is None:
        # Mirror the original fallback: create the properties element and
        # make it the first child of the table.
        tblPr = OxmlElement('w:tblPr')
        tbl.insert(0, tblPr)

    # Drop earlier border definitions so exactly one w:tblBorders remains.
    for stale in tblPr.findall(qn('w:tblBorders')):
        tblPr.remove(stale)

    tblBorders = OxmlElement('w:tblBorders')
    for border_name in ('top', 'left', 'bottom', 'right', 'insideH', 'insideV'):
        border = OxmlElement(f'w:{border_name}')
        border.set(qn('w:val'), 'single')
        border.set(qn('w:sz'), '4')  # w:sz is expressed in eighths of a point
        border.set(qn('w:space'), '0')
        border.set(qn('w:color'), '000000')
        tblBorders.append(border)

    # Children of w:tblPr that must follow w:tblBorders in schema order.
    later_tags = {
        qn(f'w:{name}')
        for name in (
            'shd',
            'tblLayout',
            'tblCellMar',
            'tblLook',
            'tblCaption',
            'tblDescription',
            'tblPrChange',
        )
    }
    successor = next(
        (child for child in tblPr if child.tag in later_tags),
        None,
    )
    if successor is None:
        tblPr.append(tblBorders)
    else:
        successor.addprevious(tblBorders)
def keep_table_together(table):
    """Ask Word not to split *table* across pages.

    Every paragraph in every cell is flagged "keep with next" and "keep lines
    together". The flags are set through ``ParagraphFormat`` so that they are
    idempotent: the previous raw ``append`` of ``w:keepNext``/``w:keepLines``
    produced duplicate elements when a paragraph already carried them or when
    the same cell was visited more than once (``row.cells`` can yield a
    horizontally merged cell once per spanned column), and it placed them
    after whatever ``w:pPr`` already contained.

    Args:
        table: A python-docx ``Table``.
    """
    for row in table.rows:
        for cell in row.cells:
            for paragraph in cell.paragraphs:
                fmt = paragraph.paragraph_format
                fmt.keep_with_next = True
                fmt.keep_together = True
def keep_paragraph_together(paragraph):
    """Keep *paragraph* on one page and attached to the paragraph after it.

    Uses ``ParagraphFormat.keep_with_next`` and ``keep_together`` instead of
    appending raw ``w:keepNext``/``w:keepLines`` elements. Assigning the
    properties is idempotent, whereas the raw append added a second copy on
    every call and put the elements at the end of ``w:pPr`` regardless of
    what was already there.

    Args:
        paragraph: A python-docx ``Paragraph``.
    """
    fmt = paragraph.paragraph_format
    fmt.keep_with_next = True
    fmt.keep_together = True
def set_heading_style(paragraph, level=1):
    """Render every run of a heading in bold black, sized by *level*.

    Levels 1, 2 and 3 are set to 16 pt, 14 pt and 12 pt respectively; any
    other level leaves the font size untouched.

    Args:
        paragraph: The heading paragraph whose runs are restyled.
        level: Heading level used to pick the font size.
    """
    size_by_level = ((1, 16), (2, 14), (3, 12))
    points = next(
        (pts for candidate, pts in size_by_level if level == candidate),
        None,
    )
    for run in paragraph.runs:
        font = run.font
        font.color.rgb = RGBColor(0, 0, 0)
        font.bold = True
        if points is not None:
            font.size = Pt(points)
def set_caption_style(paragraph):
    """Style an image caption: centred, 9 pt, dark grey.

    Args:
        paragraph: The caption paragraph to restyle.
    """
    paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
    caption_size = Pt(9)
    caption_grey = RGBColor(80, 80, 80)
    for run in paragraph.runs:
        font = run.font
        font.size = caption_size
        font.color.rgb = caption_grey
def is_image_caption(text, prev_has_image):
    """Return True when *text* looks like the caption of the preceding image.

    A caption must directly follow a paragraph that contained an image, be
    non-empty and shorter than 100 characters, and begin with one of the
    caption markers "上图", "图:" (ASCII colon) or "图:" (full-width colon).
    The full-width form was previously not recognised because the ASCII form
    was tested twice.

    Args:
        text: Stripped paragraph text.
        prev_has_image: Whether the previous paragraph contained an image.

    Returns:
        bool: True if the paragraph should be styled as a caption.
    """
    if not prev_has_image or not text or len(text) >= 100:
        return False
    # Only paragraphs opening with an explicit marker count as captions.
    return text.startswith(("上图", "图:", "图:"))
def paragraph_has_image(paragraph):
    """Report whether any run of *paragraph* holds a drawing or picture.

    A run counts when its XML contains a ``w:drawing`` or ``w:pict``
    descendant.

    Args:
        paragraph: The paragraph to inspect.

    Returns:
        bool: True if at least one run contains such an element.
    """
    return any(
        bool(
            run._element.xpath('.//w:drawing')
            or run._element.xpath('.//w:pict')
        )
        for run in paragraph.runs
    )
def is_horizontal_rule(paragraph):
    """Decide whether *paragraph* is a horizontal rule (textual or drawn).

    Textual rules are paragraphs whose stripped text is "***", "___" or
    consists solely of one or more "-" characters. Drawn rules are paragraphs
    with no text whose XML mentions a drawing container together with a line
    or rectangle shape but no image payload.

    Args:
        paragraph: The paragraph to inspect.

    Returns:
        bool: True if the paragraph should be treated as a separator.
    """
    stripped = paragraph.text.strip()
    if stripped:
        # Any run of dashes qualifies, which also covers the literal "---".
        return stripped in ("***", "___") or set(stripped) == {"-"}

    markup = paragraph._p.xml

    def mentions(*needles):
        return any(needle in markup for needle in needles)

    # Drawing present, line-like shape present, and no picture data.
    return (
        mentions('w:pict', 'w:drawing')
        and mentions('v:line', 'v:rect', '<a:ln')
        and not mentions('a:blip', 'v:imagedata', 'r:embed')
    )
def is_reference_item(text):
    """Tell whether *text* reads like a numbered bibliography entry.

    The text must open with digits, a dot and whitespace (e.g. "3. ") and
    must mention "http", "地址", or "github" (the last one in any letter
    case).

    Args:
        text: Paragraph text to test.

    Returns:
        bool: True if the text matches both conditions.
    """
    if re.match(r'^\d+\.\s', text) is None:
        return False
    lowered = text.lower()
    return 'http' in text or '地址' in text or 'github' in lowered
def add_toc_field(paragraph):
    """Append a Word table-of-contents field to *paragraph*.

    A single new run receives, in order: the field "begin" marker, the TOC
    instruction text, the "separate" marker, a placeholder text telling the
    reader to update the field, and the "end" marker.

    Args:
        paragraph: The paragraph that will host the TOC field.
    """
    run = paragraph.add_run()

    def field_char(kind):
        marker = OxmlElement('w:fldChar')
        marker.set(qn('w:fldCharType'), kind)
        return marker

    instruction = OxmlElement('w:instrText')
    instruction.set(qn('xml:space'), 'preserve')
    instruction.text = ' TOC \\o "1-3" \\h \\z \\u '

    for node in (field_char('begin'), instruction, field_char('separate')):
        run._r.append(node)
    # Placeholder shown until the field is updated inside Word.
    run.add_text('请右键点击此处,选择"更新域"以生成目录')
    run._r.append(field_char('end'))
# Chinese-style date such as "2024年1月15日" on the first line of a paragraph.
# "年" is optional so that every string the earlier pattern (which had no
# "年" at all) accepted is still accepted.
_DATE_LINE_RE = re.compile(r'.*\d{4}年?\d{1,2}月\d{1,2}日')

# Prefixes of the numbered top-level sections that start on a new page.
_SECTION_PREFIXES = ("一、", "二、", "三、", "四、")


def _style_name_of(paragraph):
    """Return the paragraph's style name, or "" when it has no style/name."""
    style = paragraph.style
    name = style.name if style else ""
    return name or ""


def _is_date_line(text):
    """Return True if *text* is the report's date line."""
    return "调研日期" in text or _DATE_LINE_RE.match(text) is not None


def _collect_markdown_toc(paragraphs):
    """Return the XML elements of a hand-written "目录" section.

    The section starts at a "Heading 2" paragraph whose text is "目录" and
    continues over "Compact"-styled list items and empty paragraphs until any
    other paragraph (or any heading) is met.
    """
    elements = []
    in_toc = False
    for paragraph in paragraphs:
        text = paragraph.text.strip()
        style_name = _style_name_of(paragraph)

        if "Heading 2" in style_name and text == "目录":
            in_toc = True
            elements.append(paragraph._element)
            continue
        if not in_toc:
            continue

        if "Heading" in style_name:
            # The next heading closes the hand-written TOC section.
            in_toc = False
        elif style_name == "Compact" or text == "":
            elements.append(paragraph._element)
        else:
            in_toc = False
    return elements


def _insert_toc_after_date(doc):
    """Insert a "目录" title and a TOC field right after the date line.

    Nothing is inserted when no date line exists or when the date line is the
    last paragraph of the document.
    """
    paragraphs = doc.paragraphs
    target_para = None
    for index, paragraph in enumerate(paragraphs):
        if _is_date_line(paragraph.text):
            if index + 1 < len(paragraphs):
                target_para = paragraphs[index + 1]
            break
    if target_para is None:
        return

    toc_title = target_para.insert_paragraph_before("目录")
    toc_title.alignment = WD_ALIGN_PARAGRAPH.CENTER
    for run in toc_title.runs:
        run.font.bold = True
        run.font.size = Pt(16)
        run.font.color.rgb = RGBColor(0, 0, 0)
    toc_field = target_para.insert_paragraph_before()
    add_toc_field(toc_field)


def format_docx(input_path, output_path=None):
    """Apply the report formatting rules to a Word document.

    Steps, in order: drop a hand-written Markdown "目录" section and
    horizontal-rule paragraphs; left-align reference items in the appendix;
    style image captions; centre the document title and the date line; style
    headings and start numbered top-level sections on a new page; keep code
    paragraphs together; insert a Word TOC field after the date line;
    left-align "First Paragraph" paragraphs; give tables solid borders, keep
    them together and centre them; save.

    Args:
        input_path: Path of the .docx file to read.
        output_path: Path to write; defaults to overwriting *input_path*.
    """
    if output_path is None:
        output_path = input_path
    doc = Document(input_path)

    # Paragraphs scheduled for deletion, keyed by the id() of their XML
    # element. ``doc.paragraphs`` builds fresh proxy objects on every access,
    # so proxies from one pass never compare equal to those of a later pass;
    # the underlying element is the stable identity. The dict keeps each
    # element alive, which keeps its id() valid, and de-duplicates entries.
    doomed = {}
    for element in _collect_markdown_toc(doc.paragraphs):
        doomed[id(element)] = element

    is_first_heading1 = True
    prev_has_image = False
    in_appendix = False

    for index, paragraph in enumerate(doc.paragraphs):
        element = paragraph._element
        if id(element) in doomed:
            continue

        text = paragraph.text.strip()
        style_name = _style_name_of(paragraph)

        if is_horizontal_rule(paragraph):
            doomed[id(element)] = element
            continue

        # Everything from the appendix heading onwards is appendix content.
        if text.startswith("四、") or "附录" in text:
            in_appendix = True
        if in_appendix and is_reference_item(text):
            paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT
            continue

        current_has_image = paragraph_has_image(paragraph)
        if is_image_caption(text, prev_has_image):
            set_caption_style(paragraph)
            prev_has_image = False
            continue
        prev_has_image = current_has_image

        # pandoc may centre "First Paragraph"; body text should be left-aligned.
        if style_name == "First Paragraph":
            paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT

        # Document title.
        if "Heading 1" in style_name or style_name == "Title":
            set_heading_style(paragraph, 1)
            # NOTE(review): the source this was rebuilt from had lost its
            # indentation; the flag reset and ``continue`` are assumed to
            # belong to this inner condition - confirm against the original.
            if is_first_heading1 and ("调研报告" in text or index < 3):
                paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
                is_first_heading1 = False
                continue

        if _is_date_line(text):
            paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER

        if "Heading" in style_name:
            if text.startswith(_SECTION_PREFIXES):
                # Numbered top-level sections start on a new page.
                add_page_break_before(paragraph)
                set_heading_style(paragraph, 1)
            elif "Heading 2" in style_name:
                set_heading_style(paragraph, 2)
            elif "Heading 3" in style_name:
                set_heading_style(paragraph, 3)
            else:
                set_heading_style(paragraph, 1)

        # Code paragraphs should not be split across pages.
        if "Code" in style_name:
            keep_paragraph_together(paragraph)

    for element in doomed.values():
        parent = element.getparent()
        if parent is not None:
            parent.remove(element)

    _insert_toc_after_date(doc)

    # Final sweep: paragraphs that left the main loop early (captions,
    # reference items) may still carry pandoc's centred "First Paragraph".
    for paragraph in doc.paragraphs:
        if _style_name_of(paragraph) == "First Paragraph":
            paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT

    for table in doc.tables:
        set_table_border(table)
        keep_table_together(table)
        table.alignment = WD_TABLE_ALIGNMENT.CENTER

    doc.save(output_path)
    print(f"格式化完成: {output_path}")
# Command-line entry point: format_docx.py <input.docx> [output.docx]
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    if not cli_args:
        print("用法: python format_docx.py <input.docx> [output.docx]")
        sys.exit(1)
    input_file = cli_args[0]
    # Without an explicit output path the input file is overwritten.
    output_file = cli_args[1] if len(cli_args) > 1 else None
    format_docx(input_file, output_file)