Initial commit to git.yoin
This commit is contained in:
332
deep-research/scripts/format_docx.py
Normal file
332
deep-research/scripts/format_docx.py
Normal file
@@ -0,0 +1,332 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Word 文档格式化脚本
|
||||
|
||||
Author: 翟星人
|
||||
|
||||
功能:
|
||||
1. 标题居中,黑色字体,加粗
|
||||
2. 在日期后插入目录
|
||||
3. 一级标题前分页
|
||||
4. 表格实线边框,不跨页断开
|
||||
5. 日期居中
|
||||
6. 图片说明小字居中
|
||||
7. 1/2/3级标题加粗
|
||||
8. 附录参考文献左对齐
|
||||
"""
|
||||
|
||||
import sys
|
||||
import re
|
||||
from docx import Document
|
||||
from docx.shared import Pt, RGBColor
|
||||
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
||||
from docx.enum.table import WD_TABLE_ALIGNMENT
|
||||
from docx.oxml.ns import qn
|
||||
from docx.oxml import OxmlElement
|
||||
|
||||
|
||||
def add_page_break_before(paragraph):
|
||||
"""在段落前添加分页符"""
|
||||
p = paragraph._p
|
||||
pPr = p.get_or_add_pPr()
|
||||
pageBreakBefore = OxmlElement('w:pageBreakBefore')
|
||||
pPr.insert(0, pageBreakBefore)
|
||||
|
||||
|
||||
def set_table_border(table):
|
||||
"""设置表格实线边框"""
|
||||
tbl = table._tbl
|
||||
tblPr = tbl.tblPr if tbl.tblPr is not None else OxmlElement('w:tblPr')
|
||||
|
||||
tblBorders = OxmlElement('w:tblBorders')
|
||||
|
||||
for border_name in ['top', 'left', 'bottom', 'right', 'insideH', 'insideV']:
|
||||
border = OxmlElement(f'w:{border_name}')
|
||||
border.set(qn('w:val'), 'single')
|
||||
border.set(qn('w:sz'), '4')
|
||||
border.set(qn('w:space'), '0')
|
||||
border.set(qn('w:color'), '000000')
|
||||
tblBorders.append(border)
|
||||
|
||||
tblPr.append(tblBorders)
|
||||
if tbl.tblPr is None:
|
||||
tbl.insert(0, tblPr)
|
||||
|
||||
|
||||
def keep_table_together(table):
|
||||
"""保持表格不跨页断开"""
|
||||
for row in table.rows:
|
||||
for cell in row.cells:
|
||||
for paragraph in cell.paragraphs:
|
||||
pPr = paragraph._p.get_or_add_pPr()
|
||||
keepNext = OxmlElement('w:keepNext')
|
||||
keepLines = OxmlElement('w:keepLines')
|
||||
pPr.append(keepNext)
|
||||
pPr.append(keepLines)
|
||||
|
||||
|
||||
def keep_paragraph_together(paragraph):
|
||||
"""保持段落不断开"""
|
||||
pPr = paragraph._p.get_or_add_pPr()
|
||||
keepNext = OxmlElement('w:keepNext')
|
||||
keepLines = OxmlElement('w:keepLines')
|
||||
pPr.append(keepNext)
|
||||
pPr.append(keepLines)
|
||||
|
||||
|
||||
def set_heading_style(paragraph, level=1):
|
||||
"""设置标题样式:黑色加粗"""
|
||||
for run in paragraph.runs:
|
||||
run.font.color.rgb = RGBColor(0, 0, 0)
|
||||
run.font.bold = True
|
||||
if level == 1:
|
||||
run.font.size = Pt(16)
|
||||
elif level == 2:
|
||||
run.font.size = Pt(14)
|
||||
elif level == 3:
|
||||
run.font.size = Pt(12)
|
||||
|
||||
|
||||
def set_caption_style(paragraph):
|
||||
"""设置图片说明样式:小字居中"""
|
||||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||||
for run in paragraph.runs:
|
||||
run.font.size = Pt(9)
|
||||
run.font.color.rgb = RGBColor(80, 80, 80)
|
||||
|
||||
|
||||
def is_image_caption(text, prev_has_image):
|
||||
"""判断是否为图片说明"""
|
||||
if prev_has_image and text and len(text) < 100:
|
||||
# 必须以特定词开头才算图片说明
|
||||
if text.startswith("上图") or text.startswith("图:") or text.startswith("图:"):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def paragraph_has_image(paragraph):
|
||||
"""检查段落是否包含图片"""
|
||||
for run in paragraph.runs:
|
||||
if run._element.xpath('.//w:drawing') or run._element.xpath('.//w:pict'):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def is_horizontal_rule(paragraph):
|
||||
"""检查是否为分割线(文本或绘图元素)"""
|
||||
text = paragraph.text.strip()
|
||||
# 检查文本形式的分割线
|
||||
if text == "---" or text == "***" or text == "___" or (len(text) > 0 and all(c == '-' for c in text)):
|
||||
return True
|
||||
# 检查 pandoc 生成的绘图形式水平线(包含 line 或 rect 且文本为空,但不包含图片)
|
||||
if text == "":
|
||||
xml_str = paragraph._p.xml
|
||||
has_drawing = 'w:pict' in xml_str or 'w:drawing' in xml_str
|
||||
has_line = 'v:line' in xml_str or 'v:rect' in xml_str or '<a:ln' in xml_str
|
||||
has_image = 'a:blip' in xml_str or 'v:imagedata' in xml_str or 'r:embed' in xml_str
|
||||
# 只有有绘图、有线条、但没有图片时才是水平线
|
||||
if has_drawing and has_line and not has_image:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def is_reference_item(text):
|
||||
"""判断是否为参考文献条目"""
|
||||
if re.match(r'^\d+\.\s', text):
|
||||
if 'http' in text or '地址' in text or 'github' in text.lower():
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def add_toc_field(paragraph):
|
||||
"""向段落添加目录域"""
|
||||
run = paragraph.add_run()
|
||||
fldChar1 = OxmlElement('w:fldChar')
|
||||
fldChar1.set(qn('w:fldCharType'), 'begin')
|
||||
|
||||
instrText = OxmlElement('w:instrText')
|
||||
instrText.set(qn('xml:space'), 'preserve')
|
||||
instrText.text = ' TOC \\o "1-3" \\h \\z \\u '
|
||||
|
||||
fldChar2 = OxmlElement('w:fldChar')
|
||||
fldChar2.set(qn('w:fldCharType'), 'separate')
|
||||
|
||||
fldChar3 = OxmlElement('w:fldChar')
|
||||
fldChar3.set(qn('w:fldCharType'), 'end')
|
||||
|
||||
run._r.append(fldChar1)
|
||||
run._r.append(instrText)
|
||||
run._r.append(fldChar2)
|
||||
run.add_text('请右键点击此处,选择"更新域"以生成目录')
|
||||
run._r.append(fldChar3)
|
||||
|
||||
|
||||
def format_docx(input_path, output_path=None):
|
||||
"""格式化 Word 文档"""
|
||||
if output_path is None:
|
||||
output_path = input_path
|
||||
|
||||
doc = Document(input_path)
|
||||
|
||||
is_first_heading1 = True
|
||||
prev_was_code = False
|
||||
prev_has_image = False
|
||||
in_appendix = False
|
||||
date_para_index = -1
|
||||
paragraphs_to_remove = []
|
||||
|
||||
# 第一遍遍历:找到并删除 Markdown 转换来的手写目录章节
|
||||
# 查找 Heading 2 样式且文本为"目录"的段落,以及紧随其后的 Compact 列表
|
||||
in_md_toc_section = False
|
||||
for i, paragraph in enumerate(doc.paragraphs):
|
||||
text = paragraph.text.strip()
|
||||
style_name = paragraph.style.name if paragraph.style else ""
|
||||
if style_name is None:
|
||||
style_name = ""
|
||||
|
||||
# 检测 Markdown 转换来的目录章节开始(Heading 2 且文本为"目录")
|
||||
if "Heading 2" in style_name and text == "目录":
|
||||
in_md_toc_section = True
|
||||
paragraphs_to_remove.append(paragraph)
|
||||
continue
|
||||
|
||||
# 在目录章节内,删除 Compact 样式的列表项(手写目录内容)
|
||||
if in_md_toc_section:
|
||||
# 遇到 Heading 样式,说明目录章节结束
|
||||
if "Heading" in style_name:
|
||||
in_md_toc_section = False
|
||||
continue
|
||||
# 删除 Compact 样式的列表项
|
||||
if style_name == "Compact":
|
||||
paragraphs_to_remove.append(paragraph)
|
||||
continue
|
||||
elif text == "":
|
||||
paragraphs_to_remove.append(paragraph)
|
||||
continue
|
||||
else:
|
||||
in_md_toc_section = False
|
||||
|
||||
# 第二遍遍历:处理样式、标记删除分割线
|
||||
for i, paragraph in enumerate(doc.paragraphs):
|
||||
text = paragraph.text.strip()
|
||||
style_name = paragraph.style.name if paragraph.style else ""
|
||||
if style_name is None:
|
||||
style_name = ""
|
||||
|
||||
# 跳过已标记删除的段落
|
||||
if paragraph in paragraphs_to_remove:
|
||||
continue
|
||||
|
||||
# 删除分割线段落
|
||||
if is_horizontal_rule(paragraph):
|
||||
paragraphs_to_remove.append(paragraph)
|
||||
continue
|
||||
|
||||
# 检查是否进入附录部分
|
||||
if text.startswith("四、") or "附录" in text:
|
||||
in_appendix = True
|
||||
|
||||
# 附录中的参考文献左对齐
|
||||
if in_appendix and is_reference_item(text):
|
||||
paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT
|
||||
continue
|
||||
|
||||
# 检查当前段落是否有图片
|
||||
current_has_image = paragraph_has_image(paragraph)
|
||||
|
||||
# 处理图片说明
|
||||
if is_image_caption(text, prev_has_image):
|
||||
set_caption_style(paragraph)
|
||||
prev_has_image = False
|
||||
continue
|
||||
|
||||
prev_has_image = current_has_image
|
||||
|
||||
# First Paragraph 样式应该左对齐(pandoc 可能设置为居中)
|
||||
if style_name == "First Paragraph":
|
||||
paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT
|
||||
|
||||
# 处理文档标题
|
||||
if "Heading 1" in style_name or style_name == "Title":
|
||||
set_heading_style(paragraph, 1)
|
||||
|
||||
if is_first_heading1 and ("调研报告" in text or i < 3):
|
||||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||||
is_first_heading1 = False
|
||||
continue
|
||||
|
||||
# 处理日期行,记录索引
|
||||
if "调研日期" in text or re.match(r'.*\d{4}年\d{1,2}月\d{1,2}日.*', text):
|
||||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||||
date_para_index = i
|
||||
|
||||
# 一级标题前分页
|
||||
if "Heading" in style_name:
|
||||
if text.startswith("一、") or text.startswith("二、") or text.startswith("三、") or text.startswith("四、"):
|
||||
add_page_break_before(paragraph)
|
||||
set_heading_style(paragraph, 1)
|
||||
elif "Heading 2" in style_name:
|
||||
set_heading_style(paragraph, 2)
|
||||
elif "Heading 3" in style_name:
|
||||
set_heading_style(paragraph, 3)
|
||||
else:
|
||||
set_heading_style(paragraph, 1)
|
||||
|
||||
# 代码块保持不断开
|
||||
if paragraph.style and "Code" in str(paragraph.style.name):
|
||||
keep_paragraph_together(paragraph)
|
||||
prev_was_code = True
|
||||
elif prev_was_code and text.startswith("```"):
|
||||
prev_was_code = False
|
||||
|
||||
# 删除待删除的段落
|
||||
for paragraph in paragraphs_to_remove:
|
||||
p = paragraph._element
|
||||
parent = p.getparent()
|
||||
if parent is not None:
|
||||
parent.remove(p)
|
||||
|
||||
# 插入 Word 目录域:在日期行后面插入
|
||||
target_para = None
|
||||
for i, p in enumerate(doc.paragraphs):
|
||||
if "调研日期" in p.text or re.match(r'.*\d{4}年\d{1,2}月\d{1,2}日.*', p.text):
|
||||
if i + 1 < len(doc.paragraphs):
|
||||
target_para = doc.paragraphs[i+1]
|
||||
break
|
||||
|
||||
if target_para:
|
||||
toc_title = target_para.insert_paragraph_before("目录")
|
||||
toc_title.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||||
for run in toc_title.runs:
|
||||
run.font.bold = True
|
||||
run.font.size = Pt(16)
|
||||
run.font.color.rgb = RGBColor(0, 0, 0)
|
||||
|
||||
toc_field = target_para.insert_paragraph_before()
|
||||
add_toc_field(toc_field)
|
||||
|
||||
# 最后一遍:修复 First Paragraph 样式的对齐(pandoc 默认居中)
|
||||
for paragraph in doc.paragraphs:
|
||||
style_name = paragraph.style.name if paragraph.style else ""
|
||||
if style_name == "First Paragraph":
|
||||
paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT
|
||||
|
||||
# 处理表格
|
||||
for table in doc.tables:
|
||||
set_table_border(table)
|
||||
keep_table_together(table)
|
||||
table.alignment = WD_TABLE_ALIGNMENT.CENTER
|
||||
|
||||
doc.save(output_path)
|
||||
print(f"格式化完成: {output_path}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if len(sys.argv) < 2:
|
||||
print("用法: python format_docx.py <input.docx> [output.docx]")
|
||||
sys.exit(1)
|
||||
|
||||
input_file = sys.argv[1]
|
||||
output_file = sys.argv[2] if len(sys.argv) > 2 else None
|
||||
|
||||
format_docx(input_file, output_file)
|
||||
Reference in New Issue
Block a user