# -*- coding: utf-8 -*-
"""
File reading utility for Office Open XML files (docx, xlsx).

The files are read directly from their zip container with the standard
library, and extracted text is written to UTF-8 files so that Chinese text
survives consoles with a different default encoding.
"""

import sys
import os
import zipfile
import re
import posixpath
import xml.etree.ElementTree as ET

# Output locations. Kept as forward-slash literals because they are echoed
# verbatim in the messages returned to the caller.
_DOCX_OUTPUT = "temp/docx_output.txt"
_XLSX_OUTPUT = "temp/xlsx_output.txt"

_WORKBOOK_PART = "xl/workbook.xml"
_WORKBOOK_RELS_PART = "xl/_rels/workbook.xml.rels"
_SHARED_STRINGS_PART = "xl/sharedStrings.xml"


def _local(tag):
    """Return an element/attribute name without its ``{namespace}`` prefix."""
    return tag.rsplit("}", 1)[-1] if isinstance(tag, str) else ""


def _parse_xml(raw):
    """Parse XML bytes, retrying as GBK when the declared encoding is wrong.

    Some producers write GBK bytes under a UTF-8 declaration; those bytes are
    rejected by the parser, so they are decoded as GBK and parsed as text.
    """
    try:
        return ET.fromstring(raw)
    except ET.ParseError:
        return ET.fromstring(raw.decode("gbk", errors="ignore"))


def _save_text(path, text):
    """Write *text* to *path* as UTF-8, creating the parent directory."""
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        f.write(text)


def _docx_lines(root):
    """Collect text runs, then table rows, from a parsed ``document.xml``."""
    # One line per non-blank text run, in document order.
    lines = [
        el.text
        for el in root.iter()
        if _local(el.tag) == "t" and el.text and el.text.strip()
    ]
    # Tables are appended afterwards, one " | "-joined line per row.
    for table in root.iter():
        if _local(table.tag) != "tbl":
            continue
        for row in table:
            if _local(row.tag) != "tr":
                continue
            cells = []
            for cell in row:
                if _local(cell.tag) != "tc":
                    continue
                runs = [el.text or "" for el in cell.iter() if _local(el.tag) == "t"]
                if runs:  # cells without any text run are skipped
                    cells.append("".join(runs).strip())
            if cells:
                lines.append(" | ".join(cells))
    return lines


def _read_docx_text(file_path):
    """Return the extracted text of a docx file; raises on any failure."""
    with zipfile.ZipFile(file_path, "r") as z:
        raw = z.read("word/document.xml")
    return "\n".join(_docx_lines(_parse_xml(raw)))


def read_docx(file_path):
    """Read the text content of a docx file.

    Args:
        file_path: Path to the .docx file.

    Returns:
        The extracted text (text runs first, then table rows joined with
        " | "), or a string starting with "Error reading docx:" on failure.
    """
    try:
        return _read_docx_text(file_path)
    except Exception as e:  # boundary: callers expect an error string, not a raise
        return f"Error reading docx: {e}"


def read_docx_and_save(file_path):
    """Read a docx file and also save its text to ``temp/docx_output.txt``.

    Args:
        file_path: Path to the .docx file.

    Returns:
        A notice naming the output file followed by the extracted text, or a
        string starting with "Error:" on failure.
    """
    try:
        output = _read_docx_text(file_path)
        _save_text(_DOCX_OUTPUT, output)
        return f"[内容已保存到 {_DOCX_OUTPUT}]\n\n{output}"
    except Exception as e:  # boundary: callers expect an error string, not a raise
        return f"Error: {e}"


def _rich_text(node):
    """Join the text of an ``<si>``/``<is>`` node, ignoring phonetic runs."""
    parts = []
    for child in node:
        name = _local(child.tag)
        if name == "t":
            parts.append(child.text or "")
        elif name == "r":
            parts.extend(t.text or "" for t in child if _local(t.tag) == "t")
    return "".join(parts)


def _shared_strings(z):
    """Return the workbook's shared strings, one entry per ``<si>`` item."""
    if _SHARED_STRINGS_PART not in z.namelist():
        return []
    root = _parse_xml(z.read(_SHARED_STRINGS_PART))
    return [_rich_text(si) for si in root if _local(si.tag) == "si"]


def _sheet_sort_key(path):
    """Order ``sheetN.xml`` parts numerically (sheet2 before sheet10)."""
    match = re.search(r"(\d+)\.xml$", path)
    return (int(match.group(1)) if match else 0, path)


def _sheet_parts(z):
    """Return ``[(sheet_name, part_path), ...]`` in workbook order.

    Sheets are resolved through ``xl/workbook.xml`` and its relationships so
    that indices match the order shown by ``list_sheets``. When that
    information is missing, the ``xl/worksheets/sheet*.xml`` parts are used
    in numeric order.
    """
    members = set(z.namelist())
    if _WORKBOOK_PART in members and _WORKBOOK_RELS_PART in members:
        targets = {}
        for rel in _parse_xml(z.read(_WORKBOOK_RELS_PART)).iter():
            if _local(rel.tag) != "Relationship":
                continue
            target = rel.get("Target", "")
            if target.startswith("/"):
                part = target.lstrip("/")
            else:
                part = posixpath.normpath(posixpath.join("xl", target))
            targets[rel.get("Id")] = part

        sheets = []
        for group in _parse_xml(z.read(_WORKBOOK_PART)):
            if _local(group.tag) != "sheets":
                continue
            for sheet in group:
                if _local(sheet.tag) != "sheet":
                    continue
                # The relationship id is the namespaced ``r:id`` attribute.
                rel_id = next(
                    (v for k, v in sheet.attrib.items() if _local(k) == "id"), None
                )
                part = targets.get(rel_id)
                if part in members:
                    sheets.append((sheet.get("name", ""), part))
        if sheets:
            return sheets

    files = sorted(
        (
            m
            for m in members
            if m.startswith("xl/worksheets/sheet") and m.endswith(".xml")
        ),
        key=_sheet_sort_key,
    )
    return [(posixpath.splitext(posixpath.basename(f))[0], f) for f in files]


def _cell_value(cell, strings):
    """Return the display text of one ``<c>`` cell ("" when it has no value)."""
    for child in cell:
        if _local(child.tag) == "is":  # inline string
            return _rich_text(child)

    raw = next((c.text or "" for c in cell if _local(c.tag) == "v"), None)
    if raw is None:
        return ""
    if cell.get("t") == "s" and raw.isdigit():
        idx = int(raw)
        return strings[idx] if idx < len(strings) else f"[str:{raw}]"
    try:
        # Normalise numbers: integers stay integers, "1.50" becomes "1.5".
        return str(int(raw)) if "." not in raw else str(float(raw))
    except ValueError:
        return raw


def read_xlsx(file_path, sheet_index=0, max_rows=20):
    """Read one sheet of an xlsx file and save it to ``temp/xlsx_output.txt``.

    Args:
        file_path: Path to the .xlsx file.
        sheet_index: Zero-based sheet index, in the order shown by
            ``list_sheets``.
        max_rows: Maximum number of rows (from the top of the sheet) to read.

    Returns:
        A notice naming the output file followed by the sheet text (one line
        per non-empty row, cells joined with " | "); "Sheet index N out of
        range" for an index past the last sheet; or a string starting with
        "Error:" on failure.
    """
    try:
        with zipfile.ZipFile(file_path, "r") as z:
            strings = _shared_strings(z)
            parts = _sheet_parts(z)
            if sheet_index >= len(parts):
                return f"Sheet index {sheet_index} out of range"
            sheet_root = _parse_xml(z.read(parts[sheet_index][1]))

        result = [f"=== Sheet {sheet_index} ==="]
        rows = [el for el in sheet_root.iter() if _local(el.tag) == "row"]
        for row in rows[:max_rows]:
            values = [
                _cell_value(cell, strings) for cell in row if _local(cell.tag) == "c"
            ]
            values = [v for v in values if v]  # empty cells are omitted
            if values:
                result.append(" | ".join(values))

        output = "\n".join(result)
        # Written to a file so Chinese text is preserved regardless of console encoding.
        _save_text(_XLSX_OUTPUT, output)
        return f"[内容已保存到 {_XLSX_OUTPUT}]\n\n{output}"
    except Exception as e:  # boundary: callers expect an error string, not a raise
        return f"Error: {e}"


def list_sheets(file_path):
    """List all sheets of an xlsx file with their zero-based indices.

    Uses only the standard library; the indices are the ones ``read_xlsx``
    accepts.

    Returns:
        A header line followed by "index: name" lines, or a string starting
        with "Error:" on failure.
    """
    try:
        with zipfile.ZipFile(file_path, "r") as z:
            names = [name for name, _ in _sheet_parts(z)]
        return "可用Sheets:\n" + "\n".join(f"{i}: {s}" for i, s in enumerate(names))
    except Exception as e:  # boundary: callers expect an error string, not a raise
        return f"Error: {e}"


def main(argv=None):
    """Command-line entry point; returns the process exit code."""
    args = sys.argv if argv is None else argv
    if len(args) < 2:
        print("用法:")
        print("  python file_reader.py <file.docx>")
        print("  python file_reader.py <file.xlsx>                          # 列出所有sheets")
        print("  python file_reader.py <file.xlsx> <sheet_index>            # 读取指定sheet")
        print("  python file_reader.py <file.xlsx> <sheet_index> <max_rows>")
        print("\n示例:")
        print("  python file_reader.py test.xlsx           # 列出sheets")
        print("  python file_reader.py test.xlsx 1        # 读取第2个sheet")
        print("  python file_reader.py test.xlsx 1 30     # 读取前30行")
        return 1

    file_path = args[1]
    if not os.path.exists(file_path):
        print(f"文件不存在: {file_path}")
        return 1

    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".docx":
        print(read_docx_and_save(file_path))
    elif ext in (".xlsx", ".xls"):
        # NOTE: legacy binary .xls files are not zip archives; they are
        # reported through the "Error: ..." string of the reader functions.
        if len(args) == 2:
            print(list_sheets(file_path))
        else:
            try:
                sheet_index = int(args[2])
                max_rows = int(args[3]) if len(args) > 3 else 20
            except ValueError:
                print("sheet_index 和 max_rows 必须是整数")
                return 1
            print(read_xlsx(file_path, sheet_index, max_rows))
    else:
        print(f"不支持的文件类型: {ext}")
    return 0


if __name__ == "__main__":
    sys.exit(main())