# -*- coding: utf-8 -*-
"""File reading helpers for binary office documents (docx, xlsx, xls).

Every reader extracts the textual content of a document, writes it to a
UTF-8 encoded file below ``temp/`` (relative to the current working
directory) and returns the same text prefixed with a note that says where it
was saved. Errors are reported through the returned string (``"Error: ..."``)
instead of being raised.
"""

import sys
import os
import zipfile
import re
import xml.etree.ElementTree as ET

import pandas as pd

# Directory (relative to the current working directory) that receives the
# extracted text files.
OUTPUT_DIR = "temp"


def _save_output(file_name, text):
    """Write *text* to ``temp/<file_name>`` as UTF-8 and return the report string."""
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    with open(os.path.join(OUTPUT_DIR, file_name), "w", encoding="utf-8") as f:
        f.write(text)
    return f"[内容已保存到 {OUTPUT_DIR}/{file_name}]\n\n{text}"


def _local_name(tag):
    """Return an element tag without its ``{namespace}`` prefix."""
    return tag.rsplit("}", 1)[-1] if isinstance(tag, str) else ""


def _children(element, name):
    """Return the direct children of *element* whose local tag name is *name*."""
    return [child for child in element if _local_name(child.tag) == name]


def _descendants(element, name):
    """Return all descendants of *element* whose local tag name is *name*."""
    return [node for node in element.iter() if _local_name(node.tag) == name]


def _joined_text(element):
    """Concatenate the text of every ``t`` element found below *element*."""
    return "".join(node.text or "" for node in _descendants(element, "t"))


def _collect_docx_lines(container, lines):
    """Append paragraph and table-row lines found below *container* to *lines*.

    Paragraphs and tables are visited in document order. A table is rendered
    one row per line with its non-empty cells joined by ``" | "``; the
    paragraphs inside a table are not emitted a second time on their own.
    """
    for child in container:
        name = _local_name(child.tag)
        if name == "p":
            text = _joined_text(child)
            if text.strip():
                lines.append(text)
        elif name == "tbl":
            for row in _children(child, "tr"):
                cells = [_joined_text(cell).strip() for cell in _children(row, "tc")]
                cells = [cell for cell in cells if cell]
                if cells:
                    lines.append(" | ".join(cells))
        else:
            # Wrapper elements (body, content controls, ...) may hold further
            # paragraphs or tables.
            _collect_docx_lines(child, lines)


def read_docx_and_save(file_path):
    """Extract the text of a docx file and save it as a UTF-8 text file.

    Args:
        file_path: Path of the ``.docx`` file.

    Returns:
        A note naming ``temp/docx_output.txt`` followed by the extracted text
        (one line per non-empty paragraph or table row), or ``"Error: ..."``
        when the file cannot be read or parsed.
    """
    try:
        with zipfile.ZipFile(file_path, "r") as archive:
            raw_xml = archive.read("word/document.xml")
        # The parser is given the raw bytes so that it honours the encoding
        # declared by the document and decodes entities such as ``&amp;``.
        # NOTE: ElementTree is not hardened against maliciously constructed
        # XML; only feed it documents from trusted sources.
        root = ET.fromstring(raw_xml)
        lines = []
        _collect_docx_lines(root, lines)
        return _save_output("docx_output.txt", "\n".join(lines))
    except Exception as e:
        return f"Error: {e}"


def _load_shared_strings(archive):
    """Return the shared-string table of an xlsx archive as a list.

    There is exactly one entry per ``si`` element, so a rich-text entry made
    of several runs still occupies a single index. Phonetic runs are skipped
    because only direct ``t`` children and ``r/t`` children are read.
    """
    if "xl/sharedStrings.xml" not in archive.namelist():
        return []
    root = ET.fromstring(archive.read("xl/sharedStrings.xml"))
    strings = []
    for item in _children(root, "si"):
        parts = [node.text or "" for node in _children(item, "t")]
        for run in _children(item, "r"):
            parts.extend(node.text or "" for node in _children(run, "t"))
        strings.append("".join(parts))
    return strings


def _sheet_sort_key(name):
    """Sort key ordering ``sheet2.xml`` before ``sheet10.xml``."""
    match = re.search(r"(\d+)\.xml$", name)
    return (int(match.group(1)) if match else 0, name)


def _cell_value(cell, shared_strings):
    """Return the display text of one worksheet ``c`` element ("" if empty)."""
    inline = _children(cell, "is")
    if inline:
        return _joined_text(inline[0])
    value_nodes = _children(cell, "v")
    if not value_nodes:
        return ""
    raw = value_nodes[0].text or ""
    if cell.get("t") == "s" and raw.isdigit():
        index = int(raw)
        if index < len(shared_strings):
            return shared_strings[index]
        return f"[str:{raw}]"
    try:
        # Normalise plain numbers ("1.50" -> "1.5"); anything else, such as
        # formula strings or exponent notation without a dot, is kept as is.
        return str(int(raw)) if "." not in raw else str(float(raw))
    except ValueError:
        return raw


def read_xlsx(file_path, sheet_index=0, max_rows=20):
    """Read one worksheet of an xlsx file and save it as a UTF-8 text file.

    Args:
        file_path: Path of the ``.xlsx`` file.
        sheet_index: Position of the worksheet among the
            ``xl/worksheets/sheet*.xml`` parts, ordered by the number in the
            part name (presumably the workbook order; the workbook
            relationships are not consulted).
        max_rows: Maximum number of worksheet rows to read.

    Returns:
        A note naming ``temp/xlsx_output.txt`` followed by the rows, each
        rendered as its non-empty cells joined by ``" | "``; a
        ``"Sheet index N out of range"`` message; or ``"Error: ..."``.
    """
    try:
        with zipfile.ZipFile(file_path, "r") as archive:
            shared_strings = _load_shared_strings(archive)
            sheet_files = sorted(
                (
                    name
                    for name in archive.namelist()
                    if name.startswith("xl/worksheets/sheet") and name.endswith(".xml")
                ),
                key=_sheet_sort_key,
            )
            if not -len(sheet_files) <= sheet_index < len(sheet_files):
                return f"Sheet index {sheet_index} out of range"
            root = ET.fromstring(archive.read(sheet_files[sheet_index]))

        lines = [f"=== Sheet {sheet_index} ==="]
        for row in _descendants(root, "row")[:max_rows]:
            values = [_cell_value(cell, shared_strings) for cell in _children(row, "c")]
            values = [value for value in values if value]
            if values:
                lines.append(" | ".join(values))
        return _save_output("xlsx_output.txt", "\n".join(lines))
    except Exception as e:
        return f"Error: {e}"


def _frame_to_text(df, title):
    """Render *df* as text: *title*, the column names, then one line per row."""
    lines = [title, " | ".join(str(col) for col in df.columns)]
    for _, row in df.iterrows():
        lines.append(" | ".join("" if pd.isna(val) else str(val) for val in row))
    return "\n".join(lines)


def read_xls(file_path, sheet_index=0, max_rows=20):
    """Read one sheet of an xls file and save it as a UTF-8 text file.

    The file is first read with ``pandas.read_excel``. If that fails, it is
    retried as tab-separated text with the encodings gbk, utf-8 and latin1,
    in that order.

    Args:
        file_path: Path of the ``.xls`` file.
        sheet_index: Sheet passed to ``pandas.read_excel`` as ``sheet_name``.
        max_rows: Maximum number of data rows to read.

    Returns:
        A note naming ``temp/xls_output.txt`` followed by the rendered table,
        or an error message when every attempt failed.
    """
    try:
        df = pd.read_excel(file_path, sheet_name=sheet_index, nrows=max_rows)
        return _save_output(
            "xls_output.txt", _frame_to_text(df, f"=== Sheet {sheet_index} ===")
        )
    except Exception as e:
        try:
            for encoding in ("gbk", "utf-8", "latin1"):
                try:
                    df = pd.read_csv(
                        file_path, encoding=encoding, sep="\t", nrows=max_rows
                    )
                except Exception:
                    # Wrong encoding or not tabular text: try the next one.
                    continue
                return _save_output(
                    "xls_output.txt",
                    _frame_to_text(df, f"=== Sheet {sheet_index} (CSV format) ==="),
                )
            return f"Error reading xls file: {e}"
        except Exception as e2:
            return f"Error: {e} | {e2}"


def list_sheets(file_path):
    """Return a text listing of the sheet names of an Excel workbook.

    Args:
        file_path: Path of the workbook.

    Returns:
        A header line followed by one ``index: name`` line per sheet, or
        ``"Error: ..."`` when the workbook cannot be opened.
    """
    try:
        # The context manager closes the underlying file handle.
        with pd.ExcelFile(file_path) as excel_file:
            sheets = excel_file.sheet_names
        return "可用Sheets:\n" + "\n".join(f"{i}: {s}" for i, s in enumerate(sheets))
    except Exception as e:
        return f"Error: {e}"


def _print_usage():
    """Print the command-line help text."""
    print("用法:")
    print("  python file_reader.py <file.docx>")
    print("  python file_reader.py <file.xlsx>                          # 列出所有sheets")
    print("  python file_reader.py <file.xlsx> <sheet_index> [max_rows]  # 读取指定sheet")
    print("  python file_reader.py <file.xls>                           # 列出所有sheets")
    print("  python file_reader.py <file.xls> <sheet_index> [max_rows]   # 读取指定sheet")
    print("\n示例:")
    print("  python file_reader.py test.xlsx          # 列出sheets")
    print("  python file_reader.py test.xlsx 1        # 读取第2个sheet")
    print("  python file_reader.py test.xlsx 1 30     # 读取前30行")
    print("  python file_reader.py test.xls 0         # 读取xls文件")


def main(argv=None):
    """Command-line entry point; *argv* defaults to ``sys.argv``."""
    args = sys.argv if argv is None else argv
    if len(args) < 2:
        _print_usage()
        sys.exit(1)

    file_path = args[1]
    if not os.path.exists(file_path):
        print(f"文件不存在: {file_path}")
        sys.exit(1)

    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".docx":
        print(read_docx_and_save(file_path))
    elif ext in (".xlsx", ".xls"):
        if len(args) == 2:
            print(list_sheets(file_path))
            return
        try:
            sheet_index = int(args[2])
            max_rows = int(args[3]) if len(args) > 3 else 20
        except ValueError:
            # Report bad numeric arguments instead of crashing with a traceback.
            print("sheet_index 和 max_rows 必须是整数")
            sys.exit(1)
        reader = read_xlsx if ext == ".xlsx" else read_xls
        print(reader(file_path, sheet_index, max_rows))
    else:
        print(f"不支持的文件类型: {ext}")


if __name__ == "__main__":
    main()