# -*- coding: utf-8 -*-
"""
文件读取工具 - 支持docx、xlsx、xls等二进制文件读取
解决中文编码问题
"""
import sys
import os
import zipfile
import re
import pandas as pd
def read_docx_and_save(file_path):
    """Extract the text of a .docx file and save it as a UTF-8 text file.

    The document body (``word/document.xml``) is scanned with regular
    expressions rather than a full XML parser:

    * every non-blank ``<w:t>`` text run becomes one output line, in
      document order;
    * every table row is then appended as one line, with the text of its
      cells joined by ``" | "`` (cells without any text run are skipped).

    NOTE: text inside a table therefore appears twice -- once as individual
    runs and once as part of its formatted row.  Nested tables are not
    handled by the non-greedy table pattern.

    The result is written to ``temp/docx_output.txt`` relative to the
    current working directory.

    Args:
        file_path: Path to the .docx file.

    Returns:
        A notice naming the output file followed by the extracted text, or
        ``"Error: <message>"`` if anything fails (no exception propagates).
    """
    import html  # local import: only needed to decode XML entities

    # ``\b`` keeps ``<w:t`` from matching ``<w:tbl``/``<w:tc``/``<w:tab`` and
    # keeps ``<w:tr``/``<w:tc``/``<w:tbl`` from matching their ``...Pr`` tags.
    run_pattern = re.compile(r"<w:t\b[^>]*>([^<]*)</w:t>")
    table_pattern = re.compile(r"<w:tbl\b[^>]*>(.*?)</w:tbl>", re.DOTALL)
    row_pattern = re.compile(r"<w:tr\b[^>]*>(.*?)</w:tr>", re.DOTALL)
    cell_pattern = re.compile(r"<w:tc\b[^>]*>(.*?)</w:tc>", re.DOTALL)

    try:
        with zipfile.ZipFile(file_path, "r") as archive:
            raw = archive.read("word/document.xml")

        # Try UTF-8 first, then fall back to a lossy GBK decode. The GBK
        # decode uses errors="ignore" and therefore cannot fail.
        try:
            xml_text = raw.decode("utf-8")
        except UnicodeDecodeError:
            xml_text = raw.decode("gbk", errors="ignore")

        content_lines = []

        # Text runs, one per line.
        for run in run_pattern.findall(xml_text):
            run_text = html.unescape(run)
            if run_text.strip():
                content_lines.append(run_text)

        # Tables, one row per line.
        for table in table_pattern.findall(xml_text):
            for row in row_pattern.findall(table):
                row_data = []
                for cell in cell_pattern.findall(row):
                    runs = run_pattern.findall(cell)
                    if runs:
                        cell_text = "".join(html.unescape(r) for r in runs)
                        row_data.append(cell_text.strip())
                if row_data:
                    content_lines.append(" | ".join(row_data))

        output = "\n".join(content_lines)

        # Persist as UTF-8 so the text survives consoles with other encodings.
        output_file = "temp/docx_output.txt"
        os.makedirs("temp", exist_ok=True)
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(output)

        return f"[内容已保存到 temp/docx_output.txt]\n\n{output}"
    except Exception as e:
        # Top-level boundary: callers expect an error string, not a raise.
        return f"Error: {e}"
def read_xlsx(file_path, sheet_index=0, max_rows=20):
    """Read one worksheet of an .xlsx file and save it as UTF-8 text.

    The workbook is opened as a zip archive and its XML parts are scanned
    with regular expressions (no Excel library is required).  Each of the
    first ``max_rows`` rows that has at least one non-empty cell becomes one
    output line, with the cell values joined by ``" | "``.  Empty cells are
    skipped, so columns are not aligned between rows.

    The result is written to ``temp/xlsx_output.txt`` relative to the
    current working directory.

    Args:
        file_path: Path to the .xlsx file.
        sheet_index: Zero-based position of the worksheet among the
            ``xl/worksheets/sheet*.xml`` archive members, in archive order.
            NOTE(review): this may not match the tab order that
            ``pandas.ExcelFile.sheet_names`` reports -- confirm for
            workbooks whose sheets were reordered.
        max_rows: Maximum number of ``<row>`` elements to inspect.

    Returns:
        A notice naming the output file followed by the extracted text,
        ``"Sheet index N out of range"`` when the index is too large, or
        ``"Error: <message>"`` if anything fails (no exception propagates).
    """
    import html  # local import: only needed to decode XML entities

    def _text_runs(fragment):
        """Return the concatenated, entity-decoded ``<t>`` text in *fragment*."""
        # Phonetic hints (<rPh>) repeat the visible text; drop them first.
        visible = re.sub(r"<rPh\b.*?</rPh>", "", fragment, flags=re.DOTALL)
        return "".join(
            html.unescape(piece)
            for piece in re.findall(r"<t\b[^>]*>([^<]*)</t>", visible)
        )

    def _cell_value(attrs, body, shared_strings):
        """Convert one ``<c>`` element (attribute text + inner XML) to text."""
        # Inline strings carry their text directly in <is><t>...</t></is>.
        inline_text = _text_runs(body)
        if inline_text:
            return inline_text

        v_match = re.search(r"<v\b[^>]*>([^<]*)</v>", body)
        if v_match is None:
            return ""
        raw = v_match.group(1)

        # The cell type lives on the opening <c> tag, not in its content.
        type_match = re.search(r"""\bt=["']([^"']*)["']""", attrs)
        cell_type = type_match.group(1) if type_match else "n"

        if cell_type == "s":
            # Shared string: <v> holds an index into sharedStrings.xml.
            if raw.isdigit() and int(raw) < len(shared_strings):
                return shared_strings[int(raw)]
            return f"[str:{raw}]"
        if cell_type == "n":
            try:
                return str(int(raw)) if "." not in raw else str(float(raw))
            except ValueError:
                return raw
        # Formula strings, booleans, errors, ...: keep the stored text.
        return html.unescape(raw)

    try:
        with zipfile.ZipFile(file_path, "r") as archive:
            members = archive.namelist()

            # One entry per <si>, so rich-text items with several <t> runs
            # do not shift the indices of the strings that follow them.
            shared_strings = []
            if "xl/sharedStrings.xml" in members:
                ss_xml = archive.read("xl/sharedStrings.xml").decode(
                    "utf-8", errors="ignore"
                )
                shared_strings = [
                    _text_runs(si_body)
                    for si_body in re.findall(
                        r"<si\b[^>]*?(?:/>|>(.*?)</si>)", ss_xml, re.DOTALL
                    )
                ]

            sheet_files = [
                name
                for name in members
                if name.startswith("xl/worksheets/sheet") and name.endswith(".xml")
            ]
            if sheet_index >= len(sheet_files):
                return f"Sheet index {sheet_index} out of range"

            sheet_xml = archive.read(sheet_files[sheet_index]).decode(
                "utf-8", errors="ignore"
            )

        # The ``/>`` alternative keeps self-closing rows and cells from
        # swallowing the element that follows them.
        row_bodies = re.findall(
            r"<row\b[^>]*?(?:/>|>(.*?)</row>)", sheet_xml, re.DOTALL
        )

        result = [f"=== Sheet {sheet_index} ==="]
        for row_body in row_bodies[:max_rows]:
            row_data = []
            for attrs, cell_body in re.findall(
                r"<c\b([^>]*?)(?:/>|>(.*?)</c>)", row_body, re.DOTALL
            ):
                val = _cell_value(attrs, cell_body, shared_strings)
                if val:
                    row_data.append(val)
            if row_data:
                result.append(" | ".join(row_data))

        output = "\n".join(result)

        # Persist as UTF-8 so the text survives consoles with other encodings.
        output_file = "temp/xlsx_output.txt"
        os.makedirs("temp", exist_ok=True)
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(output)

        return f"[内容已保存到 temp/xlsx_output.txt]\n\n{output}"
    except Exception as e:
        # Top-level boundary: callers expect an error string, not a raise.
        return f"Error: {e}"
def _xls_frame_to_lines(df, header):
    """Render *df* as text lines: header, column names, then one line per row."""
    lines = [header, " | ".join(str(col) for col in df.columns)]
    for _, row in df.iterrows():
        # Missing values are rendered as empty fields to keep columns aligned.
        lines.append(" | ".join("" if pd.isna(val) else str(val) for val in row))
    return lines


def _save_xls_output(lines):
    """Write *lines* to temp/xls_output.txt (UTF-8) and build the return text."""
    output = "\n".join(lines)
    output_file = "temp/xls_output.txt"
    os.makedirs("temp", exist_ok=True)
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(output)
    return f"[内容已保存到 temp/xls_output.txt]\n\n{output}"


def read_xls(file_path, sheet_index=0, max_rows=20):
    """Read an .xls file and save its first rows as UTF-8 text.

    The file is first read with ``pandas.read_excel``.  If that fails for any
    reason, the file is retried as tab-separated text (some ".xls" exports
    are really TSV) using the encodings ``gbk``, ``utf-8`` and ``latin1`` in
    that order; the first encoding that parses and saves successfully wins.

    The result is written to ``temp/xls_output.txt`` relative to the current
    working directory.

    Args:
        file_path: Path to the .xls file.
        sheet_index: Sheet passed to ``read_excel`` as ``sheet_name``; only
            echoed in the header when the tab-separated fallback is used.
        max_rows: Maximum number of data rows to read.

    Returns:
        A notice naming the output file followed by the extracted text, or
        ``"Error reading xls file: <message>"`` carrying the original
        ``read_excel`` failure when every fallback also fails.
    """
    try:
        df = pd.read_excel(file_path, sheet_name=sheet_index, nrows=max_rows)
        return _save_xls_output(
            _xls_frame_to_lines(df, f"=== Sheet {sheet_index} ===")
        )
    except Exception as exc:
        # Keep the failure: it is reported if the fallback does not help.
        excel_error = exc

    for encoding in ("gbk", "utf-8", "latin1"):
        try:
            df = pd.read_csv(file_path, encoding=encoding, sep="\t", nrows=max_rows)
            return _save_xls_output(
                _xls_frame_to_lines(df, f"=== Sheet {sheet_index} (CSV format) ===")
            )
        except Exception:
            # Deliberate best effort: any failure just means "try the next
            # encoding".  Unlike a bare ``except:``, this lets
            # KeyboardInterrupt and SystemExit propagate.
            continue

    return f"Error reading xls file: {excel_error}"
def list_sheets(file_path):
    """List the sheet names of an Excel workbook with their indices.

    Args:
        file_path: Path to the workbook; opened with ``pandas.ExcelFile``.

    Returns:
        A header line followed by one ``"<index>: <name>"`` line per sheet,
        or ``"Error: <message>"`` if the workbook cannot be opened (no
        exception propagates).
    """
    try:
        # The context manager closes the underlying file handle; the
        # original left the ExcelFile open.
        with pd.ExcelFile(file_path) as excel_file:
            sheets = excel_file.sheet_names
        return "可用Sheets:\n" + "\n".join(f"{i}: {s}" for i, s in enumerate(sheets))
    except Exception as e:
        # Top-level boundary: callers expect an error string, not a raise.
        return f"Error: {e}"
def _print_usage():
    """Print the command-line usage text, including argument placeholders."""
    print("用法:")
    print("  python file_reader.py <file.docx>")
    print("  python file_reader.py <file.xlsx>  # 列出所有sheets")
    print("  python file_reader.py <file.xlsx> <sheet_index>  # 读取指定sheet")
    print("  python file_reader.py <file.xlsx> <sheet_index> <max_rows>")
    print("  python file_reader.py <file.xls>  # 列出所有sheets")
    print("  python file_reader.py <file.xls> <sheet_index>  # 读取指定sheet")
    print("\n示例:")
    print("  python file_reader.py test.xlsx  # 列出sheets")
    print("  python file_reader.py test.xlsx 1  # 读取第2个sheet")
    print("  python file_reader.py test.xlsx 1 30  # 读取前30行")
    print("  python file_reader.py test.xls  # 读取xls文件")


def main(argv):
    """Run the command-line interface.

    Args:
        argv: Full argument vector, i.e. ``sys.argv`` (program name first).

    Returns:
        Process exit status: 0 when a reader was dispatched (including an
        unsupported-extension message, as before), 1 for missing arguments,
        a missing file, or non-integer numeric arguments.
    """
    if len(argv) < 2:
        _print_usage()
        return 1

    file_path = argv[1]
    if not os.path.exists(file_path):
        print(f"文件不存在: {file_path}")
        return 1

    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".docx":
        print(read_docx_and_save(file_path))
        return 0

    # Both spreadsheet formats share one CLI shape: no extra argument lists
    # the sheets, otherwise <sheet_index> [max_rows] selects what to read.
    readers = {".xlsx": read_xlsx, ".xls": read_xls}
    reader = readers.get(ext)
    if reader is None:
        print(f"不支持的文件类型: {ext}")
        return 0

    if len(argv) == 2:
        print(list_sheets(file_path))
        return 0

    try:
        sheet_index = int(argv[2])
        max_rows = int(argv[3]) if len(argv) > 3 else 20
    except ValueError:
        # Report bad numeric arguments instead of crashing with a traceback.
        print(f"sheet_index 和 max_rows 必须是整数: {' '.join(argv[2:4])}")
        return 1

    print(reader(file_path, sheet_index, max_rows))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))