04db423416
- 70 skills with code and documentation - Add .gitignore (ignore __pycache__, output/, temp/, venv/) - Clean up test intermediates and caches
87 lines
3.1 KiB
Python
87 lines
3.1 KiB
Python
import pandas as pd
|
||
import chardet
|
||
import os
|
||
|
||
|
||
# Magic bytes that open every OLE2 compound document, the container format
# used by real binary .xls workbooks.
_OLE2_MAGIC = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"
# BIFF8 "beginning of file" record; only appears at offset 0 for a bare
# workbook stream that is not wrapped in an OLE2 container.
_BIFF8_BOF = b"\x09\x08\x10\x00\x00\x06\x05\x00"
# Number of leading bytes sampled from a text file for encoding detection.
_TEXT_SAMPLE_SIZE = 10000


def _report_xls(file_path):
    """Print sheet names and dimensions of a binary .xls workbook using xlrd."""
    try:
        import xlrd  # optional dependency, only needed for binary workbooks

        workbook = xlrd.open_workbook(file_path, encoding_override="gbk")
        sheets = workbook.sheets()
        print(f"工作表数量: {len(sheets)}")
        for i, sheet in enumerate(sheets):
            print(f" 表{i}: {sheet.name} ({sheet.nrows}行, {sheet.ncols}列)")
    except Exception as e:
        # Covers a missing xlrd install as well as unreadable workbooks;
        # the reason is reported instead of being silently dropped.
        print(f"使用xlrd读取失败: {e}")


def _report_xlsx(file_path):
    """Print sheet names and dimensions of an .xlsx workbook using pandas."""
    try:
        sheets = pd.read_excel(file_path, sheet_name=None)
        print(f"工作表数量: {len(sheets)}")
        for sheet_name, sheet_df in sheets.items():
            print(f" 表: {sheet_name} ({len(sheet_df)}行, {len(sheet_df.columns)}列)")
    except Exception as e:
        print(f"读取.xlsx失败: {e}")


def _inspect_excel(file_path, ext):
    """Sniff the file header and dispatch to the matching workbook reader."""
    with open(file_path, "rb") as f:
        header = f.read(512)

    # The content decides first: a file with a binary-workbook signature is
    # handed to xlrd whatever its extension says.
    looks_like_binary_xls = (
        header.startswith(_OLE2_MAGIC)
        or _BIFF8_BOF in header
        or b"Workbook" in header
    )
    if looks_like_binary_xls:
        print("确认是.xls (二进制) 格式")
        _report_xls(file_path)
    elif ext == ".xlsx":
        print("检测到.xlsx格式")
        _report_xlsx(file_path)
    else:
        # A .xls extension without a workbook signature (for example text or
        # HTML saved as .xls): say so rather than printing nothing.
        print(f"未识别的Excel文件头: {header[:8]!r}")


def _inspect_text(file_path):
    """Detect the encoding of a text file and print a preview of its first lines."""
    import codecs

    with open(file_path, "rb") as f:
        raw_data = f.read(_TEXT_SAMPLE_SIZE)

    encoding_result = chardet.detect(raw_data)
    encoding = encoding_result["encoding"]
    print(f"检测到编码: {encoding} (置信度: {encoding_result['confidence']:.2f})")

    if encoding is None:
        # chardet could not decide (e.g. empty file); there is nothing to decode.
        print("解码失败: 未能检测到编码")
        return

    try:
        # The sample may end in the middle of a multi-byte character. An
        # incremental decoder with final=False buffers that incomplete tail
        # instead of raising, while still rejecting genuinely invalid bytes.
        sample_is_whole_file = len(raw_data) < _TEXT_SAMPLE_SIZE
        decoder = codecs.getincrementaldecoder(encoding)()
        decoded_content = decoder.decode(raw_data, final=sample_is_whole_file)

        print("前几行内容:")
        for i, line in enumerate(decoded_content.split("\n")[:10]):
            if line.strip():
                print(f" {i + 1}: {line[:100]}{'...' if len(line) > 100 else ''}")
    except Exception as e:
        print(f"解码失败: {e}")


def check_file_format(file_path):
    """Detect a file's format and encoding and print a short report.

    Files ending in ``.xls`` or ``.xlsx`` are inspected as workbooks: the
    first 512 bytes are checked for a binary-workbook signature, then the
    sheet names and dimensions are listed with xlrd (binary) or pandas
    (.xlsx). Any other file is treated as text: its encoding is detected
    with chardet from the first 10000 bytes and up to ten leading lines are
    previewed.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        None. All findings, including failures, are printed to stdout;
        exceptions raised while reading are caught and reported.
    """
    print(f"检查文件: {file_path}")

    # The extension only selects the inspection strategy.
    ext = os.path.splitext(file_path)[1].lower()
    print(f"文件扩展名: {ext}")

    if ext in (".xls", ".xlsx"):
        print("检测到Excel文件,尝试读取...")
        try:
            _inspect_excel(file_path, ext)
        except Exception as e:
            print(f"检测Excel文件失败: {e}")
    else:
        try:
            _inspect_text(file_path)
        except Exception as e:
            print(f"检测文本文件失败: {e}")
|
||
|
||
|
||
if __name__ == "__main__":
    import sys

    # Command-line entry point: the first positional argument is the file to
    # inspect; without one, print the usage line instead.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("用法: python check_file_format.py <file_path>")
    else:
        check_file_format(cli_args[0])
|