Files
hmo 04db423416 Initial commit: skills library
- 70 skills with code and documentation
- Add .gitignore (ignore __pycache__, output/, temp/, venv/)
- Clean up test intermediates and caches
2026-04-26 19:27:40 +08:00

87 lines
3.1 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import codecs
import os

import chardet
import pandas as pd
# Leading bytes of an OLE2 compound document, the usual container of binary .xls files.
_OLE2_MAGIC = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"
# Byte pattern the original check looked for: a BIFF8 BOF record of a bare workbook stream.
_BIFF8_BOF = b"\x09\x08\x10\x00\x00\x06\x05\x00"
# Local-file-header signature of a ZIP archive, the container of .xlsx files.
_ZIP_MAGIC = b"PK\x03\x04"
# Number of leading bytes inspected to classify an Excel file.
_HEADER_SIZE = 512
# Number of leading bytes sampled from a text file for encoding detection.
_TEXT_SAMPLE_SIZE = 10000
# Preview limits for text files: number of lines and characters per line.
_PREVIEW_LINES = 10
_PREVIEW_WIDTH = 100


def _report_xls_workbook(file_path):
    """Print the sheet list of a binary .xls workbook using xlrd."""
    print("确认是.xls (二进制) 格式")
    try:
        # xlrd is imported lazily so the rest of the tool works without it.
        import xlrd

        # encoding_override forces GBK for files whose codepage info is
        # missing or wrong (see the xlrd open_workbook documentation).
        workbook = xlrd.open_workbook(file_path, encoding_override="gbk")
        sheets = workbook.sheets()
        print(f"工作表数量: {len(sheets)}")
        for i, sheet in enumerate(sheets):
            print(f"{i}: {sheet.name} ({sheet.nrows}行, {sheet.ncols}列)")
    except Exception as e:
        # Best-effort diagnostic: report the reason (including a missing
        # xlrd installation) instead of hiding it.
        print(f"使用xlrd读取失败: {e}")


def _report_xlsx_workbook(file_path):
    """Print the sheet list of an .xlsx workbook using pandas."""
    print("检测到.xlsx格式")
    try:
        # sheet_name=None makes pandas return a dict of every sheet.
        sheets = pd.read_excel(file_path, sheet_name=None)
        print(f"工作表数量: {len(sheets)}")
        for sheet_name, sheet_df in sheets.items():
            print(f" 表: {sheet_name} ({len(sheet_df)}行, {len(sheet_df.columns)}列)")
    except Exception as e:
        print(f"读取.xlsx失败: {e}")


def _report_text_file(file_path):
    """Print the detected encoding of a text file and preview its first lines."""
    try:
        with open(file_path, "rb") as f:
            raw_data = f.read(_TEXT_SAMPLE_SIZE)
        encoding_result = chardet.detect(raw_data)
        encoding = encoding_result["encoding"]
        # Guard against a missing confidence so the format spec cannot fail.
        confidence = encoding_result["confidence"] or 0.0
        print(f"检测到编码: {encoding} (置信度: {confidence:.2f})")
    except Exception as e:
        print(f"检测文本文件失败: {e}")
        return

    if encoding is None:
        # Nothing to decode with; say so instead of failing inside decode().
        print("解码失败: 未能识别文件编码")
        return

    try:
        # The sample may stop in the middle of a multi-byte character. An
        # incremental decoder with final=False buffers that incomplete tail
        # instead of raising; a fully read file is still decoded strictly.
        sample_is_whole_file = len(raw_data) < _TEXT_SAMPLE_SIZE
        decoder = codecs.getincrementaldecoder(encoding)()
        decoded_content = decoder.decode(raw_data, final=sample_is_whole_file)
        print("前几行内容:")
        for i, line in enumerate(decoded_content.split("\n")[:_PREVIEW_LINES]):
            line = line.rstrip("\r")  # drop the CR of CRLF line endings
            if line.strip():
                suffix = "..." if len(line) > _PREVIEW_WIDTH else ""
                print(f" {i + 1}: {line[:_PREVIEW_WIDTH]}{suffix}")
    except (LookupError, UnicodeError) as e:
        # LookupError: codec unknown to Python; UnicodeError: bad bytes.
        print(f"解码失败: {e}")


def check_file_format(file_path):
    """Detect a file's format and encoding and print a report to stdout.

    Files with an ``.xls`` or ``.xlsx`` extension are classified by their
    leading bytes rather than by the extension alone:

    * OLE2 / BIFF content is listed sheet by sheet with xlrd;
    * ZIP content (or any ``.xlsx`` file) is listed sheet by sheet with pandas;
    * anything else (for example CSV or HTML saved with an ``.xls`` name)
      is inspected as a text file.

    Every other file is treated as text: its encoding is guessed with
    chardet from the first 10000 bytes and the first non-blank lines of
    that sample are previewed.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        None. All findings and all failures are printed; no exception is
        raised for unreadable or malformed files.
    """
    print(f"检查文件: {file_path}")
    ext = os.path.splitext(file_path)[1].lower()
    print(f"文件扩展名: {ext}")

    if ext not in (".xls", ".xlsx"):
        _report_text_file(file_path)
        return

    print("检测到Excel文件,尝试读取...")
    try:
        with open(file_path, "rb") as f:
            header = f.read(_HEADER_SIZE)
    except OSError as e:
        print(f"检测Excel文件失败: {e}")
        return

    is_zip = header.startswith(_ZIP_MAGIC)
    # A ZIP archive can never be a binary workbook, so rule it out first.
    is_binary_xls = not is_zip and (
        header.startswith(_OLE2_MAGIC)
        or _BIFF8_BOF in header
        or b"Workbook" in header
    )

    if is_binary_xls:
        _report_xls_workbook(file_path)
    elif is_zip or ext == ".xlsx":
        _report_xlsx_workbook(file_path)
    else:
        # An .xls name on non-Excel content: fall back to text inspection
        # rather than ending without any output.
        print("文件内容不是Excel二进制格式,按文本文件检测...")
        _report_text_file(file_path)
if __name__ == "__main__":
    import sys

    # Command-line entry point: the first positional argument is the path
    # to inspect; without one, show the usage line instead.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("用法: python check_file_format.py <file_path>")
    else:
        check_file_format(cli_args[0])