"""Command-line helper that reports a file's container format and text encoding."""

import codecs
import os
import sys

import pandas as pd

try:
    import chardet
except ImportError:
    # chardet is only needed for the text-encoding branch; keep the Excel
    # checks usable when it is not installed (xlrd is treated the same way).
    chardet = None

# First 8 bytes of an OLE2 compound file, the container used by legacy .xls.
_OLE2_MAGIC = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"
# ZIP local-file-header signature; .xlsx workbooks are ZIP archives.
_ZIP_MAGIC = b"PK\x03\x04"
# Byte patterns the earlier version of this check searched for; kept so any
# file that used to be classified as .xls still is.
_LEGACY_XLS_MARKERS = (b"\x09\x08\x10\x00\x00\x06\x05\x00", b"Workbook")

_HEADER_BYTES = 512          # bytes sniffed to classify an Excel file
_TEXT_SAMPLE_BYTES = 10000   # bytes sampled for encoding detection (~10 KB)
_PREVIEW_LINES = 10          # number of leading lines considered for preview
_PREVIEW_WIDTH = 100         # characters shown per previewed line


def check_file_format(file_path: str) -> None:
    """Print a diagnostic report about the format and encoding of a file.

    Files with a ``.xls``/``.xlsx`` extension are classified by their leading
    bytes and their sheets are listed; every other file is treated as text,
    its encoding is detected with chardet and the first lines are previewed.

    All results, including failures, are written to standard output; the
    function does not raise for unreadable or malformed files.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        None.
    """
    print(f"检查文件: {file_path}")

    ext = os.path.splitext(file_path)[1].lower()
    print(f"文件扩展名: {ext}")

    if ext in (".xls", ".xlsx"):
        print("检测到Excel文件,尝试读取...")
        try:
            _report_excel(file_path, ext)
        except Exception as e:
            print(f"检测Excel文件失败: {e}")
    else:
        try:
            _report_text(file_path)
        except Exception as e:
            print(f"检测文本文件失败: {e}")


def _report_excel(file_path: str, ext: str) -> None:
    """Classify an Excel file by its leading bytes and list its sheets."""
    with open(file_path, "rb") as f:
        header = f.read(_HEADER_BYTES)

    looks_like_xls = header.startswith(_OLE2_MAGIC) or any(
        marker in header for marker in _LEGACY_XLS_MARKERS
    )

    if looks_like_xls:
        print("确认是.xls (二进制) 格式")
        _report_xls_sheets(file_path)
    elif ext == ".xlsx" or header.startswith(_ZIP_MAGIC):
        # The ZIP check also catches .xlsx workbooks saved with a .xls name.
        print("检测到.xlsx格式")
        _report_xlsx_sheets(file_path)
    else:
        # .xls extension without any Excel signature (e.g. HTML or CSV
        # exported under a .xls name): say so instead of printing nothing.
        print("文件内容不是已知的Excel格式 (扩展名为.xls,但缺少Excel文件签名)")


def _report_xls_sheets(file_path: str) -> None:
    """List the sheets of a legacy .xls workbook using xlrd."""
    try:
        import xlrd  # optional dependency, imported only when needed

        workbook = xlrd.open_workbook(file_path, encoding_override="gbk")
        sheets = workbook.sheets()
        print(f"工作表数量: {len(sheets)}")
        for i, sheet in enumerate(sheets):
            print(f" 表{i}: {sheet.name} ({sheet.nrows}行, {sheet.ncols}列)")
    except Exception as e:
        # Covers both a missing xlrd install and a workbook xlrd cannot parse.
        print(f"使用xlrd读取失败: {e}")


def _report_xlsx_sheets(file_path: str) -> None:
    """List the sheets of an .xlsx workbook using pandas."""
    try:
        sheets = pd.read_excel(file_path, sheet_name=None)
        print(f"工作表数量: {len(sheets)}")
        for sheet_name, sheet_df in sheets.items():
            print(f" 表: {sheet_name} ({len(sheet_df)}行, {len(sheet_df.columns)}列)")
    except Exception as e:
        print(f"读取.xlsx失败: {e}")


def _report_text(file_path: str) -> None:
    """Detect the encoding of a text file and preview its first lines."""
    if chardet is None:
        print("检测文本文件失败: 未安装chardet")
        return

    with open(file_path, "rb") as f:
        raw_data = f.read(_TEXT_SAMPLE_BYTES)

    result = chardet.detect(raw_data)
    encoding = result["encoding"]
    print(f"检测到编码: {encoding} (置信度: {result['confidence']:.2f})")

    if not encoding:
        # chardet reports no encoding for empty or undetectable input.
        print("解码失败: 未能检测到编码")
        return

    try:
        # The sample may end in the middle of a multi-byte character. An
        # incremental decoder with final=False buffers that incomplete tail
        # instead of raising, while still rejecting genuinely invalid bytes.
        sample_truncated = len(raw_data) == _TEXT_SAMPLE_BYTES
        decoder = codecs.getincrementaldecoder(encoding)()
        decoded_content = decoder.decode(raw_data, final=not sample_truncated)

        print("前几行内容:")
        preview = decoded_content.split("\n")[:_PREVIEW_LINES]
        for line_no, line in enumerate(preview, start=1):
            if line.strip():
                suffix = "..." if len(line) > _PREVIEW_WIDTH else ""
                print(f" {line_no}: {line[:_PREVIEW_WIDTH]}{suffix}")
    except Exception as e:
        print(f"解码失败: {e}")


if __name__ == "__main__":
    if len(sys.argv) > 1:
        check_file_format(sys.argv[1])
    else:
        print("用法: python check_file_format.py <文件路径>")