Initial commit: skills library
- 70 skills with code and documentation
- Add .gitignore (ignore __pycache__, output/, temp/, venv/)
- Clean up test intermediates and caches
This commit is contained in:
@@ -0,0 +1,86 @@
|
||||
import pandas as pd
|
||||
import chardet
|
||||
import os
|
||||
|
||||
|
||||
def check_file_format(file_path):
    """Print a diagnostic report about a file's format and text encoding.

    Files whose extension is ``.xls`` or ``.xlsx`` are inspected as
    workbooks; every other file is treated as text and its encoding is
    guessed with ``chardet``. All findings are written to stdout.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        None. Failures are reported on stdout instead of being raised.
    """
    print(f"检查文件: {file_path}")

    ext = os.path.splitext(file_path)[1].lower()
    print(f"文件扩展名: {ext}")

    if ext in (".xls", ".xlsx"):
        print("检测到Excel文件,尝试读取...")
        try:
            _report_excel_file(file_path, ext)
        except Exception as e:
            print(f"检测Excel文件失败: {e}")
    else:
        try:
            _report_text_file(file_path)
        except Exception as e:
            print(f"检测文本文件失败: {e}")


def _report_excel_file(file_path, ext):
    """Print the sheet names and sizes of a workbook, chosen by its header bytes."""
    with open(file_path, "rb") as f:
        header = f.read(512)

    # Binary .xls files are OLE2 compound documents, so the container
    # signature is the reliable marker. The two original markers (BIFF8 BOF
    # record, "Workbook" stream name) are kept as additional hints.
    looks_like_binary_xls = (
        header.startswith(b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1")
        or b"\x09\x08\x10\x00\x00\x06\x05\x00" in header
        or b"Workbook" in header
    )

    if looks_like_binary_xls:
        print("确认是.xls (二进制) 格式")
        try:
            import xlrd

            workbook = xlrd.open_workbook(file_path, encoding_override="gbk")
            sheets = workbook.sheets()
            print(f"工作表数量: {len(sheets)}")
            for i, sheet in enumerate(sheets):
                print(f"  表{i}: {sheet.name} ({sheet.nrows}行, {sheet.ncols}列)")
        except Exception as e:
            # Covers a missing xlrd install as well as unreadable workbooks;
            # the reason is reported instead of being swallowed.
            print(f"使用xlrd读取失败: {e}")
    elif ext == ".xlsx":
        print("检测到.xlsx格式")
        try:
            sheets = pd.read_excel(file_path, sheet_name=None)
            print(f"工作表数量: {len(sheets)}")
            for sheet_name, sheet_df in sheets.items():
                print(
                    f"  表: {sheet_name} ({len(sheet_df)}行, {len(sheet_df.columns)}列)"
                )
        except Exception as e:
            print(f"读取.xlsx失败: {e}")
    else:
        # A ".xls" name without any binary marker used to produce no output.
        print("未检测到.xls二进制文件头,文件可能是文本格式")


def _report_text_file(file_path):
    """Print the detected encoding and a preview of the first lines of a text file."""
    import codecs

    sample_size = 10000  # only the first ~10 KB is sampled
    with open(file_path, "rb") as f:
        raw_data = f.read(sample_size)

    encoding_result = chardet.detect(raw_data)
    print(
        f"检测到编码: {encoding_result['encoding']} (置信度: {encoding_result['confidence']:.2f})"
    )

    try:
        # When the sample was cut at sample_size it may end in the middle of
        # a multibyte character; final=False tolerates that incomplete tail
        # while still rejecting genuinely invalid bytes.
        decoder = codecs.getincrementaldecoder(encoding_result["encoding"])()
        decoded_content = decoder.decode(raw_data, final=len(raw_data) < sample_size)
        lines = decoded_content.split("\n")[:10]
        print("前几行内容:")
        for i, line in enumerate(lines):
            if line.strip():
                print(f"  {i + 1}: {line[:100]}{'...' if len(line) > 100 else ''}")
    except Exception as e:
        print(f"解码失败: {e}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import sys

    # Script entry point: inspect the file named by the first CLI argument,
    # or show the usage line when no argument was given.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("用法: python check_file_format.py <file_path>")
    else:
        check_file_format(cli_args[0])
|
||||
@@ -0,0 +1,38 @@
|
||||
import pandas as pd
|
||||
import sys
|
||||
import os
|
||||
|
||||
|
||||
def convert_xls_to_xlsx(xls_file, xlsx_file=None):
    """Convert an .xls workbook into an .xlsx workbook.

    Every worksheet of the input is read with the xlrd engine and written
    to the output under its original sheet name, so multi-sheet workbooks
    are converted completely rather than only their first sheet.

    Args:
        xls_file: Path of the source .xls file.
        xlsx_file: Path of the file to write. When omitted or empty, the
            source path with its extension replaced by ``.xlsx`` is used.

    Returns:
        True when the conversion succeeded, False otherwise (the error is
        printed rather than raised).
    """
    if not xlsx_file:
        xlsx_file = os.path.splitext(xls_file)[0] + ".xlsx"

    try:
        # sheet_name=None returns a dict of {sheet name: DataFrame}.
        sheets = pd.read_excel(xls_file, engine="xlrd", sheet_name=None)

        with pd.ExcelWriter(xlsx_file) as writer:
            for sheet_name, df in sheets.items():
                df.to_excel(writer, sheet_name=str(sheet_name), index=False)

        print(f"成功转换: {xls_file} -> {xlsx_file}")
        has_multiple_sheets = len(sheets) > 1
        for sheet_name, df in sheets.items():
            # The sheet label is only printed when it is needed to tell
            # sheets apart; single-sheet output is unchanged.
            if has_multiple_sheets:
                print(f"工作表: {sheet_name}")
            print(f"数据形状: {df.shape}")
            print("前几行预览:")
            print(df.head())

        return True

    except Exception as e:
        print(f"转换失败: {e}")
        return False
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: the first argument is the input workbook, the
    # optional second argument is the output path.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("用法: python convert_xls_to_xlsx.py <input.xls> [output.xlsx]")
        sys.exit(1)

    input_file = cli_args[0]
    output_file = None
    if len(cli_args) > 1:
        output_file = cli_args[1]

    convert_xls_to_xlsx(input_file, output_file)
|
||||
@@ -0,0 +1,65 @@
|
||||
import pandas as pd
|
||||
import sys
|
||||
|
||||
|
||||
def parse_holdings_correct(file_path):
    """Load a holdings export into a DataFrame and print a short summary.

    The reader is chosen from the file's leading bytes first and from its
    extension second:

    * OLE2 compound-file signature: ``pd.read_excel`` with the xlrd engine.
    * ZIP signature with an ``.xlsx``/``.xls`` name: ``pd.read_excel`` with
      the openpyxl engine.
    * ``.csv`` name, or a tab within the first 100 bytes: tab-separated
      text, trying UTF-8, then GBK, then GB2312.
    * ``.xls`` name without a binary signature: xlrd first, then
      tab-separated GBK text.
    * ``.xlsx`` name: openpyxl.
    * Anything else: comma-separated text, trying UTF-8, then GBK.

    Args:
        file_path: Path of the file to parse, as a string.

    Returns:
        The parsed DataFrame, or None when reading failed (the error is
        printed rather than raised).
    """
    ole2_signature = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"
    zip_signature = b"PK\x03\x04"

    try:
        # Sniff the head inside a context manager so the handle is closed.
        with open(file_path, "rb") as f:
            head = f.read(100)
        lowered = file_path.lower()

        if head.startswith(ole2_signature):
            # Must be tested before the tab sniff: an OLE2 header with
            # 512-byte sectors stores its sector shift as byte 0x09, which
            # the tab check would otherwise mistake for a delimiter.
            df = pd.read_excel(file_path, engine="xlrd")
            print("成功以.xls格式读取")
        elif head.startswith(zip_signature) and lowered.endswith((".xlsx", ".xls")):
            df = pd.read_excel(file_path, engine="openpyxl")
            print("成功以.xlsx格式读取")
        elif lowered.endswith(".csv") or b"\t" in head:
            df = _read_csv_with_fallback(
                file_path,
                [
                    ("utf-8", "成功以UTF-8制表符分隔方式读取"),
                    ("gbk", "成功以GBK制表符分隔方式读取"),
                    ("gb2312", "成功以GB2312制表符分隔方式读取"),
                ],
                sep="\t",
            )
        elif lowered.endswith(".xls"):
            try:
                # No encoding keyword: pd.read_excel does not accept one.
                df = pd.read_excel(file_path, engine="xlrd")
                print("成功以.xls格式读取")
            except Exception:
                # xlrd may be missing or may reject the file; fall back to
                # treating the file as tab-separated text.
                df = pd.read_csv(file_path, sep="\t", encoding="gbk")
                print("成功以制表符分隔文本格式读取.xls文件")
        elif lowered.endswith(".xlsx"):
            df = pd.read_excel(file_path, engine="openpyxl")
            print("成功以.xlsx格式读取")
        else:
            df = _read_csv_with_fallback(
                file_path,
                [
                    ("utf-8", "成功以UTF-8 CSV格式读取"),
                    ("gbk", "成功以GBK CSV格式读取"),
                ],
            )

        print(f"数据形状: {df.shape}")
        print("列名:")
        for i, col in enumerate(df.columns):
            print(f"  {i}: {col}")

        print("\n前5行数据:")
        print(df.head())

        return df

    except Exception as e:
        print(f"解析失败: {e}")
        return None


def _read_csv_with_fallback(file_path, attempts, **read_csv_kwargs):
    """Read a delimited text file, trying each (encoding, message) pair in order."""
    last_position = len(attempts) - 1
    for position, (encoding, success_message) in enumerate(attempts):
        try:
            df = pd.read_csv(file_path, encoding=encoding, **read_csv_kwargs)
        except (UnicodeError, pd.errors.ParserError):
            # The final attempt's error propagates to the caller.
            if position == last_position:
                raise
            continue
        print(success_message)
        return df
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: exactly one positional argument, the file to parse.
    if not sys.argv[1:]:
        print("用法: python parse_holdings_correct.py <file_path>")
        sys.exit(1)

    file_path = sys.argv[1]
    parse_holdings_correct(file_path)
|
||||
Reference in New Issue
Block a user