Initial commit: skills library
- 70 skills with code and documentation - Add .gitignore (ignore __pycache__, output/, temp/, venv/) - Clean up test intermediates and caches
This commit is contained in:
@@ -0,0 +1,4 @@
|
||||
#!/usr/bin/env python3
"""Example script - delete if not needed."""

# Emit a fixed greeting; the script has no other behavior.
print("Hello from skill!")
|
||||
@@ -0,0 +1,234 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
文件读取工具 - 支持docx、xlsx等二进制文件读取
|
||||
解决中文编码问题
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import zipfile
|
||||
import re
|
||||
|
||||
|
||||
def read_docx(file_path):
    """Return the text of a .docx file as one newline-joined string.

    Every non-blank ``<w:t>`` text run found in ``word/document.xml`` becomes
    one output line, followed by one ``" | "``-joined line per table row.
    Text that lives inside a table is therefore listed twice: first as
    individual runs, then again as a formatted row.

    Args:
        file_path: Path to the .docx (zip) archive.

    Returns:
        The extracted text, or a string starting with
        ``"Error reading docx: "`` if the archive cannot be read.
    """
    import html
    import re
    import zipfile

    def element(tag):
        # Accept the tag with or without attributes (e.g. <w:tr w:rsidR="...">)
        # while rejecting longer names that share the prefix (<w:trPr>, <w:tblPr>).
        return re.compile(rf"<w:{tag}(?:\s[^>]*)?>(.*?)</w:{tag}>", re.DOTALL)

    run_re = re.compile(r"<w:t(?:\s[^>]*)?>([^<]*)</w:t>")
    table_re, row_re, cell_re = element("tbl"), element("tr"), element("tc")

    try:
        with zipfile.ZipFile(file_path, "r") as archive:
            raw = archive.read("word/document.xml")

        # The payload may actually be GBK even though it is declared as UTF-8,
        # so fall back to GBK only when strict UTF-8 decoding fails.
        try:
            xml = raw.decode("utf-8")
        except UnicodeDecodeError:
            xml = raw.decode("gbk", errors="ignore")

        # XML entities (&amp;, &lt;, numeric references) are decoded so the
        # caller sees the literal characters.
        lines = [html.unescape(run) for run in run_re.findall(xml) if run.strip()]

        for table in table_re.findall(xml):
            for row in row_re.findall(table):
                cells = []
                for cell in cell_re.findall(row):
                    runs = run_re.findall(cell)
                    if runs:
                        cells.append(html.unescape("".join(runs)).strip())
                if cells:
                    lines.append(" | ".join(cells))

        return "\n".join(lines)

    except Exception as e:
        return f"Error reading docx: {e}"
|
||||
|
||||
|
||||
def read_docx_and_save(file_path):
    """Extract the text of a .docx file and save it as UTF-8 text.

    Every non-blank ``<w:t>`` text run in ``word/document.xml`` becomes one
    line, followed by one ``" | "``-joined line per table row (so table text
    appears both as runs and as rows). The result is written to
    ``temp/docx_output.txt`` relative to the current working directory.

    Args:
        file_path: Path to the .docx (zip) archive.

    Returns:
        A notice naming the output file followed by the extracted text, or a
        string starting with ``"Error: "`` if reading or writing fails.
    """
    import html
    import os
    import re
    import zipfile

    def element(tag):
        # Accept the tag with or without attributes (e.g. <w:tr w:rsidR="...">)
        # while rejecting longer names that share the prefix (<w:trPr>, <w:tblPr>).
        return re.compile(rf"<w:{tag}(?:\s[^>]*)?>(.*?)</w:{tag}>", re.DOTALL)

    run_re = re.compile(r"<w:t(?:\s[^>]*)?>([^<]*)</w:t>")
    table_re, row_re, cell_re = element("tbl"), element("tr"), element("tc")

    try:
        with zipfile.ZipFile(file_path, "r") as archive:
            raw = archive.read("word/document.xml")

        # Try strict UTF-8 first, then GBK. Decoding GBK with errors="ignore"
        # cannot raise, so no further fallback is needed.
        try:
            xml = raw.decode("utf-8")
        except UnicodeDecodeError:
            xml = raw.decode("gbk", errors="ignore")

        # XML entities (&amp;, &lt;, numeric references) are decoded so the
        # saved text contains the literal characters.
        lines = [html.unescape(run) for run in run_re.findall(xml) if run.strip()]

        for table in table_re.findall(xml):
            for row in row_re.findall(table):
                cells = []
                for cell in cell_re.findall(row):
                    runs = run_re.findall(cell)
                    if runs:
                        cells.append(html.unescape("".join(runs)).strip())
                if cells:
                    lines.append(" | ".join(cells))

        output = "\n".join(lines)

        # Writing to a file sidesteps terminal encoding problems.
        output_file = "temp/docx_output.txt"
        os.makedirs("temp", exist_ok=True)
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(output)

        return f"[内容已保存到 temp/docx_output.txt]\n\n{output}"

    except Exception as e:
        return f"Error: {e}"
|
||||
|
||||
|
||||
def read_xlsx(file_path, sheet_index=0, max_rows=20):
    """Dump the first rows of one worksheet of an .xlsx file as text.

    The archive is parsed directly with ``zipfile`` and regular expressions.
    Each row becomes one ``" | "``-joined line of its non-empty cell values;
    empty cells are skipped, so columns are not aligned between rows. The
    result is also written to ``temp/xlsx_output.txt`` (UTF-8) relative to
    the current working directory.

    Args:
        file_path: Path to the .xlsx (zip) archive.
        sheet_index: Zero-based position in the list of
            ``xl/worksheets/sheetN.xml`` parts, ordered by N.
        max_rows: Maximum number of ``<row>`` elements to read.

    Returns:
        A notice naming the output file followed by the dumped text,
        ``"Sheet index N out of range"`` when the index is too large, or a
        string starting with ``"Error: "`` on any other failure.
    """
    import html

    text_re = re.compile(r"<t(?:\s[^>]*)?>([^<]*)</t>")

    def joined_text(fragment):
        # Concatenate every <t> run in the fragment and decode XML entities.
        return html.unescape("".join(text_re.findall(fragment)))

    def sheet_order(name):
        # Order sheet2.xml before sheet10.xml instead of relying on the
        # order in which the parts happen to be stored in the archive.
        # NOTE(review): part numbering usually, but not necessarily, follows
        # the workbook's tab order - confirm if exact tab order matters.
        number = re.search(r"sheet(\d+)\.xml$", name)
        return (0, int(number.group(1)), name) if number else (1, 0, name)

    try:
        with zipfile.ZipFile(file_path, "r") as z:
            names = z.namelist()

            # Shared strings are indexed per <si> item. An item may hold
            # several <t> runs (rich text) and <t> may carry attributes such
            # as xml:space, so runs are joined per item. Phonetic runs
            # (<rPh>) are dropped before joining.
            shared = []
            if "xl/sharedStrings.xml" in names:
                sst = z.read("xl/sharedStrings.xml").decode("utf-8", errors="ignore")
                items = re.findall(
                    r"<si(?:\s[^>]*)?>(.*?)</si>|<si\s*/>", sst, re.DOTALL
                )
                for item in items:
                    item = re.sub(r"<rPh\b.*?</rPh>", "", item, flags=re.DOTALL)
                    shared.append(joined_text(item))

            sheet_files = sorted(
                (
                    n
                    for n in names
                    if n.startswith("xl/worksheets/sheet") and n.endswith(".xml")
                ),
                key=sheet_order,
            )
            if sheet_index >= len(sheet_files):
                return f"Sheet index {sheet_index} out of range"

            sheet_content = z.read(sheet_files[sheet_index]).decode(
                "utf-8", errors="ignore"
            )

        # Both patterns accept self-closing elements so that an empty
        # <c .../> or <row .../> cannot swallow the element that follows it.
        rows = re.findall(
            r"<row\b([^>]*?)(?:/>|>(.*?)</row>)", sheet_content, re.DOTALL
        )

        result = [f"=== Sheet {sheet_index} ==="]
        for _row_attrs, row_content in rows[:max_rows]:
            cells = re.findall(
                r"<c\b([^>]*?)(?:/>|>(.*?)</c>)", row_content, re.DOTALL
            )
            row_data = []
            for attrs, cell_content in cells:
                inline = re.search(r"<is>(.*?)</is>", cell_content, re.DOTALL)
                v_match = re.search(r"<v>([^<]*)</v>", cell_content)

                if inline:
                    val = joined_text(inline.group(1))
                elif v_match:
                    v = v_match.group(1)
                    # The cell type lives in the <c> tag's attributes, not in
                    # its content.
                    if re.search(r'\bt="s"', attrs) and v.isdigit():
                        idx = int(v)
                        val = shared[idx] if idx < len(shared) else f"[str:{v}]"
                    else:
                        try:
                            val = str(int(v)) if "." not in v else str(float(v))
                        except ValueError:
                            # Not a plain number: keep the raw text.
                            val = html.unescape(v)
                else:
                    val = ""

                if val:
                    row_data.append(val)

            if row_data:
                result.append(" | ".join(row_data))

        output = "\n".join(result)

        # Writing to a file sidesteps terminal encoding problems.
        output_file = "temp/xlsx_output.txt"
        os.makedirs("temp", exist_ok=True)
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(output)

        return f"[内容已保存到 temp/xlsx_output.txt]\n\n{output}"

    except Exception as e:
        return f"Error: {e}"
|
||||
|
||||
|
||||
def list_sheets(file_path):
    """List the worksheet names of an .xlsx workbook.

    Args:
        file_path: Path to the workbook.

    Returns:
        A header line followed by one ``"index: name"`` line per sheet, or a
        string starting with ``"Error: "`` if openpyxl is unavailable or the
        workbook cannot be opened.
    """
    try:
        import openpyxl

        # Read-only mode avoids loading every cell just to list the names;
        # the workbook must then be closed explicitly to release the file.
        wb = openpyxl.load_workbook(file_path, read_only=True, data_only=True)
        try:
            sheets = wb.sheetnames
        finally:
            wb.close()
        return "可用Sheets:\n" + "\n".join(f"{i}: {s}" for i, s in enumerate(sheets))
    except Exception as e:
        return f"Error: {e}"
|
||||
|
||||
|
||||
# Command-line entry point: dispatch on the file extension.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("用法:")
        print(" python file_reader.py <file.docx>")
        print(" python file_reader.py <file.xlsx> # 列出所有sheets")
        print(" python file_reader.py <file.xlsx> <sheet_index> # 读取指定sheet")
        print(" python file_reader.py <file.xlsx> <sheet_index> <max_rows>")
        print("\n示例:")
        print(" python file_reader.py test.xlsx # 列出sheets")
        print(" python file_reader.py test.xlsx 1 # 读取第2个sheet")
        print(" python file_reader.py test.xlsx 1 30 # 读取前30行")
        sys.exit(1)

    file_path = sys.argv[1]

    if not os.path.exists(file_path):
        print(f"文件不存在: {file_path}")
        sys.exit(1)

    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".docx":
        print(read_docx_and_save(file_path))
    elif ext in (".xlsx", ".xls"):
        # NOTE: read_xlsx opens the file with zipfile, so a legacy binary
        # .xls file ends up as an "Error: ..." string rather than content.
        if len(sys.argv) == 2:
            # No sheet index given: just list the available sheets.
            print(list_sheets(file_path))
        else:
            # Reject non-integer arguments with a message instead of a traceback.
            try:
                sheet_index = int(sys.argv[2])
                max_rows = int(sys.argv[3]) if len(sys.argv) > 3 else 20
            except ValueError:
                print(f"参数必须是整数: {' '.join(sys.argv[2:4])}")
                sys.exit(1)
            print(read_xlsx(file_path, sheet_index, max_rows))
    else:
        print(f"不支持的文件类型: {ext}")
|
||||
@@ -0,0 +1,265 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
文件读取工具 - 支持docx、xlsx、xls等二进制文件读取
|
||||
解决中文编码问题
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import zipfile
|
||||
import re
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def read_docx_and_save(file_path):
    """Extract the text of a .docx file and save it as UTF-8 text.

    Every non-blank ``<w:t>`` text run in ``word/document.xml`` becomes one
    line, followed by one ``" | "``-joined line per table row (so table text
    appears both as runs and as rows). The result is written to
    ``temp/docx_output.txt`` relative to the current working directory.

    Args:
        file_path: Path to the .docx (zip) archive.

    Returns:
        A notice naming the output file followed by the extracted text, or a
        string starting with ``"Error: "`` if reading or writing fails.
    """
    import html
    import os
    import re
    import zipfile

    def element(tag):
        # Accept the tag with or without attributes (e.g. <w:tr w:rsidR="...">)
        # while rejecting longer names that share the prefix (<w:trPr>, <w:tblPr>).
        return re.compile(rf"<w:{tag}(?:\s[^>]*)?>(.*?)</w:{tag}>", re.DOTALL)

    run_re = re.compile(r"<w:t(?:\s[^>]*)?>([^<]*)</w:t>")
    table_re, row_re, cell_re = element("tbl"), element("tr"), element("tc")

    try:
        with zipfile.ZipFile(file_path, "r") as archive:
            raw = archive.read("word/document.xml")

        # Try strict UTF-8 first, then GBK. Decoding GBK with errors="ignore"
        # cannot raise, so no further fallback is needed.
        try:
            xml = raw.decode("utf-8")
        except UnicodeDecodeError:
            xml = raw.decode("gbk", errors="ignore")

        # XML entities (&amp;, &lt;, numeric references) are decoded so the
        # saved text contains the literal characters.
        lines = [html.unescape(run) for run in run_re.findall(xml) if run.strip()]

        for table in table_re.findall(xml):
            for row in row_re.findall(table):
                cells = []
                for cell in cell_re.findall(row):
                    runs = run_re.findall(cell)
                    if runs:
                        cells.append(html.unescape("".join(runs)).strip())
                if cells:
                    lines.append(" | ".join(cells))

        output = "\n".join(lines)

        # Writing to a file sidesteps terminal encoding problems.
        output_file = "temp/docx_output.txt"
        os.makedirs("temp", exist_ok=True)
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(output)

        return f"[内容已保存到 temp/docx_output.txt]\n\n{output}"

    except Exception as e:
        return f"Error: {e}"
|
||||
|
||||
|
||||
def read_xlsx(file_path, sheet_index=0, max_rows=20):
    """Dump the first rows of one worksheet of an .xlsx file as text.

    The archive is parsed directly with ``zipfile`` and regular expressions.
    Each row becomes one ``" | "``-joined line of its non-empty cell values;
    empty cells are skipped, so columns are not aligned between rows. The
    result is also written to ``temp/xlsx_output.txt`` (UTF-8) relative to
    the current working directory.

    Args:
        file_path: Path to the .xlsx (zip) archive.
        sheet_index: Zero-based position in the list of
            ``xl/worksheets/sheetN.xml`` parts, ordered by N.
        max_rows: Maximum number of ``<row>`` elements to read.

    Returns:
        A notice naming the output file followed by the dumped text,
        ``"Sheet index N out of range"`` when the index is too large, or a
        string starting with ``"Error: "`` on any other failure.
    """
    import html

    text_re = re.compile(r"<t(?:\s[^>]*)?>([^<]*)</t>")

    def joined_text(fragment):
        # Concatenate every <t> run in the fragment and decode XML entities.
        return html.unescape("".join(text_re.findall(fragment)))

    def sheet_order(name):
        # Order sheet2.xml before sheet10.xml instead of relying on the
        # order in which the parts happen to be stored in the archive.
        # NOTE(review): part numbering usually, but not necessarily, follows
        # the workbook's tab order - confirm if exact tab order matters.
        number = re.search(r"sheet(\d+)\.xml$", name)
        return (0, int(number.group(1)), name) if number else (1, 0, name)

    try:
        with zipfile.ZipFile(file_path, "r") as z:
            names = z.namelist()

            # Shared strings are indexed per <si> item. An item may hold
            # several <t> runs (rich text) and <t> may carry attributes such
            # as xml:space, so runs are joined per item. Phonetic runs
            # (<rPh>) are dropped before joining.
            shared = []
            if "xl/sharedStrings.xml" in names:
                sst = z.read("xl/sharedStrings.xml").decode("utf-8", errors="ignore")
                items = re.findall(
                    r"<si(?:\s[^>]*)?>(.*?)</si>|<si\s*/>", sst, re.DOTALL
                )
                for item in items:
                    item = re.sub(r"<rPh\b.*?</rPh>", "", item, flags=re.DOTALL)
                    shared.append(joined_text(item))

            sheet_files = sorted(
                (
                    n
                    for n in names
                    if n.startswith("xl/worksheets/sheet") and n.endswith(".xml")
                ),
                key=sheet_order,
            )
            if sheet_index >= len(sheet_files):
                return f"Sheet index {sheet_index} out of range"

            sheet_content = z.read(sheet_files[sheet_index]).decode(
                "utf-8", errors="ignore"
            )

        # Both patterns accept self-closing elements so that an empty
        # <c .../> or <row .../> cannot swallow the element that follows it.
        rows = re.findall(
            r"<row\b([^>]*?)(?:/>|>(.*?)</row>)", sheet_content, re.DOTALL
        )

        result = [f"=== Sheet {sheet_index} ==="]
        for _row_attrs, row_content in rows[:max_rows]:
            cells = re.findall(
                r"<c\b([^>]*?)(?:/>|>(.*?)</c>)", row_content, re.DOTALL
            )
            row_data = []
            for attrs, cell_content in cells:
                inline = re.search(r"<is>(.*?)</is>", cell_content, re.DOTALL)
                v_match = re.search(r"<v>([^<]*)</v>", cell_content)

                if inline:
                    val = joined_text(inline.group(1))
                elif v_match:
                    v = v_match.group(1)
                    # The cell type lives in the <c> tag's attributes, not in
                    # its content.
                    if re.search(r'\bt="s"', attrs) and v.isdigit():
                        idx = int(v)
                        val = shared[idx] if idx < len(shared) else f"[str:{v}]"
                    else:
                        try:
                            val = str(int(v)) if "." not in v else str(float(v))
                        except ValueError:
                            # Not a plain number: keep the raw text.
                            val = html.unescape(v)
                else:
                    val = ""

                if val:
                    row_data.append(val)

            if row_data:
                result.append(" | ".join(row_data))

        output = "\n".join(result)

        # Writing to a file sidesteps terminal encoding problems.
        output_file = "temp/xlsx_output.txt"
        os.makedirs("temp", exist_ok=True)
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(output)

        return f"[内容已保存到 temp/xlsx_output.txt]\n\n{output}"

    except Exception as e:
        return f"Error: {e}"
|
||||
|
||||
|
||||
def read_xls(file_path, sheet_index=0, max_rows=20):
    """Dump the first rows of an .xls file as text and save them as UTF-8.

    The file is first read with ``pandas.read_excel``. If that (or saving the
    result) fails, the file is retried as tab-separated text using the
    encodings gbk, utf-8 and latin1 in turn. The output has a header line,
    the column names, then one ``" | "``-joined line per row with missing
    values rendered as empty strings, and is also written to
    ``temp/xls_output.txt`` relative to the current working directory.

    Args:
        file_path: Path to the file.
        sheet_index: Sheet passed to ``read_excel`` as ``sheet_name``.
        max_rows: Maximum number of data rows to read.

    Returns:
        A notice naming the output file followed by the dumped text, or a
        string starting with ``"Error reading xls file: "`` carrying the
        ``read_excel`` failure when every fallback also fails.
    """

    def render(frame, header):
        # Header line, column names, then one line per row.
        lines = [header, " | ".join(str(col) for col in frame.columns)]
        for _, row in frame.iterrows():
            lines.append(" | ".join("" if pd.isna(val) else str(val) for val in row))
        return "\n".join(lines)

    def save(output):
        # Writing to a file sidesteps terminal encoding problems.
        output_file = "temp/xls_output.txt"
        os.makedirs("temp", exist_ok=True)
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(output)
        return f"[内容已保存到 temp/xls_output.txt]\n\n{output}"

    try:
        df = pd.read_excel(file_path, sheet_name=sheet_index, nrows=max_rows)
        return save(render(df, f"=== Sheet {sheet_index} ==="))
    except Exception as e:
        # The file may really be tab-separated text with an .xls extension.
        for encoding in ("gbk", "utf-8", "latin1"):
            try:
                df = pd.read_csv(
                    file_path, encoding=encoding, sep="\t", nrows=max_rows
                )
                return save(
                    render(df, f"=== Sheet {sheet_index} (CSV format) ===")
                )
            except Exception:
                # Try the next encoding; Ctrl-C and SystemExit still propagate.
                continue
        return f"Error reading xls file: {e}"
|
||||
|
||||
|
||||
def list_sheets(file_path):
    """List the sheet names of an Excel workbook.

    Args:
        file_path: Path to the workbook.

    Returns:
        A header line followed by one ``"index: name"`` line per sheet, or a
        string starting with ``"Error: "`` if the workbook cannot be opened.
    """
    try:
        import pandas as pd

        # The context manager closes the underlying file handle once the
        # names have been read.
        with pd.ExcelFile(file_path) as excel_file:
            sheets = excel_file.sheet_names
        return "可用Sheets:\n" + "\n".join(f"{i}: {s}" for i, s in enumerate(sheets))
    except Exception as e:
        return f"Error: {e}"
|
||||
|
||||
|
||||
# Command-line entry point: dispatch on the file extension.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("用法:")
        print(" python file_reader.py <file.docx>")
        print(" python file_reader.py <file.xlsx> # 列出所有sheets")
        print(" python file_reader.py <file.xlsx> <sheet_index> # 读取指定sheet")
        print(" python file_reader.py <file.xlsx> <sheet_index> <max_rows>")
        print(" python file_reader.py <file.xls> # 列出所有sheets")
        print(" python file_reader.py <file.xls> <sheet_index> # 读取指定sheet")
        print("\n示例:")
        print(" python file_reader.py test.xlsx # 列出sheets")
        print(" python file_reader.py test.xlsx 1 # 读取第2个sheet")
        print(" python file_reader.py test.xlsx 1 30 # 读取前30行")
        print(" python file_reader.py test.xls # 读取xls文件")
        sys.exit(1)

    file_path = sys.argv[1]

    if not os.path.exists(file_path):
        print(f"文件不存在: {file_path}")
        sys.exit(1)

    ext = os.path.splitext(file_path)[1].lower()

    # Both spreadsheet formats share the same argument handling and differ
    # only in the reader function.
    sheet_readers = {".xlsx": read_xlsx, ".xls": read_xls}

    if ext == ".docx":
        print(read_docx_and_save(file_path))
    elif ext in sheet_readers:
        if len(sys.argv) == 2:
            # No sheet index given: just list the available sheets.
            print(list_sheets(file_path))
        else:
            # Reject non-integer arguments with a message instead of a traceback.
            try:
                sheet_index = int(sys.argv[2])
                max_rows = int(sys.argv[3]) if len(sys.argv) > 3 else 20
            except ValueError:
                print(f"参数必须是整数: {' '.join(sys.argv[2:4])}")
                sys.exit(1)
            print(sheet_readers[ext](file_path, sheet_index, max_rows))
    else:
        print(f"不支持的文件类型: {ext}")
|
||||
@@ -0,0 +1,92 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
安全的PPTX读取脚本 - 不依赖markitdown,避免Google Vision API问题
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
from pptx import Presentation
|
||||
from pptx.enum.shapes import MSO_SHAPE_TYPE
|
||||
|
||||
|
||||
def extract_text_from_shape(shape):
    """Return the shape's ``text`` attribute, or "" when it has none."""
    return getattr(shape, "text", "")
|
||||
|
||||
|
||||
def extract_text_from_slide(slide):
    """Collect the non-blank text of every shape on a slide.

    The slide title, when present and non-blank, comes first with a
    ``"标题: "`` prefix. Every other shape that exposes a ``text`` attribute
    contributes its text unchanged, in shape order. The title shape itself is
    skipped in that pass so its text is not listed a second time.

    Args:
        slide: Slide object exposing ``shapes`` (iterable, with a ``title``
            attribute that is the title shape or None).

    Returns:
        A list of text strings; empty when the slide has no text.
    """
    texts = []

    title_shape = slide.shapes.title
    title_id = None
    if title_shape is not None:
        title_id = getattr(title_shape, "shape_id", None)
        title = title_shape.text
        if title.strip():
            texts.append(f"标题: {title}")

    for shape in slide.shapes:
        # Compare ids rather than object identity.
        # NOTE(review): presumably iteration can yield a different wrapper
        # object for the title shape than ``shapes.title`` - confirm.
        if title_id is not None and getattr(shape, "shape_id", None) == title_id:
            continue
        # Text boxes, placeholders and any other shape are treated alike:
        # whatever exposes non-blank text is included.
        text = getattr(shape, "text", "")
        if text and text.strip():
            texts.append(text)

    return texts
|
||||
|
||||
|
||||
def read_pptx_safe(file_path):
    """Read a PPTX file and return its slide text as one string.

    Each slide that yields text contributes a ``--- 幻灯片 N ---`` header
    (N counted from 1), its text lines and a blank separator line.

    Returns:
        The newline-joined text, or None after reporting the error on stderr
        when anything goes wrong.
    """
    try:
        presentation = Presentation(file_path)
        lines = []

        for number, slide in enumerate(presentation.slides, start=1):
            slide_texts = extract_text_from_slide(slide)
            if not slide_texts:
                continue
            lines.append(f"--- 幻灯片 {number} ---")
            lines.extend(slide_texts)
            lines.append("")

        return "\n".join(lines)

    except Exception as err:
        print(f"Error reading PPTX: {err}", file=sys.stderr)
        return None
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: extract the text of one PPTX file.

    Expects exactly one argument, the presentation path. The extracted text
    is written to ``temp/pptx_output.txt`` (UTF-8, relative to the current
    working directory) and echoed to stdout. Exits with status 1 on a usage
    error, a missing file, or a read failure.
    """
    if len(sys.argv) != 2:
        print("Usage: python pptx_reader_safe.py <presentation.pptx>", file=sys.stderr)
        sys.exit(1)

    file_path = sys.argv[1]
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}", file=sys.stderr)
        sys.exit(1)

    content = read_pptx_safe(file_path)
    # None signals a read failure. An empty string is a presentation that was
    # read successfully but contains no text, which is not an error.
    if content is None:
        print("Failed to extract content", file=sys.stderr)
        sys.exit(1)

    # Save to a file as well, to avoid terminal encoding problems.
    output_path = "temp/pptx_output.txt"
    os.makedirs("temp", exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(content)
    print(f"Content extracted to: {output_path}")
    print("\n" + content)
|
||||
|
||||
|
||||
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user