Initial commit: skills library

- 70 skills with code and documentation
- Add .gitignore (ignore __pycache__, output/, temp/, venv/)
- Clean up test intermediates and caches
2026-04-26 19:27:40 +08:00
commit 04db423416
861 changed files with 210414 additions and 0 deletions
@@ -0,0 +1,4 @@
+#!/usr/bin/env python3
+"""Example script for a skill; delete it if it is not needed."""
+
+# Emit a fixed greeting on stdout.
+print("Hello from skill!")
@@ -0,0 +1,234 @@
+# -*- coding: utf-8 -*-
+"""
+文件读取工具 - 支持docx、xlsx等二进制文件读取
+解决中文编码问题
+"""
+
+import sys
+import os
+import zipfile
+import re
+
+
+def read_docx(file_path):
+    """Return the text of a .docx file as newline-separated lines.
+
+    Reads ``word/document.xml`` from the archive and extracts text with
+    regular expressions instead of an XML parser. Every non-blank ``<w:t>``
+    run becomes one line; afterwards each table row is appended again as its
+    cell texts joined with `` | ``, so table text appears twice in the result.
+
+    Args:
+        file_path: Path to the .docx file.
+
+    Returns:
+        The extracted text, or a string starting with
+        ``"Error reading docx: "`` when anything fails (nothing is raised).
+    """
+    try:
+        with zipfile.ZipFile(file_path, "r") as z:
+            with z.open("word/document.xml") as f:
+                content = f.read()
+
+        # Fall back to GBK only when the bytes are not valid UTF-8.
+        try:
+            text = content.decode("utf-8", errors="strict")
+        except UnicodeDecodeError:
+            text = content.decode("gbk", errors="ignore")
+
+        text_run = r"<w:t[^>]*>([^<]*)</w:t>"
+        # Table tags may carry attributes (e.g. w:rsidR on <w:tr>). The
+        # optional "\s..." part accepts them without also matching longer
+        # tag names such as <w:trPr> or <w:tcPr>.
+        table_tag = r"<w:tbl(?:\s[^>]*)?>(.*?)</w:tbl>"
+        row_tag = r"<w:tr(?:\s[^>]*)?>(.*?)</w:tr>"
+        cell_tag = r"<w:tc(?:\s[^>]*)?>(.*?)</w:tc>"
+
+        # One output line per non-blank text run.
+        content_lines = [p for p in re.findall(text_run, text) if p.strip()]
+
+        # One output line per table row that has at least one text cell.
+        for table in re.findall(table_tag, text, re.DOTALL):
+            for row in re.findall(row_tag, table, re.DOTALL):
+                row_data = []
+                for cell in re.findall(cell_tag, row, re.DOTALL):
+                    cell_text = re.findall(text_run, cell)
+                    if cell_text:
+                        row_data.append("".join(cell_text).strip())
+                if row_data:
+                    content_lines.append(" | ".join(row_data))
+
+        return "\n".join(content_lines)
+
+    except Exception as e:
+        return f"Error reading docx: {e}"
+
+
+def read_docx_and_save(file_path):
+    """Extract the text of a .docx file and save it as a UTF-8 text file.
+
+    Reads ``word/document.xml`` from the archive and extracts text with
+    regular expressions. Every non-blank ``<w:t>`` run becomes one line;
+    afterwards each table row is appended again as its cell texts joined
+    with `` | ``. The result is written to ``temp/docx_output.txt``
+    (relative to the current working directory, created if missing).
+
+    Args:
+        file_path: Path to the .docx file.
+
+    Returns:
+        A notice naming the output file followed by the extracted text, or
+        a string starting with ``"Error: "`` when anything fails.
+    """
+    try:
+        with zipfile.ZipFile(file_path, "r") as z:
+            with z.open("word/document.xml") as f:
+                content = f.read()
+
+        # Try UTF-8 first; GBK with errors="ignore" cannot fail, so no
+        # further fallback is needed after it.
+        try:
+            text = content.decode("utf-8", errors="strict")
+        except UnicodeDecodeError:
+            text = content.decode("gbk", errors="ignore")
+
+        text_run = r"<w:t[^>]*>([^<]*)</w:t>"
+        # Table tags may carry attributes (e.g. w:rsidR on <w:tr>). The
+        # optional "\s..." part accepts them without also matching longer
+        # tag names such as <w:trPr> or <w:tcPr>.
+        table_tag = r"<w:tbl(?:\s[^>]*)?>(.*?)</w:tbl>"
+        row_tag = r"<w:tr(?:\s[^>]*)?>(.*?)</w:tr>"
+        cell_tag = r"<w:tc(?:\s[^>]*)?>(.*?)</w:tc>"
+
+        # One output line per non-blank text run.
+        content_lines = [p for p in re.findall(text_run, text) if p.strip()]
+
+        # One output line per table row that has at least one text cell.
+        for table in re.findall(table_tag, text, re.DOTALL):
+            for row in re.findall(row_tag, table, re.DOTALL):
+                row_data = []
+                for cell in re.findall(cell_tag, row, re.DOTALL):
+                    cell_text = re.findall(text_run, cell)
+                    if cell_text:
+                        row_data.append("".join(cell_text).strip())
+                if row_data:
+                    content_lines.append(" | ".join(row_data))
+
+        output = "\n".join(content_lines)
+
+        # Persist the text so it can be read back as UTF-8.
+        output_file = "temp/docx_output.txt"
+        os.makedirs("temp", exist_ok=True)
+        with open(output_file, "w", encoding="utf-8") as f:
+            f.write(output)
+
+        return f"[内容已保存到 temp/docx_output.txt]\n\n{output}"
+
+    except Exception as e:
+        return f"Error: {e}"
+
+
+def read_xlsx(file_path, sheet_index=0, max_rows=20):
+    """Dump the first rows of one worksheet of an .xlsx file as text.
+
+    The archive is parsed with regular expressions, not an XML parser.
+    Shared-string cells (``t="s"``) are resolved through
+    ``xl/sharedStrings.xml``, inline strings are taken from ``<is><t>``,
+    and other values are printed as int/float when they parse as numbers.
+    Empty cells are skipped, so columns are not aligned between rows. The
+    text is also written to ``temp/xlsx_output.txt`` (relative to the
+    current working directory).
+
+    Args:
+        file_path: Path to the .xlsx file.
+        sheet_index: Position of the sheet among the archive members named
+            ``xl/worksheets/sheet*.xml``, in archive order.
+        max_rows: Number of ``<row>`` elements to read; rows without any
+            value still count towards this limit.
+
+    Returns:
+        A notice naming the output file followed by the text, an
+        "out of range" message for a too-large ``sheet_index``, or a string
+        starting with ``"Error: "`` when anything fails.
+    """
+    try:
+        with zipfile.ZipFile(file_path, "r") as z:
+            names = z.namelist()
+
+            # Shared-string table: a cell's <v> index refers to the i-th
+            # <si> item, so entries are collected per <si>, not per <t>.
+            string_map = {}
+            if "xl/sharedStrings.xml" in names:
+                with z.open("xl/sharedStrings.xml") as f:
+                    text = f.read().decode("utf-8", errors="ignore")
+                items = re.findall(
+                    r"<si\b[^>]*?(?:/>|>(.*?)</si>)", text, re.DOTALL
+                )
+                for i, item in enumerate(items):
+                    # Drop phonetic runs; their <t> is a reading hint, not
+                    # part of the string value.
+                    item = re.sub(r"<rPh\b.*?</rPh>", "", item, flags=re.DOTALL)
+                    # Rich text is split over several <t> runs; join them.
+                    string_map[i] = "".join(
+                        re.findall(r"<t(?:\s[^>]*)?>([^<]*)</t>", item)
+                    )
+
+            # Pick the worksheet part by position in the archive listing.
+            sheet_files = [
+                name
+                for name in names
+                if name.startswith("xl/worksheets/sheet") and name.endswith(".xml")
+            ]
+            if sheet_index >= len(sheet_files):
+                return f"Sheet index {sheet_index} out of range"
+
+            with z.open(sheet_files[sheet_index]) as f:
+                sheet_content = f.read().decode("utf-8", errors="ignore")
+
+        # Both patterns accept self-closing elements so that an empty
+        # <row .../> or <c .../> does not swallow the element after it.
+        rows = re.findall(
+            r"<row\b[^>]*?(?:/>|>(.*?)</row>)", sheet_content, re.DOTALL
+        )
+        cell_re = re.compile(r"<c\b([^>]*?)(?:/>|>(.*?)</c>)", re.DOTALL)
+
+        result = [f"=== Sheet {sheet_index} ==="]
+        for row_content in rows[:max_rows]:
+            row_data = []
+            for attrs, cell_content in cell_re.findall(row_content):
+                v_match = re.search(r"<v>([^<]*)</v>", cell_content)
+                t_match = re.search(r"<is><t[^>]*>([^<]*)</t></is>", cell_content)
+
+                if t_match:
+                    val = t_match.group(1)
+                elif v_match:
+                    v = v_match.group(1)
+                    # The cell type lives in the <c> tag's attributes.
+                    if re.search(r'\bt="s"', attrs) and v.isdigit():
+                        val = string_map.get(int(v), f"[str:{v}]")
+                    else:
+                        try:
+                            val = str(int(v)) if "." not in v else str(float(v))
+                        except ValueError:
+                            # Not numeric (e.g. a formula string): keep as is.
+                            val = v
+                else:
+                    val = ""
+
+                if val:
+                    row_data.append(val)
+
+            if row_data:
+                result.append(" | ".join(row_data))
+
+        output = "\n".join(result)
+
+        # Persist the text so it can be read back as UTF-8.
+        output_file = "temp/xlsx_output.txt"
+        os.makedirs("temp", exist_ok=True)
+        with open(output_file, "w", encoding="utf-8") as f:
+            f.write(output)
+
+        return f"[内容已保存到 temp/xlsx_output.txt]\n\n{output}"
+
+    except Exception as e:
+        return f"Error: {e}"
+
+
+def list_sheets(file_path):
+    """Return a numbered listing of the sheet names in an .xlsx workbook.
+
+    The workbook is opened with openpyxl (imported lazily here). Any
+    failure, including a missing openpyxl, is reported as a string starting
+    with ``"Error: "`` instead of being raised.
+    """
+    try:
+        import openpyxl
+
+        workbook = openpyxl.load_workbook(file_path, data_only=True)
+        numbered = [
+            f"{position}: {title}"
+            for position, title in enumerate(workbook.sheetnames)
+        ]
+    except Exception as e:
+        return f"Error: {e}"
+    return "可用Sheets:\n" + "\n".join(numbered)
+
+
+if __name__ == "__main__":
+    # Command-line entry point: dispatch on the file extension.
+    if len(sys.argv) < 2:
+        # No file argument: show usage and exit with a failure status.
+        print("用法:")
+        print("  python file_reader.py <file.docx>")
+        print("  python file_reader.py <file.xlsx>                    # 列出所有sheets")
+        print("  python file_reader.py <file.xlsx> <sheet_index>      # 读取指定sheet")
+        print("  python file_reader.py <file.xlsx> <sheet_index> <max_rows>")
+        print("\n示例:")
+        print("  python file_reader.py test.xlsx           # 列出sheets")
+        print("  python file_reader.py test.xlsx 1         # 读取第2个sheet")
+        print("  python file_reader.py test.xlsx 1 30      # 读取前30行")
+        sys.exit(1)
+
+    file_path = sys.argv[1]
+
+    if not os.path.exists(file_path):
+        print(f"文件不存在: {file_path}")
+        sys.exit(1)
+
+    ext = os.path.splitext(file_path)[1].lower()
+
+    if ext == ".docx":
+        print(read_docx_and_save(file_path))
+    elif ext in (".xlsx", ".xls"):
+        # NOTE: .xls is sent through the same zip-based reader as .xlsx.
+        if len(sys.argv) == 2:
+            # Only a file name: list the sheets instead of reading one.
+            print(list_sheets(file_path))
+        else:
+            # Report non-integer arguments cleanly instead of a traceback.
+            try:
+                sheet_index = int(sys.argv[2])
+                max_rows = int(sys.argv[3]) if len(sys.argv) > 3 else 20
+            except ValueError:
+                print("sheet_index 和 max_rows 必须是整数")
+                sys.exit(1)
+            print(read_xlsx(file_path, sheet_index, max_rows))
+    else:
+        print(f"不支持的文件类型: {ext}")
@@ -0,0 +1,265 @@
+# -*- coding: utf-8 -*-
+"""
+文件读取工具 - 支持docx、xlsx、xls等二进制文件读取
+解决中文编码问题
+"""
+
+import sys
+import os
+import zipfile
+import re
+import pandas as pd
+
+
+def read_docx_and_save(file_path):
+    """Extract the text of a .docx file and save it as a UTF-8 text file.
+
+    Reads ``word/document.xml`` from the archive and extracts text with
+    regular expressions. Every non-blank ``<w:t>`` run becomes one line;
+    afterwards each table row is appended again as its cell texts joined
+    with `` | ``. The result is written to ``temp/docx_output.txt``
+    (relative to the current working directory, created if missing).
+
+    Args:
+        file_path: Path to the .docx file.
+
+    Returns:
+        A notice naming the output file followed by the extracted text, or
+        a string starting with ``"Error: "`` when anything fails.
+    """
+    try:
+        with zipfile.ZipFile(file_path, "r") as z:
+            with z.open("word/document.xml") as f:
+                content = f.read()
+
+        # Try UTF-8 first; GBK with errors="ignore" cannot fail, so no
+        # further fallback is needed after it.
+        try:
+            text = content.decode("utf-8", errors="strict")
+        except UnicodeDecodeError:
+            text = content.decode("gbk", errors="ignore")
+
+        text_run = r"<w:t[^>]*>([^<]*)</w:t>"
+        # Table tags may carry attributes (e.g. w:rsidR on <w:tr>). The
+        # optional "\s..." part accepts them without also matching longer
+        # tag names such as <w:trPr> or <w:tcPr>.
+        table_tag = r"<w:tbl(?:\s[^>]*)?>(.*?)</w:tbl>"
+        row_tag = r"<w:tr(?:\s[^>]*)?>(.*?)</w:tr>"
+        cell_tag = r"<w:tc(?:\s[^>]*)?>(.*?)</w:tc>"
+
+        # One output line per non-blank text run.
+        content_lines = [p for p in re.findall(text_run, text) if p.strip()]
+
+        # One output line per table row that has at least one text cell.
+        for table in re.findall(table_tag, text, re.DOTALL):
+            for row in re.findall(row_tag, table, re.DOTALL):
+                row_data = []
+                for cell in re.findall(cell_tag, row, re.DOTALL):
+                    cell_text = re.findall(text_run, cell)
+                    if cell_text:
+                        row_data.append("".join(cell_text).strip())
+                if row_data:
+                    content_lines.append(" | ".join(row_data))
+
+        output = "\n".join(content_lines)
+
+        # Persist the text so it can be read back as UTF-8.
+        output_file = "temp/docx_output.txt"
+        os.makedirs("temp", exist_ok=True)
+        with open(output_file, "w", encoding="utf-8") as f:
+            f.write(output)
+
+        return f"[内容已保存到 temp/docx_output.txt]\n\n{output}"
+
+    except Exception as e:
+        return f"Error: {e}"
+
+
+def read_xlsx(file_path, sheet_index=0, max_rows=20):
+    """Dump the first rows of one worksheet of an .xlsx file as text.
+
+    The archive is parsed with regular expressions, not an XML parser.
+    Shared-string cells (``t="s"``) are resolved through
+    ``xl/sharedStrings.xml``, inline strings are taken from ``<is><t>``,
+    and other values are printed as int/float when they parse as numbers.
+    Empty cells are skipped, so columns are not aligned between rows. The
+    text is also written to ``temp/xlsx_output.txt`` (relative to the
+    current working directory).
+
+    Args:
+        file_path: Path to the .xlsx file.
+        sheet_index: Position of the sheet among the archive members named
+            ``xl/worksheets/sheet*.xml``, in archive order.
+        max_rows: Number of ``<row>`` elements to read; rows without any
+            value still count towards this limit.
+
+    Returns:
+        A notice naming the output file followed by the text, an
+        "out of range" message for a too-large ``sheet_index``, or a string
+        starting with ``"Error: "`` when anything fails.
+    """
+    try:
+        with zipfile.ZipFile(file_path, "r") as z:
+            names = z.namelist()
+
+            # Shared-string table: a cell's <v> index refers to the i-th
+            # <si> item, so entries are collected per <si>, not per <t>.
+            string_map = {}
+            if "xl/sharedStrings.xml" in names:
+                with z.open("xl/sharedStrings.xml") as f:
+                    text = f.read().decode("utf-8", errors="ignore")
+                items = re.findall(
+                    r"<si\b[^>]*?(?:/>|>(.*?)</si>)", text, re.DOTALL
+                )
+                for i, item in enumerate(items):
+                    # Drop phonetic runs; their <t> is a reading hint, not
+                    # part of the string value.
+                    item = re.sub(r"<rPh\b.*?</rPh>", "", item, flags=re.DOTALL)
+                    # Rich text is split over several <t> runs; join them.
+                    string_map[i] = "".join(
+                        re.findall(r"<t(?:\s[^>]*)?>([^<]*)</t>", item)
+                    )
+
+            # Pick the worksheet part by position in the archive listing.
+            sheet_files = [
+                name
+                for name in names
+                if name.startswith("xl/worksheets/sheet") and name.endswith(".xml")
+            ]
+            if sheet_index >= len(sheet_files):
+                return f"Sheet index {sheet_index} out of range"
+
+            with z.open(sheet_files[sheet_index]) as f:
+                sheet_content = f.read().decode("utf-8", errors="ignore")
+
+        # Both patterns accept self-closing elements so that an empty
+        # <row .../> or <c .../> does not swallow the element after it.
+        rows = re.findall(
+            r"<row\b[^>]*?(?:/>|>(.*?)</row>)", sheet_content, re.DOTALL
+        )
+        cell_re = re.compile(r"<c\b([^>]*?)(?:/>|>(.*?)</c>)", re.DOTALL)
+
+        result = [f"=== Sheet {sheet_index} ==="]
+        for row_content in rows[:max_rows]:
+            row_data = []
+            for attrs, cell_content in cell_re.findall(row_content):
+                v_match = re.search(r"<v>([^<]*)</v>", cell_content)
+                t_match = re.search(r"<is><t[^>]*>([^<]*)</t></is>", cell_content)
+
+                if t_match:
+                    val = t_match.group(1)
+                elif v_match:
+                    v = v_match.group(1)
+                    # The cell type lives in the <c> tag's attributes.
+                    if re.search(r'\bt="s"', attrs) and v.isdigit():
+                        val = string_map.get(int(v), f"[str:{v}]")
+                    else:
+                        try:
+                            val = str(int(v)) if "." not in v else str(float(v))
+                        except ValueError:
+                            # Not numeric (e.g. a formula string): keep as is.
+                            val = v
+                else:
+                    val = ""
+
+                if val:
+                    row_data.append(val)
+
+            if row_data:
+                result.append(" | ".join(row_data))
+
+        output = "\n".join(result)
+
+        # Persist the text so it can be read back as UTF-8.
+        output_file = "temp/xlsx_output.txt"
+        os.makedirs("temp", exist_ok=True)
+        with open(output_file, "w", encoding="utf-8") as f:
+            f.write(output)
+
+        return f"[内容已保存到 temp/xlsx_output.txt]\n\n{output}"
+
+    except Exception as e:
+        return f"Error: {e}"
+
+
+def _render_xls_frame(df, title):
+    """Format *df* as pipe-separated text under *title*, save it, return the report."""
+    result = [title, " | ".join(str(col) for col in df.columns)]
+    for _, row in df.iterrows():
+        # Missing values are shown as empty fields.
+        result.append(" | ".join("" if pd.isna(val) else str(val) for val in row))
+    output = "\n".join(result)
+
+    # Persist the text so it can be read back as UTF-8.
+    output_file = "temp/xls_output.txt"
+    os.makedirs("temp", exist_ok=True)
+    with open(output_file, "w", encoding="utf-8") as f:
+        f.write(output)
+
+    return f"[内容已保存到 temp/xls_output.txt]\n\n{output}"
+
+
+def read_xls(file_path, sheet_index=0, max_rows=20):
+    """Dump the first rows of an .xls sheet as pipe-separated text.
+
+    The file is first read with ``pandas.read_excel``. If that (or saving
+    the result) fails, it is retried as a tab-separated text file using the
+    encodings gbk, utf-8 and latin1 in that order. The text is also written
+    to ``temp/xls_output.txt`` (relative to the current working directory).
+
+    Args:
+        file_path: Path to the file.
+        sheet_index: Sheet passed to ``read_excel`` as ``sheet_name``; in
+            the text fallback it only appears in the heading.
+        max_rows: Maximum number of data rows to read (``nrows``).
+
+    Returns:
+        A notice naming the output file followed by the text, or a string
+        starting with ``"Error reading xls file: "`` carrying the original
+        ``read_excel`` failure when every attempt fails.
+    """
+    try:
+        df = pd.read_excel(file_path, sheet_name=sheet_index, nrows=max_rows)
+        return _render_xls_frame(df, f"=== Sheet {sheet_index} ===")
+    except Exception as e:
+        # Fallback: the file may be tab-separated text with an .xls name.
+        for encoding in ("gbk", "utf-8", "latin1"):
+            try:
+                df = pd.read_csv(
+                    file_path, encoding=encoding, sep="\t", nrows=max_rows
+                )
+                return _render_xls_frame(
+                    df, f"=== Sheet {sheet_index} (CSV format) ==="
+                )
+            except Exception:
+                # Wrong encoding or unparsable content: try the next one.
+                continue
+        return f"Error reading xls file: {e}"
+
+
+def list_sheets(file_path):
+    """Return a numbered listing of the sheet names in an Excel workbook.
+
+    Args:
+        file_path: Path to the workbook.
+
+    Returns:
+        A heading line followed by one ``index: name`` line per sheet, or a
+        string starting with ``"Error: "`` when the workbook cannot be
+        opened (nothing is raised).
+    """
+    try:
+        # The with-block closes the workbook handle once the names are read.
+        with pd.ExcelFile(file_path) as excel_file:
+            sheets = excel_file.sheet_names
+        return "可用Sheets:\n" + "\n".join(f"{i}: {s}" for i, s in enumerate(sheets))
+    except Exception as e:
+        return f"Error: {e}"
+
+
+if __name__ == "__main__":
+    # Command-line entry point: dispatch on the file extension.
+    if len(sys.argv) < 2:
+        # No file argument: show usage and exit with a failure status.
+        print("用法:")
+        print("  python file_reader.py <file.docx>")
+        print("  python file_reader.py <file.xlsx>                    # 列出所有sheets")
+        print("  python file_reader.py <file.xlsx> <sheet_index>      # 读取指定sheet")
+        print("  python file_reader.py <file.xlsx> <sheet_index> <max_rows>")
+        print("  python file_reader.py <file.xls>                     # 列出所有sheets")
+        print("  python file_reader.py <file.xls> <sheet_index>       # 读取指定sheet")
+        print("\n示例:")
+        print("  python file_reader.py test.xlsx           # 列出sheets")
+        print("  python file_reader.py test.xlsx 1         # 读取第2个sheet")
+        print("  python file_reader.py test.xlsx 1 30      # 读取前30行")
+        print("  python file_reader.py test.xls            # 读取xls文件")
+        sys.exit(1)
+
+    file_path = sys.argv[1]
+
+    if not os.path.exists(file_path):
+        print(f"文件不存在: {file_path}")
+        sys.exit(1)
+
+    ext = os.path.splitext(file_path)[1].lower()
+
+    if ext == ".docx":
+        print(read_docx_and_save(file_path))
+    elif ext in (".xlsx", ".xls"):
+        if len(sys.argv) == 2:
+            # Only a file name: list the sheets instead of reading one.
+            print(list_sheets(file_path))
+        else:
+            # Report non-integer arguments cleanly instead of a traceback.
+            try:
+                sheet_index = int(sys.argv[2])
+                max_rows = int(sys.argv[3]) if len(sys.argv) > 3 else 20
+            except ValueError:
+                print("sheet_index 和 max_rows 必须是整数")
+                sys.exit(1)
+            # Same arguments for both formats; only the reader differs.
+            reader = read_xlsx if ext == ".xlsx" else read_xls
+            print(reader(file_path, sheet_index, max_rows))
+    else:
+        print(f"不支持的文件类型: {ext}")
@@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+"""
+安全的PPTX读取脚本 - 不依赖markitdown，避免Google Vision API问题
+"""
+
+import sys
+import os
+from pptx import Presentation
+from pptx.enum.shapes import MSO_SHAPE_TYPE
+
+
+def extract_text_from_shape(shape):
+    """Return ``shape.text``, or "" when the shape has no ``text`` attribute."""
+    return shape.text if hasattr(shape, "text") else ""
+
+
+def extract_text_from_slide(slide):
+    """Collect the non-blank text of a slide's shapes.
+
+    The title, when present and non-blank, comes first with a "标题: "
+    prefix and is not repeated when the shapes are walked. Shapes that
+    expose no ``text`` attribute are skipped.
+
+    Args:
+        slide: Object whose ``shapes`` is iterable and has a ``title``
+            attribute (the title shape or ``None``).
+
+    Returns:
+        List of text strings in shape order.
+    """
+    texts = []
+
+    # Title first, labelled so it stands out from body text.
+    title_shape = slide.shapes.title
+    title_id = None
+    if title_shape is not None:
+        title_id = getattr(title_shape, "shape_id", None)
+        title = title_shape.text
+        if title.strip():
+            texts.append(f"标题: {title}")
+
+    for shape in slide.shapes:
+        # The title shape is also part of slide.shapes; skip it here so its
+        # text is not emitted a second time.
+        if title_id is not None and getattr(shape, "shape_id", None) == title_id:
+            continue
+        # Every shape kind is handled the same way, so no shape_type check
+        # is needed: take the text if the shape has any.
+        text = getattr(shape, "text", "")
+        if text and text.strip():
+            texts.append(text)
+
+    return texts
+
+
+def read_pptx_safe(file_path):
+    """Return the text of a PPTX file, one section per slide that has text.
+
+    Each such slide contributes a "--- 幻灯片 N ---" heading (N counted from
+    1), its text lines, and a blank line. On any error the message is
+    printed to stderr and ``None`` is returned.
+    """
+    try:
+        lines = []
+        for number, slide in enumerate(Presentation(file_path).slides, start=1):
+            slide_texts = extract_text_from_slide(slide)
+            if not slide_texts:
+                # Slides without text produce no section at all.
+                continue
+            lines.append(f"--- 幻灯片 {number} ---")
+            lines.extend(slide_texts)
+            lines.append("")
+        return "\n".join(lines)
+    except Exception as e:
+        print(f"Error reading PPTX: {e}", file=sys.stderr)
+        return None
+
+
+def main():
+    """Command-line entry point: extract a PPTX file's text and print it.
+
+    Expects exactly one argument, the path of the presentation. The text is
+    written to ``temp/pptx_output.txt`` and echoed to stdout. Exits with
+    status 1 on a usage error, a missing file, or when no content comes back.
+    """
+    args = sys.argv
+    if len(args) != 2:
+        print("Usage: python pptx_reader_safe.py <presentation.pptx>", file=sys.stderr)
+        sys.exit(1)
+
+    file_path = args[1]
+    if not os.path.exists(file_path):
+        print(f"File not found: {file_path}", file=sys.stderr)
+        sys.exit(1)
+
+    content = read_pptx_safe(file_path)
+    if not content:
+        print("Failed to extract content", file=sys.stderr)
+        sys.exit(1)
+
+    # Write a UTF-8 file as well, to sidestep terminal encoding problems.
+    output_path = "temp/pptx_output.txt"
+    os.makedirs("temp", exist_ok=True)
+    with open(output_path, "w", encoding="utf-8") as out:
+        out.write(content)
+    print(f"Content extracted to: {output_path}")
+    print("\n" + content)
+
+
+# Run the command-line entry point only when executed as a script.
+if __name__ == "__main__":
+    main()