# skills/file-reader/scripts/file_reader_enhanced.py

# -*- coding: utf-8 -*-
"""
文件读取工具 - 支持docx、xlsx、xls等二进制文件读取
解决中文编码问题
"""

import os
import re
import sys
import xml.etree.ElementTree as ET
import zipfile

import pandas as pd


def _docx_local_name(tag):
    """Return an XML tag name with its ``{namespace}`` prefix removed."""
    return tag.rsplit("}", 1)[-1]


def _docx_runs(node):
    """Return the text of every ``t`` (text run) element at or below *node*."""
    return [
        elem.text or ""
        for elem in node.iter()
        if _docx_local_name(elem.tag) == "t"
    ]


def _docx_collect_lines(node, lines):
    """Append paragraph and table-row text found under *node* to *lines*.

    Elements are visited in document order. A paragraph becomes one line (its
    runs concatenated); a table row becomes one line with the cells joined by
    `` | ``. Any other element is treated as a container and searched.
    """
    for child in node:
        name = _docx_local_name(child.tag)
        if name == "p":
            text = "".join(_docx_runs(child))
            if text.strip():
                lines.append(text)
        elif name == "tbl":
            for row in child:
                if _docx_local_name(row.tag) != "tr":
                    continue
                cells = []
                for cell in row:
                    if _docx_local_name(cell.tag) != "tc":
                        continue
                    runs = _docx_runs(cell)
                    # Cells without any text run are skipped entirely.
                    if runs:
                        cells.append("".join(runs).strip())
                if cells:
                    lines.append(" | ".join(cells))
        else:
            _docx_collect_lines(child, lines)


def read_docx_and_save(file_path):
    """Extract the text of a .docx file and save it as UTF-8 text.

    ``word/document.xml`` is parsed as XML, so character entities are decoded
    and the encoding declared by the document is honoured. Paragraphs and
    tables are emitted in the order they appear; text inside a table is
    reported once, as `` | ``-separated rows.

    Args:
        file_path: Path of the .docx file to read.

    Returns:
        A notice line followed by the extracted text. The same text is written
        to ``temp/docx_output.txt`` (relative to the current working
        directory). On any failure the string ``"Error: <reason>"`` is
        returned instead of raising.
    """
    try:
        with zipfile.ZipFile(file_path, "r") as archive:
            xml_bytes = archive.read("word/document.xml")

        # NOTE: ElementTree is not hardened against maliciously crafted XML;
        # only feed this documents from sources you trust.
        root = ET.fromstring(xml_bytes)

        content_lines = []
        _docx_collect_lines(root, content_lines)
        output = "\n".join(content_lines)

        # Persist the text so it can be viewed with a UTF-8 aware tool even
        # when the console cannot display it.
        output_file = "temp/docx_output.txt"
        os.makedirs("temp", exist_ok=True)
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(output)

        return f"[内容已保存到 temp/docx_output.txt]\n\n{output}"

    except Exception as e:
        # Callers print the return value, so failures are reported as text.
        return f"Error: {e}"


def _xlsx_local_name(tag):
    """Return an XML tag or attribute name with its ``{namespace}`` prefix removed."""
    return tag.rsplit("}", 1)[-1]


def _xlsx_rich_text(node):
    """Return the text of a string item (``si`` or ``is`` element).

    Plain items hold one ``t`` child; rich-text items hold several ``r`` runs
    that each contain a ``t``. Phonetic runs (``rPh``) are not part of the
    displayed value and are ignored.
    """
    parts = []
    for child in node:
        name = _xlsx_local_name(child.tag)
        if name == "t":
            parts.append(child.text or "")
        elif name == "r":
            parts.extend(
                t.text or "" for t in child if _xlsx_local_name(t.tag) == "t"
            )
    return "".join(parts)


def _xlsx_sheet_paths(archive):
    """Return the archive paths of the worksheets in workbook (tab) order.

    The order is taken from ``xl/workbook.xml`` and its relationships file.
    If those parts are missing, unparsable, or resolve to nothing, worksheet
    files are located by name and ordered by their numeric suffix.
    """
    names = archive.namelist()
    ordered = []
    try:
        workbook = ET.fromstring(archive.read("xl/workbook.xml"))
        rels = ET.fromstring(archive.read("xl/_rels/workbook.xml.rels"))
    except (KeyError, ET.ParseError):
        workbook = rels = None

    if workbook is not None:
        # Relationship id -> target, restricted to worksheet relationships so
        # chart sheets and other parts do not take up an index.
        targets = {
            rel.get("Id"): rel.get("Target", "")
            for rel in rels
            if rel.get("Type", "").endswith("/worksheet")
        }
        for node in workbook.iter():
            if _xlsx_local_name(node.tag) != "sheet":
                continue
            rel_id = next(
                (
                    value
                    for key, value in node.attrib.items()
                    if _xlsx_local_name(key) == "id"
                ),
                None,
            )
            target = targets.get(rel_id)
            if not target:
                continue
            # Targets are relative to xl/ unless written as absolute paths.
            path = target.lstrip("/") if target.startswith("/") else "xl/" + target
            if path in names:
                ordered.append(path)

    if ordered:
        return ordered

    def suffix_number(name):
        match = re.search(r"(\d+)\.xml$", name)
        return int(match.group(1)) if match else 0

    candidates = [
        name
        for name in names
        if name.startswith("xl/worksheets/sheet") and name.endswith(".xml")
    ]
    return sorted(candidates, key=lambda name: (suffix_number(name), name))


def _xlsx_cell_text(cell, shared_strings):
    """Return the display text of one ``c`` (cell) element, or ``""`` if empty."""
    inline = value = None
    for child in cell:
        name = _xlsx_local_name(child.tag)
        if name == "is":
            inline = child
        elif name == "v":
            value = child

    if inline is not None:
        return _xlsx_rich_text(inline)
    if value is None or value.text is None:
        return ""

    raw = value.text
    # The cell type lives in the "t" attribute; a missing attribute means number.
    cell_type = cell.get("t", "n")
    if cell_type == "s" and raw.isdigit():
        index = int(raw)
        if index < len(shared_strings):
            return shared_strings[index]
        return f"[str:{raw}]"
    if cell_type == "n":
        # Normalise numbers ("1.50" -> "1.5"); anything unparsable stays raw.
        try:
            return str(int(raw)) if "." not in raw else str(float(raw))
        except ValueError:
            return raw
    return raw


def read_xlsx(file_path, sheet_index=0, max_rows=20):
    """Read one worksheet of an .xlsx file and save it as UTF-8 text.

    Args:
        file_path: Path of the .xlsx file.
        sheet_index: Zero-based position of the worksheet in workbook order.
        max_rows: Number of leading sheet rows to examine (``rows[:max_rows]``).
            Rows that contain no values still count towards this limit but
            are left out of the output.

    Returns:
        A notice line followed by a ``=== Sheet N ===`` header and one
        `` | ``-separated line per non-empty row; empty cells are omitted.
        The same text is written to ``temp/xlsx_output.txt`` (relative to the
        current working directory). Returns
        ``"Sheet index N out of range"`` when the index is too large and
        ``"Error: <reason>"`` on any other failure.
    """
    try:
        with zipfile.ZipFile(file_path, "r") as archive:
            # Shared strings are referenced by the position of their <si>
            # element, so build the table per item rather than per <t>.
            shared_strings = []
            if "xl/sharedStrings.xml" in archive.namelist():
                table = ET.fromstring(archive.read("xl/sharedStrings.xml"))
                shared_strings = [
                    _xlsx_rich_text(item)
                    for item in table
                    if _xlsx_local_name(item.tag) == "si"
                ]

            sheet_files = _xlsx_sheet_paths(archive)
            if sheet_index >= len(sheet_files):
                return f"Sheet index {sheet_index} out of range"

            # NOTE: ElementTree is not hardened against maliciously crafted
            # XML; only feed this workbooks from sources you trust.
            sheet_root = ET.fromstring(archive.read(sheet_files[sheet_index]))

        rows = [
            node for node in sheet_root.iter() if _xlsx_local_name(node.tag) == "row"
        ]

        result = [f"=== Sheet {sheet_index} ==="]
        for row in rows[:max_rows]:
            row_data = []
            for cell in row:
                if _xlsx_local_name(cell.tag) != "c":
                    continue
                val = _xlsx_cell_text(cell, shared_strings)
                if val:
                    row_data.append(val)
            if row_data:
                result.append(" | ".join(row_data))

        output = "\n".join(result)

        # Persist the text so it can be viewed with a UTF-8 aware tool even
        # when the console cannot display it.
        output_file = "temp/xlsx_output.txt"
        os.makedirs("temp", exist_ok=True)
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(output)

        return f"[内容已保存到 temp/xlsx_output.txt]\n\n{output}"

    except Exception as e:
        # Callers print the return value, so failures are reported as text.
        return f"Error: {e}"


# Leading bytes of real workbook containers: OLE2 (.xls) and ZIP (.xlsx).
_XLS_BINARY_SIGNATURES = (b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1", b"PK\x03\x04")


def _xls_may_be_text(file_path):
    """Return True when *file_path* is readable and is not a binary workbook."""
    try:
        with open(file_path, "rb") as f:
            head = f.read(8)
    except OSError:
        return False
    return not head.startswith(_XLS_BINARY_SIGNATURES)


def _xls_render_and_save(df, title):
    """Format *df* as text under *title*, save it, and return the report.

    The header row and every data row are joined with `` | ``; missing values
    are rendered as empty strings. The text is written to
    ``temp/xls_output.txt`` relative to the current working directory.
    """
    lines = [title, " | ".join(str(col) for col in df.columns)]
    for _, row in df.iterrows():
        lines.append(" | ".join("" if pd.isna(val) else str(val) for val in row))
    output = "\n".join(lines)

    output_file = "temp/xls_output.txt"
    os.makedirs("temp", exist_ok=True)
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(output)

    return f"[内容已保存到 temp/xls_output.txt]\n\n{output}"


def read_xls(file_path, sheet_index=0, max_rows=20):
    """Read an .xls file and save its content as UTF-8 text.

    The file is first read with ``pandas.read_excel``. If that fails and the
    file is not a binary workbook, it is assumed to be tab-separated text
    saved with an .xls extension and is read with ``pandas.read_csv``, trying
    UTF-8 (with or without BOM), then GBK, then Latin-1.

    Args:
        file_path: Path of the .xls file.
        sheet_index: Zero-based sheet position passed to ``read_excel``.
        max_rows: Maximum number of data rows to read.

    Returns:
        A notice line followed by a header and `` | ``-separated rows; the
        same text is written to ``temp/xls_output.txt``. When nothing can
        read the file, ``"Error reading xls file: <reason>"`` is returned
        with the original ``read_excel`` failure as the reason.
    """
    try:
        df = pd.read_excel(file_path, sheet_name=sheet_index, nrows=max_rows)
        return _xls_render_and_save(df, f"=== Sheet {sheet_index} ===")
    except Exception as e:
        # Latin-1 can decode any byte sequence, so a genuine binary workbook
        # would "parse" into garbage and hide the real error (missing engine,
        # bad sheet index, ...). Only fall back for files that may be text.
        if _xls_may_be_text(file_path):
            # Strict UTF-8 goes first: GBK accepts many UTF-8 byte sequences
            # and would silently produce mojibake.
            for encoding in ("utf-8-sig", "gbk", "latin1"):
                try:
                    df = pd.read_csv(
                        file_path, encoding=encoding, sep="\t", nrows=max_rows
                    )
                    return _xls_render_and_save(
                        df, f"=== Sheet {sheet_index} (CSV format) ==="
                    )
                except Exception:
                    # Wrong encoding or unparsable content: try the next one.
                    continue
        return f"Error reading xls file: {e}"


def list_sheets(file_path):
    """List the sheet names of an Excel workbook with their zero-based index.

    Args:
        file_path: Path of the .xlsx or .xls file.

    Returns:
        A header line followed by one ``"<index>: <name>"`` line per sheet,
        or ``"Error: <reason>"`` if the workbook cannot be opened.
    """
    try:
        # The context manager closes the underlying file handle once the
        # names have been read.
        with pd.ExcelFile(file_path) as excel_file:
            sheets = excel_file.sheet_names
        return "可用Sheets:\n" + "\n".join(f"{i}: {s}" for i, s in enumerate(sheets))
    except Exception as e:
        # Callers print the return value, so failures are reported as text.
        return f"Error: {e}"


def main(argv=None):
    """Run the command-line interface and return the process exit status.

    Args:
        argv: Full argument vector including the program name; defaults to
            ``sys.argv``.

    Returns:
        1 when the arguments are missing or invalid or the file does not
        exist, otherwise 0. Reader failures are printed as text and still
        yield 0.
    """
    args = sys.argv if argv is None else argv

    if len(args) < 2:
        print("用法:")
        print("  python file_reader.py <file.docx>")
        print("  python file_reader.py <file.xlsx>                    # 列出所有sheets")
        print("  python file_reader.py <file.xlsx> <sheet_index>      # 读取指定sheet")
        print("  python file_reader.py <file.xlsx> <sheet_index> <max_rows>")
        print("  python file_reader.py <file.xls>                     # 列出所有sheets")
        print("  python file_reader.py <file.xls> <sheet_index>       # 读取指定sheet")
        print("\n示例:")
        print("  python file_reader.py test.xlsx           # 列出sheets")
        print("  python file_reader.py test.xlsx 1         # 读取第2个sheet")
        print("  python file_reader.py test.xlsx 1 30      # 读取前30行")
        print("  python file_reader.py test.xls            # 读取xls文件")
        return 1

    file_path = args[1]
    if not os.path.exists(file_path):
        print(f"文件不存在: {file_path}")
        return 1

    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".docx":
        print(read_docx_and_save(file_path))
        return 0

    if ext not in (".xlsx", ".xls"):
        print(f"不支持的文件类型: {ext}")
        return 0

    # Without a sheet index, both Excel formats just list their sheets.
    if len(args) == 2:
        print(list_sheets(file_path))
        return 0

    try:
        sheet_index = int(args[2])
        max_rows = int(args[3]) if len(args) > 3 else 20
    except ValueError as error:
        # Report bad numeric arguments as a message instead of a traceback.
        print(f"sheet_index 和 max_rows 必须是整数: {error}")
        return 1

    reader = read_xlsx if ext == ".xlsx" else read_xls
    print(reader(file_path, sheet_index, max_rows))
    return 0


if __name__ == "__main__":
    sys.exit(main())