# -*- coding: utf-8 -*-
"""
文件读取工具 - 支持docx、xlsx等二进制文件读取
解决中文编码问题
"""

import sys
import os
import zipfile
import re


def read_docx(file_path):
    """Extract the text of a .docx file as newline-separated lines.

    The archive member ``word/document.xml`` is scanned with regular
    expressions rather than an XML parser. The result contains every
    non-blank ``<w:t>`` text run on its own line, followed by one
    ``cell | cell`` line per table row. Text inside tables therefore shows
    up twice: once as individual runs and once as a joined row.

    Args:
        file_path: Path to the .docx file.

    Returns:
        The extracted text, or ``"Error reading docx: <reason>"`` when the
        file cannot be read. This function never raises.
    """
    try:
        import html
        import re
        import zipfile

        with zipfile.ZipFile(file_path, "r") as z:
            with z.open("word/document.xml") as f:
                content = f.read()

        # Some files carry GBK bytes even though they declare UTF-8, so fall
        # back to GBK (dropping undecodable bytes) when strict UTF-8 fails.
        try:
            text = content.decode("utf-8", errors="strict")
        except UnicodeDecodeError:
            text = content.decode("gbk", errors="ignore")

        # Opening tags may carry attributes (e.g. <w:tr w:rsidR="...">), so
        # each pattern accepts an optional attribute list. The "\s" keeps
        # <w:t ...> from also matching longer tag names such as <w:tbl>.
        run_re = re.compile(r"<w:t(?:\s[^>]*)?>([^<]*)</w:t>")
        row_re = re.compile(r"<w:tr(?:\s[^>]*)?>(.*?)</w:tr>", re.DOTALL)
        cell_re = re.compile(r"<w:tc(?:\s[^>]*)?>(.*?)</w:tc>", re.DOTALL)

        # Entities are decoded only after the tags have been stripped, so an
        # escaped "<" in the text can never be mistaken for markup.
        content_lines = [
            line
            for line in map(html.unescape, run_re.findall(text))
            if line.strip()
        ]

        # Append each table row as "cell | cell | ...".
        tables = re.findall(r"<w:tbl>(.*?)</w:tbl>", text, re.DOTALL)
        for table in tables:
            for row in row_re.findall(table):
                row_data = []
                for cell in cell_re.findall(row):
                    cell_text = run_re.findall(cell)
                    # Cells without any text run are skipped entirely.
                    if cell_text:
                        row_data.append(html.unescape("".join(cell_text)).strip())
                if row_data:
                    content_lines.append(" | ".join(row_data))

        return "\n".join(content_lines)

    except Exception as e:
        # Callers expect a printable string rather than an exception.
        return f"Error reading docx: {e}"


def read_docx_and_save(file_path, output_file="temp/docx_output.txt"):
    """Extract the text of a .docx file and also save it as UTF-8 text.

    The archive member ``word/document.xml`` is scanned with regular
    expressions rather than an XML parser. The output contains every
    non-blank ``<w:t>`` text run on its own line, followed by one
    ``cell | cell`` line per table row, so table text appears twice.

    Args:
        file_path: Path to the .docx file.
        output_file: Where the extracted text is written. The parent
            directory is created when missing. Defaults to
            ``temp/docx_output.txt`` relative to the working directory.

    Returns:
        A banner naming the output file followed by the extracted text, or
        ``"Error: <reason>"`` on failure. This function never raises.
    """
    try:
        import html
        import os
        import re
        import zipfile

        with zipfile.ZipFile(file_path, "r") as z:
            with z.open("word/document.xml") as f:
                content = f.read()

        # Try strict UTF-8 first, then GBK. The GBK decode ignores bad bytes
        # and so cannot fail, which is why no further fallback is needed.
        try:
            text = content.decode("utf-8", errors="strict")
        except UnicodeDecodeError:
            text = content.decode("gbk", errors="ignore")

        # Opening tags may carry attributes (e.g. <w:tr w:rsidR="...">), so
        # each pattern accepts an optional attribute list. The "\s" keeps
        # <w:t ...> from also matching longer tag names such as <w:tbl>.
        run_re = re.compile(r"<w:t(?:\s[^>]*)?>([^<]*)</w:t>")
        row_re = re.compile(r"<w:tr(?:\s[^>]*)?>(.*?)</w:tr>", re.DOTALL)
        cell_re = re.compile(r"<w:tc(?:\s[^>]*)?>(.*?)</w:tc>", re.DOTALL)

        # Entities are decoded only after the tags have been stripped, so an
        # escaped "<" in the text can never be mistaken for markup.
        content_lines = [
            line
            for line in map(html.unescape, run_re.findall(text))
            if line.strip()
        ]

        # Append each table row as "cell | cell | ...".
        tables = re.findall(r"<w:tbl>(.*?)</w:tbl>", text, re.DOTALL)
        for table in tables:
            for row in row_re.findall(table):
                row_data = []
                for cell in cell_re.findall(row):
                    cell_text = run_re.findall(cell)
                    # Cells without any text run are skipped entirely.
                    if cell_text:
                        row_data.append(html.unescape("".join(cell_text)).strip())
                if row_data:
                    content_lines.append(" | ".join(row_data))

        output = "\n".join(content_lines)

        # Persist the text; os.makedirs("") would fail, so only create the
        # directory when the path actually has one.
        out_dir = os.path.dirname(output_file)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(output)

        return f"[内容已保存到 {output_file}]\n\n{output}"

    except Exception as e:
        # Callers expect a printable string rather than an exception.
        return f"Error: {e}"


def read_xlsx(file_path, sheet_index=0, max_rows=20,
              output_file="temp/xlsx_output.txt"):
    """Dump the first rows of one worksheet as text and save them as UTF-8.

    The workbook is read directly from the zip archive with regular
    expressions. Each row becomes one ``value | value`` line; empty cells
    and rows without any value are omitted.

    Args:
        file_path: Path to the .xlsx file.
        sheet_index: Zero-based position of the worksheet among the
            ``xl/worksheets/sheet*.xml`` members, in archive listing order.
            NOTE(review): this is not guaranteed to equal the tab order
            stored in the workbook; confirm if the two must agree.
        max_rows: Maximum number of ``<row>`` elements to process.
        output_file: Where the text is written. The parent directory is
            created when missing. Defaults to ``temp/xlsx_output.txt``
            relative to the working directory.

    Returns:
        A banner naming the output file followed by the text, a
        ``"Sheet index N out of range"`` message, or ``"Error: <reason>"``
        on failure. This function never raises.
    """
    try:
        import html

        text_re = re.compile(r"<t(?:\s[^>]*)?>([^<]*)</t>")

        with zipfile.ZipFile(file_path, "r") as z:
            names = z.namelist()

            # Shared strings are indexed per <si> item. An item may consist
            # of several rich-text runs, each with its own <t>, and <t> may
            # carry attributes such as xml:space="preserve".
            string_map = {}
            if "xl/sharedStrings.xml" in names:
                with z.open("xl/sharedStrings.xml") as f:
                    ss_text = f.read().decode("utf-8", errors="ignore")
                # Phonetic runs hold reading hints, not the cell's own text.
                ss_text = re.sub(r"<rPh\b.*?</rPh>", "", ss_text, flags=re.DOTALL)
                items = re.findall(
                    r"<si\b[^>]*?(?:/>|>(.*?)</si>)", ss_text, re.DOTALL
                )
                for i, item in enumerate(items):
                    string_map[i] = html.unescape("".join(text_re.findall(item)))

            # Pick the requested worksheet part.
            sheet_files = [
                name
                for name in names
                if name.startswith("xl/worksheets/sheet") and name.endswith(".xml")
            ]
            if sheet_index >= len(sheet_files):
                return f"Sheet index {sheet_index} out of range"

            with z.open(sheet_files[sheet_index]) as f:
                sheet_content = f.read().decode("utf-8", errors="ignore")

            # Both patterns accept the self-closing form (<row .../>,
            # <c .../>) so an empty element cannot swallow the next one.
            rows = re.findall(
                r"<row\b[^>]*?(?:/>|>(.*?)</row>)", sheet_content, re.DOTALL
            )

            result = [f"=== Sheet {sheet_index} ==="]
            for row_content in rows[:max_rows]:
                cells = re.findall(
                    r"<c\b([^>]*?)(?:/>|>(.*?)</c>)", row_content, re.DOTALL
                )
                row_data = []
                for attrs, cell_content in cells:
                    # The cell type lives in the opening tag's attributes,
                    # not in the cell's inner XML.
                    type_match = re.search(r'\st="([^"]*)"', attrs)
                    cell_type = type_match.group(1) if type_match else ""

                    v_match = re.search(r"<v>([^<]*)</v>", cell_content)
                    is_match = re.search(r"<is>(.*?)</is>", cell_content, re.DOTALL)

                    if is_match:
                        # Inline string, possibly split into several runs.
                        val = html.unescape(
                            "".join(text_re.findall(is_match.group(1)))
                        )
                    elif v_match:
                        v = v_match.group(1)
                        if cell_type == "s" and v.isdigit():
                            # The value is an index into the shared strings.
                            val = string_map.get(int(v), f"[str:{v}]")
                        else:
                            # Normalise numbers; keep anything else verbatim.
                            try:
                                val = str(int(v)) if "." not in v else str(float(v))
                            except ValueError:
                                val = html.unescape(v)
                    else:
                        val = ""

                    if val:
                        row_data.append(val)

                if row_data:
                    result.append(" | ".join(row_data))

            output = "\n".join(result)

            # Persist the text; os.makedirs("") would fail, so only create
            # the directory when the path actually has one.
            out_dir = os.path.dirname(output_file)
            if out_dir:
                os.makedirs(out_dir, exist_ok=True)
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(output)

            return f"[内容已保存到 {output_file}]\n\n{output}"

    except Exception as e:
        # Callers expect a printable string rather than an exception.
        return f"Error: {e}"


def list_sheets(file_path):
    """List the worksheet names of an .xlsx workbook with their indexes.

    Args:
        file_path: Path to the workbook.

    Returns:
        A header line followed by one ``index: name`` line per sheet, or
        ``"Error: <reason>"`` on failure (including when openpyxl is not
        installed). This function never raises.
    """
    try:
        import openpyxl

        # Only the sheet names are needed, so open in read-only mode instead
        # of loading every cell. Read-only workbooks must be closed by hand.
        wb = openpyxl.load_workbook(file_path, read_only=True, data_only=True)
        try:
            sheets = wb.sheetnames
        finally:
            wb.close()
        return "可用Sheets:\n" + "\n".join(f"{i}: {s}" for i, s in enumerate(sheets))
    except Exception as e:
        # Callers expect a printable string rather than an exception.
        return f"Error: {e}"


def main(argv=None):
    """Run the command-line interface and return the process exit code.

    Args:
        argv: Full argument vector including the program name. Defaults to
            ``sys.argv``.

    Returns:
        1 when the arguments are missing or invalid or the file does not
        exist, otherwise 0. Error strings returned by the reader functions
        are printed and still count as 0.
    """
    args = sys.argv if argv is None else argv

    if len(args) < 2:
        print("用法:")
        print("  python file_reader.py <file.docx>")
        print("  python file_reader.py <file.xlsx>                    # 列出所有sheets")
        print("  python file_reader.py <file.xlsx> <sheet_index>      # 读取指定sheet")
        print("  python file_reader.py <file.xlsx> <sheet_index> <max_rows>")
        print("\n示例:")
        print("  python file_reader.py test.xlsx           # 列出sheets")
        print("  python file_reader.py test.xlsx 1         # 读取第2个sheet")
        print("  python file_reader.py test.xlsx 1 30      # 读取前30行")
        return 1

    file_path = args[1]

    if not os.path.exists(file_path):
        print(f"文件不存在: {file_path}")
        return 1

    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".docx":
        print(read_docx_and_save(file_path))
    elif ext in (".xlsx", ".xls"):
        if len(args) == 2:
            # No sheet index given: just list the available sheets.
            print(list_sheets(file_path))
        else:
            # Validate the numeric arguments up front so a typo produces a
            # readable message instead of a traceback.
            try:
                sheet_index = int(args[2])
                max_rows = int(args[3]) if len(args) > 3 else 20
            except ValueError:
                given = " ".join(args[2:4])
                print(f"sheet_index 和 max_rows 必须是整数: {given}")
                return 1
            print(read_xlsx(file_path, sheet_index, max_rows))
    else:
        print(f"不支持的文件类型: {ext}")

    return 0


if __name__ == "__main__":
    sys.exit(main())