Files
skills/file-reader/scripts/pptx_reader_safe.py
T
hmo 04db423416 Initial commit: skills library
- 70 skills with code and documentation
- Add .gitignore (ignore __pycache__, output/, temp/, venv/)
- Clean up test intermediates and caches
2026-04-26 19:27:40 +08:00

93 lines
2.5 KiB
Python

#!/usr/bin/env python3
"""
安全的PPTX读取脚本 - 不依赖markitdown,避免Google Vision API问题
"""
import sys
import os
from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE
def extract_text_from_shape(shape):
    """Return the text of a shape, or an empty string if it has none.

    Args:
        shape: Any object; only its optional ``text`` attribute is read.

    Returns:
        The value of ``shape.text`` when the attribute exists, otherwise "".
    """
    # Shapes without a ``text`` attribute yield an empty string instead of
    # raising.
    return getattr(shape, "text", "")
def extract_text_from_slide(slide):
    """Collect the text found on one slide as a list of strings.

    The title placeholder, when present and not blank, comes first and is
    prefixed with "标题: ". Every other shape that exposes a non-blank
    ``text`` attribute follows, in the order the shapes are iterated.

    Args:
        slide: A slide-like object whose ``shapes`` collection is iterable
            and has a ``title`` attribute (``None`` when there is no title).

    Returns:
        list[str]: The extracted strings; empty when the slide has no text.
    """
    texts = []

    # Emit the title first, with its prefix.
    title_shape = slide.shapes.title
    if title_shape is not None:
        title = title_shape.text
        if title.strip():
            texts.append(f"标题: {title}")

    for shape in slide.shapes:
        # The title placeholder is also a member of slide.shapes; skip it so
        # its text is not emitted a second time without the prefix.
        # Equality (not identity) is used because python-pptx may hand out a
        # fresh proxy object for the same underlying shape on each access.
        if title_shape is not None and shape == title_shape:
            continue

        # Text boxes, placeholders and every other text-bearing shape are
        # treated identically, so no shape_type dispatch is needed. Not
        # reading shape_type also avoids the error python-pptx can raise for
        # shape kinds it does not recognise, which would otherwise abort the
        # extraction of the whole file.
        text = getattr(shape, "text", "") or ""
        if text.strip():
            texts.append(text)

    return texts
def read_pptx_safe(file_path):
    """Read a PPTX file and return its text, grouped by slide.

    Each slide that yields text contributes a "--- 幻灯片 N ---" header line
    (N is the slide's 1-based position), its extracted strings, and a
    trailing blank line. Slides that yield no text are omitted.

    Args:
        file_path: Path of the presentation, passed to ``Presentation``.

    Returns:
        The newline-joined text, or ``None`` if any exception is raised
        while opening or reading the file (the error is printed to stderr).
    """
    lines = []
    try:
        deck = Presentation(file_path)
        for slide_no, slide in enumerate(deck.slides, start=1):
            slide_texts = extract_text_from_slide(slide)
            if not slide_texts:
                continue
            lines += [f"--- 幻灯片 {slide_no} ---", *slide_texts, ""]
        return "\n".join(lines)
    except Exception as e:
        # Deliberate catch-all: the caller is told of failure via None
        # rather than by an exception.
        print(f"Error reading PPTX: {e}", file=sys.stderr)
        return None
def main():
    """Command-line entry point: extract the text of one PPTX file.

    Expects exactly one command-line argument, the presentation path. On
    success the text is written to temp/pptx_output.txt (UTF-8) and also
    printed to stdout. Exits with status 1 on a usage error, when the file
    does not exist, or when read_pptx_safe returns None or an empty string.
    """
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        print("Usage: python pptx_reader_safe.py <presentation.pptx>", file=sys.stderr)
        sys.exit(1)

    (file_path,) = cli_args
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}", file=sys.stderr)
        sys.exit(1)

    content = read_pptx_safe(file_path)
    if not content:
        print("Failed to extract content", file=sys.stderr)
        sys.exit(1)

    # Save to a file as well, to sidestep terminal encoding problems.
    output_path = "temp/pptx_output.txt"
    os.makedirs("temp", exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as out_file:
        out_file.write(content)

    print(f"Content extracted to: {output_path}")
    print("\n" + content)
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()