04db423416
- 70 skills with code and documentation - Add .gitignore (ignore __pycache__, output/, temp/, venv/) - Clean up test intermediates and caches
93 lines
2.5 KiB
Python
93 lines
2.5 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
安全的PPTX读取脚本 - 不依赖markitdown,避免Google Vision API问题
|
|
"""
|
|
|
|
import sys
|
|
import os
|
|
from pptx import Presentation
|
|
from pptx.enum.shapes import MSO_SHAPE_TYPE
|
|
|
|
|
|
def extract_text_from_shape(shape):
    """Return the text carried by a shape, or an empty string if it has none.

    Args:
        shape: Any object; only its optional ``text`` attribute is read.

    Returns:
        The value of ``shape.text`` when that attribute exists, otherwise ``""``.
    """
    # getattr with a default is the one-step form of "hasattr, then read".
    return getattr(shape, "text", "")
|
|
|
|
|
|
def extract_text_from_slide(slide):
    """Collect the non-blank text of every shape on a slide.

    The title, when present and non-blank, comes first and is prefixed with
    the title label. Every other shape that exposes non-blank ``text``
    contributes that text unchanged, in the order the shapes are iterated.

    Args:
        slide: Object whose ``shapes`` attribute is iterable and has a
            ``title`` attribute (the title shape, or ``None``).

    Returns:
        list[str]: The extracted strings; empty when the slide has no text.
    """
    texts = []

    # Extract the title first so it can be labelled.
    title_shape = slide.shapes.title
    title_id = None
    if title_shape is not None:
        # Remember the title's id: the title is also a member of
        # ``slide.shapes`` and would otherwise be emitted a second time below.
        title_id = getattr(title_shape, "shape_id", None)
        title = title_shape.text
        if title.strip():
            texts.append(f"标题: {title}")

    # Every remaining shape is handled the same way regardless of its type:
    # keep its text if it has any. No dispatch on ``shape_type`` is needed,
    # which also avoids touching that property on shapes that may not
    # implement it (NOTE(review): believed to raise NotImplementedError on
    # python-pptx's base shape -- confirm).
    for shape in slide.shapes:
        if title_id is not None and getattr(shape, "shape_id", None) == title_id:
            continue
        text = getattr(shape, "text", "")
        if text and text.strip():
            texts.append(text)

    return texts
|
|
|
|
|
|
def read_pptx_safe(file_path):
    """Read a PPTX file and return its text as a single newline-joined string.

    Each slide that yields text contributes a header line carrying its
    1-based number, then its text lines, then one empty line. Slides without
    text are left out entirely.

    Args:
        file_path: Path of the presentation, passed to ``Presentation``.

    Returns:
        The joined text, or ``None`` if any exception was raised while loading
        or extracting (the error is reported on stderr).
    """
    try:
        presentation = Presentation(file_path)
        lines = []
        for index, slide in enumerate(presentation.slides):
            slide_lines = extract_text_from_slide(slide)
            if not slide_lines:
                # Nothing to report for this slide; skip its header as well.
                continue
            lines.append(f"--- 幻灯片 {index + 1} ---")
            lines += slide_lines
            lines.append("")
        return "\n".join(lines)
    except Exception as exc:
        # Best-effort reader: report the problem and signal failure with None.
        print(f"Error reading PPTX: {exc}", file=sys.stderr)
        return None
|
|
|
|
|
|
def main():
    """Command-line entry point: dump a presentation's text to a file and stdout.

    Expects exactly one command-line argument, the path of the PPTX file.
    Exits with status 1 on a usage error, when the file does not exist, or
    when ``read_pptx_safe`` returns nothing. Otherwise the text is written to
    ``temp/pptx_output.txt`` (relative to the working directory) and printed.
    """
    args = sys.argv[1:]
    if len(args) != 1:
        print("Usage: python pptx_reader_safe.py <presentation.pptx>", file=sys.stderr)
        sys.exit(1)

    (file_path,) = args
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}", file=sys.stderr)
        sys.exit(1)

    content = read_pptx_safe(file_path)
    if not content:
        print("Failed to extract content", file=sys.stderr)
        sys.exit(1)

    # Save to a file as well, to sidestep terminal encoding problems.
    output_path = "temp/pptx_output.txt"
    os.makedirs("temp", exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as out_file:
        out_file.write(content)

    print(f"Content extracted to: {output_path}")
    print("\n" + content)
|
|
|
|
|
|
# Run the command-line interface only when executed as a script, so that
# importing this module has no side effects.
if __name__ == "__main__":
    main()
|