#!/usr/bin/env python3
"""
Safe PPTX reader script.

Extracts the text of a PowerPoint file with python-pptx only. It does not
depend on markitdown, which avoids the Google Vision API problems.
"""

import sys
import os

from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE


def extract_text_from_shape(shape):
    """Return the text of *shape*, or "" when the shape has no ``text`` attribute."""
    return getattr(shape, "text", "")


def _is_group(shape):
    """Return True when *shape* reports itself as a group shape."""
    try:
        return shape.shape_type == MSO_SHAPE_TYPE.GROUP
    except NotImplementedError:
        # python-pptx raises this for shape kinds it does not recognise;
        # such a shape is simply not treated as a group.
        return False


def _iter_shape_texts(shapes, skip_shape_id=None):
    """Yield every non-blank text found in *shapes*.

    Group shapes are walked recursively and table cells are read one by one.
    The shape whose ``shape_id`` equals *skip_shape_id* (if any) is skipped.
    """
    for shape in shapes:
        if skip_shape_id is not None and shape.shape_id == skip_shape_id:
            continue

        if _is_group(shape):
            yield from _iter_shape_texts(shape.shapes)
        elif getattr(shape, "has_table", False):
            for row in shape.table.rows:
                for cell in row.cells:
                    cell_text = cell.text
                    if cell_text.strip():
                        yield cell_text
        else:
            text = extract_text_from_shape(shape)
            if text.strip():
                yield text


def extract_text_from_slide(slide):
    """Extract all text from a slide.

    Returns a list of strings. A non-blank slide title comes first, prefixed
    with "标题: "; it is followed by the non-blank text of every other shape
    in shape order, including shapes nested in groups and table cells.
    """
    texts = []

    # Title first. Remember its id so the loop below does not add it a
    # second time (the title is also an ordinary member of slide.shapes).
    title_shape = slide.shapes.title
    title_id = None
    if title_shape is not None:
        title_id = title_shape.shape_id
        title = title_shape.text
        if title.strip():
            texts.append(f"标题: {title}")

    texts.extend(_iter_shape_texts(slide.shapes, skip_shape_id=title_id))
    return texts


def read_pptx_safe(file_path):
    """Read a PPTX file and return its text content.

    Each slide that contains text contributes a "--- 幻灯片 N ---" header,
    its text lines, and a blank separator line.

    Returns the joined text (possibly "" when no slide has text), or None
    when the file could not be read; the error is reported on stderr.
    """
    try:
        prs = Presentation(file_path)
        all_content = []

        for number, slide in enumerate(prs.slides, start=1):
            slide_content = extract_text_from_slide(slide)
            if slide_content:
                all_content.append(f"--- 幻灯片 {number} ---")
                all_content.extend(slide_content)
                all_content.append("")

        return "\n".join(all_content)
    except Exception as e:
        # Deliberately broad: any failure while opening or walking the file
        # is reported and turned into the None "failed" result.
        print(f"Error reading PPTX: {e}", file=sys.stderr)
        return None


def main():
    """Command-line entry point: extract text from the PPTX given as argv[1].

    Writes the text to temp/pptx_output.txt (UTF-8) and echoes it to stdout.
    Exits with status 1 on bad usage, a missing file, or a read failure.
    """
    if len(sys.argv) != 2:
        print("Usage: python pptx_reader_safe.py <pptx_file>", file=sys.stderr)
        sys.exit(1)

    file_path = sys.argv[1]
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}", file=sys.stderr)
        sys.exit(1)

    content = read_pptx_safe(file_path)
    # None means the read failed; "" is a valid result for a deck without text.
    if content is None:
        print("Failed to extract content", file=sys.stderr)
        sys.exit(1)

    # Save to a file to avoid terminal encoding problems.
    output_path = "temp/pptx_output.txt"
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(content)

    print(f"Content extracted to: {output_path}")
    try:
        print("\n" + content)
    except UnicodeEncodeError:
        # The file above is already complete; only the console echo failed.
        print(
            "Content cannot be displayed with the terminal encoding; "
            "see the output file.",
            file=sys.stderr,
        )


if __name__ == "__main__":
    main()