04db423416
- 70 skills with code and documentation - Add .gitignore (ignore __pycache__, output/, temp/, venv/) - Clean up test intermediates and caches
134 lines
3.6 KiB
Python
134 lines
3.6 KiB
Python
#!/usr/bin/env python3
"""
extract_pdf_images.py

Extract sheet-music (staff notation) images from a PDF for later OMR processing.

What it does:
1. Opens the PDF file
2. Extracts the images on each page
3. Saves them as PNG files

Usage:
    python extract_pdf_images.py <pdf_path> [output_dir]

Examples:
    python extract_pdf_images.py "D:/scores/sheet.pdf"

    # Extract into a specific directory
    python extract_pdf_images.py "D:/scores/sheet.pdf" "D:/output/sheets"

Dependencies:
    pip install pymupdf
"""
|
|
|
|
import os
|
|
import sys
|
|
import argparse
|
|
|
|
# PyMuPDF is the only third-party dependency. If it is missing, explain how to
# install it and stop with a non-zero status instead of showing a traceback.
try:
    import fitz  # PyMuPDF
except ImportError:
    # Diagnostics go to stderr so they do not mix with normal output.
    print("Error: PyMuPDF not installed.", file=sys.stderr)
    print("Install with: pip install pymupdf", file=sys.stderr)
    sys.exit(1)
|
|
|
|
|
|
def extract_images_from_pdf(pdf_path: str, output_dir: str = None) -> list:
    """Extract every image referenced by each page of a PDF and save it as PNG.

    Output files are named ``page<NNN>_img<MM>.png`` using 1-based page and
    per-page image numbers. Progress is printed to stdout.

    Args:
        pdf_path: Path to the PDF file.
        output_dir: Directory for the PNG files. Defaults to
            ``temp/pdf_sheets`` inside the PDF's directory. It is created
            if it does not exist.

    Returns:
        List of paths of the PNG files written, in page order.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF not found: {pdf_path}")

    if output_dir is None:
        output_dir = os.path.join(os.path.dirname(pdf_path), 'temp', 'pdf_sheets')

    os.makedirs(output_dir, exist_ok=True)

    doc = fitz.open(pdf_path)
    extracted = []
    # Close the document even if extracting or saving an image fails.
    try:
        print(f"[PDF] Opened: {pdf_path}")
        print(f"[PDF] Pages: {len(doc)}")

        for page_num, page in enumerate(doc, start=1):
            images = page.get_images()
            print(f"[PDF] Page {page_num}: {len(images)} image(s)")

            for img_idx, img in enumerate(images, start=1):
                xref = img[0]  # first entry of the image record is its xref
                pix = fitz.Pixmap(doc, xref)

                # Four or more colour components (e.g. CMYK): convert to RGB
                # before saving. Gray and RGB pixmaps are saved as they are.
                if pix.n - pix.alpha >= 4:
                    pix = fitz.Pixmap(fitz.csRGB, pix)

                out_path = os.path.join(
                    output_dir,
                    f"page{page_num:03d}_img{img_idx:02d}.png"
                )
                pix.save(out_path)

                print(f" -> {out_path}")
                extracted.append(out_path)
    finally:
        doc.close()

    return extracted
|
|
|
|
|
|
def main():
    """Command-line entry point: extract images from a PDF and print a summary.

    Takes a required ``pdf`` argument and an optional ``output_dir`` argument,
    makes both absolute, and calls ``extract_images_from_pdf``. On success it
    prints the number of extracted images and, if there are any, suggested
    next steps. If anything raises, the error is printed to stderr and the
    process exits with status 1.
    """
    parser = argparse.ArgumentParser(
        description="Extract images from PDF for OMR processing",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python extract_pdf_images.py "D:\\scores\\sheet.pdf"
  python extract_pdf_images.py "D:\\scores\\sheet.pdf" "D:\\output\\sheets"
"""
    )
    parser.add_argument('pdf', help="Input PDF file")
    parser.add_argument('output_dir', nargs='?', default=None,
                        help="Output directory (default: temp/pdf_sheets in PDF dir)")

    args = parser.parse_args()

    pdf_path = os.path.abspath(args.pdf)
    output_dir = os.path.abspath(args.output_dir) if args.output_dir else None

    # Top-level boundary: report any failure as a one-line error and exit 1.
    try:
        images = extract_images_from_pdf(pdf_path, output_dir)

        print()
        print("=" * 50)
        print(f"Extracted {len(images)} image(s)")
        print("=" * 50)

        if images:
            # The hint is a PowerShell command (`& '...'`). Single-quote each
            # path, doubling embedded quotes, so paths with spaces stay one
            # argument when the command is pasted.
            quoted_images = ' '.join(
                "'" + path.replace("'", "''") + "'" for path in images
            )
            print("\nNext steps:")
            print(f" 1. Audiveris: & 'C:\\Program Files\\Audiveris\\Audiveris.exe' -batch -export -output <dir> {quoted_images}")
            print(" 2. Or use the audiveris_to_musescore.py script for each image")

    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
|
|
|
|
|
|
# Run the command-line interface only when this file is executed as a script,
# so importing the module has no side effects.
if __name__ == "__main__":
    main()
|