#!/usr/bin/env python3
"""
HTML Splitter and PNG Generator

This script splits a long HTML file containing multiple tables into smaller
HTML files based on content logic, then converts each to high-resolution PNG
images.
"""

import os
import argparse
import shutil
from pathlib import Path
from PIL import Image, ImageDraw
import numpy as np
import subprocess
import time
from urllib.parse import quote

# Heading texts that mark the logical sections of the input document.
# NOTE(review): the markup that surrounded these texts inside the original
# literals is not recoverable, so only the visible heading text is matched --
# confirm against a real input file.
_SECTION_HEADINGS = (
    "第八届李斯特国际钢琴公开赛",
    "第四届微风·长隆国际钢琴声乐公开赛",
    "2025第三届弘杨中国作品·深圳青少年钢琴邀请赛",
    "2025年中国音协考级:",
    "李佳茵荣誉",
)

# Closing tag used as the break point after a matched heading.
# NOTE(review): assumed to be "</div>" from the "after closing div or table"
# comment; the original hard-coded an offset of 7, which matches neither
# "</div>" nor "</table>" -- confirm which tag was intended.
_BREAK_TAG = "</div>"
_TABLE_CLOSE = "</table>"
# Appended to a section that was trimmed after its last table.
_CLOSING_TAGS = "\n</body>\n</html>"

# Thresholds for the size-based fallback split.
_FALLBACK_MIN_LENGTH = 10000   # only split documents longer than ~10k chars
_FALLBACK_TARGET_SIZE = 15000  # aim for ~15k chars per part
_FALLBACK_MAX_PARTS = 6
_FALLBACK_SEARCH_WINDOW = 2000  # how far past the ideal cut to look for a tag

_ORIGINAL_SUFFIX = "_500原图.png"
_TRANSPARENT_SUFFIX = "_500透明.png"


def _fallback_split_positions(content):
    """Return roughly evenly spaced cut offsets that fall on tag boundaries.

    Used only when no known heading was found.  Each ideal cut is moved
    forward (by at most ``_FALLBACK_SEARCH_WINDOW`` characters) to just after
    a ``</div>`` or just before a ``<table``; a cut with no such boundary in
    range is skipped rather than made in the middle of an element.
    NOTE(review): the exact cut rule of the original is not recoverable;
    this is the conservative interpretation -- confirm.
    """
    content_len = len(content)
    if content_len <= _FALLBACK_MIN_LENGTH:
        return []

    num_parts = min(
        _FALLBACK_MAX_PARTS, max(2, content_len // _FALLBACK_TARGET_SIZE)
    )
    positions = []
    for i in range(1, num_parts):
        target = (content_len * i) // num_parts
        for j in range(target, min(content_len, target + _FALLBACK_SEARCH_WINDOW)):
            if content.startswith("</div>", j):
                positions.append(j + len("</div>"))
                break
            if content.startswith("<table", j):
                positions.append(j)
                break
    return positions


def split_html_by_content(content):
    """Split HTML content into logical sections based on headings and table groups.

    Args:
        content: The HTML text to split.

    Returns:
        A list of HTML fragments in document order.  A fragment that contains
        a table is cut right after its last ``</table>`` and gets
        ``</body></html>`` closing tags appended; other fragments are returned
        unchanged.  Empty input yields an empty list.
    """
    split_positions = [0]  # Start at beginning

    for heading in _SECTION_HEADINGS:
        pos = content.find(heading)
        if pos == -1:
            continue
        # Break after the first closing tag that follows the heading.
        end_pos = content.find(_BREAK_TAG, pos)
        if end_pos != -1:
            split_positions.append(end_pos + len(_BREAK_TAG))

    # If no heading produced a split point, fall back to size-based splitting.
    if len(split_positions) == 1:
        split_positions.extend(_fallback_split_positions(content))

    # Add the end of content as final split point; sort and de-duplicate so
    # slices are well-formed regardless of the order headings were found in.
    split_positions.append(len(content))
    split_positions = sorted(set(split_positions))

    sections = []
    for start, end in zip(split_positions, split_positions[1:]):
        section_content = content[start:end]
        last_table_end = section_content.rfind(_TABLE_CLOSE)
        if last_table_end != -1:
            section_content = (
                section_content[: last_table_end + len(_TABLE_CLOSE)]
                + _CLOSING_TAGS
            )
        sections.append(section_content)

    return sections


def create_complete_html_section(content_part, section_num):
    """Create a complete HTML document from a content part.

    Args:
        content_part: An HTML fragment, optionally containing a ``<style>``
            block and/or ``<body>`` tags.
        section_num: Page number shown in the document title.

    Returns:
        The text of a standalone HTML document wrapping the fragment.
    """
    # Extract the CSS from the original if it's contained in the part
    css_start = content_part.find("<style")
    css_end = content_part.find("</style>")
    if css_start != -1 and css_end != -1:
        css = content_part[css_start : css_end + len("</style>")]
    else:
        # NOTE(review): the original fallback stylesheet is not recoverable;
        # an empty one is used -- restore the project's default CSS here.
        css = ""

    # Extract the body content.  Either tag may be missing on its own: the
    # splitter appends a bare "</body></html>" trailer to later sections, and
    # that trailer must not be embedded inside the new <body>.
    body_start = content_part.find("<body>")
    body_end = content_part.find("</body>")
    start = body_start + len("<body>") if body_start != -1 else 0
    end = body_end if body_end != -1 else len(content_part)
    body_content = content_part[start:end]

    # Add proper HTML structure
    # NOTE(review): the tags of this template are reconstructed around the
    # surviving title text and placeholders -- confirm against the original.
    html = f"""<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<title>2025年光荣板汇总 - 第{section_num}页</title>
{css}
</head>
<body>
{body_content}
</body>
</html>
"""
    return html


def generate_png_from_html(html_file, output_dir):
    """Use Playwright to generate PNG from HTML.

    Args:
        html_file: Path of the HTML file to render.
        output_dir: Directory the screenshot is written to.

    Returns:
        The path of the written PNG as a string, or ``None`` if rendering
        failed for any reason (the error is printed).
    """
    try:
        from playwright.sync_api import sync_playwright

        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()

                # Set large viewport to handle long content
                page.set_viewport_size({"width": 4000, "height": 6000})

                # Navigate to the HTML file.  as_uri() percent-encodes
                # non-ASCII and reserved characters in the path.
                file_url = Path(html_file).resolve().as_uri()
                page.goto(file_url, wait_until="networkidle")

                # Apply 500% zoom
                page.evaluate("document.body.style.zoom = '500%'")
                page.wait_for_timeout(2000)  # Wait for zoom to apply

                # Take full page screenshot
                png_path = str(
                    Path(output_dir) / f"{Path(html_file).stem}_500原图.png"
                )
                page.screenshot(path=png_path, full_page=True)
            finally:
                # Close the browser even when navigation or capture fails.
                browser.close()

            return png_path

    except Exception as e:
        # Best-effort boundary: callers treat None as "no image produced".
        print(f"Error generating PNG with Playwright: {e}")
        return None


def make_transparent(png_path):
    """Create transparent version of PNG.

    Pixels whose red, green and blue channels are all >= 249 become fully
    transparent; every other pixel becomes fully opaque.

    Args:
        png_path: Path of the source PNG.

    Returns:
        The path of the written transparent PNG as a string, or ``None`` on
        failure (the error is printed).
    """
    try:
        # Disable PIL decompression bomb warning for large images.
        # This is a process-wide Pillow setting.
        Image.MAX_IMAGE_PIXELS = None

        with Image.open(png_path) as img:
            data = np.array(img.convert("RGBA"))

        white_mask = (data[:, :, :3] >= 249).all(axis=2)
        data[:, :, 3] = np.where(white_mask, 0, 255)

        result = Image.fromarray(data, "RGBA")

        # Derive the output name from the suffix so that an input without the
        # expected suffix is never overwritten by its own transparent copy.
        png_path = str(png_path)
        if png_path.endswith(_ORIGINAL_SUFFIX):
            transparent_path = (
                png_path[: -len(_ORIGINAL_SUFFIX)] + _TRANSPARENT_SUFFIX
            )
        else:
            source = Path(png_path)
            transparent_path = str(
                source.with_name(source.stem + _TRANSPARENT_SUFFIX)
            )
        result.save(transparent_path)

        return transparent_path

    except Exception as e:
        print(f"Error creating transparent PNG: {e}")
        return None


def main():
    parser = argparse.ArgumentParser(description="Split HTML file and convert to PNGs")
    parser.add_argument("input_html", help="Input HTML file path")
    parser.add_argument(
        "-o", "--output-dir", help="Output directory (default: split-output)"
    )

    args = parser.parse_args()

    input_path = Path(args.input_html)
    output_dir = Path(args.output_dir) if args.output_dir else Path("split-output")

    if not input_path.exists():
        print(f"Error: Input file {input_path} does not exist")
        return 1

    # Create output directory
    output_dir.mkdir(exist_ok=True)

    # Read input HTML
    with open(input_path, "r", encoding="utf-8") as f:
        content = f.read()

    # Extract content between body tags
    body_start = content.find("<body>")
    body_end = content.find("</body>")
    if body_start != -1 and body_end != -1:
        # NOTE(review): this slice starts after "<body>" but keeps
        # "</body>", although the original comment said both tags are
        # included -- confirm the intent.
        body_content = content[body_start + 6 : body_end + 7]
    else:
        body_content = content

    # Prepare complete HTML content with head and body structure
    # NOTE(review): the source available for review ends inside the statement
    # that builds `complete_content`; the remainder of main() is not
    # reproduced here and must be restored from the original file.