# skills/html-splitter/scripts/html_splitter.py

#!/usr/bin/env python3
"""
HTML Splitter and PNG Generator

This script splits a long HTML file containing multiple tables into smaller HTML files
based on content logic, then converts each to high-resolution PNG images.
"""

import os
import argparse
import shutil
from pathlib import Path
from PIL import Image, ImageDraw
import numpy as np
import subprocess
import time
from urllib.parse import quote


def _heading_break(content, heading_pos):
    """Return the offset where a section that opens with this heading should start.

    If the heading is the first child of a wrapper ``<div ...>`` (nothing but
    whitespace between the opening tag and the heading), the break is moved back
    to the start of that tag so the wrapper stays together with its heading.
    Otherwise the heading's own offset is returned.
    """
    tag_start = content.rfind("<", 0, heading_pos)
    if tag_start != -1:
        opener = content[tag_start:heading_pos].strip()
        if (
            opener.startswith(("<div>", "<div "))
            and opener.endswith(">")
            and opener.count(">") == 1
        ):
            return tag_start
    return heading_pos


def split_html_by_content(content):
    """Split HTML content into logical sections based on headings and table groups.

    Strategy:
      1. A new section starts at each known heading (first occurrence only),
         provided the section accumulated so far already contains a complete
         table. Headings that follow only the document preamble, or another
         heading without a table in between, stay with the preceding content so
         no table-less page is produced.
      2. If no heading produced a split and the content is longer than 10000
         characters, it is cut into 2-6 roughly equal parts, each cut placed
         right after the first ``</div>`` or ``</table>`` found within 2000
         characters of the ideal position.
      3. Otherwise the content is returned as a single section.

    Args:
        content: HTML text to split.

    Returns:
        list[str]: Contiguous slices of ``content`` in document order. Every
        section except the last has ``"\\n</body>\\n</html>"`` appended. Empty or
        whitespace-only input yields an empty list.
    """
    if not content.strip():
        return []

    # Headings that mark the beginning of a logical section.
    split_patterns = [
        "<h4>第八届李斯特国际钢琴公开赛</h4>",
        "<h4>第四届微风·长隆国际钢琴声乐公开赛",
        "<h4>2025第三届弘杨中国作品·深圳青少年钢琴邀请赛</h4>",
        "<h4>2025年中国音协考级：</h4>",
        "<h4>李佳茵荣誉",
    ]
    # Closing tags after which a size-based cut is acceptable. "</body>" is left
    # out on purpose: cutting after it would only create a content-free tail.
    break_tags = ("</div>", "</table>")
    closing_tags = "\n</body>\n</html>"
    content_len = len(content)

    # The set removes duplicates; sorting copes with headings that appear in the
    # document in a different order than in split_patterns.
    heading_breaks = sorted(
        {
            _heading_break(content, pos)
            for pos in (content.find(pattern) for pattern in split_patterns)
            if pos != -1
        }
    )

    split_positions = [0]
    for pos in heading_breaks:
        # Only cut when the section being closed already holds a full table.
        if content.find("</table>", split_positions[-1], pos) != -1:
            split_positions.append(pos)

    if len(split_positions) == 1 and content_len > 10000:
        # No usable heading: aim for ~15k characters per part, 2 to 6 parts.
        num_parts = min(6, max(2, content_len // 15000))
        for i in range(1, num_parts):
            target = (content_len * i) // num_parts
            for j in range(target, min(content_len, target + 2000)):
                tag = next((t for t in break_tags if content.startswith(t, j)), None)
                if tag is not None:
                    cut = j + len(tag)  # cut after the complete closing tag
                    if split_positions[-1] < cut < content_len:
                        split_positions.append(cut)
                    break

    split_positions.append(content_len)

    sections = []
    last_index = len(split_positions) - 2
    for index, (start, end) in enumerate(zip(split_positions, split_positions[1:])):
        section = content[start:end]
        if index < last_index:
            # Terminate the partial document; nothing is trimmed from the slice.
            section += closing_tags
        sections.append(section)

    return sections


def _strip_trailing_closers(fragment):
    """Remove trailing ``</body>`` / ``</html>`` end tags from an HTML fragment.

    The fragment is returned unchanged when it does not end with one of those
    tags (ignoring trailing whitespace).
    """
    closers = ("</html>", "</body>")
    trimmed = fragment.rstrip()
    if not trimmed.endswith(closers):
        return fragment
    while trimmed.endswith(closers):
        # The string ends with a closer, so the last "<" is where it begins.
        trimmed = trimmed[: trimmed.rfind("<")].rstrip()
    return trimmed


def create_complete_html_section(content_part, section_num):
    """Create a complete HTML document from a content part.

    Args:
        content_part: HTML text for one section. It may be a full document, a
            document prefix, or a bare fragment.
        section_num: Page number inserted into the document ``<title>``.

    Returns:
        str: A standalone HTML document. Its ``<head>`` carries the first
        ``<style>...</style>`` block found in ``content_part`` (or a built-in
        fallback stylesheet), and its ``<body>`` carries the part's body content.
    """
    # Reuse the stylesheet embedded in the part when there is a complete one.
    css_start = content_part.find("<style>")
    css_end = content_part.find("</style>", css_start) if css_start != -1 else -1

    if css_end != -1:
        css = content_part[css_start : css_end + len("</style>")]
    else:
        # Fallback CSS
        css = """<style>
        body {
            font-family: "Microsoft YaHei", "SimHei", "Arial", sans-serif;
            margin: 20px;
            line-height: 1.6;
            background-color: #fff;
        }

        .table-container {
            width: 34em;
            margin: 0 auto 10px auto;
        }

        h4 {
            text-align: center;
            color: #333;
            margin: 30px 0 8px 0;
            font-size: 18px;
            border-bottom: 2px solid #333;
            padding-bottom: 8px;
            width: 100%;
        }

        .table-3col {
            width: 34em;
            border-collapse: collapse;
            margin: 0 auto;
            table-layout: fixed;
        }

        .table-3col td {
            border-bottom: 1px solid #333;
            padding: 12px 8px 4px 8px;
            text-align: center;
            vertical-align: bottom;
            font-size: 14px;
            white-space: nowrap;
        }

        .table-3col td:nth-child(2) {
            padding: 10px 4px 4px 4px;
        }

        .table-3col col:nth-child(1) { width: 20%; }
        .table-3col col:nth-child(2) { width: 60%; }
        .table-3col col:nth-child(3) { width: 20%; }

        .table-2col {
            width: 34em;
            border-collapse: collapse;
            margin: 0 auto;
            table-layout: fixed;
        }

        .table-2col td {
            border-bottom: 1px solid #333;
            padding: 12px 8px 4px 8px;
            text-align: center;
            vertical-align: bottom;
            font-size: 14px;
            word-wrap: break-word;
            word-break: break-all;
        }

        .table-2col col:nth-child(1) { width: 70%; }
        .table-2col col:nth-child(2) { width: 30%; }

        .teacher-award {
            font-size: 12px;
            margin: 2px auto;
            text-align: center;
            color: #666;
            width: 34em;
            padding: 0 5px;
        }

        .teacher-award-long {
            width: 36em;
        }

        .teacher-award strong {
            color: #333;
        }

        .subtitle {
            text-align: center;
            margin: 10px 0;
            font-weight: bold;
            color: #333;
            width: 34em;
            margin-left: auto;
            margin-right: auto;
        }

        @media print {
            body {
                margin: 0;
            }
            table {
                font-size: 12px;
            }
        }
    </style>"""

    # Locate the opening body tag, with or without attributes.
    body_open = content_part.find("<body")
    body_open_end = content_part.find(">", body_open) if body_open != -1 else -1

    if body_open_end != -1:
        inner_start = body_open_end + 1
        body_close = content_part.find("</body>", inner_start)
        if body_close != -1:
            body_content = content_part[inner_start:body_close]
        else:
            # Opening tag without a closing one: keep everything after it.
            body_content = _strip_trailing_closers(content_part[inner_start:])
    else:
        # No body tag: the whole part is body content. Trailing end tags are
        # dropped so the template below does not close <body>/<html> twice.
        body_content = _strip_trailing_closers(content_part)

    # Add proper HTML structure
    html = f"""<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>2025年光荣板汇总 - 第{section_num}页</title>
    {css}
</head>
<body>
{body_content}
</body>
</html>"""

    return html


def generate_png_from_html(html_file, output_dir):
    """Use Playwright to generate PNG from HTML.

    The page is opened in headless Chromium with a 4000x6000 viewport, the body
    is zoomed to 500%, and a full-page screenshot is written to
    ``<output_dir>/<html stem>_500原图.png``.

    Args:
        html_file: Path of the HTML file to render.
        output_dir: Directory that receives the PNG.

    Returns:
        str | None: Path of the written PNG, or ``None`` if anything failed
        (including Playwright not being importable). The error is printed.
    """
    try:
        from playwright.sync_api import sync_playwright

        # as_uri() yields a well-formed, percent-encoded file:// URL on every
        # platform; the file names used here contain non-ASCII characters.
        file_url = Path(html_file).resolve().as_uri()
        png_path = str(Path(output_dir) / f"{Path(html_file).stem}_500原图.png")

        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()

                # Set large viewport to handle long content
                page.set_viewport_size({"width": 4000, "height": 6000})

                # Navigate to the HTML file
                page.goto(file_url, wait_until="networkidle")

                # Apply 500% zoom
                page.evaluate("document.body.style.zoom = '500%'")
                page.wait_for_timeout(2000)  # Wait for zoom to apply

                # Take full page screenshot
                page.screenshot(path=png_path, full_page=True)
            finally:
                # Close the browser even when navigation or the screenshot fails.
                browser.close()

        return png_path
    except Exception as e:
        # Best effort: the caller reports the failed section and carries on.
        print(f"Error generating PNG with Playwright: {e}")
        return None


def make_transparent(png_path):
    """Create transparent version of PNG.

    Every pixel whose red, green and blue values are all >= 249 becomes fully
    transparent; every other pixel becomes fully opaque.

    Args:
        png_path: Path of the source PNG (``str`` or path-like).

    Returns:
        str | None: Path of the written transparent PNG, or ``None`` on failure
        (the error is printed). A trailing ``_500原图.png`` in the source name is
        replaced by ``_500透明.png``; for any other name ``_500透明`` is inserted
        before the extension, so the source file is never overwritten.
    """
    try:
        # Disable PIL decompression bomb warning for large images.
        # NOTE: this is a process-wide Pillow setting, not local to this call.
        from PIL import Image

        Image.MAX_IMAGE_PIXELS = None

        source_path = str(png_path)
        original_suffix = "_500原图.png"
        if source_path.endswith(original_suffix):
            transparent_path = source_path[: -len(original_suffix)] + "_500透明.png"
        else:
            stem, ext = os.path.splitext(source_path)
            transparent_path = f"{stem}_500透明{ext or '.png'}"

        # The context manager releases the file handle once pixels are copied.
        with Image.open(source_path) as img:
            data = np.array(img.convert("RGBA"))

        # Near-white pixels get alpha 0, everything else alpha 255.
        white_mask = (data[:, :, :3] >= 249).all(axis=2)
        data[:, :, 3] = np.where(white_mask, 0, 255).astype(np.uint8)

        # Mode is inferred from the (height, width, 4) uint8 array as RGBA.
        result = Image.fromarray(data)
        result.save(transparent_path)

        return transparent_path
    except Exception as e:
        # Best effort: the caller keeps the opaque PNG when this step fails.
        print(f"Error creating transparent PNG: {e}")
        return None


def main():
    """Command-line entry point: split an HTML file and render each part to PNG.

    Reads the input file, rebuilds it as a normalized document (original
    ``<style>`` block plus body), splits it with ``split_html_by_content``,
    writes one HTML file per section into the output directory, and renders
    each to an opaque and a transparent PNG.

    Returns:
        int: 0 when processing completed (individual PNG failures are only
        reported), 1 when the input file is missing or no sections were
        produced.
    """
    parser = argparse.ArgumentParser(description="Split HTML file and convert to PNGs")
    parser.add_argument("input_html", help="Input HTML file path")
    parser.add_argument(
        "-o", "--output-dir", help="Output directory (default: split-output)"
    )

    args = parser.parse_args()

    input_path = Path(args.input_html)
    output_dir = Path(args.output_dir) if args.output_dir else Path("split-output")

    if not input_path.exists():
        print(f"Error: Input file {input_path} does not exist")
        return 1

    # Create output directory, including missing parents for nested paths.
    output_dir.mkdir(parents=True, exist_ok=True)

    # Read input HTML
    with open(input_path, "r", encoding="utf-8") as f:
        content = f.read()

    # Extract the body, accepting an opening tag with attributes. It is rebuilt
    # with plain <body>...</body> tags so that later stages, which look for
    # those tags, can find both of them.
    body_open = content.find("<body")
    body_open_end = content.find(">", body_open) if body_open != -1 else -1
    body_close = content.find("</body>", body_open_end) if body_open_end != -1 else -1

    if body_close != -1:
        body_content = "<body>" + content[body_open_end + 1 : body_close] + "</body>"
    else:
        # No complete body element: treat the whole file as body content.
        body_content = content

    # Carry over the first complete <style> block, if any.
    style_start = content.find("<style>")
    style_end = content.find("</style>", style_start) if style_start != -1 else -1
    if style_end != -1:
        style_block = content[style_start : style_end + len("</style>")]
    else:
        style_block = ""

    # Prepare complete HTML content with head and body structure
    complete_content = f"""<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>2025年光荣板汇总 - 全部内容</title>
    {style_block}
</head>
{body_content}
</html>"""

    # Split into sections
    sections = split_html_by_content(complete_content)

    if not sections:
        print("Could not split content into meaningful sections")
        return 1

    print(f"Split content into {len(sections)} sections")

    # Process each section
    for i, section_content in enumerate(sections, 1):
        print(f"Processing section {i}/{len(sections)}...")

        # Create complete HTML for this section
        section_html = create_complete_html_section(section_content, i)

        # Write HTML file
        html_filename = output_dir / f"2025年光荣板汇总_第{i}页.html"
        with open(html_filename, "w", encoding="utf-8") as f:
            f.write(section_html)

        # Generate PNG from HTML
        png_path = generate_png_from_html(str(html_filename), output_dir)

        if png_path:
            # Generate transparent version
            transparent_path = make_transparent(png_path)
            if transparent_path:
                print(
                    f"  ✓ Created {Path(png_path).name} and {Path(transparent_path).name}"
                )
            else:
                print(f"  ✓ Created {Path(png_path).name} (transparent version failed)")
        else:
            print(f"  ✗ Failed to create PNG for section {i}")

    print(f"\nCompleted! Output saved to: {output_dir}")
    return 0


if __name__ == "__main__":
    # exit() is a helper injected by the site module for interactive sessions
    # and is missing under `python -S`; raising SystemExit needs no import and
    # still passes main()'s return value on as the process exit status.
    raise SystemExit(main())