Files
skills/html-splitter/scripts/html_splitter.py
T
hmo 04db423416 Initial commit: skills library
- 70 skills with code and documentation
- Add .gitignore (ignore __pycache__, output/, temp/, venv/)
- Clean up test intermediates and caches
2026-04-26 19:27:40 +08:00

382 lines
12 KiB
Python

#!/usr/bin/env python3
"""
HTML Splitter and PNG Generator
This script splits a long HTML file containing multiple tables into smaller HTML files
based on content logic, then converts each to high-resolution PNG images.
"""
import os
import argparse
import shutil
from pathlib import Path
from PIL import Image, ImageDraw
import numpy as np
import subprocess
import time
from urllib.parse import quote
def _find_heading_cuts(content, heading_patterns):
    """Return the sorted, unique offsets at which the known headings start."""
    offsets = {content.find(pattern) for pattern in heading_patterns}
    # -1 means "not found"; a heading at offset 0 needs no cut in front of it.
    return sorted(pos for pos in offsets if pos > 0)


def _find_fallback_cuts(content):
    """Return roughly evenly spaced cut offsets placed right after a closing tag."""
    total = len(content)
    if total <= 10000:
        # Short documents are kept in one piece.
        return []
    break_tags = ("</div>", "</table>", "</body>")
    num_parts = min(6, max(2, total // 15000))  # Aim for ~15k chars per part
    cuts = []
    for part in range(1, num_parts):
        target = (total * part) // num_parts
        # Scan forward (at most 2000 chars) for the first closing tag.
        for j in range(target, min(total, target + 2000)):
            tag = next((t for t in break_tags if content.startswith(t, j)), None)
            if tag is not None:
                cuts.append(j + len(tag))
                break
    return cuts


def split_html_by_content(content):
    """Split HTML content into logical sections based on headings and table groups.

    Cut points are chosen in one of two ways:

    * If any of the known competition headings occurs in ``content``, a cut is
      placed immediately before each heading that was found.
    * Otherwise, documents longer than 10000 characters are cut into 2-6
      roughly equal parts, each cut being moved forward to just after the
      next ``</div>``, ``</table>`` or ``</body>`` tag and then snapped back
      to the end of the last complete table so no table is torn apart.

    A fragment that contains no complete table (for example the document
    head in front of the first heading) is never emitted on its own; it is
    merged into the following section. No input text is discarded: the
    sections, minus the closing tags appended here, concatenate back to
    ``content``.

    Args:
        content: The full HTML text to split.

    Returns:
        A list of HTML fragments in document order. Every fragment except
        the last has ``"\\n</body>\\n</html>"`` appended. An empty input
        yields an empty list; an input without cut points yields
        ``[content]``.
    """
    table_end = "</table>"
    closers = "\n</body>\n</html>"
    split_patterns = [
        "<h4>第八届李斯特国际钢琴公开赛</h4>",
        "<h4>第四届微风·长隆国际钢琴声乐公开赛",
        "<h4>2025第三届弘杨中国作品·深圳青少年钢琴邀请赛</h4>",
        "<h4>2025年中国音协考级:</h4>",
        "<h4>李佳茵荣誉",
    ]

    cuts = _find_heading_cuts(content, split_patterns)
    split_by_heading = bool(cuts)
    if not split_by_heading:
        cuts = _find_fallback_cuts(content)

    sections = []
    start = 0
    for cut in cuts:
        if cut <= start:
            continue
        chunk = content[start:cut]
        last_table = chunk.rfind(table_end)
        if last_table == -1:
            # Nothing worth a page of its own yet; let it join the next section.
            continue
        if not split_by_heading:
            # Fallback cuts are only approximate: end the section at its last
            # complete table and carry the remainder into the next section
            # instead of dropping it.
            cut = start + last_table + len(table_end)
            chunk = content[start:cut]
        sections.append(chunk + closers)
        start = cut

    remainder = content[start:]
    if remainder:
        sections.append(remainder)
    return sections
# Stylesheet used for sections that do not carry their own <style> block.
_FALLBACK_CSS = """<style>
body {
font-family: "Microsoft YaHei", "SimHei", "Arial", sans-serif;
margin: 20px;
line-height: 1.6;
background-color: #fff;
}
.table-container {
width: 34em;
margin: 0 auto 10px auto;
}
h4 {
text-align: center;
color: #333;
margin: 30px 0 8px 0;
font-size: 18px;
border-bottom: 2px solid #333;
padding-bottom: 8px;
width: 100%;
}
.table-3col {
width: 34em;
border-collapse: collapse;
margin: 0 auto;
table-layout: fixed;
}
.table-3col td {
border-bottom: 1px solid #333;
padding: 12px 8px 4px 8px;
text-align: center;
vertical-align: bottom;
font-size: 14px;
white-space: nowrap;
}
.table-3col td:nth-child(2) {
padding: 10px 4px 4px 4px;
}
.table-3col col:nth-child(1) { width: 20%; }
.table-3col col:nth-child(2) { width: 60%; }
.table-3col col:nth-child(3) { width: 20%; }
.table-2col {
width: 34em;
border-collapse: collapse;
margin: 0 auto;
table-layout: fixed;
}
.table-2col td {
border-bottom: 1px solid #333;
padding: 12px 8px 4px 8px;
text-align: center;
vertical-align: bottom;
font-size: 14px;
word-wrap: break-word;
word-break: break-all;
}
.table-2col col:nth-child(1) { width: 70%; }
.table-2col col:nth-child(2) { width: 30%; }
.teacher-award {
font-size: 12px;
margin: 2px auto;
text-align: center;
color: #666;
width: 34em;
padding: 0 5px;
}
.teacher-award-long {
width: 36em;
}
.teacher-award strong {
color: #333;
}
.subtitle {
text-align: center;
margin: 10px 0;
font-weight: bold;
color: #333;
width: 34em;
margin-left: auto;
margin-right: auto;
}
@media print {
body {
margin: 0;
}
table {
font-size: 12px;
}
}
</style>"""


def _extract_style_block(content_part):
    """Return the first ``<style>...</style>`` block of the part, or ``None``."""
    style_open = content_part.find("<style>")
    if style_open == -1:
        return None
    # Anchor the search after the opening tag so an earlier stray "</style>"
    # cannot produce an empty or reversed slice.
    style_close = content_part.find("</style>", style_open)
    if style_close == -1:
        return None
    return content_part[style_open : style_close + len("</style>")]


def _extract_body_markup(content_part):
    """Return the markup that belongs between ``<body>`` and ``</body>``.

    Works for complete documents as well as for fragments that have only an
    opening tag, only closing tags, or no body tags at all.
    """
    body_open = content_part.find("<body>")
    if body_open != -1:
        start = body_open + len("<body>")
    else:
        # No opening tag: skip a leading document head if the fragment has one.
        head_close = content_part.find("</head>")
        start = head_close + len("</head>") if head_close != -1 else 0

    end = content_part.find("</body>", start)
    if end == -1:
        # No closing body tag: at least keep a trailing </html> out of the body.
        end = content_part.find("</html>", start)
    if end == -1:
        end = len(content_part)
    return content_part[start:end]


def create_complete_html_section(content_part, section_num):
    """Create a complete HTML document from a content part.

    The part's own ``<style>`` block is reused when it has one; otherwise a
    built-in fallback stylesheet is used. The body markup is taken from the
    part without any surrounding ``<head>``, ``<body>``, ``</body>`` or
    ``</html>`` tags, so the generated page contains each of them exactly
    once even when the part is a fragment that only carries closing tags.

    Args:
        content_part: A full HTML document or a fragment of one.
        section_num: Page number shown in the generated ``<title>``.

    Returns:
        The text of a standalone HTML document.
    """
    css = _extract_style_block(content_part) or _FALLBACK_CSS
    body_content = _extract_body_markup(content_part)

    # Add proper HTML structure
    html = f"""<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>2025年光荣板汇总 - 第{section_num}页</title>
{css}
</head>
<body>
{body_content}
</body>
</html>"""
    return html
def generate_png_from_html(html_file, output_dir):
    """Use Playwright to render an HTML file to a full-page PNG at 500% zoom.

    Args:
        html_file: Path of the HTML file to render.
        output_dir: Directory in which the PNG is written. The file is named
            ``<html stem>_500原图.png``.

    Returns:
        The PNG path as a string, or ``None`` if rendering failed for any
        reason (including Playwright not being installed). The error is
        printed rather than raised so the caller can continue with the next
        file.
    """
    try:
        from playwright.sync_api import sync_playwright

        # as_uri() yields a well-formed file:// URL: it percent-encodes
        # non-ASCII characters and spaces and handles Windows drive paths,
        # which plain "file://" + path concatenation does not.
        file_url = Path(os.path.abspath(html_file)).as_uri()
        png_path = str(Path(output_dir) / f"{Path(html_file).stem}_500原图.png")

        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()
                # Set large viewport to handle long content
                page.set_viewport_size({"width": 4000, "height": 6000})
                page.goto(file_url, wait_until="networkidle")
                # Apply 500% zoom
                page.evaluate("document.body.style.zoom = '500%'")
                page.wait_for_timeout(2000)  # Wait for zoom to apply
                page.screenshot(path=png_path, full_page=True)
            finally:
                # Close the browser even when a page operation fails.
                browser.close()
        return png_path
    except Exception as e:
        print(f"Error generating PNG with Playwright: {e}")
        return None
def make_transparent(png_path):
    """Create a copy of a PNG in which near-white pixels are fully transparent.

    Pixels whose red, green and blue values are all >= 249 get alpha 0; every
    other pixel gets alpha 255.

    Args:
        png_path: Path of the source PNG. When it ends in ``_500原图.png``
            that suffix is replaced by ``_500透明.png``; otherwise
            ``_500透明.png`` is added to the name in place of the file
            extension, so the source image is never overwritten.

    Returns:
        The path of the transparent PNG, or ``None`` if anything failed (the
        error is printed rather than raised).
    """
    source_suffix = "_500原图.png"
    target_suffix = "_500透明.png"
    try:
        # Disable PIL decompression bomb warning for large images
        from PIL import Image

        Image.MAX_IMAGE_PIXELS = None

        png_path = os.fspath(png_path)
        if png_path.endswith(source_suffix):
            # Only rewrite the trailing suffix, never a match inside a
            # directory name.
            transparent_path = png_path[: -len(source_suffix)] + target_suffix
        else:
            transparent_path = os.path.splitext(png_path)[0] + target_suffix

        # The context manager releases the file handle once pixels are loaded.
        with Image.open(png_path) as img:
            data = np.array(img.convert("RGBA"))

        white_mask = (
            (data[:, :, 0] >= 249) & (data[:, :, 1] >= 249) & (data[:, :, 2] >= 249)
        )
        data[:, :, 3] = 255
        data[white_mask, 3] = 0

        result = Image.fromarray(data, "RGBA")
        result.save(transparent_path)
        return transparent_path
    except Exception as e:
        print(f"Error creating transparent PNG: {e}")
        return None
def main():
    """Command-line entry point: split an HTML file and render each part to PNG.

    Reads the input HTML, rebuilds a normalized document from its ``<style>``
    block and ``<body>`` element, splits it into sections, and for every
    section writes an HTML file, a PNG screenshot and a transparent PNG into
    the output directory.

    Returns:
        Process exit status: 0 on completion, 1 if the input file does not
        exist or the content could not be split. A failed PNG for a single
        section is reported but does not change the exit status.
    """
    parser = argparse.ArgumentParser(description="Split HTML file and convert to PNGs")
    parser.add_argument("input_html", help="Input HTML file path")
    parser.add_argument(
        "-o", "--output-dir", help="Output directory (default: split-output)"
    )
    args = parser.parse_args()

    input_path = Path(args.input_html)
    output_dir = Path(args.output_dir) if args.output_dir else Path("split-output")
    if not input_path.exists():
        print(f"Error: Input file {input_path} does not exist")
        return 1

    # Create output directory (including missing parents for nested paths)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Read input HTML
    with open(input_path, "r", encoding="utf-8") as f:
        content = f.read()

    # Extract the body element, keeping both the opening and the closing tag
    # so later stages can locate the body reliably.
    body_start = content.find("<body>")
    body_end = content.find("</body>")
    if body_start != -1 and body_end != -1:
        body_content = content[body_start : body_end + len("</body>")]
    else:
        body_content = content

    # Carry over the document's own stylesheet when it has a complete one.
    style_block = ""
    style_start = content.find("<style>")
    if style_start != -1:
        style_end = content.find("</style>", style_start)
        if style_end != -1:
            style_block = content[style_start : style_end + len("</style>")]

    # Prepare complete HTML content with head and body structure
    complete_content = f"""<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>2025年光荣板汇总 - 全部内容</title>
{style_block}
</head>
{body_content}
</html>"""

    # Split into sections
    sections = split_html_by_content(complete_content)
    if not sections:
        print("Could not split content into meaningful sections")
        return 1
    print(f"Split content into {len(sections)} sections")

    # Process each section
    for i, section_content in enumerate(sections, 1):
        print(f"Processing section {i}/{len(sections)}...")

        # Create complete HTML for this section
        section_html = create_complete_html_section(section_content, i)

        # Write HTML file
        html_filename = output_dir / f"2025年光荣板汇总_第{i}页.html"
        with open(html_filename, "w", encoding="utf-8") as f:
            f.write(section_html)

        # Generate PNG from HTML
        png_path = generate_png_from_html(str(html_filename), output_dir)
        if not png_path:
            print(f" ✗ Failed to create PNG for section {i}")
            continue

        # Generate transparent version
        transparent_path = make_transparent(png_path)
        if transparent_path:
            print(
                f" ✓ Created {Path(png_path).name} and {Path(transparent_path).name}"
            )
        else:
            print(f" ✓ Created {Path(png_path).name} (transparent version failed)")

    print(f"\nCompleted! Output saved to: {output_dir}")
    return 0
if __name__ == "__main__":
    # Propagate main()'s status as the process exit code. SystemExit is used
    # directly because the bare `exit` helper is injected by the `site`
    # module for interactive use and is not guaranteed to exist in programs.
    raise SystemExit(main())