04db423416
- 70 skills with code and documentation - Add .gitignore (ignore __pycache__, output/, temp/, venv/) - Clean up test intermediates and caches
382 lines
12 KiB
Python
382 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
HTML Splitter and PNG Generator
|
|
|
|
This script splits a long HTML file containing multiple tables into smaller HTML files
|
|
based on content logic, then converts each to high-resolution PNG images.
|
|
"""
|
|
|
|
import os
|
|
import argparse
|
|
import shutil
|
|
from pathlib import Path
|
|
from PIL import Image, ImageDraw
|
|
import numpy as np
|
|
import subprocess
|
|
import time
|
|
from urllib.parse import quote
|
|
|
|
|
|
def _fallback_split_positions(content, window=2000):
    """Return roughly evenly spaced split offsets for content without known headings.

    Content of 10,000 characters or fewer is not split. Otherwise the content
    is divided into 2-6 parts (aiming for about 15,000 characters each) and
    every split is moved forward to just after the first complete
    ``</div>``, ``</table>`` or ``</body>`` tag that starts within ``window``
    characters of the ideal offset. A split with no such tag nearby is
    skipped so that an element is never cut in half.
    """
    content_len = len(content)
    if content_len <= 10000:
        return []

    num_parts = min(6, max(2, content_len // 15000))
    closing_tags = ("</div>", "</table>", "</body>")
    positions = []
    for part_index in range(1, num_parts):
        target = (content_len * part_index) // num_parts
        # Collect (tag start, offset just past the tag) for every closing tag
        # that starts inside the search window, then take the earliest one.
        candidates = []
        for tag in closing_tags:
            tag_start = content.find(tag, target, target + window + len(tag) - 1)
            if tag_start != -1:
                candidates.append((tag_start, tag_start + len(tag)))
        if not candidates:
            continue
        split_at = min(candidates)[1]
        previous = positions[-1] if positions else 0
        if previous < split_at < content_len:
            positions.append(split_at)
    return positions


def split_html_by_content(content):
    """Split HTML content into logical sections based on headings and table groups.

    A new section starts immediately before each known competition heading
    found in ``content``. A heading only starts a new section when the section
    it would close already contains at least one complete table, so leading
    material without a table (for example the document head) stays attached
    to the first real section instead of becoming a page of its own.

    When none of the known headings is present, long content is split at
    roughly even intervals on closing-tag boundaries instead.

    Args:
        content: The HTML text to split.

    Returns:
        A list of HTML fragments in document order. No content is dropped:
        the fragments are consecutive slices of ``content``. Every fragment
        except the last has ``"\\n</body>\\n</html>"`` appended so it is
        closed as a document. Empty input yields an empty list.
    """
    if not content:
        return []

    # Headings that mark the start of a logical section in the source document.
    heading_markers = (
        "<h4>第八届李斯特国际钢琴公开赛</h4>",
        "<h4>第四届微风·长隆国际钢琴声乐公开赛",
        "<h4>2025第三届弘杨中国作品·深圳青少年钢琴邀请赛</h4>",
        "<h4>2025年中国音协考级:</h4>",
        "<h4>李佳茵荣誉",
    )
    closing_markup = "\n</body>\n</html>"

    found = [content.find(marker) for marker in heading_markers]
    if any(position != -1 for position in found):
        split_positions = []
        section_start = 0
        # Sorted and de-duplicated so slices are never empty or reversed.
        for position in sorted({p for p in found if p > 0}):
            if content.find("</table>", section_start, position) != -1:
                split_positions.append(position)
                section_start = position
    else:
        split_positions = _fallback_split_positions(content)

    boundaries = [0, *split_positions, len(content)]
    last_index = len(boundaries) - 2
    sections = []
    for index, (start, end) in enumerate(zip(boundaries, boundaries[1:])):
        section = content[start:end]
        if index < last_index:
            section += closing_markup
        sections.append(section)
    return sections
|
|
|
|
|
|
# Stylesheet used when a section does not carry its own <style> block.
_FALLBACK_CSS = """<style>
        body {
            font-family: "Microsoft YaHei", "SimHei", "Arial", sans-serif;
            margin: 20px;
            line-height: 1.6;
            background-color: #fff;
        }

        .table-container {
            width: 34em;
            margin: 0 auto 10px auto;
        }

        h4 {
            text-align: center;
            color: #333;
            margin: 30px 0 8px 0;
            font-size: 18px;
            border-bottom: 2px solid #333;
            padding-bottom: 8px;
            width: 100%;
        }

        .table-3col {
            width: 34em;
            border-collapse: collapse;
            margin: 0 auto;
            table-layout: fixed;
        }

        .table-3col td {
            border-bottom: 1px solid #333;
            padding: 12px 8px 4px 8px;
            text-align: center;
            vertical-align: bottom;
            font-size: 14px;
            white-space: nowrap;
        }

        .table-3col td:nth-child(2) {
            padding: 10px 4px 4px 4px;
        }

        .table-3col col:nth-child(1) { width: 20%; }
        .table-3col col:nth-child(2) { width: 60%; }
        .table-3col col:nth-child(3) { width: 20%; }

        .table-2col {
            width: 34em;
            border-collapse: collapse;
            margin: 0 auto;
            table-layout: fixed;
        }

        .table-2col td {
            border-bottom: 1px solid #333;
            padding: 12px 8px 4px 8px;
            text-align: center;
            vertical-align: bottom;
            font-size: 14px;
            word-wrap: break-word;
            word-break: break-all;
        }

        .table-2col col:nth-child(1) { width: 70%; }
        .table-2col col:nth-child(2) { width: 30%; }

        .teacher-award {
            font-size: 12px;
            margin: 2px auto;
            text-align: center;
            color: #666;
            width: 34em;
            padding: 0 5px;
        }

        .teacher-award-long {
            width: 36em;
        }

        .teacher-award strong {
            color: #333;
        }

        .subtitle {
            text-align: center;
            margin: 10px 0;
            font-weight: bold;
            color: #333;
            width: 34em;
            margin-left: auto;
            margin-right: auto;
        }

        @media print {
            body {
                margin: 0;
            }
            table {
                font-size: 12px;
            }
        }
    </style>"""


def _extract_body_markup(content_part):
    """Return the markup of ``content_part`` that belongs inside ``<body>``.

    The start is just after the opening ``<body ...>`` tag when present,
    otherwise just after ``</head>`` when present, otherwise the beginning.
    The end is the first ``</body>`` after the start, otherwise the first
    ``</html>``, otherwise the end of the text. This keeps document-level
    tags out of the result even when only one of the body tags is present.
    """
    start = 0
    body_open = content_part.find("<body")
    if body_open != -1:
        tag_close = content_part.find(">", body_open)
        if tag_close != -1:
            start = tag_close + 1
    else:
        head_close = content_part.find("</head>")
        if head_close != -1:
            start = head_close + len("</head>")

    end = content_part.find("</body>", start)
    if end == -1:
        end = content_part.find("</html>", start)
    if end == -1:
        end = len(content_part)
    return content_part[start:end]


def create_complete_html_section(content_part, section_num):
    """Create a complete HTML document from a content part.

    Args:
        content_part: HTML for one section. It may be a bare fragment, a
            fragment followed by closing ``</body></html>`` tags, or a full
            document including ``<head>``.
        section_num: Page number interpolated into the document title.

    Returns:
        A standalone HTML document string. The ``<style>`` block found in
        ``content_part`` is reused when it has both an opening and a closing
        tag; otherwise a built-in fallback stylesheet is used. Only the body
        markup of ``content_part`` is placed inside the new ``<body>``.
    """
    css = _FALLBACK_CSS
    css_start = content_part.find("<style>")
    if css_start != -1:
        # Search from the opening tag so an earlier stray "</style>" cannot
        # produce an empty or reversed slice.
        css_end = content_part.find("</style>", css_start)
        if css_end != -1:
            css = content_part[css_start : css_end + len("</style>")]

    body_content = _extract_body_markup(content_part)

    html = f"""<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>2025年光荣板汇总 - 第{section_num}页</title>
    {css}
</head>
<body>
{body_content}
</body>
</html>"""

    return html
|
|
|
|
|
|
def generate_png_from_html(html_file, output_dir):
    """Use Playwright to generate PNG from HTML.

    The page is loaded in headless Chromium with a 4000x6000 viewport, the
    body is zoomed to 500%, and a full-page screenshot is written to
    ``<output_dir>/<html stem>_500原图.png``.

    Args:
        html_file: Path to the HTML file to render.
        output_dir: Directory that receives the PNG.

    Returns:
        The PNG path as a string, or ``None`` if rendering failed for any
        reason (including Playwright not being installed); the error is
        printed rather than raised.
    """
    try:
        from playwright.sync_api import sync_playwright

        # as_uri() percent-encodes non-ASCII characters and spaces and emits
        # the correct file:// form on every platform.
        file_url = Path(html_file).resolve().as_uri()
        png_path = str(Path(output_dir) / f"{Path(html_file).stem}_500原图.png")

        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()

                # Set large viewport to handle long content
                page.set_viewport_size({"width": 4000, "height": 6000})

                page.goto(file_url, wait_until="networkidle")

                # Apply 500% zoom, then give the layout time to settle.
                page.evaluate("document.body.style.zoom = '500%'")
                page.wait_for_timeout(2000)

                page.screenshot(path=png_path, full_page=True)
            finally:
                # Close the browser even when navigation or capture fails.
                browser.close()

        return png_path
    except Exception as e:
        # Best-effort step: report the failure and let the caller continue.
        print(f"Error generating PNG with Playwright: {e}")
        return None
|
|
|
|
|
|
def make_transparent(png_path):
    """Create transparent version of PNG.

    Pixels whose red, green and blue values are all >= 249 become fully
    transparent; every other pixel becomes fully opaque.

    Args:
        png_path: Path of the source PNG (string or path-like).

    Returns:
        The path of the written image as a string, or ``None`` on failure
        (the error is printed rather than raised). A trailing
        ``_500原图.png`` in the input is replaced by ``_500透明.png``. For
        any other name, ``_500透明.png`` is appended to the name without its
        extension, so the source file is never overwritten.
    """
    try:
        # Disable PIL decompression bomb warning for large images
        from PIL import Image

        Image.MAX_IMAGE_PIXELS = None

        source_path = os.fspath(png_path)
        original_suffix = "_500原图.png"
        transparent_suffix = "_500透明.png"
        if source_path.endswith(original_suffix):
            transparent_path = (
                source_path[: -len(original_suffix)] + transparent_suffix
            )
        else:
            transparent_path = os.path.splitext(source_path)[0] + transparent_suffix

        # The context manager releases the file handle once pixels are copied.
        with Image.open(source_path) as img:
            data = np.array(img.convert("RGBA"))

        # Near-white pixels get alpha 0, everything else alpha 255.
        white_mask = (data[:, :, :3] >= 249).all(axis=2)
        data[:, :, 3] = np.where(white_mask, 0, 255).astype(np.uint8)

        Image.fromarray(data, "RGBA").save(transparent_path)

        return transparent_path
    except Exception as e:
        # Best-effort step: report the failure and let the caller continue.
        print(f"Error creating transparent PNG: {e}")
        return None
|
|
|
|
|
|
def main():
    """Split an HTML file into sections and render each one to PNG.

    Command line: ``input_html`` (required) and ``-o/--output-dir``
    (defaults to ``split-output``). For every section an HTML file, a PNG
    screenshot and a transparent PNG are written to the output directory.

    Returns:
        0 on success, 1 when the input file is missing or the content could
        not be split into sections.
    """
    parser = argparse.ArgumentParser(description="Split HTML file and convert to PNGs")
    parser.add_argument("input_html", help="Input HTML file path")
    parser.add_argument(
        "-o", "--output-dir", help="Output directory (default: split-output)"
    )

    args = parser.parse_args()

    input_path = Path(args.input_html)
    output_dir = Path(args.output_dir) if args.output_dir else Path("split-output")

    # is_file() also rejects directories, which would otherwise crash on open().
    if not input_path.is_file():
        print(f"Error: Input file {input_path} does not exist or is not a file")
        return 1

    # parents=True so a nested output path such as "out/pages" works.
    output_dir.mkdir(parents=True, exist_ok=True)

    with open(input_path, "r", encoding="utf-8") as f:
        content = f.read()

    # Keep the body including BOTH its opening and closing tags.
    body_start = content.find("<body>")
    body_end = content.find("</body>", body_start) if body_start != -1 else -1
    if body_start != -1 and body_end != -1:
        body_content = content[body_start : body_end + len("</body>")]
    else:
        # No recognisable body: treat the whole input as body markup.
        body_content = f"<body>\n{content}\n</body>"

    # Reuse the source stylesheet only when it is properly delimited.
    style_block = ""
    style_start = content.find("<style>")
    if style_start != -1:
        style_end = content.find("</style>", style_start)
        if style_end != -1:
            style_block = content[style_start : style_end + len("</style>")]

    # Prepare complete HTML content with head and body structure
    complete_content = f"""<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>2025年光荣板汇总 - 全部内容</title>
    {style_block}
</head>
{body_content}
</html>"""

    sections = split_html_by_content(complete_content)

    if not sections:
        print("Could not split content into meaningful sections")
        return 1

    print(f"Split content into {len(sections)} sections")

    for i, section_content in enumerate(sections, 1):
        print(f"Processing section {i}/{len(sections)}...")

        section_html = create_complete_html_section(section_content, i)

        html_filename = output_dir / f"2025年光荣板汇总_第{i}页.html"
        with open(html_filename, "w", encoding="utf-8") as f:
            f.write(section_html)

        png_path = generate_png_from_html(str(html_filename), output_dir)
        if not png_path:
            print(f"  ✗ Failed to create PNG for section {i}")
            continue

        transparent_path = make_transparent(png_path)
        if transparent_path:
            print(
                f"  ✓ Created {Path(png_path).name} and {Path(transparent_path).name}"
            )
        else:
            print(f"  ✓ Created {Path(png_path).name} (transparent version failed)")

    print(f"\nCompleted! Output saved to: {output_dir}")
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # SystemExit carries main()'s return code without relying on the
    # site-provided exit() helper, which is meant for interactive use.
    raise SystemExit(main())
|