Initial commit: skills library
- 70 skills with code and documentation - Add .gitignore (ignore __pycache__, output/, temp/, venv/) - Clean up test intermediates and caches
This commit is contained in:
@@ -0,0 +1,381 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
HTML Splitter and PNG Generator
|
||||
|
||||
This script splits a long HTML file containing multiple tables into smaller HTML files
|
||||
based on content logic, then converts each to high-resolution PNG images.
|
||||
"""
|
||||
|
||||
import os
|
||||
import argparse
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from PIL import Image, ImageDraw
|
||||
import numpy as np
|
||||
import subprocess
|
||||
import time
|
||||
from urllib.parse import quote
|
||||
|
||||
|
||||
# Closing tags after which the document can be cut without splitting an element.
_BREAK_TAGS = ("</div>", "</table>", "</body>")

# Closing tags that may trail the last real content of a document.
_TRAILER_TAGS = ("</div>", "</body>", "</html>")

# Headings used as split anchors for the known input layout.
_DEFAULT_SPLIT_PATTERNS = (
    "<h4>第八届李斯特国际钢琴公开赛</h4>",
    "<h4>第四届微风·长隆国际钢琴声乐公开赛",
    "<h4>2025第三届弘杨中国作品·深圳青少年钢琴邀请赛</h4>",
    "<h4>2025年中国音协考级:</h4>",
    "<h4>李佳茵荣誉",
)

# Appended to every non-final section so it ends like a complete document.
_SECTION_CLOSER = "\n</body>\n</html>"


def _break_after(content, start, limit=None):
    """Return the index just past the first break tag starting in [start, limit).

    Returns -1 when no tag from ``_BREAK_TAGS`` starts inside that window.
    """
    if limit is None:
        limit = len(content)
    candidates = []
    for tag in _BREAK_TAGS:
        # Bounding the search end keeps the tag's first character below `limit`.
        idx = content.find(tag, start, limit + len(tag) - 1)
        if idx != -1:
            candidates.append((idx, idx + len(tag)))
    return min(candidates)[1] if candidates else -1


def _is_trailer(text):
    """Return True when *text* holds nothing but closing tags and whitespace."""
    for tag in _TRAILER_TAGS:
        text = text.replace(tag, "")
    return not text.strip()


def split_html_by_content(content, split_patterns=None):
    """Split HTML content into consecutive sections at element boundaries.

    For every pattern found in *content*, a cut is placed right after the next
    ``</div>``, ``</table>`` or ``</body>`` closing tag that follows it, so the
    matched heading and the element after it end a section. If no pattern
    yields a cut and the content is longer than 10000 characters, the content
    is instead divided into 2-6 roughly equal parts, each cut being moved
    forward (by at most 2000 characters) to the next closing tag.

    Cuts are de-duplicated and sorted, and a cut followed only by closing tags
    and whitespace is discarded, so no empty or content-free section is
    produced.

    Args:
        content: The HTML text to split.
        split_patterns: Optional iterable of literal substrings to use as
            split anchors. Defaults to the built-in heading list.

    Returns:
        A list of section strings in document order. Every section except the
        last has "\\n</body>\\n</html>" appended. Blank input gives ``[]``.
    """
    if not content.strip():
        return []
    if split_patterns is None:
        split_patterns = _DEFAULT_SPLIT_PATTERNS

    total = len(content)
    cuts = set()

    for pattern in split_patterns:
        if not pattern:
            continue
        pos = content.find(pattern)
        if pos == -1:
            continue
        cut = _break_after(content, pos + len(pattern))
        if cut != -1:
            cuts.add(cut)

    if not cuts and total > 10000:
        # No anchor matched: aim for ~15k characters per part, 2 to 6 parts.
        num_parts = min(6, max(2, total // 15000))
        for i in range(1, num_parts):
            target = (total * i) // num_parts
            cut = _break_after(content, target, target + 2000)
            if cut != -1:
                cuts.add(cut)

    inner = sorted(
        cut for cut in cuts if 0 < cut < total and not _is_trailer(content[cut:])
    )
    boundaries = [0] + inner + [total]

    sections = []
    last_index = len(boundaries) - 2
    for index, (start, end) in enumerate(zip(boundaries, boundaries[1:])):
        section = content[start:end]
        if index < last_index:
            # The cut already sits on a closing tag, so nothing is truncated;
            # only the document closers are added.
            section += _SECTION_CLOSER
        sections.append(section)

    return sections
|
||||
|
||||
|
||||
# Stylesheet used when a section does not carry its own <style> block.
_FALLBACK_CSS = """<style>
        body {
            font-family: "Microsoft YaHei", "SimHei", "Arial", sans-serif;
            margin: 20px;
            line-height: 1.6;
            background-color: #fff;
        }

        .table-container {
            width: 34em;
            margin: 0 auto 10px auto;
        }

        h4 {
            text-align: center;
            color: #333;
            margin: 30px 0 8px 0;
            font-size: 18px;
            border-bottom: 2px solid #333;
            padding-bottom: 8px;
            width: 100%;
        }

        .table-3col {
            width: 34em;
            border-collapse: collapse;
            margin: 0 auto;
            table-layout: fixed;
        }

        .table-3col td {
            border-bottom: 1px solid #333;
            padding: 12px 8px 4px 8px;
            text-align: center;
            vertical-align: bottom;
            font-size: 14px;
            white-space: nowrap;
        }

        .table-3col td:nth-child(2) {
            padding: 10px 4px 4px 4px;
        }

        .table-3col col:nth-child(1) { width: 20%; }
        .table-3col col:nth-child(2) { width: 60%; }
        .table-3col col:nth-child(3) { width: 20%; }

        .table-2col {
            width: 34em;
            border-collapse: collapse;
            margin: 0 auto;
            table-layout: fixed;
        }

        .table-2col td {
            border-bottom: 1px solid #333;
            padding: 12px 8px 4px 8px;
            text-align: center;
            vertical-align: bottom;
            font-size: 14px;
            word-wrap: break-word;
            word-break: break-all;
        }

        .table-2col col:nth-child(1) { width: 70%; }
        .table-2col col:nth-child(2) { width: 30%; }

        .teacher-award {
            font-size: 12px;
            margin: 2px auto;
            text-align: center;
            color: #666;
            width: 34em;
            padding: 0 5px;
        }

        .teacher-award-long {
            width: 36em;
        }

        .teacher-award strong {
            color: #333;
        }

        .subtitle {
            text-align: center;
            margin: 10px 0;
            font-weight: bold;
            color: #333;
            width: 34em;
            margin-left: auto;
            margin-right: auto;
        }

        @media print {
            body {
                margin: 0;
            }
            table {
                font-size: 12px;
            }
        }
    </style>"""


def create_complete_html_section(content_part, section_num):
    """Create a complete HTML document from a content part.

    The first ``<style>`` block found in *content_part* (with or without
    attributes, any letter case) is reused as the stylesheet; otherwise the
    built-in fallback CSS is used.

    The body is taken from between the ``<body ...>`` and ``</body>`` tags
    when an opening body tag is present. Without one, everything after a
    ``</head>`` tag (or the whole part when there is no head) is used. Stray
    ``</body>`` / ``</html>`` closers left in the fragment are removed so the
    generated document contains each closing tag exactly once.

    Args:
        content_part: HTML text of one section; may be a full document or a
            bare fragment.
        section_num: Page number inserted into the document title.

    Returns:
        The complete HTML document as a string.
    """
    # Imported here because the module's import block does not provide `re`.
    import re

    flags = re.IGNORECASE | re.DOTALL

    style_match = re.search(r"<style\b[^>]*>.*?</style\s*>", content_part, flags)
    css = style_match.group(0) if style_match else _FALLBACK_CSS

    body_open = re.search(r"<body\b[^>]*>", content_part, flags)
    if body_open:
        fragment = content_part[body_open.end():]
        body_close = re.search(r"</body\s*>", fragment, flags)
        if body_close:
            fragment = fragment[: body_close.start()]
    else:
        # No opening body tag: skip a leftover head so it is not nested
        # inside the new <body>.
        head_close = re.search(r"</head\s*>", content_part, flags)
        fragment = content_part[head_close.end():] if head_close else content_part

    # Drop closers that the template below emits itself.
    body_content = re.sub(r"</(?:body|html)\s*>", "", fragment, flags=flags).rstrip()

    html = f"""<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>2025年光荣板汇总 - 第{section_num}页</title>
    {css}
</head>
<body>
{body_content}
</body>
</html>"""

    return html
|
||||
|
||||
|
||||
def generate_png_from_html(html_file, output_dir):
    """Render an HTML file to a full-page PNG with Playwright (Chromium).

    The page is loaded from a ``file://`` URL, zoomed to 500% via the body's
    CSS ``zoom`` property, and captured as a full-page screenshot named
    ``<html stem>_500原图.png`` inside *output_dir*.

    Args:
        html_file: Path of the HTML file to render.
        output_dir: Directory that receives the PNG.

    Returns:
        The PNG path as a string, or None if anything fails (Playwright not
        installed, browser launch error, navigation error, ...). The error is
        printed rather than raised.
    """
    try:
        from playwright.sync_api import sync_playwright

        # as_uri() percent-encodes spaces, '#', '%' and non-ASCII characters
        # and yields a valid file URL on Windows as well.
        file_url = Path(os.path.abspath(html_file)).as_uri()
        png_path = str(Path(output_dir) / f"{Path(html_file).stem}_500原图.png")

        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()

                # Set large viewport to handle long content
                page.set_viewport_size({"width": 4000, "height": 6000})

                page.goto(file_url, wait_until="networkidle")

                # Apply 500% zoom
                page.evaluate("document.body.style.zoom = '500%'")
                page.wait_for_timeout(2000)  # Wait for zoom to apply

                page.screenshot(path=png_path, full_page=True)
            finally:
                # Close the browser even when a page step raises.
                browser.close()

        return png_path
    except Exception as e:
        # Best-effort by design: the caller reports the failure and carries on.
        print(f"Error generating PNG with Playwright: {e}")
        return None
|
||||
|
||||
|
||||
def make_transparent(png_path):
    """Write a copy of a PNG in which near-white pixels are fully transparent.

    A pixel whose red, green and blue values are all >= 249 gets alpha 0;
    every other pixel gets alpha 255. The result is saved next to the source
    as ``<name>_500透明.png``: a trailing ``_500原图.png`` in the file name is
    replaced, and any other name keeps its stem and gains the new suffix, so
    the source file is never overwritten.

    Args:
        png_path: Path (str or Path) of the source PNG.

    Returns:
        The path of the transparent PNG as a string, or None if anything
        fails. The error is printed rather than raised.
    """
    original_suffix = "_500原图.png"
    transparent_suffix = "_500透明.png"
    try:
        from PIL import Image

        # Disable PIL's decompression-bomb limit for very large screenshots.
        # Note that this is a process-wide setting.
        Image.MAX_IMAGE_PIXELS = None

        source = Path(png_path)
        if source.name.endswith(original_suffix):
            base = source.name[: -len(original_suffix)]
        else:
            base = source.stem
        # Only the file name is rewritten, never a directory component.
        transparent_path = str(source.with_name(base + transparent_suffix))

        with Image.open(source) as img:
            data = np.array(img.convert("RGBA"))

        white_mask = (data[:, :, :3] >= 249).all(axis=2)
        data[:, :, 3] = np.where(white_mask, np.uint8(0), np.uint8(255))

        # An (H, W, 4) uint8 array is read as RGBA without an explicit mode.
        Image.fromarray(data).save(transparent_path)

        return transparent_path
    except Exception as e:
        # Best-effort by design: the caller reports the failure and carries on.
        print(f"Error creating transparent PNG: {e}")
        return None
|
||||
|
||||
|
||||
def main(argv=None):
    """Command-line entry point: split an HTML file and render each part.

    Reads the input HTML, rebuilds it as one document (original ``<style>``
    block plus the complete ``<body>...</body>`` element), splits it with
    ``split_html_by_content``, writes every section as
    ``2025年光荣板汇总_第<N>页.html`` in the output directory and converts each
    to an original and a transparent PNG.

    Args:
        argv: Optional argument list; defaults to ``sys.argv[1:]`` when None.

    Returns:
        0 when every section produced a PNG. 1 when the input is missing or
        not a file, when no sections were produced, or when at least one
        section could not be rendered. A failed transparent copy alone is
        reported but does not change the status.
    """
    parser = argparse.ArgumentParser(description="Split HTML file and convert to PNGs")
    parser.add_argument("input_html", help="Input HTML file path")
    parser.add_argument(
        "-o", "--output-dir", help="Output directory (default: split-output)"
    )
    args = parser.parse_args(argv)

    input_path = Path(args.input_html)
    output_dir = Path(args.output_dir) if args.output_dir else Path("split-output")

    if not input_path.exists():
        print(f"Error: Input file {input_path} does not exist")
        return 1
    if not input_path.is_file():
        print(f"Error: Input path {input_path} is not a file")
        return 1

    # parents=True so a nested output path such as "out/pages" works.
    output_dir.mkdir(parents=True, exist_ok=True)

    content = input_path.read_text(encoding="utf-8")

    # Keep the whole body element, opening tag included, so later stages can
    # locate it.
    body_start = content.find("<body")
    body_end = content.find("</body>", body_start) if body_start != -1 else -1
    if body_end != -1:
        body_content = content[body_start : body_end + 7]  # 7 == len("</body>")
    else:
        body_content = content

    # Carry over the original stylesheet only when it is properly closed.
    style_start = content.find("<style>")
    style_end = content.find("</style>", style_start) if style_start != -1 else -1
    style_block = content[style_start : style_end + 8] if style_end != -1 else ""

    complete_content = f"""<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>2025年光荣板汇总 - 全部内容</title>
    {style_block}
</head>
{body_content}
</html>"""

    sections = split_html_by_content(complete_content)

    if not sections:
        print("Could not split content into meaningful sections")
        return 1

    print(f"Split content into {len(sections)} sections")

    failed = 0
    for i, section_content in enumerate(sections, 1):
        print(f"Processing section {i}/{len(sections)}...")

        section_html = create_complete_html_section(section_content, i)

        html_filename = output_dir / f"2025年光荣板汇总_第{i}页.html"
        html_filename.write_text(section_html, encoding="utf-8")

        png_path = generate_png_from_html(str(html_filename), output_dir)
        if not png_path:
            failed += 1
            print(f"  ✗ Failed to create PNG for section {i}")
            continue

        transparent_path = make_transparent(png_path)
        if transparent_path:
            print(
                f"  ✓ Created {Path(png_path).name} and {Path(transparent_path).name}"
            )
        else:
            print(f"  ✓ Created {Path(png_path).name} (transparent version failed)")

    print(f"\nCompleted! Output saved to: {output_dir}")
    if failed:
        print(f"{failed} of {len(sections)} sections could not be rendered")
        return 1
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # SystemExit passes main()'s status to the shell without relying on the
    # exit() helper that the site module installs.
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user