Files
wh40k-points-comparator/build_pdfs.py
root 38bffa491c Initial commit: WH40K Points Comparator
- React + MUI DataGrid app with faction filter, search, change filter
- Biggest movers cards (drops/rises) scoped to current filter view
- Historical points graph modal (5 MFM versions: 1.14 → current)
- URL state sync (faction, dir, q params — shareable URLs)
- Grimdark favicon + OG embed image (Google Imagen)
- Multi-stage Dockerfile (node build → nginx serve)
- docker-compose.yml with Traefik + Cloudflare TLS
- Data pipeline: build_deduped_data.py merges PDF + live scrape
- Ynnari merged into Aeldari (shared codex)
- Mobile responsive: flex columns, no fixed pixel widths
- Color semantics: green=cheaper, red=costlier (consistent everywhere)
- 1,449 units across 31 factions
2026-06-18 02:42:29 +00:00

189 lines
8.4 KiB
Python

#!/usr/bin/env python3
"""Visit each WH40K faction page slowly and print to PDF.
Uses the system chromium binary via Playwright so we don't need to download
the bundled headless shell. Each page gets a polite delay, waits for
content, then prints to PDF with background graphics enabled.
"""
from __future__ import annotations
import json
import re
import sys
import time
from pathlib import Path
from playwright.sync_api import sync_playwright
# Every faction page lives at "<base>/<slug>", so the URL is derived from the
# slug instead of being repeated by hand for each entry (keeps the two from
# drifting apart when a faction is added or renamed).
_MFM_BASE_URL = "https://mfm.warhammer-community.com/en"

# (slug, display name) for each faction to capture. The slug doubles as the
# output file stem (<slug>.pdf / <slug>.log) and as the last URL path segment.
_FACTION_NAMES = [
    ("adepta-sororitas", "Adepta Sororitas"),
    ("adeptus-custodes", "Adeptus Custodes"),
    ("adeptus-mechanicus", "Adeptus Mechanicus"),
    ("aeldari", "Aeldari"),
    ("astra-militarum", "Astra Militarum"),
    ("black-templars", "Black Templars"),
    ("blood-angels", "Blood Angels"),
    ("chaos-daemons", "Chaos Daemons"),
    ("chaos-knights", "Chaos Knights"),
    ("chaos-space-marines", "Chaos Space Marines"),
    ("chaos-titan-legions", "Chaos Titan Legions"),
    ("dark-angels", "Dark Angels"),
    ("death-guard", "Death Guard"),
    ("deathwatch", "Deathwatch"),
    ("drukhari", "Drukhari"),
    ("emperors-children", "Emperor's Children"),
    ("genestealer-cults", "Genestealer Cults"),
    ("grey-knights", "Grey Knights"),
    ("imperial-agents", "Imperial Agents"),
    ("imperial-knights", "Imperial Knights"),
    ("leagues-of-votann", "Leagues of Votann"),
    ("necrons", "Necrons"),
    ("orks", "Orks"),
    ("space-marines", "Space Marines"),
    ("space-wolves", "Space Wolves"),
    ("tau-empire", "T'au Empire"),
    ("thousand-sons", "Thousand Sons"),
    ("titan-legions", "Titan Legions"),
    ("tyranids", "Tyranids"),
    ("world-eaters", "World Eaters"),
]

# (slug, display name, page URL) triples, in crawl order.
FACTIONS = [
    (slug, name, f"{_MFM_BASE_URL}/{slug}") for slug, name in _FACTION_NAMES
]
# Output locations: fetch_one() writes <slug>.pdf into OUT_DIR and a JSON
# status file <slug>.log into LOG_DIR; main() adds _summary.json to LOG_DIR.
OUT_DIR = Path("/root/wh40k-factions/pdfs")
LOG_DIR = Path("/root/wh40k-factions/logs")
# NOTE: both directories are created as a side effect of importing this module.
OUT_DIR.mkdir(parents=True, exist_ok=True)
LOG_DIR.mkdir(parents=True, exist_ok=True)
# System Chromium binary handed to Playwright as executable_path.
CHROMIUM_BIN = "/usr/bin/chromium"
# Polite crawl pacing
PAGE_DELAY_S = 3.0 # pause between consecutive faction pages (used in main())
NETWORK_IDLE_TIMEOUT_MS = 20000 # max wait for the "networkidle" load state
SLOW_LOAD_BUFFER_MS = 4000 # settle time before printing; applied once in-page after scrolling and once more afterwards
def slugify(s: str) -> str:
    """Return *s* lower-cased with every non ``[a-z0-9]`` run collapsed to ``-``.

    Leading and trailing separators are dropped, so the result never starts
    or ends with a hyphen; input with no ASCII alphanumerics yields ``""``.
    """
    # Collect the alphanumeric runs and join them; this is the same as
    # replacing the gaps with "-" and trimming hyphens from both ends.
    words = re.findall(r"[a-z0-9]+", s.lower())
    return "-".join(words)
def fetch_one(p, slug: str, name: str, url: str, idx: int, total: int) -> dict:
    """Visit a single URL, render it, save a PDF. Return a status dict.

    Args:
        p: The Playwright object yielded by ``sync_playwright()``.
        slug: File stem for the outputs (``<slug>.pdf`` and ``<slug>.log``)
            and suffix of the per-faction browser profile directory.
        name: Human-readable faction name, used in progress output.
        url: Page to render.
        idx: 1-based position of this page in the crawl (progress output).
        total: Total number of pages in the crawl (progress output).

    Returns:
        A dict with keys ``slug``, ``name``, ``url``, ``pdf``, ``ok``,
        ``size_bytes``, ``error`` and ``elapsed_s``. The same dict is written
        as JSON to ``LOG_DIR/<slug>.log``. Failures while rendering are
        reported through ``ok``/``error`` rather than raised.
    """
    pdf_path = OUT_DIR / f"{slug}.pdf"
    log_path = LOG_DIR / f"{slug}.log"
    status = {
        "slug": slug,
        "name": name,
        "url": url,
        "pdf": str(pdf_path),
        "ok": False,
        "size_bytes": 0,
        "error": None,
        "elapsed_s": 0.0,
    }
    print(f"[{idx:>2}/{total}] {name} -> {url}", flush=True)
    # Monotonic clock: the elapsed time must not be skewed by wall-clock jumps.
    start = time.monotonic()
    # Tracks the launched browser context so the finally block can close it.
    # (Previously the cleanup checked an unrelated, always-None variable, so
    # any failure after launch leaked the Chromium process.)
    context = None
    try:
        context = p.chromium.launch_persistent_context(
            user_data_dir=f"/tmp/wh40k-chrome-{slug}",
            executable_path=CHROMIUM_BIN,
            headless=True,
            viewport={"width": 1280, "height": 1800},
            user_agent=(
                "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                "(KHTML, like Gecko) Chrome/148.0.0.0 Safari/537.36"
            ),
            accept_downloads=False,
            ignore_https_errors=True,
        )
        page = context.new_page()
        # Surface uncaught exceptions from the page in our own output.
        page.on("pageerror", lambda exc: print(f" pageerror: {exc}", flush=True))
        page.goto(url, wait_until="domcontentloaded", timeout=45000)
        # Give the Next.js client-side hydration time to run, then wait for
        # network to settle (images, fonts, etc.).
        try:
            page.wait_for_load_state("networkidle", timeout=NETWORK_IDLE_TIMEOUT_MS)
        except Exception as e:
            # Deliberately best-effort: a page that never goes idle is still
            # printed with whatever has loaded so far.
            print(f" networkidle timeout (continuing): {e}", flush=True)
        # Scroll to bottom to trigger lazy-loaded images / sections, then
        # back to top so the PDF starts at the header. The settle delay is
        # passed as an argument instead of being spliced into the script.
        page.evaluate(
            """
            async (settleMs) => {
                const sleep = ms => new Promise(r => setTimeout(r, ms));
                const total = document.body.scrollHeight;
                for (let y = 0; y <= total; y += 800) {
                    window.scrollTo(0, y);
                    await sleep(120);
                }
                window.scrollTo(0, 0);
                await sleep(settleMs);
            }
            """,
            SLOW_LOAD_BUFFER_MS,
        )
        # Final settle
        page.wait_for_timeout(int(SLOW_LOAD_BUFFER_MS))
        page.pdf(
            path=str(pdf_path),
            format="A4",
            print_background=True,
            margin={"top": "10mm", "bottom": "10mm", "left": "10mm", "right": "10mm"},
            prefer_css_page_size=False,
        )
        # Normal-path shutdown; an error here still marks the page as failed.
        context.close()
        context = None  # closed cleanly, nothing left for the finally block
        size = pdf_path.stat().st_size if pdf_path.exists() else 0
        # Anything at or below 1 KiB is treated as a blank/failed render.
        if size > 1024:
            status["ok"] = True
            status["size_bytes"] = size
            print(f" OK {pdf_path.name} ({status['size_bytes']/1024:.1f} KiB)", flush=True)
        else:
            status["error"] = "pdf missing or too small"
            print(f" FAIL {status['error']}", flush=True)
    except Exception as e:
        # Per-page boundary: record the failure and let the crawl continue.
        status["error"] = repr(e)
        print(f" FAIL {status['error']}", flush=True)
    finally:
        if context is not None:
            # Reached only when something failed after launch; make sure the
            # browser does not outlive this call.
            try:
                context.close()
            except Exception as close_err:
                print(f" context close failed: {close_err!r}", flush=True)
        status["elapsed_s"] = round(time.monotonic() - start, 2)
        log_path.write_text(json.dumps(status, indent=2))
    return status
def main() -> int:
    """Render every entry of FACTIONS to PDF and print a summary table.

    The per-faction status dicts are also dumped to ``_summary.json`` in
    LOG_DIR. Returns 0 when every faction produced a PDF, otherwise 1.
    """
    total = len(FACTIONS)
    results = []
    with sync_playwright() as p:
        for position, (slug, name, url) in enumerate(FACTIONS, start=1):
            results.append(fetch_one(p, slug, name, url, position, total))
            # Be polite between pages; no need to wait after the final one.
            if position != total:
                time.sleep(PAGE_DELAY_S)

    # Summary
    succeeded = len([entry for entry in results if entry["ok"]])
    banner = "=" * 60
    print()
    print(banner)
    print(f"Done. {succeeded}/{len(results)} factions converted.")
    print(banner)
    for entry in results:
        if entry["ok"]:
            flag = "OK"
            detail = f"{entry['size_bytes']/1024:.1f} KiB"
        else:
            flag = "FAIL"
            detail = entry["error"]
        print(f" [{flag:>4}] {entry['name']:<28} {detail}")

    summary_path = LOG_DIR / "_summary.json"
    summary_path.write_text(json.dumps(results, indent=2))
    return 1 if succeeded != len(results) else 0


if __name__ == "__main__":
    sys.exit(main())