- React + MUI DataGrid app with faction filter, search, change filter
- Biggest movers cards (drops/rises) scoped to current filter view
- Historical points graph modal (5 MFM versions: 1.14 → current)
- URL state sync (faction, dir, q params — shareable URLs)
- Grimdark favicon + OG embed image (Google Imagen)
- Multi-stage Dockerfile (node build → nginx serve)
- docker-compose.yml with Traefik + Cloudflare TLS
- Data pipeline: build_deduped_data.py merges PDF + live scrape
- Ynnari merged into Aeldari (shared codex)
- Mobile responsive: flex columns, no fixed pixel widths
- Color semantics: green=cheaper, red=costlier (consistent everywhere)
- 1,449 units across 31 factions
189 lines
8.4 KiB
Python
189 lines
8.4 KiB
Python
#!/usr/bin/env python3
|
|
"""Visit each WH40K faction page slowly and print to PDF.
|
|
|
|
Uses the system chromium binary via Playwright so we don't need to download
|
|
the bundled headless shell. Each page gets a polite delay, waits for
|
|
content, then prints to PDF with background graphics enabled.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import re
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
|
|
from playwright.sync_api import sync_playwright
|
|
|
|
# Every faction page lives directly under this base URL, keyed by its slug.
_MFM_BASE_URL = "https://mfm.warhammer-community.com/en"

# (slug, display name) pairs, in crawl order.
_FACTION_NAMES = (
    ("adepta-sororitas", "Adepta Sororitas"),
    ("adeptus-custodes", "Adeptus Custodes"),
    ("adeptus-mechanicus", "Adeptus Mechanicus"),
    ("aeldari", "Aeldari"),
    ("astra-militarum", "Astra Militarum"),
    ("black-templars", "Black Templars"),
    ("blood-angels", "Blood Angels"),
    ("chaos-daemons", "Chaos Daemons"),
    ("chaos-knights", "Chaos Knights"),
    ("chaos-space-marines", "Chaos Space Marines"),
    ("chaos-titan-legions", "Chaos Titan Legions"),
    ("dark-angels", "Dark Angels"),
    ("death-guard", "Death Guard"),
    ("deathwatch", "Deathwatch"),
    ("drukhari", "Drukhari"),
    ("emperors-children", "Emperor's Children"),
    ("genestealer-cults", "Genestealer Cults"),
    ("grey-knights", "Grey Knights"),
    ("imperial-agents", "Imperial Agents"),
    ("imperial-knights", "Imperial Knights"),
    ("leagues-of-votann", "Leagues of Votann"),
    ("necrons", "Necrons"),
    ("orks", "Orks"),
    ("space-marines", "Space Marines"),
    ("space-wolves", "Space Wolves"),
    ("tau-empire", "T'au Empire"),
    ("thousand-sons", "Thousand Sons"),
    ("titan-legions", "Titan Legions"),
    ("tyranids", "Tyranids"),
    ("world-eaters", "World Eaters"),
)

# (slug, display name, page URL) triples consumed by the crawl loop.
FACTIONS = [
    (slug, name, f"{_MFM_BASE_URL}/{slug}") for slug, name in _FACTION_NAMES
]
|
|
|
|
# Output locations. Both directories are created as soon as the module loads.
_BASE_DIR = Path("/root/wh40k-factions")
OUT_DIR = _BASE_DIR / "pdfs"
LOG_DIR = _BASE_DIR / "logs"
for _directory in (OUT_DIR, LOG_DIR):
    _directory.mkdir(parents=True, exist_ok=True)

# System-installed Chromium used instead of Playwright's bundled browser.
CHROMIUM_BIN = "/usr/bin/chromium"

# Polite crawl pacing
PAGE_DELAY_S = 3.0  # pause between consecutive faction pages
NETWORK_IDLE_TIMEOUT_MS = 20_000  # max wait for the network to go quiet
SLOW_LOAD_BUFFER_MS = 4_000  # extra settle time before printing to PDF
|
|
|
|
|
|
def slugify(s: str) -> str:
    """Return *s* lower-cased with non-alphanumeric runs collapsed to ``-``.

    Only ASCII letters and digits survive; every other run of characters
    becomes a single hyphen, and no hyphen is left at either end.
    """
    words = re.split(r"[^a-z0-9]+", s.lower())
    # Splitting leaves empty strings at the edges when the input starts or
    # ends with a separator; dropping them is what strips the outer hyphens.
    return "-".join(word for word in words if word)
|
|
|
|
|
|
def fetch_one(p, slug: str, name: str, url: str, idx: int, total: int) -> dict:
    """Visit a single URL, render it, and save the page as a PDF.

    Args:
        p: Playwright handle as yielded by ``sync_playwright()``.
        slug: Stem used for the PDF file, the JSON log file and the
            per-page browser profile directory.
        name: Human-readable faction name (progress output only).
        url: Page to load.
        idx: 1-based position of this page in the crawl (progress output only).
        total: Number of pages in the crawl (progress output only).

    Returns:
        A status dict with the keys ``slug``, ``name``, ``url``, ``pdf``,
        ``ok``, ``size_bytes``, ``error`` and ``elapsed_s``. The same dict is
        also written as JSON to ``LOG_DIR/<slug>.log``.

    Browser and rendering failures are caught and reported through
    ``status["error"]`` rather than raised, so one bad page does not abort
    the whole crawl.
    """
    pdf_path = OUT_DIR / f"{slug}.pdf"
    log_path = LOG_DIR / f"{slug}.log"
    status = {
        "slug": slug,
        "name": name,
        "url": url,
        "pdf": str(pdf_path),
        "ok": False,
        "size_bytes": 0,
        "error": None,
        "elapsed_s": 0.0,
    }
    print(f"[{idx:>2}/{total}] {name} -> {url}", flush=True)
    # Monotonic clock: the elapsed time must not jump if the wall clock is
    # adjusted while a page is loading.
    start = time.monotonic()
    # Tracked outside the try so the finally clause can always close it; a
    # failed navigation must not leave a Chromium process running.
    context = None
    try:
        context = p.chromium.launch_persistent_context(
            user_data_dir=f"/tmp/wh40k-chrome-{slug}",
            executable_path=CHROMIUM_BIN,
            headless=True,
            viewport={"width": 1280, "height": 1800},
            user_agent=(
                "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                "(KHTML, like Gecko) Chrome/148.0.0.0 Safari/537.36"
            ),
            accept_downloads=False,
            # NOTE(review): certificate errors are ignored here; confirm this
            # is still required for the target site.
            ignore_https_errors=True,
        )
        page = context.new_page()
        # Echo uncaught page exceptions into our own output.
        page.on("pageerror", lambda exc: print(f" pageerror: {exc}", flush=True))

        page.goto(url, wait_until="domcontentloaded", timeout=45000)
        # Give the Next.js client-side hydration time to run, then wait for
        # network to settle (images, fonts, etc.).
        try:
            page.wait_for_load_state("networkidle", timeout=NETWORK_IDLE_TIMEOUT_MS)
        except Exception as e:
            # Best effort: a page that never goes idle is still printed.
            print(f" networkidle timeout (continuing): {e}", flush=True)

        # Scroll to bottom to trigger lazy-loaded images / sections, then
        # back to top so the PDF starts at the header. The settle time is
        # passed as an argument instead of being spliced into the script.
        page.evaluate(
            """
            async (settleMs) => {
                const sleep = ms => new Promise(r => setTimeout(r, ms));
                const total = document.body.scrollHeight;
                for (let y = 0; y <= total; y += 800) {
                    window.scrollTo(0, y);
                    await sleep(120);
                }
                window.scrollTo(0, 0);
                await sleep(settleMs);
            }
            """,
            SLOW_LOAD_BUFFER_MS,
        )

        # Final settle
        page.wait_for_timeout(int(SLOW_LOAD_BUFFER_MS))

        page.pdf(
            path=str(pdf_path),
            format="A4",
            print_background=True,
            margin={"top": "10mm", "bottom": "10mm", "left": "10mm", "right": "10mm"},
            prefer_css_page_size=False,
        )
    except Exception as e:
        status["error"] = repr(e)
        print(f" FAIL {status['error']}", flush=True)
    finally:
        if context is not None:
            try:
                context.close()
            except Exception as e:
                # Cleanup is best effort; a PDF that was already written is
                # still validated below.
                print(f" context close failed (ignored): {e}", flush=True)

    if status["error"] is None:
        # Anything of 1 KiB or less is treated as a failed render.
        size_bytes = pdf_path.stat().st_size if pdf_path.exists() else 0
        if size_bytes > 1024:
            status["ok"] = True
            status["size_bytes"] = size_bytes
            print(f" OK {pdf_path.name} ({status['size_bytes']/1024:.1f} KiB)", flush=True)
        else:
            status["error"] = "pdf missing or too small"
            print(f" FAIL {status['error']}", flush=True)

    status["elapsed_s"] = round(time.monotonic() - start, 2)
    log_path.write_text(json.dumps(status, indent=2))
    return status
|
|
|
|
|
|
def main() -> int:
    """Crawl every entry in ``FACTIONS`` and print a summary.

    Each faction is fetched in order with a pause of ``PAGE_DELAY_S`` seconds
    between pages. The collected status dicts are written to
    ``LOG_DIR/_summary.json``.

    Returns:
        0 if every faction produced a PDF, otherwise 1.
    """
    total = len(FACTIONS)
    results = []
    with sync_playwright() as p:
        for position, (slug, name, url) in enumerate(FACTIONS, start=1):
            results.append(fetch_one(p, slug, name, url, position, total))
            # Inter-page politeness delay (skip after last)
            if position != total:
                time.sleep(PAGE_DELAY_S)

    # Summary
    ok_count = len([entry for entry in results if entry["ok"]])
    rule = "=" * 60
    print()
    print(rule)
    print(f"Done. {ok_count}/{len(results)} factions converted.")
    print(rule)
    for entry in results:
        if entry["ok"]:
            flag = "OK"
            detail = f"{entry['size_bytes']/1024:.1f} KiB"
        else:
            flag = "FAIL"
            detail = entry["error"]
        print(f" [{flag:>4}] {entry['name']:<28} {detail}")

    summary_path = LOG_DIR / "_summary.json"
    summary_path.write_text(json.dumps(results, indent=2))
    return 1 if ok_count != len(results) else 0
|
|
|
|
|
|
# Script entry point: the process exit status is main()'s return value.
if __name__ == "__main__":
    raise SystemExit(main())
|