# wh40k-points-comparator/build_pdfs.py

#!/usr/bin/env python3
"""Visit each WH40K faction page slowly and print to PDF.

Uses the system chromium binary via Playwright so we don't need to download
the bundled headless shell. Each page gets a polite delay, waits for
content, then prints to PDF with background graphics enabled.
"""

from __future__ import annotations

import json
import re
import sys
import time
from pathlib import Path

from playwright.sync_api import sync_playwright

# Every faction page lives directly under this prefix, at <prefix><slug>.
_FACTION_URL_PREFIX = "https://mfm.warhammer-community.com/en/"

# (slug, display name) for each faction, in crawl order.
_FACTION_NAMES = (
    ("adepta-sororitas", "Adepta Sororitas"),
    ("adeptus-custodes", "Adeptus Custodes"),
    ("adeptus-mechanicus", "Adeptus Mechanicus"),
    ("aeldari", "Aeldari"),
    ("astra-militarum", "Astra Militarum"),
    ("black-templars", "Black Templars"),
    ("blood-angels", "Blood Angels"),
    ("chaos-daemons", "Chaos Daemons"),
    ("chaos-knights", "Chaos Knights"),
    ("chaos-space-marines", "Chaos Space Marines"),
    ("chaos-titan-legions", "Chaos Titan Legions"),
    ("dark-angels", "Dark Angels"),
    ("death-guard", "Death Guard"),
    ("deathwatch", "Deathwatch"),
    ("drukhari", "Drukhari"),
    ("emperors-children", "Emperor's Children"),
    ("genestealer-cults", "Genestealer Cults"),
    ("grey-knights", "Grey Knights"),
    ("imperial-agents", "Imperial Agents"),
    ("imperial-knights", "Imperial Knights"),
    ("leagues-of-votann", "Leagues of Votann"),
    ("necrons", "Necrons"),
    ("orks", "Orks"),
    ("space-marines", "Space Marines"),
    ("space-wolves", "Space Wolves"),
    ("tau-empire", "T'au Empire"),
    ("thousand-sons", "Thousand Sons"),
    ("titan-legions", "Titan Legions"),
    ("tyranids", "Tyranids"),
    ("world-eaters", "World Eaters"),
)

# (slug, display name, url) triples. The slug is both the last URL path
# segment and the stem used for the output file names.
FACTIONS = [
    (slug, name, _FACTION_URL_PREFIX + slug)
    for slug, name in _FACTION_NAMES
]

# Destinations for the rendered PDFs and the JSON status logs.
OUT_DIR = Path("/root/wh40k-factions/pdfs")
LOG_DIR = Path("/root/wh40k-factions/logs")
# Both directories are created as an import-time side effect.
for _directory in (OUT_DIR, LOG_DIR):
    _directory.mkdir(parents=True, exist_ok=True)

# System Chromium executable handed to Playwright.
CHROMIUM_BIN = "/usr/bin/chromium"

# Polite crawl pacing
PAGE_DELAY_S = 3.0  # pause between consecutive pages, in seconds
NETWORK_IDLE_TIMEOUT_MS = 20000  # upper bound on the wait for network idle
SLOW_LOAD_BUFFER_MS = 4000  # extra settle time before printing to PDF


def slugify(s: str) -> str:
    """Return *s* lower-cased, with its ``a-z0-9`` runs joined by ``-``.

    Every other character acts as a separator, so the result never starts
    or ends with a dash and never contains two dashes in a row.
    """
    # Splitting on separator runs leaves empty strings only at the ends
    # (when the text starts or ends with a separator); dropping them is
    # equivalent to stripping the leading/trailing dash.
    pieces = re.split(r"[^a-z0-9]+", s.lower())
    return "-".join(piece for piece in pieces if piece)


def fetch_one(p, slug: str, name: str, url: str, idx: int, total: int) -> dict:
    """Visit a single URL, render it, and save it as a PDF.

    Args:
        p: Playwright handle as yielded by ``sync_playwright()``; only
            ``p.chromium.launch_persistent_context`` is used.
        slug: Stem for the PDF file, the per-page log file and the Chromium
            profile directory under ``/tmp``.
        name: Human-readable faction name, used in progress output.
        url: Page to render.
        idx: Position of this page in the run (progress output only).
        total: Number of pages in the run (progress output only).

    Returns:
        A status dict with the keys ``slug``, ``name``, ``url``, ``pdf``,
        ``ok``, ``size_bytes``, ``error`` and ``elapsed_s``. The same dict is
        also written as JSON to ``LOG_DIR/<slug>.log``.

    Rendering failures are caught and reported through ``ok``/``error``
    instead of being raised. A failure to write the log file still
    propagates.
    """
    pdf_path = OUT_DIR / f"{slug}.pdf"
    log_path = LOG_DIR / f"{slug}.log"
    status = {
        "slug": slug,
        "name": name,
        "url": url,
        "pdf": str(pdf_path),
        "ok": False,
        "size_bytes": 0,
        "error": None,
        "elapsed_s": 0.0,
    }
    print(f"[{idx:>2}/{total}] {name}  ->  {url}", flush=True)
    # Monotonic clock: the elapsed time must not be skewed by wall-clock
    # adjustments during the (long) render.
    start = time.monotonic()
    # Tracked outside the try so the finally clause can always shut the
    # browser down, including when navigation or printing raises.
    context = None
    try:
        context = p.chromium.launch_persistent_context(
            user_data_dir=f"/tmp/wh40k-chrome-{slug}",
            executable_path=CHROMIUM_BIN,
            headless=True,
            viewport={"width": 1280, "height": 1800},
            user_agent=(
                "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                "(KHTML, like Gecko) Chrome/148.0.0.0 Safari/537.36"
            ),
            accept_downloads=False,
            ignore_https_errors=True,
        )
        page = context.new_page()
        # Surface uncaught page exceptions in our own progress output.
        page.on("pageerror", lambda exc: print(f"   pageerror: {exc}", flush=True))

        page.goto(url, wait_until="domcontentloaded", timeout=45000)
        # Give the Next.js client-side hydration time to run, then wait for
        # network to settle (images, fonts, etc.). A page that never goes
        # idle is still printed, so a timeout here is not fatal.
        try:
            page.wait_for_load_state("networkidle", timeout=NETWORK_IDLE_TIMEOUT_MS)
        except Exception as e:
            print(f"   networkidle timeout (continuing): {e}", flush=True)

        # Scroll to bottom to trigger lazy-loaded images / sections, then
        # back to top so the PDF starts at the header. The settle time is
        # passed as an argument rather than spliced into the script text.
        page.evaluate(
            """
            async (settleMs) => {
                const sleep = ms => new Promise(r => setTimeout(r, ms));
                const total = document.body.scrollHeight;
                for (let y = 0; y <= total; y += 800) {
                    window.scrollTo(0, y);
                    await sleep(120);
                }
                window.scrollTo(0, 0);
                await sleep(settleMs);
            }
            """,
            SLOW_LOAD_BUFFER_MS,
        )

        # Final settle
        page.wait_for_timeout(int(SLOW_LOAD_BUFFER_MS))

        page.pdf(
            path=str(pdf_path),
            format="A4",
            print_background=True,
            margin={"top": "10mm", "bottom": "10mm", "left": "10mm", "right": "10mm"},
            prefer_css_page_size=False,
        )

        # A file of 1 KiB or less is treated as a failed render.
        size_bytes = pdf_path.stat().st_size if pdf_path.exists() else 0
        if size_bytes > 1024:
            status["ok"] = True
            status["size_bytes"] = size_bytes
            print(f"   OK  {pdf_path.name}  ({status['size_bytes']/1024:.1f} KiB)", flush=True)
        else:
            status["error"] = "pdf missing or too small"
            print(f"   FAIL  {status['error']}", flush=True)
    except Exception as e:
        status["error"] = repr(e)
        print(f"   FAIL  {status['error']}", flush=True)
    finally:
        if context is not None:
            try:
                context.close()
            except Exception as e:
                # Best effort: a failed shutdown must not mask the render
                # result, but it should be visible in the output.
                print(f"   warning: failed to close browser context: {e!r}", flush=True)

    status["elapsed_s"] = round(time.monotonic() - start, 2)
    log_path.write_text(json.dumps(status, indent=2))
    return status


def main() -> int:
    """Render every entry of ``FACTIONS`` to PDF and print a summary.

    The per-faction results are also written to ``LOG_DIR/_summary.json``.

    Returns:
        0 when every faction was converted, 1 otherwise.
    """
    total = len(FACTIONS)
    results = []
    with sync_playwright() as p:
        for position, (slug, name, url) in enumerate(FACTIONS, start=1):
            results.append(fetch_one(p, slug, name, url, position, total))
            # Pause between pages to keep the crawl polite; nothing follows
            # the last page, so no pause is needed after it.
            if position < total:
                time.sleep(PAGE_DELAY_S)

    # Summary
    succeeded = len([r for r in results if r["ok"]])
    rule = "=" * 60
    print()
    print(rule)
    print(f"Done. {succeeded}/{len(results)} factions converted.")
    print(rule)
    for r in results:
        if r["ok"]:
            flag = "OK"
            detail = f"{r['size_bytes']/1024:.1f} KiB"
        else:
            flag = "FAIL"
            detail = r["error"]
        print(f"  [{flag:>4}]  {r['name']:<28} {detail}")

    summary_path = LOG_DIR / "_summary.json"
    summary_path.write_text(json.dumps(results, indent=2))
    if succeeded == len(results):
        return 0
    return 1


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())