# wh40k-points-comparator/parse_pdf_per_faction.py

#!/usr/bin/env python3
"""
Extract the MFM PDF (Full_armies_10th.pdf) into per-faction JSON files,
mirroring the shape of the live scraper output in /root/wh40k-factions/live/.

Output:
  /root/wh40k-factions/pdf/<slug>.json   (one per faction)
  /root/wh40k-factions/pdf/_manifest.json

Schema per file (matches live scraper):
  {
    "slug": "astra-militarum",
    "name": "Astra Militarum",
    "source": "Full_armies_10th.pdf",
    "version": "4.3",             # number after "VERSION" on page 1 (no "v" prefix); null if not found
    "extracted_at": "2026-06-17T...",
    "pages": [8, 9],              # all PDF pages mapped to this faction (incl. zero-unit pages)
    "n_units": 84,
    "n_rows": 162,
    "units": {
      "Valkyrie": [
        {"size": "1 model", "pts": 190}
      ],
      "...": [...]
    }
  }

Page-to-faction mapping is fixed: page 9 is Astra Militarum forge world
(was incorrectly mapped to Black Templars in the original parse_pdf.py).
Detachment-enhancements-only pages are mapped to their parent faction but
contribute zero units (parser skips them).
"""
import json
import re
import sys
import time
from pathlib import Path

import pymupdf

sys.path.insert(0, str(Path(__file__).parent))
# Reuse the parser + regexes + helpers from the original script.
from parse_pdf import parse_page, slug, clean_line

# Absolute path of the cached MFM PDF that this script parses.
PDF = "/root/.hermes/cache/documents/doc_ed3e1a0bd12e_Full_armies_10th.pdf"
# Directory that receives the per-faction JSON files and _manifest.json.
OUT_DIR = Path("/root/wh40k-factions/pdf")
# NOTE: this runs at import time, so merely importing the module creates the
# directory (and any missing parents).
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Faction slug for each 1-based PDF page number. Hand-maintained mapping (the
# original had page 9 wrong: page 9 contains Astra Militarum forge-world
# units, not Black Templars).
# A value of None, or a page number missing from this table, makes main()
# skip that page. Several pages may share one slug; main() merges their units
# into a single faction record.
PAGE_TO_SLUG = {
    1:  None,   # title page
    2:  "adepta-sororitas",
    3:  "adeptus-custodes",
    4:  "adeptus-mechanicus",
    5:  "adeptus-titanicus",     # Forge World Titans (Adeptus Titanicus)
    6:  "aeldari",
    7:  "ynnari",                # Ynnari subset of Aeldari
    8:  "astra-militarum",
    9:  "astra-militarum",       # AM forge world (Valkyrie, Wyvern, etc.)
    10: "black-templars",
    11: "blood-angels",
    12: "chaos-daemons",
    13: "chaos-daemons",         # detachment enhancements (skipped by parser)
    14: "chaos-knights",
    15: "chaos-space-marines",
    16: "chaos-space-marines",   # detachment enhancements
    17: "dark-angels",
    18: "death-guard",
    19: "deathwatch",
    20: "drukhari",
    21: "emperors-children",
    22: "genestealer-cults",
    23: "grey-knights",
    24: "imperial-agents",       # rules page (no units)
    25: "imperial-agents",       # units
    26: "imperial-knights",
    27: "leagues-of-votann",
    28: "necrons",
    29: "necrons",               # detachment enhancements
    30: "orks",
    31: "orks",                  # detachment enhancements
    32: "space-marines",
    33: "space-marines",         # forge world units (Predator etc.)
    34: "space-wolves",
    35: "tau-empire",
    36: "thousand-sons",
    37: "tyranids",
    38: "tyranids",              # detachment enhancements
    39: "world-eaters",
}

# Display name per slug; main() copies it into each faction file's "name"
# field. Every slug used as a value in PAGE_TO_SLUG needs an entry here,
# because main() looks it up with NAME_BY_SLUG[slug] (a missing key raises
# KeyError).
# For names with apostrophes, use curly form for display.
# NOTE(review): the apostrophe entries below ("Emperor's Children",
# "T'au Empire") appear to use the plain ASCII apostrophe, not the curly form
# -- confirm which one the live scraper output expects.
NAME_BY_SLUG = {
    "adepta-sororitas":     "Adepta Sororitas",
    "adeptus-custodes":     "Adeptus Custodes",
    "adeptus-mechanicus":   "Adeptus Mechanicus",
    "adeptus-titanicus":    "Adeptus Titanicus",
    "aeldari":              "Aeldari",
    "ynnari":               "Ynnari",
    "astra-militarum":      "Astra Militarum",
    "black-templars":       "Black Templars",
    "blood-angels":         "Blood Angels",
    "chaos-daemons":        "Chaos Daemons",
    "chaos-knights":        "Chaos Knights",
    "chaos-space-marines":  "Chaos Space Marines",
    "dark-angels":          "Dark Angels",
    "death-guard":          "Death Guard",
    "deathwatch":           "Deathwatch",
    "drukhari":             "Drukhari",
    "emperors-children":    "Emperor's Children",
    "genestealer-cults":    "Genestealer Cults",
    "grey-knights":         "Grey Knights",
    "imperial-agents":      "Imperial Agents",
    "imperial-knights":     "Imperial Knights",
    "leagues-of-votann":    "Leagues of Votann",
    "necrons":              "Necrons",
    "orks":                 "Orks",
    "space-marines":        "Space Marines",
    "space-wolves":         "Space Wolves",
    "tau-empire":           "T'au Empire",
    "thousand-sons":        "Thousand Sons",
    "tyranids":             "Tyranids",
    "world-eaters":         "World Eaters",
}


def extract_version(doc) -> str | None:
    """Pull the MFM version from the title page (e.g. 'VERSION 4.3')."""
    try:
        text = doc[0].get_text()
    except Exception:
        return None
    m = re.search(r"VERSION\s+([\d.]+)", text, re.IGNORECASE)
    return m.group(1) if m else None


def _collect_units(doc, version: str | None) -> dict[str, dict]:
    """Parse every page of *doc* and merge the results into per-faction records.

    Args:
        doc: The opened PDF document (``page_count`` and ``doc[i].get_text()``
            are used).
        version: Version string stored verbatim in each faction record.

    Returns:
        Mapping of faction slug to its record dict. ``n_units`` / ``n_rows``
        are left at 0 here and filled in when the files are written.
    """
    # One timestamp for the whole run so every faction file carries the same
    # "extracted_at" value.
    run_stamp = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
    source_name = Path(PDF).name

    accum: dict[str, dict] = {}
    for page_idx in range(doc.page_count):
        page_num = page_idx + 1  # PAGE_TO_SLUG is keyed by 1-based page number
        if page_num not in PAGE_TO_SLUG:
            # A page beyond the hand-maintained table: report it loudly
            # instead of passing it off as a title/blank page, since any
            # units on it would otherwise be dropped unnoticed.
            print(f"  page {page_num:2d}: WARNING no PAGE_TO_SLUG entry, skipped",
                  file=sys.stderr, flush=True)
            continue
        s = PAGE_TO_SLUG[page_num]
        if not s:
            print(f"  page {page_num:2d}: skipped (title/blank)", flush=True)
            continue
        if s not in accum:
            accum[s] = {
                "slug": s,
                "name": NAME_BY_SLUG[s],
                "source": source_name,
                "version": version,
                "extracted_at": run_stamp,
                "pages": [],
                "n_units": 0,
                "n_rows": 0,
                "units": {},
            }
        faction = accum[s]
        # The page is recorded even if it ends up contributing no units.
        faction["pages"].append(page_num)

        # parse_page is used here as returning {unit name: [cost dicts]},
        # each cost dict carrying "size" and "pts".
        page_units = parse_page(doc[page_idx].get_text())
        n_added = 0
        for unit, costs in page_units.items():
            existing = faction["units"].setdefault(unit, [])
            # De-duplicate on (size, pts) so a unit repeated across pages of
            # the same faction does not produce duplicate rows.
            seen = {(c["size"], c["pts"]) for c in existing}
            for c in costs:
                key = (c["size"], c["pts"])
                if key not in seen:
                    existing.append(c)
                    seen.add(key)
                    n_added += 1
        print(f"  page {page_num:2d} -> {s:<22} +{n_added:>3} entries", flush=True)
    return accum


def _write_outputs(accum: dict[str, dict], version: str | None) -> dict:
    """Write one JSON file per faction plus ``_manifest.json`` into OUT_DIR.

    Fills in each record's ``n_units`` / ``n_rows`` before dumping it.

    Returns:
        The manifest dict that was written (includes the run totals).
    """
    manifest = {"pdf": Path(PDF).name, "version": version, "factions": []}
    total_units = total_rows = 0
    for s, data in sorted(accum.items()):
        data["n_units"] = len(data["units"])
        data["n_rows"] = sum(len(v) for v in data["units"].values())
        out_path = OUT_DIR / f"{s}.json"
        # ensure_ascii=False emits non-ASCII characters verbatim, so the file
        # encoding must be pinned instead of inherited from the locale.
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        total_units += data["n_units"]
        total_rows += data["n_rows"]
        manifest["factions"].append({
            "slug": s, "name": data["name"],
            "pages": data["pages"],
            "n_units": data["n_units"],
            "n_rows": data["n_rows"],
            "file": out_path.name,
        })
        print(f"  {s:<22} {data['n_units']:>3} units / {data['n_rows']:>3} rows  "
              f"-> {out_path.name}", flush=True)

    manifest["generated_at"] = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
    manifest["n_factions"] = len(manifest["factions"])
    manifest["total_units"] = total_units
    manifest["total_rows"] = total_rows
    with open(OUT_DIR / "_manifest.json", "w", encoding="utf-8") as f:
        json.dump(manifest, f, indent=2, ensure_ascii=False)
    return manifest


def main() -> None:
    """Extract the PDF into per-faction JSON files and a manifest.

    Exits with status 1 (after an error message on stderr) when the PDF is
    missing. Progress is printed to stdout page by page and file by file.
    """
    if not Path(PDF).exists():
        print(f"ERROR: PDF not found at {PDF}", file=sys.stderr)
        sys.exit(1)

    doc = pymupdf.open(PDF)
    try:
        version = extract_version(doc)
        print(f"PDF: {PDF}  pages={doc.page_count}  version={version}", flush=True)
        accum = _collect_units(doc, version)
    finally:
        # Release the document even if parsing a page raises.
        doc.close()

    manifest = _write_outputs(accum, version)

    print(f"\nWrote {len(manifest['factions'])} faction files to {OUT_DIR}/")
    print(f"Total: {manifest['total_units']} units / {manifest['total_rows']} size-rows")
    print(f"Manifest: {OUT_DIR / '_manifest.json'}")


# Run the extraction only when this file is executed as a script.
if __name__ == "__main__":
    main()