#!/usr/bin/env python3
"""
Extract the MFM PDF (Full_armies_10th.pdf) into per-faction JSON files,
mirroring the shape of the live scraper output in /root/wh40k-factions/live/.

Output:
  /root/wh40k-factions/pdf/<slug>.json   (one per faction)
  /root/wh40k-factions/pdf/_manifest.json

Schema per file (matches live scraper):
{
  "slug": "astra-militarum",
  "name": "Astra Militarum",
  "source": "Full_armies_10th.pdf",
  "version": "4.3",             # parsed from page 1 header (digits only)
  "extracted_at": "2026-06-17T...",
  "pages": [8, 9],              # all PDF pages that contribute units
  "n_units": 84,
  "n_rows": 162,
  "units": {
    "Valkyrie": [ {"size": "1 model", "pts": 190} ],
    "...": [...]
  }
}

Page-to-faction mapping is fixed: page 9 is Astra Militarum forge world
(was incorrectly mapped to Black Templars in the original parse_pdf.py).
Detachment-enhancements-only pages are mapped to their parent faction
but contribute zero units (parser skips them).
"""
import json
import re
import sys
import time
from pathlib import Path

import pymupdf

sys.path.insert(0, str(Path(__file__).parent))

# Reuse the parser + regexes + helpers from the original script.
from parse_pdf import parse_page, slug, clean_line

PDF = "/root/.hermes/cache/documents/doc_ed3e1a0bd12e_Full_armies_10th.pdf"
OUT_DIR = Path("/root/wh40k-factions/pdf")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Slug for each page. Fixed mapping (the original had page 9 wrong:
# page 9 contains Astra Militarum forge-world units, not Black Templars).
PAGE_TO_SLUG = {
    1: None,  # title page
    2: "adepta-sororitas",
    3: "adeptus-custodes",
    4: "adeptus-mechanicus",
    5: "adeptus-titanicus",  # Forge World Titans (Adeptus Titanicus)
    6: "aeldari",
    7: "ynnari",  # Ynnari subset of Aeldari
    8: "astra-militarum",
    9: "astra-militarum",  # AM forge world (Valkyrie, Wyvern, etc.)
    10: "black-templars",
    11: "blood-angels",
    12: "chaos-daemons",
    13: "chaos-daemons",  # detachment enhancements (skipped by parser)
    14: "chaos-knights",
    15: "chaos-space-marines",
    16: "chaos-space-marines",  # detachment enhancements
    17: "dark-angels",
    18: "death-guard",
    19: "deathwatch",
    20: "drukhari",
    21: "emperors-children",
    22: "genestealer-cults",
    23: "grey-knights",
    24: "imperial-agents",  # rules page (no units)
    25: "imperial-agents",  # units
    26: "imperial-knights",
    27: "leagues-of-votann",
    28: "necrons",
    29: "necrons",  # detachment enhancements
    30: "orks",
    31: "orks",  # detachment enhancements
    32: "space-marines",
    33: "space-marines",  # forge world units (Predator etc.)
    34: "space-wolves",
    35: "tau-empire",
    36: "thousand-sons",
    37: "tyranids",
    38: "tyranids",  # detachment enhancements
    39: "world-eaters",
}

# Display name per slug. For names with apostrophes, use curly form for display.
# NOTE(review): the literals below contain the straight apostrophe (') rather
# than a curly one -- confirm which form the live scraper output uses.
NAME_BY_SLUG = {
    "adepta-sororitas": "Adepta Sororitas",
    "adeptus-custodes": "Adeptus Custodes",
    "adeptus-mechanicus": "Adeptus Mechanicus",
    "adeptus-titanicus": "Adeptus Titanicus",
    "aeldari": "Aeldari",
    "ynnari": "Ynnari",
    "astra-militarum": "Astra Militarum",
    "black-templars": "Black Templars",
    "blood-angels": "Blood Angels",
    "chaos-daemons": "Chaos Daemons",
    "chaos-knights": "Chaos Knights",
    "chaos-space-marines": "Chaos Space Marines",
    "dark-angels": "Dark Angels",
    "death-guard": "Death Guard",
    "deathwatch": "Deathwatch",
    "drukhari": "Drukhari",
    "emperors-children": "Emperor's Children",
    "genestealer-cults": "Genestealer Cults",
    "grey-knights": "Grey Knights",
    "imperial-agents": "Imperial Agents",
    "imperial-knights": "Imperial Knights",
    "leagues-of-votann": "Leagues of Votann",
    "necrons": "Necrons",
    "orks": "Orks",
    "space-marines": "Space Marines",
    "space-wolves": "Space Wolves",
    "tau-empire": "T'au Empire",
    "thousand-sons": "Thousand Sons",
    "tyranids": "Tyranids",
    "world-eaters": "World Eaters",
}

# Timestamp layout shared by the per-faction files and the manifest (UTC).
_TIMESTAMP_FORMAT = "%Y-%m-%dT%H:%M:%SZ"

# Dotted digit groups only, so a sentence-ending period after the version
# number (e.g. "VERSION 4.3.") is not captured as part of it.
_VERSION_RE = re.compile(r"VERSION\s+(\d+(?:\.\d+)*)", re.IGNORECASE)


def _utc_timestamp() -> str:
    """Return the current UTC time formatted with ``_TIMESTAMP_FORMAT``."""
    return time.strftime(_TIMESTAMP_FORMAT, time.gmtime())


def extract_version(doc) -> str | None:
    """Pull the MFM version from the title page (e.g. 'VERSION 4.3').

    Args:
        doc: An opened document whose first page (``doc[0]``) exposes
            ``get_text()``.

    Returns:
        The dotted version number without any prefix (e.g. ``"4.3"``), or
        ``None`` when the first page cannot be read or holds no
        ``VERSION <number>`` marker.
    """
    try:
        text = doc[0].get_text()
    except Exception:
        # Best effort: a missing/unreadable title page must not abort the
        # extraction, the version is simply recorded as None.
        return None
    m = _VERSION_RE.search(text)
    return m.group(1) if m else None


def _merge_costs(existing: list, costs) -> int:
    """Append the cost rows not already in *existing*; return how many were added.

    Rows are compared on their ``(size, pts)`` pair, so a row repeated across
    pages of the same faction is stored once.
    """
    seen = {(c["size"], c["pts"]) for c in existing}
    n_added = 0
    for c in costs:
        key = (c["size"], c["pts"])
        if key not in seen:
            existing.append(c)
            seen.add(key)
            n_added += 1
    return n_added


def _collect_factions(doc, version, extracted_at: str) -> dict[str, dict]:
    """Parse every mapped page of *doc* and group the units by faction slug.

    Pages whose entry in ``PAGE_TO_SLUG`` is missing or ``None`` are skipped.
    Returns a dict keyed by slug; ``n_units``/``n_rows`` are left at 0 here
    and filled in when the files are written.
    """
    accum: dict[str, dict] = {}
    for page_idx in range(doc.page_count):
        page_num = page_idx + 1  # PAGE_TO_SLUG is keyed by 1-based page number
        s = PAGE_TO_SLUG.get(page_num)
        if not s:
            print(f" page {page_num:2d}: skipped (title/blank)", flush=True)
            continue
        if s not in accum:
            accum[s] = {
                "slug": s,
                "name": NAME_BY_SLUG[s],
                "source": Path(PDF).name,
                "version": version,
                "extracted_at": extracted_at,
                "pages": [],
                "n_units": 0,
                "n_rows": 0,
                "units": {},
            }
        faction = accum[s]
        faction["pages"].append(page_num)
        page_units = parse_page(doc[page_idx].get_text())
        n_added = 0
        for unit, costs in page_units.items():
            n_added += _merge_costs(faction["units"].setdefault(unit, []), costs)
        print(f" page {page_num:2d} -> {s:<22} +{n_added:>3} entries", flush=True)
    return accum


def _write_outputs(accum: dict[str, dict], version) -> dict:
    """Finalize the counts, write one JSON file per faction plus the manifest.

    Files are written as UTF-8 explicitly: the JSON is dumped with
    ``ensure_ascii=False``, so relying on the locale's default encoding could
    fail or produce mis-encoded output for non-ASCII names.

    Returns the manifest dict that was written to ``_manifest.json``.
    """
    manifest = {"pdf": Path(PDF).name, "version": version, "factions": []}
    total_units = total_rows = 0
    for s, data in sorted(accum.items()):
        data["n_units"] = len(data["units"])
        data["n_rows"] = sum(len(v) for v in data["units"].values())
        out_path = OUT_DIR / f"{s}.json"
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        total_units += data["n_units"]
        total_rows += data["n_rows"]
        manifest["factions"].append({
            "slug": s,
            "name": data["name"],
            "pages": data["pages"],
            "n_units": data["n_units"],
            "n_rows": data["n_rows"],
            "file": out_path.name,
        })
        print(f" {s:<22} {data['n_units']:>3} units / {data['n_rows']:>3} rows "
              f"-> {out_path.name}", flush=True)

    manifest["generated_at"] = _utc_timestamp()
    manifest["n_factions"] = len(manifest["factions"])
    manifest["total_units"] = total_units
    manifest["total_rows"] = total_rows
    with open(OUT_DIR / "_manifest.json", "w", encoding="utf-8") as f:
        json.dump(manifest, f, indent=2, ensure_ascii=False)
    return manifest


def main():
    """Extract every faction from the PDF and write the JSON files + manifest.

    Exits with status 1 (after printing to stderr) when the PDF is missing.
    """
    if not Path(PDF).exists():
        print(f"ERROR: PDF not found at {PDF}", file=sys.stderr)
        sys.exit(1)

    # One timestamp for the whole run so every faction file agrees.
    extracted_at = _utc_timestamp()

    # Context manager: the document is closed even if parsing raises.
    with pymupdf.open(PDF) as doc:
        version = extract_version(doc)
        print(f"PDF: {PDF} pages={doc.page_count} version={version}", flush=True)
        accum = _collect_factions(doc, version, extracted_at)

    manifest = _write_outputs(accum, version)

    print(f"\nWrote {len(manifest['factions'])} faction files to {OUT_DIR}/")
    print(f"Total: {manifest['total_units']} units / {manifest['total_rows']} size-rows")
    print(f"Manifest: {OUT_DIR / '_manifest.json'}")


if __name__ == "__main__":
    main()