#!/usr/bin/env python3
"""
Parse MFM_2.3_March_2025.pdf into per-faction JSON files.

Output: /root/wh40k-factions/pdf23/<slug>.json (one file per faction slug),
plus /root/wh40k-factions/pdf23/_manifest.json summarising every file written.
"""
import json
import re
import sys
import time
from pathlib import Path

import pymupdf

# parse_pdf lives next to this script; make it importable regardless of CWD.
sys.path.insert(0, str(Path(__file__).parent))
from parse_pdf import parse_page  # noqa: E402

PDF = "/root/.hermes/cache/documents/doc_cb9ee828b86b_MFM_2.3_March_2025.pdf"
OUT_DIR = Path("/root/wh40k-factions/pdf23")
# Created at import time so the output directory always exists before main().
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Same page mapping as 3.2 (verified — same 38-page layout)
# Keys are 1-based page numbers; a value of None means the page is not parsed.
# NOTE(review): pages annotated "detachment enhancements" are still passed to
# parse_page and merged into their faction by main(); presumably parse_page
# yields no unit rows for them — confirm against parse_pdf.
PAGE_TO_SLUG = {
    1: None,
    2: "adepta-sororitas",
    3: "adeptus-custodes",
    4: "adeptus-mechanicus",
    5: "adeptus-titanicus",
    6: "aeldari",
    7: "ynnari",
    8: "astra-militarum",
    9: "astra-militarum",  # detachment enhancements (skipped)
    10: "black-templars",
    11: "blood-angels",
    12: "chaos-daemons",
    13: "chaos-daemons",  # detachment enhancements
    14: "chaos-knights",
    15: "chaos-space-marines",
    16: "chaos-space-marines",  # detachment enhancements
    17: "dark-angels",
    18: "death-guard",
    19: "deathwatch",
    20: "drukhari",
    21: "emperors-children",
    22: "genestealer-cults",
    23: "grey-knights",
    24: "imperial-agents",
    25: "imperial-agents",
    26: "imperial-knights",
    27: "leagues-of-votann",
    28: "necrons",
    29: "orks",
    30: "orks",  # detachment enhancements
    31: "space-marines",
    32: "space-marines",  # forge world
    33: "space-wolves",
    34: "tau-empire",
    35: "thousand-sons",
    36: "tyranids",
    37: "tyranids",  # detachment enhancements
    38: "world-eaters",
}

# Display name for each faction slug used in PAGE_TO_SLUG.
NAME_BY_SLUG = {
    "adepta-sororitas": "Adepta Sororitas",
    "adeptus-custodes": "Adeptus Custodes",
    "adeptus-mechanicus": "Adeptus Mechanicus",
    "adeptus-titanicus": "Adeptus Titanicus",
    "aeldari": "Aeldari",
    "ynnari": "Ynnari",
    "astra-militarum": "Astra Militarum",
    "black-templars": "Black Templars",
    "blood-angels": "Blood Angels",
    "chaos-daemons": "Chaos Daemons",
    "chaos-knights": "Chaos Knights",
    "chaos-space-marines": "Chaos Space Marines",
    "dark-angels": "Dark Angels",
    "death-guard": "Death Guard",
    "deathwatch": "Deathwatch",
    "drukhari": "Drukhari",
    "emperors-children": "Emperor's Children",
    "genestealer-cults": "Genestealer Cults",
    "grey-knights": "Grey Knights",
    "imperial-agents": "Imperial Agents",
    "imperial-knights": "Imperial Knights",
    "leagues-of-votann": "Leagues of Votann",
    "necrons": "Necrons",
    "orks": "Orks",
    "space-marines": "Space Marines",
    "space-wolves": "Space Wolves",
    "tau-empire": "T'au Empire",
    "thousand-sons": "Thousand Sons",
    "tyranids": "Tyranids",
    "world-eaters": "World Eaters",
}


def _utc_timestamp() -> str:
    """Return the current UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ``."""
    return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())


def _new_faction_record(slug: str) -> dict:
    """Build the empty per-faction record that pages are accumulated into."""
    return {
        "slug": slug,
        # Fall back to the slug itself when no display name is registered.
        "name": NAME_BY_SLUG.get(slug, slug),
        "source": Path(PDF).name,
        "version": "2.3",
        "extracted_at": _utc_timestamp(),
        "pages": [],
        "n_units": 0,
        "n_rows": 0,
        "units": {},
    }


def _merge_costs(existing: list, costs) -> int:
    """Append to *existing* every cost row not already present; return the count.

    Rows are compared on their ``(size, pts)`` pair, so a row repeated across
    pages of the same faction is stored only once.
    """
    seen = {(row["size"], row["pts"]) for row in existing}
    added = 0
    for row in costs:
        key = (row["size"], row["pts"])
        if key in seen:
            continue
        existing.append(row)
        seen.add(key)
        added += 1
    return added


def _write_json(path: Path, payload: dict) -> None:
    """Serialise *payload* to *path* as indented UTF-8 JSON."""
    # ensure_ascii=False emits non-ASCII characters verbatim, so the encoding
    # must be pinned instead of inherited from the platform locale.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2, ensure_ascii=False)


def main() -> None:
    """Parse every mapped PDF page and write faction JSON files plus a manifest.

    Pages are grouped by the slug in PAGE_TO_SLUG; pages that are unmapped or
    mapped to None are reported and skipped. One ``<slug>.json`` is written to
    OUT_DIR per faction, followed by ``_manifest.json`` with per-faction and
    overall unit/row totals. Progress is printed to stdout.
    """
    accum = {}

    # Context manager guarantees the document is closed even if parsing raises.
    with pymupdf.open(PDF) as doc:
        print(f"PDF: {PDF} pages={doc.page_count}", flush=True)
        for page_num in range(1, doc.page_count + 1):
            slug = PAGE_TO_SLUG.get(page_num)
            if not slug:
                print(f" page {page_num:2d}: skipped (title/blank)", flush=True)
                continue

            record = accum.get(slug)
            if record is None:
                record = accum[slug] = _new_faction_record(slug)
            record["pages"].append(page_num)

            # Document indexing is 0-based while PAGE_TO_SLUG is 1-based.
            page_units = parse_page(doc[page_num - 1].get_text())
            n_added = sum(
                _merge_costs(record["units"].setdefault(unit, []), costs)
                for unit, costs in page_units.items()
            )
            print(
                f" page {page_num:2d} -> {slug:<22} +{n_added:>3} entries",
                flush=True,
            )

    manifest = {"pdf": Path(PDF).name, "version": "2.3", "factions": []}
    total_units = total_rows = 0
    # Sorted by slug so files and manifest entries come out in a stable order.
    for s, data in sorted(accum.items()):
        data["n_units"] = len(data["units"])
        data["n_rows"] = sum(len(v) for v in data["units"].values())
        out_path = OUT_DIR / f"{s}.json"
        _write_json(out_path, data)

        total_units += data["n_units"]
        total_rows += data["n_rows"]
        manifest["factions"].append({
            "slug": s,
            "name": data["name"],
            "pages": data["pages"],
            "n_units": data["n_units"],
            "n_rows": data["n_rows"],
            "file": out_path.name,
        })
        print(
            f" {s:<22} {data['n_units']:>3} units / {data['n_rows']:>3} rows"
            f" -> {out_path.name}",
            flush=True,
        )

    manifest["generated_at"] = _utc_timestamp()
    manifest["n_factions"] = len(manifest["factions"])
    manifest["total_units"] = total_units
    manifest["total_rows"] = total_rows
    _write_json(OUT_DIR / "_manifest.json", manifest)

    print(f"\nWrote {len(manifest['factions'])} faction files to {OUT_DIR}/")
    print(f"Total: {total_units} units / {total_rows} size-rows")


if __name__ == "__main__":
    main()