Files
wh40k-points-comparator/parse_pdf_per_faction.py
root 38bffa491c Initial commit: WH40K Points Comparator
- React + MUI DataGrid app with faction filter, search, change filter
- Biggest movers cards (drops/rises) scoped to current filter view
- Historical points graph modal (5 MFM versions: 1.14 → current)
- URL state sync (faction, dir, q params — shareable URLs)
- Grimdark favicon + OG embed image (Google Imagen)
- Multi-stage Dockerfile (node build → nginx serve)
- docker-compose.yml with Traefik + Cloudflare TLS
- Data pipeline: build_deduped_data.py merges PDF + live scrape
- Ynnari merged into Aeldari (shared codex)
- Mobile responsive: flex columns, no fixed pixel widths
- Color semantics: green=cheaper, red=costlier (consistent everywhere)
- 1,449 units across 31 factions
2026-06-18 02:42:29 +00:00

216 lines
7.6 KiB
Python

#!/usr/bin/env python3
"""
Extract the MFM PDF (Full_armies_10th.pdf) into per-faction JSON files,
mirroring the shape of the live scraper output in /root/wh40k-factions/live/.
Output:
/root/wh40k-factions/pdf/<slug>.json (one per faction)
/root/wh40k-factions/pdf/_manifest.json
Schema per file (matches live scraper):
{
"slug": "astra-militarum",
"name": "Astra Militarum",
"source": "Full_armies_10th.pdf",
"version": "4.3", # number only (no 'v' prefix), parsed from the 'VERSION 4.3' text on page 1
"extracted_at": "2026-06-17T...",
"pages": [8, 9], # all PDF pages that contribute units
"n_units": 84,
"n_rows": 162,
"units": {
"Valkyrie": [
{"size": "1 model", "pts": 190}
],
"...": [...]
}
}
Page-to-faction mapping is fixed: page 9 is Astra Militarum forge world
(was incorrectly mapped to Black Templars in the original parse_pdf.py).
Detachment-enhancements-only pages are mapped to their parent faction but
contribute zero units (parser skips them).
"""
import json
import re
import sys
import time
from pathlib import Path
import pymupdf
sys.path.insert(0, str(Path(__file__).parent))
# Reuse the parser + regexes + helpers from the original script.
from parse_pdf import parse_page, slug, clean_line
# Absolute path of the source PDF; main() checks that it exists and opens it.
PDF = "/root/.hermes/cache/documents/doc_ed3e1a0bd12e_Full_armies_10th.pdf"
# Directory that receives the per-faction <slug>.json files and _manifest.json.
OUT_DIR = Path("/root/wh40k-factions/pdf")
# Runs at import time: create the output directory (and any missing parents);
# exist_ok=True makes this a no-op when it already exists.
OUT_DIR.mkdir(parents=True, exist_ok=True)
# Slug for each page. Fixed mapping (the original had page 9 wrong:
# page 9 contains Astra Militarum forge-world units, not Black Templars).
PAGE_TO_SLUG = {
1: None, # title page
2: "adepta-sororitas",
3: "adeptus-custodes",
4: "adeptus-mechanicus",
5: "adeptus-titanicus", # Forge World Titans (Adeptus Titanicus)
6: "aeldari",
7: "ynnari", # Ynnari subset of Aeldari
8: "astra-militarum",
9: "astra-militarum", # AM forge world (Valkyrie, Wyvern, etc.)
10: "black-templars",
11: "blood-angels",
12: "chaos-daemons",
13: "chaos-daemons", # detachment enhancements (skipped by parser)
14: "chaos-knights",
15: "chaos-space-marines",
16: "chaos-space-marines", # detachment enhancements
17: "dark-angels",
18: "death-guard",
19: "deathwatch",
20: "drukhari",
21: "emperors-children",
22: "genestealer-cults",
23: "grey-knights",
24: "imperial-agents", # rules page (no units)
25: "imperial-agents", # units
26: "imperial-knights",
27: "leagues-of-votann",
28: "necrons",
29: "necrons", # detachment enhancements
30: "orks",
31: "orks", # detachment enhancements
32: "space-marines",
33: "space-marines", # forge world units (Predator etc.)
34: "space-wolves",
35: "tau-empire",
36: "thousand-sons",
37: "tyranids",
38: "tyranids", # detachment enhancements
39: "world-eaters",
}
# Human-readable faction name for every slug; main() copies it into the
# "name" field of each faction file and of the manifest.
# NOTE(review): this table was documented as using curly apostrophes for
# display, yet "Emperor's Children" and "T'au Empire" below use the straight
# ASCII apostrophe -- confirm which form is wanted before changing either.
NAME_BY_SLUG: dict[str, str] = {
    "adepta-sororitas":    "Adepta Sororitas",
    "adeptus-custodes":    "Adeptus Custodes",
    "adeptus-mechanicus":  "Adeptus Mechanicus",
    "adeptus-titanicus":   "Adeptus Titanicus",
    "aeldari":             "Aeldari",
    "ynnari":              "Ynnari",
    "astra-militarum":     "Astra Militarum",
    "black-templars":      "Black Templars",
    "blood-angels":        "Blood Angels",
    "chaos-daemons":       "Chaos Daemons",
    "chaos-knights":       "Chaos Knights",
    "chaos-space-marines": "Chaos Space Marines",
    "dark-angels":         "Dark Angels",
    "death-guard":         "Death Guard",
    "deathwatch":          "Deathwatch",
    "drukhari":            "Drukhari",
    "emperors-children":   "Emperor's Children",
    "genestealer-cults":   "Genestealer Cults",
    "grey-knights":        "Grey Knights",
    "imperial-agents":     "Imperial Agents",
    "imperial-knights":    "Imperial Knights",
    "leagues-of-votann":   "Leagues of Votann",
    "necrons":             "Necrons",
    "orks":                "Orks",
    "space-marines":       "Space Marines",
    "space-wolves":        "Space Wolves",
    "tau-empire":          "T'au Empire",
    "thousand-sons":       "Thousand Sons",
    "tyranids":            "Tyranids",
    "world-eaters":        "World Eaters",
}
def extract_version(doc) -> str | None:
"""Pull the MFM version from the title page (e.g. 'VERSION 4.3')."""
try:
text = doc[0].get_text()
except Exception:
return None
m = re.search(r"VERSION\s+([\d.]+)", text, re.IGNORECASE)
return m.group(1) if m else None
def _merge_costs(existing: list, costs) -> int:
    """Append to *existing* every row of *costs* whose (size, pts) is new; return the count added."""
    seen = {(row["size"], row["pts"]) for row in existing}
    added = 0
    for row in costs:
        key = (row["size"], row["pts"])
        if key not in seen:
            existing.append(row)
            seen.add(key)
            added += 1
    return added


def main():
    """Split the MFM PDF into one JSON file per faction plus a manifest.

    Steps:
      1. Exit with status 1 (message on stderr) if the file at ``PDF`` is
         missing.
      2. Walk every page, look its faction up in ``PAGE_TO_SLUG``, run
         ``parse_page`` on the page text and merge the returned unit costs
         into that faction's accumulator, dropping rows whose (size, pts)
         pair is already present for the unit.
      3. Fill in the unit/row counts, write ``<slug>.json`` for each faction
         and ``_manifest.json`` into ``OUT_DIR``, and print a summary.
    """
    if not Path(PDF).exists():
        print(f"ERROR: PDF not found at {PDF}", file=sys.stderr)
        sys.exit(1)

    # Stamp the run once so every faction file carries the same extracted_at.
    extracted_at = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
    # accumulator per faction slug
    accum: dict[str, dict] = {}

    # The context manager closes the document even if a page fails to parse.
    with pymupdf.open(PDF) as doc:
        version = extract_version(doc)
        print(f"PDF: {PDF} pages={doc.page_count} version={version}", flush=True)
        for page_idx in range(doc.page_count):
            page_num = page_idx + 1  # PAGE_TO_SLUG is keyed by 1-based page number
            faction_slug = PAGE_TO_SLUG.get(page_num)
            if not faction_slug:
                # Reached for pages mapped to None and for pages absent from the table.
                print(f" page {page_num:2d}: skipped (title/blank)", flush=True)
                continue
            if faction_slug not in accum:
                accum[faction_slug] = {
                    "slug": faction_slug,
                    "name": NAME_BY_SLUG[faction_slug],
                    "source": Path(PDF).name,
                    "version": version,
                    "extracted_at": extracted_at,
                    "pages": [],
                    "n_units": 0,
                    "n_rows": 0,
                    "units": {},
                }
            faction = accum[faction_slug]
            faction["pages"].append(page_num)
            page_units = parse_page(doc[page_idx].get_text())
            n_added = 0
            for unit, costs in page_units.items():
                # A unit can appear on several pages of one faction; keep one
                # list per unit and only add rows not seen before.
                n_added += _merge_costs(faction["units"].setdefault(unit, []), costs)
            print(f" page {page_num:2d} -> {faction_slug:<22} +{n_added:>3} entries", flush=True)

    # finalize counts + write files
    manifest = {"pdf": Path(PDF).name, "version": version, "factions": []}
    total_units = total_rows = 0
    for faction_slug, data in sorted(accum.items()):
        data["n_units"] = len(data["units"])
        data["n_rows"] = sum(len(rows) for rows in data["units"].values())
        out_path = OUT_DIR / f"{faction_slug}.json"
        # ensure_ascii=False emits non-ASCII characters verbatim, so the file
        # encoding must be pinned to UTF-8 instead of the locale default.
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        total_units += data["n_units"]
        total_rows += data["n_rows"]
        manifest["factions"].append({
            "slug": faction_slug, "name": data["name"],
            "pages": data["pages"],
            "n_units": data["n_units"],
            "n_rows": data["n_rows"],
            "file": out_path.name,
        })
        print(f" {faction_slug:<22} {data['n_units']:>3} units / {data['n_rows']:>3} rows "
              f"-> {out_path.name}", flush=True)

    manifest["generated_at"] = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
    manifest["n_factions"] = len(manifest["factions"])
    manifest["total_units"] = total_units
    manifest["total_rows"] = total_rows
    with open(OUT_DIR / "_manifest.json", "w", encoding="utf-8") as f:
        json.dump(manifest, f, indent=2, ensure_ascii=False)

    print(f"\nWrote {len(manifest['factions'])} faction files to {OUT_DIR}/")
    print(f"Total: {total_units} units / {total_rows} size-rows")
    print(f"Manifest: {OUT_DIR / '_manifest.json'}")


if __name__ == "__main__":
    main()