Files
wh40k-points-comparator/parse_pdf23.py
root 38bffa491c Initial commit: WH40K Points Comparator
- React + MUI DataGrid app with faction filter, search, change filter
- Biggest movers cards (drops/rises) scoped to current filter view
- Historical points graph modal (5 MFM versions: 1.14 → current)
- URL state sync (faction, dir, q params — shareable URLs)
- Grimdark favicon + OG embed image (Google Imagen)
- Multi-stage Dockerfile (node build → nginx serve)
- docker-compose.yml with Traefik + Cloudflare TLS
- Data pipeline: build_deduped_data.py merges PDF + live scrape
- Ynnari merged into Aeldari (shared codex)
- Mobile responsive: flex columns, no fixed pixel widths
- Color semantics: green=cheaper, red=costlier (consistent everywhere)
- 1,449 units across 31 factions
2026-06-18 02:42:29 +00:00

165 lines
5.6 KiB
Python

#!/usr/bin/env python3
"""
Parse MFM_2.3_March_2025.pdf into per-faction JSON files.
Output: /root/wh40k-factions/pdf23/<slug>.json
"""
import json
import re
import sys
import time
from pathlib import Path
import pymupdf
sys.path.insert(0, str(Path(__file__).parent))
from parse_pdf import parse_page
# Absolute path of the cached MFM 2.3 PDF; main() opens it with pymupdf.
PDF = "/root/.hermes/cache/documents/doc_cb9ee828b86b_MFM_2.3_March_2025.pdf"
# Directory that receives one <slug>.json per faction plus _manifest.json.
OUT_DIR = Path("/root/wh40k-factions/pdf23")
# NOTE: runs at import time, so merely importing this module creates OUT_DIR
# (and any missing parents); exist_ok=True makes repeated runs harmless.
OUT_DIR.mkdir(parents=True, exist_ok=True)
# 1-based PDF page number -> faction slug for the MFM 2.3 document.
# Per the original author this is the same mapping as the 3.2 parser
# (verified: same 38-page layout).
#
# A faction that spans several pages repeats its slug on each page; main()
# folds all of those pages into one faction record. A None slug marks a page
# that main() does not parse (it is reported as "title/blank").
PAGE_TO_SLUG = {
    1: None,
    2: "adepta-sororitas",
    3: "adeptus-custodes",
    4: "adeptus-mechanicus",
    5: "adeptus-titanicus",
    6: "aeldari",
    7: "ynnari",
    8: "astra-militarum",
    # Detachment enhancements. The author marked this page "skipped", but
    # main() still hands it to parse_page -- TODO confirm what it yields.
    9: "astra-militarum",
    10: "black-templars",
    11: "blood-angels",
    12: "chaos-daemons",
    13: "chaos-daemons",  # detachment enhancements
    14: "chaos-knights",
    15: "chaos-space-marines",
    16: "chaos-space-marines",  # detachment enhancements
    17: "dark-angels",
    18: "death-guard",
    19: "deathwatch",
    20: "drukhari",
    21: "emperors-children",
    22: "genestealer-cults",
    23: "grey-knights",
    24: "imperial-agents",
    25: "imperial-agents",
    26: "imperial-knights",
    27: "leagues-of-votann",
    28: "necrons",
    29: "orks",
    30: "orks",  # detachment enhancements
    31: "space-marines",
    32: "space-marines",  # forge world
    33: "space-wolves",
    34: "tau-empire",
    35: "thousand-sons",
    36: "tyranids",
    37: "tyranids",  # detachment enhancements
    38: "world-eaters",
}
# Faction slug -> human-readable faction name written to the "name" field of
# each output record. main() falls back to the slug itself when a slug has no
# entry here, so every slug used in PAGE_TO_SLUG should be listed.
NAME_BY_SLUG = {
    "adepta-sororitas": "Adepta Sororitas",
    "adeptus-custodes": "Adeptus Custodes",
    "adeptus-mechanicus": "Adeptus Mechanicus",
    "adeptus-titanicus": "Adeptus Titanicus",
    "aeldari": "Aeldari",
    "ynnari": "Ynnari",
    "astra-militarum": "Astra Militarum",
    "black-templars": "Black Templars",
    "blood-angels": "Blood Angels",
    "chaos-daemons": "Chaos Daemons",
    "chaos-knights": "Chaos Knights",
    "chaos-space-marines": "Chaos Space Marines",
    "dark-angels": "Dark Angels",
    "death-guard": "Death Guard",
    "deathwatch": "Deathwatch",
    "drukhari": "Drukhari",
    "emperors-children": "Emperor's Children",
    "genestealer-cults": "Genestealer Cults",
    "grey-knights": "Grey Knights",
    "imperial-agents": "Imperial Agents",
    "imperial-knights": "Imperial Knights",
    "leagues-of-votann": "Leagues of Votann",
    "necrons": "Necrons",
    "orks": "Orks",
    "space-marines": "Space Marines",
    "space-wolves": "Space Wolves",
    "tau-empire": "T'au Empire",
    "thousand-sons": "Thousand Sons",
    "tyranids": "Tyranids",
    "world-eaters": "World Eaters",
}
def _merge_page_units(units, page_units):
    """Merge one page's parsed costs into a faction's unit table.

    ``units`` maps unit name -> list of cost dicts and is updated in place.
    ``page_units`` has the same shape and comes from ``parse_page``. A cost is
    appended only when its ``(size, pts)`` pair is not already listed for that
    unit, so the same row is never stored twice.

    Returns the number of cost entries appended.
    """
    added = 0
    for unit, costs in page_units.items():
        existing = units.setdefault(unit, [])
        seen = {(c["size"], c["pts"]) for c in existing}
        for cost in costs:
            key = (cost["size"], cost["pts"])
            if key not in seen:
                existing.append(cost)
                seen.add(key)
                added += 1
    return added


def _write_json(path, payload):
    """Write ``payload`` to ``path`` as indented JSON encoded as UTF-8."""
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file
    # encoding must be pinned: the locale default (e.g. under LANG=C) may be
    # unable to encode them and would raise UnicodeEncodeError.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2, ensure_ascii=False)


def main():
    """Parse the PDF page by page and write per-faction JSON plus a manifest.

    Every page whose number maps to a slug in ``PAGE_TO_SLUG`` is run through
    ``parse_page``; pages sharing a slug are accumulated into one faction
    record. Each record is written to ``OUT_DIR/<slug>.json`` and a summary
    of all records to ``OUT_DIR/_manifest.json``. Progress is printed to
    stdout as the run proceeds.
    """
    accum = {}
    doc = pymupdf.open(PDF)
    try:
        print(f"PDF: {PDF} pages={doc.page_count}", flush=True)
        for page_idx in range(doc.page_count):
            page_num = page_idx + 1  # PAGE_TO_SLUG is keyed by 1-based page
            slug = PAGE_TO_SLUG.get(page_num)
            if not slug:
                # Covers both an explicit None entry and an unmapped page.
                print(f" page {page_num:2d}: skipped (title/blank)", flush=True)
                continue
            faction = accum.get(slug)
            if faction is None:
                faction = accum[slug] = {
                    "slug": slug,
                    "name": NAME_BY_SLUG.get(slug, slug),
                    "source": Path(PDF).name,
                    "version": "2.3",
                    "extracted_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
                    "pages": [],
                    # Counters are filled in once all pages have been merged.
                    "n_units": 0,
                    "n_rows": 0,
                    "units": {},
                }
            faction["pages"].append(page_num)
            page_units = parse_page(doc[page_idx].get_text())
            n_added = _merge_page_units(faction["units"], page_units)
            print(f" page {page_num:2d} -> {slug:<22} +{n_added:>3} entries", flush=True)
    finally:
        # Release the document even when parsing a page raises.
        doc.close()

    manifest = {"pdf": Path(PDF).name, "version": "2.3", "factions": []}
    total_units = total_rows = 0
    # Slugs are unique keys, so sorting the items orders purely by slug.
    for slug, data in sorted(accum.items()):
        data["n_units"] = len(data["units"])
        data["n_rows"] = sum(len(v) for v in data["units"].values())
        out_path = OUT_DIR / f"{slug}.json"
        _write_json(out_path, data)
        total_units += data["n_units"]
        total_rows += data["n_rows"]
        manifest["factions"].append({
            "slug": slug, "name": data["name"],
            "pages": data["pages"],
            "n_units": data["n_units"],
            "n_rows": data["n_rows"],
            "file": out_path.name,
        })
        print(f" {slug:<22} {data['n_units']:>3} units / {data['n_rows']:>3} rows -> {out_path.name}", flush=True)

    manifest["generated_at"] = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
    manifest["n_factions"] = len(manifest["factions"])
    manifest["total_units"] = total_units
    manifest["total_rows"] = total_rows
    _write_json(OUT_DIR / "_manifest.json", manifest)
    print(f"\nWrote {len(manifest['factions'])} faction files to {OUT_DIR}/")
    print(f"Total: {total_units} units / {total_rows} size-rows")


if __name__ == "__main__":
    main()