- React + MUI DataGrid app with faction filter, search, change filter
- Biggest movers cards (drops/rises) scoped to current filter view
- Historical points graph modal (5 MFM versions: 1.14 → current)
- URL state sync (faction, dir, q params — shareable URLs)
- Grimdark favicon + OG embed image (Google Imagen)
- Multi-stage Dockerfile (node build → nginx serve)
- docker-compose.yml with Traefik + Cloudflare TLS
- Data pipeline: build_deduped_data.py merges PDF + live scrape
- Ynnari merged into Aeldari (shared codex)
- Mobile responsive: flex columns, no fixed pixel widths
- Color semantics: green=cheaper, red=costlier (consistent everywhere)
- 1,449 units across 31 factions
165 lines
5.6 KiB
Python
165 lines
5.6 KiB
Python
#!/usr/bin/env python3
"""
Parse MFM_2.3_March_2025.pdf into per-faction JSON files.

Output: /root/wh40k-factions/pdf23/<slug>.json (one file per faction),
plus /root/wh40k-factions/pdf23/_manifest.json summarising every faction file.
"""
|
|
import json
|
|
import re
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
|
|
import pymupdf
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent))
|
|
from parse_pdf import parse_page
|
|
|
|
# Source document: the cached copy of the MFM 2.3 PDF that main() opens.
PDF = "/root/.hermes/cache/documents/doc_cb9ee828b86b_MFM_2.3_March_2025.pdf"

# Destination directory for the per-faction JSON files and _manifest.json.
OUT_DIR = Path("/root/wh40k-factions/pdf23")

# Created at import time (including missing parents) so main() can write
# into it straight away; an already-existing directory is not an error.
OUT_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
|
# 1-based PDF page number -> faction slug.  A value of None marks a page that
# main() skips.  A slug listed for more than one page has all of its pages
# merged into a single faction entry by main().
# Same page mapping as 3.2 (verified — same 38-page layout)
PAGE_TO_SLUG = {
    1: None,
    2: "adepta-sororitas",
    3: "adeptus-custodes",
    4: "adeptus-mechanicus",
    5: "adeptus-titanicus",
    6: "aeldari",
    7: "ynnari",
    8: "astra-militarum",
    9: "astra-militarum",  # detachment enhancements (skipped)
    10: "black-templars",
    11: "blood-angels",
    12: "chaos-daemons",
    13: "chaos-daemons",  # detachment enhancements
    14: "chaos-knights",
    15: "chaos-space-marines",
    16: "chaos-space-marines",  # detachment enhancements
    17: "dark-angels",
    18: "death-guard",
    19: "deathwatch",
    20: "drukhari",
    21: "emperors-children",
    22: "genestealer-cults",
    23: "grey-knights",
    24: "imperial-agents",
    25: "imperial-agents",
    26: "imperial-knights",
    27: "leagues-of-votann",
    28: "necrons",
    29: "orks",
    30: "orks",  # detachment enhancements
    31: "space-marines",
    32: "space-marines",  # forge world
    33: "space-wolves",
    34: "tau-empire",
    35: "thousand-sons",
    36: "tyranids",
    37: "tyranids",  # detachment enhancements
    38: "world-eaters",
}
|
|
|
|
# Faction slug -> human-readable faction name stored in each output file.
# main() falls back to the slug itself when a slug has no entry here.
NAME_BY_SLUG = {
    "adepta-sororitas": "Adepta Sororitas",
    "adeptus-custodes": "Adeptus Custodes",
    "adeptus-mechanicus": "Adeptus Mechanicus",
    "adeptus-titanicus": "Adeptus Titanicus",
    "aeldari": "Aeldari",
    "ynnari": "Ynnari",
    "astra-militarum": "Astra Militarum",
    "black-templars": "Black Templars",
    "blood-angels": "Blood Angels",
    "chaos-daemons": "Chaos Daemons",
    "chaos-knights": "Chaos Knights",
    "chaos-space-marines": "Chaos Space Marines",
    "dark-angels": "Dark Angels",
    "death-guard": "Death Guard",
    "deathwatch": "Deathwatch",
    "drukhari": "Drukhari",
    "emperors-children": "Emperor's Children",
    "genestealer-cults": "Genestealer Cults",
    "grey-knights": "Grey Knights",
    "imperial-agents": "Imperial Agents",
    "imperial-knights": "Imperial Knights",
    "leagues-of-votann": "Leagues of Votann",
    "necrons": "Necrons",
    "orks": "Orks",
    "space-marines": "Space Marines",
    "space-wolves": "Space Wolves",
    "tau-empire": "T'au Empire",
    "thousand-sons": "Thousand Sons",
    "tyranids": "Tyranids",
    "world-eaters": "World Eaters",
}
|
|
|
|
|
|
def _utc_timestamp():
    """Return the current UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ``."""
    return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())


def _merge_costs(existing, costs):
    """Append entries of *costs* that *existing* lacks; return how many were added.

    Two entries count as duplicates when their ``(size, pts)`` pairs are
    equal.  *existing* is extended in place.
    """
    seen = {(c["size"], c["pts"]) for c in existing}
    added = 0
    for cost in costs:
        key = (cost["size"], cost["pts"])
        if key not in seen:
            existing.append(cost)
            seen.add(key)
            added += 1
    return added


def main():
    """Parse the PDF at ``PDF`` and write per-faction JSON plus a manifest.

    Every page is mapped to a faction slug through ``PAGE_TO_SLUG``; pages
    whose slug is missing or ``None`` are skipped.  The text of each mapped
    page is handed to ``parse_page`` and the returned unit -> cost-list
    mapping is merged into a per-faction accumulator, dropping rows whose
    ``(size, pts)`` pair is already recorded for that unit, so a faction
    spread over several pages ends up in a single record.

    Side effects: writes ``<slug>.json`` for every faction and
    ``_manifest.json`` into ``OUT_DIR`` (UTF-8), and prints progress to
    stdout.
    """
    accum = {}

    # The context manager closes the document even when a page fails to parse.
    with pymupdf.open(PDF) as doc:
        print(f"PDF: {PDF} pages={doc.page_count}", flush=True)

        for page_idx in range(doc.page_count):
            page_num = page_idx + 1  # PAGE_TO_SLUG is keyed by 1-based page number
            slug = PAGE_TO_SLUG.get(page_num)
            if not slug:
                print(f" page {page_num:2d}: skipped (title/blank)", flush=True)
                continue

            if slug not in accum:
                accum[slug] = {
                    "slug": slug,
                    "name": NAME_BY_SLUG.get(slug, slug),
                    "source": Path(PDF).name,
                    "version": "2.3",
                    "extracted_at": _utc_timestamp(),
                    "pages": [],
                    "n_units": 0,
                    "n_rows": 0,
                    "units": {},
                }
            faction = accum[slug]
            faction["pages"].append(page_num)

            page_units = parse_page(doc[page_idx].get_text())
            n_added = 0
            for unit, costs in page_units.items():
                # setdefault registers the unit even when it contributes no rows.
                existing = faction["units"].setdefault(unit, [])
                n_added += _merge_costs(existing, costs)
            print(f" page {page_num:2d} -> {slug:<22} +{n_added:>3} entries", flush=True)

    manifest = {"pdf": Path(PDF).name, "version": "2.3", "factions": []}
    total_units = total_rows = 0

    for slug, data in sorted(accum.items()):
        data["n_units"] = len(data["units"])
        data["n_rows"] = sum(len(rows) for rows in data["units"].values())

        out_path = OUT_DIR / f"{slug}.json"
        # ensure_ascii=False emits raw non-ASCII characters, so the file
        # encoding is pinned to UTF-8 instead of depending on the locale.
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

        total_units += data["n_units"]
        total_rows += data["n_rows"]
        manifest["factions"].append({
            "slug": slug, "name": data["name"],
            "pages": data["pages"],
            "n_units": data["n_units"],
            "n_rows": data["n_rows"],
            "file": out_path.name,
        })
        print(f" {slug:<22} {data['n_units']:>3} units / {data['n_rows']:>3} rows -> {out_path.name}", flush=True)

    manifest["generated_at"] = _utc_timestamp()
    manifest["n_factions"] = len(manifest["factions"])
    manifest["total_units"] = total_units
    manifest["total_rows"] = total_rows
    with open(OUT_DIR / "_manifest.json", "w", encoding="utf-8") as f:
        json.dump(manifest, f, indent=2, ensure_ascii=False)

    print(f"\nWrote {len(manifest['factions'])} faction files to {OUT_DIR}/")
    print(f"Total: {total_units} units / {total_rows} size-rows")
|
|
|
|
|
|
# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()