- React + MUI DataGrid app with faction filter, search, change filter - Biggest movers cards (drops/rises) scoped to current filter view - Historical points graph modal (5 MFM versions: 1.14 → current) - URL state sync (faction, dir, q params — shareable URLs) - Grimdark favicon + OG embed image (Google Imagen) - Multi-stage Dockerfile (node build → nginx serve) - docker-compose.yml with Traefik + Cloudflare TLS - Data pipeline: build_deduped_data.py merges PDF + live scrape - Ynnari merged into Aeldari (shared codex) - Mobile responsive: flex columns, no fixed pixel widths - Color semantics: green=cheaper, red=costlier (consistent everywhere) - 1,449 units across 31 factions
216 lines
7.6 KiB
Python
216 lines
7.6 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Extract the MFM PDF (Full_armies_10th.pdf) into per-faction JSON files,
|
|
mirroring the shape of the live scraper output in /root/wh40k-factions/live/.
|
|
|
|
Output:
|
|
/root/wh40k-factions/pdf/<slug>.json (one per faction)
|
|
/root/wh40k-factions/pdf/_manifest.json
|
|
|
|
Schema per file (matches live scraper):
  {
    "slug": "astra-militarum",
    "name": "Astra Militarum",
    "source": "Full_armies_10th.pdf",
    "version": "4.3",               # parsed from the page 1 header (digits only, no "v" prefix); null if not found
    "extracted_at": "2026-06-17T...",
    "pages": [8, 9],                # all PDF pages mapped to this faction (including pages that add no units)
    "n_units": 84,
    "n_rows": 162,
    "units": {
      "Valkyrie": [
        {"size": "1 model", "pts": 190}
      ],
      "...": [...]
    }
  }
|
|
|
|
Page-to-faction mapping is fixed: page 9 is Astra Militarum forge world
|
|
(was incorrectly mapped to Black Templars in the original parse_pdf.py).
|
|
Detachment-enhancements-only pages are mapped to their parent faction but
|
|
contribute zero units (parser skips them).
|
|
"""
|
|
import json
|
|
import re
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
|
|
import pymupdf
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent))
|
|
# Reuse the parser + regexes + helpers from the original script.
|
|
from parse_pdf import parse_page, slug, clean_line
|
|
|
|
# Input: the cached copy of the MFM PDF that this script reads.
PDF = "/root/.hermes/cache/documents/doc_ed3e1a0bd12e_Full_armies_10th.pdf"

# Output: directory receiving one JSON file per faction plus _manifest.json.
OUT_DIR = Path("/root/wh40k-factions/pdf")

# Created as soon as the module is loaded, so later writes can rely on the
# directory (and any missing parents) already being present.
OUT_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
|
# Slug for each page. Fixed mapping (the original had page 9 wrong:
|
|
# page 9 contains Astra Militarum forge-world units, not Black Templars).
|
|
PAGE_TO_SLUG = {
|
|
1: None, # title page
|
|
2: "adepta-sororitas",
|
|
3: "adeptus-custodes",
|
|
4: "adeptus-mechanicus",
|
|
5: "adeptus-titanicus", # Forge World Titans (Adeptus Titanicus)
|
|
6: "aeldari",
|
|
7: "ynnari", # Ynnari subset of Aeldari
|
|
8: "astra-militarum",
|
|
9: "astra-militarum", # AM forge world (Valkyrie, Wyvern, etc.)
|
|
10: "black-templars",
|
|
11: "blood-angels",
|
|
12: "chaos-daemons",
|
|
13: "chaos-daemons", # detachment enhancements (skipped by parser)
|
|
14: "chaos-knights",
|
|
15: "chaos-space-marines",
|
|
16: "chaos-space-marines", # detachment enhancements
|
|
17: "dark-angels",
|
|
18: "death-guard",
|
|
19: "deathwatch",
|
|
20: "drukhari",
|
|
21: "emperors-children",
|
|
22: "genestealer-cults",
|
|
23: "grey-knights",
|
|
24: "imperial-agents", # rules page (no units)
|
|
25: "imperial-agents", # units
|
|
26: "imperial-knights",
|
|
27: "leagues-of-votann",
|
|
28: "necrons",
|
|
29: "necrons", # detachment enhancements
|
|
30: "orks",
|
|
31: "orks", # detachment enhancements
|
|
32: "space-marines",
|
|
33: "space-marines", # forge world units (Predator etc.)
|
|
34: "space-wolves",
|
|
35: "tau-empire",
|
|
36: "thousand-sons",
|
|
37: "tyranids",
|
|
38: "tyranids", # detachment enhancements
|
|
39: "world-eaters",
|
|
}
|
|
|
|
# Display name per slug. Names containing an apostrophe use the typographic
# (curly, U+2019) form for display. The character is written as an escape so
# it cannot be silently flattened into a straight ASCII quote by an editor or
# copy/paste.
NAME_BY_SLUG = {
    "adepta-sororitas": "Adepta Sororitas",
    "adeptus-custodes": "Adeptus Custodes",
    "adeptus-mechanicus": "Adeptus Mechanicus",
    "adeptus-titanicus": "Adeptus Titanicus",
    "aeldari": "Aeldari",
    "ynnari": "Ynnari",
    "astra-militarum": "Astra Militarum",
    "black-templars": "Black Templars",
    "blood-angels": "Blood Angels",
    "chaos-daemons": "Chaos Daemons",
    "chaos-knights": "Chaos Knights",
    "chaos-space-marines": "Chaos Space Marines",
    "dark-angels": "Dark Angels",
    "death-guard": "Death Guard",
    "deathwatch": "Deathwatch",
    "drukhari": "Drukhari",
    "emperors-children": "Emperor\u2019s Children",
    "genestealer-cults": "Genestealer Cults",
    "grey-knights": "Grey Knights",
    "imperial-agents": "Imperial Agents",
    "imperial-knights": "Imperial Knights",
    "leagues-of-votann": "Leagues of Votann",
    "necrons": "Necrons",
    "orks": "Orks",
    "space-marines": "Space Marines",
    "space-wolves": "Space Wolves",
    "tau-empire": "T\u2019au Empire",
    "thousand-sons": "Thousand Sons",
    "tyranids": "Tyranids",
    "world-eaters": "World Eaters",
}
|
|
|
|
|
|
def extract_version(doc) -> str | None:
|
|
"""Pull the MFM version from the title page (e.g. 'VERSION 4.3')."""
|
|
try:
|
|
text = doc[0].get_text()
|
|
except Exception:
|
|
return None
|
|
m = re.search(r"VERSION\s+([\d.]+)", text, re.IGNORECASE)
|
|
return m.group(1) if m else None
|
|
|
|
|
|
def _merge_costs(existing: list, costs: list) -> int:
    """Append rows from *costs* to *existing*, skipping (size, pts) duplicates.

    Both lists hold dicts with at least the keys ``"size"`` and ``"pts"``.
    *existing* is modified in place.

    Returns:
        The number of rows that were actually appended.
    """
    seen = {(row["size"], row["pts"]) for row in existing}
    added = 0
    for row in costs:
        key = (row["size"], row["pts"])
        if key not in seen:
            existing.append(row)
            seen.add(key)
            added += 1
    return added


def main():
    """Extract every mapped PDF page into per-faction JSON files.

    Steps:
      1. Abort with exit status 1 if the PDF is missing.
      2. Walk all pages and look each one up in ``PAGE_TO_SLUG``. Pages with
         no slug (``None`` or not in the table) are skipped.
      3. Parse each mapped page with ``parse_page`` and merge the resulting
         ``unit -> [{"size", "pts"}, ...]`` rows into that faction's
         accumulator, dropping exact (size, pts) duplicates per unit.
      4. Write ``<slug>.json`` per faction and ``_manifest.json`` to
         ``OUT_DIR``, printing progress along the way.

    Every mapped page is recorded in the faction's ``"pages"`` list, even
    when it adds no rows.
    """
    if not Path(PDF).exists():
        print(f"ERROR: PDF not found at {PDF}", file=sys.stderr)
        sys.exit(1)

    # accumulator per faction slug
    accum: dict[str, dict] = {}

    doc = pymupdf.open(PDF)
    try:
        version = extract_version(doc)
        print(f"PDF: {PDF} pages={doc.page_count} version={version}", flush=True)

        for page_idx in range(doc.page_count):
            page_num = page_idx + 1  # PAGE_TO_SLUG is keyed by 1-based page number
            s = PAGE_TO_SLUG.get(page_num)
            if not s:
                print(f" page {page_num:2d}: skipped (title/blank)", flush=True)
                continue

            if s not in accum:
                accum[s] = {
                    "slug": s,
                    "name": NAME_BY_SLUG[s],
                    "source": Path(PDF).name,
                    "version": version,
                    "extracted_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
                    "pages": [],
                    "n_units": 0,
                    "n_rows": 0,
                    "units": {},
                }
            faction = accum[s]
            faction["pages"].append(page_num)

            page_units = parse_page(doc[page_idx].get_text())
            n_added = 0
            for unit, costs in page_units.items():
                # A unit may appear on several pages of the same faction, so
                # merge into whatever rows were collected earlier.
                n_added += _merge_costs(faction["units"].setdefault(unit, []), costs)
            print(f" page {page_num:2d} -> {s:<22} +{n_added:>3} entries", flush=True)
    finally:
        # Release the document even if parsing a page raises.
        doc.close()

    # finalize counts + write files
    manifest = {"pdf": Path(PDF).name, "version": version, "factions": []}
    total_units = total_rows = 0
    for s, data in sorted(accum.items()):
        data["n_units"] = len(data["units"])
        data["n_rows"] = sum(len(v) for v in data["units"].values())
        out_path = OUT_DIR / f"{s}.json"
        # ensure_ascii=False emits non-ASCII characters verbatim, so the file
        # encoding is pinned to UTF-8 instead of the locale default.
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        total_units += data["n_units"]
        total_rows += data["n_rows"]
        manifest["factions"].append({
            "slug": s, "name": data["name"],
            "pages": data["pages"],
            "n_units": data["n_units"],
            "n_rows": data["n_rows"],
            "file": out_path.name,
        })
        print(f" {s:<22} {data['n_units']:>3} units / {data['n_rows']:>3} rows "
              f"-> {out_path.name}", flush=True)

    manifest["generated_at"] = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
    manifest["n_factions"] = len(manifest["factions"])
    manifest["total_units"] = total_units
    manifest["total_rows"] = total_rows
    with open(OUT_DIR / "_manifest.json", "w", encoding="utf-8") as f:
        json.dump(manifest, f, indent=2, ensure_ascii=False)

    print(f"\nWrote {len(manifest['factions'])} faction files to {OUT_DIR}/")
    print(f"Total: {total_units} units / {total_rows} size-rows")
    print(f"Manifest: {OUT_DIR / '_manifest.json'}")


if __name__ == "__main__":
    main()
|