- React + MUI DataGrid app with faction filter, search, change filter
- Biggest movers cards (drops/rises) scoped to current filter view
- Historical points graph modal (5 MFM versions: 1.14 → current)
- URL state sync (faction, dir, q params — shareable URLs)
- Grimdark favicon + OG embed image (Google Imagen)
- Multi-stage Dockerfile (node build → nginx serve)
- docker-compose.yml with Traefik + Cloudflare TLS
- Data pipeline: build_deduped_data.py merges PDF + live scrape
- Ynnari merged into Aeldari (shared codex)
- Mobile responsive: flex columns, no fixed pixel widths
- Color semantics: green=cheaper, red=costlier (consistent everywhere)
- 1,449 units across 31 factions
215 lines
6.7 KiB
Python
215 lines
6.7 KiB
Python
"""Parse the MFM PDF into structured data.

Output: /root/wh40k-factions/pdf_data.json

Each page in the PDF has lines like:

    UnitName
        <N> models<N dots> <K> pts
        <N> models<N dots> <K> pts
    NextUnitName
    ...

A unit is followed by one or more "<n> models / <k> pts" rows. Sometimes
unit names are followed by additional cost lines, sometimes wargear tables
are inline. The trick: skip until we find a header, then a unit name is
the next non-empty, non-numeric line.

Some pages have "DETACHMENT ENHANCEMENTS" sections — skip them.
"""
|
|
import json
|
|
import re
|
|
from pathlib import Path
|
|
|
|
import pymupdf
|
|
|
|
# Absolute path of the source PDF that main() opens with pymupdf.
PDF = "/root/.hermes/cache/documents/doc_5ee1da27de4e_Full_armies_10th.pdf"
# Destination file for the JSON document that main() writes.
OUT = Path("/root/wh40k-factions/pdf_data.json")
|
|
|
|
# Maps 1-based PDF page numbers to the faction printed on that page.
# Pages missing from this mapping are ignored by main(). A faction that
# spans several pages lists every page; main() merges them under one slug.
PAGE_TO_FACTION = {
    page: faction
    for faction, pages in (
        ("Adepta Sororitas", (2,)),
        ("Adeptus Custodes", (3,)),
        ("Adeptus Mechanicus", (4,)),
        ("Adeptus Titanicus", (5,)),
        ("Aeldari", (6,)),
        ("Ynnari", (7,)),
        ("Astra Militarum", (8,)),
        ("Black Templars", (9, 10)),
        ("Blood Angels", (11,)),
        ("Chaos Daemons", (12, 13)),
        ("Chaos Knights", (14,)),
        ("Chaos Space Marines", (15, 16)),
        ("Dark Angels", (17,)),
        ("Death Guard", (18,)),
        ("Deathwatch", (19,)),
        ("Drukhari", (20,)),
        ("Emperor's Children", (21,)),
        ("Genestealer Cults", (22,)),
        ("Grey Knights", (23,)),
        ("Imperial Agents", (24, 25)),
        ("Imperial Knights", (26,)),
        ("Leagues of Votann", (27,)),
        ("Necrons", (28, 29)),
        ("Orks", (30, 31)),
        ("Space Marines", (32, 33)),
        ("Space Wolves", (34,)),
        # Stored without an apostrophe; main() rewrites "Tau" to "T'au"
        # when it builds the display name.
        ("Tau Empire", (35,)),
        ("Thousand Sons", (36,)),
        ("Tyranids", (37, 38)),
        ("World Eaters", (39,)),
    )
    for page in pages
}
|
|
|
|
# A complete cost row on one line: "<n> model(s)", then optional filler
# (backspaces, dots, whitespace, control characters), then "<k> pt(s)".
# Group 1 is the model count, group 2 is the points value. Case-insensitive.
COST_LINE_RE = re.compile(
    r"^\s*(\d+)\s+models?\s*\x08*[.\s\x00-\x1f]*?(\d+)\s*pts?\s*$",
    re.IGNORECASE,
)
# A "<n> model(s)" row with no points value on the same line — the pts may
# be on a separate line. Group 1 is the model count.
SIZE_ONLY_RE = re.compile(r"^\s*(\d+)\s+models?\s*\x08*\s*$", re.IGNORECASE)
# A bare "<k> pt(s)" row. Group 1 is the points value; parse_page pairs it
# with a preceding size-only row.
PTS_LINE_RE = re.compile(r"^\s*(\d+)\s*pts?\s*$", re.IGNORECASE)
|
|
|
|
# Section markers that should be skipped. parse_page tests each marker as a
# substring of the upper-cased line, so any line that merely contains one of
# these phrases starts a skipped section.
SKIP_SECTIONS = (
    "WARGEAR OPTIONS", "DETACHMENT ENHANCEMENTS", "STRATAGEMS",
    "ARMY RULE", "ENHANCEMENTS", "LITANIES", "PSYCHIC", "FACTION PACK",
)
|
|
|
|
def is_unit_name(s: str) -> bool:
    """Return True when *s* looks like a unit-name line.

    After stripping surrounding whitespace the text must be non-empty, at
    most 80 characters long, and begin with an alphabetic character (either
    case). Lines containing a header keyword such as "CODEX:" or "WARGEAR"
    (compared case-insensitively) are rejected.
    """
    candidate = s.strip()
    # Cheap structural checks first: empty, overlong, or not starting with
    # a letter (which rules out numeric rows).
    if not candidate or len(candidate) > 80 or not candidate[0].isalpha():
        return False

    header_keywords = (
        "CODEX:", "INDEX:", "FACTION", "WARGEAR",
        "DETACHMENT", "ENHANCEMENT", "STRATAGEM",
    )
    shouted = candidate.upper()
    return not any(keyword in shouted for keyword in header_keywords)
|
|
|
|
|
|
def slug(name: str) -> str:
    """Return a lowercase, hyphen-separated identifier for *name*.

    Apostrophes (typographic U+2019 and straight) are dropped outright so
    that e.g. "Emperor's" becomes "emperors"; every other run of characters
    outside ``a-z0-9`` collapses to a single hyphen, and leading/trailing
    hyphens are trimmed.
    """
    lowered = name.lower()
    for apostrophe in ("\u2019", "'"):
        lowered = lowered.replace(apostrophe, "")
    hyphenated = re.sub(r"[^a-z0-9]+", "-", lowered)
    return hyphenated.strip("-")
|
|
|
|
|
|
def clean_line(s: str) -> str:
    """Normalise one raw line of extracted PDF text.

    Removes backspace (0x08) and form-feed (0x0C) characters, deletes dot
    leaders (runs of two or more literal dots and/or U+FFFD replacement
    characters), collapses any run of two or more whitespace characters to
    a single space, and strips the ends.
    """
    # One pass that deletes both control characters.
    without_controls = s.translate({0x08: None, 0x0C: None})
    # A single dot or single U+FFFD is kept; only runs of 2+ count as leaders.
    without_leaders = re.sub(r"[\.\uFFFD]{2,}", "", without_controls)
    return re.sub(r"\s{2,}", " ", without_leaders).strip()
|
|
|
|
|
|
def parse_page(text: str) -> dict[str, list[dict]]:
    """Extract unit costs from the text of a single PDF page.

    The text is split on newlines and each line is normalised with
    ``clean_line``. The loop tracks three pieces of state: the most recent
    unit-name line, a model-count waiting for its points value, and whether
    the current position is inside a skipped section.

    Returns a dict mapping each unit name to a list of
    ``{"size": "<n> models", "pts": <int>}`` entries, in the order seen.
    Units that never receive a cost row do not appear in the result.
    """
    parsed: dict[str, list[dict]] = {}
    unit_name: str | None = None
    size_waiting: str | None = None
    in_skipped_section = False

    for raw_line in text.split("\n"):
        cleaned = clean_line(raw_line)
        if not cleaned:
            continue
        shouted = cleaned.upper()

        # A skip marker anywhere in the line opens a skipped section and
        # forgets whatever unit/size was in progress.
        if any(marker in shouted for marker in SKIP_SECTIONS):
            in_skipped_section = True
            unit_name = None
            size_waiting = None
            continue

        if in_skipped_section:
            # The skipped section ends at the next CODEX/INDEX header line.
            # NOTE(review): the "FACTION PACK" test looks unreachable while
            # "FACTION PACK" is also listed in SKIP_SECTIONS (such a line is
            # consumed by the check above) — confirm which was intended.
            if not (
                "CODEX" in shouted
                or "INDEX" in shouted
                or "FACTION PACK" in shouted
            ):
                continue
            in_skipped_section = False

        # Header lines themselves carry no unit data.
        if shouted.startswith(("CODEX:", "INDEX:")):
            continue

        # Size and points on the same line.
        if (full_row := COST_LINE_RE.match(cleaned)) is not None:
            if unit_name:
                parsed.setdefault(unit_name, []).append({
                    "size": f"{full_row.group(1)} models",
                    "pts": int(full_row.group(2)),
                })
            size_waiting = None
            # unit_name is deliberately kept: the next line may be another
            # size variant of the same unit. It only changes when a new
            # unit-name line (or a skip marker) is seen.
            continue

        # Size alone; remember it until a points line arrives.
        if (size_row := SIZE_ONLY_RE.match(cleaned)) is not None:
            size_waiting = f"{size_row.group(1)} models"
            continue

        # Points alone: completes a waiting size if there is one (and a
        # unit to attach it to); otherwise the value is discarded. Either
        # way any waiting size is cleared and unit_name is kept.
        if (pts_row := PTS_LINE_RE.match(cleaned)) is not None:
            if size_waiting and unit_name:
                parsed.setdefault(unit_name, []).append({
                    "size": size_waiting,
                    "pts": int(pts_row.group(1)),
                })
            size_waiting = None
            continue

        # Anything else is either a unit name or noise to ignore.
        if is_unit_name(cleaned):
            unit_name = cleaned
            size_waiting = None

    return parsed
|
|
|
|
|
|
def main() -> None:
    """Parse every mapped page of the PDF and write the result to ``OUT``.

    For each page whose 1-based number appears in ``PAGE_TO_FACTION`` the
    page text is parsed with ``parse_page`` and merged into a per-faction
    entry keyed by ``slug(faction)``. Each entry holds the display name
    (with "Tau" rewritten to "T'au"), the list of contributing pages, and
    a ``units`` dict of cost entries. A (size, pts) pair already recorded
    for a unit is not added twice.

    The result is written to ``OUT`` as indented UTF-8 JSON and a short
    per-faction summary is printed.
    """
    out: dict[str, dict] = {}

    # The document is a context manager: leaving the block closes the file
    # even if parsing a page raises.
    with pymupdf.open(PDF) as doc:
        for page_idx in range(doc.page_count):
            page_num = page_idx + 1
            faction = PAGE_TO_FACTION.get(page_num)
            if faction is None:
                continue

            s = slug(faction)
            if s not in out:
                out[s] = {
                    "name": faction.replace("Tau", "T'au"),
                    "pages": [],
                    "units": {},
                }
            entry = out[s]
            entry["pages"].append(page_num)

            page_units = parse_page(doc[page_idx].get_text())
            for unit, costs in page_units.items():
                existing = entry["units"].setdefault(unit, [])
                seen = {(c["size"], c["pts"]) for c in existing}
                for c in costs:
                    key = (c["size"], c["pts"])
                    if key not in seen:
                        existing.append(c)
                        seen.add(key)

    # ensure_ascii=False emits non-ASCII characters verbatim, so the file
    # encoding must be pinned instead of inherited from the locale.
    OUT.write_text(
        json.dumps(out, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    print(f"wrote {OUT}")
    print(f"factions parsed: {len(out)}")
    for s, data in out.items():
        print(f" {s:25s}: {len(data['units']):3d} units ({', '.join(map(str, data['pages']))})")
|
|
|
|
|
|
# Run the parser only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|