Files
wh40k-points-comparator/parse_pdf.py
root 38bffa491c Initial commit: WH40K Points Comparator
- React + MUI DataGrid app with faction filter, search, change filter
- Biggest movers cards (drops/rises) scoped to current filter view
- Historical points graph modal (5 MFM versions: 1.14 → current)
- URL state sync (faction, dir, q params — shareable URLs)
- Grimdark favicon + OG embed image (Google Imagen)
- Multi-stage Dockerfile (node build → nginx serve)
- docker-compose.yml with Traefik + Cloudflare TLS
- Data pipeline: build_deduped_data.py merges PDF + live scrape
- Ynnari merged into Aeldari (shared codex)
- Mobile responsive: flex columns, no fixed pixel widths
- Color semantics: green=cheaper, red=costlier (consistent everywhere)
- 1,449 units across 31 factions
2026-06-18 02:42:29 +00:00

215 lines
6.7 KiB
Python

"""Parse the MFM PDF into structured data.
Output: /root/wh40k-factions/pdf_data.json
Each page in the PDF has lines like:
UnitName
<N> models<N dots> <K> pts
<N> models<N dots> <K> pts
NextUnitName
...
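A unit name is followed by one or more "<n> models ... <k> pts" rows; the
model count and the points value may also arrive on two separate lines.
Any non-empty line that starts with a letter and is not a header or section
marker is taken as the next unit name, and every cost row after it is
attached to that unit until another unit name appears.
Wargear tables and enhancement lists can appear inline: sections such as
"DETACHMENT ENHANCEMENTS" or "WARGEAR OPTIONS" are skipped until the next
CODEX/INDEX header.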
"""
import json
import re
from pathlib import Path
import pymupdf
# Input PDF; main() opens this path with pymupdf.
PDF = "/root/.hermes/cache/documents/doc_5ee1da27de4e_Full_armies_10th.pdf"
# Destination of the JSON document written by main().
OUT = Path("/root/wh40k-factions/pdf_data.json")
# 1-based PDF page number -> faction name. main() ignores every page that is
# not listed here, and pages mapped to the same faction are merged into one
# output entry keyed by slug(faction).
PAGE_TO_FACTION = {
    2: "Adepta Sororitas",
    3: "Adeptus Custodes",
    4: "Adeptus Mechanicus",
    5: "Adeptus Titanicus",
    6: "Aeldari",
    7: "Ynnari",
    8: "Astra Militarum",
    9: "Black Templars",
    10: "Black Templars",
    11: "Blood Angels",
    12: "Chaos Daemons",
    13: "Chaos Daemons",
    14: "Chaos Knights",
    15: "Chaos Space Marines",
    16: "Chaos Space Marines",
    17: "Dark Angels",
    18: "Death Guard",
    19: "Deathwatch",
    20: "Drukhari",
    21: "Emperor's Children",
    22: "Genestealer Cults",
    23: "Grey Knights",
    24: "Imperial Agents",
    25: "Imperial Agents",
    26: "Imperial Knights",
    27: "Leagues of Votann",
    28: "Necrons",
    29: "Necrons",
    30: "Orks",
    31: "Orks",
    32: "Space Marines",
    33: "Space Marines",
    34: "Space Wolves",
    # Kept without an apostrophe here; main() rewrites "Tau" to "T'au" when it
    # stores the display name.
    35: "Tau Empire", # PDF uses straight apostrophe, normalize
    36: "Thousand Sons",
    37: "Tyranids",
    38: "Tyranids",
    39: "World Eaters",
}
# A complete cost row on one line: "<n> model(s)", then optional filler
# (backspaces, dots, whitespace, control characters), then "<k> pt(s)".
# Both parts are required. Group 1 is the model count, group 2 the points.
COST_LINE_RE = re.compile(
    r"^\s*(\d+)\s+models?\s*\x08*[.\s\x00-\x1f]*?(\d+)\s*pts?\s*$",
    re.IGNORECASE,
)
# A "<n> model(s)" line with no points value (any count, singular or plural);
# parse_page() keeps it as a pending size until a pts-only line completes it.
SIZE_ONLY_RE = re.compile(r"^\s*(\d+)\s+models?\s*\x08*\s*$", re.IGNORECASE)
# A bare "<k> pt(s)" line; group 1 is the points value.
PTS_LINE_RE = re.compile(r"^\s*(\d+)\s*pts?\s*$", re.IGNORECASE)
# Section markers. parse_page() tests each one as a substring of the
# upper-cased line; a hit suspends parsing of the lines that follow.
SKIP_SECTIONS = (
    "WARGEAR OPTIONS", "DETACHMENT ENHANCEMENTS", "STRATAGEMS",
    "ARMY RULE", "ENHANCEMENTS", "LITANIES", "PSYCHIC", "FACTION PACK",
)
def is_unit_name(s: str) -> bool:
    """Return True when *s* could be a unit-name line.

    After stripping surrounding whitespace the text must be non-empty, at
    most 80 characters long, start with a letter, and contain none of the
    header/section keywords (compared case-insensitively).
    """
    candidate = s.strip()
    # The emptiness test comes first so that candidate[0] is always valid.
    if not candidate or len(candidate) > 80 or not candidate[0].isalpha():
        return False
    shouted = candidate.upper()
    header_markers = (
        "CODEX:", "INDEX:", "FACTION", "WARGEAR",
        "DETACHMENT", "ENHANCEMENT", "STRATAGEM",
    )
    for marker in header_markers:
        if marker in shouted:
            return False
    return True
def slug(name: str) -> str:
    """Turn *name* into a lowercase, hyphen-separated identifier.

    Apostrophes (both U+2019 and the straight ASCII one) are dropped rather
    than replaced, every other run of characters outside a-z / 0-9 becomes a
    single "-", and leading/trailing hyphens are trimmed.
    """
    lowered = name.lower()
    for apostrophe in ("\u2019", "'"):
        lowered = lowered.replace(apostrophe, "")
    hyphenated = re.sub(r"[^a-z0-9]+", "-", lowered)
    return hyphenated.strip("-")
def clean_line(s: str) -> str:
    """Normalise one raw line of extracted PDF text.

    Backspace (0x08) and form-feed (0x0C) characters are deleted, every run
    of two or more dot-leader characters (a literal "." or U+FFFD) is
    removed, runs of whitespace are squeezed to a single space, and the
    result is trimmed at both ends.
    """
    without_controls = s.translate({0x08: None, 0x0C: None})
    # A lone "." or U+FFFD is left alone; only runs of 2+ count as a leader.
    without_leaders = re.sub(r"[\.\uFFFD]{2,}", "", without_controls)
    squeezed = re.sub(r"\s{2,}", " ", without_leaders)
    return squeezed.strip()
def parse_page(text: str) -> dict[str, list[dict]]:
    """Extract unit costs from the text of one PDF page.

    Each line is normalised with clean_line() and the page is scanned top to
    bottom. The most recent line accepted by is_unit_name() is the "current
    unit"; every cost row seen afterwards is attached to it until another
    unit name appears. A cost row is either one "<n> models ... <k> pts"
    line, or a "<n> models" line completed by a later "<k> pts" line.

    A line containing any SKIP_SECTIONS marker clears the current unit and
    suspends parsing until a line containing "CODEX" or "INDEX" is seen.

    Args:
        text: Page text, one entry per newline-separated line.

    Returns:
        Mapping of unit name to its cost entries in order of appearance.
        Each entry is {"size": "<n> models", "pts": <int>}; the size label
        always uses the plural "models", even for a count of 1.
    """
    units: dict[str, list[dict]] = {}
    current_unit: str | None = None
    pending_size: str | None = None
    skip = False

    for raw in text.split("\n"):
        line = clean_line(raw)
        if not line:
            continue
        upper = line.upper()

        # Section break: stop collecting and forget any partial state.
        if any(kw in upper for kw in SKIP_SECTIONS):
            skip = True
            current_unit = None
            pending_size = None
            continue

        if skip:
            # A skipped section ends at the next CODEX/INDEX header.
            # NOTE(review): "FACTION PACK" used to be tested here as well,
            # but it is also a SKIP_SECTIONS marker, so such a line never
            # reaches this point; the dead test was removed. Confirm whether
            # a FACTION PACK header was meant to end a skip instead.
            if "CODEX" not in upper and "INDEX" not in upper:
                continue
            skip = False

        # Header lines carry no unit or cost data.
        if upper.startswith(("CODEX:", "INDEX:")):
            continue

        # Cost line carrying both the size and the points value.
        m = COST_LINE_RE.match(line)
        if m:
            if current_unit:
                units.setdefault(current_unit, []).append({
                    "size": f"{m.group(1)} models",
                    "pts": int(m.group(2)),
                })
            pending_size = None
            # current_unit is deliberately kept: the next line may be another
            # size variant of the same unit (e.g. "10 models ... 140 pts"
            # right after "3 models ... 45 pts").
            continue

        # Size-only line: remember it until a pts-only line completes it.
        m = SIZE_ONLY_RE.match(line)
        if m:
            pending_size = f"{m.group(1)} models"
            continue

        # Pts-only line: completes a pending size when there is one for a
        # known unit; otherwise the value is dropped. Either way the pending
        # size is consumed and current_unit is kept.
        m = PTS_LINE_RE.match(line)
        if m:
            if pending_size and current_unit:
                units.setdefault(current_unit, []).append({
                    "size": pending_size,
                    "pts": int(m.group(1)),
                })
            pending_size = None
            continue

        # Anything else is either a new unit name or noise to ignore.
        if is_unit_name(line):
            current_unit = line
            pending_size = None

    return units
def main():
    """Parse every mapped page of PDF and write the merged result to OUT.

    Pages listed in PAGE_TO_FACTION are parsed with parse_page() and grouped
    by slug(faction). Each output entry holds the display name, the page
    numbers it came from, and a unit -> cost-list mapping in which duplicate
    (size, pts) pairs are kept only once. A short per-faction summary is
    printed after the file is written.
    """
    out: dict[str, dict] = {}
    # The context manager closes the document even if parsing raises.
    with pymupdf.open(PDF) as doc:
        for page_idx in range(doc.page_count):
            page_num = page_idx + 1  # PAGE_TO_FACTION is keyed by 1-based page
            if page_num not in PAGE_TO_FACTION:
                continue
            faction = PAGE_TO_FACTION[page_num]
            s = slug(faction)
            if s not in out:
                out[s] = {"name": faction.replace("Tau", "T'au"), "pages": [], "units": {}}
            out[s]["pages"].append(page_num)
            page_units = parse_page(doc[page_idx].get_text())
            for unit, costs in page_units.items():
                existing = out[s]["units"].setdefault(unit, [])
                # De-duplicate across pages: the same (size, pts) pair may be
                # repeated when a faction spans several pages.
                seen = {(c["size"], c["pts"]) for c in existing}
                for c in costs:
                    key = (c["size"], c["pts"])
                    if key not in seen:
                        existing.append(c)
                        seen.add(key)

    # Make sure the target directory exists so the parsed data is not lost
    # to a FileNotFoundError at the very last step.
    OUT.parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False emits raw non-ASCII characters, so the encoding is
    # fixed to UTF-8 rather than left to the platform locale.
    OUT.write_text(json.dumps(out, indent=2, ensure_ascii=False), encoding="utf-8")
    print(f"wrote {OUT}")
    print(f"factions parsed: {len(out)}")
    for s, data in out.items():
        print(f" {s:25s}: {len(data['units']):3d} units ({', '.join(map(str, data['pages']))})")


if __name__ == "__main__":
    main()