"""Parse the MFM PDF into structured data. Output: /root/wh40k-factions/pdf_data.json Each page in the PDF has lines like: UnitName models pts models pts NextUnitName ... A unit is followed by one or more " models / pts" rows. Sometimes unit names are followed by additional cost lines, sometimes wargear tables are inline. The trick: skip until we find a header, then a unit name is the next non-empty, non-numeric line. Some pages have "DETACHMENT ENHANCEMENTS" sections — skip them. """ import json import re from pathlib import Path import pymupdf PDF = "/root/.hermes/cache/documents/doc_5ee1da27de4e_Full_armies_10th.pdf" OUT = Path("/root/wh40k-factions/pdf_data.json") PAGE_TO_FACTION = { 2: "Adepta Sororitas", 3: "Adeptus Custodes", 4: "Adeptus Mechanicus", 5: "Adeptus Titanicus", 6: "Aeldari", 7: "Ynnari", 8: "Astra Militarum", 9: "Black Templars", 10: "Black Templars", 11: "Blood Angels", 12: "Chaos Daemons", 13: "Chaos Daemons", 14: "Chaos Knights", 15: "Chaos Space Marines", 16: "Chaos Space Marines", 17: "Dark Angels", 18: "Death Guard", 19: "Deathwatch", 20: "Drukhari", 21: "Emperor's Children", 22: "Genestealer Cults", 23: "Grey Knights", 24: "Imperial Agents", 25: "Imperial Agents", 26: "Imperial Knights", 27: "Leagues of Votann", 28: "Necrons", 29: "Necrons", 30: "Orks", 31: "Orks", 32: "Space Marines", 33: "Space Marines", 34: "Space Wolves", 35: "Tau Empire", # PDF uses straight apostrophe, normalize 36: "Thousand Sons", 37: "Tyranids", 38: "Tyranids", 39: "World Eaters", } # A line that says " models" or "1 model" optionally followed by a pts value COST_LINE_RE = re.compile( r"^\s*(\d+)\s+models?\s*\x08*[.\s\x00-\x1f]*?(\d+)\s*pts?\s*$", re.IGNORECASE, ) # Just a "1 model" line (singular model count) — pts may be on a separate line SIZE_ONLY_RE = re.compile(r"^\s*(\d+)\s+models?\s*\x08*\s*$", re.IGNORECASE) PTS_LINE_RE = re.compile(r"^\s*(\d+)\s*pts?\s*$", re.IGNORECASE) # Section markers that should be skipped SKIP_SECTIONS = ( "WARGEAR OPTIONS", "DETACHMENT ENHANCEMENTS", "STRATAGEMS", "ARMY RULE", "ENHANCEMENTS", "LITANIES", "PSYCHIC", "FACTION PACK", ) # Unit names start with a capital letter and are not too long. def is_unit_name(s: str) -> bool: s = s.strip() if not s: return False if len(s) > 80: return False if not s[0].isalpha(): return False # reject lines that look numeric / header if any(kw in s.upper() for kw in ("CODEX:", "INDEX:", "FACTION", "WARGEAR", "DETACHMENT", "ENHANCEMENT", "STRATAGEM")): return False return True def slug(name: str) -> str: s = name.lower().replace("\u2019", "").replace("'", "") s = re.sub(r"[^a-z0-9]+", "-", s).strip("-") return s def clean_line(s: str) -> str: """Strip control chars and trailing dot leaders (PDFium renders them as 0xFFFD).""" s = s.replace("\x08", "").replace("\x0c", "") # remove dot leaders (literal dots, 0xFFFD replacement chars, and spaces) s = re.sub(r"[\.\uFFFD]{2,}", "", s) s = re.sub(r"\s{2,}", " ", s) # collapse double spaces return s.strip() def parse_page(text: str) -> dict[str, list[dict]]: units: dict[str, list[dict]] = {} raw_lines = text.split("\n") current_unit: str | None = None pending_size: str | None = None skip = False for raw in raw_lines: line = clean_line(raw) if not line: continue upper = line.upper() # Section break if any(kw in upper for kw in SKIP_SECTIONS): skip = True current_unit = None pending_size = None continue if skip: # section ends when we see a new CODEX/INDEX/FACTION PACK header if "CODEX" in upper or "INDEX" in upper or "FACTION PACK" in upper: skip = False else: continue if upper.startswith("CODEX:") or upper.startswith("INDEX:"): continue # Try cost line with both size and pts m = COST_LINE_RE.match(line) if m: if current_unit: units.setdefault(current_unit, []).append({ "size": f"{m.group(1)} models", "pts": int(m.group(2)), }) pending_size = None # Do NOT reset current_unit — the next line may be another # size variant for the same unit (e.g. "10 models ... 140 pts" # right after "3 models ... 45 pts"). current_unit is cleared # only when we encounter a new unit-name line. continue # Try size-only line m = SIZE_ONLY_RE.match(line) if m: pending_size = f"{m.group(1)} models" continue # Try pts-only line (completes a pending size) m = PTS_LINE_RE.match(line) if m and pending_size and current_unit: units.setdefault(current_unit, []).append({ "size": pending_size, "pts": int(m.group(1)), }) pending_size = None # Do NOT reset current_unit — same reason as above. continue if m: # pts without a known size — drop pending pending_size = None continue # Otherwise this is a unit name (or noise) if is_unit_name(line): current_unit = line pending_size = None return units def main(): doc = pymupdf.open(PDF) out: dict[str, dict] = {} for page_idx in range(doc.page_count): page_num = page_idx + 1 if page_num not in PAGE_TO_FACTION: continue faction = PAGE_TO_FACTION[page_num] s = slug(faction) if s not in out: out[s] = {"name": faction.replace("Tau", "T'au"), "pages": [], "units": {}} out[s]["pages"].append(page_num) page_units = parse_page(doc[page_idx].get_text()) for unit, costs in page_units.items(): existing = out[s]["units"].get(unit, []) seen = {(c["size"], c["pts"]) for c in existing} for c in costs: if (c["size"], c["pts"]) not in seen: existing.append(c) seen.add((c["size"], c["pts"])) out[s]["units"][unit] = existing OUT.write_text(json.dumps(out, indent=2, ensure_ascii=False)) print(f"wrote {OUT}") print(f"factions parsed: {len(out)}") for s, data in out.items(): print(f" {s:25s}: {len(data['units']):3d} units ({', '.join(map(str, data['pages']))})") if __name__ == "__main__": main()