# wh40k-points-comparator/parse_pdf.py

"""Parse the MFM PDF into structured data.

Output: /root/wh40k-factions/pdf_data.json

Each page in the PDF has lines like:
  UnitName
  <N> models<N dots> <K> pts
  <N> models<N dots> <K> pts
  NextUnitName
  ...
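A unit name is followed by one or more "<n> models / <k> pts" rows; the
model count and the points value may also be split across two lines.
Wargear tables and other sections can appear inline. The trick: after a
skipped section starts, ignore lines until the next faction header;
otherwise any non-empty line that starts with a letter and is not a cost
row is taken as the next unit name.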

Some pages have "DETACHMENT ENHANCEMENTS" sections — skip them.
"""
import json
import re
from pathlib import Path

import pymupdf

# Absolute path of the source PDF that main() opens with pymupdf.
PDF = "/root/.hermes/cache/documents/doc_5ee1da27de4e_Full_armies_10th.pdf"
# Destination of the JSON document that main() writes.
OUT = Path("/root/wh40k-factions/pdf_data.json")

# Maps a 1-based PDF page number to the faction printed on that page.
# main() ignores every page that has no entry here, and pages that share a
# faction name are merged under a single slug in the output.
PAGE_TO_FACTION = {
    2: "Adepta Sororitas",
    3: "Adeptus Custodes",
    4: "Adeptus Mechanicus",
    5: "Adeptus Titanicus",
    6: "Aeldari",
    7: "Ynnari",
    8: "Astra Militarum",
    9: "Black Templars",
    10: "Black Templars",
    11: "Blood Angels",
    12: "Chaos Daemons",
    13: "Chaos Daemons",
    14: "Chaos Knights",
    15: "Chaos Space Marines",
    16: "Chaos Space Marines",
    17: "Dark Angels",
    18: "Death Guard",
    19: "Deathwatch",
    20: "Drukhari",
    21: "Emperor's Children",
    22: "Genestealer Cults",
    23: "Grey Knights",
    24: "Imperial Agents",
    25: "Imperial Agents",
    26: "Imperial Knights",
    27: "Leagues of Votann",
    28: "Necrons",
    29: "Necrons",
    30: "Orks",
    31: "Orks",
    32: "Space Marines",
    33: "Space Marines",
    34: "Space Wolves",
    35: "Tau Empire",          # stored without apostrophe; main() renders the display name as "T'au"
    36: "Thousand Sons",
    37: "Tyranids",
    38: "Tyranids",
    39: "World Eaters",
}

# A complete cost row: "<n> model(s)", then an optional leader made of
# backspaces, dots, whitespace or control characters, then a mandatory
# "<k> pt(s)".  Group 1 is the model count, group 2 the points value.
COST_LINE_RE = re.compile(
    r"^\s*(\d+)\s+models?\s*\x08*[.\s\x00-\x1f]*?(\d+)\s*pts?\s*$",
    re.IGNORECASE,
)
# A row holding only "<n> model(s)" (any count, singular or plural) — the pts
# value is expected on a separate line.  Group 1 is the model count.
SIZE_ONLY_RE = re.compile(r"^\s*(\d+)\s+models?\s*\x08*\s*$", re.IGNORECASE)
# A row holding only "<k> pt(s)".  Group 1 is the points value.
PTS_LINE_RE = re.compile(r"^\s*(\d+)\s*pts?\s*$", re.IGNORECASE)

# Section markers that should be skipped.  parse_page matches them as
# substrings of the upper-cased line, so "ENHANCEMENTS" already covers
# "DETACHMENT ENHANCEMENTS".
# NOTE(review): parse_page also names "FACTION PACK" as a header that ends a
# skipped section — confirm which of the two roles is intended.
SKIP_SECTIONS = (
    "WARGEAR OPTIONS", "DETACHMENT ENHANCEMENTS", "STRATAGEMS",
    "ARMY RULE", "ENHANCEMENTS", "LITANIES", "PSYCHIC", "FACTION PACK",
)

# A unit-name candidate is a short line that begins with a letter and carries
# none of the header / section keywords.
def is_unit_name(s: str) -> bool:
    """Return True when *s* is plausible as a unit name.

    The text is stripped first.  It is rejected when it is empty, longer
    than 80 characters, does not begin with an alphabetic character (either
    case), or contains one of the header / section keywords in any casing.
    """
    candidate = s.strip()
    if not candidate or len(candidate) > 80 or not candidate[0].isalpha():
        return False

    # Upper-case once so the keyword test is case-insensitive.
    shouted = candidate.upper()
    blocked = (
        "CODEX:", "INDEX:", "FACTION", "WARGEAR",
        "DETACHMENT", "ENHANCEMENT", "STRATAGEM",
    )
    return not any(marker in shouted for marker in blocked)


def slug(name: str) -> str:
    """Turn a display name into a lowercase, hyphen-separated identifier.

    Curly and straight apostrophes are dropped outright; every other run of
    characters outside ``a-z0-9`` becomes a single hyphen, and hyphens at
    either end are trimmed.
    """
    lowered = name.lower()
    for apostrophe in ("\u2019", "'"):
        lowered = lowered.replace(apostrophe, "")
    return re.sub(r"[^a-z0-9]+", "-", lowered).strip("-")


def clean_line(s: str) -> str:
    """Normalise one raw text line extracted from the PDF.

    Backspace (0x08) and form-feed (0x0C) characters are deleted, runs of
    two or more dots / U+FFFD replacement characters (dot leaders) are
    removed, runs of two or more whitespace characters collapse to a single
    space, and the result is stripped.
    """
    # Delete the two control characters in a single pass.
    without_controls = s.translate({0x08: None, 0x0C: None})
    # Only runs of 2+ count as a leader, so a lone "." (e.g. "St.") survives.
    without_leaders = re.sub(r"[\.\uFFFD]{2,}", "", without_controls)
    collapsed = re.sub(r"\s{2,}", " ", without_leaders)
    return collapsed.strip()


def parse_page(text: str) -> dict[str, list[dict]]:
    """Extract unit costs from the text of one PDF page.

    Every line is normalised with ``clean_line`` and then classified, in
    this order, as a faction header, a skip-section marker, a cost row
    (size and pts together), a size-only row, a pts-only row, or a unit
    name.

    Args:
        text: Plain text of a single page, lines separated by ``"\\n"``.

    Returns:
        A mapping of unit name to its cost entries in order of appearance.
        Each entry is ``{"size": "<n> models", "pts": <int>}``.  A unit
        only appears in the mapping once at least one cost was recorded.
    """
    units: dict[str, list[dict]] = {}
    current_unit: str | None = None
    pending_size: str | None = None
    skip = False

    for raw in text.split("\n"):
        line = clean_line(raw)
        if not line:
            continue
        upper = line.upper()

        # Headers take precedence over skip markers, so a header line always
        # re-enables parsing even when it also contains a skip keyword.
        # Testing the skip markers first made the "FACTION PACK" header
        # start a skipped section instead of ending one.
        is_header = (
            "CODEX" in upper or "INDEX" in upper or "FACTION PACK" in upper
        )
        if is_header:
            skip = False
        elif any(kw in upper for kw in SKIP_SECTIONS):
            # Entering a section without unit costs: forget partial state.
            skip = True
            current_unit = None
            pending_size = None
            continue
        elif skip:
            continue

        # Pure header lines carry no unit data.
        if upper.startswith(("CODEX:", "INDEX:", "FACTION PACK")):
            continue

        # Cost row carrying both the size and the pts value.
        m = COST_LINE_RE.match(line)
        if m:
            if current_unit:
                # The size label always uses the plural "models", even
                # when the count is 1.
                units.setdefault(current_unit, []).append({
                    "size": f"{m.group(1)} models",
                    "pts": int(m.group(2)),
                })
            pending_size = None
            # Do NOT reset current_unit — the next line may be another
            # size variant for the same unit (e.g. "10 models ... 140 pts"
            # right after "3 models ... 45 pts").  current_unit changes
            # only when a new unit-name line is encountered.
            continue

        # Size-only row: remember it until a pts-only row arrives.
        m = SIZE_ONLY_RE.match(line)
        if m:
            pending_size = f"{m.group(1)} models"
            continue

        # Pts-only row: completes a pending size when one is known.
        m = PTS_LINE_RE.match(line)
        if m:
            if pending_size and current_unit:
                units.setdefault(current_unit, []).append({
                    "size": pending_size,
                    "pts": int(m.group(1)),
                })
            # Either way the pending size is consumed or dropped;
            # current_unit is kept for the same reason as above.
            pending_size = None
            continue

        # Otherwise this is a unit name (or noise that is ignored).
        if is_unit_name(line):
            current_unit = line
            pending_size = None

    return units


def main():
    """Parse every mapped page of the PDF and write the merged data as JSON.

    Pages listed in ``PAGE_TO_FACTION`` are parsed with ``parse_page`` and
    grouped by the slug of their faction name.  Cost entries of a unit are
    de-duplicated on ``(size, pts)`` so a unit repeated on one page or
    across several pages of the same faction is stored once.  The result is
    written to ``OUT`` as UTF-8 and a short per-faction summary is printed.
    """
    out: dict[str, dict] = {}

    # The context manager closes the document even if parsing a page raises.
    with pymupdf.open(PDF) as doc:
        for page_idx in range(doc.page_count):
            page_num = page_idx + 1  # PAGE_TO_FACTION keys are 1-based
            faction = PAGE_TO_FACTION.get(page_num)
            if faction is None:
                continue
            s = slug(faction)
            entry = out.setdefault(
                s,
                {"name": faction.replace("Tau", "T'au"), "pages": [], "units": {}},
            )
            entry["pages"].append(page_num)

            page_units = parse_page(doc[page_idx].get_text())
            for unit, costs in page_units.items():
                existing = entry["units"].setdefault(unit, [])
                seen = {(c["size"], c["pts"]) for c in existing}
                for c in costs:
                    key = (c["size"], c["pts"])
                    if key not in seen:
                        existing.append(c)
                        seen.add(key)

    # The JSON is emitted with ensure_ascii=False, so the encoding must be
    # explicit; the locale default may be unable to encode non-ASCII names.
    OUT.parent.mkdir(parents=True, exist_ok=True)
    OUT.write_text(
        json.dumps(out, indent=2, ensure_ascii=False), encoding="utf-8"
    )
    print(f"wrote {OUT}")
    print(f"factions parsed: {len(out)}")
    for s, data in out.items():
        print(f"  {s:25s}: {len(data['units']):3d} units ({', '.join(map(str, data['pages']))})")


# Run the conversion only when executed as a script, not on import.
if __name__ == "__main__":
    main()