wh40k-points-comparator/build_deduped_data.py

#!/usr/bin/env python3
"""
Rebuild react-app/public/data.json with:
  1. Filter out weapon-upgrade rows (size starts with "per " or "+ ").
  2. Collapse each (faction, name) into ONE row, with a `sizes` array of
     {size, original, new, change_pct, change_pts, tier, history} variants.
  3. Only keep sizes that the MFM (new/live) actually listed.
  4. Fill missing originals by scaling proportionally to model count.
  5. Build a `history` array per size with {date, version, pts} from every
     source listed in VERSIONS:
     - v1.14 PDF (Dec 1, 2024)
     - v2.3 PDF (Mar 1, 2025)
     - v3.2 PDF (Aug 20, 2025)
     - v4.3 PDF (Jun 5, 2026)
     - Live MFM (Jun 17, 2026)

The history data does NOT appear in the table — it's only used when the user
clicks a unit name to open the graph modal.
"""
import json
import re
import time
from pathlib import Path
from collections import defaultdict

# Base directory; every input and output path below is derived from it.
ROOT = Path("/root/wh40k-factions")
PDF32_DIR = ROOT / "pdf32"   # v3.2
PDF_DIR = ROOT / "pdf"        # v4.3
LIVE_DIR = ROOT / "live"      # current MFM
# Destination file written by main().
OUT = ROOT / "react-app" / "public" / "data.json"
# NOTE: this runs at import time, so importing the module creates the output directory.
OUT.parent.mkdir(parents=True, exist_ok=True)

# Whole-word "<digits>DP" token, case-insensitive; used by is_detachment_name().
DP_RE = re.compile(r"\b\d+DP\b", re.IGNORECASE)
# Tier labels that is_detachment_tier() matches after upper-casing the tier.
DETACHMENT_TIERS = {"ENHANCEMENTS", "DETACHMENT"}
# Prefixes that is_upgrade_size() tests against the lower-cased, stripped size.
UPGRADE_PREFIXES = ("per ", "+ ")

# Version metadata (oldest → newest)
# Each entry carries the key used in version_data ("version"), a date string,
# a display label, and the directory whose *.json files load_version() reads.
VERSIONS = [
    {"version": "1.14", "date": "2024-12-01", "label": "MFM 1.14", "dir": ROOT / "pdf114"},
    {"version": "2.3", "date": "2025-03-01", "label": "MFM 2.3", "dir": ROOT / "pdf23"},
    {"version": "3.2", "date": "2025-08-20", "label": "MFM 3.2", "dir": PDF32_DIR},
    {"version": "4.3", "date": "2026-06-05", "label": "MFM 4.3", "dir": PDF_DIR},
    {"version": "current", "date": "2026-06-17", "label": "MFM (current)", "dir": LIVE_DIR},
]


def norm_name(s: str) -> str:
    """Return a canonical matching key for a unit name.

    The name is lower-cased, every character other than ``a-z``, ``0-9`` and
    whitespace is dropped, and whitespace runs are collapsed to single spaces
    with no leading or trailing space.

    Args:
        s: Unit name as it appears in a data file; may be empty or None.

    Returns:
        The normalised key, or "" for empty/None input.
    """
    if not s:
        return ""
    s = s.lower()
    # Strip punctuation BEFORE collapsing whitespace: doing it the other way
    # round turns "foo - bar" into "foo  bar" (double space) and "foo -" into
    # "foo " (trailing space), so the same unit could yield different keys.
    s = re.sub(r"[^a-z0-9\s]", "", s)
    return re.sub(r"\s+", " ", s).strip()


def norm_size(s: str) -> str:
    """Return a canonical form of a unit-size label.

    The label is lower-cased, stripped, and its whitespace runs collapsed.
    If it contains ``<digits>`` followed (optionally after whitespace) by
    ``model``, the result is rewritten as ``"N model"`` / ``"N models"``;
    any other label is returned in its cleaned form. Empty/None gives "".
    """
    if not s:
        return ""
    text = re.sub(r"\s+", " ", s.lower().strip())
    found = re.search(r"(\d+)\s*model", text)
    if found is None:
        return text
    count = int(found.group(1))
    suffix = "" if count == 1 else "s"
    return f"{count} model{suffix}"


def is_upgrade_size(size: str) -> bool:
    """Tell whether a size label starts with one of UPGRADE_PREFIXES.

    The label is lower-cased and stripped before the prefix test; an empty
    or None label is treated as the empty string.
    """
    cleaned = "" if not size else size.lower().strip()
    return cleaned.startswith(UPGRADE_PREFIXES)


def is_detachment_name(name: str) -> bool:
    """Tell whether ``name`` contains a token matched by DP_RE.

    An empty or None name is searched as the empty string.
    """
    text = name if name else ""
    return DP_RE.search(text) is not None


def is_detachment_tier(tier) -> bool:
    """Tell whether ``tier``, stringified and upper-cased, is in DETACHMENT_TIERS.

    Any falsy tier (None, "", 0, ...) yields False.
    """
    return bool(tier) and str(tier).upper() in DETACHMENT_TIERS


def model_count(s):
    """Return the first run of digits in ``s`` as an int, or 1 if there is none."""
    digits = re.search(r"(\d+)", s)
    if digits is None:
        return 1
    return int(digits.group(1))


def load_version(ver_info):
    """Load every points row stored for one version.

    Reads each ``*.json`` file in ``ver_info["dir"]`` (files whose stem starts
    with "_" are skipped) and flattens its ``units`` mapping.

    Rows are dropped when:
      * the unit name matches is_detachment_name();
      * the version is "current" and the entry's tier matches
        is_detachment_tier();
      * the size is an upgrade row (is_upgrade_size()) or normalises to "";
      * the entry has no ``pts`` value.

    Args:
        ver_info: One VERSIONS entry; only its "dir" and "version" keys are read.

    Returns:
        Dict keyed by (slug, norm_name, norm_size) -> pts. Empty when the
        directory does not exist. When several rows share a key, the lowest
        pts wins.
    """
    rows = {}
    slug_dir = ver_info["dir"]
    if not slug_dir.exists():
        return rows
    # Detachment-tier rows are filtered only for the live data set.
    skip_detachment_tiers = ver_info["version"] == "current"
    for path in sorted(slug_dir.glob("*.json")):
        if path.stem.startswith("_"):
            continue
        # Context manager closes the handle promptly; explicit UTF-8 keeps
        # parsing independent of the platform's default encoding.
        with open(path, encoding="utf-8") as fh:
            data = json.load(fh)
        slug = data.get("slug", path.stem)
        # Ynnari shares the Aeldari codex — merge into aeldari
        if slug == "ynnari":
            slug = "aeldari"
        for unit, entries in data.get("units", {}).items():
            if is_detachment_name(unit):
                continue
            for e in entries:
                if skip_detachment_tiers and is_detachment_tier(e.get("tier")):
                    continue
                size_disp = e.get("size", "")
                if is_upgrade_size(size_disp):
                    continue
                size = norm_size(size_disp)
                if not size:
                    continue
                pts = e.get("pts")
                if pts is None:
                    continue
                k = (slug, norm_name(unit), size)
                # Keep lowest pts if duplicates
                if k not in rows or pts < rows[k]:
                    rows[k] = pts
    return rows


def _read_json(path):
    """Parse one JSON file as UTF-8, closing the file handle before returning."""
    with open(path, encoding="utf-8") as fh:
        return json.load(fh)


def _load_faction_names():
    """Map each faction slug found in LIVE_DIR to its "name" (slug if absent)."""
    names = {}
    for path in sorted(LIVE_DIR.glob("*.json")):
        if path.stem.startswith("_"):
            continue
        data = _read_json(path)
        slug = data.get("slug", path.stem)
        names[slug] = data.get("name", slug)
    return names


def _load_display_names():
    """Map (slug, normalised unit name) to the unit name as spelled in PDF_DIR.

    Slugs are resolved the same way load_version() resolves them (file stem as
    default, "ynnari" folded into "aeldari") so the keys line up with the row
    keys. The first spelling met, in sorted file order, wins.
    """
    names = {}
    for path in sorted(PDF_DIR.glob("*.json")):
        if path.stem.startswith("_"):
            continue
        data = _read_json(path)
        slug = data.get("slug", path.stem)
        if slug == "ynnari":
            slug = "aeldari"
        for unit in data.get("units", {}):
            names.setdefault((slug, norm_name(unit)), unit)
    return names


def _smallest_pdf_entries(pdf_rows):
    """Index ``pdf_rows`` by unit, keeping the entry with the lowest model count.

    Returns:
        Dict (slug, norm_name) -> (pts, model_count). On a tie the entry met
        first in ``pdf_rows`` iteration order is kept.
    """
    smallest = {}
    for (slug, name_norm, size), pts in pdf_rows.items():
        count = model_count(size)
        current = smallest.get((slug, name_norm))
        if current is None or count < current[1]:
            smallest[(slug, name_norm)] = (pts, count)
    return smallest


def _build_history(key, version_data):
    """Return the {date, version, pts} points recorded for ``key``, in VERSIONS order."""
    history = []
    for ver in VERSIONS:
        pts_map = version_data[ver["version"]]
        if key in pts_map:
            history.append({
                "date": ver["date"],
                "version": ver["label"],
                "pts": pts_map[key],
            })
    return history


def main():
    """Build the de-duplicated payload and write it to OUT.

    Steps:
      1. Load every version in VERSIONS via load_version().
      2. Group the "current" rows by (slug, normalised name); rows that exist
         only in "4.3" are added with ``new=None``.
      3. Per unit, keep the sizes that have a live value (or a single 4.3 size
         when the unit has none), fill ``original`` from the exact 4.3 row or
         by scaling the smallest 4.3 entry by model count, and attach history.
      4. Write the JSON payload and print summary statistics.
    """
    # Load each version
    version_data = {}
    for ver in VERSIONS:
        rows = load_version(ver)
        version_data[ver["version"]] = rows
        print(f"{ver['label']}: {len(rows)} size-rows loaded")

    # Use "current" (live) as the primary set of units/sizes
    # and "4.3" as the source of "original" (old codex) values
    live_rows = version_data.get("current", {})
    pdf43_rows = version_data.get("4.3", {})

    # Read the lookup tables once, instead of re-parsing files per unit.
    faction_names = _load_faction_names()
    display_names = _load_display_names()
    base_originals = _smallest_pdf_entries(pdf43_rows)

    # Group live rows by (slug, norm_name)
    groups = defaultdict(list)
    for (slug, name_norm, size), pts in live_rows.items():
        groups[(slug, name_norm)].append({"size": size, "new": pts})

    # Also include PDF-only units (removed from MFM)
    for key, pts in pdf43_rows.items():
        if key not in live_rows:
            slug, name_norm, size = key
            groups[(slug, name_norm)].append({"size": size, "new": None, "original": pts})

    out_units = []
    for (slug, name_norm), grp in groups.items():
        # Sort by numeric size
        grp.sort(key=lambda r: model_count(r["size"]))

        # Deduplicate sizes (keep first occurrence)
        seen_sizes = set()
        unique = []
        for r in grp:
            if r["size"] not in seen_sizes:
                seen_sizes.add(r["size"])
                unique.append(r)
        grp = unique

        # Only keep sizes that the MFM (new/live) actually listed;
        # a removed unit keeps one PDF entry so it still appears.
        mfm_sizes = [r for r in grp if r["new"] is not None] or [grp[0]]

        # Base original: smallest size this unit has in the 4.3 PDF, if any.
        base_orig, base_count = base_originals.get((slug, name_norm), (None, None))

        # Fill missing originals on MFM sizes by scaling from base original
        for r in mfm_sizes:
            if r.get("original") is not None:
                continue
            # Try exact match in 4.3 PDF first
            key = (slug, name_norm, r["size"])
            if key in pdf43_rows:
                r["original"] = pdf43_rows[key]
            elif base_orig is not None:
                cnt = model_count(r["size"])
                # Guard against a "0 models" label on either side.
                if base_count > 0 and cnt > 0:
                    r["original"] = round(base_orig * cnt / base_count)

        # Build sizes[] array with history
        sizes = []
        for r in mfm_sizes:
            o, n = r.get("original"), r["new"]
            have_both = o is not None and n is not None
            change_pct = round((n - o) / o * 100, 2) if (have_both and o > 0) else None
            change_pts = (n - o) if have_both else None
            sizes.append({
                "size": r["size"],
                "original": o,
                "new": n,
                "tier": None,
                "change_pct": change_pct,
                "change_pts": change_pts,
                "history": _build_history((slug, name_norm, r["size"]), version_data),
            })

        # default_size = smallest
        default = sizes[0]

        # Display name: PDF spelling when known, else title-cased key.
        display_name = display_names.get((slug, name_norm), name_norm.title())

        out_units.append({
            "faction": slug,
            "faction_name": faction_names.get(slug, slug),
            "name": display_name,
            "size": default["size"],
            "original": default["original"],
            "new": default["new"],
            "tier": default.get("tier"),
            "change_pct": default["change_pct"],
            "change_pts": default["change_pts"],
            "sizes": sizes,
            "default_size": default["size"],
        })

    # Stats
    has_both = sum(1 for u in out_units if u["original"] is not None and u["new"] is not None)
    only_pdf = sum(1 for u in out_units if u["original"] is not None and u["new"] is None)
    only_live = sum(1 for u in out_units if u["original"] is None and u["new"] is not None)
    pct_changes = [u["change_pct"] for u in out_units if u["change_pct"] is not None]
    pct_changes_sorted = sorted(pct_changes)
    multi_size = sum(1 for u in out_units if len(u["sizes"]) > 1)
    units_with_history = sum(1 for u in out_units if any(len(s.get("history", [])) > 1 for s in u["sizes"]))

    payload = {
        "generated_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "versions": [{"date": v["date"], "label": v["label"]} for v in VERSIONS],
        "factions": sorted({u["faction"] for u in out_units}),
        "faction_names": {u["faction"]: u["faction_name"] for u in out_units},
        "stats": {
            "total_rows": len(out_units),
            "rows_with_both": has_both,
            "rows_pdf_only": only_pdf,
            "rows_live_only": only_live,
            "biggest_drop_pct": pct_changes_sorted[0] if pct_changes_sorted else None,
            "biggest_rise_pct": pct_changes_sorted[-1] if pct_changes_sorted else None,
            "multi_size": multi_size,
            "units_with_history": units_with_history,
        },
        "units": out_units,
    }

    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding
    # must be pinned to UTF-8 rather than left to the locale default.
    OUT.write_text(json.dumps(payload, ensure_ascii=False), encoding="utf-8")
    print(f"\nWrote {OUT}")
    print(f"  total rows:        {len(out_units)}")
    print(f"  with both:         {has_both}")
    print(f"  PDF only:          {only_pdf}")
    print(f"  LIVE only:         {only_live}")
    print(f"  multi-size:        {multi_size}")
    print(f"  with history:      {units_with_history}")
    if pct_changes:
        print(f"  biggest drop:      {pct_changes_sorted[0]:.2f}%")
        print(f"  biggest rise:      {pct_changes_sorted[-1]:.2f}%")
    print(f"  size:              {OUT.stat().st_size / 1024:.1f} KB")


# Run the build only when executed as a script, not when imported.
if __name__ == "__main__":
    main()