wh40k-points-comparator/build_site_data.py

#!/usr/bin/env python3
"""
Merge per-faction PDF (original) + LIVE (new) data into a single
client-loadable manifest. Skip detachment entries (unit names containing a
<digits>DP token such as 2DP, or rows whose tier is ENHANCEMENTS or
DETACHMENT). Compute the points and % change per unit/size. Write to:
  /root/wh40k-factions/site/data.json

Schema:
  {
    "generated_at": "...",
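    "factions": ["adepta-sororitas", ...],   # ordered list
    "faction_names": {"astra-militarum": "Astra Militarum", ...},
    "stats": {
      "total_rows": 0, "rows_with_both": 0,
      "rows_pdf_only": 0, "rows_live_only": 0,
      "biggest_drop_pct": -10.53,   # may be null
      "biggest_rise_pct": 12.5      # may be null
    },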
    "units": [
      {
        "faction": "astra-militarum",
        "faction_name": "Astra Militarum",
        "name": "Valkyrie",
        "size": "1 model",
        "original": 190,         # may be null if unit not in PDF
        "new": 170,              # may be null if unit not in LIVE
        "tier": "YOUR 1ST TO 2ND UNITS COST",  # may be null
        "change_pct": -10.53,    # may be null
        "change_pts": -20        # may be null
      },
      ...
    ]
  }
"""
import json
import re
import time
from pathlib import Path

# Filesystem layout: every path below is anchored at one fixed, absolute root.
ROOT = Path("/root/wh40k-factions")
# Per-faction "<slug>.json" files holding the original (PDF) points.
PDF_DIR = ROOT / "pdf"
# Per-faction "<slug>.json" files holding the new (LIVE) points.
LIVE_DIR = ROOT / "live"
# Merged manifest written by main().
OUT = ROOT / "site" / "data.json"
# NOTE: this runs at import time, so merely importing the module creates the
# output directory (and any missing parents).
OUT.parent.mkdir(parents=True, exist_ok=True)

# Detachments: skip any unit name containing a "<digits>DP" token (matched
# case-insensitively on word boundaries), and skip rows whose tier,
# upper-cased, is one of DETACHMENT_TIERS.
DP_RE = re.compile(r"\b\d+DP\b", re.IGNORECASE)
DETACHMENT_TIERS = {"ENHANCEMENTS", "DETACHMENT"}


def norm_name(s: str) -> str:
    """Normalize a unit name for cross-source matching.

    PDF has Title Case, LIVE has UPPERCASE — fold them together. The result
    is lower-case, contains only ``a-z``, ``0-9`` and single spaces, and has
    no leading or trailing whitespace.

    Args:
        s: Raw unit name; ``None`` or an empty string is accepted.

    Returns:
        The normalized key, or ``""`` when *s* is empty or ``None``.
    """
    if not s:
        return ""
    s = s.lower()
    # Drop punctuation BEFORE collapsing whitespace. Doing it the other way
    # round leaves a double space where a free-standing symbol used to be
    # ("foo - bar" -> "foo  bar") and a dangling space after a trailing
    # marker ("valkyrie *" -> "valkyrie "), so the key would not match the
    # same name spelled without the symbol in the other source.
    s = re.sub(r"[^a-z0-9\s]", "", s)
    return re.sub(r"\s+", " ", s).strip()


def norm_size(s: str) -> str:
    """Normalize a unit-size label for cross-source matching.

    The label is lower-cased, trimmed and has whitespace runs collapsed.
    If it contains "<digits> model", it is rewritten to the canonical
    "<n> model" / "<n> models" form (singular only for exactly 1);
    otherwise the cleaned label is returned unchanged. Empty or ``None``
    input yields ``""``.
    """
    if not s:
        return ""
    cleaned = re.sub(r"\s+", " ", s.lower().strip())
    count_match = re.search(r"(\d+)\s*model", cleaned)
    if count_match is None:
        # Not a model count; keep the cleaned free-text label.
        return cleaned
    count = int(count_match.group(1))
    plural = "" if count == 1 else "s"
    return f"{count} model{plural}"


def is_detachment_name(name: str) -> bool:
    """Return True when *name* contains a "<digits>DP" token (see DP_RE).

    Empty or ``None`` names are never detachments.
    """
    if not name:
        return False
    return DP_RE.search(name) is not None


def is_detachment_tier(tier) -> bool:
    """Return True when *tier*, upper-cased, is one of DETACHMENT_TIERS.

    A missing or empty tier is never a detachment tier.
    """
    label = str(tier).upper() if tier else None
    return label in DETACHMENT_TIERS


def _read_json(path):
    """Parse the JSON document at *path*, decoding it as UTF-8."""
    # The context manager closes the handle promptly, and the explicit
    # encoding keeps parsing independent of the machine's locale.
    with open(path, encoding="utf-8") as fh:
        return json.load(fh)


def _merge_source(rows, slugs, src_dir, field, track_tier):
    """Fold every faction file found in *src_dir* into *rows*, in place.

    Args:
        rows: Dict keyed by ``(slug, normalized name, normalized size)``.
        slugs: Faction slugs to look for; a slug with no file is skipped.
        src_dir: Directory containing ``<slug>.json`` files.
        field: Record key that receives the points ("original" or "new").
        track_tier: When true, also record the entry's tier label.
    """
    for slug in slugs:
        path = src_dir / f"{slug}.json"
        if not path.exists():
            continue
        data = _read_json(path)
        faction_name = data.get("name", slug)
        for unit, entries in data.get("units", {}).items():
            if is_detachment_name(unit):
                continue
            for e in entries:
                tier = e.get("tier")
                # Detachment-tier rows are dropped for every source, so a
                # tagged row cannot slip in through the PDF side only.
                if is_detachment_tier(tier):
                    continue
                size = norm_size(e.get("size"))
                if not size:
                    continue
                k = (slug, norm_name(unit), size)
                # The first source/entry to mention a key fixes its display
                # name, display size and faction name.
                rec = rows.setdefault(k, {
                    "faction": slug,
                    "faction_name": faction_name,
                    "name": unit,
                    "size": e.get("size", size),  # keep display form
                    "original": None,
                    "new": None,
                    "tier": None,
                })
                # Several rows may share (unit, size) — keep the lowest as
                # the "base" cost.
                pts = e.get("pts")
                is_cheapest = pts is not None and (
                    rec[field] is None or pts < rec[field]
                )
                if is_cheapest:
                    rec[field] = pts
                # The tier label follows the cheapest entry; until a priced
                # entry carries one, the first non-empty tier is a fallback.
                if track_tier and tier and (is_cheapest or not rec["tier"]):
                    rec["tier"] = tier


def main():
    """Build the merged points manifest and write it to OUT.

    Steps:
      1. Discover faction slugs from the ``*.json`` files in PDF_DIR and
         LIVE_DIR (files whose stem starts with "_" are ignored).
      2. Merge PDF points into "original" and LIVE points into "new", one
         record per (faction, normalized unit name, normalized size).
      3. Compute ``change_pts`` / ``change_pct`` where both values exist and
         the original is positive; otherwise both are ``None``.
      4. Write the payload as UTF-8 JSON and print a short summary.
    """
    # Discover factions present in either dir
    pdf_slugs = {p.stem for p in PDF_DIR.glob("*.json")
                 if not p.stem.startswith("_")}
    live_slugs = {p.stem for p in LIVE_DIR.glob("*.json")
                  if not p.stem.startswith("_")}
    slugs = sorted(pdf_slugs | live_slugs)
    print(f"PDF factions:  {len(pdf_slugs)}")
    print(f"LIVE factions: {len(live_slugs)}")
    print(f"Total slugs:   {len(slugs)}")

    # { (slug, name_norm, size_norm): record }. PDF is merged first so its
    # display strings win whenever both sources know the unit.
    rows = {}
    _merge_source(rows, slugs, PDF_DIR, "original", track_tier=False)
    _merge_source(rows, slugs, LIVE_DIR, "new", track_tier=True)

    # Compute change
    out_units = list(rows.values())
    for r in out_units:
        o, n = r["original"], r["new"]
        if o is not None and n is not None and o > 0:
            r["change_pct"] = round((n - o) / o * 100, 2)
            r["change_pts"] = n - o
        else:
            # A percentage is undefined without both values and a positive base.
            r["change_pct"] = None
            r["change_pts"] = None

    # Stats
    has_both = sum(1 for u in out_units
                   if u["original"] is not None and u["new"] is not None)
    only_pdf = sum(1 for u in out_units
                   if u["original"] is not None and u["new"] is None)
    only_live = sum(1 for u in out_units
                    if u["original"] is None and u["new"] is not None)
    pct_changes = [u["change_pct"] for u in out_units
                   if u["change_pct"] is not None]
    biggest_drop = min(pct_changes, default=None)
    biggest_rise = max(pct_changes, default=None)

    payload = {
        "generated_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "factions": sorted({u["faction"] for u in out_units}),
        "faction_names": {u["faction"]: u["faction_name"] for u in out_units},
        "stats": {
            "total_rows": len(out_units),
            "rows_with_both": has_both,
            "rows_pdf_only": only_pdf,
            "rows_live_only": only_live,
            "biggest_drop_pct": biggest_drop,
            "biggest_rise_pct": biggest_rise,
        },
        "units": out_units,
    }

    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # written as UTF-8 explicitly rather than in the locale's encoding.
    OUT.write_text(json.dumps(payload, ensure_ascii=False), encoding="utf-8")
    print(f"\nWrote {OUT}")
    print(f"  total rows:    {len(out_units)}")
    print(f"  with both:     {has_both}")
    print(f"  PDF only:      {only_pdf}")
    print(f"  LIVE only:     {only_live}")
    if pct_changes:
        print(f"  biggest drop:  {biggest_drop:.2f}%")
        print(f"  biggest rise:  {biggest_rise:.2f}%")
    print(f"  size:          {OUT.stat().st_size / 1024:.1f} KB")


if __name__ == "__main__":
    main()