wh40k-points-comparator/join_and_export.py

"""Join PDF (original) vs live MFM (new) and emit per-faction CSVs + combined.

Output schema (matches the user's reference image):
    Faction, Unidad, Coste original, Coste nuevo, % de cambio,
    2 o más unidades, 3 o más unidades, 4 o más unidades

Key:
  - For each unit present in the live site (source of truth for "new"),
    find the matching cost row in the PDF by name + size.
  - The "base" cost row is the cheapest / smallest tier (the "YOUR UNIT
    COSTS" tier from the PDF, or the "YOUR 1ST TO 2ND UNITS COST" /
    "YOUR 1ST UNIT COSTS" tier from the live site).
  - The "2 o más" / "3 o más" / "4 o más" columns are populated from any
    other cost rows for the same unit (looking up by unit size).
  - If a unit is in the live site but not the PDF, original is blank.
  - If a unit is in the PDF but not the live site, it's omitted (the
    PDF is a snapshot — the live site is current).
"""
import csv
import json
import re
from pathlib import Path

ROOT = Path("/root/wh40k-factions")
PDF = ROOT / "pdf_data.json"
LIVE = ROOT / "live_data.json"
CSV_DIR = ROOT / "csv"
CSV_DIR.mkdir(parents=True, exist_ok=True)

# Chapter supplements inherit unit data from the parent faction's PDF section.
# When the live site lists a generic SM unit (Aggressor Squad, Intercessor Squad,
# etc.) and the chapter's PDF supplement doesn't include it, fall back to the
# parent Space Marines PDF entries for "original".
INHERIT_FROM_PARENT = {
    "black-templars":   "space-marines",
    "blood-angels":     "space-marines",
    "dark-angels":      "space-marines",
    "space-wolves":     "space-marines",
    "deathwatch":       "space-marines",
    "emperors-children":"chaos-space-marines",
    "thousand-sons":    "chaos-space-marines",
    "world-eaters":     "chaos-space-marines",
    "death-guard":      "chaos-space-marines",
}

# Spanish column headers (matching the image)
HEADERS = [
    "Faction",
    "Unidad",
    "Coste original",
    "Coste nuevo",
    "% de cambio",
    "2 o más unidades",
    "3 o más unidades",
    "4 o más unidades",
]


def normalize_unit_name(s: str) -> str:
    """Normalize a unit name for cross-source matching.

    The PDF has 'Title Case' (e.g. 'Broadside Battlesuits'), the live site
    has 'UPPERCASE' (e.g. 'BROADSIDE BATTLESUITS'). Normalize:
      - uppercase
      - strip leading/trailing whitespace
      - collapse multiple spaces
      - remove apostrophes and smart quotes
    """
    s = s.upper()
    s = s.replace("\u2019", "").replace("'", "")
    s = re.sub(r"\s+", " ", s).strip()
    return s


def normalize_size(s: str) -> str:
    """Normalize a size string like '1 model', '1 models', '10 models'."""
    s = s.lower().strip()
    s = s.replace("\u2019", "").replace("'", "")
    # '1 model' and '1 models' both become '1 model'
    if "model" in s:
        m = re.match(r"(\d+)\s+models?", s)
        if m:
            return f"{m.group(1)} model"
    return s


# Tiers considered "bulk" cost (more units). Live site tier names.
# (These are looked up by exact label inside the join logic.)
BULK_TIERS = {  # noqa: F841 — kept for documentation
    "YOUR 3RD + UNIT COSTS",
    "YOUR 2ND + UNIT COSTS",
}


def pct_change(orig, new):
    if not orig or orig == 0:
        return ""
    delta = (new - orig) / orig * 100
    # Match the image's format: 2 decimals, comma decimal separator
    s = f"{abs(delta):.2f}"
    return f"{'-' if delta < 0 else ''}{s}".replace(".", ",") + "%"


def fmt_pts(p):
    if p is None:
        return ""
    return str(p)


def _size_n(size_str: str) -> int:
    m = re.match(r"(\d+)", size_str)
    return int(m.group(1)) if m else 1


def find_tier_cost_at_size(model_rows, tier_label: str, size_n: int):
    """Find the row matching a tier label and a specific model count.

    Live site tier labels we care about for the bulk columns:
      'YOUR 2ND + UNIT COSTS' -> 2 o más
      'YOUR 3RD + UNIT COSTS' -> 3 o más
      'YOUR 4TH + UNIT COSTS' -> 4 o más (if it ever exists)
    """
    for r in model_rows:
        tier = (r.get("tier") or "").upper().strip()
        if tier == tier_label:
            n = _size_n(r["size"])
            if n == size_n:
                return r["pts"]
    return None


def join_faction(faction_slug: str, pdf: dict, live: dict, all_pdf: dict) -> list[dict]:
    """Build the comparison rows for one faction."""
    pdf_units = pdf.get("units", {})
    live_units = live.get("units", {})

    # Build PDF lookup: (unit_normalized, size_normalized) -> row
    pdf_idx = {}
    for unit, rows in pdf_units.items():
        nu = normalize_unit_name(unit)
        for r in rows:
            if "model" not in r["size"].lower():
                continue
            ns = normalize_size(r["size"])
            pdf_idx[(nu, ns)] = r

    # Inherit parent-faction PDF entries for chapter supplements.
    # If a unit isn't in the chapter's PDF, look it up in the parent.
    parent = INHERIT_FROM_PARENT.get(faction_slug)
    if parent and parent in all_pdf:
        for unit, rows in all_pdf[parent].get("units", {}).items():
            nu = normalize_unit_name(unit)
            for r in rows:
                if "model" not in r["size"].lower():
                    continue
                ns = normalize_size(r["size"])
                # don't overwrite chapter-specific entries
                if (nu, ns) not in pdf_idx:
                    pdf_idx[(nu, ns)] = r

    rows = []

    # Walk live units (source of truth for "new")
    for unit, unit_rows in live_units.items():
        nu = normalize_unit_name(unit)
        # skip wargear-only units
        model_rows = [r for r in unit_rows if "model" in r["size"].lower()]
        if not model_rows:
            continue

        # Pick the base tier: prefer YOUR UNIT COSTS, else first tier.
        base_tier = None
        for r in model_rows:
            tier = (r.get("tier") or "").upper().strip()
            if tier == "YOUR UNIT COSTS":
                base_tier = tier
                break
        if base_tier is None:
            base_tier = (model_rows[0].get("tier") or "").upper().strip()

        # Find all distinct sizes in the base tier -> one row per size
        base_size_rows = [r for r in model_rows
                          if (r.get("tier") or "").upper().strip() == base_tier]
        if not base_size_rows:
            # fallback: any rows
            base_size_rows = model_rows

        # Distinct sizes in display order (smallest first)
        seen_sizes = set()
        base_sizes = []
        for r in base_size_rows:
            ns_r = normalize_size(r["size"])
            n_r = _size_n(r["size"])
            key = (n_r, ns_r)
            if key in seen_sizes:
                continue
            seen_sizes.add(key)
            base_sizes.append((ns_r, n_r))

        if not base_sizes:
            continue

        for ns, base_n in base_sizes:
            # Find the live row matching this size+base tier
            live_row = next((r for r in base_size_rows
                             if normalize_size(r["size"]) == ns), None)
            if not live_row:
                continue
            new_pts = live_row["pts"]

            # Find original in PDF
            pdf_row = pdf_idx.get((nu, ns))
            orig_pts = pdf_row["pts"] if pdf_row else None

            # Bulk columns: cost of THIS size in 2ND+ / 3RD+ / 4TH+ tiers
            bulk_2 = find_tier_cost_at_size(model_rows, "YOUR 2ND + UNIT COSTS", base_n)
            bulk_3 = find_tier_cost_at_size(model_rows, "YOUR 3RD + UNIT COSTS", base_n)
            bulk_4 = find_tier_cost_at_size(model_rows, "YOUR 4TH + UNIT COSTS", base_n)

            # Display the unit name with the size prefix if there's only one
            # size in the base tier, use the unit name alone; if multiple
            # sizes, prefix with the count.
            if len(base_sizes) == 1:
                display_unit = unit
            else:
                display_unit = f"{base_n} {unit.title()}"

            rows.append({
                "Faction": live.get("name", pdf.get("name", faction_slug)),
                "Unidad": display_unit,
                "Coste original": fmt_pts(orig_pts),
                "Coste nuevo": fmt_pts(new_pts),
                "% de cambio": pct_change(orig_pts, new_pts) if orig_pts else "",
                "2 o más unidades": fmt_pts(bulk_2),
                "3 o más unidades": fmt_pts(bulk_3),
                "4 o más unidades": fmt_pts(bulk_4),
            })

    # Sort: by Faction then Unidad
    rows.sort(key=lambda r: (r["Faction"], r["Unidad"]))
    return rows


def main():
    pdf = json.loads(PDF.read_text())
    live = json.loads(LIVE.read_text())
    if not pdf or not live:
        print("ERR: pdf or live data missing — run parse_pdf.py and scrape_live.py first")
        return 1

    all_rows: list[dict] = []
    summary = []

    # Build per-faction CSVs
    for slug in sorted(set(list(pdf.keys()) + list(live.keys()))):
        p = pdf.get(slug, {"name": slug, "units": {}})
        l = live.get(slug, {"name": slug, "units": {}})
        rows = join_faction(slug, p, l, pdf)
        if not rows:
            print(f"  {slug}: no rows (skipped)")
            continue
        out_path = CSV_DIR / f"{slug}.csv"
        with out_path.open("w", newline="", encoding="utf-8") as f:
            w = csv.DictWriter(f, fieldnames=HEADERS)
            w.writeheader()
            w.writerows(rows)
        all_rows.extend(rows)
        n_with_orig = sum(1 for r in rows if r["Coste original"])
        summary.append((slug, l.get("name", slug), len(rows), n_with_orig))
        print(f"  {slug:25s}  {len(rows):3d} rows  ({n_with_orig} with original)")

    # Combined
    combined = CSV_DIR / "_all_factions.csv"
    with combined.open("w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=HEADERS)
        w.writeheader()
        w.writerows(all_rows)

    print()
    print(f"per-faction CSVs: {CSV_DIR}/*.csv  ({len(summary)} files)")
    print(f"combined CSV:     {combined}  ({len(all_rows)} rows total)")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())