Files
wh40k-points-comparator/join_and_export.py
root 38bffa491c Initial commit: WH40K Points Comparator
- React + MUI DataGrid app with faction filter, search, change filter
- Biggest movers cards (drops/rises) scoped to current filter view
- Historical points graph modal (5 MFM versions: 1.14 → current)
- URL state sync (faction, dir, q params — shareable URLs)
- Grimdark favicon + OG embed image (Google Imagen)
- Multi-stage Dockerfile (node build → nginx serve)
- docker-compose.yml with Traefik + Cloudflare TLS
- Data pipeline: build_deduped_data.py merges PDF + live scrape
- Ynnari merged into Aeldari (shared codex)
- Mobile responsive: flex columns, no fixed pixel widths
- Color semantics: green=cheaper, red=costlier (consistent everywhere)
- 1,449 units across 31 factions
2026-06-18 02:42:29 +00:00

287 lines
10 KiB
Python

"""Join PDF (original) vs live MFM (new) and emit per-faction CSVs + combined.
Output schema (matches the user's reference image):
Faction, Unidad, Coste original, Coste nuevo, % de cambio,
2 o más unidades, 3 o más unidades, 4 o más unidades
Key:
- For each unit present in the live site (source of truth for "new"),
find the matching cost row in the PDF by name + size.
- The "base" cost row is the cheapest / smallest tier (the "YOUR UNIT
COSTS" tier from the PDF, or the "YOUR 1ST TO 2ND UNITS COST" /
"YOUR 1ST UNIT COSTS" tier from the live site).
- The "2 o más" / "3 o más" / "4 o más" columns are populated from any
other cost rows for the same unit (looking up by unit size).
- If a unit is in the live site but not the PDF, original is blank.
- If a unit is in the PDF but not the live site, it's omitted (the
PDF is a snapshot — the live site is current).
"""
import csv
import json
import re
from pathlib import Path
ROOT = Path("/root/wh40k-factions")
PDF = ROOT / "pdf_data.json"
LIVE = ROOT / "live_data.json"
CSV_DIR = ROOT / "csv"
CSV_DIR.mkdir(parents=True, exist_ok=True)
# Chapter supplements inherit unit data from the parent faction's PDF section.
# When the live site lists a generic SM unit (Aggressor Squad, Intercessor Squad,
# etc.) and the chapter's PDF supplement doesn't include it, fall back to the
# parent Space Marines PDF entries for "original".
INHERIT_FROM_PARENT = {
"black-templars": "space-marines",
"blood-angels": "space-marines",
"dark-angels": "space-marines",
"space-wolves": "space-marines",
"deathwatch": "space-marines",
"emperors-children":"chaos-space-marines",
"thousand-sons": "chaos-space-marines",
"world-eaters": "chaos-space-marines",
"death-guard": "chaos-space-marines",
}
# Spanish column headers (matching the image)
HEADERS = [
"Faction",
"Unidad",
"Coste original",
"Coste nuevo",
"% de cambio",
"2 o más unidades",
"3 o más unidades",
"4 o más unidades",
]
def normalize_unit_name(s: str) -> str:
"""Normalize a unit name for cross-source matching.
The PDF has 'Title Case' (e.g. 'Broadside Battlesuits'), the live site
has 'UPPERCASE' (e.g. 'BROADSIDE BATTLESUITS'). Normalize:
- uppercase
- strip leading/trailing whitespace
- collapse multiple spaces
- remove apostrophes and smart quotes
"""
s = s.upper()
s = s.replace("\u2019", "").replace("'", "")
s = re.sub(r"\s+", " ", s).strip()
return s
def normalize_size(s: str) -> str:
"""Normalize a size string like '1 model', '1 models', '10 models'."""
s = s.lower().strip()
s = s.replace("\u2019", "").replace("'", "")
# '1 model' and '1 models' both become '1 model'
if "model" in s:
m = re.match(r"(\d+)\s+models?", s)
if m:
return f"{m.group(1)} model"
return s
# Tiers considered "bulk" cost (more units). Live site tier names.
# (These are looked up by exact label inside the join logic.)
BULK_TIERS = { # noqa: F841 — kept for documentation
"YOUR 3RD + UNIT COSTS",
"YOUR 2ND + UNIT COSTS",
}
def pct_change(orig, new):
if not orig or orig == 0:
return ""
delta = (new - orig) / orig * 100
# Match the image's format: 2 decimals, comma decimal separator
s = f"{abs(delta):.2f}"
return f"{'-' if delta < 0 else ''}{s}".replace(".", ",") + "%"
def fmt_pts(p):
if p is None:
return ""
return str(p)
def _size_n(size_str: str) -> int:
m = re.match(r"(\d+)", size_str)
return int(m.group(1)) if m else 1
def find_tier_cost_at_size(model_rows, tier_label: str, size_n: int):
"""Find the row matching a tier label and a specific model count.
Live site tier labels we care about for the bulk columns:
'YOUR 2ND + UNIT COSTS' -> 2 o más
'YOUR 3RD + UNIT COSTS' -> 3 o más
'YOUR 4TH + UNIT COSTS' -> 4 o más (if it ever exists)
"""
for r in model_rows:
tier = (r.get("tier") or "").upper().strip()
if tier == tier_label:
n = _size_n(r["size"])
if n == size_n:
return r["pts"]
return None
def join_faction(faction_slug: str, pdf: dict, live: dict, all_pdf: dict) -> list[dict]:
"""Build the comparison rows for one faction."""
pdf_units = pdf.get("units", {})
live_units = live.get("units", {})
# Build PDF lookup: (unit_normalized, size_normalized) -> row
pdf_idx = {}
for unit, rows in pdf_units.items():
nu = normalize_unit_name(unit)
for r in rows:
if "model" not in r["size"].lower():
continue
ns = normalize_size(r["size"])
pdf_idx[(nu, ns)] = r
# Inherit parent-faction PDF entries for chapter supplements.
# If a unit isn't in the chapter's PDF, look it up in the parent.
parent = INHERIT_FROM_PARENT.get(faction_slug)
if parent and parent in all_pdf:
for unit, rows in all_pdf[parent].get("units", {}).items():
nu = normalize_unit_name(unit)
for r in rows:
if "model" not in r["size"].lower():
continue
ns = normalize_size(r["size"])
# don't overwrite chapter-specific entries
if (nu, ns) not in pdf_idx:
pdf_idx[(nu, ns)] = r
rows = []
# Walk live units (source of truth for "new")
for unit, unit_rows in live_units.items():
nu = normalize_unit_name(unit)
# skip wargear-only units
model_rows = [r for r in unit_rows if "model" in r["size"].lower()]
if not model_rows:
continue
# Pick the base tier: prefer YOUR UNIT COSTS, else first tier.
base_tier = None
for r in model_rows:
tier = (r.get("tier") or "").upper().strip()
if tier == "YOUR UNIT COSTS":
base_tier = tier
break
if base_tier is None:
base_tier = (model_rows[0].get("tier") or "").upper().strip()
# Find all distinct sizes in the base tier -> one row per size
base_size_rows = [r for r in model_rows
if (r.get("tier") or "").upper().strip() == base_tier]
if not base_size_rows:
# fallback: any rows
base_size_rows = model_rows
# Distinct sizes in display order (smallest first)
seen_sizes = set()
base_sizes = []
for r in base_size_rows:
ns_r = normalize_size(r["size"])
n_r = _size_n(r["size"])
key = (n_r, ns_r)
if key in seen_sizes:
continue
seen_sizes.add(key)
base_sizes.append((ns_r, n_r))
if not base_sizes:
continue
for ns, base_n in base_sizes:
# Find the live row matching this size+base tier
live_row = next((r for r in base_size_rows
if normalize_size(r["size"]) == ns), None)
if not live_row:
continue
new_pts = live_row["pts"]
# Find original in PDF
pdf_row = pdf_idx.get((nu, ns))
orig_pts = pdf_row["pts"] if pdf_row else None
# Bulk columns: cost of THIS size in 2ND+ / 3RD+ / 4TH+ tiers
bulk_2 = find_tier_cost_at_size(model_rows, "YOUR 2ND + UNIT COSTS", base_n)
bulk_3 = find_tier_cost_at_size(model_rows, "YOUR 3RD + UNIT COSTS", base_n)
bulk_4 = find_tier_cost_at_size(model_rows, "YOUR 4TH + UNIT COSTS", base_n)
# Display the unit name with the size prefix if there's only one
# size in the base tier, use the unit name alone; if multiple
# sizes, prefix with the count.
if len(base_sizes) == 1:
display_unit = unit
else:
display_unit = f"{base_n} {unit.title()}"
rows.append({
"Faction": live.get("name", pdf.get("name", faction_slug)),
"Unidad": display_unit,
"Coste original": fmt_pts(orig_pts),
"Coste nuevo": fmt_pts(new_pts),
"% de cambio": pct_change(orig_pts, new_pts) if orig_pts else "",
"2 o más unidades": fmt_pts(bulk_2),
"3 o más unidades": fmt_pts(bulk_3),
"4 o más unidades": fmt_pts(bulk_4),
})
# Sort: by Faction then Unidad
rows.sort(key=lambda r: (r["Faction"], r["Unidad"]))
return rows
def main():
pdf = json.loads(PDF.read_text())
live = json.loads(LIVE.read_text())
if not pdf or not live:
print("ERR: pdf or live data missing — run parse_pdf.py and scrape_live.py first")
return 1
all_rows: list[dict] = []
summary = []
# Build per-faction CSVs
for slug in sorted(set(list(pdf.keys()) + list(live.keys()))):
p = pdf.get(slug, {"name": slug, "units": {}})
l = live.get(slug, {"name": slug, "units": {}})
rows = join_faction(slug, p, l, pdf)
if not rows:
print(f" {slug}: no rows (skipped)")
continue
out_path = CSV_DIR / f"{slug}.csv"
with out_path.open("w", newline="", encoding="utf-8") as f:
w = csv.DictWriter(f, fieldnames=HEADERS)
w.writeheader()
w.writerows(rows)
all_rows.extend(rows)
n_with_orig = sum(1 for r in rows if r["Coste original"])
summary.append((slug, l.get("name", slug), len(rows), n_with_orig))
print(f" {slug:25s} {len(rows):3d} rows ({n_with_orig} with original)")
# Combined
combined = CSV_DIR / "_all_factions.csv"
with combined.open("w", newline="", encoding="utf-8") as f:
w = csv.DictWriter(f, fieldnames=HEADERS)
w.writeheader()
w.writerows(all_rows)
print()
print(f"per-faction CSVs: {CSV_DIR}/*.csv ({len(summary)} files)")
print(f"combined CSV: {combined} ({len(all_rows)} rows total)")
return 0
if __name__ == "__main__":
raise SystemExit(main())