- React + MUI DataGrid app with faction filter, search, change filter - Biggest movers cards (drops/rises) scoped to current filter view - Historical points graph modal (5 MFM versions: 1.14 → current) - URL state sync (faction, dir, q params — shareable URLs) - Grimdark favicon + OG embed image (Google Imagen) - Multi-stage Dockerfile (node build → nginx serve) - docker-compose.yml with Traefik + Cloudflare TLS - Data pipeline: build_deduped_data.py merges PDF + live scrape - Ynnari merged into Aeldari (shared codex) - Mobile responsive: flex columns, no fixed pixel widths - Color semantics: green=cheaper, red=costlier (consistent everywhere) - 1,449 units across 31 factions
287 lines
10 KiB
Python
287 lines
10 KiB
Python
"""Join PDF (original) vs live MFM (new) and emit per-faction CSVs + combined.
|
|
|
|
Output schema (matches the user's reference image):
|
|
Faction, Unidad, Coste original, Coste nuevo, % de cambio,
|
|
2 o más unidades, 3 o más unidades, 4 o más unidades
|
|
|
|
Key:
|
|
- For each unit present in the live site (source of truth for "new"),
|
|
find the matching cost row in the PDF by name + size.
|
|
- The "base" cost row is the cheapest / smallest tier (the "YOUR UNIT
|
|
COSTS" tier from the PDF, or the "YOUR 1ST TO 2ND UNITS COST" /
|
|
"YOUR 1ST UNIT COSTS" tier from the live site).
|
|
- The "2 o más" / "3 o más" / "4 o más" columns are populated from any
|
|
other cost rows for the same unit (looking up by unit size).
|
|
- If a unit is in the live site but not the PDF, original is blank.
|
|
- If a unit is in the PDF but not the live site, it's omitted (the
|
|
PDF is a snapshot — the live site is current).
|
|
"""
|
|
import csv
|
|
import json
|
|
import re
|
|
from pathlib import Path
|
|
|
|
ROOT = Path("/root/wh40k-factions")
|
|
PDF = ROOT / "pdf_data.json"
|
|
LIVE = ROOT / "live_data.json"
|
|
CSV_DIR = ROOT / "csv"
|
|
CSV_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
|
# Chapter supplements inherit unit data from the parent faction's PDF section.
|
|
# When the live site lists a generic SM unit (Aggressor Squad, Intercessor Squad,
|
|
# etc.) and the chapter's PDF supplement doesn't include it, fall back to the
|
|
# parent Space Marines PDF entries for "original".
|
|
INHERIT_FROM_PARENT = {
|
|
"black-templars": "space-marines",
|
|
"blood-angels": "space-marines",
|
|
"dark-angels": "space-marines",
|
|
"space-wolves": "space-marines",
|
|
"deathwatch": "space-marines",
|
|
"emperors-children":"chaos-space-marines",
|
|
"thousand-sons": "chaos-space-marines",
|
|
"world-eaters": "chaos-space-marines",
|
|
"death-guard": "chaos-space-marines",
|
|
}
|
|
|
|
# Spanish column headers (matching the image)
|
|
HEADERS = [
|
|
"Faction",
|
|
"Unidad",
|
|
"Coste original",
|
|
"Coste nuevo",
|
|
"% de cambio",
|
|
"2 o más unidades",
|
|
"3 o más unidades",
|
|
"4 o más unidades",
|
|
]
|
|
|
|
|
|
def normalize_unit_name(s: str) -> str:
|
|
"""Normalize a unit name for cross-source matching.
|
|
|
|
The PDF has 'Title Case' (e.g. 'Broadside Battlesuits'), the live site
|
|
has 'UPPERCASE' (e.g. 'BROADSIDE BATTLESUITS'). Normalize:
|
|
- uppercase
|
|
- strip leading/trailing whitespace
|
|
- collapse multiple spaces
|
|
- remove apostrophes and smart quotes
|
|
"""
|
|
s = s.upper()
|
|
s = s.replace("\u2019", "").replace("'", "")
|
|
s = re.sub(r"\s+", " ", s).strip()
|
|
return s
|
|
|
|
|
|
def normalize_size(s: str) -> str:
|
|
"""Normalize a size string like '1 model', '1 models', '10 models'."""
|
|
s = s.lower().strip()
|
|
s = s.replace("\u2019", "").replace("'", "")
|
|
# '1 model' and '1 models' both become '1 model'
|
|
if "model" in s:
|
|
m = re.match(r"(\d+)\s+models?", s)
|
|
if m:
|
|
return f"{m.group(1)} model"
|
|
return s
|
|
|
|
|
|
# Tiers considered "bulk" cost (more units). Live site tier names.
|
|
# (These are looked up by exact label inside the join logic.)
|
|
BULK_TIERS = { # noqa: F841 — kept for documentation
|
|
"YOUR 3RD + UNIT COSTS",
|
|
"YOUR 2ND + UNIT COSTS",
|
|
}
|
|
|
|
|
|
def pct_change(orig, new):
|
|
if not orig or orig == 0:
|
|
return ""
|
|
delta = (new - orig) / orig * 100
|
|
# Match the image's format: 2 decimals, comma decimal separator
|
|
s = f"{abs(delta):.2f}"
|
|
return f"{'-' if delta < 0 else ''}{s}".replace(".", ",") + "%"
|
|
|
|
|
|
def fmt_pts(p):
|
|
if p is None:
|
|
return ""
|
|
return str(p)
|
|
|
|
|
|
def _size_n(size_str: str) -> int:
|
|
m = re.match(r"(\d+)", size_str)
|
|
return int(m.group(1)) if m else 1
|
|
|
|
|
|
def find_tier_cost_at_size(model_rows, tier_label: str, size_n: int):
|
|
"""Find the row matching a tier label and a specific model count.
|
|
|
|
Live site tier labels we care about for the bulk columns:
|
|
'YOUR 2ND + UNIT COSTS' -> 2 o más
|
|
'YOUR 3RD + UNIT COSTS' -> 3 o más
|
|
'YOUR 4TH + UNIT COSTS' -> 4 o más (if it ever exists)
|
|
"""
|
|
for r in model_rows:
|
|
tier = (r.get("tier") or "").upper().strip()
|
|
if tier == tier_label:
|
|
n = _size_n(r["size"])
|
|
if n == size_n:
|
|
return r["pts"]
|
|
return None
|
|
|
|
|
|
def join_faction(faction_slug: str, pdf: dict, live: dict, all_pdf: dict) -> list[dict]:
|
|
"""Build the comparison rows for one faction."""
|
|
pdf_units = pdf.get("units", {})
|
|
live_units = live.get("units", {})
|
|
|
|
# Build PDF lookup: (unit_normalized, size_normalized) -> row
|
|
pdf_idx = {}
|
|
for unit, rows in pdf_units.items():
|
|
nu = normalize_unit_name(unit)
|
|
for r in rows:
|
|
if "model" not in r["size"].lower():
|
|
continue
|
|
ns = normalize_size(r["size"])
|
|
pdf_idx[(nu, ns)] = r
|
|
|
|
# Inherit parent-faction PDF entries for chapter supplements.
|
|
# If a unit isn't in the chapter's PDF, look it up in the parent.
|
|
parent = INHERIT_FROM_PARENT.get(faction_slug)
|
|
if parent and parent in all_pdf:
|
|
for unit, rows in all_pdf[parent].get("units", {}).items():
|
|
nu = normalize_unit_name(unit)
|
|
for r in rows:
|
|
if "model" not in r["size"].lower():
|
|
continue
|
|
ns = normalize_size(r["size"])
|
|
# don't overwrite chapter-specific entries
|
|
if (nu, ns) not in pdf_idx:
|
|
pdf_idx[(nu, ns)] = r
|
|
|
|
rows = []
|
|
|
|
# Walk live units (source of truth for "new")
|
|
for unit, unit_rows in live_units.items():
|
|
nu = normalize_unit_name(unit)
|
|
# skip wargear-only units
|
|
model_rows = [r for r in unit_rows if "model" in r["size"].lower()]
|
|
if not model_rows:
|
|
continue
|
|
|
|
# Pick the base tier: prefer YOUR UNIT COSTS, else first tier.
|
|
base_tier = None
|
|
for r in model_rows:
|
|
tier = (r.get("tier") or "").upper().strip()
|
|
if tier == "YOUR UNIT COSTS":
|
|
base_tier = tier
|
|
break
|
|
if base_tier is None:
|
|
base_tier = (model_rows[0].get("tier") or "").upper().strip()
|
|
|
|
# Find all distinct sizes in the base tier -> one row per size
|
|
base_size_rows = [r for r in model_rows
|
|
if (r.get("tier") or "").upper().strip() == base_tier]
|
|
if not base_size_rows:
|
|
# fallback: any rows
|
|
base_size_rows = model_rows
|
|
|
|
# Distinct sizes in display order (smallest first)
|
|
seen_sizes = set()
|
|
base_sizes = []
|
|
for r in base_size_rows:
|
|
ns_r = normalize_size(r["size"])
|
|
n_r = _size_n(r["size"])
|
|
key = (n_r, ns_r)
|
|
if key in seen_sizes:
|
|
continue
|
|
seen_sizes.add(key)
|
|
base_sizes.append((ns_r, n_r))
|
|
|
|
if not base_sizes:
|
|
continue
|
|
|
|
for ns, base_n in base_sizes:
|
|
# Find the live row matching this size+base tier
|
|
live_row = next((r for r in base_size_rows
|
|
if normalize_size(r["size"]) == ns), None)
|
|
if not live_row:
|
|
continue
|
|
new_pts = live_row["pts"]
|
|
|
|
# Find original in PDF
|
|
pdf_row = pdf_idx.get((nu, ns))
|
|
orig_pts = pdf_row["pts"] if pdf_row else None
|
|
|
|
# Bulk columns: cost of THIS size in 2ND+ / 3RD+ / 4TH+ tiers
|
|
bulk_2 = find_tier_cost_at_size(model_rows, "YOUR 2ND + UNIT COSTS", base_n)
|
|
bulk_3 = find_tier_cost_at_size(model_rows, "YOUR 3RD + UNIT COSTS", base_n)
|
|
bulk_4 = find_tier_cost_at_size(model_rows, "YOUR 4TH + UNIT COSTS", base_n)
|
|
|
|
# Display the unit name with the size prefix if there's only one
|
|
# size in the base tier, use the unit name alone; if multiple
|
|
# sizes, prefix with the count.
|
|
if len(base_sizes) == 1:
|
|
display_unit = unit
|
|
else:
|
|
display_unit = f"{base_n} {unit.title()}"
|
|
|
|
rows.append({
|
|
"Faction": live.get("name", pdf.get("name", faction_slug)),
|
|
"Unidad": display_unit,
|
|
"Coste original": fmt_pts(orig_pts),
|
|
"Coste nuevo": fmt_pts(new_pts),
|
|
"% de cambio": pct_change(orig_pts, new_pts) if orig_pts else "",
|
|
"2 o más unidades": fmt_pts(bulk_2),
|
|
"3 o más unidades": fmt_pts(bulk_3),
|
|
"4 o más unidades": fmt_pts(bulk_4),
|
|
})
|
|
|
|
# Sort: by Faction then Unidad
|
|
rows.sort(key=lambda r: (r["Faction"], r["Unidad"]))
|
|
return rows
|
|
|
|
|
|
def main():
|
|
pdf = json.loads(PDF.read_text())
|
|
live = json.loads(LIVE.read_text())
|
|
if not pdf or not live:
|
|
print("ERR: pdf or live data missing — run parse_pdf.py and scrape_live.py first")
|
|
return 1
|
|
|
|
all_rows: list[dict] = []
|
|
summary = []
|
|
|
|
# Build per-faction CSVs
|
|
for slug in sorted(set(list(pdf.keys()) + list(live.keys()))):
|
|
p = pdf.get(slug, {"name": slug, "units": {}})
|
|
l = live.get(slug, {"name": slug, "units": {}})
|
|
rows = join_faction(slug, p, l, pdf)
|
|
if not rows:
|
|
print(f" {slug}: no rows (skipped)")
|
|
continue
|
|
out_path = CSV_DIR / f"{slug}.csv"
|
|
with out_path.open("w", newline="", encoding="utf-8") as f:
|
|
w = csv.DictWriter(f, fieldnames=HEADERS)
|
|
w.writeheader()
|
|
w.writerows(rows)
|
|
all_rows.extend(rows)
|
|
n_with_orig = sum(1 for r in rows if r["Coste original"])
|
|
summary.append((slug, l.get("name", slug), len(rows), n_with_orig))
|
|
print(f" {slug:25s} {len(rows):3d} rows ({n_with_orig} with original)")
|
|
|
|
# Combined
|
|
combined = CSV_DIR / "_all_factions.csv"
|
|
with combined.open("w", newline="", encoding="utf-8") as f:
|
|
w = csv.DictWriter(f, fieldnames=HEADERS)
|
|
w.writeheader()
|
|
w.writerows(all_rows)
|
|
|
|
print()
|
|
print(f"per-faction CSVs: {CSV_DIR}/*.csv ({len(summary)} files)")
|
|
print(f"combined CSV: {combined} ({len(all_rows)} rows total)")
|
|
return 0
|
|
|
|
|
|
if __name__ == "__main__":
|
|
raise SystemExit(main())
|