- React + MUI DataGrid app with faction filter, search, change filter - Biggest movers cards (drops/rises) scoped to current filter view - Historical points graph modal (5 MFM versions: 1.14 → current) - URL state sync (faction, dir, q params — shareable URLs) - Grimdark favicon + OG embed image (Google Imagen) - Multi-stage Dockerfile (node build → nginx serve) - docker-compose.yml with Traefik + Cloudflare TLS - Data pipeline: build_deduped_data.py merges PDF + live scrape - Ynnari merged into Aeldari (shared codex) - Mobile responsive: flex columns, no fixed pixel widths - Color semantics: green=cheaper, red=costlier (consistent everywhere) - 1,449 units across 31 factions
115 lines
4.2 KiB
Python
115 lines
4.2 KiB
Python
#!/usr/bin/env python3
|
|
"""
Re-scrape live MFM data for all 30 factions, one JSON file per faction.

Output:
  /root/wh40k-factions/live/<slug>.json (one per faction)
  /root/wh40k-factions/live/_manifest.json (index of all factions + counts)
"""
|
|
import json, sys, time
|
|
from pathlib import Path
|
|
from playwright.sync_api import sync_playwright
|
|
|
|
# Put this file's own directory first on sys.path so the import below
# resolves (presumably scrape_live.py sits next to this script -- confirm).
sys.path.insert(0, str(Path(__file__).parent))
from scrape_live import FACTIONS, EXTRACT_JS

# Directory that receives one <slug>.json per faction plus _manifest.json.
OUT_DIR = Path("/root/wh40k-factions/live")
# NOTE: runs at import time; creates missing parents and is a no-op when the
# directory already exists.
OUT_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
def fetch_one(context, slug: str, name: str, url: str) -> dict:
    """Scrape a single faction page and return its data as a plain dict.

    A fresh page is opened in ``context``, pointed at ``url`` and given time
    to settle; the page is always closed again, even when scraping fails.

    Args:
        context: Browser context used to open the page (only ``new_page()``
            is called on it).
        slug: Faction identifier, copied into the result.
        name: Faction display name, copied into the result.
        url: Address of the faction page to load.

    Returns:
        A dict with the keys ``slug``, ``name``, ``url``, ``version``,
        ``fetched_at`` (UTC timestamp), ``n_units``, ``n_rows`` and
        ``units``. ``units`` maps each unit name to a list of
        ``{"size", "pts", "tier"}`` entries in the order the rows were
        returned by ``EXTRACT_JS``.

    Raises:
        Whatever navigation or script evaluation raises; only a failure of
        the "networkidle" wait is tolerated.
    """
    tab = context.new_page()
    try:
        tab.goto(url, wait_until="domcontentloaded", timeout=45000)
        try:
            tab.wait_for_load_state("networkidle", timeout=20000)
        except Exception as exc:
            # Best effort only: report the problem and scrape what is there.
            print(f" networkidle timeout (continuing): {exc}", flush=True)
        # Fixed extra pause before reading the DOM.
        tab.wait_for_timeout(2500)

        # First "v<digits>.<digits>" token in the visible page text, or None.
        detected_version = tab.evaluate("""
            () => {
                const m = (document.body.innerText || '').match(/v\\d+\\.\\d+/);
                return m ? m[0] : null;
            }
        """)

        scraped_rows = tab.evaluate(EXTRACT_JS)

        # Bucket the flat row list by unit name, preserving row order.
        by_unit = {}
        for row in scraped_rows:
            bucket = by_unit.setdefault(row["unit"], [])
            bucket.append({
                "size": row["size"],
                "pts": row["pts"],
                "tier": row["tier"],
            })

        stamp = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
        result = {
            "slug": slug,
            "name": name,
            "url": url,
            "version": detected_version,
            "fetched_at": stamp,
            "n_units": len(by_unit),
            "n_rows": len(scraped_rows),
            "units": by_unit,
        }
        return result
    finally:
        tab.close()
|
|
|
|
|
|
def main():
    """Scrape every faction in ``FACTIONS`` and write the results to ``OUT_DIR``.

    For each ``(slug, name, url)`` entry a fresh browser context is created,
    the page is scraped with ``fetch_one`` and the result is written to
    ``<slug>.json``. A failure for one faction is printed and recorded in the
    manifest under an ``"error"`` key; the loop then continues with the next
    faction. After the browser is closed, ``_manifest.json`` is written and a
    summary is printed.

    All JSON files are written as UTF-8 explicitly. The data is dumped with
    ``ensure_ascii=False``, so relying on the locale's default encoding would
    raise ``UnicodeEncodeError`` (leaving a truncated file behind) whenever a
    non-ASCII character meets a non-UTF-8 locale.
    """
    manifest = {"factions": []}
    n_total = len(FACTIONS)
    print(f"Scraping {n_total} factions to {OUT_DIR}/", flush=True)
    with sync_playwright() as p:
        browser = p.chromium.launch(executable_path="/usr/bin/chromium", headless=True)
        try:
            for idx, (slug, name, url) in enumerate(FACTIONS, 1):
                t0 = time.time()
                print(f"[{idx:>2}/{n_total}] {name} -> {url}", flush=True)
                # One isolated context per faction; closed in the finally below.
                context = browser.new_context(
                    user_agent=("Mozilla/5.0 (X11; Linux x86_64) "
                                "AppleWebKit/537.36 (KHTML, like Gecko) "
                                "Chrome/148.0.0.0 Safari/537.36"),
                    viewport={"width": 1280, "height": 1800},
                )
                try:
                    data = fetch_one(context, slug, name, url)
                    out_path = OUT_DIR / f"{slug}.json"
                    # Explicit UTF-8: ensure_ascii=False emits raw non-ASCII text.
                    with open(out_path, "w", encoding="utf-8") as f:
                        json.dump(data, f, indent=2, ensure_ascii=False)
                    print(f" {data['n_units']:>3} units / {data['n_rows']:>3} rows "
                          f"-> {out_path.name} ({time.time()-t0:.1f}s)", flush=True)
                    manifest["factions"].append({
                        "slug": slug, "name": name, "url": url,
                        "version": data["version"],
                        "n_units": data["n_units"], "n_rows": data["n_rows"],
                        "file": out_path.name,
                        "elapsed_s": round(time.time() - t0, 1),
                    })
                except Exception as e:
                    # Top-level boundary for one faction: record and move on so
                    # a single bad page does not abort the whole run.
                    print(f" ERROR: {e}", flush=True)
                    manifest["factions"].append({
                        "slug": slug, "name": name, "url": url,
                        "error": str(e),
                    })
                finally:
                    context.close()
        finally:
            browser.close()

    manifest["generated_at"] = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
    manifest_path = OUT_DIR / "_manifest.json"
    with open(manifest_path, "w", encoding="utf-8") as f:
        json.dump(manifest, f, indent=2, ensure_ascii=False)

    # Entries without an "error" key are the factions that were scraped successfully.
    n_ok = sum(1 for entry in manifest["factions"] if "error" not in entry)
    n_err = len(manifest["factions"]) - n_ok
    print(f"\nDone. {n_ok}/{n_total} factions OK, {n_err} errors.")
    print(f"Manifest: {OUT_DIR / '_manifest.json'}")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|