Files
wh40k-points-comparator/scrape_live.py
root 38bffa491c Initial commit: WH40K Points Comparator
- React + MUI DataGrid app with faction filter, search, change filter
- Biggest movers cards (drops/rises) scoped to current filter view
- Historical points graph modal (5 MFM versions: 1.14 → current)
- URL state sync (faction, dir, q params — shareable URLs)
- Grimdark favicon + OG embed image (Google Imagen)
- Multi-stage Dockerfile (node build → nginx serve)
- docker-compose.yml with Traefik + Cloudflare TLS
- Data pipeline: build_deduped_data.py merges PDF + live scrape
- Ynnari merged into Aeldari (shared codex)
- Mobile responsive: flex columns, no fixed pixel widths
- Color semantics: green=cheaper, red=costlier (consistent everywhere)
- 1,449 units across 31 factions
2026-06-18 02:42:29 +00:00

260 lines
11 KiB
Python

"""Scrape live MFM data for all 30 factions.
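Output: /root/wh40k-factions/live_data.json

The file is a JSON object keyed by faction slug. "version" is null when no
"vN.N" string is found on the page or when the scrape fails; "_status"
carries per-faction scrape diagnostics.

    {
      "<faction-slug>": {
        "slug": "<faction-slug>",
        "name": "T'au Empire",
        "version": "v1.0",
        "url": "...",
        "units": {
          "Broadside Battlesuits": [
            {
              "size": "1 models",
              "pts": 75,
              "tier": "YOUR 1ST TO 2ND UNITS COST"
            },
            {"size": "2 models", "pts": 150, "tier": "YOUR 1ST TO 2ND UNITS COST"},
            ...
            {"size": "1 models", "pts": 95, "tier": "YOUR 3RD + UNIT COSTS"},
          ],
          ...
        },
        "_status": {
          "slug": "...", "name": "...", "url": "...",
          "ok": true, "n_units": 0, "n_rows": 0,
          "error": null, "elapsed_s": 0.0
        }
      }
    }
"""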
import json
import re
import sys
import time
from pathlib import Path
from playwright.sync_api import sync_playwright
# Common prefix of every faction page; a faction's URL is this plus its slug.
_MFM_BASE_URL = "https://mfm.warhammer-community.com/en/"

# (slug, display name) for each faction to scrape.
_FACTION_NAMES = (
    ("adepta-sororitas", "Adepta Sororitas"),
    ("adeptus-custodes", "Adeptus Custodes"),
    ("adeptus-mechanicus", "Adeptus Mechanicus"),
    ("aeldari", "Aeldari"),
    ("astra-militarum", "Astra Militarum"),
    ("black-templars", "Black Templars"),
    ("blood-angels", "Blood Angels"),
    ("chaos-daemons", "Chaos Daemons"),
    ("chaos-knights", "Chaos Knights"),
    ("chaos-space-marines", "Chaos Space Marines"),
    ("chaos-titan-legions", "Chaos Titan Legions"),
    ("dark-angels", "Dark Angels"),
    ("death-guard", "Death Guard"),
    ("deathwatch", "Deathwatch"),
    ("drukhari", "Drukhari"),
    ("emperors-children", "Emperor's Children"),
    ("genestealer-cults", "Genestealer Cults"),
    ("grey-knights", "Grey Knights"),
    ("imperial-agents", "Imperial Agents"),
    ("imperial-knights", "Imperial Knights"),
    ("leagues-of-votann", "Leagues of Votann"),
    ("necrons", "Necrons"),
    ("orks", "Orks"),
    ("space-marines", "Space Marines"),
    ("space-wolves", "Space Wolves"),
    ("tau-empire", "T'au Empire"),
    ("thousand-sons", "Thousand Sons"),
    ("titan-legions", "Titan Legions"),
    ("tyranids", "Tyranids"),
    ("world-eaters", "World Eaters"),
)

# (slug, display name, page URL) triples, in scrape order.
FACTIONS = [
    (faction_slug, faction_name, _MFM_BASE_URL + faction_slug)
    for faction_slug, faction_name in _FACTION_NAMES
]

# Destination file for the combined scrape result.
OUT = Path("/root/wh40k-factions/live_data.json")
# JavaScript extractor — runs in the page context (passed to page.evaluate).
# Returns a list of:
#   {unit: str, tier: str|null, size: str, pts: int}
# One entry is produced per <li> that has at least two <span>s, a non-empty
# first span (the size) and at least one digit in the second span (the points).
EXTRACT_JS = r"""
() => {
    const out = [];
    // Card root: <div class="flex flex-col space-y-1 m-1 print:break-inside-avoid-page">
    // First child is the unit-name banner. Then come tier blocks:
    //   <div class="space-y-1">
    //     <div ...>TIER LABEL</div>
    //     <ul><li><span>SIZE</span><span>PTS</span></li>...</ul>
    //   </div>
    const cards = document.querySelectorAll('div.flex.flex-col.space-y-1.m-1');
    for (const card of cards) {
        const header = card.firstElementChild;
        if (!header) continue;
        const unit = (header.innerText || '').trim();
        if (!unit) continue;
        // Visit every <ul> in the card; each one is a cost list for one tier.
        const uls = card.querySelectorAll('ul');
        for (const ul of uls) {
            // Tier label: the first direct <div> child of the <ul>'s parent
            // (the .space-y-1 wrapper), with whitespace runs collapsed.
            let tier = null;
            let parent = ul.parentElement; // .space-y-1
            if (parent) {
                const labelDiv = parent.querySelector(':scope > div');
                if (labelDiv) tier = (labelDiv.innerText || '').trim().replace(/\s+/g, ' ');
            }
            for (const li of ul.querySelectorAll('li')) {
                const spans = li.querySelectorAll('span');
                if (spans.length < 2) continue;
                const size = (spans[0].innerText || '').trim();
                const ptsText = (spans[1].innerText || '').trim();
                // Strip everything but digits ("75 pts" -> 75); NaN means no digits.
                const pts = parseInt(ptsText.replace(/[^\d]/g, ''), 10);
                if (!size || isNaN(pts)) continue;
                out.push({ unit, tier, size, pts });
            }
        }
    }
    return out;
}
"""
def fetch_one(p, slug: str, name: str, url: str, idx: int, total: int) -> dict:
    """Scrape one faction page and return its rows grouped by unit name.

    A headless Chromium persistent context is launched for the faction, the
    page is loaded, and ``EXTRACT_JS`` is evaluated in it. The context is
    always closed, including when navigation or evaluation fails.

    Args:
        p: Playwright handle; only ``p.chromium.launch_persistent_context``
            is used.
        slug: Faction slug; also used to name the browser profile directory
            under ``/tmp``.
        name: Display name, used for logging and copied into the result.
        url: Page to load.
        idx: Position of this faction, shown in the progress line.
        total: Number of factions, shown in the progress line.

    Returns:
        A dict with keys ``slug``, ``name``, ``url``, ``version``, ``units``
        and ``_status``. Exceptions derived from ``Exception`` are not
        propagated: on failure ``version`` is ``None``, ``units`` is empty,
        and ``_status["error"]`` holds the ``repr`` of the exception.
    """
    print(f"[{idx:>2}/{total}] {name} -> {url}", flush=True)
    start = time.time()
    status = {
        "slug": slug, "name": name, "url": url,
        "ok": False, "n_units": 0, "n_rows": 0, "error": None,
        "elapsed_s": 0.0,
    }
    try:
        context = p.chromium.launch_persistent_context(
            user_data_dir=f"/tmp/wh40k-live-{slug}",
            executable_path="/usr/bin/chromium",
            headless=True,
            viewport={"width": 1280, "height": 1800},
            user_agent=(
                "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                "(KHTML, like Gecko) Chrome/148.0.0.0 Safari/537.36"
            ),
        )
        # Everything that touches the page sits inside try/finally so the
        # browser context is released even when goto/evaluate raises.
        try:
            page = context.new_page()
            page.goto(url, wait_until="domcontentloaded", timeout=45000)
            try:
                page.wait_for_load_state("networkidle", timeout=20000)
            except Exception as e:
                # Best effort: a page that never goes idle is still scraped.
                print(f" networkidle timeout (continuing): {e}", flush=True)
            page.wait_for_timeout(2500)
            # Detect version: first "v<digits>.<digits>" in the body text.
            version = page.evaluate("""
                () => {
                    const m = (document.body.innerText || '').match(/v\\d+\\.\\d+/);
                    return m ? m[0] : null;
                }
            """)
            rows = page.evaluate(EXTRACT_JS)
        finally:
            context.close()
        # Group rows by unit
        units: dict[str, list[dict]] = {}
        for r in rows:
            units.setdefault(r["unit"], []).append({
                "size": r["size"],
                "pts": r["pts"],
                "tier": r["tier"],
            })
        status["ok"] = True
        status["n_units"] = len(units)
        status["n_rows"] = len(rows)
        status["version"] = version
        # Record the duration for successful scrapes too, not only failures.
        status["elapsed_s"] = round(time.time() - start, 2)
        print(f" OK units={len(units)} rows={len(rows)} version={version}", flush=True)
        return {
            "slug": slug,
            "name": name,
            "url": url,
            "version": version,
            "units": units,
            "_status": status,
        }
    except Exception as e:
        # Per-faction boundary: record the failure and let the caller go on
        # to the next faction.
        status["error"] = repr(e)
        status["elapsed_s"] = round(time.time() - start, 2)
        print(f" FAIL {status['error']}", flush=True)
        return {
            "slug": slug,
            "name": name,
            "url": url,
            "version": None,
            "units": {},
            "_status": status,
        }
def main() -> int:
    """Scrape every faction in ``FACTIONS`` and write the combined JSON to ``OUT``.

    Factions are fetched one after another with a two-second pause between
    pages. The output file is written even when some factions fail; their
    entries carry the failure details under ``_status``.

    Returns:
        0 when every faction was scraped successfully, 1 otherwise.
    """
    out: dict = {}
    with sync_playwright() as p:
        for i, (slug, name, url) in enumerate(FACTIONS, 1):
            out[slug] = fetch_one(p, slug, name, url, i, len(FACTIONS))
            if i < len(FACTIONS):
                time.sleep(2.0)  # politeness between pages
    # Create the target directory if needed so a finished scrape is not lost
    # to a missing path.
    OUT.parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file
    # encoding must be pinned rather than left to the locale default.
    OUT.write_text(
        json.dumps(out, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    print(f"\nwrote {OUT}")
    ok = sum(1 for r in out.values() if r["_status"]["ok"])
    print(f"summary: {ok}/{len(out)} factions scraped")
    return 0 if ok == len(out) else 1


if __name__ == "__main__":
    sys.exit(main())