- React + MUI DataGrid app with faction filter, search, change filter - Biggest movers cards (drops/rises) scoped to current filter view - Historical points graph modal (5 MFM versions: 1.14 → current) - URL state sync (faction, dir, q params — shareable URLs) - Grimdark favicon + OG embed image (Google Imagen) - Multi-stage Dockerfile (node build → nginx serve) - docker-compose.yml with Traefik + Cloudflare TLS - Data pipeline: build_deduped_data.py merges PDF + live scrape - Ynnari merged into Aeldari (shared codex) - Mobile responsive: flex columns, no fixed pixel widths - Color semantics: green=cheaper, red=costlier (consistent everywhere) - 1,449 units across 31 factions
260 lines
11 KiB
Python
260 lines
11 KiB
Python
"""Scrape live MFM data for all 30 factions.
|
|
|
|
Output: /root/wh40k-factions/live_data.json
|
|
{
|
|
"<faction-slug>": {
|
|
"name": "T'au Empire",
|
|
"version": "v1.0",
|
|
"url": "...",
|
|
"units": {
|
|
"Broadside Battlesuits": [
|
|
{
|
|
"size": "1 models",
|
|
"pts": 75,
|
|
"tier": "YOUR 1ST TO 2ND UNITS COST"
|
|
},
|
|
{"size": "2 models", "pts": 150, "tier": "YOUR 1ST TO 2ND UNITS COST"},
|
|
...
|
|
{"size": "1 models", "pts": 95, "tier": "YOUR 3RD + UNIT COSTS"},
|
|
],
|
|
...
|
|
}
|
|
}
|
|
}
|
|
"""
|
|
import json
|
|
import re
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
|
|
from playwright.sync_api import sync_playwright
|
|
|
|
# Root of the MFM site; every faction page lives at <root>/<slug>.
_MFM_BASE_URL = "https://mfm.warhammer-community.com/en"

# (slug, display name) for each faction, in the order they are scraped.
_FACTION_NAMES = [
    ("adepta-sororitas", "Adepta Sororitas"),
    ("adeptus-custodes", "Adeptus Custodes"),
    ("adeptus-mechanicus", "Adeptus Mechanicus"),
    ("aeldari", "Aeldari"),
    ("astra-militarum", "Astra Militarum"),
    ("black-templars", "Black Templars"),
    ("blood-angels", "Blood Angels"),
    ("chaos-daemons", "Chaos Daemons"),
    ("chaos-knights", "Chaos Knights"),
    ("chaos-space-marines", "Chaos Space Marines"),
    ("chaos-titan-legions", "Chaos Titan Legions"),
    ("dark-angels", "Dark Angels"),
    ("death-guard", "Death Guard"),
    ("deathwatch", "Deathwatch"),
    ("drukhari", "Drukhari"),
    ("emperors-children", "Emperor's Children"),
    ("genestealer-cults", "Genestealer Cults"),
    ("grey-knights", "Grey Knights"),
    ("imperial-agents", "Imperial Agents"),
    ("imperial-knights", "Imperial Knights"),
    ("leagues-of-votann", "Leagues of Votann"),
    ("necrons", "Necrons"),
    ("orks", "Orks"),
    ("space-marines", "Space Marines"),
    ("space-wolves", "Space Wolves"),
    ("tau-empire", "T'au Empire"),
    ("thousand-sons", "Thousand Sons"),
    ("titan-legions", "Titan Legions"),
    ("tyranids", "Tyranids"),
    ("world-eaters", "World Eaters"),
]

# (slug, display name, page URL) triples iterated by main(). The URL is
# derived from the slug so the two can never drift apart.
FACTIONS = [
    (slug, name, f"{_MFM_BASE_URL}/{slug}") for slug, name in _FACTION_NAMES
]
|
|
|
|
# Destination file for the scraped JSON; main() writes the combined result here.
OUT = Path("/root/wh40k-factions/live_data.json")
|
|
|
|
# JavaScript extractor — runs in the page context (passed to page.evaluate).
# Returns a list of:
#   {unit: str, tier: str|null, size: str, pts: int}
EXTRACT_JS = r"""
() => {
  const out = [];
  // Card root: <div class="flex flex-col space-y-1 m-1 print:break-inside-avoid-page">
  // First child is the unit-name banner. Then come tier blocks:
  //   <div class="space-y-1">
  //     <div ...>TIER LABEL</div>
  //     <ul><li><span>SIZE</span><span>PTS</span></li>...</ul>
  //   </div>
  const cards = document.querySelectorAll('div.flex.flex-col.space-y-1.m-1');
  for (const card of cards) {
    const header = card.firstElementChild;
    if (!header) continue;
    const unit = (header.innerText || '').trim();
    if (!unit) continue;

    // Each <ul> inside the card is treated as one tier's price list.
    const uls = card.querySelectorAll('ul');
    for (const ul of uls) {
      // Tier label: the first <div> child of the <ul>'s parent, which is
      // expected to be the .space-y-1 wrapper described above.
      let tier = null;
      let parent = ul.parentElement;  // .space-y-1
      if (parent) {
        const labelDiv = parent.querySelector(':scope > div');
        if (labelDiv) tier = (labelDiv.innerText || '').trim().replace(/\s+/g, ' ');
      }
      for (const li of ul.querySelectorAll('li')) {
        const spans = li.querySelectorAll('span');
        if (spans.length < 2) continue;
        const size = (spans[0].innerText || '').trim();
        const ptsText = (spans[1].innerText || '').trim();
        // Keep only the digits so "75 pts" parses as 75.
        const pts = parseInt(ptsText.replace(/[^\d]/g, ''), 10);
        if (!size || isNaN(pts)) continue;
        out.push({ unit, tier, size, pts });
      }
    }
  }
  return out;
}
"""
|
|
|
|
|
|
def fetch_one(p, slug: str, name: str, url: str, idx: int, total: int) -> dict:
    """Scrape one faction page and return its point costs grouped by unit.

    Args:
        p: Playwright handle, as yielded by ``sync_playwright()``.
        slug: Faction slug; also names the per-faction browser profile dir.
        name: Human-readable faction name.
        url: Page to load.
        idx: 1-based position of this faction (progress output only).
        total: Number of factions being scraped (progress output only).

    Returns:
        A dict with keys ``slug``, ``name``, ``url``, ``version``, ``units``
        and ``_status``. Scrape errors are not raised: they are reported via
        ``_status["ok"] = False`` and ``_status["error"]``, with ``version``
        set to ``None`` and ``units`` left empty.
    """
    print(f"[{idx:>2}/{total}] {name} -> {url}", flush=True)
    start = time.time()
    status = {
        "slug": slug, "name": name, "url": url,
        "ok": False, "n_units": 0, "n_rows": 0, "error": None,
        "elapsed_s": 0.0,
    }
    version = None
    units: dict[str, list[dict]] = {}
    context = None
    try:
        context = p.chromium.launch_persistent_context(
            user_data_dir=f"/tmp/wh40k-live-{slug}",
            executable_path="/usr/bin/chromium",
            headless=True,
            viewport={"width": 1280, "height": 1800},
            user_agent=(
                "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                "(KHTML, like Gecko) Chrome/148.0.0.0 Safari/537.36"
            ),
        )
        page = context.new_page()
        page.goto(url, wait_until="domcontentloaded", timeout=45000)
        try:
            page.wait_for_load_state("networkidle", timeout=20000)
        except Exception as e:
            # Best effort: a page that never goes idle may still have rendered.
            print(f"    networkidle timeout (continuing): {e}", flush=True)
        page.wait_for_timeout(2500)

        # Detect version: first "v<digits>.<digits>" token in the page text.
        version = page.evaluate("""
            () => {
                const m = (document.body.innerText || '').match(/v\\d+\\.\\d+/);
                return m ? m[0] : null;
            }
        """)

        rows = page.evaluate(EXTRACT_JS)

        # Group rows by unit
        for r in rows:
            units.setdefault(r["unit"], []).append({
                "size": r["size"],
                "pts": r["pts"],
                "tier": r["tier"],
            })

        status["ok"] = True
        status["n_units"] = len(units)
        status["n_rows"] = len(rows)
        status["version"] = version
        print(f"    OK units={len(units)} rows={len(rows)} version={version}", flush=True)
        if not rows:
            # Still reported as ok, but an empty result deserves a visible flag.
            print("    WARN extractor returned no rows", flush=True)
    except Exception as e:
        status["error"] = repr(e)
        # A failed scrape never reports partial data.
        version = None
        units = {}
        print(f"    FAIL {status['error']}", flush=True)
    finally:
        # Always release the browser, including when navigation or
        # extraction raised; otherwise each failure leaks a Chromium process.
        if context is not None:
            try:
                context.close()
            except Exception as e:
                print(f"    context close failed (ignored): {e!r}", flush=True)
        status["elapsed_s"] = round(time.time() - start, 2)

    return {
        "slug": slug,
        "name": name,
        "url": url,
        "version": version,
        "units": units,
        "_status": status,
    }
|
|
|
|
|
|
def main() -> int:
    """Scrape every faction in FACTIONS and write the combined JSON to OUT.

    Returns:
        Process exit code: 0 when every faction scraped successfully,
        1 when at least one failed (the file is written either way).
    """
    out: dict = {}
    total = len(FACTIONS)
    with sync_playwright() as p:
        for i, (slug, name, url) in enumerate(FACTIONS, 1):
            out[slug] = fetch_one(p, slug, name, url, i, total)
            if i < total:
                time.sleep(2.0)  # politeness between pages

    # Create the target directory up front so a long scrape is not lost to a
    # missing folder, and pin UTF-8 because ensure_ascii=False emits raw
    # non-ASCII characters that a non-UTF-8 locale default could not encode.
    OUT.parent.mkdir(parents=True, exist_ok=True)
    OUT.write_text(
        json.dumps(out, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    print(f"\nwrote {OUT}")

    ok = sum(1 for r in out.values() if r["_status"]["ok"])
    print(f"summary: {ok}/{len(out)} factions scraped")
    return 0 if ok == len(out) else 1
|
|
|
|
|
|
# Script entry point: run the scrape and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())
|