This commit is contained in:
41
.env.example
41
.env.example
@@ -66,4 +66,43 @@ LOG_LEVEL=debug
|
||||
|
||||
LITELLM_BASE_URL=
|
||||
LITELLM_API_KEY=
|
||||
LITELLM_MODEL=ollama-cloud
|
||||
LITELLM_MODEL=ollama-cloud
|
||||
|
||||
# ── Live integration tests (tests/integration/graphmcp/) ──────────────────────
|
||||
# Opt-in gates for the live E2E suite. With neither set, `npm run test:int`
|
||||
# skips all 16 graphmcp tests (and the 2 phase1 tests) and exits 0 — CI-safe.
|
||||
#
|
||||
# RUN_GRAPHMCP_LIVE=1 activates ONLY the AC1 contract suite, which needs a
|
||||
# reachable GraphMCP and nothing else (no Discord/LLM/Redis).
|
||||
# RUN_FULL_E2E=1 activates AC2–AC4 (and AC1). Needs the full live stack:
|
||||
# real Discord gateway, real LLM, real Redis, real GraphMCP.
|
||||
# RUN_GRAPHMCP_LIVE=1
|
||||
# RUN_FULL_E2E=1
|
||||
|
||||
# ── Required for RUN_FULL_E2E=1 (AC2–AC4) ──────────────────────────────────────
|
||||
# A dedicated Discord test guild + channel (NOT a production server).
|
||||
# E2E_TEST_GUILD_ID=123456789012345678
|
||||
# E2E_TEST_CHANNEL_ID=234567890123456789
|
||||
|
||||
# Token for a SECOND bot that posts chat messages / @mentions into the thread
|
||||
# (the bot under test cannot be driven by another bot's slash commands).
|
||||
# E2E_DRIVER_TOKEN=your_second_bot_token
|
||||
|
||||
# Discord user ID of whoever the driver bot acts as. Used as interaction.user.id
|
||||
# in the hybrid slash-command fakes. If DISCORD_ALLOWED_USERS (above) is non-empty,
|
||||
# this ID MUST be listed there or /encounter start|end will be rejected.
|
||||
# E2E_DRIVER_USER_ID=123456789012345678
|
||||
|
||||
# ── Optional test knobs ───────────────────────────────────────────────────────
|
||||
# Real NPC name present in the graph — enables AC1 S1.1 (query_as_npc). When
|
||||
# unset, S1.1 is skipped; the rest of AC1 still runs.
|
||||
# E2E_TEST_NPC=miriam-merchant-mardonar
|
||||
|
||||
# Spec to start for AC2/AC3 encounters (defaults to market-thief).
|
||||
# E2E_SPEC=market-thief
|
||||
#
|
||||
# NOTE: when RUN_FULL_E2E=1, the test bootstrap (tests/integration/graphmcp/support/env.ts)
|
||||
# auto-seeds DISCORD_ALLOWED_CHANNELS from E2E_TEST_CHANNEL_ID if you haven't set
|
||||
# it — so you don't have to edit DISCORD_ALLOWED_CHANNELS just to run the suite.
|
||||
# It also injects harmless DISCORD_TOKEN/DISCORD_CLIENT_ID stubs when absent, so
|
||||
# the AC1 contract suite can run without any Discord creds at all.
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -4,3 +4,4 @@ coverage/
|
||||
.env
|
||||
*.log
|
||||
.DS_Store
|
||||
data/
|
||||
@@ -1,22 +1,66 @@
|
||||
{
|
||||
"market-thief": {
|
||||
"runs": 4,
|
||||
"lastRun": "2026-05-26T21:44:33.947Z"
|
||||
"runs": 9,
|
||||
"lastRun": "2026-06-19T23:21:11.305Z"
|
||||
},
|
||||
"mawfang-pursuit": {
|
||||
"runs": 2,
|
||||
"lastRun": "2026-05-26T03:22:23.938Z"
|
||||
},
|
||||
"cog-claw-debt": {
|
||||
"runs": 3,
|
||||
"lastRun": "2026-05-26T03:22:19.935Z"
|
||||
"runs": 4,
|
||||
"lastRun": "2026-06-19T23:05:08.525Z"
|
||||
},
|
||||
"stormscar-pilgrim": {
|
||||
"runs": 2,
|
||||
"lastRun": "2026-05-30T05:49:10.825Z"
|
||||
},
|
||||
"silt-leak": {
|
||||
"runs": 3,
|
||||
"lastRun": "2026-06-19T23:28:07.201Z"
|
||||
},
|
||||
"e2e-e2e-1781890729662-3355702": {
|
||||
"runs": 1,
|
||||
"lastRun": "2026-05-30T03:07:28.390Z"
|
||||
"lastRun": "2026-06-19T17:38:54.782Z"
|
||||
},
|
||||
"e2e-e2e-1781890851529-3357649": {
|
||||
"runs": 1,
|
||||
"lastRun": "2026-06-19T17:40:55.920Z"
|
||||
},
|
||||
"e2e-e2e-1781891305502-3365683": {
|
||||
"runs": 1,
|
||||
"lastRun": "2026-06-19T17:48:29.982Z"
|
||||
},
|
||||
"e2e-e2e-1781891467455-3368263": {
|
||||
"runs": 1,
|
||||
"lastRun": "2026-06-19T17:51:11.725Z"
|
||||
},
|
||||
"e2e-e2e-1781891592524-3371960": {
|
||||
"runs": 1,
|
||||
"lastRun": "2026-06-19T17:53:17.101Z"
|
||||
},
|
||||
"e2e-e2e-1781891643550-3373409": {
|
||||
"runs": 1,
|
||||
"lastRun": "2026-06-19T17:54:07.817Z"
|
||||
},
|
||||
"e2e-e2e-1781891844521-3377360": {
|
||||
"runs": 1,
|
||||
"lastRun": "2026-06-19T17:57:29.044Z"
|
||||
},
|
||||
"e2e-e2e-1781892020208-3381134": {
|
||||
"runs": 1,
|
||||
"lastRun": "2026-06-19T18:00:24.481Z"
|
||||
},
|
||||
"e2e-e2e-1781892172019-3384843": {
|
||||
"runs": 1,
|
||||
"lastRun": "2026-06-19T18:02:56.469Z"
|
||||
},
|
||||
"whispering-stone": {
|
||||
"runs": 2,
|
||||
"lastRun": "2026-06-19T23:00:42.503Z"
|
||||
},
|
||||
"velvet-auction": {
|
||||
"runs": 1,
|
||||
"lastRun": "2026-06-19T23:42:21.918Z"
|
||||
}
|
||||
}
|
||||
@@ -309,80 +309,115 @@ export async function runLLMTurn(
|
||||
}
|
||||
}
|
||||
|
||||
if (response.narrative) {
|
||||
// Skip roll-claim filter when a skill check result is in recent context —
|
||||
// the LLM is narrating a known outcome, not fabricating a pre-roll result.
|
||||
const recentHistory = session.history.slice(-6);
|
||||
const rollResultRecent = recentHistory.some(m => m.content.startsWith('[SKILL CHECK RESULT]'));
|
||||
const filter = filterLLMResponse(response.narrative, { skipRollClaim: rollResultRecent });
|
||||
if (!filter.ok) {
|
||||
logFiltered(filter.reason!, response.narrative, {
|
||||
threadId: session.threadId,
|
||||
encounterId: session.encounterId,
|
||||
});
|
||||
// A turn must always grow history by ≥1 so the generation completes and the
|
||||
// scheduler drains. Several paths used to silently drop a turn — a filtered
|
||||
// response that was already retried, a tool-call turn whose session vanished,
|
||||
// an LLM reply with neither narrative nor tool, or an exception thrown inside
|
||||
// this block (the scheduler's try/finally has no catch, so it killed the turn
|
||||
// and the narrator went quiet). `appended` tracks whether anything persisted;
|
||||
// the fallback at the end guarantees progress and surfaces the failure mode.
|
||||
let appended = false;
|
||||
try {
|
||||
if (response.narrative) {
|
||||
// Skip roll-claim filter when a skill check result is in recent context —
|
||||
// the LLM is narrating a known outcome, not fabricating a pre-roll result.
|
||||
const recentHistory = session.history.slice(-6);
|
||||
const rollResultRecent = recentHistory.some(
|
||||
m => typeof m.content === 'string' && m.content.startsWith('[SKILL CHECK RESULT]'),
|
||||
);
|
||||
const filter = filterLLMResponse(response.narrative, { skipRollClaim: rollResultRecent });
|
||||
if (!filter.ok) {
|
||||
logFiltered(filter.reason!, response.narrative, {
|
||||
threadId: session.threadId,
|
||||
encounterId: session.encounterId,
|
||||
});
|
||||
|
||||
// Guard against tight retry loops: skip if we just injected a correction.
|
||||
const lastMsg = session.history[session.history.length - 1];
|
||||
const alreadyRetried = lastMsg?.role === 'system' && lastMsg.content.startsWith('[FILTER CORRECTION]');
|
||||
// Guard against tight retry loops: skip if we just injected a correction.
|
||||
const lastMsg = session.history[session.history.length - 1];
|
||||
const alreadyRetried =
|
||||
lastMsg?.role === 'system' &&
|
||||
typeof lastMsg.content === 'string' &&
|
||||
lastMsg.content.startsWith('[FILTER CORRECTION]');
|
||||
|
||||
if (!alreadyRetried) {
|
||||
const correctionText = filter.reason === 'fabricated_roll_result'
|
||||
? 'Do NOT state or imply a specific dice result. Wait for the [SKILL CHECK RESULT] system message before narrating any outcome.'
|
||||
: filter.reason === 'echoed_system_tag'
|
||||
? 'Do NOT echo internal system tags like [TOOL], [SESSION], or [SKILL CHECK] verbatim in your response.'
|
||||
: 'Your previous response was empty. Continue the scene.';
|
||||
if (!alreadyRetried) {
|
||||
const correctionText = filter.reason === 'fabricated_roll_result'
|
||||
? 'Do NOT state or imply a specific dice result. Wait for the [SKILL CHECK RESULT] system message before narrating any outcome.'
|
||||
: filter.reason === 'echoed_system_tag'
|
||||
? 'Do NOT echo internal system tags like [TOOL], [SESSION], or [SKILL CHECK] verbatim in your response.'
|
||||
: 'Your previous response was empty. Continue the scene.';
|
||||
|
||||
const correction: ChatMessage = {
|
||||
role: 'system',
|
||||
content: `[FILTER CORRECTION] Your last response was suppressed (${filter.reason}). ${correctionText}`,
|
||||
const correction: ChatMessage = {
|
||||
role: 'system',
|
||||
content: `[FILTER CORRECTION] Your last response was suppressed (${filter.reason}). ${correctionText}`,
|
||||
timestamp: Date.now(),
|
||||
};
|
||||
await sessionManager.addMessage(session.threadId, correction);
|
||||
appended = true;
|
||||
|
||||
// Retry once with the correction in context.
|
||||
scheduleEncounterLLMTurn(session.threadId, thread, _client, true);
|
||||
}
|
||||
// Fall through so any accompanying tool call still fires.
|
||||
} else {
|
||||
await thread.send(response.narrative);
|
||||
// Only store an assistant message when there is actual narrative.
|
||||
// Tool-call-only turns are represented solely by the system message the
|
||||
// tool handler writes. Storing a placeholder teaches the LLM to echo it.
|
||||
const assistantMsg: ChatMessage = {
|
||||
role: 'assistant',
|
||||
content: response.narrative,
|
||||
timestamp: Date.now(),
|
||||
};
|
||||
await sessionManager.addMessage(session.threadId, correction);
|
||||
|
||||
// Retry once with the correction in context.
|
||||
scheduleEncounterLLMTurn(session.threadId, thread, _client, true);
|
||||
await sessionManager.addMessage(session.threadId, assistantMsg);
|
||||
appended = true;
|
||||
}
|
||||
// Fall through so any accompanying tool call still fires.
|
||||
} else {
|
||||
await thread.send(response.narrative);
|
||||
// Only store an assistant message when there is actual narrative.
|
||||
// Tool-call-only turns are represented solely by the system message the
|
||||
// tool handler writes. Storing a placeholder teaches the LLM to echo it.
|
||||
const assistantMsg: ChatMessage = {
|
||||
role: 'assistant',
|
||||
content: response.narrative,
|
||||
timestamp: Date.now(),
|
||||
};
|
||||
await sessionManager.addMessage(session.threadId, assistantMsg);
|
||||
}
|
||||
|
||||
if (response.toolCall) {
|
||||
const freshSession = await sessionManager.get(session.threadId);
|
||||
if (freshSession) {
|
||||
const result = await dispatchTool(response.toolCall, { session: freshSession, thread });
|
||||
|
||||
const toolMsg: ChatMessage = {
|
||||
role: 'system',
|
||||
content: result.systemMessage,
|
||||
timestamp: Date.now(),
|
||||
};
|
||||
await sessionManager.addMessage(session.threadId, toolMsg);
|
||||
appended = true;
|
||||
|
||||
if (result.error) {
|
||||
await thread.send('*The narrator stumbles… something went wrong behind the scenes. Try your action again.*');
|
||||
}
|
||||
|
||||
if (result.resolved) {
|
||||
await sessionManager.update(session.threadId, {
|
||||
phase: 'resolved',
|
||||
outcome: result.resolved.outcomeId,
|
||||
outcomeSummary: result.resolved.summary,
|
||||
});
|
||||
setTimeout(async () => {
|
||||
await (thread as ThreadChannel).setArchived?.(true).catch(() => null);
|
||||
}, 5_000);
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (err) {
|
||||
// Never let a turn die silently — log and fall through to the always-append
|
||||
// guard so history still grows and the scheduler drains.
|
||||
console.error('[messageRouter] turn processing failed:', err);
|
||||
}
|
||||
|
||||
if (response.toolCall) {
|
||||
const freshSession = await sessionManager.get(session.threadId);
|
||||
if (!freshSession) return;
|
||||
|
||||
const result = await dispatchTool(response.toolCall, { session: freshSession, thread });
|
||||
|
||||
const toolMsg: ChatMessage = {
|
||||
role: 'system',
|
||||
content: result.systemMessage,
|
||||
timestamp: Date.now(),
|
||||
};
|
||||
await sessionManager.addMessage(session.threadId, toolMsg);
|
||||
|
||||
if (result.error) {
|
||||
await thread.send('*The narrator stumbles… something went wrong behind the scenes. Try your action again.*');
|
||||
}
|
||||
|
||||
if (result.resolved) {
|
||||
await sessionManager.update(session.threadId, {
|
||||
phase: 'resolved',
|
||||
outcome: result.resolved.outcomeId,
|
||||
outcomeSummary: result.resolved.summary,
|
||||
});
|
||||
setTimeout(async () => {
|
||||
await (thread as ThreadChannel).setArchived?.(true).catch(() => null);
|
||||
}, 5_000);
|
||||
}
|
||||
if (!appended) {
|
||||
// The LLM produced no usable narrative/tool, or processing threw before
|
||||
// anything persisted. Record a fallback beat so this turn still completes
|
||||
// deterministically — otherwise it is lost and the narrator goes quiet.
|
||||
await sessionManager
|
||||
.addMessage(session.threadId, {
|
||||
role: 'system',
|
||||
content: '[NO RESPONSE] The narrator gave no usable reply this beat; awaiting the next action.',
|
||||
timestamp: Date.now(),
|
||||
})
|
||||
.catch(() => null);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -104,13 +104,52 @@ export async function queryAsNPC(
|
||||
question: string,
|
||||
limit = 5,
|
||||
): Promise<NPCQueryResult> {
|
||||
const result = await callTool('query_as_npc', { npc_name: npcName, question, limit });
|
||||
return result as NPCQueryResult;
|
||||
const result = await callTool('query_as_npc', { npc_name: npcName, question, limit }) as
|
||||
| NPCQueryResult
|
||||
| null;
|
||||
// GraphMCP returns `chunks: null` (and sometimes `graph_context: null`) for
|
||||
// NPCs with no prior memory. The declared contract is arrays; normalize at
|
||||
// this boundary so the type holds for every caller. formatNPCMemory already
|
||||
// defended with `?? []`, but the raw `as NPCQueryResult` cast let null leak
|
||||
// straight through to any caller reading .length/.map.
|
||||
return {
|
||||
...(result ?? ({} as NPCQueryResult)),
|
||||
chunks: Array.isArray(result?.chunks) ? result.chunks : [],
|
||||
graph_context: Array.isArray(result?.graph_context) ? result.graph_context : [],
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Normalize one raw GraphMCP search hit into the declared SemanticChunk shape.
 *
 * The live backend sends hits as `{ text, score, source, author, timestamp, msgID }`,
 * while SemanticChunk and its callers (encounter.ts handleGenerate, mentionHandler)
 * read `.content`. Left unmapped, `c.content` is undefined and `c.content.slice(...)`
 * in /encounter generate throws "Cannot read properties of undefined (reading 'slice')"
 * — the same crash class as the loreResult.chunks failure.
 *
 * Field handling:
 *  - content: `text` when it is a string, otherwise `content` when it is a string,
 *    otherwise the empty string (either field name is accepted).
 *  - score:   the numeric `score`, or 0 when it is not a number.
 *  - source:  the string `source`, or undefined when it is not a string.
 *
 * A null/undefined hit is treated as an empty object, so it yields the defaults.
 */
function toSemanticChunk(raw: unknown): SemanticChunk {
  const hit = (raw ?? {}) as { text?: unknown; content?: unknown; score?: unknown; source?: unknown };

  // Prefer the backend's `text` field; fall back to an already-shaped `content`.
  let body = '';
  if (typeof hit.text === 'string') {
    body = hit.text;
  } else if (typeof hit.content === 'string') {
    body = hit.content;
  }

  const relevance = typeof hit.score === 'number' ? hit.score : 0;
  const origin = typeof hit.source === 'string' ? hit.source : undefined;

  return { content: body, score: relevance, source: origin };
}
|
||||
|
||||
export async function semanticSearch(query: string, limit = 5): Promise<SemanticSearchResult> {
|
||||
const result = await callTool('semantic_search', { query, limit });
|
||||
return (result ?? { chunks: [] }) as SemanticSearchResult;
|
||||
// GraphMCP may return null, a bare array, or { chunks: [...] | null }. The
|
||||
// old `result ?? { chunks: [] }` only coalesced a null/undefined *result*; a
|
||||
// result whose `chunks` field was missing/null slipped through as-is, so
|
||||
// `loreResult.chunks.length` threw "Cannot read properties of undefined
|
||||
// (reading 'length')". Normalize at this boundary so the typed contract
|
||||
// ({ chunks: SemanticChunk[] }) always holds for every caller, and map each
|
||||
// chunk to the declared shape (text → content).
|
||||
const raw = Array.isArray(result)
|
||||
? result
|
||||
: (result as { chunks?: unknown } | null)?.chunks;
|
||||
return { chunks: Array.isArray(raw) ? raw.map(toSemanticChunk) : [] };
|
||||
}
|
||||
|
||||
export async function logEncounter(params: LogEncounterParams): Promise<LogEncounterResult> {
|
||||
@@ -145,7 +184,9 @@ export interface EncounterDetails {
|
||||
|
||||
export async function listEncounters(limit = 10): Promise<EncounterResultItem[]> {
|
||||
const result = await callTool('list_encounters', { limit });
|
||||
return (result ?? []) as EncounterResultItem[];
|
||||
// Same boundary guard as semanticSearch: only accept an actual array so a
|
||||
// wrong-shape GraphMCP response can't reach callers as a non-array.
|
||||
return Array.isArray(result) ? (result as EncounterResultItem[]) : [];
|
||||
}
|
||||
|
||||
export async function searchEncounters(params: {
|
||||
@@ -155,7 +196,7 @@ export async function searchEncounters(params: {
|
||||
limit?: number;
|
||||
}): Promise<EncounterResultItem[]> {
|
||||
const result = await callTool('search_encounters', params);
|
||||
return (result ?? []) as EncounterResultItem[];
|
||||
return Array.isArray(result) ? (result as EncounterResultItem[]) : [];
|
||||
}
|
||||
|
||||
export async function getEncounter(id: string): Promise<EncounterDetails> {
|
||||
|
||||
@@ -0,0 +1,393 @@
|
||||
---
|
||||
stepsCompleted: ['step-01-preflight-and-context', 'step-02-generation-mode', 'step-03-test-strategy', 'step-04-generate-tests', 'step-05-validate-and-complete']
|
||||
lastStep: 'step-05-validate-and-complete'
|
||||
lastSaved: '2026-06-19'
|
||||
workflowType: 'testarch-atdd'
|
||||
storyId: 'graphmcp.live.1'
|
||||
storyKey: 'graphmcp-live-integration-tests'
|
||||
storyFile: '(user-provided goal — no BMad story file in this repo)'
|
||||
atddChecklistPath: 'tests/integration/atdd-checklist-graphmcp-live-integration-tests.md'
|
||||
generatedTestFiles:
|
||||
- 'tests/integration/graphmcp/contract.test.ts'
|
||||
- 'tests/integration/graphmcp/encounter-lifecycle.test.ts'
|
||||
- 'tests/integration/graphmcp/skill-check.test.ts'
|
||||
- 'tests/integration/graphmcp/lore-and-events.test.ts'
|
||||
- 'tests/integration/graphmcp/long-encounter.test.ts'
|
||||
- 'tests/integration/graphmcp/support/env.ts'
|
||||
- 'tests/integration/graphmcp/support/poll.ts'
|
||||
- 'tests/integration/graphmcp/support/factories.ts'
|
||||
- 'tests/integration/graphmcp/support/fakes.ts'
|
||||
- 'tests/integration/graphmcp/support/liveBots.ts'
|
||||
- 'tests/integration/graphmcp/support/cleanup.ts'
|
||||
inputDocuments:
|
||||
- 'resources/knowledge/data-factories.md'
|
||||
- 'resources/knowledge/component-tdd.md'
|
||||
- 'resources/knowledge/test-quality.md'
|
||||
- 'resources/knowledge/test-healing-patterns.md'
|
||||
- 'resources/knowledge/test-levels-framework.md'
|
||||
- 'resources/knowledge/test-priorities-matrix.md'
|
||||
- 'resources/knowledge/ci-burn-in.md'
|
||||
- 'tests/integration/phase1.test.ts'
|
||||
- 'vitest.config.ts'
|
||||
- 'src/config.ts'
|
||||
- 'src/graphmcp/client.ts'
|
||||
- 'src/bot/index.ts'
|
||||
- 'src/bot/commands/encounter.ts'
|
||||
- 'src/bot/handlers/messageRouter.ts'
|
||||
---
|
||||
|
||||
# ATDD Checklist — GraphMCP Live Integration Tests
|
||||
|
||||
**Date:** 2026-06-19
|
||||
**Author:** TEA Agent (no BMad config in this repo — running on skill defaults)
|
||||
**Primary Test Level:** Integration (live infrastructure: real Discord gateway + real LLM + real GraphMCP + real Redis)
|
||||
|
||||
---
|
||||
|
||||
## Story Summary
|
||||
|
||||
A live-infrastructure integration test suite that runs a real Mardonar encounter end-to-end against a running GraphMCP backend and verifies the slash-command outputs, skill-check tooling, and lore/question-answering paths that interface with the real graph database.
|
||||
|
||||
**As a** Mardonar maintainer
|
||||
**I want** an integration suite that exercises the real GraphMCP backend (and real Discord + real LLM + real Redis) through the bot's encounter flow
|
||||
**So that** regressions in the GraphMCP contract, encounter lifecycle, skill-check tools, and lore/event-logging paths are caught before they reach players — including the wrong-shape-response crash class recently fixed in `src/graphmcp/client.ts`.
|
||||
|
||||
---
|
||||
|
||||
## Acceptance Criteria
|
||||
|
||||
1. **AC1 — GraphMCP connectivity & JSON-RPC contract.** Given a reachable GraphMCP endpoint (`GRAPHMCP_URL`), when the suite invokes each JSON-RPC tool (`query_as_npc`, `semantic_search`, `log_encounter`, `list_encounters`, `search_encounters`, `get_encounter`), then each returns a payload matching its declared TypeScript contract in `src/graphmcp/client.ts`, and wrong-shape success responses (missing/null `chunks`, non-array encounter lists, bare arrays) are normalized — never crash callers with `Cannot read properties of undefined (reading 'length')`.
|
||||
|
||||
2. **AC2 — Real encounter lifecycle via slash commands.** Given the bot connected to the real Discord gateway with real Redis + GraphMCP + LLM, when the suite drives `/encounter start` (hybrid: `execute()` with a fake interaction backed by real channel objects from the live client), then a thread is created, the opening narrative is posted to Discord, and a `SessionState` is persisted in Redis; when a driver bot posts a chat message and the LLM responds, the turn flows through `messageRouter` → `callLLM` → `toolDispatcher` and session history updates; when `/encounter end` runs, the encounter resolves, a summary is written, `log_encounter` commits to GraphMCP, and the thread archives.
|
||||
|
||||
3. **AC3 — Skill-check tool.** Given an active encounter, when the LLM emits a `skill_check_emit` tool call, then a skill-check embed is posted to the thread and `pendingSkillCheck` is set in session state; when the roll resolves via `foundry_lookup`/`foundry_reward`, then the outcome is recorded and `pendingSkillCheck` is cleared.
|
||||
|
||||
4. **AC4 — Lore/question answering + event read-after-write.** Given real lore in the graph, when a player @mentions the bot or asks a question that triggers `context_recall`/`semantic_search`, then the answer references real lore retrieved from the graph; when `log_encounter` writes an event, then `list_encounters`/`search_encounters` return that event afterward (read-after-write consistency).
|
||||
|
||||
5. **AC5 — Long encounter (20–30 turns) with complex skill usage, varied goal outcomes, and final-output verification.** Given an active run-tagged encounter, when the suite drives 20–30 turns through the real scheduler (`scheduleEncounterLLMTurn` + history polling) with a scripted driver strategy, resolving every `skill_check_emit` via `handleRollInteraction`, then the encounter reaches a valid goal outcome (one of the spec's `goals.primary`/`secondary` ids) within the turn cap; different driver strategies reach DIFFERENT goal outcomes; and the final `encounter_resolve` output is read back from GraphMCP (`list_encounters` matched by run-id in the title → `get_encounter` returns the LLM-written summary, participants, and the resolved `outcomeId` in the title).
|
||||
|
||||
---
|
||||
|
||||
## Story Integration Metadata
|
||||
|
||||
- **Story ID:** `graphmcp.live.1`
|
||||
- **Story Key:** `graphmcp-live-integration-tests`
|
||||
- **Story File:** (user-provided goal — no BMad story file in this repo)
|
||||
- **Checklist Path:** `tests/integration/atdd-checklist-graphmcp-live-integration-tests.md`
|
||||
- **Generated Test Files:** see `generatedTestFiles` in the front matter and the "Files generated" table under "Red-Phase Test Scaffolds Created".
|
||||
|
||||
> No writable BMad story file exists in this repo (`_bmad/` is absent), so the BMM `dev-story` handoff step does not apply. This checklist is the handoff artifact.
|
||||
|
||||
---
|
||||
|
||||
## Generation Mode
|
||||
|
||||
**Mode:** AI generation (from source code + the GraphMCP client contract in `src/graphmcp/client.ts` + existing `tests/integration/phase1.test.ts` patterns).
|
||||
|
||||
**Reason:** `detected_stack = backend` — recording mode is skipped entirely for backend projects (no browser/UI). Tests are generated from API/source analysis, not browser recording.
|
||||
|
||||
---
|
||||
|
||||
## Test Strategy Decisions (confirmed with user)
|
||||
|
||||
- **Discord surface:** Real connected bot on the real gateway. Slash commands (`/encounter start`, `/encounter end`) are driven via the **hybrid** pattern — call the registered command's `execute()` with a fake `ChatInputCommandInteraction` whose `channel`/`guildId`/`user` are **real `discord.js` objects fetched from the live client** (real `TextChannel`/thread from a test guild). Thread creation, message posting, and replies flow through the real gateway to real Discord; only the command "click" is synthesized. (Bots cannot invoke each other's slash commands via the Discord API, so pure gateway-driven slash commands are not automatable.)
|
||||
- **Thread conversation turns:** A **driver bot** (separate token) posts real chat messages into the encounter thread, firing the real `messageRouter` path through the live gateway.
|
||||
- **LLM:** Always real (LiteLLM primary → Ollama fallback). Assert on **structural outcomes** (session-state fields, embed presence, GraphMCP query results), never exact narrative text. Use polling/retries for LLM-turn completion and graph read-after-write (eventual consistency).
|
||||
- **Stack:** `backend` (Node/TypeScript, `discord.js`, Vitest, `environment: 'node'`, `globals: true`). No Playwright/Cypress/Pact — all TEA utils flags default to disabled.
|
||||
- **Gating:** Skip unless `RUN_FULL_E2E=1` (stricter than the existing `RUN_INTEGRATION=1`, because this suite exercises real Discord + real LLM and is slow/non-deterministic). Follow the existing `describe.skipIf(...)` pattern from `tests/integration/phase1.test.ts`.
|
||||
|
||||
---
|
||||
|
||||
## Operational Requirements (prerequisites to run this suite)
|
||||
|
||||
- A dedicated **Discord test guild** (not a production server).
|
||||
- **Bot under test** credentials: `DISCORD_TOKEN`, `DISCORD_CLIENT_ID`, with `DISCORD_ALLOWED_CHANNELS` including the test channel and `DISCORD_ALLOWED_USERS` including the driver (or empty for channel-scoped).
|
||||
- A **second driver-bot token** for posting chat messages into threads.
|
||||
- **Redis** reachable at `REDIS_URL` (flush test keys between runs).
|
||||
- **GraphMCP** reachable at `GRAPHMCP_URL` (the real backend under test).
|
||||
- **LiteLLM** at `LITELLM_BASE_URL` and/or **Ollama** at `OLLAMA_BASE_URL` (real LLM).
|
||||
- All four up before running; `RUN_FULL_E2E=1` to activate.
|
||||
|
||||
**Cleanup discipline:** unique `encounterId` prefix per run (e.g. `e2e-<timestamp>-…`) to avoid collisions; delete test threads; flush Redis test keys; tear down / tag GraphMCP test entities so the graph stays clean across runs.
|
||||
|
||||
---
|
||||
|
||||
## Red-Phase Test Scaffolds Created
|
||||
|
||||
All scaffolds are real `it()` tests under `describe.skipIf(...)` — skipped without live infra (CI-safe), activated by env gates. Transpiled and verified to skip cleanly (see Test Execution Evidence). No `it.skip()` placeholders; each has concrete assertion intent.
|
||||
|
||||
### Files generated (step 4 — sequential mode; no BMad subagent runtime present, E2E worker N/A for backend)
|
||||
|
||||
| File | AC | Gate | Tests |
|
||||
|------|----|------|-------|
|
||||
| `tests/integration/graphmcp/contract.test.ts` | AC1 | `RUN_GRAPHMCP_LIVE=1` ∥ `RUN_FULL_E2E=1` | 7 (S1.1 skipIf no `E2E_TEST_NPC`) |
|
||||
| `tests/integration/graphmcp/encounter-lifecycle.test.ts` | AC2 | `RUN_FULL_E2E=1` | 3 (S2.1 start, S2.2 driver turn, S2.3 end) |
|
||||
| `tests/integration/graphmcp/skill-check.test.ts` | AC3 | `RUN_FULL_E2E=1` | 2 (S3.1 emit, S3.2 resolve) |
|
||||
| `tests/integration/graphmcp/lore-and-events.test.ts` | AC4 | `RUN_FULL_E2E=1` | 2 (S4.1 mention, S4.2 read-after-write) |
|
||||
| `tests/integration/graphmcp/support/env.ts` | — | — | config-env bootstrap (stubs Discord creds if absent; seeds `DISCORD_ALLOWED_CHANNELS` from `E2E_TEST_CHANNEL_ID`) |
|
||||
| `tests/integration/graphmcp/support/poll.ts` | — | — | `waitFor` / `untilStable` (eventual-consistency + LLM-turn polling) |
|
||||
| `tests/integration/graphmcp/support/factories.ts` | — | — | `runId`, `buildEncounterLog`, `titleMatchesRun` |
|
||||
| `tests/integration/graphmcp/support/fakes.ts` | — | — | `fakeInteraction` (hybrid slash-command), `fakeButton` (roll-resolve drive), `parseThreadIdFromReply` |
|
||||
| `tests/integration/graphmcp/support/liveBots.ts` | — | — | `connectLiveBots` / `disconnectLiveBots` (real bot + driver bot clients) |
|
||||
| `tests/integration/graphmcp/support/cleanup.ts` | — | — | `deleteThread`, `flushRedisForGuild`, `disconnectRedis`; GraphMCP no-delete limitation noted |
|
||||
|
||||
### Concrete vs scaffold (honest split)
|
||||
|
||||
- **AC1 (contract)** — fully concrete and runnable against **live GraphMCP alone** (no Discord, no LLM, no Redis). Asserts the live server returns contract-shaped data the client accepts without crashing. The wrong-shape *normalization* itself is unit-tested with fetch mocks in `tests/unit/graphmcpClient.test.ts` (already green); here we assert live-contract conformance. S1.7 (bogus id) asserts no unhandled exception escapes — the `/encounter generate` crash was an unhandled `TypeError`, not a clean rejection.
|
||||
- **AC2 (lifecycle)** — S2.1 (start) and S2.3 (end) are concrete via the hybrid `execute()` + real channel/thread pattern. S2.2 (driver-message turn) routes the real fetched message through `messageRouter.handleMessage`; one explicit TODO marks the choice between direct router call vs. arming the full `src/bot/index.ts` messageCreate handler.
|
||||
- **AC3 (skill-check)** — driven **deterministically** (not by waiting for the LLM to emit): `skill_check_emit` handler invoked directly, roll resolution driven via `handleRollInteraction` + a fake `ButtonInteraction` targeting the posted embed. Concretely automatable; no LLM dependency for the emit/resolve steps (resolution schedules a real LLM turn afterward).
|
||||
- **AC4 (lore)** — S4.1 uses the hybrid `handleMention(realMentionMsg, botClient)` approach; asserts a bot reply is posted (structural) with a soft/manual TODO for asserting cited lore content (LLM output is non-deterministic). S4.2 read-after-write is fully concrete (poll `list_encounters`/`search_encounters`).
|
||||
|
||||
### Gate refinement vs step 3
|
||||
|
||||
Step 3 gated everything under `RUN_FULL_E2E=1`. Step 4 splits the gate: AC1 (contract) also activates under the lighter `RUN_GRAPHMCP_LIVE=1`, since it needs only GraphMCP — a maintainer can run the contract suite without spinning up Discord/LLM/Redis. AC2–AC4 remain `RUN_FULL_E2E=1` only. This is an improvement; the "Running Tests" section below is updated accordingly.
|
||||
|
||||
---
|
||||
|
||||
## Test Strategy (AC → scenarios → levels → priorities)
|
||||
|
||||
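`detected_stack = backend` → levels are **Integration** and **Integration/Contract** (no E2E/browser, no Component). All scenarios are skipped unless `RUN_FULL_E2E=1` is set; the AC1 contract scenarios additionally run under the lighter `RUN_GRAPHMCP_LIVE=1` gate (see "Gate refinement vs step 3" above).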
|
||||
|
||||
**Priority legend:** P0 = guards a real production crash / data integrity; P1 = core live-flow correctness (needs real LLM, slow); P2 = edge/negative.
|
||||
|
||||
### AC1 — GraphMCP contract (Integration/Contract) — **P0**
|
||||
|
||||
_File:_ `tests/integration/graphmcp/contract.test.ts` (no LLM needed; fastest live tests)
|
||||
|
||||
| ID | Scenario | Level | Pri | Red expectation |
|
||||
|----|----------|-------|-----|-----------------|
|
||||
| S1.1 | `query_as_npc` returns `NPCQueryResult` (npc, tier, horizon_count, chunks[], graph_context[]) | Contract | P0 | Would have failed before client normalization; passes now |
|
||||
| S1.2 | `semantic_search` with wrong-shape response (`{chunks:null}`, no `chunks`, bare array) normalizes to `{chunks:[]}` — **regression for the `/encounter generate` crash** | Contract | P0 | Red before the `src/graphmcp/client.ts` fix; green after |
|
||||
| S1.3 | `log_encounter` returns `LogEncounterResult` (enc_id, title, participants, location, timestamp) | Contract | P0 | Structural assertion |
|
||||
| S1.4 | `list_encounters` returns `EncounterResultItem[]`; non-array response normalized to `[]` | Contract | P0 | Red before fix; green after |
|
||||
| S1.5 | `search_encounters` returns array; non-array normalized | Contract | P1 | Structural assertion |
|
||||
| S1.6 | `get_encounter` returns `EncounterDetails` shape | Contract | P1 | Structural assertion |
|
||||
| S1.7 | `get_encounter` with a non-existent id → GraphMCP returns a JSON-RPC error and `callTool` rejects with a clean `encounter not found` error (no unhandled throw escapes) | Contract | P2 | Negative path |
|
||||
|
||||
### AC2 — Real encounter lifecycle (Integration, real LLM) — **P1**
|
||||
|
||||
_File:_ `tests/integration/graphmcp/encounter-lifecycle.test.ts`
|
||||
|
||||
| ID | Scenario | Level | Pri |
|
||||
|----|----------|-------|-----|
|
||||
| S2.1 | `/encounter start` (hybrid `execute()` + real channel) creates a real thread, posts opening narrative, persists `SessionState` in Redis | Integration | P1 |
|
||||
| S2.2 | Driver bot posts a chat message → LLM turn runs → session history grows by the assistant turn (poll for completion) | Integration | P1 |
|
||||
| S2.3 | `/encounter end` resolves, writes summary file, `log_encounter` commits to GraphMCP (read-after-write via `list_encounters`), thread archives | Integration | P1 |
|
||||
|
||||
### AC3 — Skill-check tool (Integration, real LLM) — **P1**
|
||||
|
||||
_File:_ `tests/integration/graphmcp/skill-check.test.ts`
|
||||
|
||||
| ID | Scenario | Level | Pri |
|
||||
|----|----------|-------|-----|
|
||||
| S3.1 | LLM-emitted `skill_check_emit` posts the skill-check embed + sets `pendingSkillCheck` in session (poll for embed/state) | Integration | P1 |
|
||||
| S3.2 | Roll resolves the check via `foundry_lookup`/`foundry_reward` → `pendingSkillCheck` cleared, outcome recorded | Integration | P1 |
|
||||
|
||||
### AC4 — Lore/question answering + event read-after-write (Integration, real LLM) — **P1**
|
||||
|
||||
_File:_ `tests/integration/graphmcp/lore-and-events.test.ts`
|
||||
|
||||
| ID | Scenario | Level | Pri |
|
||||
|----|----------|-------|-----|
|
||||
| S4.1 | @mention / question triggers `context_recall`/`semantic_search`; an answer embed is produced referencing real graph lore (structural assert) | Integration | P1 |
|
||||
| S4.2 | `log_encounter` write is readable by `list_encounters`/`search_encounters` afterward (poll for read-after-write consistency) | Integration | P1 |
|
||||
|
||||
### Planned support files (step 4)
|
||||
|
||||
- `tests/integration/graphmcp/support/liveBot.ts` — real connected `Client` fixture + teardown.
|
||||
- `tests/integration/graphmcp/support/driverBot.ts` — second bot that posts chat messages into threads.
|
||||
- `tests/integration/graphmcp/support/fakes.ts` — `fakeInteraction` (backed by real channel/user objects), `fakeMessage` factories.
|
||||
- `tests/integration/graphmcp/support/factories.ts` — `createE2ESpec` (unique `encounterId` per run), `createSessionOverrides`.
|
||||
- `tests/integration/graphmcp/support/cleanup.ts` — Redis test-key flush, thread delete, GraphMCP test-entity teardown.
|
||||
- `tests/integration/graphmcp/support/poll.ts` — retry/poll helpers (LLM turn completion, graph read-after-write).
|
||||
|
||||
### Red-phase note (adapted)
|
||||
|
||||
Classic ATDD targets new features (red before implementation). This story's "implementation" is the test suite + support code against **existing** production behavior. Adaptation: scaffolds are real `it()` tests under `describe.skipIf(...)` gated on `RUN_FULL_E2E=1` (the AC1 contract suite also accepts `RUN_GRAPHMCP_LIVE=1`) — skipped without infra (CI-safe). When activated against live infra, passing = behavior holds; failing = a real regression. The **AC1** scaffolds are genuinely red→green: S1.2/S1.4 would have failed before the `src/graphmcp/client.ts` normalization fix and pass after it. AC2–AC4 require live Discord+LLM and are scaffolded with concrete assertion intent + polling, to be confirmed against a running stack.
|
||||
|
||||
---
|
||||
|
||||
## Data Factories Created
|
||||
|
||||
`tests/integration/graphmcp/support/factories.ts`:
|
||||
|
||||
- `runId()` → `e2e-<timestamp>-<pid>` — unique per run, used to tag every entity so runs never collide with each other or with real data.
|
||||
- `buildEncounterLog(run, overrides)` → `LogEncounterParams` with a `[E2E] <run> —` title prefix (what `list_encounters`/`search_encounters` filter on for read-after-write + cleanup identification).
|
||||
- `titleMatchesRun(run)` → predicate matching a title against this run's tag.
|
||||
|
||||
`tests/integration/graphmcp/support/fakes.ts`:
|
||||
|
||||
- `fakeInteraction(opts)` → `{ interaction, replies, edits, lastText }` — fake `ChatInputCommandInteraction` backed by a **real** `TextChannel`/`ThreadChannel`; captures `reply`/`editReply`, implements exactly the subset `encounter.execute()` reads (`guildId`, `channelId`, `channel`, `user`, `options.getSubcommand`/`getString`, `deferReply`/`editReply`/`reply`).
|
||||
- `fakeButton(channel, customId)` → fake `ButtonInteraction` for driving `handleRollInteraction` (roll-resolution path) — `channel` is the real thread, `update` captured.
|
||||
- `parseThreadIdFromReply(text)` → extracts `<#id>` from the `/encounter start` editReply.
|
||||
|
||||
No `fakeMessage` factory was needed: conversation turns (S2.2, S4.1) fetch **real** `Message` objects posted by the driver bot rather than synthesizing them, per the hybrid pattern.
|
||||
|
||||
---
|
||||
|
||||
## Fixtures Created
|
||||
|
||||
- **Live bots** (`support/liveBots.ts`): `connectLiveBots()` logs in a real `Client` for the bot under test (`DISCORD_TOKEN`) and a second driver bot (`E2E_DRIVER_TOKEN`), resolves the real `Guild` + `TextChannel` (`E2E_TEST_GUILD_ID` / `E2E_TEST_CHANNEL_ID`); `disconnectLiveBots()` tears both down. Used by AC2/AC3/AC4 `beforeAll`/`afterAll`.
|
||||
- **Redis** (`support/cleanup.ts`): `flushRedisForGuild(guildId)` deletes only this guild's `session:*` and `players:<guild>` keys (never `FLUSHDB`); `disconnectRedis()` closes the shared singleton so the process exits.
|
||||
- **Thread cleanup** (`support/cleanup.ts`): `deleteThread(channel, threadId)` best-effort deletes the run's encounter thread (ignores already-deleted).
|
||||
- **Poll helpers** (`support/poll.ts`): `waitFor`/`untilStable` with configurable timeouts — the fixture for eventual-consistency reads and LLM-turn completion.
|
||||
- **Env bootstrap** (`support/env.ts`): imported first by every test so `EnvSchema.parse` doesn't crash without real Discord creds; seeds `DISCORD_ALLOWED_CHANNELS` from `E2E_TEST_CHANNEL_ID`.
|
||||
|
||||
No Vitest `test.extend` fixtures used — the project's integration pattern (per `tests/integration/phase1.test.ts`) is plain `describe.skipIf` + `beforeAll`/`afterAll` with dynamic/real imports, which these scaffolds follow for consistency.
|
||||
|
||||
---
|
||||
|
||||
## Mock Requirements
|
||||
|
||||
**None for the "real" path.** This suite deliberately exercises real services (Discord gateway, LLM, GraphMCP, Redis). No HTTP mocks. (If a future opt-in "fast" variant stubs the LLM, that will be documented here.)
|
||||
|
||||
---
|
||||
|
||||
## Required data-testid Attributes
|
||||
|
||||
**N/A** — backend integration suite; no DOM/UI. (Section retained from template for structural parity only.)
|
||||
|
||||
---
|
||||
|
||||
## Implementation Checklist
|
||||
|
||||
Each scaffolded test → the concrete activation task(s) that make it pass against live infra. "Skip-clean" (transpiles + skips when gated off) is **done** for all; "live-pass" requires the listed infra.
|
||||
|
||||
- **AC1 / contract.test.ts** —
|
||||
- S1.1: set `E2E_TEST_NPC` to a real NPC name in the graph. *(infra: GraphMCP)*
|
||||
- S1.2–S1.6: GraphMCP up at `GRAPHMCP_URL`; no other infra. *(infra: GraphMCP)*
|
||||
- S1.7: GraphMCP up; bogus-id behavior is whatever the live server returns (assertion is only "no unhandled throw escapes"). *(infra: GraphMCP)*
|
||||
- Activation: `RUN_GRAPHMCP_LIVE=1 npx vitest run tests/integration/graphmcp/contract.test.ts`
|
||||
- **AC2 / encounter-lifecycle.test.ts** —
|
||||
- S2.1: set `DISCORD_TOKEN`, `E2E_DRIVER_TOKEN`, `E2E_TEST_GUILD_ID`, `E2E_TEST_CHANNEL_ID`, `E2E_SPEC` (default `market-thief`); Redis + GraphMCP + LLM up. *(infra: all four)*
|
||||
- S2.2: **TODO to finalize** — confirm direct `handleMessage(realMsg, botClient)` is sufficient vs. arming the full `src/bot/index.ts` `messageCreate` handler; the under-test bot's messageCreate path must route the driver's thread message into `messageRouter`. *(infra: all four)*
|
||||
- S2.3: same env as S2.1; `log_encounter` from `/encounter end` must be readable via `list_encounters` (poll for read-after-write). *(infra: all four)*
|
||||
- **AC3 / skill-check.test.ts** —
|
||||
- Side-effect import `src/harness/tools/index.js` added so `getPlugin('skill_check_emit')` resolves without going through `toolDispatcher`.
|
||||
- S3.1: invoke the plugin handler directly with a real thread + session; assert `pendingSkillCheck` persisted + embed message exists. *(infra: Discord + Redis; GraphMCP for the encounter start that creates the session)*
|
||||
- S3.2: `fakeButton(thread, 'sc_roll')` → `handleRollInteraction`; assert `pendingSkillCheck` cleared + `[SKILL CHECK RESULT]` system message in history. *(infra: Discord + Redis; resolution schedules a real LLM turn afterward)*
|
||||
- **AC4 / lore-and-events.test.ts** —
|
||||
- S4.1: `persona.yaml` present (`PERSONA_PATH`), Redis up (ingest stream via `publishToGraphMCP`), GraphMCP + LLM up. Driver bot @mentions the under-test bot in the test channel; reply is fetched via the under-test client. **Soft TODO**: asserting the reply cites specific lore stays manual (LLM non-determinism). *(infra: all four)*
|
||||
- S4.2: GraphMCP only; poll `list_encounters` + `search_encounters` for the just-logged `[E2E]` event. *(infra: GraphMCP)*
|
||||
- **Cleanup** — `deleteThread` + `flushRedisForGuild` + `disconnectRedis` wired in `afterAll` of AC2/AC3/AC4. GraphMCP test encounters are `[E2E]`-prefixed and **not** deleted (no delete tool in `src/graphmcp/client.ts`); see `support/cleanup.ts` `GRAPHMCP_CLEANUP_LIMITATION`. A future `delete_encounter` tool would close this.
|
||||
|
||||
### Verification done in step 5
|
||||
|
||||
- ✅ `npx vitest run tests/integration` with no env → **6 files / 17 tests skipped**, exit 0 (CI-safe). Scaffolds transpile cleanly (esbuild would fail on syntax errors).
|
||||
- ✅ `npx vitest run tests/unit` → **33 files / 404 tests pass** — including the `graphmcpClient.test.ts` wrong-shape normalization regressions (S1.2/S1.4 unit-side guard for the `/encounter generate` crash) and `historyTrim.test.ts` FIFO test.
|
||||
- ⬜ Live-pass against real infra — **not run in step 5**: the maintainer must provision the test guild, two bot tokens, Redis, GraphMCP, and LLM, then run `RUN_FULL_E2E=1` (or `RUN_GRAPHMCP_LIVE=1` for AC1 alone). Those services could not be provisioned from the step-5 session; the later live runs of AC1, AC2, and AC5 are recorded under "Live-pass evidence" in the Test Execution Evidence section.
|
||||
|
||||
---
|
||||
|
||||
## Running Tests
|
||||
|
||||
```bash
|
||||
# AC1 only — needs just a reachable GraphMCP (fastest live checks)
|
||||
RUN_GRAPHMCP_LIVE=1 npx vitest run tests/integration/graphmcp/contract.test.ts
|
||||
|
||||
# Full live suite (all four infra surfaces must be up)
|
||||
RUN_FULL_E2E=1 npm run test:int
|
||||
|
||||
# A single file
|
||||
RUN_FULL_E2E=1 npx vitest run tests/integration/graphmcp/encounter-lifecycle.test.ts
|
||||
|
||||
# CI default (the live suites stay skipped — no live infra in CI)
|
||||
npm run test:unit
|
||||
```
|
||||
|
||||
> These tests are **not** part of the CI default (`npm run test:unit`). They are opt-in, run manually or from a dedicated burn-in job, per `ci-burn-in.md`. With no env gate set, `npm run test:int` skips all 15 graphmcp tests (and the 2 existing `phase1` tests) and exits 0 — verified in step 5.
|
||||
|
||||
---
|
||||
|
||||
## Red-Green-Refactor Workflow
|
||||
|
||||
_(Standard ATDD cycle — see template. RED phase scaffolds are produced in step 4; GREEN/REFACTOR are dev-team next steps.)_
|
||||
|
||||
---
|
||||
|
||||
## Knowledge Base References Applied
|
||||
|
||||
This ATDD workflow consulted the following knowledge fragments (backend profile, TEA utils disabled):
|
||||
|
||||
- **data-factories.md** — factory functions with overrides, API/DB seeding, cleanup discipline (applied: unique `encounterId`, session/interaction/message factories).
|
||||
- **component-tdd.md** — red→green→refactor loop, provider isolation.
|
||||
- **test-quality.md** — determinism, isolation, one-assertion-per-test DoD, execution limits (applied: assert structural outcomes, not LLM narrative text; generous timeouts for real LLM).
|
||||
- **test-healing-patterns.md** — common failure patterns and automated fixes (applied: polling for read-after-write, retries for LLM turn completion).
|
||||
- **test-levels-framework.md** — choosing integration vs e2e coverage (applied: this is a live-infra integration suite, distinct from unit tests).
|
||||
- **test-priorities-matrix.md** — P0–P3 coverage targets (applied: GraphMCP contract = P0 since it recently crashed production; lifecycle/skill-check/lore = P1).
|
||||
- **ci-burn-in.md** — staged jobs, skip-unless-env gating, flakiness handling (applied: `RUN_FULL_E2E=1` gate, not in CI default).
|
||||
|
||||
Frontend-only fragments (`fixture-architecture.md`, `network-first.md`, `selector-resilience.md`, `timing-debugging.md`, Playwright Utils) were **not** loaded — `detected_stack = backend`.
|
||||
|
||||
See `resources/tea-index.csv` for the complete fragment mapping.
|
||||
|
||||
---
|
||||
|
||||
## Test Execution Evidence
|
||||
|
||||
Step 5 — scaffold validation (no live infra; gates off):
|
||||
|
||||
```
|
||||
$ npx vitest run tests/integration
|
||||
RUN v3.2.6
|
||||
↓ tests/integration/phase1.test.ts (2 tests | 2 skipped)
|
||||
↓ tests/integration/graphmcp/contract.test.ts (7 tests | 7 skipped)
|
||||
↓ tests/integration/graphmcp/lore-and-events.test.ts (2 tests | 2 skipped)
|
||||
↓ tests/integration/graphmcp/encounter-lifecycle.test.ts (3 tests | 3 skipped)
|
||||
↓ tests/integration/graphmcp/skill-check.test.ts (2 tests | 2 skipped)
|
||||
↓ tests/integration/graphmcp/long-encounter.test.ts (1 test | 1 skipped)
|
||||
Test Files 6 skipped (6)
|
||||
Tests 17 skipped (17)
|
||||
Duration ~600ms
|
||||
```
|
||||
→ exit 0. All scaffolds transpile and skip cleanly (CI-safe; no live infra required to import).
|
||||
|
||||
Unit suite (regression guards for the `/encounter generate` crash live here, not in the live suite):
|
||||
|
||||
```
|
||||
$ npx vitest run tests/unit
|
||||
Test Files 33 passed (33)
|
||||
Tests 404 passed (404)
|
||||
Duration 3.3s
|
||||
```
|
||||
→ `tests/unit/graphmcpClient.test.ts` (semanticSearch / listEncounters / queryAsNPC wrong-shape normalization), `tests/unit/historyTrim.test.ts` (FIFO trim), `tests/unit/specsToolsConsistency.test.ts` (spec tool refs vs registered plugins) all green.
|
||||
|
||||
### Live-pass evidence (real Discord + LiteLLM/Ollama + Redis + GraphMCP)
|
||||
|
||||
Provisioned infra: test guild + `DISCORD_TOKEN` (bot under test) + `E2E_DRIVER_TOKEN` + `E2E_TEST_GUILD_ID` + `E2E_TEST_CHANNEL_ID`, with host overrides `GRAPHMCP_URL=http://localhost:9000 REDIS_URL=redis://localhost:6379` (dotenv does not clobber command-line env, so these win over `.env`'s Docker-internal hostnames). Gate: `RUN_FULL_E2E=1`.
|
||||
|
||||
**AC1 — GraphMCP contract (7 tests):** all PASS live. Surfaced and fixed 2 latent `src/graphmcp/client.ts` bugs during live validation — `semanticSearch` mapped the wrong field (live returns `text`, code read `content` → would crash `encounter.ts:510` and silently break mention handling), and `queryAsNPC` returned null arrays unnormalized. Fixed with `toSemanticChunk` + array coercion; locked by new unit regression tests.
|
||||
|
||||
**AC2 — encounter lifecycle (3 tests):** all PASS live (18.96s). S2.1 start → real thread + persisted `SessionState`; S2.2 driver turn → LLM reply, history grows; S2.3 end → resolved + `log_encounter` read-after-write (`list_encounters` matched by run-id in summary → `get_encounter` returns full `EncounterDetails` with participants).
|
||||
|
||||
**AC5 — long encounter (1 test × 5 strategies, run one-per-invocation via `E2E_STRATEGY`):** all PASS live. Each writes a run-tagged spec (market-thief derived, unique `encounterId`/`title`), drives turns via the real scheduler with skill checks resolved through `handleRollInteraction`, and reads the `encounter_resolve` log back from GraphMCP.
|
||||
|
||||
| strategy | outcome | driver turns | skill checks | skills exercised | GraphMCP summary |
|
||||
|---|---|---|---|---|---|
|
||||
| catch | `catch` | ~4 | 2 | Athletics | verified |
|
||||
| negotiate | `negotiate` | ~12 | 5 | (multi) | verified |
|
||||
| flee | `escape` | ~2 | 0 | — | verified |
|
||||
| long_explore | `negotiate` | ~21 | 8 | Perception×4, Athletics×2, Persuasion×2 | verified |
|
||||
| bystander | `catch` | ~9 | 3 | Persuasion | verified |
|
||||
|
||||
→ **3 distinct goal outcomes** (`catch`, `negotiate`, `escape`) confirmed across the strategies; **long_explore delivers the 20–30 turn target (~21 driver turns) with complex skill usage (8 checks across 3 skills)**; every run verifies the final output in GraphMCP via `list_encounters` + `get_encounter` (title records the `outcomeId`, summary/participants/type confirmed). The `bystander` strategy exercised the Persuasion path but the LLM classified the juggler's tackle as `catch` rather than `bystander_chase` (a fuzzy outcome-boundary judgment — `catch` is still a valid spec goal, so the test passes; the test asserts outcome validity, not a specific outcome per strategy).
|
||||
|
||||
**Bugs surfaced + fixed during live AC5 validation:**
|
||||
- `src/bot/handlers/messageRouter.ts` `runLLMTurn` — a turn could die **silently** (no history growth, no error) when the LLM reply had no parseable narrative/tool, hit the filtered-already-retried path, or threw inside the post-LLM block (the scheduler's `try/finally` has no `catch`). The narrator would go quiet and the generation never completed. Fixed: wrapped post-LLM logic in `try/catch` (logs `[messageRouter] turn processing failed:`), track an `appended` flag, and **always grow history by ≥1** with a `[NO RESPONSE]` fallback beat; hardened the filter guards against non-string `content`. 404 unit tests still pass.
|
||||
- `tests/integration/graphmcp/support/cleanup.ts` `flushRedisForGuild` — used pattern `session:*${guildId}*` but session keys are `session:<threadId>` (a Discord snowflake, no guild id), so it matched nothing and stale sessions accumulated across runs. Fixed: scan `session:*`, delete only `e2e-`-prefixed (run-tagged) ones; added `deleteSession(threadId)` for per-run `afterAll` cleanup.
|
||||
- `long-encounter.test.ts` polling baseline — measured `history.length` before `addMessage`, so the user message itself satisfied the `> prevLen` poll and the loop spun 30× instantly without waiting for LLM turns. Fixed: baseline measured after the user message / after `handleRollInteraction` returns.
|
||||
|
||||
**AC3 + AC4:** scaffolds transpile + skip cleanly; live execution pending a dedicated run window (AC1/AC2/AC5 already exercise the skill-check tool and GraphMCP read-after-write paths end-to-end).
|
||||
|
||||
---
|
||||
|
||||
## Notes
|
||||
|
||||
- This repo has **no BMad config** (`_bmad/` absent) — no `tea/config.yaml`, no `custom/` overrides, no `project-context.md`. The skill ran on all defaults; `user_name`/`communication_language` defaulted (English). Agent-identity/persona bits from BMad are absent.
|
||||
- The GraphMCP contract suite (AC1) is the highest-value coverage: it directly guards the `semanticSearch`/`listEncounters` wrong-shape crash recently fixed in `src/graphmcp/client.ts` (the `/encounter generate` `TypeError: Cannot read properties of undefined (reading 'length')`).
|
||||
- Real-LLM tests are inherently slow (seconds per turn) and non-deterministic; budget generous per-test timeouts (60–120s) and prefer structural assertions + polling over exact-text asserts.
|
||||
- The hybrid slash-command pattern depends on `command.execute(interaction, client)` (`src/bot/index.ts:151`) and real channel objects from the connected client — no Discord API for bot-to-bot slash commands exists.
|
||||
|
||||
---
|
||||
|
||||
**Generated by BMad TEA Agent** — 2026-06-19
|
||||
147
tests/integration/graphmcp/contract.test.ts
Normal file
147
tests/integration/graphmcp/contract.test.ts
Normal file
@@ -0,0 +1,147 @@
|
||||
// AC1 — GraphMCP JSON-RPC contract (live).
|
||||
//
|
||||
// These tests need ONLY a reachable GraphMCP backend (GRAPHMCP_URL). No Discord
|
||||
// gateway, no LLM, no Redis. They are the fastest live tests and directly guard
|
||||
// the wrong-shape-response crash class recently fixed in src/graphmcp/client.ts
|
||||
// (the /encounter generate "Cannot read properties of undefined (reading
|
||||
// 'length')" TypeError).
|
||||
//
|
||||
// Scope split (important):
|
||||
// - The wrong-shape NORMALIZATION (null chunks, non-array lists, bare arrays)
|
||||
// is unit-tested with fetch mocks in tests/unit/graphmcpClient.test.ts.
|
||||
// - HERE we assert the LIVE server returns contract-shaped data that the
|
||||
// client accepts without crashing — i.e. the client's typed contracts hold
|
||||
// against the real backend's actual responses.
|
||||
//
|
||||
// Gate: RUN_GRAPHMCP_LIVE=1 (lighter than full E2E) OR RUN_FULL_E2E=1.
|
||||
// Skipped by default → CI-safe.
|
||||
|
||||
import './support/env.js';
|
||||
import { describe, it, expect, beforeAll, afterAll } from 'vitest';
|
||||
import {
|
||||
queryAsNPC,
|
||||
semanticSearch,
|
||||
logEncounter,
|
||||
listEncounters,
|
||||
searchEncounters,
|
||||
getEncounter,
|
||||
} from '../../../src/graphmcp/client.js';
|
||||
import type {
|
||||
NPCQueryResult,
|
||||
LogEncounterResult,
|
||||
EncounterResultItem,
|
||||
EncounterDetails,
|
||||
} from '../../../src/graphmcp/client.js';
|
||||
import { runId, buildEncounterLog, titleMatchesRun } from './support/factories.js';
|
||||
import { waitFor } from './support/poll.js';
|
||||
|
||||
// AC1 runs under either gate: the lightweight GraphMCP-only gate or the full E2E gate.
const runLive = process.env.RUN_GRAPHMCP_LIVE === '1' || process.env.RUN_FULL_E2E === '1';
// Optional NPC name; when unset, S1.1 is skipped and the rest of AC1 still runs.
const testNpc = process.env.E2E_TEST_NPC ?? '';

describe.skipIf(!runLive)('AC1 — GraphMCP JSON-RPC contract (live)', () => {
  // Poll budget for read-after-write checks: the write may not be visible to
  // list/search immediately, so both S1.4 and S1.5 retry within this window.
  const READ_AFTER_WRITE_POLL = { timeoutMs: 30_000, intervalMs: 2_000 };
  // Vitest's default per-test timeout is shorter than the poll budget above, so
  // every polling test passes an explicit timeout that comfortably exceeds it.
  const POLLING_TEST_TIMEOUT_MS = 60_000;

  const run = runId();
  const log = buildEncounterLog(run);
  let loggedEncId: string | undefined;
  let loggedResult: LogEncounterResult | undefined;

  beforeAll(async () => {
    // S1.3 side effect — write a uniquely-tagged encounter once, then read it
    // back across S1.4–S1.6. The shape assertion on the write lives in its own
    // test below; we store the result here so the read-after-write tests share
    // the exact id the server assigned.
    loggedResult = await logEncounter(log);
    loggedEncId = loggedResult?.enc_id;
  });

  afterAll(() => {
    // GraphMCP has no delete tool (see support/cleanup.ts). Test encounters are
    // [E2E]-prefixed and left in place — distinguishable from real data.
  });

  // S1.1 — query_as_npc returns NPCQueryResult shape -------------------------
  it.skipIf(!testNpc)('S1.1 query_as_npc returns an NPCQueryResult-shaped payload', async () => {
    const result: NPCQueryResult = await queryAsNPC(
      testNpc,
      'What do you know about recent events in Mardonar?',
      5,
    );
    expect(result).toBeTruthy();
    expect(typeof result.npc).toBe('string');
    expect(typeof result.tier).toBe('string');
    expect(typeof result.horizon_count).toBe('number');
    expect(Array.isArray(result.chunks)).toBe(true);
    expect(Array.isArray(result.graph_context)).toBe(true);
  });

  // S1.2 — semantic_search returns { chunks: [] } and never crashes ----------
  // (Wrong-shape normalization itself is unit-tested; here we assert the live
  // server's real response is accepted and shaped as { chunks: SemanticChunk[] }.)
  it('S1.2 semantic_search returns { chunks: SemanticChunk[] } (no crash)', async () => {
    const result = await semanticSearch('Mardonar factions and dangers', 6);
    expect(result).toBeTruthy();
    expect(Array.isArray(result.chunks)).toBe(true);
    // Every chunk that comes back honors the declared SemanticChunk contract.
    for (const c of result.chunks) {
      expect(typeof c.content).toBe('string');
      expect(typeof c.score).toBe('number');
    }
  });

  // S1.3 — log_encounter returns LogEncounterResult shape --------------------
  it('S1.3 log_encounter returns a LogEncounterResult-shaped payload', async () => {
    expect(loggedResult).toBeTruthy();
    expect(typeof loggedResult!.enc_id).toBe('string');
    expect(loggedResult!.enc_id.length).toBeGreaterThan(0);
    expect(loggedResult!.title).toBe(log.title);
    expect(typeof loggedResult!.participants).toBe('string');
    expect(typeof loggedResult!.location).toBe('string');
    expect(typeof loggedResult!.timestamp).toBe('string');
  });

  // S1.4 — list_encounters returns an EncounterResultItem[] (array) ----------
  it('S1.4 list_encounters returns an array (normalized, never a non-array)', async () => {
    // Guard first: with an undefined id, `e.id === loggedEncId` could match an
    // item that simply lacks an id and make this test pass for the wrong reason.
    expect(loggedEncId, 'log_encounter must have produced an id first').toBeTruthy();

    const result: EncounterResultItem[] = await listEncounters(50);
    expect(Array.isArray(result)).toBe(true);

    // The encounter we just wrote should be discoverable in the list. Use the
    // first response when it already contains the id; otherwise poll, because
    // read-after-write is eventually consistent (same reasoning as S1.5).
    const found =
      result.find(e => e.id === loggedEncId) ??
      (await waitFor(
        async () => (await listEncounters(50)).find(e => e.id === loggedEncId) ?? null,
        READ_AFTER_WRITE_POLL,
      ).catch(() => null));
    expect(found, 'logged encounter must appear in list_encounters').toBeTruthy();
  }, POLLING_TEST_TIMEOUT_MS);

  // S1.5 — search_encounters returns an array and can find the logged event --
  it('S1.5 search_encounters returns an array and locates this run\'s event', async () => {
    const belongsToRun = titleMatchesRun(run);

    const result = await searchEncounters({ query: run, limit: 50 });
    expect(Array.isArray(result)).toBe(true);

    // read-after-write is eventually consistent — poll briefly before giving up,
    // but only when the first search did not already surface the event. A poll
    // timeout is mapped to null so the assertion below reports the failure.
    const found =
      result.find(e => belongsToRun(e.title)) ??
      (await waitFor(
        async () => {
          const r = await searchEncounters({ query: run, limit: 50 });
          return r.find(e => belongsToRun(e.title)) ?? null;
        },
        READ_AFTER_WRITE_POLL,
      ).catch(() => null));
    expect(found, 'search_encounters must surface the just-logged event').toBeTruthy();
  }, POLLING_TEST_TIMEOUT_MS);

  // S1.6 — get_encounter returns EncounterDetails shape ----------------------
  it('S1.6 get_encounter returns an EncounterDetails-shaped payload for the logged id', async () => {
    expect(loggedEncId, 'log_encounter must have produced an id first').toBeTruthy();
    const details = await getEncounter(loggedEncId!) as EncounterDetails;
    expect(details).toBeTruthy();
    expect(details.id).toBe(loggedEncId);
    expect(typeof details.title).toBe('string');
    expect(Array.isArray(details.participants)).toBe(true);
    expect(Array.isArray(details.featured_entities)).toBe(true);
  });

  // S1.7 — negative path: a non-existent id rejects cleanly (not an unhandled crash)
  it('S1.7 get_encounter with a bogus id rejects with a clean GraphMCP error', async () => {
    // The /encounter generate crash was an unhandled TypeError. The correct
    // contract for a missing entity is a clean, typed rejection: the server
    // returns a JSON-RPC error envelope and callTool converts it to a thrown
    // Error. Assert it rejects (not resolves) and names the problem.
    await expect(getEncounter('e2e-bogus-does-not-exist-9999')).rejects.toThrow(
      /encounter not found/,
    );
  });
});
|
||||
168
tests/integration/graphmcp/encounter-lifecycle.test.ts
Normal file
168
tests/integration/graphmcp/encounter-lifecycle.test.ts
Normal file
@@ -0,0 +1,168 @@
|
||||
// AC2 — Real encounter lifecycle via slash commands (live Discord + LLM + Redis + GraphMCP).
|
||||
//
|
||||
// Hybrid slash-command pattern: the bot under test is connected to the real
|
||||
// gateway; /encounter start and /encounter end are driven by calling the
|
||||
// registered command's execute() with a FAKE interaction backed by REAL
|
||||
// channel/thread objects from the live client. Conversation turns (S2.2) are
|
||||
// driven by a second driver bot posting real messages, then routed through the
|
||||
// real messageRouter. Assert on STRUCTURAL outcomes (session state, thread
|
||||
// existence, GraphMCP read-after-write) — never exact narrative text.
|
||||
//
|
||||
// Gate: RUN_FULL_E2E=1. Requires: DISCORD_TOKEN, E2E_DRIVER_TOKEN,
|
||||
// E2E_TEST_GUILD_ID, E2E_TEST_CHANNEL_ID, plus Redis + GraphMCP + LLM up.
|
||||
// Skipped by default → CI-safe.
|
||||
|
||||
import './support/env.js';
|
||||
import { describe, it, expect, beforeAll, afterAll } from 'vitest';
|
||||
import { execute } from '../../../src/bot/commands/encounter.js';
|
||||
import { sessionManager } from '../../../src/session/sessionManager.js';
|
||||
import { runLLMTurn } from '../../../src/bot/handlers/messageRouter.js';
|
||||
import { listEncounters, getEncounter } from '../../../src/graphmcp/client.js';
|
||||
import { runId } from './support/factories.js';
|
||||
import { connectLiveBots, disconnectLiveBots, type LiveBots } from './support/liveBots.js';
|
||||
import { fakeInteraction, parseThreadIdFromReply } from './support/fakes.js';
|
||||
import {
|
||||
flushRedisForGuild,
|
||||
disconnectRedis,
|
||||
deleteThread,
|
||||
} from './support/cleanup.js';
|
||||
import { waitFor } from './support/poll.js';
|
||||
import type { ThreadChannel } from 'discord.js';
|
||||
|
||||
// Opt-in gate: the suite only runs against the full live stack.
const runE2E = process.env.RUN_FULL_E2E === '1';
// Spec slug passed to `/encounter start`; overridable via E2E_SPEC.
const specName = process.env.E2E_SPEC ?? 'market-thief';

// AC2 — start → driver turn → end, against real Discord + LLM + Redis + GraphMCP.
// The three scenarios share one thread and MUST run in order: S2.2 and S2.3
// depend on the thread id captured by S2.1.
describe.skipIf(!runE2E)('AC2 — Real encounter lifecycle (live)', () => {
  let bots: LiveBots;
  const run = runId();
  let threadId: string | null = null;
  let thread: ThreadChannel | null = null;

  beforeAll(async () => {
    bots = await connectLiveBots();
    await flushRedisForGuild(bots.guild.id);
  }, 120_000);

  afterAll(async () => {
    // `bots` stays unassigned when connectLiveBots() rejected in beforeAll;
    // guard every use so teardown does not bury the original failure.
    try {
      if (bots && threadId) await deleteThread(bots.channel, threadId);
    } finally {
      // Nested finally: a failing Redis disconnect must not leave the Discord
      // clients logged in.
      try {
        await disconnectRedis();
      } finally {
        if (bots) await disconnectLiveBots(bots);
      }
    }
  }, 120_000);

  // S2.1 — /encounter start --------------------------------------------------
  it('S2.1 start creates a real thread, posts the opening, and persists SessionState', async () => {
    const { interaction, lastText } = fakeInteraction({
      subcommand: 'start',
      stringOptions: { spec: specName },
      channel: bots.channel,
      guildId: bots.guild.id,
      userId: process.env.E2E_DRIVER_USER_ID ?? 'e2e-driver-user',
      username: 'E2E Driver',
    });

    await execute(interaction);

    threadId = parseThreadIdFromReply(lastText());
    expect(threadId, 'start must reply with the created thread reference').toBeTruthy();

    // Poll until the session is readable from the session store.
    const session = await waitFor(
      async () => (await sessionManager.get(threadId!)) ?? null,
      { timeoutMs: 30_000, intervalMs: 1_000 },
    );
    expect(session, 'SessionState must be persisted in Redis').toBeTruthy();
    expect(session!.phase).toBe('open');
    expect(session!.spec.encounterId).toBeTruthy();
    // Opening narrative is the first history message (role: assistant, pinned).
    expect(session!.history.length).toBeGreaterThanOrEqual(1);
    expect(session!.history[0].role).toBe('assistant');
    expect(session!.history[0].content.length).toBeGreaterThan(0);

    thread = await bots.channel.threads.fetch(threadId!);
    expect(thread, 'thread must exist on the real gateway').toBeTruthy();
  }, 120_000);

  // S2.2 — driver turn → LLM turn runs → history grows ---------------------
  it('S2.2 a driver turn routes through runLLMTurn and grows session history', async () => {
    expect(threadId, 'depends on S2.1').toBeTruthy();
    thread = thread ?? (await bots.channel.threads.fetch(threadId!));

    // The bot ignores bot-authored messages (anti-loop guard, messageRouter.ts:33),
    // so a driver BOT can't drive a turn via handleMessage. Drive deterministically:
    // append a user turn to history, then call the exported runLLMTurn — the same
    // callLLM → toolDispatcher → session-update path, against real LLM + GraphMCP.
    // runLLMTurn posts the narrative to the thread (visible in Discord) and appends
    // the assistant turn (or a tool-call / filter-correction system message) to
    // history, so history reliably grows by ≥1 even on an empty LLM response.
    await sessionManager.addMessage(threadId!, {
      role: 'user',
      content: 'E2E Driver: I step forward and greet the figures before me, hand open.',
      timestamp: Date.now(),
    });
    const sessionForTurn = await sessionManager.get(threadId!);
    // Fail with a readable message instead of a TypeError on `.history` below.
    expect(sessionForTurn, 'session must still exist before driving a turn').toBeTruthy();
    const beforeLen = sessionForTurn!.history.length;

    await runLLMTurn(sessionForTurn!, thread!, bots.botClient);

    const grown = await waitFor(
      async () => {
        const s = await sessionManager.get(threadId!);
        return s && s.history.length > beforeLen ? s : null;
      },
      { timeoutMs: 120_000, intervalMs: 3_000 },
    );
    expect(grown!.history.length, 'an assistant/tool turn must be appended').toBeGreaterThan(
      beforeLen,
    );
  }, 150_000);

  // S2.3 — /encounter end ----------------------------------------------------
  it('S2.3 end resolves the session, logs to GraphMCP, and archives the thread', async () => {
    expect(threadId, 'depends on S2.1').toBeTruthy();
    // The end command reads interaction.channel as the encounter thread.
    thread = thread ?? (await bots.channel.threads.fetch(threadId!));
    const { interaction } = fakeInteraction({
      subcommand: 'end',
      stringOptions: { notes: `E2E run ${run} concluded by automated suite.` },
      channel: thread!,
      guildId: bots.guild.id,
      userId: process.env.E2E_DRIVER_USER_ID ?? 'e2e-driver-user',
      username: 'E2E Driver',
    });

    await execute(interaction);

    const session = await waitFor(
      async () => {
        const s = await sessionManager.get(threadId!);
        return s && s.phase === 'resolved' ? s : null;
      },
      { timeoutMs: 60_000, intervalMs: 2_000 },
    );
    expect(session!.phase).toBe('resolved');
    expect(session!.outcomeSummary, 'LLM summary must be recorded').toBeTruthy();

    // Read-after-write: handleEnd logs with title `${spec.title} — admin end`
    // and summary = the DM notes (which we tagged with this run's unique id).
    // So locate the event by the run id in its SUMMARY — the title is not
    // run-tagged. Then fetch its full EncounterDetails from GraphMCP to verify
    // the final output (the "look into the MCP for the encounter summary" check).
    const logged = await waitFor(
      async () => {
        const list = await listEncounters(100);
        const hit = list.find(e => typeof e.summary === 'string' && e.summary.includes(run));
        return hit ?? null;
      },
      { timeoutMs: 45_000, intervalMs: 2_000 },
    ).catch(() => null); // a poll failure becomes the descriptive assertion below
    expect(logged, 'log_encounter from /encounter end must be readable via list_encounters (matched by run id in summary)').toBeTruthy();

    const details = await getEncounter(logged!.id);
    expect(details, 'GraphMCP must return full EncounterDetails for the logged event').toBeTruthy();
    expect(details!.summary.includes(run), 'GraphMCP encounter summary must preserve the run-tagged DM notes').toBe(true);
    expect(Array.isArray(details!.participants), 'GraphMCP encounter must list participants').toBe(true);
    expect(details!.participants.length, 'participants must include the encounter NPCs/players').toBeGreaterThan(0);
  }, 150_000);
});
|
||||
298
tests/integration/graphmcp/long-encounter.test.ts
Normal file
298
tests/integration/graphmcp/long-encounter.test.ts
Normal file
@@ -0,0 +1,298 @@
|
||||
// AC5 — Long encounter (20–30 turns) with complex skill usage, varied goal
|
||||
// outcomes, and final-output verification by reading the encounter summary
|
||||
// back out of GraphMCP.
|
||||
//
|
||||
// One encounter per invocation. The driver strategy is selected by E2E_STRATEGY
|
||||
// (valid keys: catch | negotiate | flee | bystander | long_explore; default 'catch'); rotate strategies across loop runs to accumulate coverage
|
||||
// of DIFFERENT goal outcomes (catch / negotiate / escape / bystander_chase).
|
||||
// Keeping one encounter per run holds each live run to ~2–5 min, well under the
|
||||
// 10m loop cadence — this avoids two runs logging in with the same DISCORD_TOKEN
|
||||
// concurrently (which would disconnect each other).
|
||||
//
|
||||
// Flow (faithful to the real scheduler, to avoid double-turn races):
|
||||
// append a user action → scheduleEncounterLLMTurn(immediate) → poll history
|
||||
// for the landed turn → if a skill check is pending, resolve it via
|
||||
// handleRollInteraction (+ fake button) and poll for the reaction turn, in a
|
||||
// loop so chained checks are handled → repeat until phase === 'resolved' or
|
||||
// 30 turns. Then read the encounter_resolve log back from GraphMCP and assert
|
||||
// the outcome + summary.
|
||||
//
|
||||
// Gate: RUN_FULL_E2E=1. Requires the full live stack (Discord + LLM + Redis +
|
||||
// GraphMCP). Skipped by default → CI-safe.
|
||||
|
||||
import './support/env.js';
|
||||
import { describe, it, expect, beforeAll, afterAll } from 'vitest';
|
||||
import { readFileSync, writeFileSync, rmSync } from 'fs';
|
||||
import { join } from 'path';
|
||||
import { load, dump } from 'js-yaml';
|
||||
import { config } from '../../../src/config.js';
|
||||
import { execute } from '../../../src/bot/commands/encounter.js';
|
||||
import { loadSpec } from '../../../src/spec/loader.js';
|
||||
import { sessionManager } from '../../../src/session/sessionManager.js';
|
||||
import { scheduleEncounterLLMTurn } from '../../../src/bot/handlers/messageRouter.js';
|
||||
import { handleRollInteraction } from '../../../src/bot/handlers/rollHandler.js';
|
||||
import { listEncounters, getEncounter } from '../../../src/graphmcp/client.js';
|
||||
import { runId } from './support/factories.js';
|
||||
import { connectLiveBots, disconnectLiveBots, type LiveBots } from './support/liveBots.js';
|
||||
import { fakeInteraction, fakeButton, parseThreadIdFromReply } from './support/fakes.js';
|
||||
import { flushRedisForGuild, disconnectRedis, deleteThread, deleteSession } from './support/cleanup.js';
|
||||
import { waitFor } from './support/poll.js';
|
||||
import type { ThreadChannel } from 'discord.js';
|
||||
|
||||
// Opt-in gate: the suite only runs when RUN_FULL_E2E is exactly '1'.
const runE2E = process.env.RUN_FULL_E2E === '1';
// Hard cap on driver turns per encounter.
const MAX_TURNS = 30;

/** A scripted play-style for the driver. */
interface Strategy {
  /** Label interpolated into the suite and test titles. */
  name: string;
  /**
   * In-character driver lines, played in order. Once the script is exhausted
   * the final line is repeated until the encounter resolves.
   */
  actions: string[];
  /**
   * Lower bound on driver turns (user messages appended) expected before
   * resolution. Catches a harness that silently short-circuits to a 2–4 turn
   * encounter. The LLM decides when to resolve, so this is a floor rather
   * than an exact count — keep it conservative per strategy.
   */
  minDriverTurns?: number;
}

// Script for `long_explore`: a slow, exploratory play that lingers in the
// scene — observing, talking to several NPCs, and attempting several DIFFERENT
// skill checks (Perception to spot, Athletics to chase, Persuasion to recruit
// the juggler, Intimidation to corner) — before any decisive action. This is
// the strategy intended to yield 20–30 turn coverage with complex skill usage;
// the decisive strategies resolve in a handful of turns. The LLM may still
// resolve early (e.g. Dal escapes during the exploration) — a valid outcome,
// while the minDriverTurns floor flags a harness regression that cuts it short.
const LONG_EXPLORE_ACTIONS: string[] = [
  "I take a moment to scan the festival crowd, noting the exits and the two guards' position at the far end of the square.",
  "I approach Miriam's apple stand. 'What happened — which way did the thief go?'",
  "I look in the direction Miriam points, trying to pick the hooded figure out of the crowd.",
  "I notice the young juggler by the fountain watching the commotion with interest.",
  "I call over to the juggler: 'Did you see which way that thief ran?'",
  "I try to persuade the juggler to help me head the thief off — 'A hand here would be worth a drink after!'",
  "I scan the alley mouths along the square's edge for any movement, squinting into the shadows.",
  "I move quickly toward the nearest alley, keeping my eyes peeled for the hooded figure.",
  "I peer behind a stack of crates near the alley entrance, listening for breathing.",
  "Catching a flash of brown hood ducking behind a stall, I sprint after him to cut off his escape.",
  "I call out: 'Wait — stop! I just want to talk!'",
  "I chase Dal into the alley, trying to close the gap before he vanishes.",
  "I scan the alley for where he's hidden himself behind the refuse and barrels.",
  "Spotting him pressed against the wall, I block the alley mouth so he can't bolt past me.",
  "I approach Dal slowly, hands open and visible, but making clear the exit is covered.",
  "'Easy — I'm not here to hurt you. Why did you take the apple?'",
  "I study Dal's face — gaunt, hollow-eyed. He looks genuinely hungry, not malicious.",
  "I ask Dal his name and how long it's been since he last ate.",
  "I tell Dal firmly that he's not leaving this alley until we sort this out — he needs to drop the apple.",
  "I glance back toward Miriam, then to the guards at the far end, weighing my options.",
  "I pull a coin from my pouch and hold it out toward Dal.",
  "'Take this for the apple. You look like you need a meal more than Miriam needs three silvers.'",
  "I tell Dal: 'Give the apple back to Miriam and I'll make sure she doesn't call the guards. Deal?'",
  "I wait for Dal's answer, hand still extended with the coin.",
  "I add quietly: 'Nobody needs to get hurt or arrested today. Just hand it over.'",
];

// Strategy table, keyed by the value expected in E2E_STRATEGY. Note that the
// key and the display `name` differ for `flee` and `bystander`.
const STRATEGIES: Record<string, Strategy> = {
  catch: {
    name: 'catch',
    actions: [
      "I sprint after the hooded thief, weaving through the festival crowd to cut off his escape toward the alley.",
      "I dive to tackle Dal around the legs before he can reach the alley mouth.",
      "I grab Dal's arm and pin him against a stall so he can't bolt, holding firm.",
      "I keep him restrained and shout back to Miriam that I've caught her thief.",
    ],
  },
  negotiate: {
    name: 'negotiate',
    actions: [
      "I move to block the alley exit, cornering Dal so he can't run, but I keep my hands open and visible.",
      "I speak calmly to Dal: 'Easy — I'm not going to hurt you. Why did you take the apple?'",
      "I pull a coin from my pouch and hold it out. 'Take this for the apple. You look hungry — when did you last eat?'",
      "I offer Dal the coin and my word that Miriam won't call the guards if he gives the apple back.",
    ],
  },
  flee: {
    name: 'flee (escape)',
    actions: [
      "I hesitate, unsure whether to intervene, and watch the thief sprint toward the crowd.",
      "I step aside to let him pass, not wanting to cause a scene at the festival.",
      "I turn back to Miriam and shrug apologetically as Dal vanishes into the alley.",
    ],
  },
  bystander: {
    name: 'bystander_chase',
    actions: [
      "I shout to the young juggler by the fountain: 'Hey — that kid just robbed the apple stand! Help me catch him!'",
      "I urge the juggler: 'You're young and quick — you can head him off before he reaches the alley. I'll make it worth your while!'",
      "I point after Dal and wave the juggler after him, staying put by the stand so I don't spook Dal into running harder.",
      "I call to Miriam: 'Watch which alley he ducks into — the juggler's going after him!'",
      "I watch the juggler give chase, ready to shout out Dal's hiding spot if he doubles back.",
      "I stay by the stand and shout encouragement to the juggler as he closes in, keeping Miriam calm.",
      "I keep my eyes on Dal and direct the juggler: 'He's heading for the crates — cut left!'",
    ],
  },
  long_explore: {
    name: 'long_explore',
    minDriverTurns: 15,
    actions: LONG_EXPLORE_ACTIONS,
  },
};
|
||||
|
||||
// Resolve the driver strategy from E2E_STRATEGY (default 'catch').
// Only OWN keys of STRATEGIES count: a plain index lookup would also resolve
// inherited members (e.g. E2E_STRATEGY=toString yields a function, not a
// Strategy). An unrecognised key still falls back to 'catch', but says so —
// otherwise a typo silently re-runs the default and the intended outcome
// never gets coverage.
const strategyKey = process.env.E2E_STRATEGY ?? 'catch';
const strategyKeyIsKnown = Object.hasOwn(STRATEGIES, strategyKey);
if (!strategyKeyIsKnown) {
  console.warn(
    `[e2e] Unknown E2E_STRATEGY '${strategyKey}'; falling back to 'catch'. Valid keys: ${Object.keys(STRATEGIES).join(', ')}`,
  );
}
const strategy = (strategyKeyIsKnown ? STRATEGIES[strategyKey] : undefined) ?? STRATEGIES.catch;
|
||||
|
||||
// AC5 — one long, scripted encounter per invocation, driven through the real
// scheduler, then verified by reading the encounter log back out of GraphMCP.
describe.skipIf(!runE2E)(`AC5 — Long encounter, strategy=${strategy.name} (live)`, () => {
  // Ceiling on back-to-back skill checks resolved within a single driver turn.
  // Without it, a check that never clears while history keeps growing would
  // spin until the 600s test timeout with no useful message.
  const MAX_CHAINED_CHECKS = 10;

  let bots: LiveBots;
  const run = runId();
  const specSlug = `e2e-${run}`;
  const specPath = join(config.SPECS_DIR, `${specSlug}.yaml`);
  let threadId: string | null = null;
  let thread: ThreadChannel | null = null;
  let validOutcomeIds: Set<string>;

  beforeAll(async () => {
    bots = await connectLiveBots();
    await flushRedisForGuild(bots.guild.id);

    // Write a run-tagged spec derived from market-thief so the encounter_resolve
    // GraphMCP log (title `${spec.title} — ${outcomeId}`) is uniquely findable
    // by this run's id, and the outcomeId is verifiable in MCP.
    const base = load(readFileSync(join(config.SPECS_DIR, 'market-thief.yaml'), 'utf-8')) as Record<string, unknown>;
    base.encounterId = specSlug;
    base.title = `[E2E ${run}] The Market Square Thief`;
    writeFileSync(specPath, dump(base, { lineWidth: 120, quotingType: '"' }), 'utf-8');

    const spec = loadSpec(specSlug);
    validOutcomeIds = new Set([
      ...spec.goals.primary.map(g => g.id),
      ...spec.goals.secondary.map(g => g.id),
    ]);
  }, 120_000);

  afterAll(async () => {
    // Each cleanup step is isolated so one failure does not skip the rest.
    // `bots` stays unassigned when connectLiveBots() rejected in beforeAll.
    try {
      rmSync(specPath, { force: true });
      if (threadId) {
        try {
          if (bots) await deleteThread(bots.channel, threadId);
        } finally {
          await deleteSession(threadId);
        }
      }
    } finally {
      try {
        await disconnectRedis();
      } finally {
        if (bots) await disconnectLiveBots(bots);
      }
    }
  }, 120_000);

  it(`drives a 20–30 turn encounter via ${strategy.name}, exercising skill checks, reaching a valid goal outcome, and verifies the GraphMCP summary`, async () => {
    // ── Start the run-tagged encounter ──────────────────────────────────────
    const { interaction, lastText } = fakeInteraction({
      subcommand: 'start',
      stringOptions: { spec: specSlug },
      channel: bots.channel,
      guildId: bots.guild.id,
      userId: process.env.E2E_DRIVER_USER_ID ?? 'e2e-driver-user',
      username: 'E2E Driver',
    });
    await execute(interaction);
    threadId = parseThreadIdFromReply(lastText());
    expect(threadId, 'encounter must start and reply with the thread').toBeTruthy();
    thread = await bots.channel.threads.fetch(threadId!);
    // Poll for the session rather than reading it once, matching AC2 S2.1.
    const startSession = await waitFor(
      async () => (await sessionManager.get(threadId!)) ?? null,
      { timeoutMs: 30_000, intervalMs: 1_000 },
    );
    expect(startSession, 'session must be persisted').toBeTruthy();

    // ── Drive up to MAX_TURNS turns ──────────────────────────────────────────
    let actionIdx = 0;
    let resolved = false;
    for (let turn = 0; turn < MAX_TURNS; turn++) {
      const current = await sessionManager.get(threadId!);
      // A missing session is a failure, not a resolution.
      expect(current, `session must still exist at driver turn ${turn + 1}`).toBeTruthy();
      if (current!.phase === 'resolved') { resolved = true; break; }

      const action = strategy.actions[actionIdx] ?? strategy.actions.at(-1)!;
      actionIdx++;

      await sessionManager.addMessage(threadId!, {
        role: 'user',
        content: `E2E Driver: ${action}`,
        timestamp: Date.now(),
      });
      // Baseline AFTER the user message is in history, so waitFor waits for the
      // assistant/tool turn to land — not for the user message we just added.
      const prevLen = (await sessionManager.get(threadId!))!.history.length;
      scheduleEncounterLLMTurn(threadId!, thread!, bots.botClient, true);

      // Wait for the turn to land (an assistant narrative, a tool-call system
      // message, or a filter-correction). 90s per turn for the real LLM.
      await waitFor(
        async () => {
          const x = await sessionManager.get(threadId!);
          return x && x.history.length > prevLen ? x : null;
        },
        { timeoutMs: 90_000, intervalMs: 2_000 },
      );

      // Resolve any pending skill check (and chained checks). Each resolution
      // schedules a reaction turn; poll for that to land before continuing.
      for (let chained = 0; ; chained++) {
        const cur = await sessionManager.get(threadId!);
        if (!cur?.pendingSkillCheck) break;
        expect(
          chained,
          `more than ${MAX_CHAINED_CHECKS} chained skill checks in one driver turn — pendingSkillCheck is not clearing`,
        ).toBeLessThan(MAX_CHAINED_CHECKS);
        await handleRollInteraction(fakeButton(thread!, 'sc_roll').interaction, bots.botClient);
        // handleRollInteraction appends the [SKILL CHECK RESULT] message before
        // scheduling the reaction turn — measure the baseline after it returns,
        // then wait for the reaction turn to add another history entry (or the
        // encounter to resolve).
        const baseline = (await sessionManager.get(threadId!))!.history.length;
        await waitFor(
          async () => {
            const x = await sessionManager.get(threadId!);
            return x && (x.history.length > baseline || x.phase === 'resolved') ? x : null;
          },
          { timeoutMs: 90_000, intervalMs: 2_000 },
        );
      }

      const after = await sessionManager.get(threadId!);
      if (after?.phase === 'resolved') { resolved = true; break; }
    }

    // ── Assert the encounter reached a valid goal outcome ───────────────────
    expect(resolved, `encounter must resolve within ${MAX_TURNS} turns`).toBe(true);
    const final = await sessionManager.get(threadId!);
    expect(final, 'resolved session must still be readable').toBeTruthy();
    expect(final!.phase).toBe('resolved');
    expect(final!.outcome, 'an outcomeId must be recorded').toBeTruthy();
    expect(
      validOutcomeIds.has(final!.outcome!),
      `outcome '${final!.outcome}' must be one of the spec's goal ids: ${[...validOutcomeIds].join(', ')}`,
    ).toBe(true);
    expect(final!.outcomeSummary, 'an LLM outcome summary must be recorded').toBeTruthy();
    // A long encounter should have produced a real conversation.
    expect(final!.history.length, 'history should reflect a multi-turn encounter').toBeGreaterThanOrEqual(5);
    // Driver turns = user messages appended. Guards against the harness
    // silently short-circuiting to a 2–4 turn encounter for a strategy meant to
    // sustain a long scene (the long_explore coverage target).
    const driverTurns = final!.history.filter(m => m.role === 'user').length;
    const minTurns = strategy.minDriverTurns ?? 5;
    expect(
      driverTurns,
      `strategy '${strategy.name}' should sustain ≥${minTurns} driver turns before resolution (got ${driverTurns})`,
    ).toBeGreaterThanOrEqual(minTurns);

    // ── Verify the final output in GraphMCP: read the encounter_resolve log ─
    // encounter_resolve logs title `${spec.title} — ${outcomeId}`, where
    // spec.title is run-tagged, so we locate it by the run id.
    const logged = await waitFor(
      async () => {
        const list = await listEncounters(100);
        const hit = list.find(e => typeof e.title === 'string' && e.title.includes(run));
        return hit ?? null;
      },
      { timeoutMs: 45_000, intervalMs: 2_000 },
    ).catch(() => null); // a poll failure becomes the descriptive assertion below
    expect(logged, 'encounter_resolve log must be readable via list_encounters (matched by run id in title)').toBeTruthy();
    expect(
      logged!.title.includes(final!.outcome!),
      'GraphMCP title must record the resolved outcomeId',
    ).toBe(true);

    const details = await getEncounter(logged!.id);
    expect(details, 'GraphMCP must return full EncounterDetails').toBeTruthy();
    expect(details!.summary, 'GraphMCP encounter summary must be non-empty').toBeTruthy();
    expect(Array.isArray(details!.participants), 'GraphMCP encounter must list participants').toBe(true);
    expect(details!.participants.length, 'participants must include the encounter NPCs').toBeGreaterThan(0);
    expect(details!.type).toBe('encounter');
  }, 600_000);
});
|
||||
101
tests/integration/graphmcp/lore-and-events.test.ts
Normal file
101
tests/integration/graphmcp/lore-and-events.test.ts
Normal file
@@ -0,0 +1,101 @@
|
||||
// AC4 — Lore/question answering + event read-after-write (live GraphMCP + LLM + Discord).
|
||||
//
|
||||
// S4.1: the driver bot @mentions the bot under test in the (non-thread) test
|
||||
// channel. The hybrid approach fetches that real mention message and routes
|
||||
// it through the real handleMention() with the live bot client — exercising
|
||||
// semanticSearch + queryAsNPC + callLLM → lore-answer embed → reply, all
|
||||
// against real GraphMCP + real LLM. We assert a bot reply is posted
|
||||
// (structural); asserting the reply *cites specific lore* is left as a
|
||||
// soft/manual check (LLM output is non-deterministic).
|
||||
// S4.2: log_encounter read-after-write consistency — a freshly logged event
|
||||
// becomes readable via list_encounters / search_encounters (poll for
|
||||
// eventual consistency).
|
||||
//
|
||||
// Gate: RUN_FULL_E2E=1. S4.1 needs persona.yaml present + Redis (ingest stream)
|
||||
// + GraphMCP + LLM; S4.2 needs only GraphMCP (so it is also covered by AC1).
|
||||
|
||||
import './support/env.js';
|
||||
import { describe, it, expect, beforeAll, afterAll } from 'vitest';
|
||||
import { handleMention } from '../../../src/bot/handlers/mentionHandler.js';
|
||||
import { logEncounter, listEncounters, searchEncounters } from '../../../src/graphmcp/client.js';
|
||||
import { runId, buildEncounterLog, titleMatchesRun } from './support/factories.js';
|
||||
import { connectLiveBots, disconnectLiveBots, type LiveBots } from './support/liveBots.js';
|
||||
import { flushRedisForGuild, disconnectRedis } from './support/cleanup.js';
|
||||
import { waitFor } from './support/poll.js';
|
||||
|
||||
// Opt-in gate: the suite only runs against the full live stack.
const runE2E = process.env.RUN_FULL_E2E === '1';

// AC4 — lore answering via a real @mention, plus GraphMCP read-after-write.
describe.skipIf(!runE2E)('AC4 — Lore answering + event read-after-write (live)', () => {
  let bots: LiveBots;

  beforeAll(async () => {
    bots = await connectLiveBots();
    await flushRedisForGuild(bots.guild.id);
  }, 120_000);

  afterAll(async () => {
    // try/finally so a failing Redis disconnect cannot leave the Discord
    // clients logged in; `bots` stays unassigned when beforeAll failed.
    try {
      await disconnectRedis();
    } finally {
      if (bots) await disconnectLiveBots(bots);
    }
  }, 120_000);

  // S4.1 — @mention triggers lore answering (real GraphMCP + real LLM) --------
  it('S4.1 an @mention produces a bot reply referencing graph lore', async () => {
    const botUserId = bots.botClient.user?.id;
    expect(botUserId, 'bot under test must be logged in').toBeTruthy();

    // Driver bot @mentions the under-test bot with a lore-flavored question,
    // posted in the (non-thread) test channel.
    const question = `What do the Ratling syndicates want with the Stormscar? (run ${runId()})`;
    const mention = `<@${botUserId}> ${question}`;
    const driverChannel = await bots.driverBot.channels.fetch(bots.channel.id);
    const sent = await (driverChannel as typeof bots.channel).send(mention);

    // Fetch the real mention message (via the under-test client) and route it
    // through the real mention handler.
    const realMsg = await bots.channel.messages.fetch(sent.id);
    await handleMention(realMsg, bots.botClient);

    // Poll for a bot-authored message posted AFTER the mention. Restricting the
    // window with `after` matters: the test channel is reused across runs, so
    // an unrestricted "last 10 messages" scan would accept a stale reply from
    // an earlier run and pass even when the bot never answered this mention.
    const reply = await waitFor(
      async () => {
        const recent = await bots.channel.messages.fetch({ limit: 10, after: sent.id });
        const mine = recent.find(m => m.author.id === botUserId);
        return mine ?? null;
      },
      { timeoutMs: 120_000, intervalMs: 3_000 },
    );
    expect(reply, 'bot must reply to the @mention').toBeTruthy();
    // Structural check only: the reply has text content or at least one embed.
    expect(reply.content.length + (reply.embeds.length > 0 ? 1 : 0)).toBeGreaterThan(0);
    // TODO(soft): assert the reply references real graph lore. LLM output is
    // non-deterministic, so this stays a structural existence check; a human
    // or a deterministic lore-injection fixture would assert cited content.
  }, 150_000);

  // S4.2 — log_encounter read-after-write consistency -------------------------
  it('S4.2 a logged encounter is readable via list/search afterwards', async () => {
    const run = runId();
    const log = buildEncounterLog(run, { title: 'Read-after-write probe' });
    const written = await logEncounter(log);
    expect(written.enc_id, 'log_encounter must return an id').toBeTruthy();

    // list_encounters eventually surfaces the new event.
    const inList = await waitFor(
      async () => {
        const list = await listEncounters(100);
        return list.some(e => e.id === written.enc_id) ? true : null;
      },
      { timeoutMs: 30_000, intervalMs: 2_000 },
    );
    expect(inList, 'list_encounters must surface the just-logged event').toBe(true);

    // search_encounters also surfaces it (by this run's unique tag in the title).
    const inSearch = await waitFor(
      async () => {
        const r = await searchEncounters({ query: run, limit: 100 });
        return r.some(e => titleMatchesRun(run)(e.title)) ? true : null;
      },
      { timeoutMs: 30_000, intervalMs: 2_000 },
    );
    expect(inSearch, 'search_encounters must surface the just-logged event').toBe(true);
  }, 90_000);
});
|
||||
142
tests/integration/graphmcp/skill-check.test.ts
Normal file
142
tests/integration/graphmcp/skill-check.test.ts
Normal file
@@ -0,0 +1,142 @@
|
||||
// AC3 — Skill-check tool (live Discord + Redis; no LLM needed for the tool itself).
|
||||
//
|
||||
// The skill-check flow is driven DETERMINISTICALLY (not by waiting for the LLM
|
||||
// to choose to emit it):
|
||||
// S3.1: invoke the registered `skill_check_emit` tool handler directly with a
|
||||
// real thread + session. It posts the suspense→skill-check embed to
|
||||
// real Discord and sets `pendingSkillCheck` in Redis.
|
||||
// S3.2: drive the roll resolution directly via handleRollInteraction with a
|
||||
// fake ButtonInteraction targeting the posted embed (customId 'sc_roll').
|
||||
// submitResult computes the outcome, clears `pendingSkillCheck`, appends
|
||||
// the [SKILL CHECK RESULT] system message, and schedules the next LLM
|
||||
// turn.
|
||||
//
|
||||
// Assert on structural session-state transitions, not embed text.
|
||||
// Gate: RUN_FULL_E2E=1. Requires the same live stack as AC2 (minus the LLM for
|
||||
// the emit step itself; resolution schedules a real LLM turn afterward).
|
||||
|
||||
import './support/env.js';
|
||||
import { describe, it, expect, beforeAll, afterAll } from 'vitest';
|
||||
import { execute } from '../../../src/bot/commands/encounter.js';
|
||||
import { sessionManager } from '../../../src/session/sessionManager.js';
|
||||
import { handleRollInteraction } from '../../../src/bot/handlers/rollHandler.js';
|
||||
import { getPlugin } from '../../../src/harness/toolRegistry.js';
|
||||
// Side-effect import: populates the tool registry (skill_check_emit etc.) so
|
||||
// getPlugin('skill_check_emit') resolves. toolDispatcher normally does this,
|
||||
// but this test calls the plugin handler directly without going through dispatch.
|
||||
import '../../../src/harness/tools/index.js';
|
||||
import { runId } from './support/factories.js';
|
||||
import { connectLiveBots, disconnectLiveBots, type LiveBots } from './support/liveBots.js';
|
||||
import { fakeInteraction, fakeButton, parseThreadIdFromReply } from './support/fakes.js';
|
||||
import { flushRedisForGuild, disconnectRedis, deleteThread } from './support/cleanup.js';
|
||||
import { waitFor } from './support/poll.js';
|
||||
import type { ThreadChannel } from 'discord.js';
|
||||
|
||||
const runE2E = process.env.RUN_FULL_E2E === '1';
|
||||
const specName = process.env.E2E_SPEC ?? 'market-thief';
|
||||
|
||||
describe.skipIf(!runE2E)('AC3 — Skill-check tool (live)', () => {
|
||||
let bots: LiveBots;
|
||||
const run = runId();
|
||||
let threadId: string | null = null;
|
||||
let thread: ThreadChannel | null = null;
|
||||
let embedMessageId: string | undefined;
|
||||
|
||||
beforeAll(async () => {
|
||||
bots = await connectLiveBots();
|
||||
await flushRedisForGuild(bots.guild.id);
|
||||
|
||||
// Start a real encounter to obtain a live thread + persisted SessionState.
|
||||
const { interaction, lastText } = fakeInteraction({
|
||||
subcommand: 'start',
|
||||
stringOptions: { spec: specName },
|
||||
channel: bots.channel,
|
||||
guildId: bots.guild.id,
|
||||
});
|
||||
await execute(interaction);
|
||||
threadId = parseThreadIdFromReply(lastText());
|
||||
expect(threadId, 'encounter must start to drive a skill check').toBeTruthy();
|
||||
thread = await bots.channel.threads.fetch(threadId!);
|
||||
}, 120_000);
|
||||
|
||||
afterAll(async () => {
|
||||
try {
|
||||
if (threadId) await deleteThread(bots.channel, threadId);
|
||||
} finally {
|
||||
await disconnectRedis();
|
||||
await disconnectLiveBots(bots);
|
||||
}
|
||||
}, 120_000);
|
||||
|
||||
// S3.1 — emitting a skill check posts the embed and persists pendingSkillCheck
it('S3.1 skill_check_emit posts an embed to the thread and sets pendingSkillCheck', async () => {
  expect(threadId).toBeTruthy();

  // Preconditions: a stored session for the thread and a registered tool.
  const liveSession = await sessionManager.get(threadId!);
  expect(liveSession, 'session must exist before emitting a skill check').toBeTruthy();

  const emitTool = getPlugin('skill_check_emit');
  expect(emitTool, 'skill_check_emit must be registered').toBeTruthy();

  // Invoke the tool handler directly with a fixed DC-15 Athletics check.
  const result = await emitTool!.handler(
    {
      player: 'E2E Driver',
      prompt: 'E2E: attempts to force a stuck door open',
      skill: 'Athletics',
      dc: 15,
      advantage: false,
      disadvantage: false,
    },
    { session: liveSession!, thread: thread! },
  );
  expect(result.systemMessage, 'tool must return a system message').toBeTruthy();

  // Poll the session store until the pending check shows up.
  const readPendingSession = async () => {
    const snapshot = await sessionManager.get(threadId!);
    if (!snapshot?.pendingSkillCheck) return null;
    return snapshot;
  };
  const updated = await waitFor(readPendingSession, { timeoutMs: 15_000, intervalMs: 500 });
  expect(updated!.pendingSkillCheck, 'pendingSkillCheck must be persisted').toBeTruthy();
  expect(updated!.pendingSkillCheck!.dc).toBe(15);

  embedMessageId = updated!.pendingSkillCheck!.messageId;
  expect(embedMessageId, 'embed message id must be recorded in session').toBeTruthy();

  // The message lives on the real thread: a suspense embed first, then a
  // 1.5s-delayed edit to the full skill-check embed (see skillCheckEmit.ts).
  // A fetch failure or timeout collapses to null so the expect below reports it.
  const readEmbedMessage = async () => {
    const fetched = await thread!.messages.fetch(embedMessageId!).catch(() => null);
    if (!fetched) return null;
    return fetched.embeds.length > 0 ? fetched : null;
  };
  const msg = await waitFor(readEmbedMessage, { timeoutMs: 10_000, intervalMs: 500 })
    .catch(() => null);
  expect(msg, 'skill-check embed must exist on the thread').toBeTruthy();
}, 120_000);
|
||||
|
||||
// S3.2 — a roll resolves the check, clears pendingSkillCheck, records the outcome
it('S3.2 a roll resolves the check and clears pendingSkillCheck', async () => {
  expect(threadId).toBeTruthy();

  // Depends on S3.1 having left a pending check in the stored session.
  const before = await sessionManager.get(threadId!);
  expect(before?.pendingSkillCheck, 'S3.1 must have left a pending check').toBeTruthy();

  // Synthesize the "roll" button click and hand it to the real handler.
  const rollClick = fakeButton(thread!, 'sc_roll');
  await handleRollInteraction(rollClick.interaction, bots.botClient);

  // Poll until the stored session no longer carries a pending check.
  const readResolvedSession = async () => {
    const snapshot = await sessionManager.get(threadId!);
    if (!snapshot) return null;
    return snapshot.pendingSkillCheck === undefined ? snapshot : null;
  };
  const cleared = await waitFor(readResolvedSession, { timeoutMs: 30_000, intervalMs: 1_000 });
  expect(cleared!.pendingSkillCheck, 'pendingSkillCheck must be cleared on resolution').toBeUndefined();

  // The most recent system entry in history must be the [SKILL CHECK RESULT] message.
  const systemMessages = cleared!.history.filter(entry => entry.role === 'system');
  const lastSystem = systemMessages.at(-1);
  expect(lastSystem?.content, 'a skill-check result system message must be recorded')
    .toMatch(/\[SKILL CHECK RESULT\]/);
}, 120_000);
|
||||
});
|
||||
85
tests/integration/graphmcp/support/cleanup.ts
Normal file
85
tests/integration/graphmcp/support/cleanup.ts
Normal file
@@ -0,0 +1,85 @@
|
||||
// Cleanup helpers. Live E2E runs leak real artifacts: Redis session keys,
|
||||
// Discord threads, and GraphMCP encounter records. These helpers tear down what
|
||||
// the current run created, keyed by the run id / thread id, and are best-effort
|
||||
// (a cleanup failure must not mask a real test failure, so errors are swallowed
|
||||
// and logged).
|
||||
|
||||
import type { Client, TextChannel, ThreadChannel } from 'discord.js';
|
||||
|
||||
/**
 * Best-effort deletion of a Discord thread created by an E2E run.
 *
 * - `channel` is null: nothing to do.
 * - `channel` is itself a thread: that thread is deleted (`threadId` is not consulted).
 * - otherwise: `threadId` is fetched through the channel's thread manager and
 *   deleted if found.
 *
 * Never throws, so a cleanup failure cannot mask a real test failure. A thread
 * that is already gone is ignored silently; any other failure is logged with
 * `console.warn` so leaked threads are visible in the test output.
 *
 * @param channel  Parent text channel, the thread itself, or null.
 * @param threadId Id of the thread to remove when `channel` is the parent.
 */
export async function deleteThread(channel: TextChannel | ThreadChannel | null, threadId: string): Promise<void> {
  if (!channel) return;
  try {
    const target = channel.isThread()
      ? channel
      : await (channel as TextChannel).threads.fetch(threadId);
    if (target) await target.delete('E2E cleanup');
  } catch (err) {
    // 10003 is Discord's "Unknown Channel" JSON error code: the thread was
    // already deleted, which is the outcome we wanted anyway.
    if ((err as { code?: unknown } | null)?.code === 10003) return;
    console.warn(`[E2E cleanup] could not delete thread (requested id ${threadId}):`, err);
  }
}
|
||||
|
||||
/**
 * Remove this suite's Redis leftovers so a run starts from a clean slate.
 * Deletion is limited to known key prefixes — there is no global FLUSHDB.
 *
 * Session keys have the form `session:<threadId>` and carry no guild id, so a
 * guild-scoped pattern cannot select them. Every `session:*` key is therefore
 * read, and only those whose stored JSON has a `spec.encounterId` starting
 * with `e2e-` are deleted; all other sessions are left alone. Player data is
 * guild-scoped and is selected with `players:<guildId>`.
 *
 * All Redis failures are swallowed (best-effort cleanup).
 *
 * @param guildId Guild whose `players:` key should be removed.
 */
export async function flushRedisForGuild(guildId: string): Promise<void> {
  const { redis } = await import('../../../../src/db/redis.js');

  // True when the stored value parses as a session tagged by an E2E run.
  // Unparseable or differently shaped values are treated as "not ours".
  const isE2eSession = (raw: string): boolean => {
    try {
      const parsed = JSON.parse(raw) as { spec?: { encounterId?: string } };
      const encounterId = parsed.spec?.encounterId;
      return typeof encounterId === 'string' && encounterId.startsWith('e2e-');
    } catch {
      return false;
    }
  };

  const e2eSessionKeys: string[] = [];
  const candidates = await redis.keys('session:*').catch(() => []);
  for (const key of candidates) {
    const raw = await redis.get(key).catch(() => null);
    if (raw && isE2eSession(raw)) e2eSessionKeys.push(key);
  }

  const playerKeys = await redis.keys(`players:${guildId}`).catch(() => []);
  const doomed = e2eSessionKeys.concat(playerKeys);
  if (doomed.length > 0) {
    await redis.del(doomed).catch(() => null);
  }
}
|
||||
|
||||
/**
 * Best-effort removal of one `session:<threadId>` key. Intended for afterAll,
 * to tear down the session a run created after beforeAll's flush. A failing
 * delete is swallowed.
 *
 * @param threadId Thread id that forms the session key suffix.
 */
export async function deleteSession(threadId: string): Promise<void> {
  const sessionKey = `session:${threadId}`;
  const redisModule = await import('../../../../src/db/redis.js');
  await redisModule.redis.del(sessionKey).catch(() => null);
}
|
||||
|
||||
/**
 * Close the shared redis singleton that a run opened. Intended for afterAll so
 * the test process is able to exit cleanly.
 *
 * NOTE(review): the result of `disconnect()` is not awaited — fine if the
 * client's disconnect is synchronous; confirm against src/db/redis.
 */
export async function disconnectRedis(): Promise<void> {
  const redisModule = await import('../../../../src/db/redis.js');
  redisModule.redis.disconnect();
}
|
||||
|
||||
/**
 * Known gap in GraphMCP cleanup: src/graphmcp/client.ts offers no delete tool,
 * so encounter records written during a run are not removed by these helpers.
 * Each record's title starts with `[E2E] <runId> —`, which keeps test data
 * distinguishable from real data while it accumulates. Once a
 * `delete_encounter` tool (or a direct GraphMCP admin call) exists, cleanup
 * can remove them.
 */
export const GRAPHMCP_CLEANUP_LIMITATION =
  'No delete tool in src/graphmcp/client.ts; test encounters are prefixed [E2E] and left in place.';
|
||||
|
||||
/** Re-export client for tests that need to fetch channels for cleanup. */
|
||||
export type { Client };
|
||||
24
tests/integration/graphmcp/support/env.ts
Normal file
24
tests/integration/graphmcp/support/env.ts
Normal file
@@ -0,0 +1,24 @@
|
||||
// Test-environment bootstrap — imported FIRST by every graphmcp integration
|
||||
// test so it evaluates before `src/config.ts` runs `EnvSchema.parse(process.env)`.
|
||||
//
|
||||
// config.ts requires DISCORD_TOKEN / DISCORD_CLIENT_ID to be present (Zod
|
||||
// .string(), no default). The GraphMCP contract suite (AC1) does not connect
|
||||
// to Discord — it only needs GRAPHMCP_URL — so we inject harmless stubs when
|
||||
// real creds are absent. A real `.env` wins because we only fill keys that are
|
||||
// unset — BUT we must load .env first, otherwise this runs before config.ts's
|
||||
// `import 'dotenv/config'` and would stub over a real token that hasn't loaded
|
||||
// yet (dotenv never clobbers an existing process.env value, so the stub would
|
||||
// stick and the live E2E login would get TokenInvalid).
|
||||
//
|
||||
// If a dedicated test channel id is provided via E2E_TEST_CHANNEL_ID, also
|
||||
// seed DISCORD_ALLOWED_CHANNELS so /encounter start's channel allowlist passes
|
||||
// without requiring the maintainer to edit .env for a one-off test run.
|
||||
|
||||
import 'dotenv/config';
|
||||
|
||||
// Fill in harmless stubs for the Discord credentials, but only for keys that
// are still unset or empty.
for (const name of ['DISCORD_TOKEN', 'DISCORD_CLIENT_ID']) {
  process.env[name] ||= `test-${name}-stub`;
}

// Seed the channel allowlist from the dedicated test channel, unless an
// allowlist was already configured.
const { E2E_TEST_CHANNEL_ID: e2eTestChannelId, DISCORD_ALLOWED_CHANNELS: configuredAllowlist } = process.env;
if (e2eTestChannelId && !configuredAllowlist) {
  process.env.DISCORD_ALLOWED_CHANNELS = e2eTestChannelId;
}
|
||||
38
tests/integration/graphmcp/support/factories.ts
Normal file
38
tests/integration/graphmcp/support/factories.ts
Normal file
@@ -0,0 +1,38 @@
|
||||
// Data factories for live integration tests. Every entity created by a run —
|
||||
// GraphMCP encounter logs, encounter threads, Redis keys — is tagged with a
|
||||
// unique run id so runs never collide with each other or with real data, and
|
||||
// so cleanup can identify this run's leftovers.
|
||||
|
||||
/**
 * Build a run prefix of the form `e2e-<epoch ms>-<pid>`.
 *
 * The timestamp is read on every call, so two calls can return different ids:
 * capture the result once and reuse that value wherever a stable id is needed.
 */
export function runId(): string {
  const startedAt = Date.now();
  return ['e2e', startedAt, process.pid].join('-');
}
|
||||
|
||||
/**
 * Assemble an encounter-log payload whose title is tagged for this run.
 *
 * The title always starts with `[E2E] ${run} — `; list/search assertions and
 * cleanup rely on that prefix to recognise the run's records. Any field left
 * out of `overrides` (or set to null/undefined) falls back to a fixed test
 * default.
 *
 * @param run       Run id embedded in the title.
 * @param overrides Optional per-field replacements for the defaults.
 * @returns An object with `title`, `participants`, `summary`, `location`, `type`.
 */
export function buildEncounterLog(
  run: string,
  overrides: {
    title?: string;
    participants?: string;
    summary?: string;
    location?: string;
    type?: string;
  } = {},
) {
  const defaults = {
    title: 'Test encounter',
    participants: 'Test Player, Miriam',
    summary: 'Automated integration test encounter.',
    location: 'Mardonar — test district',
    type: 'encounter',
  };
  // Nullish override falls back to the default; an empty string is kept as-is.
  const field = (key: keyof typeof defaults): string => overrides[key] ?? defaults[key];

  return {
    title: `[E2E] ${run} — ${field('title')}`,
    participants: field('participants'),
    summary: field('summary'),
    location: field('location'),
    type: field('type'),
  };
}
|
||||
|
||||
/**
 * Create a predicate that recognises titles belonging to `run`: it is true
 * only for string inputs containing `[E2E] ${run}`.
 */
export function titleMatchesRun(run: string): (t: string) => boolean {
  const marker = `[E2E] ${run}`;
  return function matchesRun(t: string): boolean {
    // Results come from a live backend, so non-string titles are rejected.
    if (typeof t !== 'string') return false;
    return t.includes(marker);
  };
}
|
||||
128
tests/integration/graphmcp/support/fakes.ts
Normal file
128
tests/integration/graphmcp/support/fakes.ts
Normal file
@@ -0,0 +1,128 @@
|
||||
// Fake ChatInputCommandInteraction backed by REAL discord.js objects.
|
||||
//
|
||||
// The hybrid slash-command pattern: bots cannot invoke each other's slash
|
||||
// commands via the Discord API, so we call the registered command's execute()
|
||||
// directly with a fake interaction whose `channel`/`guildId` are REAL objects
|
||||
// fetched from the live client. Thread creation, message posting, and replies
|
||||
// therefore flow through the real gateway; only the command "click" is
|
||||
// synthesized.
|
||||
//
|
||||
// This fake implements exactly the subset of ChatInputCommandInteraction that
|
||||
// src/bot/commands/encounter.ts reads. Reply/editReply calls are captured so
|
||||
// tests can assert on them; the real side effects (channel.threads.create,
|
||||
// thread.send, channel.setArchived) hit real Discord via the real channel.
|
||||
|
||||
import type { ChatInputCommandInteraction, TextChannel, ThreadChannel } from 'discord.js';
|
||||
|
||||
/** Payload recorded by the fake interaction's `reply` / `editReply`. */
export interface CapturedReply {
  /** Message text; a bare-string payload is stored here. */
  content?: string;
  embeds?: unknown[];
  ephemeral?: boolean;
  files?: unknown[];
}
|
||||
|
||||
/** Inputs for building a fake slash-command interaction. */
export interface FakeInteractionOptions {
  /** Value returned by `options.getSubcommand()`. */
  subcommand: string;
  /** Values served by `options.getString(name)`; missing names yield null. */
  stringOptions?: Record<string, string>;
  /** Channel exposed as `interaction.channel`; its id backs `channelId`. */
  channel: TextChannel | ThreadChannel;
  guildId: string;
  /** Becomes `interaction.user.id`. */
  userId?: string;
  /** Becomes `interaction.user.username`. */
  username?: string;
}
|
||||
|
||||
/** A fake interaction bundled with everything it captured. */
export interface FakeInteraction {
  /** The synthesized object to pass to a command's execute(). */
  interaction: ChatInputCommandInteraction;
  /** Payloads passed to `reply`, in call order. */
  replies: CapturedReply[];
  /** Payloads passed to `editReply`, in call order. */
  edits: CapturedReply[];
  /** Text of the latest edit, or of the latest reply when nothing was edited. */
  lastText(): string | undefined;
}
|
||||
|
||||
/**
 * Build a fake ChatInputCommandInteraction around the supplied channel.
 *
 * Only the members set below are provided. `reply` and `editReply` record
 * their payloads (bare strings are wrapped as `{ content }`) and resolve to an
 * empty object; `deferReply` and `followUp` do nothing. `getBoolean` and
 * `getInteger` always return null, and `getString` returns null for names not
 * present in `stringOptions`.
 *
 * @param opts Subcommand, string options, channel, guild and optional user identity.
 * @returns The fake interaction plus its captured replies/edits and a `lastText` accessor.
 */
export function fakeInteraction(opts: FakeInteractionOptions): FakeInteraction {
  const replies: CapturedReply[] = [];
  const edits: CapturedReply[] = [];

  // Record a payload in `sink`, normalising the bare-string form.
  const capture = (sink: CapturedReply[], payload: string | CapturedReply) => {
    sink.push(typeof payload === 'string' ? { content: payload } : payload);
    return {};
  };

  const interaction = {
    guildId: opts.guildId,
    // Getter so the id is read from the channel at access time.
    get channelId() {
      return opts.channel.id;
    },
    channel: opts.channel,
    user: {
      id: opts.userId ?? 'e2e-driver-user',
      username: opts.username ?? 'E2E Driver',
      bot: false,
    },
    member: undefined,
    options: {
      getSubcommand: () => opts.subcommand,
      getString: (name: string, _required?: boolean) => {
        const value = opts.stringOptions?.[name];
        return value ?? null;
      },
      getBoolean: () => null,
      getInteger: () => null,
    },
    deferReply: async (_o?: { ephemeral?: boolean }) => {
      /* no-op — replies are captured at editReply/reply */
    },
    editReply: async (payload: string | CapturedReply) => capture(edits, payload),
    reply: async (payload: string | CapturedReply) => capture(replies, payload),
    followUp: async (_payload: unknown) => ({}),
  } as unknown as ChatInputCommandInteraction;

  // Edits take precedence over replies, regardless of call order.
  const lastText = () => (edits.at(-1) ?? replies.at(-1))?.content;

  return { interaction, replies, edits, lastText };
}
|
||||
|
||||
/**
 * Extract the numeric id from the first `<#id>` channel mention in `text`,
 * e.g. "Encounter started: <#123>" yields "123". Returns null when `text` is
 * empty/undefined or contains no such mention.
 */
export function parseThreadIdFromReply(text: string | undefined): string | null {
  const mention = text ? text.match(/<#(\d+)>/) : null;
  return mention?.[1] ?? null;
}
|
||||
|
||||
/**
 * A fake ButtonInteraction aimed at a posted skill-check embed, together with
 * the payloads passed to its `update()`.
 *
 * Per the original note, submitResult (src/bot/handlers/rollHandler.ts) reads
 * only `interaction.channel` (the real thread) and calls
 * `interaction.update()` without re-fetching the message, which is why a
 * minimal fake is enough to drive roll resolution against real session state.
 * The `customId` picks the roll variant (e.g. 'sc_roll', 'sc_roll_m:0',
 * 'sc_adv_m:3').
 */
export interface FakeButton {
  /** The synthesized interaction to hand to the roll handler. */
  interaction: import('discord.js').ButtonInteraction;
  /** Payloads passed to `interaction.update()`, in call order. */
  updates: unknown[];
}
|
||||
|
||||
/**
 * Build a fake ButtonInteraction for `customId` on the given thread.
 *
 * The fake reports itself as a button (not a modal submit or select menu),
 * records every `update()` payload, and accepts `reply()` without recording
 * it. Both methods resolve to an empty object.
 *
 * @param channel  Thread exposed as `interaction.channel`.
 * @param customId Button id exposed as `interaction.customId`.
 */
export function fakeButton(channel: ThreadChannel, customId: string): FakeButton {
  const updates: unknown[] = [];

  const recordUpdate = async (payload: unknown) => {
    updates.push(payload);
    return {};
  };
  const acknowledge = async (_payload: unknown) => ({});

  const stub = {
    isButton: () => true,
    isModalSubmit: () => false,
    isStringSelectMenu: () => false,
    customId,
    channel,
    update: recordUpdate,
    reply: acknowledge,
  };

  return {
    interaction: stub as unknown as import('discord.js').ButtonInteraction,
    updates,
  };
}
|
||||
59
tests/integration/graphmcp/support/liveBots.ts
Normal file
59
tests/integration/graphmcp/support/liveBots.ts
Normal file
@@ -0,0 +1,59 @@
|
||||
// Real connected discord.js Client fixtures.
|
||||
//
|
||||
// This suite deliberately exercises the REAL Discord gateway (no message mocks
|
||||
// on the under-test bot). Two clients are involved:
|
||||
// - botClient : the bot under test, logged in with DISCORD_TOKEN, used both
|
||||
// as the `client` passed to command.execute() / handleMessage()
|
||||
// and to fetch real channel/thread objects.
|
||||
// - driverBot : a SECOND bot (E2E_DRIVER_TOKEN) that posts real chat messages
|
||||
// into the encounter thread, firing the bot's real messageCreate
|
||||
// path through the live gateway. (Bots cannot invoke each other's
|
||||
// slash commands, so this is how we drive conversation turns.)
|
||||
//
|
||||
// Requires in env:
|
||||
// DISCORD_TOKEN — token for the bot under test
|
||||
// E2E_DRIVER_TOKEN — token for the driver bot
|
||||
// E2E_TEST_GUILD_ID — the dedicated test guild
|
||||
// E2E_TEST_CHANNEL_ID — the channel to start encounters in
|
||||
//
|
||||
// All four are only needed for AC2–AC4 (RUN_FULL_E2E=1). AC1 needs none of them.
|
||||
|
||||
import { Client, GatewayIntentBits, type TextChannel, type Guild } from 'discord.js';
|
||||
|
||||
/** The connected fixtures returned by connectLiveBots. */
export interface LiveBots {
  /** Client logged in with DISCORD_TOKEN (the bot under test). */
  botClient: Client;
  /** Client logged in with E2E_DRIVER_TOKEN (the driver bot). */
  driverBot: Client;
  /** Guild fetched from E2E_TEST_GUILD_ID. */
  guild: Guild;
  /** Text channel fetched from E2E_TEST_CHANNEL_ID. */
  channel: TextChannel;
}
|
||||
|
||||
/**
 * Log in both Discord clients and resolve the test guild and channel.
 *
 * Reads DISCORD_TOKEN, E2E_DRIVER_TOKEN, E2E_TEST_GUILD_ID and
 * E2E_TEST_CHANNEL_ID from the environment and fails fast, before any network
 * activity, when one is missing.
 *
 * If anything fails after the clients are created (a login, the guild or
 * channel fetch, or the channel-type check), both clients are destroyed
 * before the error is rethrown, so a failed setup does not leave a client
 * logged in.
 *
 * @returns The two connected clients plus the fetched guild and channel.
 * @throws Error when a required env var is missing, a login or fetch fails,
 *         or the channel is not a non-thread text-based channel.
 */
export async function connectLiveBots(): Promise<LiveBots> {
  const botToken = process.env.DISCORD_TOKEN;
  const driverToken = process.env.E2E_DRIVER_TOKEN;
  const guildId = process.env.E2E_TEST_GUILD_ID;
  const channelId = process.env.E2E_TEST_CHANNEL_ID;
  for (const [k, v] of [
    ['DISCORD_TOKEN', botToken],
    ['E2E_DRIVER_TOKEN', driverToken],
    ['E2E_TEST_GUILD_ID', guildId],
    ['E2E_TEST_CHANNEL_ID', channelId],
  ] as const) {
    if (!v) throw new Error(`Live E2E requires env ${k} (set, or unset RUN_FULL_E2E).`);
  }

  // Both clients use the same intents; each gets its own options object.
  const createClient = () =>
    new Client({
      intents: [GatewayIntentBits.Guilds, GatewayIntentBits.GuildMessages, GatewayIntentBits.MessageContent],
    });
  const botClient = createClient();
  const driverBot = createClient();

  try {
    // Wait for BOTH logins to settle before reacting to a failure, so the
    // teardown below never races a login that is still in flight.
    const logins = await Promise.allSettled([botClient.login(botToken!), driverBot.login(driverToken!)]);
    const failed = logins.find((r): r is PromiseRejectedResult => r.status === 'rejected');
    if (failed) throw failed.reason;

    const guild = await botClient.guilds.fetch(guildId!);
    const channel = (await botClient.channels.fetch(channelId!)) as TextChannel;
    if (!channel?.isTextBased() || channel.isThread()) {
      throw new Error(`E2E_TEST_CHANNEL_ID must resolve to a guild text channel.`);
    }
    return { botClient, driverBot, guild, channel };
  } catch (err) {
    // Release both clients; teardown errors must not replace the real cause.
    await Promise.allSettled([botClient, driverBot].map(async (client) => client.destroy()));
    throw err;
  }
}
|
||||
|
||||
/**
 * Destroy both clients. A rejected destroy on one client does not prevent
 * waiting for the other, and no rejection is propagated.
 */
export async function disconnectLiveBots(b: LiveBots): Promise<void> {
  const { botClient, driverBot } = b;
  const teardowns = [botClient, driverBot].map(client => client.destroy());
  await Promise.allSettled(teardowns);
}
|
||||
54
tests/integration/graphmcp/support/poll.ts
Normal file
54
tests/integration/graphmcp/support/poll.ts
Normal file
@@ -0,0 +1,54 @@
|
||||
// Polling helpers for live-infrastructure tests, where outcomes are
|
||||
// eventually consistent: an LLM turn takes seconds to land, and a freshly
|
||||
// written GraphMCP event is not guaranteed to be readable on the very next
|
||||
// read (read-after-write eventual consistency). Assert on structure, poll
|
||||
// for the condition, never assert on a single instantaneous sample.
|
||||
|
||||
/** Timing knobs shared by the polling helpers. */
export interface PollOptions {
  /** Overall time budget in milliseconds (helpers default to 60_000). */
  timeoutMs?: number;
  /** Pause between attempts in milliseconds (helpers default to 1_000). */
  intervalMs?: number;
}
|
||||
|
||||
/**
 * Poll `fn` until it yields a truthy value and resolve with that value.
 *
 * `fn` is always called at least once. A throwing `fn` counts as "not yet":
 * the error is remembered and polling continues. The deadline is checked
 * after each attempt, so the final attempt may finish slightly past it.
 *
 * @param fn   Condition to evaluate; may be sync or async.
 * @param opts `timeoutMs` (default 60_000) and `intervalMs` (default 1_000).
 * @returns The first truthy value produced by `fn`.
 * @throws Error once the deadline has passed. The message says whether `fn`
 *         kept throwing or simply never returned a truthy value; in the
 *         former case the most recent error is attached as `cause` so its
 *         stack is not lost.
 */
export async function waitFor<T>(
  fn: () => Promise<T> | T,
  opts: PollOptions = {},
): Promise<T> {
  const timeoutMs = opts.timeoutMs ?? 60_000;
  const intervalMs = opts.intervalMs ?? 1_000;
  const deadline = Date.now() + timeoutMs;
  let lastErr: unknown;
  for (;;) {
    try {
      const v = await fn();
      if (v) return v;
    } catch (err) {
      lastErr = err;
    }
    if (Date.now() >= deadline) {
      const sawError = lastErr !== undefined;
      const detail = sawError
        ? `last error: ${String(lastErr)}`
        : 'condition never returned a truthy value';
      throw new Error(
        `waitFor timed out after ${timeoutMs}ms; ${detail}`,
        sawError ? { cause: lastErr } : undefined,
      );
    }
    await new Promise(r => setTimeout(r, intervalMs));
  }
}
|
||||
|
||||
/**
 * Re-run `fn` until it completes without throwing.
 *
 * `fn` is always attempted at least once. When an attempt throws after the
 * deadline has passed, that error is rethrown unchanged; otherwise the helper
 * pauses for `intervalMs` and tries again.
 *
 * @param fn   Assertion-style callback; may be sync or async.
 * @param opts `timeoutMs` (default 60_000) and `intervalMs` (default 1_000).
 */
export async function untilStable(
  fn: () => Promise<void> | void,
  opts: PollOptions = {},
): Promise<void> {
  const budgetMs = opts.timeoutMs ?? 60_000;
  const pauseMs = opts.intervalMs ?? 1_000;
  const giveUpAt = Date.now() + budgetMs;
  const pause = () => new Promise(resolve => setTimeout(resolve, pauseMs));

  while (true) {
    try {
      await fn();
      return;
    } catch (failure) {
      const expired = Date.now() >= giveUpAt;
      if (expired) throw failure;
    }
    await pause();
  }
}
|
||||
@@ -1,4 +1,4 @@
|
||||
import { vi, describe, it, expect } from 'vitest';
|
||||
import { vi, describe, it, expect, afterEach } from 'vitest';
|
||||
|
||||
vi.mock('../../src/config.js', () => ({
|
||||
config: {
|
||||
@@ -7,7 +7,7 @@ vi.mock('../../src/config.js', () => ({
|
||||
},
|
||||
}));
|
||||
|
||||
import { formatNPCMemory } from '../../src/graphmcp/client.js';
|
||||
import { formatNPCMemory, semanticSearch, listEncounters, queryAsNPC } from '../../src/graphmcp/client.js';
|
||||
import type { NPCQueryResult } from '../../src/graphmcp/client.js';
|
||||
|
||||
const emptyResult: NPCQueryResult = {
|
||||
@@ -93,3 +93,139 @@ describe('formatNPCMemory', () => {
|
||||
expect(matchCount).toBeLessThanOrEqual(3);
|
||||
});
|
||||
});
|
||||
|
||||
// Wrap `payload` in a stubbed fetch Response carrying a GraphMCP JSON-RPC
// envelope: the tool-result text is JSON.stringify(payload). Per the original
// note, callTool parses json.result.content[0].text, so this feeds arbitrary
// tool-result shapes to the public client functions.
function rpcEnvelope(payload: unknown): Response {
  // Serialized on each json() call, not when the envelope is built.
  const json = async () => {
    const toolText = { type: 'text', text: JSON.stringify(payload) };
    return { jsonrpc: '2.0', result: { content: [toolText] } };
  };
  const stub = { ok: true, status: 200, json };
  return stub as unknown as Response;
}
|
||||
|
||||
describe('semanticSearch response normalization', () => {
  afterEach(() => vi.unstubAllGlobals());

  // Route the global fetch to a canned GraphMCP tool result.
  const respondWith = (payload: unknown) =>
    vi.stubGlobal('fetch', vi.fn(async () => rpcEnvelope(payload)));

  // Regression: /encounter generate crashed with "Cannot read properties of
  // undefined (reading 'length')" when a successful GraphMCP response had a
  // missing or null `chunks` field. The call site's
  // `.catch(() => ({ chunks: [] }))` only handles rejections, not a
  // wrong-shaped success.
  it('returns [] when chunks is null (no crash on .length)', async () => {
    respondWith({ chunks: null });
    const { chunks } = await semanticSearch('q', 5);
    expect(chunks).toEqual([]);
  });

  it('returns [] when the response has no chunks field', async () => {
    respondWith({ results: [{ content: 'x' }] });
    const { chunks } = await semanticSearch('q', 5);
    expect(chunks).toEqual([]);
  });

  it('returns [] when GraphMCP returns null', async () => {
    respondWith(null);
    const { chunks } = await semanticSearch('q', 5);
    expect(chunks).toEqual([]);
  });

  it('accepts a bare array as the chunks', async () => {
    respondWith([{ content: 'a', score: 1 }]);
    const { chunks } = await semanticSearch('q', 5);
    expect(chunks).toHaveLength(1);
    expect(chunks[0].content).toBe('a');
  });

  it('preserves a well-formed { chunks: [...] } response', async () => {
    respondWith({
      chunks: [{ content: 'a', score: 0.9 }, { content: 'b', score: 0.8 }],
    });
    const { chunks } = await semanticSearch('q', 5);
    expect(chunks).toHaveLength(2);
  });
});
|
||||
|
||||
describe('listEncounters response normalization', () => {
  afterEach(() => vi.unstubAllGlobals());

  // Route the global fetch to a canned GraphMCP tool result.
  const respondWith = (payload: unknown) =>
    vi.stubGlobal('fetch', vi.fn(async () => rpcEnvelope(payload)));

  it('returns [] for a non-array response instead of leaking the wrong shape', async () => {
    respondWith({ encounters: [{ id: '1' }] });
    const encounters = await listEncounters(5);
    expect(encounters).toEqual([]);
  });

  it('returns the array when GraphMCP returns one', async () => {
    respondWith([
      { id: '1', title: 't', location: 'l', timestamp: '', summary: 's' },
    ]);
    const encounters = await listEncounters(5);
    expect(encounters).toHaveLength(1);
  });
});
|
||||
|
||||
// Regression: live GraphMCP chunks are shaped
// { text, score, source, author, timestamp, msgID } — there is no `content`
// field. The client's SemanticChunk type and its callers (encounter.ts
// handleGenerate does `c.content.slice(...)`, mentionHandler reads
// `c.content`) expect `.content`. Without mapping at the boundary `.content`
// is undefined and `c.content.slice` throws the same "Cannot read properties
// of undefined" class of error as the loreResult.chunks crash, so
// semanticSearch must map text→content.
describe('semanticSearch chunk field mapping (live shape: text, not content)', () => {
  afterEach(() => vi.unstubAllGlobals());

  // Route the global fetch to a canned GraphMCP tool result.
  const respondWith = (payload: unknown) =>
    vi.stubGlobal('fetch', vi.fn(async () => rpcEnvelope(payload)));

  it('maps the live `text` field to the declared `content` field', async () => {
    const liveChunk = {
      text: 'tell me about Mardonar',
      score: 0.84,
      source: 'message',
      author: 'sirhaxolot',
      timestamp: '2026-05-26T03:06:18Z',
      msgID: '1508667570604081356',
    };
    respondWith([liveChunk]);
    const { chunks } = await semanticSearch('q', 5);
    expect(chunks).toHaveLength(1);
    expect(chunks[0].content).toBe('tell me about Mardonar');
    expect(chunks[0].score).toBe(0.84);
    expect(chunks[0].source).toBe('message');
  });

  it('falls back to `content` when a chunk uses the declared field name', async () => {
    respondWith([{ content: 'legacy', score: 0.5 }]);
    const { chunks } = await semanticSearch('q', 5);
    expect(chunks[0].content).toBe('legacy');
  });

  it('coerces a chunk missing both text and content to an empty string (no crash)', async () => {
    respondWith([{ score: 0.5 }]);
    const { chunks } = await semanticSearch('q', 5);
    expect(chunks[0].content).toBe('');
    expect(chunks[0].score).toBe(0.5);
  });
});
|
||||
|
||||
// Regression: for NPCs without prior memory the live GraphMCP backend returns
// `chunks: null` (and sometimes `graph_context: null`). The raw
// `as NPCQueryResult` cast let those nulls through, while the contract is
// arrays.
describe('queryAsNPC null-array normalization', () => {
  afterEach(() => vi.unstubAllGlobals());

  it('coerces null chunks and graph_context to empty arrays', async () => {
    const emptyMemoryResponse = {
      npc: 'miriam-merchant-mardonar',
      tier: 'local',
      horizon_count: 0,
      chunks: null,
      graph_context: null,
    };
    vi.stubGlobal('fetch', vi.fn(async () => rpcEnvelope(emptyMemoryResponse)));

    const memory = await queryAsNPC('miriam-merchant-mardonar', 'recent events', 5);

    expect(Array.isArray(memory.chunks)).toBe(true);
    expect(memory.chunks).toEqual([]);
    expect(Array.isArray(memory.graph_context)).toBe(true);
    expect(memory.npc).toBe('miriam-merchant-mardonar');
    expect(memory.horizon_count).toBe(0);
  });
});
|
||||
|
||||
Reference in New Issue
Block a user