Files
lore-engine-poc/examples/test_e2e.sh
kanban-dev 99535a8f3a docs(v2): T8 — update README + CHANGELOG + 3 worked-example docs
- README.md: 5 plugins / 19 tools (matches /healthz); 'what this proves'
  now lists consistency engine, multi-world namespace, LLM consumer;
  'next steps' section replaced with 'shipped in v2'
- docs/CONSISTENCY_DEMO.md: 4 tools, 5 violations, all output verified
  against live bash examples/test_consistency.sh
- docs/MULTI_WORLD_DEMO.md: list_worlds() + entity_context in both
  worlds + cross-world isolation tests, all output verified live
- docs/LLM_CONSUMER_DEMO.md: 5 question types, 9 distinct tools, all
  output traced to examples/results/*.json
- CHANGELOG.md: v1 -> v2 entry, all 9 task refs (T1-T9)
- examples/test_e2e.sh: T7 E2E validation script (untracked)
2026-06-17 00:45:30 +00:00

321 lines
15 KiB
Bash
Executable File

#!/usr/bin/env bash
# test_e2e.sh — End-to-end validation for v2.T7.
#
# What this proves (per task body):
# 1. The LLM consumer works end-to-end (5 question types)
# 2. The consistency tools find the right violations (5 seeded)
# 3. The LLM's answers match the seed-data ground truth
#
# Two independent layers:
# A. Direct tool calls — each of the 4 consistency tools is invoked
# against the live gateway and the violation count + ids are asserted
# against the table in examples/GROUND_TRUTH.md. This proves the
# tools work regardless of LLM behaviour.
# B. LLM consumer — for each of 5 question types, drive the LLM through
# the gateway, then assert the answer contains the expected facts
# (names, dates, severities). This proves the LLM consumer works.
#
# The script exits 0 only if EVERY check passes.
# Shell options: -u rejects unset variables, pipefail makes a pipeline fail if
# any stage fails. Note that -e is NOT set, so a failing command does not stop
# the script; the checks below record failures through ok/fail instead.
set -uo pipefail

# Work from the script's own directory so that the relative paths used below
# (results/, llm_consumer.py) resolve. Because -e is off, a failed cd or mkdir
# must be checked explicitly, otherwise the run would continue elsewhere.
script_dir=$(dirname -- "$0")
cd -- "$script_dir" || {
  echo "cannot cd to script directory: $script_dir" >&2
  exit 1
}
mkdir -p results || {
  echo "cannot create results/ under $PWD" >&2
  exit 1
}

# Endpoints and model name; each can be overridden from the environment.
# They are exported so that child processes can read them.
GATEWAY_URL="${GATEWAY_URL:-http://localhost:8765/mcp}"
LITELLM_URL="${LITELLM_URL:-http://localhost:4000/v1}"
LITELLM_MODEL="${LITELLM_MODEL:-minimax-m3}"
export GATEWAY_URL LITELLM_URL LITELLM_MODEL
# ─── bookkeeping ──────────────────────────────────────────────────────────────
# Running tallies. FAIL_DETAILS keeps every failure message so that the final
# summary can list them.
passes=0
fails=0
declare -a FAIL_DETAILS=()

# ok <message>: count one passing check and print the message.
ok() {
  passes=$((passes + 1))
  echo "$1"
}

# fail <message>: count one failing check, remember the message, print it.
fail() {
  fails=$((fails + 1))
  FAIL_DETAILS+=("$1")
  echo "$1"
}

# section <words...>: print a blank line followed by a banner built from all
# arguments.
section() {
  echo
  echo "── $* ──"
}
# ─── pre-flight ──────────────────────────────────────────────────────────────
section "pre-flight: gateway + LiteLLM reachable"

# Probe 1: the gateway must answer a JSON-RPC tools/list POST within 5 seconds.
# Only curl's exit status is inspected; the response body is discarded.
curl -s --max-time 5 -X POST "$GATEWAY_URL" \
  -H "Content-Type: application/json" \
  -d '{"jsonrpc":"2.0","id":1,"method":"tools/list"}' >/dev/null 2>&1
gateway_rc=$?
if [ "$gateway_rc" -ne 0 ]; then
  fail "gateway unreachable at $GATEWAY_URL"
  echo
  echo "PRE-FLIGHT FAILED — aborting."
  exit 1
fi
ok "gateway responds at $GATEWAY_URL"

# Probe 2: LiteLLM must answer GET <LITELLM_URL>/models within 5 seconds.
curl -s --max-time 5 "$LITELLM_URL/models" >/dev/null 2>&1
litellm_rc=$?
if [ "$litellm_rc" -ne 0 ]; then
  fail "LiteLLM unreachable at $LITELLM_URL"
  echo
  echo "PRE-FLIGHT FAILED — aborting."
  exit 1
fi
ok "LiteLLM responds at $LITELLM_URL"
# ─── Layer A: direct consistency-tool calls ──────────────────────────────────
# Helper: invoke a gateway tool through JSON-RPC `tools/call` and print the
# `text` of the first content item of the result.
# Args:    <tool_name> <arguments_json>
#          arguments_json is inserted verbatim into the request, so it must
#          already be valid JSON.
# Env:     GATEWAY_URL        endpoint to POST to
#          CALL_TOOL_TIMEOUT  curl --max-time in seconds (default 120), so a
#                             stalled gateway cannot hang the whole run
# Outputs: result.content[0].text on stdout. On failure nothing is written to
#          stdout and a one-line diagnostic (including any JSON-RPC `error`
#          object the gateway returned) goes to stderr.
# Returns: 0 on success, non-zero when the response cannot be parsed or has no
#          result text.
call_tool() {
  local name=$1
  local args=$2
  local payload
  payload="{\"jsonrpc\":\"2.0\",\"id\":1,\"method\":\"tools/call\",\"params\":{\"name\":\"$name\",\"arguments\":$args}}"
  curl -s --max-time "${CALL_TOOL_TIMEOUT:-120}" -X POST "$GATEWAY_URL" \
    -H "Content-Type: application/json" \
    -d "$payload" \
    | python3 -c '
import json, sys

tool = sys.argv[1]
raw = sys.stdin.read()
try:
    envelope = json.loads(raw)
except ValueError:
    sys.exit("call_tool(%s): empty or non-JSON response: %r" % (tool, raw[:200]))
try:
    print(envelope["result"]["content"][0]["text"])
except (KeyError, IndexError, TypeError):
    detail = envelope.get("error", envelope) if isinstance(envelope, dict) else envelope
    sys.exit("call_tool(%s): no result text in response: %s" % (tool, json.dumps(detail)))
' "$name"
}
# Helper: assert a tool's violation count + ids.
# Args: <tool_name> <args_json> <expected_count> [expected_id_1 ...]
# Calls the tool once through call_tool, compares the response's `count` field
# with expected_count, and, only when the count matches, checks that each
# expected id appears among the `id` fields of `violations`. Every comparison
# is recorded through ok/fail.
assert_violations() {
  local tool=$1; shift
  local args=$1; shift
  local expected_count=$1; shift
  local resp got_count want
  resp=$(call_tool "$tool" "$args")
  got_count=$(printf '%s' "$resp" \
    | python3 -c 'import json,sys; print(json.load(sys.stdin)["count"])')
  if [ "$got_count" != "$expected_count" ]; then
    fail "$tool: count=$got_count (expected $expected_count) — full response: $resp"
    return
  fi
  ok "$tool: count=$got_count (expected $expected_count)"
  for want in "$@"; do
    # The id is handed to Python as an argument instead of being spliced into
    # the program text, so an id containing a quote or backslash cannot break
    # the check. A parse error in the response counts as "missing".
    if printf '%s' "$resp" | python3 -c '
import json, sys
ids = [v["id"] for v in json.load(sys.stdin)["violations"]]
sys.exit(0 if sys.argv[1] in ids else 1)
' "$want" 2>/dev/null; then
      ok "$tool: contains id=$want"
    else
      fail "$tool: missing id=$want (full response: $resp)"
    fi
  done
}
section "Layer A — direct consistency tool calls (no LLM)"
# One assertion per consistency tool. Arguments are: tool name, arguments
# JSON, expected violation count, then the ids that must be present.
assert_violations "find_contradictions" '{"severity":"any"}' 1 \
  c_aldric_double_membership
assert_violations "find_anachronisms" '{"severity":"any"}' 1 \
  a_vex_at_founding
assert_violations "find_orphans" '{}' 1 \
  o_unfinished_npc
assert_violations "find_ontology_violations" '{"severity":"any"}' 2 \
  ov_theron_no_died \
  ov_maric_no_died
# Severity breakdown — task body says "the orphan being a warning, not error".
section "Layer A — severity breakdown"

# violation_count <tool> <args_json>: print the `count` field of the tool's
# response. Prints nothing when the call or the parse fails, which the
# comparisons below then report as a mismatch.
violation_count() {
  call_tool "$1" "$2" \
    | python3 -c 'import json,sys; print(json.load(sys.stdin)["count"])'
}

# expect_eq <label> <got> <want>: record ok "<label> -> <want>" when the two
# values are equal, otherwise fail "<label> -> <got> (expected <want>)".
# An explicit if/else is used so the fail branch can never run after a
# successful ok.
expect_eq() {
  local label=$1 got=$2 want=$3
  if [ "$got" = "$want" ]; then
    ok "$label -> $want"
  else
    fail "$label -> $got (expected $want)"
  fi
}

contradictions_err=$(violation_count "find_contradictions" '{"severity":"error"}')
contradictions_warn=$(violation_count "find_contradictions" '{"severity":"warn"}')
expect_eq "find_contradictions severity=error" "$contradictions_err" 1
expect_eq "find_contradictions severity=warn" "$contradictions_warn" 0

anach_err=$(violation_count "find_anachronisms" '{"severity":"error"}')
anach_warn=$(violation_count "find_anachronisms" '{"severity":"warn"}')
expect_eq "find_anachronisms severity=error" "$anach_err" 1
expect_eq "find_anachronisms severity=warn" "$anach_warn" 0

# Orphans: 1 warn (the task body specifies this is a warn, not error).
# The severities of all returned violations are joined with commas, so the
# comparison only succeeds when there is exactly one violation and it is a warn.
orphan_severity=$(call_tool "find_orphans" '{}' \
  | python3 -c 'import json,sys; d=json.load(sys.stdin); print(",".join(v["severity"] for v in d["violations"]))')
if [ "$orphan_severity" = "warn" ]; then
  ok "find_orphans -> severity=warn (orphan is a warn, not error)"
else
  fail "find_orphans -> severity=[$orphan_severity] (expected 'warn')"
fi

# Ontology: 2 warn
ont_warn=$(violation_count "find_ontology_violations" '{"severity":"warn"}')
expect_eq "find_ontology_violations severity=warn" "$ont_warn" 2

# Total: sum error and warn violations over all four tools. Each tool is
# queried once and both tallies are taken from that single response, so the
# two numbers always describe the same snapshot.
total_err=0
total_warn=0
for t in find_contradictions find_anachronisms find_orphans find_ontology_violations; do
  args='{"severity":"any"}'
  # find_orphans is called with an empty argument object, as elsewhere in this script.
  [ "$t" = "find_orphans" ] && args='{}'
  e=''
  w=''
  read -r e w < <(call_tool "$t" "$args" \
    | python3 -c 'import json,sys; s=[v["severity"] for v in json.load(sys.stdin)["violations"]]; print(s.count("error"), s.count("warn"))')
  # A failed call leaves e/w empty; default to 0 so the arithmetic stays valid
  # and the shortfall is reported by the total checks below.
  total_err=$((total_err + ${e:-0}))
  total_warn=$((total_warn + ${w:-0}))
done
total=$((total_err + total_warn))
if [ "$total" = "5" ]; then
  ok "TOTAL violations = 5 (2 error + 3 warn)"
else
  fail "TOTAL violations = $total (expected 5)"
fi
if [ "$total_err" = "2" ]; then
  ok "TOTAL errors = 2"
else
  fail "TOTAL errors = $total_err (expected 2)"
fi
if [ "$total_warn" = "3" ]; then
  ok "TOTAL warns = 3"
else
  fail "TOTAL warns = $total_warn (expected 3)"
fi
# ─── Layer B: LLM consumer — 5 question types ────────────────────────────────
section "Layer B — LLM consumer (5 question types)"
# IDS and QS are parallel arrays: IDS[i] names the trace file for question QS[i].
declare -a IDS=(
  "q1_who_is_aldric"
  "q2_was_allied_230"
  "q3_aldric_ancestors"
  "q4_images_of_aldric"
  "q5_consistency_issues"
)
declare -a QS=(
  "Who is Aldric Raventhorne? Give a brief bio and his known relations."
  "Was House Vyr allied with the Merchants Guild at 2nd_age.year_230?"
  "What is the lineage / ancestry of Aldric Raventhorne? Walk back as far as you can."
  "Show me images of Aldric Raventhorne — portraits or otherwise."
  "What are the open consistency issues in the world graph right now? Check contradictions, anachronisms, orphans, and ontology violations."
)
for i in "${!IDS[@]}"; do
  id="${IDS[$i]}"
  q="${QS[$i]}"
  trace_file="results/${id}.json"
  log_file="/tmp/llm_consumer_${id}.log"
  echo
  echo "── question $((i+1))/${#IDS[@]}: $id ──"
  echo " Q: $q"
  # Remove any trace left by an earlier run, so the answer-level assertions
  # further down can only ever see output produced by this run.
  rm -f -- "$trace_file"
  if ! python3 llm_consumer.py --question-id "$id" --question "$q" \
    --out "$trace_file" >"$log_file" 2>&1; then
    fail "Q$((i+1)) ($id): llm_consumer.py exited non-zero — see $log_file"
    # Show the end of the log to give immediate context for the failure.
    tail -5 "$log_file" | sed 's/^/ /'
    continue
  fi
  tail -8 "$log_file"
  ok "Q$((i+1)) ($id): llm_consumer.py exit=0"
done
# ─── Answer-level assertions against GROUND_TRUTH.md ─────────────────────────
section "Layer B — answer-level assertions against GROUND_TRUTH.md"
# Helper: read a trace JSON file and print, in order:
#   1. the `answer` field lower-cased (an absent or null answer prints as an
#      empty line; an answer with embedded newlines spans several lines),
#   2. a line containing only `---TOOLS---`,
#   3. the `tool` field of every entry of `tools_called`, joined by commas.
# Args:    <trace_path>
# Returns: python3's exit status (non-zero if the file is missing or not JSON).
trace_info() {
  local trace_path=$1
  # The path is passed as an argument rather than spliced into the program
  # text, so a path containing quotes or backslashes cannot break the script.
  python3 -c '
import json, sys

with open(sys.argv[1]) as fh:
    d = json.load(fh)
ans = (d.get("answer") or "").lower()
tools = [t["tool"] for t in d.get("tools_called", [])]
print(ans)
print("---TOOLS---")
print(",".join(tools))
' "$trace_path"
}
# Q1: entity_context called, answer has Aldric + a known affiliation.
if [ -f "results/q1_who_is_aldric.json" ]; then
  trace=$(trace_info "results/q1_who_is_aldric.json")
  # trace_info prints the answer, then a `---TOOLS---` marker line, then the
  # tool CSV on its own line. Split with parameter expansion: a line-oriented
  # `awk -F'---TOOLS---' '{print $2}'` never finds the CSV in field 2 because
  # it sits on a different line from the marker, so the tool list came back
  # empty and the tools_called check could never pass.
  q1_ans=${trace%%---TOOLS---*}
  q1_tools=${trace##*---TOOLS---}
  q1_tools=${q1_tools#$'\n'}
  echo " Q1 tools: $q1_tools"
  if [[ "$q1_tools" == *entity_context* ]]; then
    ok "Q1: entity_context in tools_called"
  else
    fail "Q1: entity_context NOT called (got: $q1_tools)"
  fi
  if printf '%s' "$q1_ans" | grep -qi 'aldric'; then
    ok "Q1: answer mentions 'aldric'"
  else
    fail "Q1: answer missing 'aldric'"
  fi
  if printf '%s' "$q1_ans" | grep -Eqi 'vyr|thornwall|elara|valdorni|eventide'; then
    ok "Q1: answer mentions a known affiliation (Vyr/Thornwall/Elara/Valdorni/Eventide)"
  else
    fail "Q1: answer missing known affiliation"
  fi
else
  fail "Q1: results/q1_who_is_aldric.json missing (LLM consumer failed)"
fi
# Q2: was_true_at called, answer says YES/allied/true.
if [ -f "results/q2_was_allied_230.json" ]; then
  trace=$(trace_info "results/q2_was_allied_230.json")
  # trace_info prints the answer, then a `---TOOLS---` marker line, then the
  # tool CSV on its own line. Split with parameter expansion: a line-oriented
  # `awk -F'---TOOLS---' '{print $2}'` never finds the CSV in field 2 because
  # it sits on a different line from the marker, so the tool list came back
  # empty and the tools_called check could never pass.
  q2_ans=${trace%%---TOOLS---*}
  q2_tools=${trace##*---TOOLS---}
  q2_tools=${q2_tools#$'\n'}
  echo " Q2 tools: $q2_tools"
  if [[ "$q2_tools" == *was_true_at* ]]; then
    ok "Q2: was_true_at in tools_called"
  else
    fail "Q2: was_true_at NOT called (got: $q2_tools)"
  fi
  # Keyword match only: any of these words anywhere in the answer counts.
  if printf '%s' "$q2_ans" | grep -Eqi 'yes|allied|true|in force|was an alliance'; then
    ok "Q2: answer indicates YES/allied/true"
  else
    fail "Q2: answer missing YES/allied/true"
  fi
else
  fail "Q2: results/q2_was_allied_230.json missing (LLM consumer failed)"
fi
# Q3: ancestors_of called, answer names >=3 of {Theron, Maric, Cael, Yssa}.
if [ -f "results/q3_aldric_ancestors.json" ]; then
  trace=$(trace_info "results/q3_aldric_ancestors.json")
  # trace_info prints the answer, then a `---TOOLS---` marker line, then the
  # tool CSV on its own line. Split with parameter expansion: a line-oriented
  # `awk -F'---TOOLS---' '{print $2}'` never finds the CSV in field 2 because
  # it sits on a different line from the marker, so the tool list came back
  # empty and the tools_called check could never pass.
  q3_ans=${trace%%---TOOLS---*}
  q3_tools=${trace##*---TOOLS---}
  q3_tools=${q3_tools#$'\n'}
  echo " Q3 tools: $q3_tools"
  if [[ "$q3_tools" == *ancestors_of* ]]; then
    ok "Q3: ancestors_of in tools_called"
  else
    fail "Q3: ancestors_of NOT called (got: $q3_tools)"
  fi
  # Count how many of the four expected names occur in the answer.
  found=0
  for n in theron maric cael yssa; do
    if printf '%s' "$q3_ans" | grep -qi "$n"; then
      found=$((found+1))
    fi
  done
  if [ "$found" -ge 3 ]; then
    ok "Q3: answer names $found/4 canonical ancestors (need >=3)"
  else
    fail "Q3: answer names only $found/4 canonical ancestors (need >=3)"
  fi
else
  fail "Q3: results/q3_aldric_ancestors.json missing (LLM consumer failed)"
fi
# Q4: image-recall tool called, answer mentions Aldric + portrait/image/etc.
if [ -f "results/q4_images_of_aldric.json" ]; then
  trace=$(trace_info "results/q4_images_of_aldric.json")
  # trace_info prints the answer, then a `---TOOLS---` marker line, then the
  # tool CSV on its own line. Split with parameter expansion: a line-oriented
  # `awk -F'---TOOLS---' '{print $2}'` never finds the CSV in field 2 because
  # it sits on a different line from the marker, so the tool list came back
  # empty and the tools_called check could never pass.
  q4_ans=${trace%%---TOOLS---*}
  q4_tools=${trace##*---TOOLS---}
  q4_tools=${q4_tools#$'\n'}
  echo " Q4 tools: $q4_tools"
  # Any one of the three image tools satisfies the check.
  if [[ "$q4_tools" == *recall_images* || "$q4_tools" == *search_images_by_caption* || "$q4_tools" == *search_images_semantic* ]]; then
    ok "Q4: image-recall tool in tools_called"
  else
    fail "Q4: no image-recall tool called (got: $q4_tools)"
  fi
  if printf '%s' "$q4_ans" | grep -qi 'aldric'; then
    ok "Q4: answer mentions 'aldric'"
  else
    fail "Q4: answer missing 'aldric'"
  fi
  if printf '%s' "$q4_ans" | grep -Eqi 'portrait|image|presigned|thornwall'; then
    ok "Q4: answer mentions portrait/image/presigned/thornwall"
  else
    fail "Q4: answer missing portrait/image/presigned/thornwall"
  fi
else
  fail "Q4: results/q4_images_of_aldric.json missing (LLM consumer failed)"
fi
# Q5: all 4 consistency tools called; answer is NOT a "no issues" answer; mentions
# canonical subject names and severity.
if [ -f "results/q5_consistency_issues.json" ]; then
  trace=$(trace_info "results/q5_consistency_issues.json")
  # trace_info prints the answer, then a `---TOOLS---` marker line, then the
  # tool CSV on its own line. Split with parameter expansion: a line-oriented
  # `awk -F'---TOOLS---' '{print $2}'` never finds the CSV in field 2 because
  # it sits on a different line from the marker, so the tool list came back
  # empty and the tools_called check could never pass.
  q5_ans=${trace%%---TOOLS---*}
  q5_tools=${trace##*---TOOLS---}
  q5_tools=${q5_tools#$'\n'}
  echo " Q5 tools: $q5_tools"
  missing=()
  for t in find_contradictions find_anachronisms find_orphans find_ontology_violations; do
    [[ "$q5_tools" == *"$t"* ]] || missing+=("$t")
  done
  if [ ${#missing[@]} -eq 0 ]; then
    ok "Q5: all 4 consistency tools in tools_called"
  else
    fail "Q5: missing tools: ${missing[*]}"
  fi
  # Must NOT say "no issues" — there are 5 seeded violations.
  # The negation word must stand alone (non-alphanumeric on both sides), so
  # that words merely containing "no" — "not", "known", "note" — followed by
  # "issues"/"violations" do not trigger a false failure.
  if printf '%s' "$q5_ans" | grep -Eqi '(^|[^[:alnum:]])(no|zero|none)[^[:alnum:]].{0,30}(open |detected |current )?(consistency |open )?(issues|problems|violations)'; then
    fail "Q5: answer incorrectly says 'no issues' — but 5 violations are seeded"
  else
    ok "Q5: answer does NOT claim 'no issues' (correct — 5 violations seeded)"
  fi
  # Count how many of the expected subject names occur in the answer.
  subject_hits=0
  for n in aldric vex lyssa theron maric; do
    if printf '%s' "$q5_ans" | grep -qi "$n"; then
      subject_hits=$((subject_hits+1))
    fi
  done
  if [ "$subject_hits" -ge 2 ]; then
    ok "Q5: answer mentions $subject_hits canonical subjects (need >=2)"
  else
    fail "Q5: answer mentions only $subject_hits canonical subjects (need >=2)"
  fi
  if printf '%s' "$q5_ans" | grep -Eqi 'severity|warn|warning|error'; then
    ok "Q5: answer acknowledges severity (warn/error)"
  else
    fail "Q5: answer does not acknowledge severity"
  fi
else
  fail "Q5: results/q5_consistency_issues.json missing (LLM consumer failed)"
fi
# ─── summary ─────────────────────────────────────────────────────────────────
# Final report: print the tallies between two rules and exit 0 only when no
# check failed; otherwise list every recorded failure and exit 1.
rule="════════════════════════════════════════════════════════════"
echo
echo "$rule"
if ! [ "$fails" -eq 0 ]; then
  echo " FAIL — $passes checks passed, $fails FAILED:"
  for detail in "${FAIL_DETAILS[@]}"; do
    echo " - $detail"
  done
  echo "$rule"
  exit 1
fi
echo " PASS — $passes checks, 0 failures"
echo "$rule"
exit 0