- README.md: 5 plugins / 19 tools (matches /healthz); 'what this proves' now lists consistency engine, multi-world namespace, LLM consumer; 'next steps' section replaced with 'shipped in v2' - docs/CONSISTENCY_DEMO.md: 4 tools, 5 violations, all output verified against live bash examples/test_consistency.sh - docs/MULTI_WORLD_DEMO.md: list_worlds() + entity_context in both worlds + cross-world isolation tests, all output verified live - docs/LLM_CONSUMER_DEMO.md: 5 question types, 9 distinct tools, all output traced to examples/results/*.json - CHANGELOG.md: v1 -> v2 entry, all 9 task refs (T1-T9) - examples/test_e2e.sh: T7 E2E validation script (untracked)
321 lines
15 KiB
Bash
Executable File
321 lines
15 KiB
Bash
Executable File
#!/usr/bin/env bash
# test_e2e.sh — End-to-end validation for v2.T7.
#
# What this proves (per task body):
#   1. The LLM consumer works end-to-end (5 question types)
#   2. The consistency tools find the right violations (5 seeded)
#   3. The LLM's answers match the seed-data ground truth
#
# Two independent layers:
#   A. Direct tool calls — each of the 4 consistency tools is invoked
#      against the live gateway and the violation count + ids are asserted
#      against the table in examples/GROUND_TRUTH.md. This proves the
#      tools work regardless of LLM behaviour.
#   B. LLM consumer — for each of 5 question types, drive the LLM through
#      the gateway, then assert the answer contains the expected facts
#      (names, dates, severities). This proves the LLM consumer works.
#
# The script exits 0 only if EVERY check passes.
#
# Environment (all optional, defaults assigned below):
#   GATEWAY_URL, LITELLM_URL, LITELLM_MODEL

# -e is not enabled: a failing check must be recorded and the run must go on,
# so individual commands are tested explicitly instead.
set -uo pipefail

# Everything below uses paths relative to this script's directory
# (results/, llm_consumer.py), so refuse to continue from anywhere else.
cd "$(dirname "$0")" || {
  echo "cannot cd to script directory" >&2
  exit 1
}
mkdir -p results || {
  echo "cannot create results directory" >&2
  exit 1
}

GATEWAY_URL="${GATEWAY_URL:-http://localhost:8765/mcp}"
LITELLM_URL="${LITELLM_URL:-http://localhost:4000/v1}"
LITELLM_MODEL="${LITELLM_MODEL:-minimax-m3}"
# Exported so child processes started by this script inherit them.
export GATEWAY_URL LITELLM_URL LITELLM_MODEL

# ─── bookkeeping ──────────────────────────────────────────────────────────────

# Running tallies for the whole run. FAIL_DETAILS keeps every failure message
# in the order it was reported.
fails=0
passes=0
declare -a FAIL_DETAILS=()

# ok <message>: count one passing check and print it with a ✓ marker.
ok() {
  passes=$((passes + 1))
  echo " ✓ $1"
}

# fail <message>: count one failing check, remember its message, and print it
# with a ✗ marker. Does not abort the script.
fail() {
  fails=$((fails + 1))
  FAIL_DETAILS+=("$1")
  echo " ✗ $1"
}

# section <words...>: print a blank line followed by a section banner.
section() {
  echo
  echo "── $* ──"
}

# ─── pre-flight ──────────────────────────────────────────────────────────────

section "pre-flight: gateway + LiteLLM reachable"

# Both services get 5 seconds to answer. Only curl's exit status is checked;
# curl is run without -f, so an HTTP error status still counts as "responds".
# If either service is unreachable the remaining checks are skipped and the
# script exits 1 immediately.
if curl -s --max-time 5 -X POST "$GATEWAY_URL" -H "Content-Type: application/json" \
  -d '{"jsonrpc":"2.0","id":1,"method":"tools/list"}' >/dev/null 2>&1; then
  ok "gateway responds at $GATEWAY_URL"
else
  fail "gateway unreachable at $GATEWAY_URL"
  echo
  echo "PRE-FLIGHT FAILED — aborting."
  exit 1
fi

if curl -s --max-time 5 "$LITELLM_URL/models" >/dev/null 2>&1; then
  ok "LiteLLM responds at $LITELLM_URL"
else
  fail "LiteLLM unreachable at $LITELLM_URL"
  echo
  echo "PRE-FLIGHT FAILED — aborting."
  exit 1
fi

# ─── Layer A: direct consistency-tool calls ──────────────────────────────────

# call_tool <tool_name> <args_json>
#
# POSTs a JSON-RPC "tools/call" request to $GATEWAY_URL and prints the text of
# the first content item of the result (result.content[0].text) on stdout.
#
# Arguments:
#   $1 - tool name; inserted verbatim into the request body, so it must not
#        contain characters that need JSON escaping
#   $2 - tool arguments as a JSON value; inserted verbatim
# Returns:
#   non-zero, with nothing on stdout and a one-line message on stderr, when
#   the response is not JSON or does not have the expected result shape
#   (for example a JSON-RPC error envelope).
call_tool() {
  local name=$1
  local args=$2
  curl -s -X POST "$GATEWAY_URL" -H "Content-Type: application/json" \
    -d "{\"jsonrpc\":\"2.0\",\"id\":1,\"method\":\"tools/call\",\"params\":{\"name\":\"$name\",\"arguments\":$args}}" \
    | python3 -c '
import json, sys
try:
    envelope = json.load(sys.stdin)
    print(envelope["result"]["content"][0]["text"])
except (ValueError, KeyError, IndexError, TypeError) as exc:
    sys.stderr.write("call_tool: unexpected gateway response: %r\n" % (exc,))
    sys.exit(1)
'
}

# assert_violations <tool_name> <args_json> <expected_count> [expected_id ...]
#
# Calls the tool through call_tool, then records:
#   - one ok/fail for the response's "count" field against <expected_count>;
#   - if the count matched, one ok/fail per expected id, depending on whether
#     that id appears among the "id" fields of the response's "violations".
# On a count mismatch the id checks are skipped and the full response is
# included in the failure message.
assert_violations() {
  local tool=$1; shift
  local args=$1; shift
  local expected_count=$1; shift
  local resp got_count want

  resp=$(call_tool "$tool" "$args")
  got_count=$(printf '%s' "$resp" \
    | python3 -c "import json,sys; print(json.load(sys.stdin)['count'])")

  if [ "$got_count" = "$expected_count" ]; then
    ok "$tool: count=$got_count (expected $expected_count)"
  else
    fail "$tool: count=$got_count (expected $expected_count) — full response: $resp"
    return 0
  fi

  for want in "$@"; do
    # The wanted id is passed as an argument rather than pasted into the
    # Python source, so ids containing quotes or backslashes compare correctly.
    # stderr is silenced: a response that cannot be parsed simply counts as
    # "id not found".
    if printf '%s' "$resp" | python3 -c '
import json, sys
ids = [v["id"] for v in json.load(sys.stdin)["violations"]]
sys.exit(0 if sys.argv[1] in ids else 1)
' "$want" 2>/dev/null; then
      ok "$tool: contains id=$want"
    else
      fail "$tool: missing id=$want (full response: $resp)"
    fi
  done
}

section "Layer A — direct consistency tool calls (no LLM)"

# One line per consistency tool: arguments, expected violation count, and the
# violation ids that must be present in the response.
assert_violations "find_contradictions" '{"severity":"any"}' 1 c_aldric_double_membership
assert_violations "find_anachronisms" '{"severity":"any"}' 1 a_vex_at_founding
assert_violations "find_orphans" '{}' 1 o_unfinished_npc
assert_violations "find_ontology_violations" '{"severity":"any"}' 2 ov_theron_no_died ov_maric_no_died

# Severity breakdown — task body says "the orphan being a warning, not error".
section "Layer A — severity breakdown"

# check_severity_count <tool> <severity> <expected_count>
# Calls <tool> with {"severity": <severity>} and records ok/fail depending on
# whether the response's "count" field equals <expected_count>.
check_severity_count() {
  local tool=$1 severity=$2 expected=$3
  local got
  got=$(call_tool "$tool" "{\"severity\":\"$severity\"}" \
    | python3 -c "import json,sys; print(json.load(sys.stdin)['count'])")
  if [ "$got" = "$expected" ]; then
    ok "$tool severity=$severity -> $expected"
  else
    fail "$tool severity=$severity -> $got (expected $expected)"
  fi
}

check_severity_count find_contradictions error 1
check_severity_count find_contradictions warn 0
check_severity_count find_anachronisms error 1
check_severity_count find_anachronisms warn 0

# Orphans: 1 warn (the task body specifies this is a warn, not error).
# The severities of all returned violations are joined with commas, so the
# comparison only succeeds when there is exactly one violation and it is "warn".
orphan_severity=$(call_tool "find_orphans" '{}' \
  | python3 -c "import json,sys; d=json.load(sys.stdin); print(','.join(v['severity'] for v in d['violations']))")
if [ "$orphan_severity" = "warn" ]; then
  ok "find_orphans -> severity=warn (orphan is a warn, not error)"
else
  fail "find_orphans -> severity=[$orphan_severity] (expected 'warn')"
fi

# Ontology: 2 warn
check_severity_count find_ontology_violations warn 2

# Total: add up error and warn violations across all four tools.
total_err=0
total_warn=0
for t in find_contradictions find_anachronisms find_orphans find_ontology_violations; do
  # find_orphans is called with an empty argument object; the others are asked
  # for every severity.
  args='{"severity":"any"}'
  if [ "$t" = "find_orphans" ]; then
    args='{}'
  fi

  # One call per tool; both tallies come from the same response.
  tally=$(call_tool "$t" "$args" \
    | python3 -c "import json,sys; sev=[v['severity'] for v in json.load(sys.stdin)['violations']]; print(sev.count('error'), sev.count('warn'))")
  read -r e w <<<"$tally"

  # An unparseable response leaves the tallies empty; report it instead of
  # feeding an empty operand to the arithmetic below.
  if ! [[ "$e" =~ ^[0-9]+$ && "$w" =~ ^[0-9]+$ ]]; then
    fail "$t: could not tally violation severities"
    continue
  fi

  total_err=$((total_err + e))
  total_warn=$((total_warn + w))
done
total=$((total_err + total_warn))

if [ "$total" = "5" ]; then
  ok "TOTAL violations = 5 (2 error + 3 warn)"
else
  fail "TOTAL violations = $total (expected 5)"
fi
if [ "$total_err" = "2" ]; then
  ok "TOTAL errors = 2"
else
  fail "TOTAL errors = $total_err (expected 2)"
fi
if [ "$total_warn" = "3" ]; then
  ok "TOTAL warns = 3"
else
  fail "TOTAL warns = $total_warn (expected 3)"
fi

# ─── Layer B: LLM consumer — 5 question types ────────────────────────────────

section "Layer B — LLM consumer (5 question types)"

# IDS and QS are parallel arrays: QS[i] is the question text for IDS[i].
# Each id is also the basename of that question's trace file under results/.
declare -a IDS=(
  "q1_who_is_aldric"
  "q2_was_allied_230"
  "q3_aldric_ancestors"
  "q4_images_of_aldric"
  "q5_consistency_issues"
)
declare -a QS=(
  "Who is Aldric Raventhorne? Give a brief bio and his known relations."
  "Was House Vyr allied with the Merchants Guild at 2nd_age.year_230?"
  "What is the lineage / ancestry of Aldric Raventhorne? Walk back as far as you can."
  "Show me images of Aldric Raventhorne — portraits or otherwise."
  "What are the open consistency issues in the world graph right now? Check contradictions, anachronisms, orphans, and ontology violations."
)

for i in "${!IDS[@]}"; do
  id="${IDS[$i]}"
  q="${QS[$i]}"
  consumer_log="/tmp/llm_consumer_${id}.log"

  echo
  echo "── question $((i+1))/${#IDS[@]}: $id ──"
  echo " Q: $q"

  # Drop any trace left by an earlier run. The answer-level checks further
  # down treat an existing results/<id>.json as this run's output, so a stale
  # file would be graded as if the consumer had just produced it.
  rm -f -- "results/${id}.json"

  if ! python3 llm_consumer.py --question-id "$id" --question "$q" \
    --out "results/${id}.json" >"$consumer_log" 2>&1; then
    fail "Q$((i+1)) ($id): llm_consumer.py exited non-zero — see $consumer_log"
    tail -5 "$consumer_log" | sed 's/^/ /'
    continue
  fi
  tail -8 "$consumer_log"
  ok "Q$((i+1)) ($id): llm_consumer.py exit=0"
done

# ─── Answer-level assertions against GROUND_TRUTH.md ─────────────────────────

section "Layer B — answer-level assertions against GROUND_TRUTH.md"

# trace_info <trace_path>
#
# Reads a JSON trace file and prints, in this order:
#   - the "answer" field, lower-cased (an empty line if missing or null);
#   - a line containing exactly ---TOOLS---;
#   - one line with the "tool" field of every "tools_called" entry, joined
#     with commas.
# The answer may itself span several lines, so callers must split on the
# ---TOOLS--- marker rather than on line numbers.
trace_info() {
  local trace_path=$1
  # The path is handed over as an argument instead of being pasted into the
  # Python source, so paths containing quotes or backslashes work.
  python3 -c '
import json, sys

with open(sys.argv[1]) as fh:
    d = json.load(fh)
ans = (d.get("answer") or "").lower()
tools = [t["tool"] for t in d.get("tools_called", [])]
print(ans)
print("---TOOLS---")
print(",".join(tools))
' "$trace_path"
}

# Q1: entity_context called, answer has Aldric + a known affiliation.
if [ -f "results/q1_who_is_aldric.json" ]; then
  trace=$(trace_info "results/q1_who_is_aldric.json")
  # Everything before the first ---TOOLS--- marker is the answer text.
  q1_ans=${trace%%$'---TOOLS---'*}
  # trace_info prints the tool list on the line AFTER the marker, so a
  # per-line awk split on the marker never sees it. Take what follows the last
  # marker and drop the newline right after it.
  q1_tools=${trace##*---TOOLS---}
  q1_tools=${q1_tools#$'\n'}
  echo " Q1 tools: $q1_tools"

  if [[ "$q1_tools" == *entity_context* ]]; then
    ok "Q1: entity_context in tools_called"
  else
    fail "Q1: entity_context NOT called (got: $q1_tools)"
  fi

  if printf '%s' "$q1_ans" | grep -qi 'aldric'; then
    ok "Q1: answer mentions 'aldric'"
  else
    fail "Q1: answer missing 'aldric'"
  fi

  if printf '%s' "$q1_ans" | grep -Eqi 'vyr|thornwall|elara|valdorni|eventide'; then
    ok "Q1: answer mentions a known affiliation (Vyr/Thornwall/Elara/Valdorni/Eventide)"
  else
    fail "Q1: answer missing known affiliation"
  fi
else
  fail "Q1: results/q1_who_is_aldric.json missing (LLM consumer failed)"
fi

# Q2: was_true_at called, answer says YES/allied/true.
if [ -f "results/q2_was_allied_230.json" ]; then
  trace=$(trace_info "results/q2_was_allied_230.json")
  # Everything before the first ---TOOLS--- marker is the answer text.
  q2_ans=${trace%%$'---TOOLS---'*}
  # trace_info prints the tool list on the line AFTER the marker, so a
  # per-line awk split on the marker never sees it. Take what follows the last
  # marker and drop the newline right after it.
  q2_tools=${trace##*---TOOLS---}
  q2_tools=${q2_tools#$'\n'}
  echo " Q2 tools: $q2_tools"

  if [[ "$q2_tools" == *was_true_at* ]]; then
    ok "Q2: was_true_at in tools_called"
  else
    fail "Q2: was_true_at NOT called (got: $q2_tools)"
  fi

  # NOTE(review): this is a keyword match only — an answer such as
  # "they were not allied" also contains "allied" and would pass. Confirm
  # whether a stricter check is wanted.
  if printf '%s' "$q2_ans" | grep -Eqi 'yes|allied|true|in force|was an alliance'; then
    ok "Q2: answer indicates YES/allied/true"
  else
    fail "Q2: answer missing YES/allied/true"
  fi
else
  fail "Q2: results/q2_was_allied_230.json missing (LLM consumer failed)"
fi

# Q3: ancestors_of called, answer names >=3 of {Theron, Maric, Cael, Yssa}.
if [ -f "results/q3_aldric_ancestors.json" ]; then
  trace=$(trace_info "results/q3_aldric_ancestors.json")
  # Everything before the first ---TOOLS--- marker is the answer text.
  q3_ans=${trace%%$'---TOOLS---'*}
  # trace_info prints the tool list on the line AFTER the marker, so a
  # per-line awk split on the marker never sees it. Take what follows the last
  # marker and drop the newline right after it.
  q3_tools=${trace##*---TOOLS---}
  q3_tools=${q3_tools#$'\n'}
  echo " Q3 tools: $q3_tools"

  if [[ "$q3_tools" == *ancestors_of* ]]; then
    ok "Q3: ancestors_of in tools_called"
  else
    fail "Q3: ancestors_of NOT called (got: $q3_tools)"
  fi

  # Count how many of the four canonical ancestor names appear in the answer.
  found=0
  for n in theron maric cael yssa; do
    if printf '%s' "$q3_ans" | grep -qi "$n"; then
      found=$((found + 1))
    fi
  done
  if [ "$found" -ge 3 ]; then
    ok "Q3: answer names $found/4 canonical ancestors (need >=3)"
  else
    fail "Q3: answer names only $found/4 canonical ancestors (need >=3)"
  fi
else
  fail "Q3: results/q3_aldric_ancestors.json missing (LLM consumer failed)"
fi

# Q4: image-recall tool called, answer mentions Aldric + portrait/image/etc.
if [ -f "results/q4_images_of_aldric.json" ]; then
  trace=$(trace_info "results/q4_images_of_aldric.json")
  # Everything before the first ---TOOLS--- marker is the answer text.
  q4_ans=${trace%%$'---TOOLS---'*}
  # trace_info prints the tool list on the line AFTER the marker, so a
  # per-line awk split on the marker never sees it. Take what follows the last
  # marker and drop the newline right after it.
  q4_tools=${trace##*---TOOLS---}
  q4_tools=${q4_tools#$'\n'}
  echo " Q4 tools: $q4_tools"

  # Any one of the three image tools satisfies the check.
  if [[ "$q4_tools" == *recall_images* || "$q4_tools" == *search_images_by_caption* || "$q4_tools" == *search_images_semantic* ]]; then
    ok "Q4: image-recall tool in tools_called"
  else
    fail "Q4: no image-recall tool called (got: $q4_tools)"
  fi

  if printf '%s' "$q4_ans" | grep -qi 'aldric'; then
    ok "Q4: answer mentions 'aldric'"
  else
    fail "Q4: answer missing 'aldric'"
  fi

  if printf '%s' "$q4_ans" | grep -Eqi 'portrait|image|presigned|thornwall'; then
    ok "Q4: answer mentions portrait/image/presigned/thornwall"
  else
    fail "Q4: answer missing portrait/image/presigned/thornwall"
  fi
else
  fail "Q4: results/q4_images_of_aldric.json missing (LLM consumer failed)"
fi

# Q5: all 4 consistency tools called; answer is NOT a "no issues" answer; mentions
# canonical subject names and severity.
if [ -f "results/q5_consistency_issues.json" ]; then
  trace=$(trace_info "results/q5_consistency_issues.json")
  # Everything before the first ---TOOLS--- marker is the answer text.
  q5_ans=${trace%%$'---TOOLS---'*}
  # trace_info prints the tool list on the line AFTER the marker, so a
  # per-line awk split on the marker never sees it. Take what follows the last
  # marker and drop the newline right after it.
  q5_tools=${trace##*---TOOLS---}
  q5_tools=${q5_tools#$'\n'}
  echo " Q5 tools: $q5_tools"

  # Collect every consistency tool that does not appear in the tool list.
  missing=()
  for t in find_contradictions find_anachronisms find_orphans find_ontology_violations; do
    [[ "$q5_tools" == *"$t"* ]] || missing+=("$t")
  done
  if [ ${#missing[@]} -eq 0 ]; then
    ok "Q5: all 4 consistency tools in tools_called"
  else
    fail "Q5: missing tools: ${missing[*]}"
  fi

  # Must NOT say "no issues" — there are 5 seeded violations.
  # NOTE(review): "no" is matched anywhere, including inside longer words and
  # in phrases like "no died date ... violations", so a correct answer can trip
  # this check. Confirm whether the pattern should be tightened.
  if printf '%s' "$q5_ans" | grep -Eqi '(no|zero|none).{0,30}(open |detected |current )?(consistency |open )?(issues|problems|violations)'; then
    fail "Q5: answer incorrectly says 'no issues' — but 5 violations are seeded"
  else
    ok "Q5: answer does NOT claim 'no issues' (correct — 5 violations seeded)"
  fi

  # Count how many of the canonical subject names appear in the answer.
  subject_hits=0
  for n in aldric vex lyssa theron maric; do
    if printf '%s' "$q5_ans" | grep -qi "$n"; then
      subject_hits=$((subject_hits + 1))
    fi
  done
  if [ "$subject_hits" -ge 2 ]; then
    ok "Q5: answer mentions $subject_hits canonical subjects (need >=2)"
  else
    fail "Q5: answer mentions only $subject_hits canonical subjects (need >=2)"
  fi

  if printf '%s' "$q5_ans" | grep -Eqi 'severity|warn|warning|error'; then
    ok "Q5: answer acknowledges severity (warn/error)"
  else
    fail "Q5: answer does not acknowledge severity"
  fi
else
  fail "Q5: results/q5_consistency_issues.json missing (LLM consumer failed)"
fi

# ─── summary ─────────────────────────────────────────────────────────────────

# Print the final tally and exit 0 only when no check failed; otherwise list
# every recorded failure message and exit 1.
echo
echo "════════════════════════════════════════════════════════════"
if [ "$fails" -eq 0 ]; then
  echo " PASS — $passes checks, 0 failures"
  echo "════════════════════════════════════════════════════════════"
  exit 0
else
  echo " FAIL — $passes checks passed, $fails FAILED:"
  for d in "${FAIL_DETAILS[@]}"; do
    echo " - $d"
  done
  echo "════════════════════════════════════════════════════════════"
  exit 1
fi