lore-engine-poc/examples/test_e2e.sh

#!/usr/bin/env bash
# test_e2e.sh — End-to-end validation for v2.T7.
#
# What this proves (per task body):
#   1. The LLM consumer works end-to-end (5 question types)
#   2. The consistency tools find the right violations (5 seeded)
#   3. The LLM's answers match the seed-data ground truth
#
# Two independent layers:
#   A. Direct tool calls — each of the 4 consistency tools is invoked
#      against the live gateway and the violation count + ids are asserted
#      against the table in examples/GROUND_TRUTH.md. This proves the
#      tools work regardless of LLM behaviour.
#   B. LLM consumer — for each of 5 question types, drive the LLM through
#      the gateway, then assert the answer contains the expected facts
#      (names, dates, severities). This proves the LLM consumer works.
#
# The script exits 0 only if EVERY check passes.
# Note: `-e` is intentionally not enabled here; failed checks are tallied by
# fail() and reported in the summary rather than aborting the run.
set -uo pipefail

# All relative paths below (results/, llm_consumer.py) are resolved from the
# script's own directory, so refuse to continue if we cannot get there.
cd "$(dirname "$0")" || { echo "cannot cd to script directory" >&2; exit 1; }
mkdir -p results || { echo "cannot create results/ directory" >&2; exit 1; }

# Endpoints and model can be overridden from the environment; the values are
# exported so child processes inherit them.
GATEWAY_URL="${GATEWAY_URL:-http://localhost:8765/mcp}"
LITELLM_URL="${LITELLM_URL:-http://localhost:4000/v1}"
LITELLM_MODEL="${LITELLM_MODEL:-minimax-m3}"
export GATEWAY_URL LITELLM_URL LITELLM_MODEL

# ─── bookkeeping ──────────────────────────────────────────────────────────────

# Running tallies, updated by ok() and fail(); FAIL_DETAILS keeps one message
# per failed check for the final summary.
passes=0
fails=0
FAIL_DETAILS=()

# Record a passing check: bump the global `passes` counter and print a
# check-marked line. Arg: <message>
ok() {
  passes=$((passes + 1))
  printf '  ✓ %s\n' "$1"
}
# Record a failing check: bump the global `fails` counter, remember the message
# in FAIL_DETAILS for the summary, and print a cross-marked line. Arg: <message>
fail() {
  fails=$((fails + 1))
  FAIL_DETAILS+=("$1")
  printf '  ✗ %s\n' "$1"
}

# Print a section banner: a blank line, then all arguments joined into a
# single "── … ──" heading.
section() {
  printf '\n── %s ──\n' "$*"
}

# ─── pre-flight ──────────────────────────────────────────────────────────────

section "pre-flight: gateway + LiteLLM reachable"

# Shared exit path for both probes: record the failure, announce the abort and
# stop, since nothing below can work without both services. Arg: <message>
abort_preflight() {
  fail "$1"
  echo
  echo "PRE-FLIGHT FAILED — aborting."
  exit 1
}

# Probe 1: the gateway must answer a JSON-RPC tools/list POST within 5 seconds.
if ! curl -s --max-time 5 -X POST "$GATEWAY_URL" -H "Content-Type: application/json" \
     -d '{"jsonrpc":"2.0","id":1,"method":"tools/list"}' >/dev/null 2>&1; then
  abort_preflight "gateway unreachable at $GATEWAY_URL"
fi
ok "gateway responds at $GATEWAY_URL"

# Probe 2: LiteLLM must answer GET <base>/models within 5 seconds.
if ! curl -s --max-time 5 "$LITELLM_URL/models" >/dev/null 2>&1; then
  abort_preflight "LiteLLM unreachable at $LITELLM_URL"
fi
ok "LiteLLM responds at $LITELLM_URL"

# ─── Layer A: direct consistency-tool calls ──────────────────────────────────

# Helper: invoke one gateway tool via JSON-RPC `tools/call` and print the text
# of the first content item of the result.
# Args: <tool_name> <args_json>   (args_json is embedded verbatim as the
#       `arguments` value, so it must already be valid JSON)
# Reads: GATEWAY_URL
call_tool() {
  local name=$1
  local args=$2
  local payload
  printf -v payload \
    '{"jsonrpc":"2.0","id":1,"method":"tools/call","params":{"name":"%s","arguments":%s}}' \
    "$name" "$args"
  curl -s -X POST "$GATEWAY_URL" -H "Content-Type: application/json" -d "$payload" \
    | python3 -c "import json,sys; d=json.load(sys.stdin); print(d['result']['content'][0]['text'])"
}

# Helper: assert a tool's violation count + ids.
# Args: <tool_name> <args_json> <expected_count> [expected_id_1 ...]
# Calls the tool once through call_tool, compares the response's `count` field
# with <expected_count>, then checks that every expected id appears among
# `violations[].id`. Each comparison is recorded through ok/fail; the id checks
# are skipped when the count is already wrong.
assert_violations() {
  local tool=$1; shift
  local args=$1; shift
  local expected_count=$1; shift
  local resp got_count want
  resp=$(call_tool "$tool" "$args")
  got_count=$(printf '%s' "$resp" | python3 -c "import json,sys; print(json.load(sys.stdin)['count'])")
  if [ "$got_count" != "$expected_count" ]; then
    fail "$tool: count=$got_count (expected $expected_count) — full response: $resp"
    return
  fi
  ok "$tool: count=$got_count (expected $expected_count)"
  for want in "$@"; do
    # The expected id is passed as argv rather than spliced into the Python
    # source, so ids containing quotes cannot break (or inject into) the check.
    # Any parse error makes python exit non-zero and is reported as a missing id.
    if printf '%s' "$resp" | python3 -c '
import json, sys
ids = [v["id"] for v in json.load(sys.stdin)["violations"]]
sys.exit(0 if sys.argv[1] in ids else 1)
' "$want" 2>/dev/null; then
      ok "$tool: contains id=$want"
    else
      fail "$tool: missing id=$want (full response: $resp)"
    fi
  done
}

section "Layer A — direct consistency tool calls (no LLM)"

# One row per tool: name | arguments JSON | expected count | expected ids
# (space-separated). Rows are checked in order through assert_violations.
layer_a_cases=(
  'find_contradictions|{"severity":"any"}|1|c_aldric_double_membership'
  'find_anachronisms|{"severity":"any"}|1|a_vex_at_founding'
  'find_orphans|{}|1|o_unfinished_npc'
  'find_ontology_violations|{"severity":"any"}|2|ov_theron_no_died ov_maric_no_died'
)
for case_row in "${layer_a_cases[@]}"; do
  IFS='|' read -r case_tool case_args case_count case_ids <<<"$case_row"
  read -r -a case_id_list <<<"$case_ids"
  assert_violations "$case_tool" "$case_args" "$case_count" "${case_id_list[@]}"
done

# Severity breakdown — task body says "the orphan being a warning, not error".
section "Layer A — severity breakdown"

# Print the `count` field of a tool's response. Args: <tool_name> <args_json>
violation_count() {
  call_tool "$1" "$2" | python3 -c "import json,sys; print(json.load(sys.stdin)['count'])"
}

# Record ok/fail for one comparison. A real if/else is used instead of
# `test && ok || fail`, which would also run fail when ok itself returns
# non-zero (e.g. a write error on stdout).
# Args: <actual> <expected> <ok_message> <fail_message>
expect_equal() {
  if [ "$1" = "$2" ]; then
    ok "$3"
  else
    fail "$4"
  fi
}

contradictions_err=$(violation_count "find_contradictions" '{"severity":"error"}')
contradictions_warn=$(violation_count "find_contradictions" '{"severity":"warn"}')
expect_equal "$contradictions_err" "1" \
  "find_contradictions severity=error -> 1" \
  "find_contradictions severity=error -> $contradictions_err (expected 1)"
expect_equal "$contradictions_warn" "0" \
  "find_contradictions severity=warn  -> 0" \
  "find_contradictions severity=warn -> $contradictions_warn (expected 0)"

anach_err=$(violation_count "find_anachronisms" '{"severity":"error"}')
anach_warn=$(violation_count "find_anachronisms" '{"severity":"warn"}')
expect_equal "$anach_err" "1" \
  "find_anachronisms severity=error -> 1" \
  "find_anachronisms severity=error -> $anach_err (expected 1)"
expect_equal "$anach_warn" "0" \
  "find_anachronisms severity=warn  -> 0" \
  "find_anachronisms severity=warn -> $anach_warn (expected 0)"

# Orphans: 1 warn (the task body specifies this is a warn, not error).
# The severities of all returned violations are joined with commas, so exactly
# one violation of severity "warn" yields the string "warn".
orphan_severity=$(call_tool "find_orphans" '{}' | python3 -c "import json,sys; d=json.load(sys.stdin); print(','.join(v['severity'] for v in d['violations']))")
expect_equal "$orphan_severity" "warn" \
  "find_orphans -> severity=warn (orphan is a warn, not error)" \
  "find_orphans -> severity=[$orphan_severity] (expected 'warn')"

# Ontology: 2 warn
ont_warn=$(violation_count "find_ontology_violations" '{"severity":"warn"}')
expect_equal "$ont_warn" "2" \
  "find_ontology_violations severity=warn -> 2" \
  "find_ontology_violations severity=warn -> $ont_warn (expected 2)"

# Total: sum error- and warn-severity violations across all four tools.
total_err=0
total_warn=0
for t in find_contradictions find_anachronisms find_orphans find_ontology_violations; do
  args='{"severity":"any"}'
  # find_orphans is queried without a severity filter.
  if [ "$t" = "find_orphans" ]; then args='{}'; fi
  # One gateway call per tool; both tallies come from the same response so they
  # cannot disagree. Prints "<errors> <warns>".
  counts=$(call_tool "$t" "$args" | python3 -c '
import json, sys
sev = [v["severity"] for v in json.load(sys.stdin)["violations"]]
print(sev.count("error"), sev.count("warn"))
')
  e=""
  w=""
  read -r e w <<<"$counts"
  # If the call or the parse failed, e/w are empty: count them as 0 so the
  # arithmetic stays valid and the mismatch surfaces in the checks below.
  total_err=$((total_err + ${e:-0}))
  total_warn=$((total_warn + ${w:-0}))
done
total=$((total_err + total_warn))
if [ "$total" = "5" ]; then
  ok "TOTAL violations = 5 (2 error + 3 warn)"
else
  fail "TOTAL violations = $total (expected 5)"
fi
if [ "$total_err" = "2" ]; then
  ok "TOTAL errors = 2"
else
  fail "TOTAL errors = $total_err (expected 2)"
fi
if [ "$total_warn" = "3" ]; then
  ok "TOTAL warns = 3"
else
  fail "TOTAL warns = $total_warn (expected 3)"
fi

# ─── Layer B: LLM consumer — 5 question types ────────────────────────────────

section "Layer B — LLM consumer (5 question types)"

# IDS and QS are parallel arrays: QS[i] is the question text for IDS[i], and
# the id doubles as the basename of the trace written to results/<id>.json.
declare -a IDS=(
  "q1_who_is_aldric"
  "q2_was_allied_230"
  "q3_aldric_ancestors"
  "q4_images_of_aldric"
  "q5_consistency_issues"
)
declare -a QS=(
  "Who is Aldric Raventhorne? Give a brief bio and his known relations."
  "Was House Vyr allied with the Merchants Guild at 2nd_age.year_230?"
  "What is the lineage / ancestry of Aldric Raventhorne? Walk back as far as you can."
  "Show me images of Aldric Raventhorne — portraits or otherwise."
  "What are the open consistency issues in the world graph right now? Check contradictions, anachronisms, orphans, and ontology violations."
)

for i in "${!IDS[@]}"; do
  id="${IDS[$i]}"
  q="${QS[$i]}"
  consumer_log="/tmp/llm_consumer_${id}.log"
  echo
  echo "── question $((i+1))/${#IDS[@]}: $id ──"
  echo "  Q: $q"
  # Remove any trace left by an earlier run: the answer-level assertions further
  # down key off the existence of results/<id>.json, and must never evaluate a
  # stale file when this run's consumer fails before writing a new one.
  rm -f "results/${id}.json"
  if ! python3 llm_consumer.py --question-id "$id" --question "$q" \
        --out "results/${id}.json" >"$consumer_log" 2>&1; then
    fail "Q$((i+1)) ($id): llm_consumer.py exited non-zero — see /tmp/llm_consumer_${id}.log"
    # Show the tail of the log, indented, so the cause is visible inline.
    tail -5 "$consumer_log" | sed 's/^/    /'
    continue
  fi
  tail -8 "$consumer_log"
  ok "Q$((i+1)) ($id): llm_consumer.py exit=0"
done

# ─── Answer-level assertions against GROUND_TRUTH.md ─────────────────────────

section "Layer B — answer-level assertions against GROUND_TRUTH.md"

# Helper: read a trace JSON file and print three parts, in order:
#   1. the `answer` field, lower-cased (empty if missing or null; may span
#      several lines),
#   2. a literal `---TOOLS---` marker line,
#   3. one line with the comma-joined `tool` names from `tools_called`.
# Args: <trace_path>
# The path is handed to Python as argv instead of being spliced into the
# program text, so paths containing quotes or backslashes are read correctly.
trace_info() {
  local trace_path=$1
  python3 -c '
import json, sys

with open(sys.argv[1]) as fh:
    d = json.load(fh)
ans = (d.get("answer") or "").lower()
tools = [t["tool"] for t in d.get("tools_called", [])]
print(ans)
print("---TOOLS---")
print(",".join(tools))
' "$trace_path"
}

# Q1: entity_context called, answer has Aldric + a known affiliation.
if [ -f "results/q1_who_is_aldric.json" ]; then
  trace=$(trace_info "results/q1_who_is_aldric.json")
  # trace layout: answer text, a '---TOOLS---' marker line, then the tool CSV on
  # the following line. Split with parameter expansion: a line-oriented awk
  # split on the marker never sees the CSV in field 2, because the marker and
  # the CSV are on different lines.
  q1_ans=${trace%%---TOOLS---*}
  q1_tools=${trace##*---TOOLS---}
  q1_tools=${q1_tools#$'\n'}
  echo "  Q1 tools: $q1_tools"
  if [[ "$q1_tools" == *entity_context* ]]; then
    ok "Q1: entity_context in tools_called"
  else
    fail "Q1: entity_context NOT called (got: $q1_tools)"
  fi
  if printf '%s' "$q1_ans" | grep -qi 'aldric'; then
    ok "Q1: answer mentions 'aldric'"
  else
    fail "Q1: answer missing 'aldric'"
  fi
  if printf '%s' "$q1_ans" | grep -Eqi 'vyr|thornwall|elara|valdorni|eventide'; then
    ok "Q1: answer mentions a known affiliation (Vyr/Thornwall/Elara/Valdorni/Eventide)"
  else
    fail "Q1: answer missing known affiliation"
  fi
else
  fail "Q1: results/q1_who_is_aldric.json missing (LLM consumer failed)"
fi

# Q2: was_true_at called, answer says YES/allied/true.
if [ -f "results/q2_was_allied_230.json" ]; then
  trace=$(trace_info "results/q2_was_allied_230.json")
  # trace layout: answer text, a '---TOOLS---' marker line, then the tool CSV on
  # the following line. Split with parameter expansion: a line-oriented awk
  # split on the marker never sees the CSV in field 2, because the marker and
  # the CSV are on different lines.
  q2_ans=${trace%%---TOOLS---*}
  q2_tools=${trace##*---TOOLS---}
  q2_tools=${q2_tools#$'\n'}
  echo "  Q2 tools: $q2_tools"
  if [[ "$q2_tools" == *was_true_at* ]]; then
    ok "Q2: was_true_at in tools_called"
  else
    fail "Q2: was_true_at NOT called (got: $q2_tools)"
  fi
  # NOTE(review): this is a lenient substring match — it is also satisfied by
  # phrasings such as "not allied"; tighten if false passes show up.
  if printf '%s' "$q2_ans" | grep -Eqi 'yes|allied|true|in force|was an alliance'; then
    ok "Q2: answer indicates YES/allied/true"
  else
    fail "Q2: answer missing YES/allied/true"
  fi
else
  fail "Q2: results/q2_was_allied_230.json missing (LLM consumer failed)"
fi

# Q3: ancestors_of called, answer names >=3 of {Theron, Maric, Cael, Yssa}.
if [ -f "results/q3_aldric_ancestors.json" ]; then
  trace=$(trace_info "results/q3_aldric_ancestors.json")
  # trace layout: answer text, a '---TOOLS---' marker line, then the tool CSV on
  # the following line. Split with parameter expansion: a line-oriented awk
  # split on the marker never sees the CSV in field 2, because the marker and
  # the CSV are on different lines.
  q3_ans=${trace%%---TOOLS---*}
  q3_tools=${trace##*---TOOLS---}
  q3_tools=${q3_tools#$'\n'}
  echo "  Q3 tools: $q3_tools"
  if [[ "$q3_tools" == *ancestors_of* ]]; then
    ok "Q3: ancestors_of in tools_called"
  else
    fail "Q3: ancestors_of NOT called (got: $q3_tools)"
  fi
  # Count how many of the four canonical ancestor names the answer mentions.
  found=0
  for n in theron maric cael yssa; do
    if printf '%s' "$q3_ans" | grep -qi "$n"; then found=$((found+1)); fi
  done
  if [ "$found" -ge 3 ]; then
    ok "Q3: answer names $found/4 canonical ancestors (need >=3)"
  else
    fail "Q3: answer names only $found/4 canonical ancestors (need >=3)"
  fi
else
  fail "Q3: results/q3_aldric_ancestors.json missing (LLM consumer failed)"
fi

# Q4: image-recall tool called, answer mentions Aldric + portrait/image/etc.
if [ -f "results/q4_images_of_aldric.json" ]; then
  trace=$(trace_info "results/q4_images_of_aldric.json")
  # trace layout: answer text, a '---TOOLS---' marker line, then the tool CSV on
  # the following line. Split with parameter expansion: a line-oriented awk
  # split on the marker never sees the CSV in field 2, because the marker and
  # the CSV are on different lines.
  q4_ans=${trace%%---TOOLS---*}
  q4_tools=${trace##*---TOOLS---}
  q4_tools=${q4_tools#$'\n'}
  echo "  Q4 tools: $q4_tools"
  # Any one of the three image tools counts as image recall.
  if [[ "$q4_tools" == *recall_images* || "$q4_tools" == *search_images_by_caption* || "$q4_tools" == *search_images_semantic* ]]; then
    ok "Q4: image-recall tool in tools_called"
  else
    fail "Q4: no image-recall tool called (got: $q4_tools)"
  fi
  if printf '%s' "$q4_ans" | grep -qi 'aldric'; then
    ok "Q4: answer mentions 'aldric'"
  else
    fail "Q4: answer missing 'aldric'"
  fi
  if printf '%s' "$q4_ans" | grep -Eqi 'portrait|image|presigned|thornwall'; then
    ok "Q4: answer mentions portrait/image/presigned/thornwall"
  else
    fail "Q4: answer missing portrait/image/presigned/thornwall"
  fi
else
  fail "Q4: results/q4_images_of_aldric.json missing (LLM consumer failed)"
fi

# Q5: all 4 consistency tools called; answer is NOT a "no issues" answer; mentions
# canonical subject names and severity.
if [ -f "results/q5_consistency_issues.json" ]; then
  trace=$(trace_info "results/q5_consistency_issues.json")
  # trace layout: answer text, a '---TOOLS---' marker line, then the tool CSV on
  # the following line. Split with parameter expansion: a line-oriented awk
  # split on the marker never sees the CSV in field 2, because the marker and
  # the CSV are on different lines.
  q5_ans=${trace%%---TOOLS---*}
  q5_tools=${trace##*---TOOLS---}
  q5_tools=${q5_tools#$'\n'}
  echo "  Q5 tools: $q5_tools"
  missing=()
  for t in find_contradictions find_anachronisms find_orphans find_ontology_violations; do
    [[ "$q5_tools" == *"$t"* ]] || missing+=("$t")
  done
  if [ ${#missing[@]} -eq 0 ]; then
    ok "Q5: all 4 consistency tools in tools_called"
  else
    fail "Q5: missing tools: ${missing[*]}"
  fi
  # Must NOT say "no issues" — there are 5 seeded violations.
  # no/zero/none must stand as whole words: without the boundaries, "no" inside
  # words like "known" or "not" followed by "issues"/"violations" would be
  # misread as a "no issues" claim and fail a correct answer.
  if printf '%s' "$q5_ans" | grep -Eqi '(^|[^[:alnum:]_])(no|zero|none)[^[:alnum:]_].{0,30}(open |detected |current )?(consistency |open )?(issues|problems|violations)'; then
    fail "Q5: answer incorrectly says 'no issues' — but 5 violations are seeded"
  else
    ok "Q5: answer does NOT claim 'no issues' (correct — 5 violations seeded)"
  fi
  # Count how many of the canonical subject names the answer mentions.
  subject_hits=0
  for n in aldric vex lyssa theron maric; do
    if printf '%s' "$q5_ans" | grep -qi "$n"; then subject_hits=$((subject_hits+1)); fi
  done
  if [ "$subject_hits" -ge 2 ]; then
    ok "Q5: answer mentions $subject_hits canonical subjects (need >=2)"
  else
    fail "Q5: answer mentions only $subject_hits canonical subjects (need >=2)"
  fi
  if printf '%s' "$q5_ans" | grep -Eqi 'severity|warn|warning|error'; then
    ok "Q5: answer acknowledges severity (warn/error)"
  else
    fail "Q5: answer does not acknowledge severity"
  fi
else
  fail "Q5: results/q5_consistency_issues.json missing (LLM consumer failed)"
fi

# ─── summary ─────────────────────────────────────────────────────────────────

# Final verdict: print the banner, list every recorded failure if there is any,
# and exit 0 only when the failure tally is zero.
summary_rule="════════════════════════════════════════════════════════════"
echo
echo "$summary_rule"
if ! [ "$fails" -eq 0 ]; then
  echo "  FAIL — $passes checks passed, $fails FAILED:"
  for detail in "${FAIL_DETAILS[@]}"; do
    echo "    - $detail"
  done
  echo "$summary_rule"
  exit 1
fi
echo "  PASS — $passes checks, 0 failures"
echo "$summary_rule"
exit 0