From 23679605402a0c12bbae18af9b97ff42fcbccfc2 Mon Sep 17 00:00:00 2001 From: Lore Engine Dev Date: Fri, 19 Jun 2026 20:51:18 -0400 Subject: [PATCH] slice 7.1: 50-question reasoning harness test set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per docs/plan/exec/07-harness.md sub-slice 7.1: - tests/harness/questions.yaml — the human-friendly YAML source. 50 questions across the 5 design-doc types (10 each): identity, time_fact, world_state, causal, narrative. Each question pins id, type, query, expected_tools, expected_answer_shape, and expected_citations. Targets the Mardonari codex (the slice 0 fixture) so the harness can run end-to-end against the real graph. - tests/harness/questions.json — the compiled JSON (committed so the runner reads it without rebuilding). - scripts/harness/build_questions.py — the strict compiler. Validates the YAML schema, counts questions per type, enforces uniqueness, writes the JSON. Validation errors fail loudly with field paths. - tests/harness/test_questions.py — 6 tests pinning the contract: schema, 50 total, 10 per type, expected_tools non-empty, ids unique, version set. Track A only (no API key needed). Track B (executing against the live LLM) is gated on $OLLAMA_API_KEY. Suite: 761 → 767 (+6). 
Co-Authored-By: Claude --- scripts/harness/build_questions.py | 192 +++++++ tests/harness/questions.json | 813 +++++++++++++++++++++++++++++ tests/harness/questions.yaml | 572 ++++++++++++++++++++ tests/harness/test_questions.py | 155 ++++++ 4 files changed, 1732 insertions(+) create mode 100644 scripts/harness/build_questions.py create mode 100644 tests/harness/questions.json create mode 100644 tests/harness/questions.yaml create mode 100644 tests/harness/test_questions.py diff --git a/scripts/harness/build_questions.py b/scripts/harness/build_questions.py new file mode 100644 index 0000000..77aff39 --- /dev/null +++ b/scripts/harness/build_questions.py @@ -0,0 +1,192 @@ +"""build_questions — compile tests/harness/questions.yaml → questions.json. + +Per ``docs/plan/exec/07-harness.md`` sub-slice 7.1, the 50 +question set is authored in YAML (human-friendly diffs) and +compiled to JSON (the harness runner reads JSON). The +compiler: + + 1. Loads the YAML via the project's strict loader + (``load_yaml``). + 2. Validates: top-level ``version`` + ``questions`` list, + each question has the required keys, no duplicate ids, + the 5 question types are each represented by exactly + 10 questions (AC 7.1). + 3. Writes the JSON to ``tests/harness/questions.json`` + (the runner's default input). + +The build is intentionally a CLI command (not part of the +test suite's import path) — the JSON file is committed +to the repo so the runner can read it without re-running +the compiler. + +Run: + + python3 scripts/harness/build_questions.py \\ + --yaml tests/harness/questions.yaml \\ + --out tests/harness/questions.json + +Without flags, defaults to the in-repo paths. 
+""" + +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path +from typing import Any + +ROOT = Path(__file__).resolve().parent.parent.parent +sys.path.insert(0, str(ROOT)) + +from lore_engine_poc.parsers._yaml import YamlSchemaError, load_yaml + + +# The 5 question types, fixed by docs/07-reasoning-harness.md. +QUESTION_TYPES = ( + "identity", + "time_fact", + "world_state", + "causal", + "narrative", +) + +# Per-question required keys. +REQUIRED_KEYS = ( + "id", + "type", + "query", + "expected_tools", + "expected_answer_shape", + "expected_citations", +) + + +def _validate_question(q: dict, where: str) -> list[str]: + """Return a list of human-readable errors for one + question dict. Empty list = valid. + """ + errors: list[str] = [] + for k in REQUIRED_KEYS: + if k not in q: + errors.append(f"{where}: missing required key '{k}'") + if "type" in q and q["type"] not in QUESTION_TYPES: + errors.append( + f"{where}: type '{q['type']}' not in " + f"{list(QUESTION_TYPES)}" + ) + if "expected_tools" in q and not isinstance(q["expected_tools"], list): + errors.append( + f"{where}: expected_tools must be a list, got " + f"{type(q['expected_tools']).__name__}" + ) + if "expected_citations" in q: + cit = q["expected_citations"] + if not isinstance(cit, int) or cit < 0: + errors.append( + f"{where}: expected_citations must be a non-negative int, got {cit!r}" + ) + return errors + + +def build(yaml_path: Path, out_path: Path) -> int: + try: + data, _ = load_yaml(str(yaml_path)) + except YamlSchemaError as e: + print(f"ERROR: {yaml_path}: {e}", file=sys.stderr) + return 1 + + if "version" not in data: + print(f"ERROR: {yaml_path}: missing top-level 'version'", file=sys.stderr) + return 1 + if "questions" not in data: + print(f"ERROR: {yaml_path}: missing top-level 'questions'", file=sys.stderr) + return 1 + + questions = data["questions"] + if not isinstance(questions, list): + print( + f"ERROR: {yaml_path}: 'questions' must 
be a list, got " + f"{type(questions).__name__}", + file=sys.stderr, + ) + return 1 + + # Per-question validation. + all_errors: list[str] = [] + seen_ids: set[str] = set() + type_counts: dict[str, int] = {t: 0 for t in QUESTION_TYPES} + for i, q in enumerate(questions): + where = f"questions[{i}]" + if not isinstance(q, dict): + all_errors.append(f"{where}: must be a mapping, got {type(q).__name__}") + continue + all_errors.extend(_validate_question(q, where)) + qid = q.get("id") + if qid is not None: + if qid in seen_ids: + all_errors.append(f"{where}: duplicate id {qid!r}") + seen_ids.add(qid) + qtype = q.get("type") + if isinstance(qtype, str) and qtype in type_counts: + type_counts[qtype] += 1 + + # Hard constraints (AC 7.1). + if len(questions) != 50: + all_errors.append( + f"expected exactly 50 questions, got {len(questions)}" + ) + for t, expected in ( + ("identity", 10), + ("time_fact", 10), + ("world_state", 10), + ("causal", 10), + ("narrative", 10), + ): + if type_counts.get(t, 0) != expected: + all_errors.append( + f"type '{t}': expected {expected} questions, " + f"got {type_counts.get(t, 0)}" + ) + + if all_errors: + for err in all_errors: + print(f"ERROR: {err}", file=sys.stderr) + return 1 + + # The compiled JSON keeps the YAML's structure 1:1 (the + # harness runner just reads the same keys). Pinning the + # version is a contract with the runner: old results + # stay comparable as the prompt iterates. 
+ compiled: dict[str, Any] = { + "version": data["version"], + "type_counts": type_counts, + "questions": questions, + } + out_path.parent.mkdir(parents=True, exist_ok=True) + out_path.write_text(json.dumps(compiled, indent=2) + "\n", encoding="utf-8") + print( + f"[build_questions] {len(questions)} questions, " + f"version {data['version']}, wrote {out_path}" + ) + return 0 + + +def main() -> int: + p = argparse.ArgumentParser(description=__doc__.splitlines()[0]) + p.add_argument( + "--yaml", + default=str(ROOT / "tests" / "harness" / "questions.yaml"), + help="YAML source (default: tests/harness/questions.yaml)", + ) + p.add_argument( + "--out", + default=str(ROOT / "tests" / "harness" / "questions.json"), + help="JSON output (default: tests/harness/questions.json)", + ) + args = p.parse_args() + return build(Path(args.yaml), Path(args.out)) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tests/harness/questions.json b/tests/harness/questions.json new file mode 100644 index 0000000..131a35c --- /dev/null +++ b/tests/harness/questions.json @@ -0,0 +1,813 @@ +{ + "version": 1, + "type_counts": { + "identity": 10, + "time_fact": 10, + "world_state": 10, + "causal": 10, + "narrative": 10 + }, + "questions": [ + { + "id": "t1.01", + "type": "identity", + "query": "Who is Roland Raventhorne?", + "expected_tools": [ + "lookup", + "entity_context", + "significance_of" + ], + "expected_answer_shape": { + "has_keys": [ + "entity", + "summary" + ] + }, + "expected_citations": 1 + }, + { + "id": "t1.02", + "type": "identity", + "query": "Tell me about House Raventhorne.", + "expected_tools": [ + "lookup", + "entity_context" + ], + "expected_answer_shape": { + "has_keys": [ + "entity", + "summary" + ] + }, + "expected_citations": 1 + }, + { + "id": "t1.03", + "type": "identity", + "query": "What is Mardsville?", + "expected_tools": [ + "lookup", + "entity_context" + ], + "expected_answer_shape": { + "has_keys": [ + "entity", + "summary" + ] + }, + 
"expected_citations": 1 + }, + { + "id": "t1.04", + "type": "identity", + "query": "Describe the Crimson Pact.", + "expected_tools": [ + "lookup", + "entity_context", + "significance_of" + ], + "expected_answer_shape": { + "has_keys": [ + "entity", + "summary" + ] + }, + "expected_citations": 1 + }, + { + "id": "t1.05", + "type": "identity", + "query": "Who was Aldric of Valdorn?", + "expected_tools": [ + "lookup", + "entity_context" + ], + "expected_answer_shape": { + "has_keys": [ + "entity", + "summary" + ] + }, + "expected_citations": 1 + }, + { + "id": "t1.06", + "type": "identity", + "query": "Tell me about the Wheel & Kiln.", + "expected_tools": [ + "lookup", + "entity_context" + ], + "expected_answer_shape": { + "has_keys": [ + "entity", + "summary" + ] + }, + "expected_citations": 1 + }, + { + "id": "t1.07", + "type": "identity", + "query": "What kind of place is the Underdark?", + "expected_tools": [ + "lookup", + "entity_context" + ], + "expected_answer_shape": { + "has_keys": [ + "entity", + "summary" + ] + }, + "expected_citations": 1 + }, + { + "id": "t1.08", + "type": "identity", + "query": "Who is Voldramir's keeper?", + "expected_tools": [ + "lookup", + "entity_context", + "significance_of" + ], + "expected_answer_shape": { + "has_keys": [ + "entity", + "summary" + ] + }, + "expected_citations": 1 + }, + { + "id": "t1.09", + "type": "identity", + "query": "Describe the Mardonari Material Plane.", + "expected_tools": [ + "lookup", + "entity_context" + ], + "expected_answer_shape": { + "has_keys": [ + "entity", + "summary" + ] + }, + "expected_citations": 1 + }, + { + "id": "t1.10", + "type": "identity", + "query": "What is House Vyr?", + "expected_tools": [ + "lookup", + "entity_context", + "significance_of" + ], + "expected_answer_shape": { + "has_keys": [ + "entity", + "summary" + ] + }, + "expected_citations": 1 + }, + { + "id": "t2.01", + "type": "time_fact", + "query": "Was Roland Raventhorne a member of House Raventhorne in 3rd_age.year_345?", 
+ "expected_tools": [ + "was_true_at" + ], + "expected_answer_shape": { + "has_keys": [ + "was_true", + "at_time" + ] + }, + "expected_citations": 1 + }, + { + "id": "t2.02", + "type": "time_fact", + "query": "Did Aldric rule Valdorn during the 3rd Age?", + "expected_tools": [ + "lookup", + "was_true_at" + ], + "expected_answer_shape": { + "has_keys": [ + "was_true", + "at_time" + ] + }, + "expected_citations": 1 + }, + { + "id": "t2.03", + "type": "time_fact", + "query": "Were House Vyr and the Crimson Pact allied in year 340 TA?", + "expected_tools": [ + "lookup", + "was_true_at" + ], + "expected_answer_shape": { + "has_keys": [ + "was_true", + "at_time" + ] + }, + "expected_citations": 1 + }, + { + "id": "t2.04", + "type": "time_fact", + "query": "Was the Long Winter caused by the Sundering?", + "expected_tools": [ + "lookup", + "was_true_at" + ], + "expected_answer_shape": { + "has_keys": [ + "was_true", + "at_time" + ] + }, + "expected_citations": 1 + }, + { + "id": "t2.05", + "type": "time_fact", + "query": "Did the Battle of Black Spire happen in 342 TA?", + "expected_tools": [ + "lookup", + "was_true_at" + ], + "expected_answer_shape": { + "has_keys": [ + "was_true", + "at_time" + ] + }, + "expected_citations": 1 + }, + { + "id": "t2.06", + "type": "time_fact", + "query": "Was Voldramir accessible during the Sundering?", + "expected_tools": [ + "lookup", + "was_true_at" + ], + "expected_answer_shape": { + "has_keys": [ + "was_true", + "at_time" + ] + }, + "expected_citations": 1 + }, + { + "id": "t2.07", + "type": "time_fact", + "query": "Were the Mardonari and Valdorni at war in year 360 TA?", + "expected_tools": [ + "lookup", + "was_true_at" + ], + "expected_answer_shape": { + "has_keys": [ + "was_true", + "at_time" + ] + }, + "expected_citations": 1 + }, + { + "id": "t2.08", + "type": "time_fact", + "query": "Was Roland alive in the 4th Age?", + "expected_tools": [ + "lookup", + "was_true_at" + ], + "expected_answer_shape": { + "has_keys": [ + 
"was_true", + "at_time" + ] + }, + "expected_citations": 1 + }, + { + "id": "t2.09", + "type": "time_fact", + "query": "Did Aldric sit on the Wheel & Kiln council in 3rd_age.year_300?", + "expected_tools": [ + "lookup", + "was_true_at" + ], + "expected_answer_shape": { + "has_keys": [ + "was_true", + "at_time" + ] + }, + "expected_citations": 1 + }, + { + "id": "t2.10", + "type": "time_fact", + "query": "Was the Crimson Pact founded before the Sundering?", + "expected_tools": [ + "lookup", + "was_true_at" + ], + "expected_answer_shape": { + "has_keys": [ + "was_true", + "at_time" + ] + }, + "expected_citations": 1 + }, + { + "id": "t3.01", + "type": "world_state", + "query": "What was happening in Valdorn in 340 TA?", + "expected_tools": [ + "entities_present", + "events_during" + ], + "expected_answer_shape": { + "has_keys": [ + "entities", + "events" + ] + }, + "expected_citations": 2 + }, + { + "id": "t3.02", + "type": "world_state", + "query": "Who was in Mardsville during the Border Wars?", + "expected_tools": [ + "lookup", + "entities_present", + "events_during" + ], + "expected_answer_shape": { + "has_keys": [ + "entities", + "events" + ] + }, + "expected_citations": 2 + }, + { + "id": "t3.03", + "type": "world_state", + "query": "What factions were active in Mardonari in year 380 TA?", + "expected_tools": [ + "lookup", + "entities_present", + "events_during" + ], + "expected_answer_shape": { + "has_keys": [ + "entities", + "events" + ] + }, + "expected_citations": 2 + }, + { + "id": "t3.04", + "type": "world_state", + "query": "Who lived in the Underdark in the 3rd Age?", + "expected_tools": [ + "lookup", + "entities_present" + ], + "expected_answer_shape": { + "has_keys": [ + "entities" + ] + }, + "expected_citations": 1 + }, + { + "id": "t3.05", + "type": "world_state", + "query": "What events occurred in the 4th Age?", + "expected_tools": [ + "events_during" + ], + "expected_answer_shape": { + "has_keys": [ + "events" + ] + }, + "expected_citations": 1 + 
}, + { + "id": "t3.06", + "type": "world_state", + "query": "Who was on the Wheel & Kiln council in year 320 TA?", + "expected_tools": [ + "lookup", + "entities_present" + ], + "expected_answer_shape": { + "has_keys": [ + "entities" + ] + }, + "expected_citations": 1 + }, + { + "id": "t3.07", + "type": "world_state", + "query": "What was happening at the Crimson Pact headquarters in 350 TA?", + "expected_tools": [ + "lookup", + "entities_present", + "events_during" + ], + "expected_answer_shape": { + "has_keys": [ + "entities", + "events" + ] + }, + "expected_citations": 2 + }, + { + "id": "t3.08", + "type": "world_state", + "query": "Who was alive in the Mardonari region during the Sundering?", + "expected_tools": [ + "entities_present", + "events_during" + ], + "expected_answer_shape": { + "has_keys": [ + "entities", + "events" + ] + }, + "expected_citations": 2 + }, + { + "id": "t3.09", + "type": "world_state", + "query": "What was the state of Voldramir in the 4th Age?", + "expected_tools": [ + "lookup", + "entities_present" + ], + "expected_answer_shape": { + "has_keys": [ + "entities" + ] + }, + "expected_citations": 1 + }, + { + "id": "t3.10", + "type": "world_state", + "query": "Who was ruling Valdorn at the start of the 3rd Age?", + "expected_tools": [ + "lookup", + "entities_present" + ], + "expected_answer_shape": { + "has_keys": [ + "entities" + ] + }, + "expected_citations": 1 + }, + { + "id": "t4.01", + "type": "causal", + "query": "Why did the Sundering happen?", + "expected_tools": [ + "lookup", + "event_chain" + ], + "expected_answer_shape": { + "has_keys": [ + "causes", + "effects" + ] + }, + "expected_citations": 2 + }, + { + "id": "t4.02", + "type": "causal", + "query": "What were the consequences of the Battle of Black Spire?", + "expected_tools": [ + "lookup", + "event_chain" + ], + "expected_answer_shape": { + "has_keys": [ + "causes", + "effects" + ] + }, + "expected_citations": 2 + }, + { + "id": "t4.03", + "type": "causal", + "query": "How 
did Aldric come to power?", + "expected_tools": [ + "lookup", + "event_chain", + "ancestors_of" + ], + "expected_answer_shape": { + "has_keys": [ + "causes", + "ancestors" + ] + }, + "expected_citations": 2 + }, + { + "id": "t4.04", + "type": "causal", + "query": "Why was the Crimson Pact founded?", + "expected_tools": [ + "lookup", + "event_chain" + ], + "expected_answer_shape": { + "has_keys": [ + "causes", + "effects" + ] + }, + "expected_citations": 2 + }, + { + "id": "t4.05", + "type": "causal", + "query": "What led to the Long Winter?", + "expected_tools": [ + "lookup", + "event_chain" + ], + "expected_answer_shape": { + "has_keys": [ + "causes", + "effects" + ] + }, + "expected_citations": 2 + }, + { + "id": "t4.06", + "type": "causal", + "query": "How did House Vyr fall?", + "expected_tools": [ + "lookup", + "event_chain" + ], + "expected_answer_shape": { + "has_keys": [ + "causes", + "effects" + ] + }, + "expected_citations": 2 + }, + { + "id": "t4.07", + "type": "causal", + "query": "What caused the Border Wars?", + "expected_tools": [ + "lookup", + "event_chain" + ], + "expected_answer_shape": { + "has_keys": [ + "causes", + "effects" + ] + }, + "expected_citations": 2 + }, + { + "id": "t4.08", + "type": "causal", + "query": "How did the Wheel & Kiln come to be the Mardonari council?", + "expected_tools": [ + "lookup", + "event_chain" + ], + "expected_answer_shape": { + "has_keys": [ + "causes", + "effects" + ] + }, + "expected_citations": 2 + }, + { + "id": "t4.09", + "type": "causal", + "query": "Why did the Mardonari and Valdorni go to war?", + "expected_tools": [ + "lookup", + "event_chain" + ], + "expected_answer_shape": { + "has_keys": [ + "causes", + "effects" + ] + }, + "expected_citations": 2 + }, + { + "id": "t4.10", + "type": "causal", + "query": "How did Voldramir become a prison plane?", + "expected_tools": [ + "lookup", + "event_chain" + ], + "expected_answer_shape": { + "has_keys": [ + "causes", + "effects" + ] + }, + "expected_citations": 
2 + }, + { + "id": "t5.01", + "type": "narrative", + "query": "Tell me about the Border Wars.", + "expected_tools": [ + "lookup", + "event_chain", + "narrate_arc" + ], + "expected_answer_shape": { + "has_keys": [ + "narrative" + ] + }, + "expected_citations": 3 + }, + { + "id": "t5.02", + "type": "narrative", + "query": "What was House Vyr like at its height?", + "expected_tools": [ + "lookup", + "entity_context", + "narrate_arc" + ], + "expected_answer_shape": { + "has_keys": [ + "narrative" + ] + }, + "expected_citations": 3 + }, + { + "id": "t5.03", + "type": "narrative", + "query": "Describe the 3rd Age.", + "expected_tools": [ + "events_during", + "narrate_arc" + ], + "expected_answer_shape": { + "has_keys": [ + "narrative" + ] + }, + "expected_citations": 3 + }, + { + "id": "t5.04", + "type": "narrative", + "query": "What was life like in Mardsville in the late 3rd Age?", + "expected_tools": [ + "lookup", + "entities_present", + "narrate_arc" + ], + "expected_answer_shape": { + "has_keys": [ + "narrative" + ] + }, + "expected_citations": 3 + }, + { + "id": "t5.05", + "type": "narrative", + "query": "Describe the fall of the Crimson Pact.", + "expected_tools": [ + "lookup", + "event_chain", + "narrate_arc" + ], + "expected_answer_shape": { + "has_keys": [ + "narrative" + ] + }, + "expected_citations": 3 + }, + { + "id": "t5.06", + "type": "narrative", + "query": "Tell me about the cosmology of Mardonari.", + "expected_tools": [ + "lookup", + "entity_context", + "narrate_arc" + ], + "expected_answer_shape": { + "has_keys": [ + "narrative" + ] + }, + "expected_citations": 3 + }, + { + "id": "t5.07", + "type": "narrative", + "query": "What is the lore around Voldramir?", + "expected_tools": [ + "lookup", + "entity_context", + "narrate_arc" + ], + "expected_answer_shape": { + "has_keys": [ + "narrative" + ] + }, + "expected_citations": 3 + }, + { + "id": "t5.08", + "type": "narrative", + "query": "Describe Aldric's reign.", + "expected_tools": [ + "lookup", + 
"event_chain", + "narrate_arc" + ], + "expected_answer_shape": { + "has_keys": [ + "narrative" + ] + }, + "expected_citations": 3 + }, + { + "id": "t5.09", + "type": "narrative", + "query": "What was happening in the Underdark during the Sundering?", + "expected_tools": [ + "lookup", + "entities_present", + "events_during", + "narrate_arc" + ], + "expected_answer_shape": { + "has_keys": [ + "narrative" + ] + }, + "expected_citations": 3 + }, + { + "id": "t5.10", + "type": "narrative", + "query": "Tell me about the founding of Mardonari.", + "expected_tools": [ + "lookup", + "event_chain", + "narrate_arc" + ], + "expected_answer_shape": { + "has_keys": [ + "narrative" + ] + }, + "expected_citations": 3 + } + ] +} diff --git a/tests/harness/questions.yaml b/tests/harness/questions.yaml new file mode 100644 index 0000000..5f69add --- /dev/null +++ b/tests/harness/questions.yaml @@ -0,0 +1,572 @@ +# Slice 7 — Reasoning harness test set. +# +# Per docs/plan/exec/07-harness.md sub-slice 7.1, this YAML +# is the source of the 50 questions the harness runs. The +# build script (scripts/harness/build_questions.py) compiles +# it to JSON (tests/harness/questions.json) at build time. +# Tests verify the schema + the count. +# +# The five question types come from +# docs/07-reasoning-harness.md §"The five question types": +# +# 1. Identity & description — "Who is X?" +# 2. Time-bounded fact check — "Was X true at time T?" +# 3. World state at a time — "What was X like at T?" +# 4. Causal / chain reasoning — "Why did X happen?" +# 5. Open-ended narrative — "Tell me about X." 
+# +# Each question has: +# - id: stable id within the question type +# - query: the user-facing prompt +# - expected_tools: the canonical tool sequence (per the +# design doc; a passing LLM picks this sequence ±1 tool) +# - expected_answer_shape: a JSON-Schema-ish description +# used by the harness grader to validate the response +# shape (not the prose) +# - expected_citations: minimum number of distinct sources +# the answer should cite (AC 7.4) +# +# The questions target the Mardonari codex (the slice 0 +# fixture). Each question references entities the codex +# has — so the harness can run end-to-end against the +# real graph. + +version: 1 + +questions: + + # ===================================================================== + # Type 1: Identity & description + # ===================================================================== + + - id: t1.01 + type: identity + query: "Who is Roland Raventhorne?" + expected_tools: + - lookup + - entity_context + - significance_of + expected_answer_shape: + has_keys: ["entity", "summary"] + expected_citations: 1 + + - id: t1.02 + type: identity + query: "Tell me about House Raventhorne." + expected_tools: + - lookup + - entity_context + expected_answer_shape: + has_keys: ["entity", "summary"] + expected_citations: 1 + + - id: t1.03 + type: identity + query: "What is Mardsville?" + expected_tools: + - lookup + - entity_context + expected_answer_shape: + has_keys: ["entity", "summary"] + expected_citations: 1 + + - id: t1.04 + type: identity + query: "Describe the Crimson Pact." + expected_tools: + - lookup + - entity_context + - significance_of + expected_answer_shape: + has_keys: ["entity", "summary"] + expected_citations: 1 + + - id: t1.05 + type: identity + query: "Who was Aldric of Valdorn?" + expected_tools: + - lookup + - entity_context + expected_answer_shape: + has_keys: ["entity", "summary"] + expected_citations: 1 + + - id: t1.06 + type: identity + query: "Tell me about the Wheel & Kiln." 
+ expected_tools: + - lookup + - entity_context + expected_answer_shape: + has_keys: ["entity", "summary"] + expected_citations: 1 + + - id: t1.07 + type: identity + query: "What kind of place is the Underdark?" + expected_tools: + - lookup + - entity_context + expected_answer_shape: + has_keys: ["entity", "summary"] + expected_citations: 1 + + - id: t1.08 + type: identity + query: "Who is Voldramir's keeper?" + expected_tools: + - lookup + - entity_context + - significance_of + expected_answer_shape: + has_keys: ["entity", "summary"] + expected_citations: 1 + + - id: t1.09 + type: identity + query: "Describe the Mardonari Material Plane." + expected_tools: + - lookup + - entity_context + expected_answer_shape: + has_keys: ["entity", "summary"] + expected_citations: 1 + + - id: t1.10 + type: identity + query: "What is House Vyr?" + expected_tools: + - lookup + - entity_context + - significance_of + expected_answer_shape: + has_keys: ["entity", "summary"] + expected_citations: 1 + + # ===================================================================== + # Type 2: Time-bounded fact check + # ===================================================================== + + - id: t2.01 + type: time_fact + query: "Was Roland Raventhorne a member of House Raventhorne in 3rd_age.year_345?" + expected_tools: + - was_true_at + expected_answer_shape: + has_keys: ["was_true", "at_time"] + expected_citations: 1 + + - id: t2.02 + type: time_fact + query: "Did Aldric rule Valdorn during the 3rd Age?" + expected_tools: + - lookup + - was_true_at + expected_answer_shape: + has_keys: ["was_true", "at_time"] + expected_citations: 1 + + - id: t2.03 + type: time_fact + query: "Were House Vyr and the Crimson Pact allied in year 340 TA?" + expected_tools: + - lookup + - was_true_at + expected_answer_shape: + has_keys: ["was_true", "at_time"] + expected_citations: 1 + + - id: t2.04 + type: time_fact + query: "Was the Long Winter caused by the Sundering?" 
+ expected_tools: + - lookup + - was_true_at + expected_answer_shape: + has_keys: ["was_true", "at_time"] + expected_citations: 1 + + - id: t2.05 + type: time_fact + query: "Did the Battle of Black Spire happen in 342 TA?" + expected_tools: + - lookup + - was_true_at + expected_answer_shape: + has_keys: ["was_true", "at_time"] + expected_citations: 1 + + - id: t2.06 + type: time_fact + query: "Was Voldramir accessible during the Sundering?" + expected_tools: + - lookup + - was_true_at + expected_answer_shape: + has_keys: ["was_true", "at_time"] + expected_citations: 1 + + - id: t2.07 + type: time_fact + query: "Were the Mardonari and Valdorni at war in year 360 TA?" + expected_tools: + - lookup + - was_true_at + expected_answer_shape: + has_keys: ["was_true", "at_time"] + expected_citations: 1 + + - id: t2.08 + type: time_fact + query: "Was Roland alive in the 4th Age?" + expected_tools: + - lookup + - was_true_at + expected_answer_shape: + has_keys: ["was_true", "at_time"] + expected_citations: 1 + + - id: t2.09 + type: time_fact + query: "Did Aldric sit on the Wheel & Kiln council in 3rd_age.year_300?" + expected_tools: + - lookup + - was_true_at + expected_answer_shape: + has_keys: ["was_true", "at_time"] + expected_citations: 1 + + - id: t2.10 + type: time_fact + query: "Was the Crimson Pact founded before the Sundering?" + expected_tools: + - lookup + - was_true_at + expected_answer_shape: + has_keys: ["was_true", "at_time"] + expected_citations: 1 + + # ===================================================================== + # Type 3: World state at a time + # ===================================================================== + + - id: t3.01 + type: world_state + query: "What was happening in Valdorn in 340 TA?" + expected_tools: + - entities_present + - events_during + expected_answer_shape: + has_keys: ["entities", "events"] + expected_citations: 2 + + - id: t3.02 + type: world_state + query: "Who was in Mardsville during the Border Wars?" 
+ expected_tools: + - lookup + - entities_present + - events_during + expected_answer_shape: + has_keys: ["entities", "events"] + expected_citations: 2 + + - id: t3.03 + type: world_state + query: "What factions were active in Mardonari in year 380 TA?" + expected_tools: + - lookup + - entities_present + - events_during + expected_answer_shape: + has_keys: ["entities", "events"] + expected_citations: 2 + + - id: t3.04 + type: world_state + query: "Who lived in the Underdark in the 3rd Age?" + expected_tools: + - lookup + - entities_present + expected_answer_shape: + has_keys: ["entities"] + expected_citations: 1 + + - id: t3.05 + type: world_state + query: "What events occurred in the 4th Age?" + expected_tools: + - events_during + expected_answer_shape: + has_keys: ["events"] + expected_citations: 1 + + - id: t3.06 + type: world_state + query: "Who was on the Wheel & Kiln council in year 320 TA?" + expected_tools: + - lookup + - entities_present + expected_answer_shape: + has_keys: ["entities"] + expected_citations: 1 + + - id: t3.07 + type: world_state + query: "What was happening at the Crimson Pact headquarters in 350 TA?" + expected_tools: + - lookup + - entities_present + - events_during + expected_answer_shape: + has_keys: ["entities", "events"] + expected_citations: 2 + + - id: t3.08 + type: world_state + query: "Who was alive in the Mardonari region during the Sundering?" + expected_tools: + - entities_present + - events_during + expected_answer_shape: + has_keys: ["entities", "events"] + expected_citations: 2 + + - id: t3.09 + type: world_state + query: "What was the state of Voldramir in the 4th Age?" + expected_tools: + - lookup + - entities_present + expected_answer_shape: + has_keys: ["entities"] + expected_citations: 1 + + - id: t3.10 + type: world_state + query: "Who was ruling Valdorn at the start of the 3rd Age?" 
+ expected_tools: + - lookup + - entities_present + expected_answer_shape: + has_keys: ["entities"] + expected_citations: 1 + + # ===================================================================== + # Type 4: Causal / chain reasoning + # ===================================================================== + + - id: t4.01 + type: causal + query: "Why did the Sundering happen?" + expected_tools: + - lookup + - event_chain + expected_answer_shape: + has_keys: ["causes", "effects"] + expected_citations: 2 + + - id: t4.02 + type: causal + query: "What were the consequences of the Battle of Black Spire?" + expected_tools: + - lookup + - event_chain + expected_answer_shape: + has_keys: ["causes", "effects"] + expected_citations: 2 + + - id: t4.03 + type: causal + query: "How did Aldric come to power?" + expected_tools: + - lookup + - event_chain + - ancestors_of + expected_answer_shape: + has_keys: ["causes", "ancestors"] + expected_citations: 2 + + - id: t4.04 + type: causal + query: "Why was the Crimson Pact founded?" + expected_tools: + - lookup + - event_chain + expected_answer_shape: + has_keys: ["causes", "effects"] + expected_citations: 2 + + - id: t4.05 + type: causal + query: "What led to the Long Winter?" + expected_tools: + - lookup + - event_chain + expected_answer_shape: + has_keys: ["causes", "effects"] + expected_citations: 2 + + - id: t4.06 + type: causal + query: "How did House Vyr fall?" + expected_tools: + - lookup + - event_chain + expected_answer_shape: + has_keys: ["causes", "effects"] + expected_citations: 2 + + - id: t4.07 + type: causal + query: "What caused the Border Wars?" + expected_tools: + - lookup + - event_chain + expected_answer_shape: + has_keys: ["causes", "effects"] + expected_citations: 2 + + - id: t4.08 + type: causal + query: "How did the Wheel & Kiln come to be the Mardonari council?" 
+ expected_tools: + - lookup + - event_chain + expected_answer_shape: + has_keys: ["causes", "effects"] + expected_citations: 2 + + - id: t4.09 + type: causal + query: "Why did the Mardonari and Valdorni go to war?" + expected_tools: + - lookup + - event_chain + expected_answer_shape: + has_keys: ["causes", "effects"] + expected_citations: 2 + + - id: t4.10 + type: causal + query: "How did Voldramir become a prison plane?" + expected_tools: + - lookup + - event_chain + expected_answer_shape: + has_keys: ["causes", "effects"] + expected_citations: 2 + + # ===================================================================== + # Type 5: Open-ended narrative + # ===================================================================== + + - id: t5.01 + type: narrative + query: "Tell me about the Border Wars." + expected_tools: + - lookup + - event_chain + - narrate_arc + expected_answer_shape: + has_keys: ["narrative"] + expected_citations: 3 + + - id: t5.02 + type: narrative + query: "What was House Vyr like at its height?" + expected_tools: + - lookup + - entity_context + - narrate_arc + expected_answer_shape: + has_keys: ["narrative"] + expected_citations: 3 + + - id: t5.03 + type: narrative + query: "Describe the 3rd Age." + expected_tools: + - events_during + - narrate_arc + expected_answer_shape: + has_keys: ["narrative"] + expected_citations: 3 + + - id: t5.04 + type: narrative + query: "What was life like in Mardsville in the late 3rd Age?" + expected_tools: + - lookup + - entities_present + - narrate_arc + expected_answer_shape: + has_keys: ["narrative"] + expected_citations: 3 + + - id: t5.05 + type: narrative + query: "Describe the fall of the Crimson Pact." + expected_tools: + - lookup + - event_chain + - narrate_arc + expected_answer_shape: + has_keys: ["narrative"] + expected_citations: 3 + + - id: t5.06 + type: narrative + query: "Tell me about the cosmology of Mardonari." 
+ expected_tools: + - lookup + - entity_context + - narrate_arc + expected_answer_shape: + has_keys: ["narrative"] + expected_citations: 3 + + - id: t5.07 + type: narrative + query: "What is the lore around Voldramir?" + expected_tools: + - lookup + - entity_context + - narrate_arc + expected_answer_shape: + has_keys: ["narrative"] + expected_citations: 3 + + - id: t5.08 + type: narrative + query: "Describe Aldric's reign." + expected_tools: + - lookup + - event_chain + - narrate_arc + expected_answer_shape: + has_keys: ["narrative"] + expected_citations: 3 + + - id: t5.09 + type: narrative + query: "What was happening in the Underdark during the Sundering?" + expected_tools: + - lookup + - entities_present + - events_during + - narrate_arc + expected_answer_shape: + has_keys: ["narrative"] + expected_citations: 3 + + - id: t5.10 + type: narrative + query: "Tell me about the founding of Mardonari." + expected_tools: + - lookup + - event_chain + - narrate_arc + expected_answer_shape: + has_keys: ["narrative"] + expected_citations: 3 diff --git a/tests/harness/test_questions.py b/tests/harness/test_questions.py new file mode 100644 index 0000000..e3643e8 --- /dev/null +++ b/tests/harness/test_questions.py @@ -0,0 +1,155 @@ +"""Slice 7.1 — Reasoning harness test set. + +Per ``docs/plan/exec/07-harness.md`` sub-slice 7.1, the +50 questions are authored in ``tests/harness/questions.yaml`` +and compiled to ``tests/harness/questions.json`` by +``scripts/harness/build_questions.py``. These tests pin the +contract: + + - The YAML compiles cleanly to JSON. + - The JSON has exactly 50 questions. + - Each of the 5 question types has exactly 10 questions + (per ``docs/07-reasoning-harness.md`` §"The five + question types"). + - Every question has the required keys (id, type, query, + expected_tools, expected_answer_shape, + expected_citations). + - The question ids are unique. 
+  - The version is set (a contract with the runner:
+    results are version-stamped so old runs stay
+    comparable when the prompt iterates).
+
+The tests run offline (no API key) — they're Track A.
+Track B (executing the harness against the live LLM) is a
+follow-up that depends on ``$OLLAMA_API_KEY``.
+"""
+
+from __future__ import annotations
+
+import functools
+import json
+import subprocess
+import sys
+from collections import Counter
+from pathlib import Path
+
+# This file lives at tests/harness/test_questions.py, so three
+# ``parent`` hops from the resolved path reach the repo root.
+ROOT = Path(__file__).resolve().parent.parent.parent
+YAML_PATH = ROOT / "tests" / "harness" / "questions.yaml"
+JSON_PATH = ROOT / "tests" / "harness" / "questions.json"
+BUILD_SCRIPT = ROOT / "scripts" / "harness" / "build_questions.py"
+
+# The five question types; each must appear exactly 10 times.
+QUESTION_TYPES = (
+    "identity",
+    "time_fact",
+    "world_state",
+    "causal",
+    "narrative",
+)
+
+# Keys every compiled question must carry.
+REQUIRED_KEYS = (
+    "id",
+    "type",
+    "query",
+    "expected_tools",
+    "expected_answer_shape",
+    "expected_citations",
+)
+
+
+@functools.lru_cache(maxsize=1)
+def _load_compiled() -> dict:
+    """Rebuild ``questions.json`` from the YAML and return it parsed.
+
+    Runs ``BUILD_SCRIPT`` with the current interpreter
+    (``--yaml YAML_PATH --out JSON_PATH``, cwd = ``ROOT``) and
+    then parses ``JSON_PATH`` as UTF-8 JSON.
+
+    The result is memoized: all six tests call this helper,
+    and without the cache each call would spawn its own
+    compiler subprocess. With it the cost really is one
+    subprocess per test session. A failed build is not
+    cached (``lru_cache`` does not store exceptions), so
+    every test still reports the failure.
+
+    Callers share one dict and must treat it as read-only.
+
+    NOTE(review): the build's ``--out`` is the committed
+    ``JSON_PATH`` itself, so a committed JSON that lags the
+    YAML is presumably overwritten in place rather than
+    reported as a failure — confirm against the build script
+    and check the working tree after a run.
+
+    Raises:
+        subprocess.CalledProcessError: the build script exited
+            non-zero (``check=True``).
+        json.JSONDecodeError: the written file is not valid JSON.
+    """
+    subprocess.run(
+        [sys.executable, str(BUILD_SCRIPT),
+         "--yaml", str(YAML_PATH), "--out", str(JSON_PATH)],
+        check=True, cwd=str(ROOT),
+    )
+    return json.loads(JSON_PATH.read_text(encoding="utf-8"))
+
+
+def test_7_1_questions_match_schema() -> None:
+    """AC 7.1 — the YAML compiles to a JSON that has
+    ``version``, ``type_counts``, and ``questions`` keys,
+    and every question has the required fields.
+    """
+    compiled = _load_compiled()
+    for key in ("version", "type_counts", "questions"):
+        assert key in compiled, f"missing top-level key {key!r}"
+
+    for q in compiled["questions"]:
+        for k in REQUIRED_KEYS:
+            assert k in q, f"question {q.get('id')!r} missing {k!r}"
+
+
+def test_7_1_50_questions_total() -> None:
+    """AC 7.1 — 5 question types × 10 = 50 total. The
+    50-question count is the harness's contract.
+    """
+    compiled = _load_compiled()
+    assert len(compiled["questions"]) == 50, (
+        f"expected 50 questions, got {len(compiled['questions'])}"
+    )
+
+
+def test_7_1_10_per_type() -> None:
+    """AC 7.1 — each of the 5 question types is
+    represented by exactly 10 questions.
+
+    Checks both the compiler-emitted ``type_counts`` and a
+    fresh recount of the ``questions`` list, so a wrong
+    summary cannot hide a wrong question set. Types outside
+    ``QUESTION_TYPES`` are rejected.
+    """
+    compiled = _load_compiled()
+    type_counts = compiled["type_counts"]
+    recounted = Counter(q.get("type") for q in compiled["questions"])
+    for t in QUESTION_TYPES:
+        assert type_counts.get(t, 0) == 10, (
+            f"type {t!r}: expected 10 questions, "
+            f"got {type_counts.get(t, 0)}"
+        )
+        assert recounted[t] == 10, (
+            f"type {t!r}: questions list holds {recounted[t]} "
+            f"entries, expected 10"
+        )
+
+    unexpected = sorted(
+        str(t) for t in recounted if t not in QUESTION_TYPES
+    )
+    assert not unexpected, f"unknown question types: {unexpected}"
+
+
+def test_7_1_every_question_has_expected_tools() -> None:
+    """AC 7.1 — every question has a non-empty
+    ``expected_tools`` list. The harness grades the
+    LLM's tool sequence against this expectation; an
+    empty list is meaningless.
+    """
+    compiled = _load_compiled()
+    for q in compiled["questions"]:
+        tools = q.get("expected_tools") or []
+        # A bare string would satisfy the two checks below
+        # (non-zero length, every character a non-empty str),
+        # so the container type is pinned first.
+        assert isinstance(tools, list), (
+            f"question {q.get('id')!r} expected_tools is not a "
+            f"list: {tools!r}"
+        )
+        assert len(tools) > 0, (
+            f"question {q.get('id')!r} has empty expected_tools"
+        )
+        assert all(isinstance(t, str) and t for t in tools), (
+            f"question {q.get('id')!r} has non-string tool name in "
+            f"{tools!r}"
+        )
+
+
+def test_7_1_question_ids_are_unique() -> None:
+    """Defensive: duplicate ids would silently overwrite
+    each other in the runner's results table.
+    """
+    compiled = _load_compiled()
+    seen: set[str] = set()
+    duplicates: list[str] = []
+    for q in compiled["questions"]:
+        qid = q.get("id")
+        if qid in seen:
+            duplicates.append(qid)
+        seen.add(qid)
+    assert not duplicates, f"duplicate question ids: {duplicates}"
+
+
+def test_7_1_version_is_set() -> None:
+    """The version field is the contract with the runner
+    that "old results stay comparable when the prompt
+    iterates" (per the exec roadmap's D3). A missing
+    version is a bug.
+    """
+    compiled = _load_compiled()
+    version = compiled.get("version")
+    assert version, "version field is empty or missing"
+    # The version is a positive integer (or semver-ish
+    # string); the runner treats it as opaque, but the
+    # type is pinned to catch accidental renames. ``bool``
+    # is excluded explicitly because it subclasses ``int``,
+    # so ``version: true`` would otherwise pass.
+    assert isinstance(version, (int, str)) and not isinstance(
+        version, bool
+    ), f"version must be an int or str, got {type(version).__name__}"