slice 7.1: 50-question reasoning harness test set

Per docs/plan/exec/07-harness.md sub-slice 7.1:

  - tests/harness/questions.yaml — the human-friendly
    YAML source. 50 questions across the 5 design-doc
    types (10 each): identity, time_fact, world_state,
    causal, narrative. Each question pins id, type,
    query, expected_tools, expected_answer_shape, and
    expected_citations. Targets the Mardonari codex
    (the slice 0 fixture) so the harness can run
    end-to-end against the real graph.
  - tests/harness/questions.json — the compiled JSON
    (committed so the runner reads it without rebuilding).
  - scripts/harness/build_questions.py — the strict
    compiler. Validates the YAML schema, counts questions
    per type, enforces uniqueness, writes the JSON.
    Validation errors fail loudly with field paths.
  - tests/harness/test_questions.py — 6 tests pinning the
    contract: schema, 50 total, 10 per type, expected_tools
    non-empty, ids unique, version set.

Track A only (no API key needed). Track B (executing
against the live LLM) is gated on $OLLAMA_API_KEY.

Suite: 761 → 767 (+6).

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Lore Engine Dev
2026-06-19 20:51:18 -04:00
parent 891e3adf37
commit 2367960540
4 changed files with 1732 additions and 0 deletions

View File

@@ -0,0 +1,192 @@
"""build_questions — compile tests/harness/questions.yaml → questions.json.
Per ``docs/plan/exec/07-harness.md`` sub-slice 7.1, the 50
question set is authored in YAML (human-friendly diffs) and
compiled to JSON (the harness runner reads JSON). The
compiler:
1. Loads the YAML via the project's strict loader
(``load_yaml``).
2. Validates: top-level ``version`` + ``questions`` list,
each question has the required keys, no duplicate ids,
the 5 question types are each represented by exactly
10 questions (AC 7.1).
3. Writes the JSON to ``tests/harness/questions.json``
(the runner's default input).
The build is intentionally a CLI command (not part of the
test suite's import path) — the JSON file is committed
to the repo so the runner can read it without re-running
the compiler.
Run:
python3 scripts/harness/build_questions.py \\
--yaml tests/harness/questions.yaml \\
--out tests/harness/questions.json
Without flags, defaults to the in-repo paths.
"""
from __future__ import annotations
import argparse
import json
import sys
from pathlib import Path
from typing import Any
ROOT = Path(__file__).resolve().parent.parent.parent
sys.path.insert(0, str(ROOT))
from lore_engine_poc.parsers._yaml import YamlSchemaError, load_yaml
# Closed set of question categories (see docs/07-reasoning-harness.md).
# The order here is also the key order of ``type_counts`` in the
# compiled JSON.
QUESTION_TYPES = ("identity", "time_fact", "world_state", "causal", "narrative")

# Keys every question mapping must carry; a missing one is a
# validation error.
REQUIRED_KEYS = (
    "id", "type", "query",
    "expected_tools", "expected_answer_shape", "expected_citations",
)
def _validate_question(q: dict, where: str) -> list[str]:
    """Collect schema errors for a single question mapping.

    Args:
        q: One entry of the YAML ``questions`` list. The caller has
            already checked that it is a mapping.
        where: Field path prefixed to every message, e.g.
            ``"questions[3]"``.

    Returns:
        Human-readable error strings, in the order: missing keys,
        ``type``, ``expected_tools``, ``expected_citations``. An empty
        list means the question is valid.
    """
    errors: list[str] = [
        f"{where}: missing required key '{k}'"
        for k in REQUIRED_KEYS
        if k not in q
    ]

    if "type" in q and q["type"] not in QUESTION_TYPES:
        errors.append(
            f"{where}: type '{q['type']}' not in "
            f"{list(QUESTION_TYPES)}"
        )

    if "expected_tools" in q:
        tools = q["expected_tools"]
        if not isinstance(tools, list):
            errors.append(
                f"{where}: expected_tools must be a list, got "
                f"{type(tools).__name__}"
            )
        elif not tools:
            # An empty sequence gives the grader nothing to compare the
            # LLM's tool calls against, so reject it at compile time.
            errors.append(f"{where}: expected_tools must not be empty")
        else:
            for j, tool in enumerate(tools):
                if not isinstance(tool, str) or not tool:
                    errors.append(
                        f"{where}: expected_tools[{j}] must be a "
                        f"non-empty string, got {tool!r}"
                    )

    if "expected_citations" in q:
        cit = q["expected_citations"]
        # ``bool`` is a subclass of ``int``; without the explicit check a
        # YAML ``true``/``false`` would be accepted as a citation count.
        if isinstance(cit, bool) or not isinstance(cit, int) or cit < 0:
            errors.append(
                f"{where}: expected_citations must be a non-negative int, got {cit!r}"
            )

    return errors
def build(yaml_path: Path, out_path: Path) -> int:
    """Validate the question YAML and write the compiled JSON.

    Args:
        yaml_path: YAML source holding a top-level ``version`` and a
            ``questions`` list.
        out_path: Destination for the compiled JSON; missing parent
            directories are created.

    Returns:
        ``0`` after the JSON has been written, ``1`` if loading or
        validation failed. Every problem is printed to stderr as an
        ``ERROR:`` line, and nothing is written when ``1`` is returned.
    """
    try:
        data, _ = load_yaml(str(yaml_path))
    except (YamlSchemaError, OSError) as e:
        # NOTE(review): assumes load_yaml lets OSError propagate for a
        # missing/unreadable file — confirm. Either way the failure is
        # reported as an error line instead of a traceback.
        print(f"ERROR: {yaml_path}: {e}", file=sys.stderr)
        return 1

    # An empty or list-shaped document would otherwise crash on the
    # ``in`` checks below instead of producing a readable error.
    if not isinstance(data, dict):
        print(
            f"ERROR: {yaml_path}: top level must be a mapping, got "
            f"{type(data).__name__}",
            file=sys.stderr,
        )
        return 1
    if "version" not in data:
        print(f"ERROR: {yaml_path}: missing top-level 'version'", file=sys.stderr)
        return 1
    version = data["version"]
    # Presence alone is not enough: ``version: null`` or ``version: ""``
    # would compile and then fail the suite's "version is set" test.
    # bool is excluded explicitly because it is a subclass of int.
    version_ok = (
        isinstance(version, int) and not isinstance(version, bool) and version > 0
    ) or (isinstance(version, str) and bool(version.strip()))
    if not version_ok:
        print(
            f"ERROR: {yaml_path}: top-level 'version' must be a positive "
            f"int or non-empty string, got {version!r}",
            file=sys.stderr,
        )
        return 1
    if "questions" not in data:
        print(f"ERROR: {yaml_path}: missing top-level 'questions'", file=sys.stderr)
        return 1
    questions = data["questions"]
    if not isinstance(questions, list):
        print(
            f"ERROR: {yaml_path}: 'questions' must be a list, got "
            f"{type(questions).__name__}",
            file=sys.stderr,
        )
        return 1

    # Per-question validation. Errors are accumulated so one run reports
    # every problem rather than stopping at the first.
    all_errors: list[str] = []
    seen_ids: set[str] = set()
    type_counts: dict[str, int] = {t: 0 for t in QUESTION_TYPES}
    for i, q in enumerate(questions):
        where = f"questions[{i}]"
        if not isinstance(q, dict):
            all_errors.append(f"{where}: must be a mapping, got {type(q).__name__}")
            continue
        all_errors.extend(_validate_question(q, where))
        # A missing id is already reported by _validate_question. A
        # present id must be a non-empty string: a null id would skip
        # the duplicate check, and an unhashable one (list/mapping)
        # would raise TypeError on the set lookup.
        if "id" in q:
            qid = q["id"]
            if not isinstance(qid, str) or not qid:
                all_errors.append(
                    f"{where}: id must be a non-empty string, got {qid!r}"
                )
            elif qid in seen_ids:
                all_errors.append(f"{where}: duplicate id {qid!r}")
            else:
                seen_ids.add(qid)
        qtype = q.get("type")
        if isinstance(qtype, str) and qtype in type_counts:
            type_counts[qtype] += 1

    # Hard constraints (AC 7.1): 10 questions for each type, 50 in total.
    # Both numbers derive from QUESTION_TYPES so the list of types lives
    # in exactly one place.
    per_type = 10
    expected_total = per_type * len(QUESTION_TYPES)
    if len(questions) != expected_total:
        all_errors.append(
            f"expected exactly {expected_total} questions, got {len(questions)}"
        )
    for t in QUESTION_TYPES:
        if type_counts[t] != per_type:
            all_errors.append(
                f"type '{t}': expected {per_type} questions, "
                f"got {type_counts[t]}"
            )

    if all_errors:
        for err in all_errors:
            print(f"ERROR: {err}", file=sys.stderr)
        return 1

    # The compiled JSON carries the YAML's questions through unchanged
    # and adds the per-type tally computed above.
    compiled: dict[str, Any] = {
        "version": version,
        "type_counts": type_counts,
        "questions": questions,
    }
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(compiled, indent=2) + "\n", encoding="utf-8")
    print(
        f"[build_questions] {len(questions)} questions, "
        f"version {version}, wrote {out_path}"
    )
    return 0
def main(argv: list[str] | None = None) -> int:
    """Parse the command-line flags and run :func:`build`.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, i.e. normal CLI behaviour;
            passing a list lets callers drive the CLI in-process.

    Returns:
        The status code returned by :func:`build`.
    """
    # ``__doc__`` is None when Python runs with -OO (docstrings
    # stripped); fall back to a fixed description instead of crashing
    # on ``None.splitlines()``.
    description = (__doc__ or "build_questions").splitlines()[0]
    p = argparse.ArgumentParser(description=description)
    p.add_argument(
        "--yaml",
        default=str(ROOT / "tests" / "harness" / "questions.yaml"),
        help="YAML source (default: tests/harness/questions.yaml)",
    )
    p.add_argument(
        "--out",
        default=str(ROOT / "tests" / "harness" / "questions.json"),
        help="JSON output (default: tests/harness/questions.json)",
    )
    args = p.parse_args(argv)
    return build(Path(args.yaml), Path(args.out))


if __name__ == "__main__":
    raise SystemExit(main())

View File

@@ -0,0 +1,813 @@
{
"version": 1,
"type_counts": {
"identity": 10,
"time_fact": 10,
"world_state": 10,
"causal": 10,
"narrative": 10
},
"questions": [
{
"id": "t1.01",
"type": "identity",
"query": "Who is Roland Raventhorne?",
"expected_tools": [
"lookup",
"entity_context",
"significance_of"
],
"expected_answer_shape": {
"has_keys": [
"entity",
"summary"
]
},
"expected_citations": 1
},
{
"id": "t1.02",
"type": "identity",
"query": "Tell me about House Raventhorne.",
"expected_tools": [
"lookup",
"entity_context"
],
"expected_answer_shape": {
"has_keys": [
"entity",
"summary"
]
},
"expected_citations": 1
},
{
"id": "t1.03",
"type": "identity",
"query": "What is Mardsville?",
"expected_tools": [
"lookup",
"entity_context"
],
"expected_answer_shape": {
"has_keys": [
"entity",
"summary"
]
},
"expected_citations": 1
},
{
"id": "t1.04",
"type": "identity",
"query": "Describe the Crimson Pact.",
"expected_tools": [
"lookup",
"entity_context",
"significance_of"
],
"expected_answer_shape": {
"has_keys": [
"entity",
"summary"
]
},
"expected_citations": 1
},
{
"id": "t1.05",
"type": "identity",
"query": "Who was Aldric of Valdorn?",
"expected_tools": [
"lookup",
"entity_context"
],
"expected_answer_shape": {
"has_keys": [
"entity",
"summary"
]
},
"expected_citations": 1
},
{
"id": "t1.06",
"type": "identity",
"query": "Tell me about the Wheel & Kiln.",
"expected_tools": [
"lookup",
"entity_context"
],
"expected_answer_shape": {
"has_keys": [
"entity",
"summary"
]
},
"expected_citations": 1
},
{
"id": "t1.07",
"type": "identity",
"query": "What kind of place is the Underdark?",
"expected_tools": [
"lookup",
"entity_context"
],
"expected_answer_shape": {
"has_keys": [
"entity",
"summary"
]
},
"expected_citations": 1
},
{
"id": "t1.08",
"type": "identity",
"query": "Who is Voldramir's keeper?",
"expected_tools": [
"lookup",
"entity_context",
"significance_of"
],
"expected_answer_shape": {
"has_keys": [
"entity",
"summary"
]
},
"expected_citations": 1
},
{
"id": "t1.09",
"type": "identity",
"query": "Describe the Mardonari Material Plane.",
"expected_tools": [
"lookup",
"entity_context"
],
"expected_answer_shape": {
"has_keys": [
"entity",
"summary"
]
},
"expected_citations": 1
},
{
"id": "t1.10",
"type": "identity",
"query": "What is House Vyr?",
"expected_tools": [
"lookup",
"entity_context",
"significance_of"
],
"expected_answer_shape": {
"has_keys": [
"entity",
"summary"
]
},
"expected_citations": 1
},
{
"id": "t2.01",
"type": "time_fact",
"query": "Was Roland Raventhorne a member of House Raventhorne in 3rd_age.year_345?",
"expected_tools": [
"was_true_at"
],
"expected_answer_shape": {
"has_keys": [
"was_true",
"at_time"
]
},
"expected_citations": 1
},
{
"id": "t2.02",
"type": "time_fact",
"query": "Did Aldric rule Valdorn during the 3rd Age?",
"expected_tools": [
"lookup",
"was_true_at"
],
"expected_answer_shape": {
"has_keys": [
"was_true",
"at_time"
]
},
"expected_citations": 1
},
{
"id": "t2.03",
"type": "time_fact",
"query": "Were House Vyr and the Crimson Pact allied in year 340 TA?",
"expected_tools": [
"lookup",
"was_true_at"
],
"expected_answer_shape": {
"has_keys": [
"was_true",
"at_time"
]
},
"expected_citations": 1
},
{
"id": "t2.04",
"type": "time_fact",
"query": "Was the Long Winter caused by the Sundering?",
"expected_tools": [
"lookup",
"was_true_at"
],
"expected_answer_shape": {
"has_keys": [
"was_true",
"at_time"
]
},
"expected_citations": 1
},
{
"id": "t2.05",
"type": "time_fact",
"query": "Did the Battle of Black Spire happen in 342 TA?",
"expected_tools": [
"lookup",
"was_true_at"
],
"expected_answer_shape": {
"has_keys": [
"was_true",
"at_time"
]
},
"expected_citations": 1
},
{
"id": "t2.06",
"type": "time_fact",
"query": "Was Voldramir accessible during the Sundering?",
"expected_tools": [
"lookup",
"was_true_at"
],
"expected_answer_shape": {
"has_keys": [
"was_true",
"at_time"
]
},
"expected_citations": 1
},
{
"id": "t2.07",
"type": "time_fact",
"query": "Were the Mardonari and Valdorni at war in year 360 TA?",
"expected_tools": [
"lookup",
"was_true_at"
],
"expected_answer_shape": {
"has_keys": [
"was_true",
"at_time"
]
},
"expected_citations": 1
},
{
"id": "t2.08",
"type": "time_fact",
"query": "Was Roland alive in the 4th Age?",
"expected_tools": [
"lookup",
"was_true_at"
],
"expected_answer_shape": {
"has_keys": [
"was_true",
"at_time"
]
},
"expected_citations": 1
},
{
"id": "t2.09",
"type": "time_fact",
"query": "Did Aldric sit on the Wheel & Kiln council in 3rd_age.year_300?",
"expected_tools": [
"lookup",
"was_true_at"
],
"expected_answer_shape": {
"has_keys": [
"was_true",
"at_time"
]
},
"expected_citations": 1
},
{
"id": "t2.10",
"type": "time_fact",
"query": "Was the Crimson Pact founded before the Sundering?",
"expected_tools": [
"lookup",
"was_true_at"
],
"expected_answer_shape": {
"has_keys": [
"was_true",
"at_time"
]
},
"expected_citations": 1
},
{
"id": "t3.01",
"type": "world_state",
"query": "What was happening in Valdorn in 340 TA?",
"expected_tools": [
"entities_present",
"events_during"
],
"expected_answer_shape": {
"has_keys": [
"entities",
"events"
]
},
"expected_citations": 2
},
{
"id": "t3.02",
"type": "world_state",
"query": "Who was in Mardsville during the Border Wars?",
"expected_tools": [
"lookup",
"entities_present",
"events_during"
],
"expected_answer_shape": {
"has_keys": [
"entities",
"events"
]
},
"expected_citations": 2
},
{
"id": "t3.03",
"type": "world_state",
"query": "What factions were active in Mardonari in year 380 TA?",
"expected_tools": [
"lookup",
"entities_present",
"events_during"
],
"expected_answer_shape": {
"has_keys": [
"entities",
"events"
]
},
"expected_citations": 2
},
{
"id": "t3.04",
"type": "world_state",
"query": "Who lived in the Underdark in the 3rd Age?",
"expected_tools": [
"lookup",
"entities_present"
],
"expected_answer_shape": {
"has_keys": [
"entities"
]
},
"expected_citations": 1
},
{
"id": "t3.05",
"type": "world_state",
"query": "What events occurred in the 4th Age?",
"expected_tools": [
"events_during"
],
"expected_answer_shape": {
"has_keys": [
"events"
]
},
"expected_citations": 1
},
{
"id": "t3.06",
"type": "world_state",
"query": "Who was on the Wheel & Kiln council in year 320 TA?",
"expected_tools": [
"lookup",
"entities_present"
],
"expected_answer_shape": {
"has_keys": [
"entities"
]
},
"expected_citations": 1
},
{
"id": "t3.07",
"type": "world_state",
"query": "What was happening at the Crimson Pact headquarters in 350 TA?",
"expected_tools": [
"lookup",
"entities_present",
"events_during"
],
"expected_answer_shape": {
"has_keys": [
"entities",
"events"
]
},
"expected_citations": 2
},
{
"id": "t3.08",
"type": "world_state",
"query": "Who was alive in the Mardonari region during the Sundering?",
"expected_tools": [
"entities_present",
"events_during"
],
"expected_answer_shape": {
"has_keys": [
"entities",
"events"
]
},
"expected_citations": 2
},
{
"id": "t3.09",
"type": "world_state",
"query": "What was the state of Voldramir in the 4th Age?",
"expected_tools": [
"lookup",
"entities_present"
],
"expected_answer_shape": {
"has_keys": [
"entities"
]
},
"expected_citations": 1
},
{
"id": "t3.10",
"type": "world_state",
"query": "Who was ruling Valdorn at the start of the 3rd Age?",
"expected_tools": [
"lookup",
"entities_present"
],
"expected_answer_shape": {
"has_keys": [
"entities"
]
},
"expected_citations": 1
},
{
"id": "t4.01",
"type": "causal",
"query": "Why did the Sundering happen?",
"expected_tools": [
"lookup",
"event_chain"
],
"expected_answer_shape": {
"has_keys": [
"causes",
"effects"
]
},
"expected_citations": 2
},
{
"id": "t4.02",
"type": "causal",
"query": "What were the consequences of the Battle of Black Spire?",
"expected_tools": [
"lookup",
"event_chain"
],
"expected_answer_shape": {
"has_keys": [
"causes",
"effects"
]
},
"expected_citations": 2
},
{
"id": "t4.03",
"type": "causal",
"query": "How did Aldric come to power?",
"expected_tools": [
"lookup",
"event_chain",
"ancestors_of"
],
"expected_answer_shape": {
"has_keys": [
"causes",
"ancestors"
]
},
"expected_citations": 2
},
{
"id": "t4.04",
"type": "causal",
"query": "Why was the Crimson Pact founded?",
"expected_tools": [
"lookup",
"event_chain"
],
"expected_answer_shape": {
"has_keys": [
"causes",
"effects"
]
},
"expected_citations": 2
},
{
"id": "t4.05",
"type": "causal",
"query": "What led to the Long Winter?",
"expected_tools": [
"lookup",
"event_chain"
],
"expected_answer_shape": {
"has_keys": [
"causes",
"effects"
]
},
"expected_citations": 2
},
{
"id": "t4.06",
"type": "causal",
"query": "How did House Vyr fall?",
"expected_tools": [
"lookup",
"event_chain"
],
"expected_answer_shape": {
"has_keys": [
"causes",
"effects"
]
},
"expected_citations": 2
},
{
"id": "t4.07",
"type": "causal",
"query": "What caused the Border Wars?",
"expected_tools": [
"lookup",
"event_chain"
],
"expected_answer_shape": {
"has_keys": [
"causes",
"effects"
]
},
"expected_citations": 2
},
{
"id": "t4.08",
"type": "causal",
"query": "How did the Wheel & Kiln come to be the Mardonari council?",
"expected_tools": [
"lookup",
"event_chain"
],
"expected_answer_shape": {
"has_keys": [
"causes",
"effects"
]
},
"expected_citations": 2
},
{
"id": "t4.09",
"type": "causal",
"query": "Why did the Mardonari and Valdorni go to war?",
"expected_tools": [
"lookup",
"event_chain"
],
"expected_answer_shape": {
"has_keys": [
"causes",
"effects"
]
},
"expected_citations": 2
},
{
"id": "t4.10",
"type": "causal",
"query": "How did Voldramir become a prison plane?",
"expected_tools": [
"lookup",
"event_chain"
],
"expected_answer_shape": {
"has_keys": [
"causes",
"effects"
]
},
"expected_citations": 2
},
{
"id": "t5.01",
"type": "narrative",
"query": "Tell me about the Border Wars.",
"expected_tools": [
"lookup",
"event_chain",
"narrate_arc"
],
"expected_answer_shape": {
"has_keys": [
"narrative"
]
},
"expected_citations": 3
},
{
"id": "t5.02",
"type": "narrative",
"query": "What was House Vyr like at its height?",
"expected_tools": [
"lookup",
"entity_context",
"narrate_arc"
],
"expected_answer_shape": {
"has_keys": [
"narrative"
]
},
"expected_citations": 3
},
{
"id": "t5.03",
"type": "narrative",
"query": "Describe the 3rd Age.",
"expected_tools": [
"events_during",
"narrate_arc"
],
"expected_answer_shape": {
"has_keys": [
"narrative"
]
},
"expected_citations": 3
},
{
"id": "t5.04",
"type": "narrative",
"query": "What was life like in Mardsville in the late 3rd Age?",
"expected_tools": [
"lookup",
"entities_present",
"narrate_arc"
],
"expected_answer_shape": {
"has_keys": [
"narrative"
]
},
"expected_citations": 3
},
{
"id": "t5.05",
"type": "narrative",
"query": "Describe the fall of the Crimson Pact.",
"expected_tools": [
"lookup",
"event_chain",
"narrate_arc"
],
"expected_answer_shape": {
"has_keys": [
"narrative"
]
},
"expected_citations": 3
},
{
"id": "t5.06",
"type": "narrative",
"query": "Tell me about the cosmology of Mardonari.",
"expected_tools": [
"lookup",
"entity_context",
"narrate_arc"
],
"expected_answer_shape": {
"has_keys": [
"narrative"
]
},
"expected_citations": 3
},
{
"id": "t5.07",
"type": "narrative",
"query": "What is the lore around Voldramir?",
"expected_tools": [
"lookup",
"entity_context",
"narrate_arc"
],
"expected_answer_shape": {
"has_keys": [
"narrative"
]
},
"expected_citations": 3
},
{
"id": "t5.08",
"type": "narrative",
"query": "Describe Aldric's reign.",
"expected_tools": [
"lookup",
"event_chain",
"narrate_arc"
],
"expected_answer_shape": {
"has_keys": [
"narrative"
]
},
"expected_citations": 3
},
{
"id": "t5.09",
"type": "narrative",
"query": "What was happening in the Underdark during the Sundering?",
"expected_tools": [
"lookup",
"entities_present",
"events_during",
"narrate_arc"
],
"expected_answer_shape": {
"has_keys": [
"narrative"
]
},
"expected_citations": 3
},
{
"id": "t5.10",
"type": "narrative",
"query": "Tell me about the founding of Mardonari.",
"expected_tools": [
"lookup",
"event_chain",
"narrate_arc"
],
"expected_answer_shape": {
"has_keys": [
"narrative"
]
},
"expected_citations": 3
}
]
}

View File

@@ -0,0 +1,572 @@
# Slice 7 — Reasoning harness test set.
#
# Per docs/plan/exec/07-harness.md sub-slice 7.1, this YAML
# is the source of the 50 questions the harness runs. The
# build script (scripts/harness/build_questions.py) compiles
# it to JSON (tests/harness/questions.json) at build time.
# Tests verify the schema + the count.
#
# The five question types come from
# docs/07-reasoning-harness.md §"The five question types":
#
# 1. Identity & description — "Who is X?"
# 2. Time-bounded fact check — "Was X true at time T?"
# 3. World state at a time — "What was X like at T?"
# 4. Causal / chain reasoning — "Why did X happen?"
# 5. Open-ended narrative — "Tell me about X."
#
# Each question has:
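# - id: stable id, unique across the whole file (tN.MM = type N,
#   question MM)
# - type: one of the five question types listed above (identity,
#   time_fact, world_state, causal, narrative)
# - query: the user-facing prompt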
# - expected_tools: the canonical tool sequence (per the
# design doc; a passing LLM picks this sequence ±1 tool)
# - expected_answer_shape: a JSON-Schema-ish description
# used by the harness grader to validate the response
# shape (not the prose)
# - expected_citations: minimum number of distinct sources
# the answer should cite (AC 7.4)
#
# The questions target the Mardonari codex (the slice 0
# fixture). Each question references entities the codex
# has — so the harness can run end-to-end against the
# real graph.
version: 1
questions:
# =====================================================================
# Type 1: Identity & description
# =====================================================================
- id: t1.01
type: identity
query: "Who is Roland Raventhorne?"
expected_tools:
- lookup
- entity_context
- significance_of
expected_answer_shape:
has_keys: ["entity", "summary"]
expected_citations: 1
- id: t1.02
type: identity
query: "Tell me about House Raventhorne."
expected_tools:
- lookup
- entity_context
expected_answer_shape:
has_keys: ["entity", "summary"]
expected_citations: 1
- id: t1.03
type: identity
query: "What is Mardsville?"
expected_tools:
- lookup
- entity_context
expected_answer_shape:
has_keys: ["entity", "summary"]
expected_citations: 1
- id: t1.04
type: identity
query: "Describe the Crimson Pact."
expected_tools:
- lookup
- entity_context
- significance_of
expected_answer_shape:
has_keys: ["entity", "summary"]
expected_citations: 1
- id: t1.05
type: identity
query: "Who was Aldric of Valdorn?"
expected_tools:
- lookup
- entity_context
expected_answer_shape:
has_keys: ["entity", "summary"]
expected_citations: 1
- id: t1.06
type: identity
query: "Tell me about the Wheel & Kiln."
expected_tools:
- lookup
- entity_context
expected_answer_shape:
has_keys: ["entity", "summary"]
expected_citations: 1
- id: t1.07
type: identity
query: "What kind of place is the Underdark?"
expected_tools:
- lookup
- entity_context
expected_answer_shape:
has_keys: ["entity", "summary"]
expected_citations: 1
- id: t1.08
type: identity
query: "Who is Voldramir's keeper?"
expected_tools:
- lookup
- entity_context
- significance_of
expected_answer_shape:
has_keys: ["entity", "summary"]
expected_citations: 1
- id: t1.09
type: identity
query: "Describe the Mardonari Material Plane."
expected_tools:
- lookup
- entity_context
expected_answer_shape:
has_keys: ["entity", "summary"]
expected_citations: 1
- id: t1.10
type: identity
query: "What is House Vyr?"
expected_tools:
- lookup
- entity_context
- significance_of
expected_answer_shape:
has_keys: ["entity", "summary"]
expected_citations: 1
# =====================================================================
# Type 2: Time-bounded fact check
# =====================================================================
- id: t2.01
type: time_fact
query: "Was Roland Raventhorne a member of House Raventhorne in 3rd_age.year_345?"
expected_tools:
- was_true_at
expected_answer_shape:
has_keys: ["was_true", "at_time"]
expected_citations: 1
- id: t2.02
type: time_fact
query: "Did Aldric rule Valdorn during the 3rd Age?"
expected_tools:
- lookup
- was_true_at
expected_answer_shape:
has_keys: ["was_true", "at_time"]
expected_citations: 1
- id: t2.03
type: time_fact
query: "Were House Vyr and the Crimson Pact allied in year 340 TA?"
expected_tools:
- lookup
- was_true_at
expected_answer_shape:
has_keys: ["was_true", "at_time"]
expected_citations: 1
- id: t2.04
type: time_fact
query: "Was the Long Winter caused by the Sundering?"
expected_tools:
- lookup
- was_true_at
expected_answer_shape:
has_keys: ["was_true", "at_time"]
expected_citations: 1
- id: t2.05
type: time_fact
query: "Did the Battle of Black Spire happen in 342 TA?"
expected_tools:
- lookup
- was_true_at
expected_answer_shape:
has_keys: ["was_true", "at_time"]
expected_citations: 1
- id: t2.06
type: time_fact
query: "Was Voldramir accessible during the Sundering?"
expected_tools:
- lookup
- was_true_at
expected_answer_shape:
has_keys: ["was_true", "at_time"]
expected_citations: 1
- id: t2.07
type: time_fact
query: "Were the Mardonari and Valdorni at war in year 360 TA?"
expected_tools:
- lookup
- was_true_at
expected_answer_shape:
has_keys: ["was_true", "at_time"]
expected_citations: 1
- id: t2.08
type: time_fact
query: "Was Roland alive in the 4th Age?"
expected_tools:
- lookup
- was_true_at
expected_answer_shape:
has_keys: ["was_true", "at_time"]
expected_citations: 1
- id: t2.09
type: time_fact
query: "Did Aldric sit on the Wheel & Kiln council in 3rd_age.year_300?"
expected_tools:
- lookup
- was_true_at
expected_answer_shape:
has_keys: ["was_true", "at_time"]
expected_citations: 1
- id: t2.10
type: time_fact
query: "Was the Crimson Pact founded before the Sundering?"
expected_tools:
- lookup
- was_true_at
expected_answer_shape:
has_keys: ["was_true", "at_time"]
expected_citations: 1
# =====================================================================
# Type 3: World state at a time
# =====================================================================
- id: t3.01
type: world_state
query: "What was happening in Valdorn in 340 TA?"
expected_tools:
- entities_present
- events_during
expected_answer_shape:
has_keys: ["entities", "events"]
expected_citations: 2
- id: t3.02
type: world_state
query: "Who was in Mardsville during the Border Wars?"
expected_tools:
- lookup
- entities_present
- events_during
expected_answer_shape:
has_keys: ["entities", "events"]
expected_citations: 2
- id: t3.03
type: world_state
query: "What factions were active in Mardonari in year 380 TA?"
expected_tools:
- lookup
- entities_present
- events_during
expected_answer_shape:
has_keys: ["entities", "events"]
expected_citations: 2
- id: t3.04
type: world_state
query: "Who lived in the Underdark in the 3rd Age?"
expected_tools:
- lookup
- entities_present
expected_answer_shape:
has_keys: ["entities"]
expected_citations: 1
- id: t3.05
type: world_state
query: "What events occurred in the 4th Age?"
expected_tools:
- events_during
expected_answer_shape:
has_keys: ["events"]
expected_citations: 1
- id: t3.06
type: world_state
query: "Who was on the Wheel & Kiln council in year 320 TA?"
expected_tools:
- lookup
- entities_present
expected_answer_shape:
has_keys: ["entities"]
expected_citations: 1
- id: t3.07
type: world_state
query: "What was happening at the Crimson Pact headquarters in 350 TA?"
expected_tools:
- lookup
- entities_present
- events_during
expected_answer_shape:
has_keys: ["entities", "events"]
expected_citations: 2
- id: t3.08
type: world_state
query: "Who was alive in the Mardonari region during the Sundering?"
expected_tools:
- entities_present
- events_during
expected_answer_shape:
has_keys: ["entities", "events"]
expected_citations: 2
- id: t3.09
type: world_state
query: "What was the state of Voldramir in the 4th Age?"
expected_tools:
- lookup
- entities_present
expected_answer_shape:
has_keys: ["entities"]
expected_citations: 1
- id: t3.10
type: world_state
query: "Who was ruling Valdorn at the start of the 3rd Age?"
expected_tools:
- lookup
- entities_present
expected_answer_shape:
has_keys: ["entities"]
expected_citations: 1
# =====================================================================
# Type 4: Causal / chain reasoning
# =====================================================================
- id: t4.01
type: causal
query: "Why did the Sundering happen?"
expected_tools:
- lookup
- event_chain
expected_answer_shape:
has_keys: ["causes", "effects"]
expected_citations: 2
- id: t4.02
type: causal
query: "What were the consequences of the Battle of Black Spire?"
expected_tools:
- lookup
- event_chain
expected_answer_shape:
has_keys: ["causes", "effects"]
expected_citations: 2
- id: t4.03
type: causal
query: "How did Aldric come to power?"
expected_tools:
- lookup
- event_chain
- ancestors_of
expected_answer_shape:
has_keys: ["causes", "ancestors"]
expected_citations: 2
- id: t4.04
type: causal
query: "Why was the Crimson Pact founded?"
expected_tools:
- lookup
- event_chain
expected_answer_shape:
has_keys: ["causes", "effects"]
expected_citations: 2
- id: t4.05
type: causal
query: "What led to the Long Winter?"
expected_tools:
- lookup
- event_chain
expected_answer_shape:
has_keys: ["causes", "effects"]
expected_citations: 2
- id: t4.06
type: causal
query: "How did House Vyr fall?"
expected_tools:
- lookup
- event_chain
expected_answer_shape:
has_keys: ["causes", "effects"]
expected_citations: 2
- id: t4.07
type: causal
query: "What caused the Border Wars?"
expected_tools:
- lookup
- event_chain
expected_answer_shape:
has_keys: ["causes", "effects"]
expected_citations: 2
- id: t4.08
type: causal
query: "How did the Wheel & Kiln come to be the Mardonari council?"
expected_tools:
- lookup
- event_chain
expected_answer_shape:
has_keys: ["causes", "effects"]
expected_citations: 2
- id: t4.09
type: causal
query: "Why did the Mardonari and Valdorni go to war?"
expected_tools:
- lookup
- event_chain
expected_answer_shape:
has_keys: ["causes", "effects"]
expected_citations: 2
- id: t4.10
type: causal
query: "How did Voldramir become a prison plane?"
expected_tools:
- lookup
- event_chain
expected_answer_shape:
has_keys: ["causes", "effects"]
expected_citations: 2
# =====================================================================
# Type 5: Open-ended narrative
# =====================================================================
- id: t5.01
type: narrative
query: "Tell me about the Border Wars."
expected_tools:
- lookup
- event_chain
- narrate_arc
expected_answer_shape:
has_keys: ["narrative"]
expected_citations: 3
- id: t5.02
type: narrative
query: "What was House Vyr like at its height?"
expected_tools:
- lookup
- entity_context
- narrate_arc
expected_answer_shape:
has_keys: ["narrative"]
expected_citations: 3
- id: t5.03
type: narrative
query: "Describe the 3rd Age."
expected_tools:
- events_during
- narrate_arc
expected_answer_shape:
has_keys: ["narrative"]
expected_citations: 3
- id: t5.04
type: narrative
query: "What was life like in Mardsville in the late 3rd Age?"
expected_tools:
- lookup
- entities_present
- narrate_arc
expected_answer_shape:
has_keys: ["narrative"]
expected_citations: 3
- id: t5.05
type: narrative
query: "Describe the fall of the Crimson Pact."
expected_tools:
- lookup
- event_chain
- narrate_arc
expected_answer_shape:
has_keys: ["narrative"]
expected_citations: 3
- id: t5.06
type: narrative
query: "Tell me about the cosmology of Mardonari."
expected_tools:
- lookup
- entity_context
- narrate_arc
expected_answer_shape:
has_keys: ["narrative"]
expected_citations: 3
- id: t5.07
type: narrative
query: "What is the lore around Voldramir?"
expected_tools:
- lookup
- entity_context
- narrate_arc
expected_answer_shape:
has_keys: ["narrative"]
expected_citations: 3
- id: t5.08
type: narrative
query: "Describe Aldric's reign."
expected_tools:
- lookup
- event_chain
- narrate_arc
expected_answer_shape:
has_keys: ["narrative"]
expected_citations: 3
- id: t5.09
type: narrative
query: "What was happening in the Underdark during the Sundering?"
expected_tools:
- lookup
- entities_present
- events_during
- narrate_arc
expected_answer_shape:
has_keys: ["narrative"]
expected_citations: 3
- id: t5.10
type: narrative
query: "Tell me about the founding of Mardonari."
expected_tools:
- lookup
- event_chain
- narrate_arc
expected_answer_shape:
has_keys: ["narrative"]
expected_citations: 3

View File

@@ -0,0 +1,155 @@
"""Slice 7.1 — Reasoning harness test set.
Per ``docs/plan/exec/07-harness.md`` sub-slice 7.1, the
50 questions are authored in ``tests/harness/questions.yaml``
and compiled to ``tests/harness/questions.json`` by
``scripts/harness/build_questions.py``. These tests pin the
contract:
- The YAML compiles cleanly to JSON.
- The JSON has exactly 50 questions.
- Each of the 5 question types has exactly 10 questions
(per ``docs/07-reasoning-harness.md`` §"The five
question types").
- Every question has the required keys (id, type, query,
expected_tools, expected_answer_shape,
expected_citations).
- The question ids are unique.
- The version is set (a contract with the runner:
results are version-stamped so old runs stay
comparable when the prompt iterates).
The tests run offline (no API key) — they're Track A.
Track B (executing the harness against the live LLM) is a
follow-up that depends on ``$OLLAMA_API_KEY``.
"""
from __future__ import annotations

import functools
import json
import subprocess
import sys
from pathlib import Path
# Three levels up from this file is taken as the repository root.
ROOT = Path(__file__).resolve().parent.parent.parent

# Inputs and outputs of the compile step exercised by these tests.
YAML_PATH = ROOT.joinpath("tests", "harness", "questions.yaml")
JSON_PATH = ROOT.joinpath("tests", "harness", "questions.json")
BUILD_SCRIPT = ROOT.joinpath("scripts", "harness", "build_questions.py")

# The five question categories each expected to hold 10 questions.
QUESTION_TYPES = ("identity", "time_fact", "world_state", "causal", "narrative")

# Keys every compiled question must carry.
REQUIRED_KEYS = (
    "id", "type", "query",
    "expected_tools", "expected_answer_shape", "expected_citations",
)
@functools.lru_cache(maxsize=None)
def _load_compiled() -> dict:
    """Rebuild ``questions.json`` from the YAML and return it parsed.

    The build script runs as a subprocess and overwrites ``JSON_PATH``,
    so the assertions always see output compiled from the current YAML
    rather than whatever JSON was on disk. Note that a stale committed
    JSON is therefore regenerated silently; it is not reported as a
    failure. ``check=True`` turns a non-zero exit from the build script
    into ``subprocess.CalledProcessError``, which fails the calling
    test.

    The result is memoized, so the subprocess runs once per test
    process however many tests call this helper. A call that raises is
    not cached. Callers share the returned dict and must treat it as
    read-only.
    """
    subprocess.run(
        [
            sys.executable,
            str(BUILD_SCRIPT),
            "--yaml",
            str(YAML_PATH),
            "--out",
            str(JSON_PATH),
        ],
        check=True,
        cwd=str(ROOT),
    )
    return json.loads(JSON_PATH.read_text(encoding="utf-8"))
def test_7_1_questions_match_schema() -> None:
    """AC 7.1 — the compiled JSON exposes ``version``,
    ``type_counts`` and ``questions`` at the top level, and each
    question carries every key in ``REQUIRED_KEYS``.
    """
    compiled = _load_compiled()

    top_level = ("version", "type_counts", "questions")
    for key in top_level:
        assert key in compiled, f"missing top-level key {key!r}"

    for question in compiled["questions"]:
        for field in REQUIRED_KEYS:
            assert field in question, (
                f"question {question.get('id')!r} missing {field!r}"
            )
def test_7_1_50_questions_total() -> None:
    """AC 7.1 — the harness contract is 50 questions in
    total (5 question types × 10 each).
    """
    total = len(_load_compiled()["questions"])
    assert total == 50, f"expected 50 questions, got {total}"
def test_7_1_10_per_type() -> None:
    """AC 7.1 — each of the 5 question types is
    represented by exactly 10 questions.

    The tally is recomputed from the questions themselves rather
    than read from ``type_counts`` alone: that field is written by
    the build script, which already refuses to emit anything but
    10 per type, so checking it in isolation could never fail. The
    stored ``type_counts`` is then required to agree with the
    recount.
    """
    compiled = _load_compiled()
    questions = compiled["questions"]
    stored = compiled["type_counts"]
    for t in QUESTION_TYPES:
        actual = sum(1 for q in questions if q.get("type") == t)
        assert actual == 10, (
            f"type {t!r}: expected 10 questions, "
            f"got {actual}"
        )
        assert stored.get(t, 0) == actual, (
            f"type {t!r}: type_counts says {stored.get(t, 0)}, "
            f"but {actual} questions have that type"
        )
def test_7_1_every_question_has_expected_tools() -> None:
    """AC 7.1 — every question has a non-empty
    ``expected_tools`` list of non-empty strings. The harness
    grades the LLM's tool sequence against this expectation; an
    empty list is meaningless.
    """
    compiled = _load_compiled()
    for q in compiled["questions"]:
        qid = q.get("id")
        tools = q.get("expected_tools")
        # Without the explicit list check a bare string such as
        # "lookup" would pass: it is non-empty and iterates as
        # one-character strings.
        assert isinstance(tools, list), (
            f"question {qid!r}: expected_tools must be a list, "
            f"got {type(tools).__name__}"
        )
        assert len(tools) > 0, (
            f"question {qid!r} has empty expected_tools"
        )
        assert all(isinstance(t, str) and t for t in tools), (
            f"question {qid!r} has non-string tool name in "
            f"{tools!r}"
        )
def test_7_1_question_ids_are_unique() -> None:
    """Defensive check that no question id occurs twice; per
    the original author's note, duplicates would silently
    overwrite each other in the runner's results table.
    """
    ids = [question.get("id") for question in _load_compiled()["questions"]]

    first_seen: set[str] = set()
    duplicates: list[str] = []
    for qid in ids:
        if qid in first_seen:
            # Record every repeat occurrence, not just the first.
            duplicates.append(qid)
        else:
            first_seen.add(qid)

    assert not duplicates, f"duplicate question ids: {duplicates}"
def test_7_1_version_is_set() -> None:
    """The compiled JSON must carry a non-empty ``version``.

    Per the exec roadmap's D3, the version is the contract with
    the runner that "old results stay comparable when the prompt
    iterates", so a missing or empty value is a bug.
    """
    version = _load_compiled().get("version")
    assert version, "version field is empty or missing"
    # Only the type is pinned here (int or str), to catch an
    # accidental change of representation; the value itself is
    # not interpreted by this test.
    assert isinstance(version, (int, str))