slice 7.1: 50-question reasoning harness test set
Per docs/plan/exec/07-harness.md sub-slice 7.1:
- tests/harness/questions.yaml — the human-friendly
YAML source. 50 questions across the 5 design-doc
types (10 each): identity, time_fact, world_state,
causal, narrative. Each question pins id, type,
query, expected_tools, expected_answer_shape, and
expected_citations. Targets the Mardonari codex
(the slice 0 fixture) so the harness can run
end-to-end against the real graph.
- tests/harness/questions.json — the compiled JSON
(committed so the runner reads it without rebuilding).
- scripts/harness/build_questions.py — the strict
compiler. Validates the YAML schema, counts questions
per type, enforces uniqueness, writes the JSON.
Validation errors fail loudly with field paths.
- tests/harness/test_questions.py — 6 tests pinning the
contract: schema, 50 total, 10 per type, expected_tools
non-empty, ids unique, version set.
Track A only (no API key needed). Track B (executing
against the live LLM) is gated on $OLLAMA_API_KEY.
Suite: 761 → 767 (+6).
Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
192
scripts/harness/build_questions.py
Normal file
192
scripts/harness/build_questions.py
Normal file
@@ -0,0 +1,192 @@
|
||||
"""build_questions — compile tests/harness/questions.yaml → questions.json.
|
||||
|
||||
Per ``docs/plan/exec/07-harness.md`` sub-slice 7.1, the 50
|
||||
question set is authored in YAML (human-friendly diffs) and
|
||||
compiled to JSON (the harness runner reads JSON). The
|
||||
compiler:
|
||||
|
||||
1. Loads the YAML via the project's strict loader
|
||||
(``load_yaml``).
|
||||
2. Validates: top-level ``version`` + ``questions`` list,
|
||||
each question has the required keys, no duplicate ids,
|
||||
the 5 question types are each represented by exactly
|
||||
10 questions (AC 7.1).
|
||||
3. Writes the JSON to ``tests/harness/questions.json``
|
||||
(the runner's default input).
|
||||
|
||||
The build is intentionally a CLI command (not part of the
|
||||
test suite's import path) — the JSON file is committed
|
||||
to the repo so the runner can read it without re-running
|
||||
the compiler.
|
||||
|
||||
Run:
|
||||
|
||||
python3 scripts/harness/build_questions.py \\
|
||||
--yaml tests/harness/questions.yaml \\
|
||||
--out tests/harness/questions.json
|
||||
|
||||
Without flags, defaults to the in-repo paths.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
ROOT = Path(__file__).resolve().parent.parent.parent
|
||||
sys.path.insert(0, str(ROOT))
|
||||
|
||||
from lore_engine_poc.parsers._yaml import YamlSchemaError, load_yaml
|
||||
|
||||
|
||||
# The 5 question types, fixed by docs/07-reasoning-harness.md.
QUESTION_TYPES = (
    "identity",
    "time_fact",
    "world_state",
    "causal",
    "narrative",
)

# Per-question required keys.
REQUIRED_KEYS = (
    "id",
    "type",
    "query",
    "expected_tools",
    "expected_answer_shape",
    "expected_citations",
)


def _validate_question(q: dict, where: str) -> list[str]:
    """Return a list of human-readable errors for one question dict.

    Args:
        q: One entry of the top-level ``questions`` list.
        where: Field path used as the prefix of every message
            (e.g. ``"questions[3]"``).

    Returns:
        One message per problem found; an empty list means the
        question is valid. Checks performed: every key in
        ``REQUIRED_KEYS`` is present, ``type`` is one of
        ``QUESTION_TYPES``, ``expected_tools`` is a list, and
        ``expected_citations`` is a non-negative int.
    """
    errors: list[str] = []
    for k in REQUIRED_KEYS:
        if k not in q:
            errors.append(f"{where}: missing required key '{k}'")
    if "type" in q and q["type"] not in QUESTION_TYPES:
        errors.append(
            f"{where}: type '{q['type']}' not in "
            f"{list(QUESTION_TYPES)}"
        )
    if "expected_tools" in q and not isinstance(q["expected_tools"], list):
        errors.append(
            f"{where}: expected_tools must be a list, got "
            f"{type(q['expected_tools']).__name__}"
        )
    if "expected_citations" in q:
        cit = q["expected_citations"]
        # bool is a subclass of int, so a YAML `true`/`false` would pass a
        # plain isinstance(cit, int) check; reject it explicitly.
        if isinstance(cit, bool) or not isinstance(cit, int) or cit < 0:
            errors.append(
                f"{where}: expected_citations must be a non-negative int, got {cit!r}"
            )
    return errors
|
||||
|
||||
|
||||
def build(yaml_path: Path, out_path: Path) -> int:
    """Compile the YAML question set into the JSON file the runner reads.

    Loads ``yaml_path`` with ``load_yaml``, validates the top-level
    structure and every question, enforces the AC 7.1 counts (50
    questions, 10 for each entry of ``QUESTION_TYPES``), and writes a
    ``{"version", "type_counts", "questions"}`` object to ``out_path``.

    Args:
        yaml_path: Path of the YAML source.
        out_path: Path of the JSON file to write; missing parent
            directories are created.

    Returns:
        0 on success, 1 if loading or validation failed. Every problem
        is printed to stderr as an ``ERROR: ...`` line, and nothing is
        written when validation fails.
    """
    try:
        data, _ = load_yaml(str(yaml_path))
    except (YamlSchemaError, OSError) as e:
        # OSError covers a missing/unreadable file in case the loader does
        # not wrap it itself — report it like any other load failure.
        print(f"ERROR: {yaml_path}: {e}", file=sys.stderr)
        return 1

    # An empty file, a top-level list or a scalar would otherwise crash the
    # key checks below with a TypeError instead of a readable error.
    if not isinstance(data, dict):
        print(
            f"ERROR: {yaml_path}: top level must be a mapping, got "
            f"{type(data).__name__}",
            file=sys.stderr,
        )
        return 1
    if "version" not in data:
        print(f"ERROR: {yaml_path}: missing top-level 'version'", file=sys.stderr)
        return 1
    if "questions" not in data:
        print(f"ERROR: {yaml_path}: missing top-level 'questions'", file=sys.stderr)
        return 1

    questions = data["questions"]
    if not isinstance(questions, list):
        print(
            f"ERROR: {yaml_path}: 'questions' must be a list, got "
            f"{type(questions).__name__}",
            file=sys.stderr,
        )
        return 1

    # Per-question validation. Errors are accumulated so one run reports
    # every problem rather than stopping at the first.
    all_errors: list[str] = []
    seen_ids: set[Any] = set()
    type_counts: dict[str, int] = {t: 0 for t in QUESTION_TYPES}
    for i, q in enumerate(questions):
        where = f"questions[{i}]"
        if not isinstance(q, dict):
            all_errors.append(f"{where}: must be a mapping, got {type(q).__name__}")
            continue
        all_errors.extend(_validate_question(q, where))
        qid = q.get("id")
        if qid is not None:
            try:
                duplicate = qid in seen_ids
            except TypeError:
                # A list/mapping id is unhashable; report it instead of
                # letting the set lookup abort the whole build.
                all_errors.append(
                    f"{where}: id must be a scalar, got {type(qid).__name__}"
                )
            else:
                if duplicate:
                    all_errors.append(f"{where}: duplicate id {qid!r}")
                seen_ids.add(qid)
        qtype = q.get("type")
        if isinstance(qtype, str) and qtype in type_counts:
            type_counts[qtype] += 1

    # Hard constraints (AC 7.1): 10 questions per type, 50 in total.
    per_type = 10
    expected_total = per_type * len(QUESTION_TYPES)
    if len(questions) != expected_total:
        all_errors.append(
            f"expected exactly {expected_total} questions, got {len(questions)}"
        )
    for t in QUESTION_TYPES:
        if type_counts[t] != per_type:
            all_errors.append(
                f"type '{t}': expected {per_type} questions, "
                f"got {type_counts[t]}"
            )

    if all_errors:
        for err in all_errors:
            print(f"ERROR: {err}", file=sys.stderr)
        return 1

    # The compiled JSON keeps the YAML's structure 1:1 (the
    # harness runner just reads the same keys). Pinning the
    # version is a contract with the runner: old results
    # stay comparable as the prompt iterates.
    compiled: dict[str, Any] = {
        "version": data["version"],
        "type_counts": type_counts,
        "questions": questions,
    }
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(compiled, indent=2) + "\n", encoding="utf-8")
    print(
        f"[build_questions] {len(questions)} questions, "
        f"version {data['version']}, wrote {out_path}"
    )
    return 0
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """Parse the command line and run ``build``.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``.
            ``None`` (the default) keeps the normal command-line
            behaviour.

    Returns:
        The exit status returned by ``build``.
    """
    # __doc__ is None when docstrings are stripped (python -OO), so do not
    # call .splitlines() on it unconditionally.
    doc_lines = (__doc__ or "").strip().splitlines()
    p = argparse.ArgumentParser(description=doc_lines[0] if doc_lines else None)
    p.add_argument(
        "--yaml",
        default=str(ROOT / "tests" / "harness" / "questions.yaml"),
        help="YAML source (default: tests/harness/questions.yaml)",
    )
    p.add_argument(
        "--out",
        default=str(ROOT / "tests" / "harness" / "questions.json"),
        help="JSON output (default: tests/harness/questions.json)",
    )
    args = p.parse_args(argv)
    return build(Path(args.yaml), Path(args.out))
|
||||
|
||||
|
||||
# Script entry point: exit with the status code returned by main().
if __name__ == "__main__":
    sys.exit(main())
|
||||
813
tests/harness/questions.json
Normal file
813
tests/harness/questions.json
Normal file
@@ -0,0 +1,813 @@
|
||||
{
|
||||
"version": 1,
|
||||
"type_counts": {
|
||||
"identity": 10,
|
||||
"time_fact": 10,
|
||||
"world_state": 10,
|
||||
"causal": 10,
|
||||
"narrative": 10
|
||||
},
|
||||
"questions": [
|
||||
{
|
||||
"id": "t1.01",
|
||||
"type": "identity",
|
||||
"query": "Who is Roland Raventhorne?",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"entity_context",
|
||||
"significance_of"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"entity",
|
||||
"summary"
|
||||
]
|
||||
},
|
||||
"expected_citations": 1
|
||||
},
|
||||
{
|
||||
"id": "t1.02",
|
||||
"type": "identity",
|
||||
"query": "Tell me about House Raventhorne.",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"entity_context"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"entity",
|
||||
"summary"
|
||||
]
|
||||
},
|
||||
"expected_citations": 1
|
||||
},
|
||||
{
|
||||
"id": "t1.03",
|
||||
"type": "identity",
|
||||
"query": "What is Mardsville?",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"entity_context"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"entity",
|
||||
"summary"
|
||||
]
|
||||
},
|
||||
"expected_citations": 1
|
||||
},
|
||||
{
|
||||
"id": "t1.04",
|
||||
"type": "identity",
|
||||
"query": "Describe the Crimson Pact.",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"entity_context",
|
||||
"significance_of"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"entity",
|
||||
"summary"
|
||||
]
|
||||
},
|
||||
"expected_citations": 1
|
||||
},
|
||||
{
|
||||
"id": "t1.05",
|
||||
"type": "identity",
|
||||
"query": "Who was Aldric of Valdorn?",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"entity_context"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"entity",
|
||||
"summary"
|
||||
]
|
||||
},
|
||||
"expected_citations": 1
|
||||
},
|
||||
{
|
||||
"id": "t1.06",
|
||||
"type": "identity",
|
||||
"query": "Tell me about the Wheel & Kiln.",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"entity_context"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"entity",
|
||||
"summary"
|
||||
]
|
||||
},
|
||||
"expected_citations": 1
|
||||
},
|
||||
{
|
||||
"id": "t1.07",
|
||||
"type": "identity",
|
||||
"query": "What kind of place is the Underdark?",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"entity_context"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"entity",
|
||||
"summary"
|
||||
]
|
||||
},
|
||||
"expected_citations": 1
|
||||
},
|
||||
{
|
||||
"id": "t1.08",
|
||||
"type": "identity",
|
||||
"query": "Who is Voldramir's keeper?",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"entity_context",
|
||||
"significance_of"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"entity",
|
||||
"summary"
|
||||
]
|
||||
},
|
||||
"expected_citations": 1
|
||||
},
|
||||
{
|
||||
"id": "t1.09",
|
||||
"type": "identity",
|
||||
"query": "Describe the Mardonari Material Plane.",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"entity_context"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"entity",
|
||||
"summary"
|
||||
]
|
||||
},
|
||||
"expected_citations": 1
|
||||
},
|
||||
{
|
||||
"id": "t1.10",
|
||||
"type": "identity",
|
||||
"query": "What is House Vyr?",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"entity_context",
|
||||
"significance_of"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"entity",
|
||||
"summary"
|
||||
]
|
||||
},
|
||||
"expected_citations": 1
|
||||
},
|
||||
{
|
||||
"id": "t2.01",
|
||||
"type": "time_fact",
|
||||
"query": "Was Roland Raventhorne a member of House Raventhorne in 3rd_age.year_345?",
|
||||
"expected_tools": [
|
||||
"was_true_at"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"was_true",
|
||||
"at_time"
|
||||
]
|
||||
},
|
||||
"expected_citations": 1
|
||||
},
|
||||
{
|
||||
"id": "t2.02",
|
||||
"type": "time_fact",
|
||||
"query": "Did Aldric rule Valdorn during the 3rd Age?",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"was_true_at"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"was_true",
|
||||
"at_time"
|
||||
]
|
||||
},
|
||||
"expected_citations": 1
|
||||
},
|
||||
{
|
||||
"id": "t2.03",
|
||||
"type": "time_fact",
|
||||
"query": "Were House Vyr and the Crimson Pact allied in year 340 TA?",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"was_true_at"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"was_true",
|
||||
"at_time"
|
||||
]
|
||||
},
|
||||
"expected_citations": 1
|
||||
},
|
||||
{
|
||||
"id": "t2.04",
|
||||
"type": "time_fact",
|
||||
"query": "Was the Long Winter caused by the Sundering?",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"was_true_at"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"was_true",
|
||||
"at_time"
|
||||
]
|
||||
},
|
||||
"expected_citations": 1
|
||||
},
|
||||
{
|
||||
"id": "t2.05",
|
||||
"type": "time_fact",
|
||||
"query": "Did the Battle of Black Spire happen in 342 TA?",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"was_true_at"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"was_true",
|
||||
"at_time"
|
||||
]
|
||||
},
|
||||
"expected_citations": 1
|
||||
},
|
||||
{
|
||||
"id": "t2.06",
|
||||
"type": "time_fact",
|
||||
"query": "Was Voldramir accessible during the Sundering?",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"was_true_at"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"was_true",
|
||||
"at_time"
|
||||
]
|
||||
},
|
||||
"expected_citations": 1
|
||||
},
|
||||
{
|
||||
"id": "t2.07",
|
||||
"type": "time_fact",
|
||||
"query": "Were the Mardonari and Valdorni at war in year 360 TA?",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"was_true_at"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"was_true",
|
||||
"at_time"
|
||||
]
|
||||
},
|
||||
"expected_citations": 1
|
||||
},
|
||||
{
|
||||
"id": "t2.08",
|
||||
"type": "time_fact",
|
||||
"query": "Was Roland alive in the 4th Age?",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"was_true_at"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"was_true",
|
||||
"at_time"
|
||||
]
|
||||
},
|
||||
"expected_citations": 1
|
||||
},
|
||||
{
|
||||
"id": "t2.09",
|
||||
"type": "time_fact",
|
||||
"query": "Did Aldric sit on the Wheel & Kiln council in 3rd_age.year_300?",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"was_true_at"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"was_true",
|
||||
"at_time"
|
||||
]
|
||||
},
|
||||
"expected_citations": 1
|
||||
},
|
||||
{
|
||||
"id": "t2.10",
|
||||
"type": "time_fact",
|
||||
"query": "Was the Crimson Pact founded before the Sundering?",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"was_true_at"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"was_true",
|
||||
"at_time"
|
||||
]
|
||||
},
|
||||
"expected_citations": 1
|
||||
},
|
||||
{
|
||||
"id": "t3.01",
|
||||
"type": "world_state",
|
||||
"query": "What was happening in Valdorn in 340 TA?",
|
||||
"expected_tools": [
|
||||
"entities_present",
|
||||
"events_during"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"entities",
|
||||
"events"
|
||||
]
|
||||
},
|
||||
"expected_citations": 2
|
||||
},
|
||||
{
|
||||
"id": "t3.02",
|
||||
"type": "world_state",
|
||||
"query": "Who was in Mardsville during the Border Wars?",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"entities_present",
|
||||
"events_during"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"entities",
|
||||
"events"
|
||||
]
|
||||
},
|
||||
"expected_citations": 2
|
||||
},
|
||||
{
|
||||
"id": "t3.03",
|
||||
"type": "world_state",
|
||||
"query": "What factions were active in Mardonari in year 380 TA?",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"entities_present",
|
||||
"events_during"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"entities",
|
||||
"events"
|
||||
]
|
||||
},
|
||||
"expected_citations": 2
|
||||
},
|
||||
{
|
||||
"id": "t3.04",
|
||||
"type": "world_state",
|
||||
"query": "Who lived in the Underdark in the 3rd Age?",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"entities_present"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"entities"
|
||||
]
|
||||
},
|
||||
"expected_citations": 1
|
||||
},
|
||||
{
|
||||
"id": "t3.05",
|
||||
"type": "world_state",
|
||||
"query": "What events occurred in the 4th Age?",
|
||||
"expected_tools": [
|
||||
"events_during"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"events"
|
||||
]
|
||||
},
|
||||
"expected_citations": 1
|
||||
},
|
||||
{
|
||||
"id": "t3.06",
|
||||
"type": "world_state",
|
||||
"query": "Who was on the Wheel & Kiln council in year 320 TA?",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"entities_present"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"entities"
|
||||
]
|
||||
},
|
||||
"expected_citations": 1
|
||||
},
|
||||
{
|
||||
"id": "t3.07",
|
||||
"type": "world_state",
|
||||
"query": "What was happening at the Crimson Pact headquarters in 350 TA?",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"entities_present",
|
||||
"events_during"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"entities",
|
||||
"events"
|
||||
]
|
||||
},
|
||||
"expected_citations": 2
|
||||
},
|
||||
{
|
||||
"id": "t3.08",
|
||||
"type": "world_state",
|
||||
"query": "Who was alive in the Mardonari region during the Sundering?",
|
||||
"expected_tools": [
|
||||
"entities_present",
|
||||
"events_during"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"entities",
|
||||
"events"
|
||||
]
|
||||
},
|
||||
"expected_citations": 2
|
||||
},
|
||||
{
|
||||
"id": "t3.09",
|
||||
"type": "world_state",
|
||||
"query": "What was the state of Voldramir in the 4th Age?",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"entities_present"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"entities"
|
||||
]
|
||||
},
|
||||
"expected_citations": 1
|
||||
},
|
||||
{
|
||||
"id": "t3.10",
|
||||
"type": "world_state",
|
||||
"query": "Who was ruling Valdorn at the start of the 3rd Age?",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"entities_present"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"entities"
|
||||
]
|
||||
},
|
||||
"expected_citations": 1
|
||||
},
|
||||
{
|
||||
"id": "t4.01",
|
||||
"type": "causal",
|
||||
"query": "Why did the Sundering happen?",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"event_chain"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"causes",
|
||||
"effects"
|
||||
]
|
||||
},
|
||||
"expected_citations": 2
|
||||
},
|
||||
{
|
||||
"id": "t4.02",
|
||||
"type": "causal",
|
||||
"query": "What were the consequences of the Battle of Black Spire?",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"event_chain"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"causes",
|
||||
"effects"
|
||||
]
|
||||
},
|
||||
"expected_citations": 2
|
||||
},
|
||||
{
|
||||
"id": "t4.03",
|
||||
"type": "causal",
|
||||
"query": "How did Aldric come to power?",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"event_chain",
|
||||
"ancestors_of"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"causes",
|
||||
"ancestors"
|
||||
]
|
||||
},
|
||||
"expected_citations": 2
|
||||
},
|
||||
{
|
||||
"id": "t4.04",
|
||||
"type": "causal",
|
||||
"query": "Why was the Crimson Pact founded?",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"event_chain"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"causes",
|
||||
"effects"
|
||||
]
|
||||
},
|
||||
"expected_citations": 2
|
||||
},
|
||||
{
|
||||
"id": "t4.05",
|
||||
"type": "causal",
|
||||
"query": "What led to the Long Winter?",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"event_chain"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"causes",
|
||||
"effects"
|
||||
]
|
||||
},
|
||||
"expected_citations": 2
|
||||
},
|
||||
{
|
||||
"id": "t4.06",
|
||||
"type": "causal",
|
||||
"query": "How did House Vyr fall?",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"event_chain"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"causes",
|
||||
"effects"
|
||||
]
|
||||
},
|
||||
"expected_citations": 2
|
||||
},
|
||||
{
|
||||
"id": "t4.07",
|
||||
"type": "causal",
|
||||
"query": "What caused the Border Wars?",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"event_chain"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"causes",
|
||||
"effects"
|
||||
]
|
||||
},
|
||||
"expected_citations": 2
|
||||
},
|
||||
{
|
||||
"id": "t4.08",
|
||||
"type": "causal",
|
||||
"query": "How did the Wheel & Kiln come to be the Mardonari council?",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"event_chain"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"causes",
|
||||
"effects"
|
||||
]
|
||||
},
|
||||
"expected_citations": 2
|
||||
},
|
||||
{
|
||||
"id": "t4.09",
|
||||
"type": "causal",
|
||||
"query": "Why did the Mardonari and Valdorni go to war?",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"event_chain"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"causes",
|
||||
"effects"
|
||||
]
|
||||
},
|
||||
"expected_citations": 2
|
||||
},
|
||||
{
|
||||
"id": "t4.10",
|
||||
"type": "causal",
|
||||
"query": "How did Voldramir become a prison plane?",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"event_chain"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"causes",
|
||||
"effects"
|
||||
]
|
||||
},
|
||||
"expected_citations": 2
|
||||
},
|
||||
{
|
||||
"id": "t5.01",
|
||||
"type": "narrative",
|
||||
"query": "Tell me about the Border Wars.",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"event_chain",
|
||||
"narrate_arc"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"narrative"
|
||||
]
|
||||
},
|
||||
"expected_citations": 3
|
||||
},
|
||||
{
|
||||
"id": "t5.02",
|
||||
"type": "narrative",
|
||||
"query": "What was House Vyr like at its height?",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"entity_context",
|
||||
"narrate_arc"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"narrative"
|
||||
]
|
||||
},
|
||||
"expected_citations": 3
|
||||
},
|
||||
{
|
||||
"id": "t5.03",
|
||||
"type": "narrative",
|
||||
"query": "Describe the 3rd Age.",
|
||||
"expected_tools": [
|
||||
"events_during",
|
||||
"narrate_arc"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"narrative"
|
||||
]
|
||||
},
|
||||
"expected_citations": 3
|
||||
},
|
||||
{
|
||||
"id": "t5.04",
|
||||
"type": "narrative",
|
||||
"query": "What was life like in Mardsville in the late 3rd Age?",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"entities_present",
|
||||
"narrate_arc"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"narrative"
|
||||
]
|
||||
},
|
||||
"expected_citations": 3
|
||||
},
|
||||
{
|
||||
"id": "t5.05",
|
||||
"type": "narrative",
|
||||
"query": "Describe the fall of the Crimson Pact.",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"event_chain",
|
||||
"narrate_arc"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"narrative"
|
||||
]
|
||||
},
|
||||
"expected_citations": 3
|
||||
},
|
||||
{
|
||||
"id": "t5.06",
|
||||
"type": "narrative",
|
||||
"query": "Tell me about the cosmology of Mardonari.",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"entity_context",
|
||||
"narrate_arc"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"narrative"
|
||||
]
|
||||
},
|
||||
"expected_citations": 3
|
||||
},
|
||||
{
|
||||
"id": "t5.07",
|
||||
"type": "narrative",
|
||||
"query": "What is the lore around Voldramir?",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"entity_context",
|
||||
"narrate_arc"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"narrative"
|
||||
]
|
||||
},
|
||||
"expected_citations": 3
|
||||
},
|
||||
{
|
||||
"id": "t5.08",
|
||||
"type": "narrative",
|
||||
"query": "Describe Aldric's reign.",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"event_chain",
|
||||
"narrate_arc"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"narrative"
|
||||
]
|
||||
},
|
||||
"expected_citations": 3
|
||||
},
|
||||
{
|
||||
"id": "t5.09",
|
||||
"type": "narrative",
|
||||
"query": "What was happening in the Underdark during the Sundering?",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"entities_present",
|
||||
"events_during",
|
||||
"narrate_arc"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"narrative"
|
||||
]
|
||||
},
|
||||
"expected_citations": 3
|
||||
},
|
||||
{
|
||||
"id": "t5.10",
|
||||
"type": "narrative",
|
||||
"query": "Tell me about the founding of Mardonari.",
|
||||
"expected_tools": [
|
||||
"lookup",
|
||||
"event_chain",
|
||||
"narrate_arc"
|
||||
],
|
||||
"expected_answer_shape": {
|
||||
"has_keys": [
|
||||
"narrative"
|
||||
]
|
||||
},
|
||||
"expected_citations": 3
|
||||
}
|
||||
]
|
||||
}
|
||||
572
tests/harness/questions.yaml
Normal file
572
tests/harness/questions.yaml
Normal file
@@ -0,0 +1,572 @@
|
||||
# Slice 7 — Reasoning harness test set.
|
||||
#
|
||||
# Per docs/plan/exec/07-harness.md sub-slice 7.1, this YAML
|
||||
# is the source of the 50 questions the harness runs. The
|
||||
# build script (scripts/harness/build_questions.py) compiles
|
||||
# it to JSON (tests/harness/questions.json) at build time.
|
||||
# Tests verify the schema + the count.
|
||||
#
|
||||
# The five question types come from
|
||||
# docs/07-reasoning-harness.md §"The five question types":
|
||||
#
|
||||
# 1. Identity & description — "Who is X?"
|
||||
# 2. Time-bounded fact check — "Was X true at time T?"
|
||||
# 3. World state at a time — "What was X like at T?"
|
||||
# 4. Causal / chain reasoning — "Why did X happen?"
|
||||
# 5. Open-ended narrative — "Tell me about X."
|
||||
#
|
||||
# Each question has:
|
||||
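# - id: stable id, unique across the whole set (its tN. prefix mirrors the question type)
# - type: one of identity, time_fact, world_state, causal, narrative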
|
||||
# - query: the user-facing prompt
|
||||
# - expected_tools: the canonical tool sequence (per the
|
||||
# design doc; a passing LLM picks this sequence ±1 tool)
|
||||
# - expected_answer_shape: a JSON-Schema-ish description
|
||||
# used by the harness grader to validate the response
|
||||
# shape (not the prose)
|
||||
# - expected_citations: minimum number of distinct sources
|
||||
# the answer should cite (AC 7.4)
|
||||
#
|
||||
# The questions target the Mardonari codex (the slice 0
|
||||
# fixture). Each question references entities the codex
|
||||
# has — so the harness can run end-to-end against the
|
||||
# real graph.
|
||||
|
||||
version: 1
|
||||
|
||||
questions:
|
||||
|
||||
# =====================================================================
|
||||
# Type 1: Identity & description
|
||||
# =====================================================================
|
||||
|
||||
- id: t1.01
|
||||
type: identity
|
||||
query: "Who is Roland Raventhorne?"
|
||||
expected_tools:
|
||||
- lookup
|
||||
- entity_context
|
||||
- significance_of
|
||||
expected_answer_shape:
|
||||
has_keys: ["entity", "summary"]
|
||||
expected_citations: 1
|
||||
|
||||
- id: t1.02
|
||||
type: identity
|
||||
query: "Tell me about House Raventhorne."
|
||||
expected_tools:
|
||||
- lookup
|
||||
- entity_context
|
||||
expected_answer_shape:
|
||||
has_keys: ["entity", "summary"]
|
||||
expected_citations: 1
|
||||
|
||||
- id: t1.03
|
||||
type: identity
|
||||
query: "What is Mardsville?"
|
||||
expected_tools:
|
||||
- lookup
|
||||
- entity_context
|
||||
expected_answer_shape:
|
||||
has_keys: ["entity", "summary"]
|
||||
expected_citations: 1
|
||||
|
||||
- id: t1.04
|
||||
type: identity
|
||||
query: "Describe the Crimson Pact."
|
||||
expected_tools:
|
||||
- lookup
|
||||
- entity_context
|
||||
- significance_of
|
||||
expected_answer_shape:
|
||||
has_keys: ["entity", "summary"]
|
||||
expected_citations: 1
|
||||
|
||||
- id: t1.05
|
||||
type: identity
|
||||
query: "Who was Aldric of Valdorn?"
|
||||
expected_tools:
|
||||
- lookup
|
||||
- entity_context
|
||||
expected_answer_shape:
|
||||
has_keys: ["entity", "summary"]
|
||||
expected_citations: 1
|
||||
|
||||
- id: t1.06
|
||||
type: identity
|
||||
query: "Tell me about the Wheel & Kiln."
|
||||
expected_tools:
|
||||
- lookup
|
||||
- entity_context
|
||||
expected_answer_shape:
|
||||
has_keys: ["entity", "summary"]
|
||||
expected_citations: 1
|
||||
|
||||
- id: t1.07
|
||||
type: identity
|
||||
query: "What kind of place is the Underdark?"
|
||||
expected_tools:
|
||||
- lookup
|
||||
- entity_context
|
||||
expected_answer_shape:
|
||||
has_keys: ["entity", "summary"]
|
||||
expected_citations: 1
|
||||
|
||||
- id: t1.08
|
||||
type: identity
|
||||
query: "Who is Voldramir's keeper?"
|
||||
expected_tools:
|
||||
- lookup
|
||||
- entity_context
|
||||
- significance_of
|
||||
expected_answer_shape:
|
||||
has_keys: ["entity", "summary"]
|
||||
expected_citations: 1
|
||||
|
||||
- id: t1.09
|
||||
type: identity
|
||||
query: "Describe the Mardonari Material Plane."
|
||||
expected_tools:
|
||||
- lookup
|
||||
- entity_context
|
||||
expected_answer_shape:
|
||||
has_keys: ["entity", "summary"]
|
||||
expected_citations: 1
|
||||
|
||||
- id: t1.10
|
||||
type: identity
|
||||
query: "What is House Vyr?"
|
||||
expected_tools:
|
||||
- lookup
|
||||
- entity_context
|
||||
- significance_of
|
||||
expected_answer_shape:
|
||||
has_keys: ["entity", "summary"]
|
||||
expected_citations: 1
|
||||
|
||||
# =====================================================================
|
||||
# Type 2: Time-bounded fact check
|
||||
# =====================================================================
|
||||
|
||||
- id: t2.01
|
||||
type: time_fact
|
||||
query: "Was Roland Raventhorne a member of House Raventhorne in 3rd_age.year_345?"
|
||||
expected_tools:
|
||||
- was_true_at
|
||||
expected_answer_shape:
|
||||
has_keys: ["was_true", "at_time"]
|
||||
expected_citations: 1
|
||||
|
||||
- id: t2.02
|
||||
type: time_fact
|
||||
query: "Did Aldric rule Valdorn during the 3rd Age?"
|
||||
expected_tools:
|
||||
- lookup
|
||||
- was_true_at
|
||||
expected_answer_shape:
|
||||
has_keys: ["was_true", "at_time"]
|
||||
expected_citations: 1
|
||||
|
||||
- id: t2.03
|
||||
type: time_fact
|
||||
query: "Were House Vyr and the Crimson Pact allied in year 340 TA?"
|
||||
expected_tools:
|
||||
- lookup
|
||||
- was_true_at
|
||||
expected_answer_shape:
|
||||
has_keys: ["was_true", "at_time"]
|
||||
expected_citations: 1
|
||||
|
||||
- id: t2.04
|
||||
type: time_fact
|
||||
query: "Was the Long Winter caused by the Sundering?"
|
||||
expected_tools:
|
||||
- lookup
|
||||
- was_true_at
|
||||
expected_answer_shape:
|
||||
has_keys: ["was_true", "at_time"]
|
||||
expected_citations: 1
|
||||
|
||||
- id: t2.05
|
||||
type: time_fact
|
||||
query: "Did the Battle of Black Spire happen in 342 TA?"
|
||||
expected_tools:
|
||||
- lookup
|
||||
- was_true_at
|
||||
expected_answer_shape:
|
||||
has_keys: ["was_true", "at_time"]
|
||||
expected_citations: 1
|
||||
|
||||
- id: t2.06
|
||||
type: time_fact
|
||||
query: "Was Voldramir accessible during the Sundering?"
|
||||
expected_tools:
|
||||
- lookup
|
||||
- was_true_at
|
||||
expected_answer_shape:
|
||||
has_keys: ["was_true", "at_time"]
|
||||
expected_citations: 1
|
||||
|
||||
- id: t2.07
|
||||
type: time_fact
|
||||
query: "Were the Mardonari and Valdorni at war in year 360 TA?"
|
||||
expected_tools:
|
||||
- lookup
|
||||
- was_true_at
|
||||
expected_answer_shape:
|
||||
has_keys: ["was_true", "at_time"]
|
||||
expected_citations: 1
|
||||
|
||||
- id: t2.08
|
||||
type: time_fact
|
||||
query: "Was Roland alive in the 4th Age?"
|
||||
expected_tools:
|
||||
- lookup
|
||||
- was_true_at
|
||||
expected_answer_shape:
|
||||
has_keys: ["was_true", "at_time"]
|
||||
expected_citations: 1
|
||||
|
||||
- id: t2.09
|
||||
type: time_fact
|
||||
query: "Did Aldric sit on the Wheel & Kiln council in 3rd_age.year_300?"
|
||||
expected_tools:
|
||||
- lookup
|
||||
- was_true_at
|
||||
expected_answer_shape:
|
||||
has_keys: ["was_true", "at_time"]
|
||||
expected_citations: 1
|
||||
|
||||
- id: t2.10
|
||||
type: time_fact
|
||||
query: "Was the Crimson Pact founded before the Sundering?"
|
||||
expected_tools:
|
||||
- lookup
|
||||
- was_true_at
|
||||
expected_answer_shape:
|
||||
has_keys: ["was_true", "at_time"]
|
||||
expected_citations: 1
|
||||
|
||||
# =====================================================================
|
||||
# Type 3: World state at a time
|
||||
# =====================================================================
|
||||
|
||||
- id: t3.01
|
||||
type: world_state
|
||||
query: "What was happening in Valdorn in 340 TA?"
|
||||
expected_tools:
|
||||
- entities_present
|
||||
- events_during
|
||||
expected_answer_shape:
|
||||
has_keys: ["entities", "events"]
|
||||
expected_citations: 2
|
||||
|
||||
- id: t3.02
|
||||
type: world_state
|
||||
query: "Who was in Mardsville during the Border Wars?"
|
||||
expected_tools:
|
||||
- lookup
|
||||
- entities_present
|
||||
- events_during
|
||||
expected_answer_shape:
|
||||
has_keys: ["entities", "events"]
|
||||
expected_citations: 2
|
||||
|
||||
- id: t3.03
|
||||
type: world_state
|
||||
query: "What factions were active in Mardonari in year 380 TA?"
|
||||
expected_tools:
|
||||
- lookup
|
||||
- entities_present
|
||||
- events_during
|
||||
expected_answer_shape:
|
||||
has_keys: ["entities", "events"]
|
||||
expected_citations: 2
|
||||
|
||||
- id: t3.04
|
||||
type: world_state
|
||||
query: "Who lived in the Underdark in the 3rd Age?"
|
||||
expected_tools:
|
||||
- lookup
|
||||
- entities_present
|
||||
expected_answer_shape:
|
||||
has_keys: ["entities"]
|
||||
expected_citations: 1
|
||||
|
||||
- id: t3.05
|
||||
type: world_state
|
||||
query: "What events occurred in the 4th Age?"
|
||||
expected_tools:
|
||||
- events_during
|
||||
expected_answer_shape:
|
||||
has_keys: ["events"]
|
||||
expected_citations: 1
|
||||
|
||||
- id: t3.06
|
||||
type: world_state
|
||||
query: "Who was on the Wheel & Kiln council in year 320 TA?"
|
||||
expected_tools:
|
||||
- lookup
|
||||
- entities_present
|
||||
expected_answer_shape:
|
||||
has_keys: ["entities"]
|
||||
expected_citations: 1
|
||||
|
||||
- id: t3.07
|
||||
type: world_state
|
||||
query: "What was happening at the Crimson Pact headquarters in 350 TA?"
|
||||
expected_tools:
|
||||
- lookup
|
||||
- entities_present
|
||||
- events_during
|
||||
expected_answer_shape:
|
||||
has_keys: ["entities", "events"]
|
||||
expected_citations: 2
|
||||
|
||||
- id: t3.08
|
||||
type: world_state
|
||||
query: "Who was alive in the Mardonari region during the Sundering?"
|
||||
expected_tools:
|
||||
- entities_present
|
||||
- events_during
|
||||
expected_answer_shape:
|
||||
has_keys: ["entities", "events"]
|
||||
expected_citations: 2
|
||||
|
||||
- id: t3.09
|
||||
type: world_state
|
||||
query: "What was the state of Voldramir in the 4th Age?"
|
||||
expected_tools:
|
||||
- lookup
|
||||
- entities_present
|
||||
expected_answer_shape:
|
||||
has_keys: ["entities"]
|
||||
expected_citations: 1
|
||||
|
||||
- id: t3.10
|
||||
type: world_state
|
||||
query: "Who was ruling Valdorn at the start of the 3rd Age?"
|
||||
expected_tools:
|
||||
- lookup
|
||||
- entities_present
|
||||
expected_answer_shape:
|
||||
has_keys: ["entities"]
|
||||
expected_citations: 1
|
||||
|
||||
# =====================================================================
|
||||
# Type 4: Causal / chain reasoning
|
||||
# =====================================================================
|
||||
|
||||
- id: t4.01
|
||||
type: causal
|
||||
query: "Why did the Sundering happen?"
|
||||
expected_tools:
|
||||
- lookup
|
||||
- event_chain
|
||||
expected_answer_shape:
|
||||
has_keys: ["causes", "effects"]
|
||||
expected_citations: 2
|
||||
|
||||
- id: t4.02
|
||||
type: causal
|
||||
query: "What were the consequences of the Battle of Black Spire?"
|
||||
expected_tools:
|
||||
- lookup
|
||||
- event_chain
|
||||
expected_answer_shape:
|
||||
has_keys: ["causes", "effects"]
|
||||
expected_citations: 2
|
||||
|
||||
- id: t4.03
|
||||
type: causal
|
||||
query: "How did Aldric come to power?"
|
||||
expected_tools:
|
||||
- lookup
|
||||
- event_chain
|
||||
- ancestors_of
|
||||
expected_answer_shape:
|
||||
has_keys: ["causes", "ancestors"]
|
||||
expected_citations: 2
|
||||
|
||||
- id: t4.04
|
||||
type: causal
|
||||
query: "Why was the Crimson Pact founded?"
|
||||
expected_tools:
|
||||
- lookup
|
||||
- event_chain
|
||||
expected_answer_shape:
|
||||
has_keys: ["causes", "effects"]
|
||||
expected_citations: 2
|
||||
|
||||
- id: t4.05
|
||||
type: causal
|
||||
query: "What led to the Long Winter?"
|
||||
expected_tools:
|
||||
- lookup
|
||||
- event_chain
|
||||
expected_answer_shape:
|
||||
has_keys: ["causes", "effects"]
|
||||
expected_citations: 2
|
||||
|
||||
- id: t4.06
|
||||
type: causal
|
||||
query: "How did House Vyr fall?"
|
||||
expected_tools:
|
||||
- lookup
|
||||
- event_chain
|
||||
expected_answer_shape:
|
||||
has_keys: ["causes", "effects"]
|
||||
expected_citations: 2
|
||||
|
||||
- id: t4.07
|
||||
type: causal
|
||||
query: "What caused the Border Wars?"
|
||||
expected_tools:
|
||||
- lookup
|
||||
- event_chain
|
||||
expected_answer_shape:
|
||||
has_keys: ["causes", "effects"]
|
||||
expected_citations: 2
|
||||
|
||||
- id: t4.08
|
||||
type: causal
|
||||
query: "How did the Wheel & Kiln come to be the Mardonari council?"
|
||||
expected_tools:
|
||||
- lookup
|
||||
- event_chain
|
||||
expected_answer_shape:
|
||||
has_keys: ["causes", "effects"]
|
||||
expected_citations: 2
|
||||
|
||||
- id: t4.09
|
||||
type: causal
|
||||
query: "Why did the Mardonari and Valdorni go to war?"
|
||||
expected_tools:
|
||||
- lookup
|
||||
- event_chain
|
||||
expected_answer_shape:
|
||||
has_keys: ["causes", "effects"]
|
||||
expected_citations: 2
|
||||
|
||||
- id: t4.10
|
||||
type: causal
|
||||
query: "How did Voldramir become a prison plane?"
|
||||
expected_tools:
|
||||
- lookup
|
||||
- event_chain
|
||||
expected_answer_shape:
|
||||
has_keys: ["causes", "effects"]
|
||||
expected_citations: 2
|
||||
|
||||
# =====================================================================
|
||||
# Type 5: Open-ended narrative
|
||||
# =====================================================================
|
||||
|
||||
- id: t5.01
|
||||
type: narrative
|
||||
query: "Tell me about the Border Wars."
|
||||
expected_tools:
|
||||
- lookup
|
||||
- event_chain
|
||||
- narrate_arc
|
||||
expected_answer_shape:
|
||||
has_keys: ["narrative"]
|
||||
expected_citations: 3
|
||||
|
||||
- id: t5.02
|
||||
type: narrative
|
||||
query: "What was House Vyr like at its height?"
|
||||
expected_tools:
|
||||
- lookup
|
||||
- entity_context
|
||||
- narrate_arc
|
||||
expected_answer_shape:
|
||||
has_keys: ["narrative"]
|
||||
expected_citations: 3
|
||||
|
||||
- id: t5.03
|
||||
type: narrative
|
||||
query: "Describe the 3rd Age."
|
||||
expected_tools:
|
||||
- events_during
|
||||
- narrate_arc
|
||||
expected_answer_shape:
|
||||
has_keys: ["narrative"]
|
||||
expected_citations: 3
|
||||
|
||||
- id: t5.04
|
||||
type: narrative
|
||||
query: "What was life like in Mardsville in the late 3rd Age?"
|
||||
expected_tools:
|
||||
- lookup
|
||||
- entities_present
|
||||
- narrate_arc
|
||||
expected_answer_shape:
|
||||
has_keys: ["narrative"]
|
||||
expected_citations: 3
|
||||
|
||||
- id: t5.05
|
||||
type: narrative
|
||||
query: "Describe the fall of the Crimson Pact."
|
||||
expected_tools:
|
||||
- lookup
|
||||
- event_chain
|
||||
- narrate_arc
|
||||
expected_answer_shape:
|
||||
has_keys: ["narrative"]
|
||||
expected_citations: 3
|
||||
|
||||
- id: t5.06
|
||||
type: narrative
|
||||
query: "Tell me about the cosmology of Mardonari."
|
||||
expected_tools:
|
||||
- lookup
|
||||
- entity_context
|
||||
- narrate_arc
|
||||
expected_answer_shape:
|
||||
has_keys: ["narrative"]
|
||||
expected_citations: 3
|
||||
|
||||
- id: t5.07
|
||||
type: narrative
|
||||
query: "What is the lore around Voldramir?"
|
||||
expected_tools:
|
||||
- lookup
|
||||
- entity_context
|
||||
- narrate_arc
|
||||
expected_answer_shape:
|
||||
has_keys: ["narrative"]
|
||||
expected_citations: 3
|
||||
|
||||
- id: t5.08
|
||||
type: narrative
|
||||
query: "Describe Aldric's reign."
|
||||
expected_tools:
|
||||
- lookup
|
||||
- event_chain
|
||||
- narrate_arc
|
||||
expected_answer_shape:
|
||||
has_keys: ["narrative"]
|
||||
expected_citations: 3
|
||||
|
||||
- id: t5.09
|
||||
type: narrative
|
||||
query: "What was happening in the Underdark during the Sundering?"
|
||||
expected_tools:
|
||||
- lookup
|
||||
- entities_present
|
||||
- events_during
|
||||
- narrate_arc
|
||||
expected_answer_shape:
|
||||
has_keys: ["narrative"]
|
||||
expected_citations: 3
|
||||
|
||||
- id: t5.10
|
||||
type: narrative
|
||||
query: "Tell me about the founding of Mardonari."
|
||||
expected_tools:
|
||||
- lookup
|
||||
- event_chain
|
||||
- narrate_arc
|
||||
expected_answer_shape:
|
||||
has_keys: ["narrative"]
|
||||
expected_citations: 3
|
||||
155
tests/harness/test_questions.py
Normal file
155
tests/harness/test_questions.py
Normal file
@@ -0,0 +1,155 @@
|
||||
"""Slice 7.1 — Reasoning harness test set.
|
||||
|
||||
Per ``docs/plan/exec/07-harness.md`` sub-slice 7.1, the
|
||||
50 questions are authored in ``tests/harness/questions.yaml``
|
||||
and compiled to ``tests/harness/questions.json`` by
|
||||
``scripts/harness/build_questions.py``. These tests pin the
|
||||
contract:
|
||||
|
||||
- The YAML compiles cleanly to JSON.
|
||||
- The JSON has exactly 50 questions.
|
||||
- Each of the 5 question types has exactly 10 questions
|
||||
(per ``docs/07-reasoning-harness.md`` §"The five
|
||||
question types").
|
||||
- Every question has the required keys (id, type, query,
|
||||
expected_tools, expected_answer_shape,
|
||||
expected_citations).
|
||||
- The question ids are unique.
|
||||
- The version is set (a contract with the runner:
|
||||
results are version-stamped so old runs stay
|
||||
comparable when the prompt iterates).
|
||||
|
||||
The tests run offline (no API key) — they're Track A.
|
||||
Track B (executing the harness against the live LLM) is a
|
||||
follow-up that depends on ``$OLLAMA_API_KEY``.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import functools
import json
import subprocess
import sys
from pathlib import Path
|
||||
|
||||
# Repository root: this file sits at tests/harness/, so the root is three
# directory levels above it.
ROOT = Path(__file__).resolve().parent.parent.parent

# Inputs and outputs of the question-set build, all anchored at ROOT.
YAML_PATH = ROOT.joinpath("tests", "harness", "questions.yaml")
JSON_PATH = ROOT.joinpath("tests", "harness", "questions.json")
BUILD_SCRIPT = ROOT.joinpath("scripts", "harness", "build_questions.py")

# The five question types the tests expect to find in ``type_counts``.
QUESTION_TYPES = ("identity", "time_fact", "world_state", "causal", "narrative")

# Keys every compiled question must carry.
REQUIRED_KEYS = (
    "id",
    "type",
    "query",
    "expected_tools",
    "expected_answer_shape",
    "expected_citations",
)
|
||||
|
||||
|
||||
@functools.lru_cache(maxsize=None)
def _load_compiled() -> dict:
    """Rebuild ``questions.json`` from the YAML and return the parsed JSON.

    The build script is run as a subprocess with ``--yaml`` / ``--out``
    pointing at the checked-in files, so the JSON the tests read is
    produced from the current YAML instead of being trusted as
    committed. ``check=True`` turns a non-zero exit of the build into a
    ``subprocess.CalledProcessError``, so a YAML the compiler rejects
    fails the test rather than letting it pass against an older JSON.

    The result is memoised: the build runs once per test process
    instead of once per test function. A failing build raises and is
    therefore not cached. Every caller receives the same dict object,
    so tests must treat it as read-only.

    Returns:
        The decoded contents of ``JSON_PATH``.

    Raises:
        subprocess.CalledProcessError: the build script exited non-zero.
    """
    build_cmd = [
        sys.executable,
        str(BUILD_SCRIPT),
        "--yaml", str(YAML_PATH),
        "--out", str(JSON_PATH),
    ]
    # Run from the repository root, as the original invocation did.
    subprocess.run(build_cmd, check=True, cwd=str(ROOT))
    return json.loads(JSON_PATH.read_text(encoding="utf-8"))
|
||||
|
||||
|
||||
def test_7_1_questions_match_schema() -> None:
    """AC 7.1 — the compiled JSON has the expected schema.

    The top level must expose ``version``, ``type_counts`` and
    ``questions``, and each entry under ``questions`` must carry
    every key listed in ``REQUIRED_KEYS``.
    """
    payload = _load_compiled()

    top_level_keys = ("version", "type_counts", "questions")
    for name in top_level_keys:
        assert name in payload, f"missing top-level key {name!r}"

    # Walk (question, field) pairs lazily, in question order then
    # field order, so the first failure reported is the first gap.
    pairs = (
        (question, field)
        for question in payload["questions"]
        for field in REQUIRED_KEYS
    )
    for question, field in pairs:
        assert field in question, (
            f"question {question.get('id')!r} missing {field!r}"
        )
|
||||
|
||||
|
||||
def test_7_1_50_questions_total() -> None:
    """AC 7.1 — the compiled set holds exactly 50 questions.

    Five question types at ten questions each; the total is the
    harness's contract.
    """
    total = len(_load_compiled()["questions"])
    assert total == 50, f"expected 50 questions, got {total}"
|
||||
|
||||
|
||||
def test_7_1_10_per_type() -> None:
    """AC 7.1 — every type in ``QUESTION_TYPES`` has exactly 10 questions.

    Counts are read from the compiled ``type_counts`` mapping; a
    type that is absent from the mapping is treated as a count of 0.
    """
    counts = _load_compiled()["type_counts"]
    for question_type in QUESTION_TYPES:
        observed = counts.get(question_type, 0)
        assert observed == 10, (
            f"type {question_type!r}: expected 10 questions, "
            f"got {observed}"
        )
|
||||
|
||||
|
||||
def test_7_1_every_question_has_expected_tools() -> None:
    """AC 7.1 — every question has a non-empty ``expected_tools`` list.

    The harness grades the LLM's tool sequence against this
    expectation, so an empty list is meaningless. Three things are
    checked per question: the value is a list, the list is non-empty,
    and every element is a non-empty string.
    """
    compiled = _load_compiled()
    for q in compiled["questions"]:
        # A missing or null value is folded into "empty" so it reports
        # with the same message as an explicitly empty list.
        tools = q.get("expected_tools") or []
        # Without this check a bare string such as "lookup" would pass:
        # it has a positive length and iterates as non-empty
        # one-character strings.
        assert isinstance(tools, list), (
            f"question {q.get('id')!r} expected_tools is not a list: "
            f"{tools!r}"
        )
        assert len(tools) > 0, (
            f"question {q.get('id')!r} has empty expected_tools"
        )
        assert all(isinstance(t, str) and t for t in tools), (
            f"question {q.get('id')!r} has non-string tool name in "
            f"{tools!r}"
        )
|
||||
|
||||
|
||||
def test_7_1_question_ids_are_unique() -> None:
    """Defensive: no two questions may share an id.

    Duplicate ids would silently overwrite each other in the
    runner's results table. Each repeat occurrence is collected so
    the failure message lists every offending id.
    """
    question_ids = (q.get("id") for q in _load_compiled()["questions"])

    already_seen = set()
    duplicates = []
    for qid in question_ids:
        if qid in already_seen:
            duplicates.append(qid)
        else:
            already_seen.add(qid)

    assert not duplicates, f"duplicate question ids: {duplicates}"
|
||||
|
||||
|
||||
def test_7_1_version_is_set() -> None:
    """The compiled JSON carries a non-empty ``version``.

    The version field is the contract with the runner that "old
    results stay comparable when the prompt iterates" (per the exec
    roadmap's D3). A missing version is a bug.
    """
    version = _load_compiled().get("version")
    assert version, "version field is empty or missing"
    # The runner treats the version as opaque; only its type is
    # pinned here (an integer or a semver-ish string).
    assert isinstance(version, (int, str))
|
||||
Reference in New Issue
Block a user