slice 7.2: system prompt + version registry
Per docs/plan/exec/07-harness.md sub-slice 7.2:
- lore_engine_poc/prompts/system_prompt.md — the
canonical system prompt. 5 question types with
canonical tool sequences, the citation rule
("cite every claim"), the time-window rule
(default at_time, explicit time in answer), the
contradiction rule (surface, don't resolve), the
6 failure modes the LLM must avoid. v1.2-aware:
mentions the slice 5T TypeTemplate tools and the
slice 6 Setting/Plane setting= filter.
- lore_engine_poc/prompts/registry.json — the
version registry. Pins the system prompt to v1.2
with model_target=minimax-m3:cloud. Old runs stay
comparable when the prompt iterates (D3).
- lore_engine_poc/prompts/loader.py — the loader.
list_registered_prompts() and load_current_system_prompt()
are the canonical entry points; the harness
runner uses them to fetch the prompt + stamp
results with the version.
- tests/harness/test_system_prompt.py — 9 tests:
registry well-formed, system_prompt registered,
path resolves, loader returns (text, version),
prompt has 5 question types, citation rule
present, time-window rule present, mentions
template tools, mentions setting filter.
Track A only (no API key). Track B uses the loader
when executing the harness.
Suite: 767 → 776 (+9).
Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
72
lore_engine_poc/prompts/loader.py
Normal file
72
lore_engine_poc/prompts/loader.py
Normal file
@@ -0,0 +1,72 @@
|
||||
"""Lore Engine POC — system prompt loader (slice 7.2).
|
||||
|
||||
Reads ``prompts/registry.json`` to find the current system
|
||||
prompt version, then loads the markdown from
|
||||
``prompts/<path>``. The loader is the canonical entry point
|
||||
for the harness (Track B) and any caller that needs the
|
||||
prompt as a string.
|
||||
|
||||
The registry pattern lets us bump the prompt version
|
||||
without renaming files — old results stay comparable
|
||||
(per the exec roadmap's D3). The runner stamps results
|
||||
with the loaded version; the test suite asserts the
|
||||
registry is well-formed.
|
||||
|
||||
Public API:
|
||||
|
||||
- ``load_current_system_prompt() -> (prompt_text, version)``
|
||||
- ``list_registered_prompts() -> list[dict]``
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
# Absolute path of the directory that contains this module; registry
# and prompt paths below are resolved relative to it.
_PROMPTS_DIR = Path(__file__).resolve().parent
|
||||
# Location of the version registry, a sibling file of this module.
_REGISTRY = _PROMPTS_DIR / "registry.json"
|
||||
|
||||
|
||||
def _read_registry() -> dict[str, Any]:
    """Read ``registry.json`` as UTF-8 and return the decoded JSON value.

    Nothing is cached: the file is read again on every call.
    """
    raw_text = _REGISTRY.read_text(encoding="utf-8")
    return json.loads(raw_text)
|
||||
|
||||
|
||||
def list_registered_prompts() -> list[dict]:
    """List every prompt entry declared in the registry.

    Returns a new list holding the items found under the registry's
    ``prompts`` key, or an empty list when that key is absent. An
    entry is a dict with ``id``, ``version`` and ``path``, plus the
    optional ``description`` / ``changelog`` / ``model_target`` /
    ``created_at`` fields. Only the registry file is read here; no
    prompt body is loaded.
    """
    registry = _read_registry()
    entries = registry.get("prompts", [])
    return list(entries)
|
||||
|
||||
|
||||
def load_current_system_prompt(
    prompt_id: str = "system_prompt",
) -> tuple[str, str]:
    """Load the current version of a registered prompt.

    ``prompt_id`` is matched against the ``id`` field of the registry
    entries; the first match wins.

    Returns ``(prompt_text, version)``. ``prompt_text`` is the UTF-8
    content of the file the entry's ``path`` points at (resolved
    relative to this module's directory); ``version`` is the string
    the registry declares. Callers stamp the version on results so
    old runs stay comparable when the prompt iterates.

    Raises ``KeyError`` when no entry has the requested id (the
    message lists the ids that are registered) or when the matching
    entry lacks ``path`` / ``version``. Errors from reading the
    prompt file propagate unchanged.
    """
    entries = _read_registry().get("prompts", [])
    for entry in entries:
        if entry.get("id") == prompt_id:
            path = _PROMPTS_DIR / entry["path"]
            text = path.read_text(encoding="utf-8")
            return text, entry["version"]
    # Use .get() here, matching the lookup above: an entry without an
    # ``id`` is skipped by the loop, so it must not turn this report
    # into a misleading ``KeyError: 'id'``.
    registered = [e.get("id") for e in entries]
    raise KeyError(
        f"prompt {prompt_id!r} not in registry "
        f"(registered: {registered})"
    )
|
||||
|
||||
|
||||
__all__ = [
|
||||
"list_registered_prompts",
|
||||
"load_current_system_prompt",
|
||||
]
|
||||
18
lore_engine_poc/prompts/registry.json
Normal file
18
lore_engine_poc/prompts/registry.json
Normal file
@@ -0,0 +1,18 @@
|
||||
{
|
||||
"$schema": "./registry.schema.json",
|
||||
"_comment": "Per docs/plan/exec/07-harness.md sub-slice 7.2: the registry pins the system-prompt version the harness uses. Old runs stay comparable when the prompt iterates — bumping the version number is the only signal that the prompt changed.",
|
||||
"prompts": [
|
||||
{
|
||||
"id": "system_prompt",
|
||||
"version": "1.2",
|
||||
"path": "system_prompt.md",
|
||||
"description": "Lore Engine system prompt — slice 5T (TypeTemplate) + slice 6 (Setting/Plane) aware",
|
||||
"created_at": "2026-06-19",
|
||||
"model_target": "minimax-m3:cloud",
|
||||
"changelog": [
|
||||
"v1.2 — added slice 5T TypeTemplate tools section + slice 6 Setting/Plane section; pin to the v1.2 graph model (Setting/Plane first-class, setting_id field, 4 plane-relation edge types)",
|
||||
"v1.0 — initial 5 question types + tool selection rules + failure modes"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
156
lore_engine_poc/prompts/system_prompt.md
Normal file
156
lore_engine_poc/prompts/system_prompt.md
Normal file
@@ -0,0 +1,156 @@
|
||||
# Lore Engine — System Prompt (v1.2)
|
||||
|
||||
> **Mirror:** the canonical, prose-only version of this
|
||||
> prompt lives in `docs/07-reasoning-harness.md`
|
||||
> in the design repo. The two must stay in sync; drift is
|
||||
> caught by `tests/harness/test_system_prompt.py` (slice 7.2).
|
||||
|
||||
You are an in-fiction world-builder's assistant for a
|
||||
particular setting. You answer questions about the world
|
||||
through a set of MCP tools exposed by the Lore Engine.
|
||||
|
||||
## Ground rules
|
||||
|
||||
1. **Always call a tool before claiming a fact.** Never
|
||||
answer from your own training data. The engine is the
|
||||
source of truth.
|
||||
2. **Cite every claim.** Every specific factual claim in
|
||||
your answer must cite at least one source returned by
|
||||
the tool. A claim without a source is a hallucination.
|
||||
3. **Default to time-windowed answers.** When the user
|
||||
doesn't specify a time, ask or use the engine's
|
||||
"current time" — but make the time explicit in your
|
||||
answer.
|
||||
4. **Never resolve contradictions yourself.** If two
|
||||
sources disagree, surface both with both sources. The
|
||||
world-builder decides.
|
||||
5. **Treat `lore_verified: false` as provisional.** If a
|
||||
tool returns an entity that's not yet lore-verified,
|
||||
say so.
|
||||
6. **Stop on tool errors.** If a tool returns an error,
|
||||
surface the error to the user and stop.
|
||||
|
||||
## The five question types
|
||||
|
||||
Classify every question into one of these types and apply
|
||||
the matching tool sequence:
|
||||
|
||||
### Type 1: Identity & description
|
||||
|
||||
*"Who is Aldric?" / "Tell me about House Vyr."*
|
||||
|
||||
1. `lookup(query)` — resolve the entity name.
|
||||
2. `entity_context(entity_id, at_time=current)` — get the
|
||||
one-hop summary.
|
||||
3. If sparse, `expand_context(entity_id, hops=2, min_confidence=0.5)`.
|
||||
4. `significance_of(entity_id)`.
|
||||
5. If the entity is a Person, also `list_lineage(person)`.
|
||||
|
||||
### Type 2: Time-bounded fact check
|
||||
|
||||
*"Were House Vyr and the Crimson Pact allied in 340 TA?"*
|
||||
|
||||
1. Resolve entities via `lookup` if needed.
|
||||
2. `was_true_at(RELATION, subject, object, at_time)`.
|
||||
3. If true, also `cite(claim)` for the supporting chunks.
|
||||
4. If false, `true_during(RELATION, subject, object, era)`
|
||||
to surface the actual intervals.
|
||||
|
||||
### Type 3: World state at a time
|
||||
|
||||
*"What was happening in Valdorn in 340 TA?"*
|
||||
|
||||
1. Resolve location/faction via `lookup`.
|
||||
2. `state_at(entity, at_time)` — comprehensive snapshot.
|
||||
3. If sparse, `entities_present(location, at_time)`.
|
||||
4. `events_during(era, location=resolved_location)`.
|
||||
5. `get_contradictions(subject=entity, severity=warn)`.
|
||||
|
||||
### Type 4: Causal / chain reasoning
|
||||
|
||||
*"Why did the Sundering happen?"*
|
||||
|
||||
1. Resolve the central entity/event.
|
||||
2. `event_chain(event, depth=3)`.
|
||||
3. For each significant event, `cite(claim=event_summary)`.
|
||||
4. If a Person is involved, `ancestors_of` /
|
||||
`descendants_of`.
|
||||
5. `get_anachronisms(entity=central)` to catch temporal
|
||||
impossibilities.
|
||||
|
||||
### Type 5: Open-ended narrative
|
||||
|
||||
*"Tell me about the Border Wars."*
|
||||
|
||||
1. Resolve the central entity.
|
||||
2. `state_at(entity, current)`.
|
||||
3. `event_chain(entity, depth=3)`.
|
||||
4. `lore_about(entity, type=prose, limit=10)`.
|
||||
5. `entity_context(person)` for each significant person.
|
||||
6. `summarize_chain(entity, depth=3, style=chronicle)`.
|
||||
7. `cite` on the spine's claims.
|
||||
8. `get_contradictions(subject=entity, severity=warn)`.
|
||||
|
||||
## Tool selection rules
|
||||
|
||||
1. Always `lookup` first if you don't have a canonical
|
||||
entity ID. Cost: 1 tool call. Savings: 10× if you'd
|
||||
have guessed wrong.
|
||||
2. Always `entity_context` before `expand_context`. Cheaper,
|
||||
and usually sufficient.
|
||||
3. Use `state_at` for any "what was X like at T" question.
|
||||
4. Use `event_chain` for any "why" or "what happened
|
||||
because of" question.
|
||||
5. Check `latest_run()` before answering a long historical
|
||||
arc. Stale consistency data is dangerous.
|
||||
6. Use `cite` for any specific factual claim you intend to
|
||||
repeat or emphasize.
|
||||
7. Use `narrate_arc` or `summarize_chain` only as a *base*
|
||||
for narrative — not as a final answer.
|
||||
|
||||
## Failure modes you must avoid
|
||||
|
||||
- **Answering from training data.** Always call a tool.
|
||||
- **Resolving contradictions yourself.** Report them.
|
||||
- **Confusing present and past.** Default to `at_time`.
|
||||
- **Treating `lore_verified: false` as canonical.** Mark
|
||||
provisional entities.
|
||||
- **Skipping the consistency check.** For any chain of 3+
|
||||
entities or 1+ time hops, call `get_anachronisms`.
|
||||
- **Hallucinating tool results.** If a tool errored, say so.
|
||||
Never invent a tool response.
|
||||
|
||||
## Slice 5T — TypeTemplate tools
|
||||
|
||||
When the engine has loaded `templates/` (slice 5T), you
|
||||
also have access to a set of dynamically-generated tools
|
||||
(`list_missions`, `get_mission`, `missions_by_target`,
|
||||
etc.). The same rules apply — always call a tool, cite
|
||||
sources, never resolve contradictions yourself. The
|
||||
template-driven queries are read-only; write tools
|
||||
(`log_mission`, etc.) are deferred to a later slice.
|
||||
|
||||
## Slice 6 — Setting + Plane
|
||||
|
||||
The engine now distinguishes `Setting` (a campaign/world
|
||||
scope) from `Plane` (a layer of existence within a
|
||||
setting). Every entity has an `EXISTS_IN` edge to its
|
||||
Setting; the `setting=` parameter on the read tools
|
||||
filters results by setting. When the user asks a
|
||||
cross-setting question, the engine returns the filtered
|
||||
answer — you should pass `setting=<id>` explicitly to
|
||||
disambiguate when the answer would otherwise mix settings.
|
||||
|
||||
For plane-specific questions, use the `LAYER_OF`,
|
||||
`REFLECTS`, `ADJACENT_TO`, and `ACCESSIBLE_VIA` edges.
|
||||
The `entity_planes(entity_id)` tool surfaces a person's
|
||||
planes.
|
||||
|
||||
## What you are NOT
|
||||
|
||||
- You are not the world-builder. The world-builder decides.
|
||||
- You are not an editor of the codex. You cannot add or
|
||||
change entities. (Slice 10's write tools let the
|
||||
*world-builder* do that; you only read.)
|
||||
- You are not a narrator for the players. You are the
|
||||
in-fiction reference; the GM narrates.
|
||||
148
tests/harness/test_system_prompt.py
Normal file
148
tests/harness/test_system_prompt.py
Normal file
@@ -0,0 +1,148 @@
|
||||
"""Slice 7.2 — System prompt + version registry.
|
||||
|
||||
Per ``docs/plan/exec/07-harness.md`` sub-slice 7.2:
|
||||
|
||||
- ``lore_engine_poc/prompts/system_prompt.md`` — the
|
||||
canonical prompt (5 question types, citation rule,
|
||||
time-window rule, contradiction rule).
|
||||
- Versioned in ``prompts/registry.json``; the harness
|
||||
reads the prompt file named by the entry's ``path``.
|
||||
- Tests pin the contract: the prompt has all 5
|
||||
question types, the citation rule, the time-window
|
||||
rule, and mentions the slice 5T template tools
|
||||
(per the slice 5T.5 follow-up).
|
||||
|
||||
Track A (no API key needed). Track B uses the loader
|
||||
when executing the harness.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from lore_engine_poc.prompts.loader import (
|
||||
list_registered_prompts,
|
||||
load_current_system_prompt,
|
||||
)
|
||||
|
||||
# ``lore_engine_poc/prompts`` under the directory three levels above
# this test file; used to check registry paths independently of the loader.
PROMPTS_DIR = Path(__file__).resolve().parent.parent.parent / "lore_engine_poc" / "prompts"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Registry tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_7_2_registry_well_formed() -> None:
    """The registry parses, lists at least one prompt, and every
    entry carries a non-empty ``id``, ``version`` and ``path``.
    """
    registered = list_registered_prompts()
    assert len(registered) >= 1
    required_fields = ("id", "version", "path")
    for item in registered:
        for field in required_fields:
            assert item.get(field), f"entry missing {field!r}: {item!r}"
|
||||
|
||||
|
||||
def test_7_2_system_prompt_is_registered() -> None:
    """The registry declares an entry whose id is ``system_prompt``.

    The harness looks the prompt up by that id, so a missing entry
    would break Track B.
    """
    registered_ids = {item["id"] for item in list_registered_prompts()}
    assert "system_prompt" in registered_ids
|
||||
|
||||
|
||||
def test_7_2_system_prompt_path_resolves() -> None:
    """The ``path`` of the ``system_prompt`` entry names an existing
    file under the prompts directory.

    The loader lets ``Path.read_text`` raise when the file is
    missing; this test checks for the file up front instead.
    """
    system_entry = next(
        item
        for item in list_registered_prompts()
        if item["id"] == "system_prompt"
    )
    prompt_file = PROMPTS_DIR / system_entry["path"]
    assert prompt_file.is_file(), f"system prompt file missing: {prompt_file}"
|
||||
|
||||
|
||||
def test_7_2_loader_returns_prompt_and_version() -> None:
    """``load_current_system_prompt()`` yields a ``(text, version)``
    pair of non-blank strings.

    The harness stamps results with the version, so the shape of
    the tuple is part of the contract.
    """
    prompt_text, prompt_version = load_current_system_prompt()
    for value in (prompt_text, prompt_version):
        assert isinstance(value, str) and value.strip()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Prompt content tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_7_2_prompt_has_five_question_types() -> None:
    """AC 7.2 — all five question-type headings from
    docs/07-reasoning-harness.md appear in the prompt.

    The harness grades the LLM's tool sequence against the expected
    sequence for the question's type; a type absent from the prompt
    cannot be classified reliably.
    """
    prompt_text, _version = load_current_system_prompt()
    expected_headings = (
        "Type 1: Identity",
        "Type 2: Time-bounded",
        "Type 3: World state",
        "Type 4: Causal",
        "Type 5: Open-ended",
    )
    for heading in expected_headings:
        assert heading in prompt_text, f"prompt missing {heading!r}"
|
||||
|
||||
|
||||
def test_7_2_prompt_citation_rule_present() -> None:
    """AC 7.2 — the prompt tells the LLM to cite every claim.

    The harness's citation-rate metric depends on this rule being
    explicit, so the test looks for the rule's own wording rather
    than for isolated words.
    """
    text, _ = load_current_system_prompt()
    # Collapse whitespace so hard line wraps in the markdown cannot
    # split the phrase being searched for.
    normalized = " ".join(text.lower().split())
    # Checking for "cite" and "every" separately is satisfied by the
    # ``cite`` tool name plus unrelated prose ("Classify every
    # question"), so it would still pass with the rule removed.
    assert (
        "cite every claim" in normalized
        or "must cite at least one source" in normalized
    ), "prompt is missing the explicit 'cite every claim' rule"
|
||||
|
||||
|
||||
def test_7_2_prompt_time_window_rule_present() -> None:
    """AC 7.2 — the prompt carries the time-window rule (default to
    time-windowed answers, state the time explicitly).

    The time-window-violation metric depends on this rule.
    """
    prompt_text, _version = load_current_system_prompt()
    mentions_at_time = "at_time" in prompt_text
    assert mentions_at_time or "time-window" in prompt_text.lower()
|
||||
|
||||
|
||||
def test_7_2_prompt_mentions_template_tools() -> None:
    """The prompt mentions the slice 5T TypeTemplate tools.

    Required by the slice 5T.5 follow-up and the exec roadmap's 7.2
    sub-slice, so the LLM knows the template-driven tools exist
    when the engine has loaded ``templates/``.
    """
    prompt_text, _version = load_current_system_prompt()
    # Either the exact type name or any template wording counts.
    names_type_template = "TypeTemplate" in prompt_text
    mentions_templates = "template" in prompt_text.lower()
    assert names_type_template or mentions_templates
|
||||
|
||||
|
||||
def test_7_2_prompt_mentions_setting_filter() -> None:
    """The prompt must mention the slice 6 setting filter so the
    LLM passes ``setting=<id>`` when asking cross-setting
    questions. (This is the v1.2-only addition; older prompts
    wouldn't have it.)
    """
    text, _ = load_current_system_prompt()
    # Look for the slice 6 markers specifically: the ``setting=``
    # parameter or the "Setting + Plane" section heading. Matching
    # the bare word "setting" is not enough — the prompt's opening
    # sentence ("for a particular setting") already contains it, so
    # such a check passes even without the slice 6 section.
    assert "setting=" in text or "Setting + Plane" in text, (
        "prompt is missing the slice 6 setting filter "
        "('setting=' parameter or 'Setting + Plane' section)"
    )
|
||||
Reference in New Issue
Block a user