slice 1.7: seed/yaml + ingest walks structured path; demo exercises time filter; integration tests prove AC 1.11 (85/85 green)
This commit is contained in:
Binary file not shown.
92
lore_engine_poc/parsers/_adapter.py
Normal file
92
lore_engine_poc/parsers/_adapter.py
Normal file
@@ -0,0 +1,92 @@
|
||||
"""Adapter that turns structured-YAML edge dicts into the legacy
|
||||
``Triple`` shape the in-memory :class:`Graph` consumes.
|
||||
|
||||
The structured-YAML parsers (slice 1) emit edge dicts in a
|
||||
deliberately verbose shape — extra keys like ``kind`` and
|
||||
``reason`` are useful for downstream reasoning but aren't part
|
||||
of the engine's canonical ``Triple``. This adapter narrows
|
||||
those dicts down to the ``Triple`` fields the graph builder
|
||||
expects, so the new YAML path lands in the same graph as the
|
||||
markdown/codex path.
|
||||
|
||||
Slice 2 will replace this adapter when the consistency engine
|
||||
re-receives the raw edge dicts directly.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Callable, Iterable
|
||||
|
||||
from ..parsers_legacy import Triple
|
||||
from ._yaml import YamlSchemaError
|
||||
from .family_tree import parse_family_tree_file
|
||||
|
||||
# Map of YAML file basename (without extension) -> parser function.
|
||||
# Each parser returns ``(nodes, edges)`` where each edge is a dict
|
||||
# in the parser-defined shape. Adding a new YAML format = add a
|
||||
# line here and a parser module.
|
||||
# Registry of structured-YAML parsers, keyed by kind name. Each parser is
# called with the path of one YAML file and returns a ``(nodes, edges)``
# pair. The value type is ``typing.Callable``: the builtin ``callable`` is
# a predicate function, not a type, and is rejected by type checkers.
PARSERS_BY_KIND: dict[str, Callable[[str], tuple]] = {
    "family_tree": parse_family_tree_file,
    # "factions": parse_factions_file,  # wired by sub-slice 1.2
    # "timeline": parse_timeline_file,  # wired by sub-slice 1.4
    # "gazetteer": parse_gazetteer_file,
    # "bestiary": parse_bestiary_file,
    # "magic_system": parse_magic_file,
    # "culture": parse_culture_file,
}
|
||||
|
||||
|
||||
# File-naming convention: the YAML filename is the kind name.
|
||||
# ``family_tree.yaml`` → kind = ``family_tree``. We try the longest
|
||||
# matching prefix; ``family_tree_raventhorne.yaml`` would still
|
||||
# match ``family_tree``. Files that don't match a known kind are
|
||||
# silently skipped (the markdown parser is the fallback for unknown
|
||||
# shapes).
|
||||
def _detect_kind(filename: str) -> str | None:
    """Return the registered kind whose name prefixes ``filename``, or ``None``.

    The text after the last ``.`` is dropped to obtain the stem. A kind
    matches when the stem equals it, or continues past it with ``_`` or
    ``.``. Kinds are tried longest-first so that a more specific kind wins
    over a shorter one that is also a prefix of the stem.
    """
    stem = filename.rsplit(".", 1)[0]
    longest_first = sorted(PARSERS_BY_KIND.keys(), key=len, reverse=True)
    return next(
        (
            candidate
            for candidate in longest_first
            if stem == candidate
            or stem.startswith((candidate + "_", candidate + "."))
        ),
        None,
    )
|
||||
|
||||
|
||||
def iter_structured_yaml(root: str) -> Iterable[Triple]:
    """Walk ``root`` and yield a :class:`Triple` for every structured-YAML edge.

    Every ``*.yaml`` / ``*.yml`` file (extension matched case-insensitively)
    whose name maps to a registered kind is handed to that kind's parser,
    and each edge dict the parser returns is converted with
    :func:`structured_edge_to_triple`. Files with another extension, or
    with no matching kind, are skipped.

    Directories and files are visited in sorted order, so repeated runs
    over the same tree yield triples in the same sequence.

    This is a generator: parsing happens lazily, so a schema error only
    surfaces once iteration reaches the offending file.

    Args:
        root: Directory to search recursively.

    Yields:
        One ``Triple`` per parser edge.

    Raises:
        YamlSchemaError: (or the subclass the parser raised) re-raised with
            the file path prefixed to the message, carrying over the
            original ``line`` and ``field``.
    """
    import os

    for dirpath, dirnames, filenames in os.walk(root):
        # os.walk lists entries in arbitrary order. Sorting the
        # sub-directory list in place (top-down walk) fixes the descent
        # order, matching the sorted file order below.
        dirnames.sort()
        for filename in sorted(filenames):
            if not filename.lower().endswith((".yaml", ".yml")):
                continue
            kind = _detect_kind(filename)
            if kind is None:
                continue
            parser = PARSERS_BY_KIND[kind]
            full = os.path.join(dirpath, filename)
            try:
                _nodes, edges = parser(full)
            except YamlSchemaError as err:
                # Re-raise with the file path inline so the caller can
                # see which YAML file failed without re-walking.
                raise type(err)(
                    f"{full}: {err}", line=err.line, field=err.field
                ) from err
            for edge in edges:
                yield structured_edge_to_triple(edge)
|
||||
|
||||
|
||||
def structured_edge_to_triple(e: dict) -> Triple:
    """Build a legacy :class:`Triple` from one parser edge dict.

    ``subject``, ``relation``, ``object``, ``source_path`` and
    ``source_slug`` must be present (a missing one raises ``KeyError``).
    The time bounds default to ``None``, both confidences to ``1.0`` and
    ``reliability`` to ``"canonical"``. Any other key in the edge dict is
    ignored.
    """
    mandatory = {
        key: e[key]
        for key in ("subject", "relation", "object", "source_path", "source_slug")
    }
    defaulted = {
        "valid_from": e.get("valid_from"),
        "valid_until": e.get("valid_until"),
        "extraction_confidence": e.get("extraction_confidence", 1.0),
        "source_confidence": e.get("source_confidence", 1.0),
        "reliability": e.get("reliability", "canonical"),
    }
    return Triple(**mandatory, **defaulted)
|
||||
@@ -89,6 +89,11 @@ class Triple:
|
||||
extraction_confidence: float = 1.0
|
||||
source_confidence: float = 1.0
|
||||
reliability: str = "canonical"
|
||||
# Time bounds on the edge. None for unbounded (the markdown path
|
||||
# always emits None; the structured-YAML path, slice 1, sets
|
||||
# these from YAML).
|
||||
valid_from: str | None = None
|
||||
valid_until: str | None = None
|
||||
|
||||
|
||||
def _read_frontmatter(text: str) -> tuple[dict, str]:
|
||||
|
||||
29
lore_engine_poc/seed/yaml/family_tree_ashveil_bloodline.yaml
Normal file
29
lore_engine_poc/seed/yaml/family_tree_ashveil_bloodline.yaml
Normal file
@@ -0,0 +1,29 @@
|
||||
lineage: "ashveil_bloodline"
|
||||
founding_ancestor: "theron_ashveil"
|
||||
description: "The bloodline of Theron Ashveil's descendants."
|
||||
|
||||
members:
|
||||
- id: "theron_ashveil"
|
||||
name: "Theron Ashveil"
|
||||
born: "3rd_age.year_200"
|
||||
died: "3rd_age.year_350"
|
||||
|
||||
- id: "maric_vyr"
|
||||
name: "Maric Vyr"
|
||||
born: "3rd_age.year_220"
|
||||
died: "3rd_age.year_330"
|
||||
|
||||
- id: "yssa_raventhorne"
|
||||
name: "Yssa Raventhorne"
|
||||
born: "3rd_age.year_260"
|
||||
died: "3rd_age.year_320"
|
||||
|
||||
- id: "aldric_raventhorne"
|
||||
name: "Aldric Raventhorne"
|
||||
born: "3rd_age.year_300"
|
||||
parents: ["theron_ashveil", "yssa_raventhorne"]
|
||||
|
||||
- id: "elara_raventhorne"
|
||||
name: "Elara Raventhorne"
|
||||
born: "3rd_age.year_305"
|
||||
parents: ["maric_vyr"]
|
||||
@@ -0,0 +1,15 @@
|
||||
# A timeline event used by the demo's time-bounded queries (AC 1.11).
|
||||
# The Battle of Black Spire takes place during the named sub-era;
|
||||
# the structured parser turns it into an Event node plus an Era node
|
||||
# with OCCURRED_DURING edges. Slice 1 ships the format; the timeline
|
||||
# parser implementation lands in sub-slice 1.4.
|
||||
|
||||
era: "3rd_age"
|
||||
sub_era: "age_of_iron"
|
||||
|
||||
events:
|
||||
- id: "battle_of_black_spire"
|
||||
name: "Battle of Black Spire"
|
||||
during: "3rd_age.age_of_iron"
|
||||
participants:
|
||||
- "aldric_raventhorne"
|
||||
@@ -149,6 +149,8 @@ def build_graph(entities: Iterable[Entity], triples: Iterable[Triple]) -> Graph:
|
||||
subject=t.subject,
|
||||
relation=t.relation,
|
||||
object=t.object,
|
||||
valid_from=t.valid_from,
|
||||
valid_until=t.valid_until,
|
||||
sources=[t.source_path],
|
||||
extraction_confidences=[t.extraction_confidence],
|
||||
source_confidences=[t.source_confidence],
|
||||
@@ -171,6 +173,31 @@ def build_graph(entities: Iterable[Entity], triples: Iterable[Triple]) -> Graph:
|
||||
# ``is_disputed``, and link them via ``disputed_with``.
|
||||
# Slice 2's consistency engine turns this into a
|
||||
# ``Contradiction`` node.
|
||||
#
|
||||
# 0. Existing is unbounded, new is bounded (slice 1,
|
||||
# structured-YAML override): the YAML is the
|
||||
# authoritative source for time bounds. Adopt the
|
||||
# YAML bounds on the existing edge and append the
|
||||
# YAML as a contributing source. The unbounded
|
||||
# markdown-style triple still gets cited for
|
||||
# audit (it's where the relationship was first
|
||||
# noticed), but the bounds come from the structured
|
||||
# path. This is the inverse of Case 3 — instead of
|
||||
# spawning a second disputed Edge, we promote the
|
||||
# structured source.
|
||||
if (
|
||||
existing.valid_from is None
|
||||
and existing.valid_until is None
|
||||
and (t.valid_from is not None or t.valid_until is not None)
|
||||
):
|
||||
existing.valid_from = t.valid_from
|
||||
existing.valid_until = t.valid_until
|
||||
if t.source_path not in existing.sources:
|
||||
existing.sources.append(t.source_path)
|
||||
existing.extraction_confidences.append(t.extraction_confidence)
|
||||
existing.source_confidences.append(t.source_confidence)
|
||||
existing.reliabilities.append(t.reliability)
|
||||
continue
|
||||
if t.source_path in existing.sources and _windows_consistent(
|
||||
existing.valid_from, existing.valid_until, None, None
|
||||
):
|
||||
|
||||
@@ -16,6 +16,7 @@ ROOT = Path(__file__).resolve().parent.parent
|
||||
sys.path.insert(0, str(ROOT))
|
||||
|
||||
from lore_engine_poc.parsers import iter_codex, extract_triples
|
||||
from lore_engine_poc.parsers import iter_structured_yaml
|
||||
from lore_engine_poc.tools import build_graph
|
||||
|
||||
|
||||
@@ -39,8 +40,23 @@ def main() -> int:
|
||||
return 1
|
||||
|
||||
entities = list(iter_codex(args.codex))
|
||||
triples = extract_triples(entities)
|
||||
graph = build_graph(entities, triples)
|
||||
md_triples = extract_triples(entities)
|
||||
|
||||
# Slice 1: also walk the structured-YAML files alongside the
|
||||
# markdown codex. Markdown triples are the prose path (no
|
||||
# bounds); YAML triples are the structured path (with bounds).
|
||||
# ``build_graph`` merges them, and the structured bounds win
|
||||
# when both paths produce the same fact.
|
||||
yaml_root = os.path.join(args.codex, "yaml")
|
||||
yaml_triples = []
|
||||
if os.path.isdir(yaml_root):
|
||||
yaml_triples = list(iter_structured_yaml(yaml_root))
|
||||
print(f"[01_ingest] structured YAML triples: {len(yaml_triples)}")
|
||||
else:
|
||||
print(f"[01_ingest] no yaml/ subdir; structured path skipped")
|
||||
|
||||
graph = build_graph(entities, md_triples + yaml_triples)
|
||||
triples = md_triples + yaml_triples
|
||||
|
||||
# Dedupe triples for reporting.
|
||||
seen = set()
|
||||
|
||||
@@ -64,6 +64,15 @@ def main() -> int:
|
||||
"MEMBER_OF,Roland Raventhorne,Iron Mountain Trading Company,3rd_age.year_345",
|
||||
# Negative case: a relationship that doesn't exist in the codex.
|
||||
"ALLIED_WITH,House Raventhorne,House Quche,3rd_age.year_345",
|
||||
# Slice 1: time-bounded edges from the structured YAML path.
|
||||
# Inside Theron's life → true.
|
||||
"PARENT_OF,Theron Ashveil,Aldric Raventhorne,3rd_age.year_325",
|
||||
# After Theron's death at year 350 → false (proves the
|
||||
# time filter actually fires on the YAML-derived bounds).
|
||||
"PARENT_OF,Theron Ashveil,Aldric Raventhorne,3rd_age.year_400",
|
||||
# Maric's PARENT_OF window runs from Elara's birth (year 305) to his
# death (year 330); the two queries below fall before and after it.
|
||||
"PARENT_OF,Maric Vyr,Elara Raventhorne,3rd_age.year_250",
|
||||
"PARENT_OF,Maric Vyr,Elara Raventhorne,3rd_age.year_340",
|
||||
]
|
||||
|
||||
for q in queries:
|
||||
|
||||
132
tests/test_parsers/test_integration.py
Normal file
132
tests/test_parsers/test_integration.py
Normal file
@@ -0,0 +1,132 @@
|
||||
"""Tests for the structured-YAML → Graph integration (slice 1, sub-slice 1.7).
|
||||
|
||||
These exercise the seam between the new YAML parsers and the legacy
|
||||
in-memory graph. The contract: a YAML file's time-bounded edges
|
||||
must reach the graph with their ``valid_from``/``valid_until``
|
||||
populated, and the markdown/codex path's unbounded edges must not
|
||||
be allowed to clobber them on merge.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import textwrap
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def _yaml(tmp_path, name, body):
|
||||
p = tmp_path / name
|
||||
p.write_text(textwrap.dedent(body).lstrip())
|
||||
return p
|
||||
|
||||
|
||||
def test_yaml_edge_reaches_graph_with_bounds(tmp_path):
    """AC 1.2 + 1.11: YAML PARENT_OF reaches the graph with non-null bounds.

    The fixture is named ``family_tree_minimal.yaml``, i.e. a kind name
    followed by a ``_`` suffix rather than the bare kind name.
    """
    from lore_engine_poc.parsers import iter_structured_yaml
    from lore_engine_poc.tools import build_graph

    _yaml(tmp_path, "family_tree_minimal.yaml", """
        lineage: "test_lineage"
        members:
          - id: "parent"
            name: "Parent"
            born: "3rd_age.year_200"
            died: "3rd_age.year_350"
          - id: "child"
            name: "Child"
            born: "3rd_age.year_300"
            parents: ["parent"]
    """)

    triples = list(iter_structured_yaml(str(tmp_path)))
    g = build_graph([], triples)

    parent_of = g.edges_by_subject.get("Parent", {}).get("PARENT_OF", [])
    # Use a default so a missing edge fails with a readable assertion
    # instead of a bare StopIteration.
    edge = next((e for e in parent_of if e.object == "Child"), None)
    assert edge is not None, "no PARENT_OF edge from 'Parent' to 'Child' in the graph"
    # Expected window: the child's birth year to the parent's death year.
    assert edge.valid_from == "3rd_age.year_300"
    assert edge.valid_until == "3rd_age.year_350"
|
||||
|
||||
|
||||
def test_was_true_at_uses_yaml_bounds(tmp_path):
    """AC 1.11: was_true_at returns was_true=false when at_time is outside the YAML window."""
    from lore_engine_poc.parsers import iter_structured_yaml
    from lore_engine_poc.tools import build_graph, was_true_at

    fixture = """
        lineage: "test_lineage"
        members:
          - id: "parent"
            name: "Parent"
            born: "3rd_age.year_200"
            died: "3rd_age.year_350"
          - id: "child"
            name: "Child"
            born: "3rd_age.year_300"
            parents: ["parent"]
    """
    _yaml(tmp_path, "family_tree.yaml", fixture)

    graph = build_graph([], list(iter_structured_yaml(str(tmp_path))))

    def ask(at_time):
        # Same relation and endpoints every time; only the instant varies.
        return was_true_at(graph, "PARENT_OF", "Parent", "Child", at_time)

    # Inside the window [300, 350): the fact holds and the bounds are reported.
    inside = ask("3rd_age.year_325")
    assert inside["was_true"] is True
    assert inside["valid_from"] == "3rd_age.year_300"
    assert inside["valid_until"] == "3rd_age.year_350"

    # Past the upper bound (parent died at year 350): false, zero confidence.
    after = ask("3rd_age.year_400")
    assert after["was_true"] is False
    assert after["confidence"] == 0.0

    # Before the lower bound (child born at year 300): false.
    before = ask("3rd_age.year_250")
    assert before["was_true"] is False
|
||||
|
||||
|
||||
def test_yaml_edge_wins_when_markdown_already_has_unbounded_edge(tmp_path):
    """Slice 1.7 integration: structured YAML bounds must not be lost
    when a markdown extractor has already produced an unbounded edge
    for the same (subject, relation, object) tuple."""
    from lore_engine_poc.parsers import Triple, iter_structured_yaml
    from lore_engine_poc.tools import build_graph

    _yaml(tmp_path, "family_tree.yaml", """
        lineage: "test_lineage"
        members:
          - id: "parent"
            name: "Parent"
            born: "3rd_age.year_200"
            died: "3rd_age.year_350"
          - id: "child"
            name: "Child"
            born: "3rd_age.year_300"
            parents: ["parent"]
    """)

    # Markdown-style triple: same fact, no bounds. It is passed to
    # build_graph BEFORE the YAML triples, so the bounded YAML triple
    # arrives when an unbounded edge for the fact already exists.
    markdown_triple = Triple(
        subject="Parent",
        relation="PARENT_OF",
        object="Child",
        source_path="/tmp/markdown.md",
        source_slug="markdown",
        extraction_confidence=0.6,
        source_confidence=1.0,
        reliability="canonical",
    )

    g = build_graph([], [markdown_triple] + list(iter_structured_yaml(str(tmp_path))))

    parent_of = g.edges_by_subject.get("Parent", {}).get("PARENT_OF", [])
    # Use a default so a missing edge fails with a readable assertion
    # instead of a bare StopIteration.
    edge = next((e for e in parent_of if e.object == "Child"), None)
    assert edge is not None, "no PARENT_OF edge from 'Parent' to 'Child' in the graph"
    # The merged edge must carry the YAML bounds, and the markdown file
    # must still be cited as one of its sources.
    assert edge.valid_from == "3rd_age.year_300"
    assert edge.valid_until == "3rd_age.year_350"
    assert "/tmp/markdown.md" in edge.sources
|
||||
Reference in New Issue
Block a user