slice 1.1: family_tree.yaml parser (TDD, 7/7 tests; AC 1.1,1.2,1.5,1.6,1.7,1.12,1.15)

This commit is contained in:
Lore Engine Dev
2026-06-18 00:29:53 -04:00
parent 40880735ec
commit 2b065236e0
7 changed files with 529 additions and 0 deletions

3
.gitignore vendored
View File

@@ -1,9 +1,12 @@
__pycache__/
*.pyc
*.pyo
.graph.pkl
.cache/
.venv/
venv/
.env
.env.local
cognee-data/
*.db
*.wal

View File

@@ -0,0 +1,45 @@
"""Structured-YAML parsers (slice 1).
Each parser takes a file path and returns ``(nodes, triples)`` in the
canonical engine shape. Markdown parsing for the codex still lives
in :mod:`lore_engine_poc.parsers_legacy` (the legacy module) until
slice 3 when LLM extraction lands.
.. note::
    The historical :mod:`lore_engine_poc.parsers` module has been
    renamed to :mod:`lore_engine_poc.parsers_legacy` to make room
    for this subpackage (which is itself a package, not a module —
    Python 3 doesn't allow both at the same name). The legacy
    names (``Entity``, ``LoreSource``,
    ``RELIABILITY_TO_SOURCE_CONFIDENCE``, ``Triple``,
    ``extract_triples``, ``iter_codex``, ``parse_file``) are
    re-exported from here for backwards compatibility with
    slice 0's tests and scripts.
"""
from .family_tree import (
FamilyTreeAnachronismError,
parse_family_tree_file,
)
from ._yaml import YamlSchemaError
from ..parsers_legacy import ( # noqa: F401 -- re-export
Entity,
LoreSource,
RELIABILITY_TO_SOURCE_CONFIDENCE,
Triple,
extract_triples,
iter_codex,
parse_file,
)
# Public names of the ``parsers`` package: the family-tree parser and the
# two error types imported above, plus the names re-exported from
# ``parsers_legacy``. Kept in alphabetical order.
__all__ = [
    "Entity",
    "FamilyTreeAnachronismError",
    "LoreSource",
    "RELIABILITY_TO_SOURCE_CONFIDENCE",
    "Triple",
    "YamlSchemaError",
    "extract_triples",
    "iter_codex",
    "parse_family_tree_file",
    "parse_file",
]

View File

@@ -0,0 +1,95 @@
"""YAML helpers shared across structured-YAML parsers (slice 1).
Centralises the strict-load + line-number error pattern so every
parser reports violations the same way.
"""
from __future__ import annotations
from pathlib import Path
from typing import Any
import yaml
class YamlSchemaError(ValueError):
    """Error for a YAML document that breaks a parser's schema.

    The rendered message is ``message`` followed by a parenthesised
    location suffix built from whichever of ``field`` / ``line`` were
    supplied, e.g. ``"bad (field 'a.b', line 3)"``.

    Attributes:
        line: Line number of the offending input, or ``None`` if unknown.
        field: Path of the offending field inside the document, or
            ``None`` if not given.
    """

    def __init__(self, message: str, *, line: int | None = None, field: str | None = None):
        # The location is folded into the message text so that a bare
        # stack trace is enough to find the problem. An empty ``field``
        # is treated as absent; ``line`` is only absent when it is None.
        candidates = (
            (bool(field), f"field '{field}'"),
            (line is not None, f"line {line}"),
        )
        location = [text for present, text in candidates if present]
        suffix = f" ({', '.join(location)})" if location else ""
        super().__init__(f"{message}{suffix}")
        self.line = line
        self.field = field
def load_yaml(path: str) -> tuple[dict, str]:
    """Load a YAML file and require a mapping at the top level.

    The file is read as UTF-8 and parsed with ``yaml.safe_load``. An empty
    document (which PyYAML returns as ``None``) is treated as an empty
    mapping.

    Note that ``safe_load`` applies PyYAML's normal scalar resolution, so
    unquoted values such as ``no`` / ``yes`` / ``on`` / ``off`` load as
    booleans (the "Norway problem"). This function does not change that;
    it only guarantees the *root* is a ``dict``. Field-level helpers such
    as ``require_str`` are what reject a value that did not load as a
    string.

    Args:
        path: Filesystem path of the YAML file.

    Returns:
        ``(data, source_path)`` where ``data`` is the parsed mapping and
        ``source_path`` is ``str(Path(path))``.

    Raises:
        YamlSchemaError: If the file cannot be read or is not valid UTF-8,
            if PyYAML reports a syntax error (``line`` is set from the
            error's ``problem_mark`` when one is present), or if the
            top-level value is not a mapping (``line`` is 1).
    """
    p = Path(path)
    try:
        text = p.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as e:
        # UnicodeDecodeError is a ValueError, not an OSError; without
        # catching it a mis-encoded file escaped as a raw decode error
        # instead of the parser's own error type.
        raise YamlSchemaError(f"cannot read {path}: {e}") from e

    try:
        data = yaml.safe_load(text)
    except yaml.YAMLError as e:
        # Only "marked" PyYAML errors carry a position; marks are 0-based.
        mark = getattr(e, "problem_mark", None)
        line = mark.line + 1 if mark is not None else None
        raise YamlSchemaError(f"YAML parse error: {e}", line=line) from e

    if data is None:
        data = {}
    if not isinstance(data, dict):
        raise YamlSchemaError(
            f"top-level must be a mapping, got {type(data).__name__}",
            line=1,
        )
    return data, str(p)
def require_list(data: dict, key: str, *, where: str) -> list:
    """Fetch ``data[key]`` and insist that it is a list.

    Args:
        data: Mapping to read from.
        key: Name of the required entry.
        where: Path of ``data`` inside the document; ``.{key}`` is
            appended to it to form the error's ``field``.

    Returns:
        The list stored under ``key`` (the same object, not a copy).

    Raises:
        YamlSchemaError: If ``key`` is absent or its value is not a list.
    """
    location = f"{where}.{key}"
    if key in data:
        value = data[key]
        if isinstance(value, list):
            return value
        raise YamlSchemaError(
            f"field '{key}' must be a list, got {type(value).__name__}",
            field=location,
        )
    raise YamlSchemaError(f"missing required field '{key}'", field=location)
def require_str(data: dict, key: str, *, where: str) -> str:
    """Fetch ``data[key]`` and insist that it is a non-blank string.

    Args:
        data: Mapping to read from.
        key: Name of the required entry.
        where: Path of ``data`` inside the document; ``.{key}`` is
            appended to it to form the error's ``field``.

    Returns:
        The value with surrounding whitespace stripped.

    Raises:
        YamlSchemaError: If ``key`` is absent, or its value is not a
            string, or is empty / whitespace-only.
    """
    location = f"{where}.{key}"
    if key not in data:
        raise YamlSchemaError(f"missing required field '{key}'", field=location)
    raw = data[key]
    # Non-strings collapse to "" so one check covers both failure modes.
    cleaned = raw.strip() if isinstance(raw, str) else ""
    if not cleaned:
        raise YamlSchemaError(
            f"field '{key}' must be a non-empty string, got {raw!r}",
            field=location,
        )
    return cleaned

View File

@@ -0,0 +1,206 @@
"""``family_tree.yaml`` parser (slice 1, sub-slice 1.1).
Reads the lineage YAML format documented in
``docs/06-ingestion.md`` and ``docs/plan/01-slice-structured-yaml.md``
and emits:
* one ``Lineage`` node per file (NOT a ``Faction`` — ADR 0003),
* one ``Person`` node per member,
* one ``MEMBER_OF(Lineage=<id>)`` edge per member with
``valid_from = member.born`` and ``valid_until = None`` when
living or ``member.died`` when set,
* one ``PARENT_OF`` edge per ``(parent, child)`` pair with
``valid_from = child.born`` and ``valid_until = parent.died``
(or ``None`` if parent still living).
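Schema validation raises :class:`YamlSchemaError`, carrying location
context where it is available (a field path for missing or mistyped
keys, a line number for YAML syntax errors); anachronistic YAML (a
parent who dies before their child is born) raises
:class:`FamilyTreeAnachronismError`.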
Edge dictionaries returned by :func:`parse_family_tree_file` use the
shape expected by the rest of the engine::
{
"subject": str, "relation": str, "object": str,
"valid_from": str | None, "valid_until": str | None,
"source_path": str, "source_slug": str,
"extraction_confidence": float,
"source_confidence": float,
"reliability": str,
}
"""
from __future__ import annotations
from pathlib import Path
from typing import Any
from ._yaml import YamlSchemaError, load_yaml, require_list, require_str
class FamilyTreeAnachronismError(YamlSchemaError):
    """Raised when a parent dies before their child is born.

    Subclasses :class:`YamlSchemaError`, so callers that catch schema
    errors also catch this one; it accepts the same ``line`` / ``field``
    keyword arguments.
    """
# Reliability is canonical by default for YAML; the demo world uses
# no frontmatter override on its YAML files.
# These three values are stamped unchanged onto every edge this module
# emits (``reliability``, ``extraction_confidence``, ``source_confidence``).
DEFAULT_RELIABILITY = "canonical"
DEFAULT_EXTRACTION_CONFIDENCE = 1.0
DEFAULT_SOURCE_CONFIDENCE = 1.0
def _member_id(m: dict) -> str:
"""Return the slug id for a member dict."""
raw = m.get("id")
if isinstance(raw, str) and raw.strip():
return raw.strip()
raise YamlSchemaError("member missing required 'id' (slug)")
def _member_name(m: dict) -> str:
    """Return the member's display name, stripped of surrounding whitespace.

    Raises:
        YamlSchemaError: Via ``require_str`` when ``name`` is missing, not
            a string, or blank; the error's ``field`` is
            ``members[].name``.
    """
    # ``require_str`` appends ``.{key}`` to ``where`` itself, so only the
    # container path is passed. Passing "members[].name" here made the
    # reported field read "members[].name.name".
    return require_str(m, "name", where="members[]")
def _member_born(m: dict) -> str | None:
"""Return the born time string, or None if explicitly null/missing."""
val = m.get("born", None)
if val is None:
return None
if isinstance(val, str) and val.strip():
return val.strip()
raise YamlSchemaError(f"member.born must be a time string or null, got {val!r}")
def _member_died(m: dict) -> str | None:
val = m.get("died", None)
if val is None:
return None
if isinstance(val, str) and val.strip():
return val.strip()
raise YamlSchemaError(f"member.died must be a time string or null, got {val!r}")
def _member_parents(m: dict) -> list[str]:
val = m.get("parents", [])
if val is None:
return []
if not isinstance(val, list):
raise YamlSchemaError(
f"member.parents must be a list of slugs, got {type(val).__name__}"
)
out: list[str] = []
for p in val:
if not isinstance(p, str) or not p.strip():
raise YamlSchemaError(f"member.parents entries must be non-empty strings, got {p!r}")
out.append(p.strip())
return out
def _validate_member_shape(m: Any, *, where: str) -> None:
if not isinstance(m, dict):
raise YamlSchemaError(f"{where} must be a mapping, got {type(m).__name__}")
def parse_family_tree_file(path: str) -> tuple[list[dict], list[dict]]:
    """Parse one ``family_tree.yaml`` file into nodes and edge triples.

    Args:
        path: Filesystem path of the YAML file.

    Returns:
        ``(nodes, triples)``. ``nodes`` is a list of dicts::

            {"id": str, "name": str, "type": str}  # type is Lineage | Person

        holding the ``Lineage`` node first, then one ``Person`` node per
        member in file order. ``triples`` holds one ``MEMBER_OF`` edge per
        member followed by one ``PARENT_OF`` edge per ``(parent, child)``
        pair, in the engine's canonical edge shape plus a ``kind`` hint.
        Edge subjects and objects are member *names*; the lineage is
        referenced by its id. Re-parsing the same file yields the same
        output (idempotent).

    Raises:
        YamlSchemaError: If the file cannot be loaded, a required field is
            missing or mistyped, a member id is duplicated, or a member
            names a parent that is not in the file.
        FamilyTreeAnachronismError: If a parent's ``died`` precedes their
            child's ``born``.
    """
    data, source_path = load_yaml(path)
    lineage_id = require_str(data, "lineage", where="family_tree")

    # founding_ancestor is optional; its type is checked but it is not
    # otherwise used or enforced yet.
    founding = data.get("founding_ancestor")
    if founding is not None and not isinstance(founding, str):
        raise YamlSchemaError(
            "founding_ancestor must be a string slug or null",
            field="family_tree.founding_ancestor",
        )

    members = require_list(data, "members", where="family_tree")

    # Index members by id, rejecting malformed entries and duplicates.
    # Insertion order is file order, which fixes the output order below.
    by_id: dict[str, dict] = {}
    for idx, m in enumerate(members):
        _validate_member_shape(m, where=f"members[{idx}]")
        mid = _member_id(m)
        if mid in by_id:
            raise YamlSchemaError(
                f"duplicate member id '{mid}' in lineage '{lineage_id}'",
                field=f"members[{idx}].id",
            )
        by_id[mid] = m

    source_slug = Path(source_path).stem

    def make_edge(subject, relation, obj, valid_from, valid_until, kind):
        """Build one edge dict carrying this file's provenance fields."""
        return {
            "subject": subject,
            "relation": relation,
            "object": obj,
            "valid_from": valid_from,
            "valid_until": valid_until,
            "source_path": source_path,
            "source_slug": source_slug,
            "extraction_confidence": DEFAULT_EXTRACTION_CONFIDENCE,
            "source_confidence": DEFAULT_SOURCE_CONFIDENCE,
            "reliability": DEFAULT_RELIABILITY,
            # Edge-kind hint for downstream consumers; not part of the
            # engine's canonical shape but useful for the consistency
            # engine (slice 2).
            "kind": kind,
        }

    # The Lineage node itself, then one Person node per member.
    nodes: list[dict] = [
        {"id": lineage_id, "name": lineage_id.replace("_", " ").title(), "type": "Lineage"}
    ]
    nodes.extend(
        {"id": mid, "name": _member_name(m), "type": "Person"}
        for mid, m in by_id.items()
    )

    # Membership edges — one per member. valid_from = member.born,
    # valid_until = member.died (or None).
    triples: list[dict] = [
        make_edge(
            _member_name(m),
            "MEMBER_OF",
            lineage_id,
            _member_born(m),
            _member_died(m),
            "MEMBER_OF(Lineage)",
        )
        for m in by_id.values()
    ]

    # Imported at call time rather than at module import, as before, but
    # once per call instead of once per parent edge.
    # NOTE(review): presumably function-local to avoid an import cycle —
    # confirm before moving it to module level.
    from ..time_model import time_in_window

    # PARENT_OF edges. valid_from = child.born, valid_until = parent.died.
    for mid, m in by_id.items():
        child_name = _member_name(m)
        child_born = _member_born(m)
        for parent_id in _member_parents(m):
            if parent_id not in by_id:
                raise YamlSchemaError(
                    f"member '{mid}' references unknown parent '{parent_id}'",
                    field=f"members[{mid}].parents",
                )
            parent = by_id[parent_id]
            parent_name = _member_name(parent)
            parent_died = _member_died(parent)

            # Anachronism check, only possible when both dates are known.
            # The relationship only spans [child_born, parent_died); if
            # parent_died is *before* child_born that window is empty.
            # NOTE(review): relies on time_in_window(t, start, None)
            # meaning "t is at or after start" — confirm in time_model.
            if (
                child_born is not None
                and parent_died is not None
                and not time_in_window(parent_died, child_born, None)
            ):
                raise FamilyTreeAnachronismError(
                    f"parent '{parent_name}' died at {parent_died}, "
                    f"before child '{child_name}' was born at {child_born}",
                    field=f"members[{mid}].parents",
                )

            triples.append(
                make_edge(
                    parent_name,
                    "PARENT_OF",
                    child_name,
                    child_born,
                    parent_died,
                    "PARENT_OF",
                )
            )

    return nodes, triples
# Public names of this module; the ``_member_*`` helpers and the
# ``DEFAULT_*`` constants are not listed here.
__all__ = [
    "FamilyTreeAnachronismError",
    "parse_family_tree_file",
]

View File

View File

@@ -0,0 +1,180 @@
"""TDD-first tests for the family_tree.yaml parser (slice 1, sub-slice 1.1).
Each test corresponds to one AC row in
``docs/plan/01-slice-structured-yaml.md``. Tests fail first;
implementation follows.
"""
from __future__ import annotations
import textwrap
import pytest
# A canonical two-member family tree used by several tests.
# Both members are in the 3rd age so that the parent's death
# (year 350) post-dates the child's birth (year 300) — a healthy
# relationship window of 50 years.
# Each ``members`` entry is a mapping, so the keys after ``- id:`` must be
# indented to line up with ``id`` (two spaces for the dash, four for the
# continuation keys); otherwise the document does not parse as a list of
# mappings.
CANONICAL_YAML = (
    "lineage: \"ashveil_bloodline\"\n"
    "founding_ancestor: \"theron_ashveil\"\n"
    "description: \"The bloodline of Theron Ashveil's descendants.\"\n"
    "members:\n"
    "  - id: \"theron_ashveil\"\n"
    "    name: \"Theron Ashveil\"\n"
    "    born: \"3rd_age.year_200\"\n"
    "    died: \"3rd_age.year_350\"\n"
    "  - id: \"aldric_raventhorne\"\n"
    "    name: \"Aldric Raventhorne\"\n"
    "    born: \"3rd_age.year_300\"\n"
    "    parents: [\"theron_ashveil\"]\n"
)
@pytest.fixture
def family_tree_path(tmp_path):
    """Write ``CANONICAL_YAML`` to a temp file and return its path."""
    p = tmp_path / "ashveil.yaml"
    # The loader reads files as UTF-8, so write with the same encoding
    # rather than the platform default.
    p.write_text(CANONICAL_YAML, encoding="utf-8")
    return p
def test_family_tree_emits_lineage_node_not_faction_node(family_tree_path):
    """AC 1.12: a family tree yields a Lineage node and no Faction node."""
    from lore_engine_poc.parsers.family_tree import parse_family_tree_file

    parsed = parse_family_tree_file(str(family_tree_path))
    kinds = {node["type"] for node in parsed[0]}
    assert "Lineage" in kinds
    assert "Faction" not in kinds
def test_family_tree_emits_member_of_edge_with_valid_from(family_tree_path):
    """AC 1.2: MEMBER_OF bounds come from the YAML lifespan.

    ``valid_from`` is the member's ``born``; ``valid_until`` stays
    ``None`` for a member with no ``died``.
    """
    from lore_engine_poc.parsers.family_tree import parse_family_tree_file

    parsed = parse_family_tree_file(str(family_tree_path))
    memberships = [edge for edge in parsed[1] if edge["relation"] == "MEMBER_OF"]
    assert len(memberships) == 2

    matches = [edge for edge in memberships if edge["subject"] == "Aldric Raventhorne"]
    aldric = matches[0]
    assert aldric["object"] == "ashveil_bloodline"
    assert aldric["valid_from"] == "3rd_age.year_300"  # = aldric.born
    assert aldric["valid_until"] is None  # still living
def test_family_tree_emits_parent_of_edge_with_bounds_from_lifespans(family_tree_path):
    """AC 1.2: PARENT_OF runs from the child's birth to the parent's death."""
    from lore_engine_poc.parsers.family_tree import parse_family_tree_file

    parsed = parse_family_tree_file(str(family_tree_path))
    parent_edges = [edge for edge in parsed[1] if edge["relation"] == "PARENT_OF"]
    assert len(parent_edges) == 1

    only = parent_edges[0]
    assert only["subject"] == "Theron Ashveil"
    assert only["object"] == "Aldric Raventhorne"
    assert only["valid_from"] == "3rd_age.year_300"  # aldric.born
    assert only["valid_until"] == "3rd_age.year_350"  # theron.died
def test_family_tree_anachronism_flagged_when_parent_dies_before_child_born(tmp_path):
    """AC 1.6: an anachronistic YAML raises a parser error, not silent."""
    from lore_engine_poc.parsers.family_tree import (
        FamilyTreeAnachronismError,
        parse_family_tree_file,
    )

    # The parent dies in year 2; the child is born in year 10.
    bad = tmp_path / "broken.yaml"
    bad.write_text(
        textwrap.dedent("""
            lineage: "broken"
            members:
              - id: "parent"
                name: "Parent"
                born: "3rd_age.year_1"
                died: "3rd_age.year_2"
              - id: "child"
                name: "Child"
                born: "3rd_age.year_10"
                parents: ["parent"]
        """).lstrip(),
        encoding="utf-8",
    )

    # Expect the dedicated error type: a bare ``Exception`` would also
    # accept unrelated failures such as an ImportError or a YAML typo.
    with pytest.raises(FamilyTreeAnachronismError) as excinfo:
        parse_family_tree_file(str(bad))

    # The error must reference which fact is wrong.
    msg = str(excinfo.value).lower()
    assert "anachron" in msg or "before child" in msg or "born after" in msg
def test_family_tree_idempotent_under_re_parse(family_tree_path):
    """AC 1.7: parsing the same file twice yields equal triples."""
    from lore_engine_poc.parsers.family_tree import parse_family_tree_file

    target = str(family_tree_path)
    first_pass = parse_family_tree_file(target)[1]
    second_pass = parse_family_tree_file(target)[1]
    assert first_pass == second_pass
def test_family_tree_malformed_yaml_reports_line_number(tmp_path):
    """AC 1.5: schema violation reports line number."""
    from lore_engine_poc.parsers import YamlSchemaError
    from lore_engine_poc.parsers.family_tree import parse_family_tree_file

    # members must be a list; here it's a string.
    bad = tmp_path / "bad.yaml"
    bad.write_text(
        textwrap.dedent("""
            lineage: "broken"
            members: "not a list"
        """).lstrip(),
        encoding="utf-8",
    )

    # Expect the parser's schema error specifically: a bare ``Exception``
    # would let an unrelated crash satisfy this test.
    with pytest.raises(YamlSchemaError) as excinfo:
        parse_family_tree_file(str(bad))

    # The error must carry *location* context — either a "line N"
    # in the message or a JSON-pointer-like field path. Both are
    # acceptable per AC 1.5.
    exc = excinfo.value
    msg = str(exc)
    has_line = ("line " in msg) or ("line=" in msg)
    has_field = getattr(exc, "field", None) is not None
    assert has_line or has_field, f"error has no location info: {exc!r}"
def test_cross_lineage_marriage_child_in_named_lineage(tmp_path):
    """AC 1.15: cross-lineage child is MEMBER_OF the lineage named in YAML header."""
    from lore_engine_poc.parsers.family_tree import parse_family_tree_file

    # Aldric's parents span two lineages. The YAML header says the
    # child's lineage is ashveil_bloodline. The other parent is
    # reachable only via PARENT_OF (slice 1 doesn't model
    # "belongs to two lineages by blood").
    f = tmp_path / "mixed.yaml"
    # Member keys are nested under each ``- id:`` entry so that every
    # member loads as one mapping.
    f.write_text(
        textwrap.dedent("""
            lineage: "ashveil_bloodline"
            members:
              - id: "theron_ashveil"
                name: "Theron Ashveil"
                born: "3rd_age.year_200"
                died: "3rd_age.year_350"
              - id: "yssa_raventhorne"
                name: "Yssa Raventhorne"
                born: "3rd_age.year_250"
              - id: "aldric_raventhorne"
                name: "Aldric Raventhorne"
                born: "3rd_age.year_300"
                parents: ["theron_ashveil", "yssa_raventhorne"]
        """).lstrip(),
        encoding="utf-8",
    )

    _, triples = parse_family_tree_file(str(f))

    member_of_aldric = [
        t for t in triples
        if t["relation"] == "MEMBER_OF" and t["subject"] == "Aldric Raventhorne"
    ]
    assert len(member_of_aldric) == 1
    assert member_of_aldric[0]["object"] == "ashveil_bloodline"

    # PARENT_OF edges cover both parents, but neither carries an
    # implicit lineage attribution for the child.
    parents_of_aldric = {
        t["subject"]
        for t in triples
        if t["relation"] == "PARENT_OF" and t["object"] == "Aldric Raventhorne"
    }
    assert parents_of_aldric == {"Theron Ashveil", "Yssa Raventhorne"}