# File-viewer header (not part of the module): Files · 140 lines · 4.2 KiB · Python

"""Lore Engine POC — LLM extraction (slice 3).

The extractor takes an :class:`Entity` and a
:class:`LLMProvider`, calls the LLM with the entity's prose
body, parses the response, and returns a list of
:class:`Triple` objects ready to merge into the graph.

The extractor is **failure-tolerant by design**. Chunk-level
failures — provider timeout, malformed JSON, missing source —
degrade to ``[]`` for that chunk. Item-level failures —
malformed triple, unknown relation — drop only the offending
triple. Either way the caller (the ingest script) gets a
partial result and the deterministic path continues to work.

Why so defensive?

* LLM output is genuinely noisy. We can't trust the
  provider to always return valid JSON, and we can't trust
  the LLM to always emit canonical relation names. Hard
  failure on bad input would lose us the whole chunk; soft
  failure (skip the bad triple) keeps the rest.
* The ingest script wants to add LLM triples *on top of*
  the deterministic graph. A bad LLM call shouldn't take
  the deterministic graph down with it.
"""
from __future__ import annotations
import json
import logging
import sys
from typing import Any
from .llm import LLMProvider
from .parsers_legacy import Entity, LoreSource, Triple
from .prompts.extraction import CANONICAL_RELATIONS as _CANONICAL
# Public re-export so callers can introspect the set without
# reaching into the prompts subpackage. ``extract_from_chunk``
# drops any triple whose relation is not a member of this set.
CANONICAL_RELATIONS: frozenset[str] = _CANONICAL
# Module logger used for chunk-level failures. The name is a fixed
# string rather than ``__name__``.
_log = logging.getLogger("lore_engine_poc.extraction")
def _emit_unknown_relation_warning(relation: str, source_path: str) -> None:
"""Surface dropped relations on stderr so the world-builder
can grow ``CANONICAL_RELATIONS`` as the LLM discovers new
useful ones. Per slice 3 plan: drop + log to stderr."""
print(
f"[extraction] dropped unknown relation {relation!r} "
f"from {source_path}",
file=sys.stderr,
)
def _coerce_str(value: Any) -> str:
"""LLM output may have ints, None, etc. — coerce to a clean string."""
if value is None:
return ""
return str(value).strip()
def extract_from_chunk(
    entity: Entity,
    provider: LLMProvider,
    *,
    extraction_confidence: float = 0.5,
) -> list[Triple]:
    """Run the LLM extractor on ``entity.body`` and return typed triples.

    Bad provider output never raises. Chunk-level failures (blank body,
    no source, provider exception, unparseable or non-array JSON) yield
    ``[]``. Item-level failures (wrong shape, nested value, empty field,
    self-loop, non-canonical relation) drop only that item. The caller
    is responsible for merging the result into the wider graph build.

    Args:
        entity: Entity whose ``body`` is sent to the LLM. Its ``slug``
            and the first entry of ``sources`` supply the provenance
            fields of every returned triple.
        provider: Object whose ``chat(messages=...)`` returns the raw
            response, expected to be a JSON array of
            ``[subject, relation, object]`` arrays.
        extraction_confidence: Value stored in each triple's
            ``extraction_confidence`` field.

    Returns:
        The accepted triples, in response order; possibly empty.
    """
    body = entity.body or ""
    if not body.strip():
        return []

    # Provenance comes from the entity's first source. Check it before
    # calling the provider: without a source every triple would be
    # discarded, so the LLM request would be wasted.
    source: LoreSource | None = entity.sources[0] if entity.sources else None
    if source is None:
        _log.warning(
            "no source on %s; skipping LLM extraction", entity.slug
        )
        return []

    # Lazy import — keeps the import graph small for tests
    # that only want the FakeProvider.
    from .prompts.extraction import EXTRACTION_PROMPT

    prompt = EXTRACTION_PROMPT.format(body=body)
    messages = [{"role": "user", "content": prompt}]

    # Deliberately broad: any provider failure (timeout, transport,
    # auth, ...) must degrade to "no LLM triples", not abort ingest.
    try:
        raw = provider.chat(messages=messages)
    except Exception as exc:
        _log.warning(
            "provider raised on %s: %s", entity.slug, exc
        )
        return []

    # ValueError covers JSONDecodeError and the UnicodeDecodeError that
    # json.loads raises for undecodable bytes; RecursionError covers
    # pathologically nested input.
    try:
        parsed = json.loads(raw)
    except (ValueError, TypeError, RecursionError):
        _log.warning(
            "non-JSON response from provider for %s", entity.slug
        )
        return []

    if not isinstance(parsed, list):
        _log.warning(
            "expected JSON array, got %s for %s",
            type(parsed).__name__, entity.slug,
        )
        return []

    accepted: list[Triple] = []
    for item in parsed:
        if not isinstance(item, list) or len(item) != 3:
            continue
        # A nested array/object would otherwise be stringified into a
        # node name such as "['a', 'b']"; treat it as a malformed triple.
        if any(isinstance(part, (list, dict)) for part in item):
            continue
        s, r, o = (_coerce_str(part) for part in item)
        if not s or not r or not o:
            continue
        if s == o:
            # self-loop — almost always wrong
            continue
        if r not in CANONICAL_RELATIONS:
            _emit_unknown_relation_warning(r, source.path)
            continue
        accepted.append(Triple(
            subject=s,
            relation=r,
            object=o,
            source_path=source.path,
            source_slug=entity.slug,
            extraction_confidence=extraction_confidence,
            source_confidence=source.source_confidence,
            reliability=source.reliability,
        ))
    return accepted
# Public API: the canonical relation vocabulary and the extractor entry point.
__all__ = ["CANONICAL_RELATIONS", "extract_from_chunk"]