140 lines
4.2 KiB
Python
140 lines
4.2 KiB
Python
"""Lore Engine POC — LLM extraction (slice 3).

The extractor takes an :class:`Entity` and a
:class:`LLMProvider`, calls the LLM with the entity's prose
body, parses the response, and returns a list of
:class:`Triple` objects ready to merge into the graph.

The extractor is **failure-tolerant by design**. Chunk-level
failures — provider timeout (or any other provider exception),
malformed JSON, missing source — degrade to ``[]`` for that
chunk. Triple-level failures — malformed triple, unknown
relation — drop only the offending triple and keep the rest.
Either way the caller (the ingest script) gets a partial
result and the deterministic path continues to work.

Why so defensive?

* LLM output is genuinely noisy. We can't trust the
  provider to always return valid JSON, and we can't trust
  the LLM to always emit canonical relation names. Hard
  failure on bad input would lose us the whole chunk; soft
  failure (skip the bad triple) keeps the rest.
* The ingest script wants to add LLM triples *on top of*
  the deterministic graph. A bad LLM call shouldn't take
  the determinism down with it.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import sys
|
|
from typing import Any
|
|
|
|
from .llm import LLMProvider
|
|
from .parsers_legacy import Entity, LoreSource, Triple
|
|
from .prompts.extraction import CANONICAL_RELATIONS as _CANONICAL
|
|
|
|
|
|
# Public re-export so callers can introspect the set of accepted
# relation names without reaching into the prompts subpackage.
# This binds the very same object that ``.prompts.extraction``
# exposes as ``CANONICAL_RELATIONS``; it is not a copy.
CANONICAL_RELATIONS: frozenset[str] = _CANONICAL


# Module-level logger used by ``extract_from_chunk`` for provider
# failures and malformed-response warnings.
_log = logging.getLogger("lore_engine_poc.extraction")
|
|
|
|
|
|
def _emit_unknown_relation_warning(relation: str, source_path: str) -> None:
    """Report a dropped, non-canonical relation on stderr.

    The message names the rejected relation and the source path it
    came from, so the world-builder can decide whether
    ``CANONICAL_RELATIONS`` should grow to include it.
    Per slice 3 plan: drop + log to stderr.
    """
    notice = (
        f"[extraction] dropped unknown relation {relation!r} "
        f"from {source_path}"
    )
    print(notice, file=sys.stderr)
|
|
|
|
|
|
def _coerce_str(value: Any) -> str:
    """Turn an arbitrary LLM-emitted value into a trimmed string.

    ``None`` maps to the empty string; anything else (ints, floats,
    strings, ...) goes through ``str()`` and has its surrounding
    whitespace stripped.
    """
    return "" if value is None else str(value).strip()
|
|
|
|
|
|
def extract_from_chunk(
    entity: Entity,
    provider: LLMProvider,
    *,
    extraction_confidence: float = 0.5,
) -> list[Triple]:
    """Run the LLM extractor on ``entity.body`` and return typed triples.

    The entity's body is formatted into ``EXTRACTION_PROMPT``, sent to
    ``provider.chat`` as a single user message, and the reply is parsed
    as a JSON array of ``[subject, relation, object]`` arrays.

    Returns ``[]`` on any chunk-level failure: empty body, entity
    without sources, provider exception, undecodable or non-JSON
    response, or a response that is not a JSON array. Within a valid
    array, individual items are skipped (not fatal) when they are not
    3-element arrays, contain nested arrays/objects, have an empty
    term after coercion, are self-loops, or use a relation outside
    ``CANONICAL_RELATIONS`` (the latter is also reported on stderr).

    Every emitted triple is attributed to ``entity.sources[0]`` and
    carries ``extraction_confidence`` unchanged. The caller is
    responsible for merging these into the wider graph build.
    """
    body = entity.body or ""
    if not body.strip():
        return []

    # Every triple is attributed to the entity's first source. Check for
    # it *before* calling the provider: without a source nothing can be
    # emitted, so the LLM round-trip would be paid for and thrown away.
    source: LoreSource | None = entity.sources[0] if entity.sources else None
    if source is None:
        return []

    # Function-scope import kept from the original. Note that this
    # module already imports ``.prompts.extraction`` at load time (for
    # the canonical relation set), so this only binds the name locally.
    from .prompts.extraction import EXTRACTION_PROMPT

    prompt = EXTRACTION_PROMPT.format(body=body)
    messages = [{"role": "user", "content": prompt}]

    # Deliberately broad: any provider failure degrades to "no triples"
    # rather than aborting the caller's ingest run.
    try:
        raw = provider.chat(messages=messages)
    except Exception as exc:
        _log.warning(
            "provider raised on %s: %s", entity.slug, exc
        )
        return []

    # ``ValueError`` covers ``json.JSONDecodeError`` and also the
    # ``UnicodeDecodeError`` raised for undecodable bytes payloads;
    # ``TypeError`` covers a non-str/bytes reply such as ``None``.
    try:
        parsed = json.loads(raw)
    except (ValueError, TypeError):
        _log.warning(
            "non-JSON response from provider for %s", entity.slug
        )
        return []

    if not isinstance(parsed, list):
        _log.warning(
            "expected JSON array, got %s for %s",
            type(parsed).__name__, entity.slug,
        )
        return []

    out: list[Triple] = []
    for item in parsed:
        if not isinstance(item, list) or len(item) != 3:
            continue
        # A nested array/object is not a valid triple term; coercing it
        # would store its Python repr as a node or relation name.
        if any(isinstance(term, (list, dict)) for term in item):
            continue
        s, r, o = (_coerce_str(x) for x in item)
        if not s or not r or not o:
            continue
        if s == o:
            # self-loop — almost always wrong
            continue
        if r not in CANONICAL_RELATIONS:
            _emit_unknown_relation_warning(r, source.path)
            continue
        out.append(Triple(
            subject=s,
            relation=r,
            object=o,
            source_path=source.path,
            source_slug=entity.slug,
            extraction_confidence=extraction_confidence,
            source_confidence=source.source_confidence,
            reliability=source.reliability,
        ))
    return out
|
|
|
|
|
|
# Explicit public API: the re-exported relation set and the extractor
# entry point. The underscore-prefixed helpers are not exported.
__all__ = ["CANONICAL_RELATIONS", "extract_from_chunk"]