# lore-engine-poc-v3/lore_engine_poc/extraction.py

"""Lore Engine POC — LLM extraction (slice 3).

The extractor takes an :class:`Entity` and a
:class:`LLMProvider`, calls the LLM with the entity's prose
body, parses the response, and returns a list of
:class:`Triple` objects ready to merge into the graph.

The extractor is **failure-tolerant by design**. Chunk-level
failures — provider timeout, malformed JSON, missing source —
degrade to ``[]`` for that chunk; triple-level failures —
malformed triple, unknown relation — drop only the offending
triple. Either way the caller (the ingest script) gets a partial
result and the deterministic path continues to work.

Why so defensive?

* LLM output is genuinely noisy. We can't trust the
  provider to always return valid JSON, and we can't trust
  the LLM to always emit canonical relation names. Hard
  failure on bad input would lose us the whole chunk; soft
  failure (skip the bad triple) keeps the rest.
* The ingest script wants to add LLM triples *on top of*
  the deterministic graph. A bad LLM call shouldn't take
  the determinism down with it.
"""

from __future__ import annotations

import json
import logging
import sys
from typing import Any

from .llm import LLMProvider
from .parsers_legacy import Entity, LoreSource, Triple
from .prompts.extraction import CANONICAL_RELATIONS as _CANONICAL


# Public re-export so callers can introspect the set without
# reaching into the prompts subpackage. ``extract_from_chunk``
# uses this set as its whitelist: any triple whose relation is
# not a member is dropped (and reported on stderr).
CANONICAL_RELATIONS: frozenset[str] = _CANONICAL


# Module logger for chunk-level failures (provider exception,
# non-JSON response, response that is not a JSON array).
_log = logging.getLogger("lore_engine_poc.extraction")


def _emit_unknown_relation_warning(relation: str, source_path: str) -> None:
    """Report a dropped, non-canonical relation on stderr.

    The message goes straight to ``sys.stderr`` so the
    world-builder sees which relation names the LLM is
    inventing and can decide whether to add them to
    ``CANONICAL_RELATIONS``. Per slice 3 plan: drop + log to
    stderr.

    Args:
        relation: The relation name that was rejected.
        source_path: Path of the source the triple came from.
    """
    message = (
        f"[extraction] dropped unknown relation {relation!r} "
        f"from {source_path}"
    )
    print(message, file=sys.stderr)


def _coerce_str(value: Any) -> str:
    """Turn one element of an LLM-emitted triple into a trimmed string.

    ``None`` becomes the empty string; anything else is passed
    through ``str()`` and has surrounding whitespace removed.
    """
    return "" if value is None else str(value).strip()


def extract_from_chunk(
    entity: Entity,
    provider: LLMProvider,
    *,
    extraction_confidence: float = 0.5,
) -> list[Triple]:
    """Run the LLM extractor on ``entity.body`` and return typed triples.

    The provider's reply must be a JSON array whose items are
    three-element arrays ``[subject, relation, object]``. Items
    that do not fit are skipped individually; the rest are kept.

    Args:
        entity: The entity whose prose body is sent to the LLM.
            Its first source supplies the provenance fields
            copied onto every triple.
        provider: LLM backend; only ``provider.chat(messages=...)``
            is called.
        extraction_confidence: Value stored on every returned
            triple's ``extraction_confidence`` field.

    Returns:
        The accepted triples, in response order. ``[]`` when the
        body is empty, the entity has no source, the provider
        raises, the reply is not a JSON array, or every item is
        rejected. The caller is responsible for merging these
        into the wider graph build.
    """
    body = entity.body or ""
    if not body.strip():
        return []

    # Provenance comes from the entity's first source. Check for it
    # *before* talking to the provider: without a source every
    # triple would be thrown away, so the LLM call would be wasted.
    source: LoreSource | None = entity.sources[0] if entity.sources else None
    if source is None:
        return []

    # Function-scope import kept from the original layout. Note the
    # module is already loaded by the top-level import of
    # CANONICAL_RELATIONS, so this is only a name lookup.
    from .prompts.extraction import EXTRACTION_PROMPT

    prompt = EXTRACTION_PROMPT.format(body=body)
    messages = [{"role": "user", "content": prompt}]

    # Broad catch is deliberate: any provider failure must degrade
    # to "no LLM triples" rather than abort the ingest.
    try:
        raw = provider.chat(messages=messages)
    except Exception as exc:
        _log.warning(
            "provider raised on %s: %s", entity.slug, exc
        )
        return []

    # TypeError covers a provider that returns a non-string
    # (e.g. None) instead of text.
    try:
        parsed = json.loads(raw)
    except (json.JSONDecodeError, TypeError):
        _log.warning(
            "non-JSON response from provider for %s", entity.slug
        )
        return []

    if not isinstance(parsed, list):
        _log.warning(
            "expected JSON array, got %s for %s",
            type(parsed).__name__, entity.slug,
        )
        return []

    out: list[Triple] = []
    for item in parsed:
        if not isinstance(item, list) or len(item) != 3:
            continue
        if any(isinstance(part, (list, dict)) for part in item):
            # Nested array/object inside a triple: str() would put
            # a Python repr such as "['A', 'B']" into the graph as
            # a name. Treat it as a malformed triple instead.
            continue
        s, r, o = (_coerce_str(x) for x in item)
        if not s or not r or not o:
            continue
        if s == o:
            # self-loop — almost always wrong
            continue
        if r not in CANONICAL_RELATIONS:
            _emit_unknown_relation_warning(r, source.path)
            continue
        out.append(Triple(
            subject=s,
            relation=r,
            object=o,
            source_path=source.path,
            source_slug=entity.slug,
            extraction_confidence=extraction_confidence,
            source_confidence=source.source_confidence,
            reliability=source.reliability,
        ))
    return out


# Explicit public API: the relation whitelist and the extractor
# entry point. The underscore-prefixed helpers stay private.
__all__ = ["CANONICAL_RELATIONS", "extract_from_chunk"]