lore-engine-poc/tests/test_embeddings_plugin.py

"""
Tests for plugins/embeddings.py — the pgvector-backed image semantic search plugin.

Three test tiers:
- Unit tests of the SQL/cosine logic with hand-crafted embeddings.
- Integration test that exercises the full pipeline against a live pgvector DB
  (the running `lore-postgres-pgvector` container, or whatever PG_URL points at).
- Semantic test that uses a stub embedder to prove the top-k ordering is correct
  for the mock-world's 4 images (Aldric, Vex, Thornwall, Battle).

A real sentence-transformers model is NOT required for these tests — the
embedder is a small monkey-patchable seam.
"""
import os
import sys
import math
import pytest

# Make the gateway package importable so the plugin can `from server import ...`.
# GATEWAY_DIR is `<directory of this file>/../gateway`; it is inserted at the
# front of sys.path so it is searched before any other entry.
GATEWAY_DIR = os.path.join(os.path.dirname(__file__), "..", "gateway")
sys.path.insert(0, GATEWAY_DIR)

# Plugin files load from a directory path; the server module points REGISTRY
# at a module-level singleton, which we reuse by registering the plugin in
# an isolated registry. We import the plugin module manually with a sys.path
# that includes `plugins/`.


# ─── Helpers ────────────────────────────────────────────────────────────────

def make_vec(dims=384, seed=0):
    """Return a deterministic unit-length vector with two non-zero entries.

    Before normalisation, index ``seed % dims`` holds 1.0 and index
    ``(seed + 1) % dims`` holds 0.5; every other component is 0.0. Two
    vectors whose seeds are at least two apart (modulo ``dims``) therefore
    share no non-zero index and are orthogonal.
    """
    primary = seed % dims
    secondary = (seed + 1) % dims
    raw = [0.0] * dims
    raw[primary] = 1.0
    # Assigned second on purpose: when dims == 1 both indices coincide and
    # the 0.5 overwrites the 1.0 before normalisation.
    raw[secondary] = 0.5
    length = math.sqrt(sum(c * c for c in raw))
    if not length:
        length = 1.0
    return [c / length for c in raw]


def shift_vec(base, dims=384, jitter_dims=10, scale=0.9):
    """Return a re-normalised copy of ``base`` with its leading entries scaled.

    The first ``jitter_dims`` components are multiplied by ``scale`` and the
    result is divided by its Euclidean norm (an all-zero result is returned
    as-is). ``base`` itself is not modified. ``dims`` is accepted but not
    used. Raises ``IndexError`` when ``jitter_dims`` exceeds ``len(base)``.

    Caveat: the direction only changes if the scaled prefix covers some, but
    not all, of the non-zero entries of ``base``. Scaling only zeros, or
    scaling every non-zero entry by the same factor, yields a vector that
    points the same way as ``base`` after normalisation.
    """
    shifted = list(base)
    for idx in range(jitter_dims):
        shifted[idx] *= scale
    length = math.sqrt(sum(c * c for c in shifted))
    if not length:
        length = 1.0
    return [c / length for c in shifted]


# ─── Unit tests: SQL/cosine logic via real pgvector ─────────────────────────
# These run against a live pgvector database; the default URL below targets
# the `lore-postgres-pgvector` container on localhost port 5433.
# Set the TEST_PG_PGVECTOR_URL environment variable to point them elsewhere.
# NOTE(review): no skip logic is visible in this file — if the database is
# unreachable the `pg_conn` fixture errors rather than skips; any CI skip
# must be configured outside this module.

PG_PGVECTOR_URL = os.environ.get(
    "TEST_PG_PGVECTOR_URL",
    "postgresql://lore:***@localhost:5433/lore",
)


@pytest.fixture(scope="module")
def pg_conn():
    """Yield one psycopg2 connection to ``PG_PGVECTOR_URL`` for the module.

    Setup enables the ``vector`` extension and creates the ``image_manifest``
    and ``image_embedding`` tables if they do not exist yet. Teardown deletes
    every row from both tables and closes the connection.

    WARNING: the cleanup is an unconditional ``DELETE`` on both tables, so
    only point the URL at a database whose contents are disposable.
    """
    import psycopg2
    conn = psycopg2.connect(PG_PGVECTOR_URL)
    # try/finally guarantees the connection is closed even when schema setup
    # or the teardown cleanup raises.
    try:
        # Ensure schema
        with conn.cursor() as cur:
            cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")
            cur.execute("""
                CREATE TABLE IF NOT EXISTS image_manifest (
                    id           BIGSERIAL PRIMARY KEY,
                    image_id     TEXT NOT NULL UNIQUE,
                    object_key   TEXT NOT NULL,
                    entity_id    TEXT,
                    entity_type  TEXT,
                    caption      TEXT NOT NULL,
                    tags         TEXT[],
                    era          TEXT,
                    uploaded_at  TIMESTAMPTZ NOT NULL DEFAULT now(),
                    width        INT,
                    height       INT,
                    bytes        BIGINT
                );
            """)
            cur.execute("""
                CREATE TABLE IF NOT EXISTS image_embedding (
                    image_id     TEXT PRIMARY KEY,
                    embedding    vector(384) NOT NULL,
                    embedded_at  TIMESTAMPTZ NOT NULL DEFAULT now()
                );
            """)
        conn.commit()
        yield conn
        # Cleanup. A test that failed mid-statement leaves the transaction
        # aborted, and every further command would be rejected; roll back
        # first so the DELETEs can run.
        conn.rollback()
        with conn.cursor() as cur:
            cur.execute("DELETE FROM image_embedding;")
            cur.execute("DELETE FROM image_manifest;")
        conn.commit()
    finally:
        conn.close()


@pytest.fixture
def clean_tables(pg_conn):
    """Empty ``image_embedding`` and ``image_manifest`` before each test.

    The connection is shared across the whole module, so a previous test
    that raised in the middle of a transaction may have left it in an
    aborted state. Rolling back first keeps one failing test from turning
    every later test into a setup error.
    """
    pg_conn.rollback()
    with pg_conn.cursor() as cur:
        cur.execute("DELETE FROM image_embedding;")
        cur.execute("DELETE FROM image_manifest;")
    pg_conn.commit()
    yield


def test_image_embedding_table_accepts_vector(pg_conn, clean_tables):
    """RED→GREEN: the table stores 384-dim vectors and they round-trip.

    Inserts one manifest row plus its embedding, reads the embedding back
    and checks both the textual shape and the actual component values.
    """
    v = make_vec(seed=1)
    with pg_conn.cursor() as cur:
        cur.execute("""
            INSERT INTO image_manifest
                (image_id, object_key, caption)
            VALUES ('t1', 'k1', 'cap1')
            ON CONFLICT (image_id) DO NOTHING;
        """)
        cur.execute(
            "INSERT INTO image_embedding (image_id, embedding) VALUES (%s, %s::vector);",
            ("t1", v),
        )
    pg_conn.commit()
    with pg_conn.cursor() as cur:
        cur.execute("SELECT embedding FROM image_embedding WHERE image_id = 't1';")
        (raw,) = cur.fetchone()
    # End the read transaction before asserting so a failed assertion does
    # not leave the shared connection idle inside a transaction.
    pg_conn.commit()
    # pgvector returns a string like '[0.1,0.2,...]'
    assert raw.startswith("[") and raw.endswith("]"), raw[:50]
    stored = [float(part) for part in raw[1:-1].split(",")]
    assert len(stored) == len(v), f"expected {len(v)} dims, got {len(stored)}"
    # pgvector stores single-precision floats, so compare with a tolerance.
    assert all(
        math.isclose(got, want, abs_tol=1e-6) for got, want in zip(stored, v)
    ), raw[:50]


def test_cosine_distance_orders_by_similarity(pg_conn, clean_tables):
    """The top-k query orders by `<=>` (cosine distance), not L2 or L1.

    Each stored vector is a mix of the query direction and a direction
    orthogonal to it, so its cosine similarity to the query is
    ``1 / sqrt(1 + weight**2)`` and strictly falls as ``weight`` grows.

    Two details keep the test from passing by accident:
    - the best match (aldric) has length 3 rather than 1, so it is the
      *farthest* row under L2/L1 and only ranks first under cosine distance;
    - rows are inserted in an order unrelated to the expected ranking, so
      physical row order cannot produce the expected result.
    """
    from plugins.embeddings import _search_by_vector

    base = make_vec(seed=42)
    # make_vec only populates indices seed and seed + 1, so seed 200 shares
    # no non-zero index with seed 42 and is orthogonal to `base`.
    off_axis = make_vec(seed=200)

    def blend(weight, length=1.0):
        """Vector of the given length pointing along base + weight * off_axis."""
        mixed = [b + weight * o for b, o in zip(base, off_axis)]
        norm = math.sqrt(sum(x * x for x in mixed))
        return [length * x / norm for x in mixed]

    expected = ["aldric", "vex", "thornwall", "battle"]
    vectors = {
        "aldric": blend(0.0, length=3.0),   # same direction, cos = 1.00
        "vex": blend(0.5),                  # cos ~ 0.89
        "thornwall": blend(2.0),            # cos ~ 0.45
        "battle": blend(10.0),              # cos ~ 0.10, almost orthogonal
    }
    with pg_conn.cursor() as cur:
        for img_id in ["battle", "vex", "aldric", "thornwall"]:
            cur.execute(
                "INSERT INTO image_manifest (image_id, object_key, caption) VALUES (%s,%s,%s) ON CONFLICT DO NOTHING;",
                (img_id, f"k/{img_id}", f"caption for {img_id}"),
            )
            cur.execute(
                "INSERT INTO image_embedding (image_id, embedding) VALUES (%s, %s::vector);",
                (img_id, vectors[img_id]),
            )
    pg_conn.commit()
    out = _search_by_vector(base, limit=4, pg_url=PG_PGVECTOR_URL)
    ids = [r["image_id"] for r in out]
    assert ids[0] == "aldric", f"expected aldric first, got {ids}"
    # Aldric should beat vex, vex should beat thornwall, etc.
    assert ids == expected, f"expected {expected}, got {ids}"


# ─── Unit tests: embed_images dedupes by image_id ───────────────────────────

def test_embed_images_only_embeds_missing(pg_conn, clean_tables, monkeypatch):
    """embed_images should only compute embeddings for rows that don't have one yet.

    Seeds three manifest rows, one of which ('a') already has an embedding,
    swaps the model for a recording fake and checks that only the two
    missing captions reach the encoder, and that a second run encodes
    nothing.
    """
    from plugins import embeddings
    with pg_conn.cursor() as cur:
        for img_id in ["a", "b", "c"]:
            cur.execute(
                "INSERT INTO image_manifest (image_id, object_key, caption) VALUES (%s,%s,%s) ON CONFLICT DO NOTHING;",
                (img_id, f"k/{img_id}", f"cap {img_id}"),
            )
        # 'a' already has an embedding
        cur.execute(
            "INSERT INTO image_embedding (image_id, embedding) VALUES (%s, %s::vector);",
            ("a", make_vec(seed=99)),
        )
    pg_conn.commit()

    called_with = []

    def fake_encode(texts, **kwargs):
        called_with.extend(texts)
        # Return a vector per text. The seed is a byte sum rather than
        # hash(): str hashes are salted per process, which would make the
        # stored vectors differ from run to run.
        return [make_vec(seed=sum(t.encode("utf-8")) % 384) for t in texts]

    # Patch the lazy model loader
    monkeypatch.setattr(embeddings, "_get_model", lambda: type("M", (), {"encode": staticmethod(fake_encode)})())

    count = embeddings._do_embed_images(limit=10, pg_url=PG_PGVECTOR_URL)
    assert count == 2, f"expected 2 new embeddings, got {count}"
    # 'a' should NOT have been re-embedded. The encoder receives captions,
    # not image ids, so the caption is what must be absent.
    assert "cap a" not in called_with
    assert set(called_with) == {"cap b", "cap c"}

    # Subsequent call should be a no-op: nothing counted, nothing encoded.
    encoded_before = list(called_with)
    count2 = embeddings._do_embed_images(limit=10, pg_url=PG_PGVECTOR_URL)
    assert count2 == 0
    assert called_with == encoded_before


# ─── Semantic test: stub embedder, mock-world 4 images ─────────────────────

def test_semantic_search_with_stub_embedder(pg_conn, clean_tables, monkeypatch):
    """With a stub embedder, `search_images_semantic` returns the right top-1
    for two distinct queries against the 4 mock images.

    Every image is stored as a one-hot 384-dim vector on its own axis. The
    stub encoder maps a query onto the axes of whichever keywords it
    contains, so each query below lands exactly on one image's axis.
    """
    from plugins import embeddings

    width = 384
    # image id → the axis its stored embedding points along
    image_axis = {
        "aldric": 0,     # noble lord, scar
        "vex": 1,        # sneaky thief, hood
        "thornwall": 2,  # keep, dawn
        "battle": 3,     # battle, banners
    }
    # query keyword → axis of the image it describes
    keyword_axis = {
        "noble": 0, "lord": 0, "scar": 0,
        "sneaky": 1, "thief": 1, "hood": 1,
        "keep": 2, "dawn": 2,
        "battle": 3, "banners": 3,
    }

    def one_hot(axis):
        return [1.0 if pos == axis else 0.0 for pos in range(width)]

    with pg_conn.cursor() as cur:
        for image_id, axis in image_axis.items():
            cur.execute(
                "INSERT INTO image_manifest (image_id, object_key, caption) VALUES (%s,%s,%s) ON CONFLICT DO NOTHING;",
                (image_id, f"k/{image_id}", image_id),
            )
            cur.execute(
                "INSERT INTO image_embedding (image_id, embedding) VALUES (%s, %s::vector);",
                (image_id, one_hot(axis)),
            )
    pg_conn.commit()

    def stub_encode(texts, **kwargs):
        """Deterministic encoder: set the axis of every keyword found in the text."""
        vectors = []
        for text in texts:
            lowered = text.lower()
            hits = {axis for word, axis in keyword_axis.items() if word in lowered}
            vec = [0.0] * width
            for axis in hits:
                vec[axis] = 1.0
            if not hits:
                vec[0] = 1.0  # default
            vectors.append(vec)
        return vectors

    class StubModel:
        encode = staticmethod(stub_encode)

    monkeypatch.setattr(embeddings, "_get_model", lambda: StubModel())

    noble = embeddings._do_search_semantic("a noble lord with a scar", limit=1, pg_url=PG_PGVECTOR_URL)
    assert noble["images"][0]["image_id"] == "aldric", noble

    thief = embeddings._do_search_semantic("a sneaky thief in a hood", limit=1, pg_url=PG_PGVECTOR_URL)
    assert thief["images"][0]["image_id"] == "vex", thief