Files
lore-engine-poc/tests/test_embeddings_plugin.py
Hermes add264eb04 T2: pgvector image embeddings — plugin, schema, seed, hook, tests
- docker-compose: swap postgres image to pgvector/pgvector:pg16
- postgres/init.sql: CREATE EXTENSION vector; image_embedding table
- plugins/embeddings.py: embed_images + search_images_semantic
  (sentence-transformers all-MiniLM-L6-v2, lazy-loaded, pgvector <=> cosine)
- plugins/images.py: register_image kicks off background embed worker
- seed.py: seed_embeddings writes 4 embeddings for the mock images
- README: semantic image search section + T3 note
- 11 tests across 4 files, all green:
    test_embeddings_plugin.py (4): schema, ordering, idempotency, stub
    test_embeddings_real_model.py (3): real MiniLM, acceptance queries
    test_register_image_hook.py (2): manifest row, end-to-end hook
    test_seed_embeddings.py (2): writes 4, idempotent
- Includes T3 consistency plugin skeleton (4 stub tools)
2026-06-16 14:30:10 +00:00

263 lines
10 KiB
Python

"""
Tests for plugins/embeddings.py — the pgvector-backed image semantic search plugin.
Three test tiers:
- Unit tests of the SQL/cosine logic with hand-crafted embeddings.
- Integration test that exercises the full pipeline against a live pgvector DB
(the running `lore-postgres-pgvector` container, or whatever PG_URL points at).
- Semantic test that uses a stub embedder to prove the top-k ordering is correct
for the mock-world's 4 images (Aldric, Vex, Thornwall, Battle).
A real sentence-transformers model is NOT required for these tests — the
embedder is a small monkey-patchable seam.
"""
import os
import sys
import math
import pytest
# Put the gateway package directory at the front of sys.path so that the
# plugin's `from server import ...` resolves when imported from this test.
_TESTS_DIR = os.path.dirname(__file__)
GATEWAY_DIR = os.path.join(_TESTS_DIR, "..", "gateway")
sys.path.insert(0, GATEWAY_DIR)
# Plugin files load from a directory path; the server module points REGISTRY
# at a module-level singleton, which we reuse by registering the plugin in
# an isolated registry. We import the plugin module manually with a sys.path
# that includes `plugins/`.
# ─── Helpers ────────────────────────────────────────────────────────────────
def make_vec(dims=384, seed=0):
    """Return a deterministic unit-length vector with two non-zero components.

    Component ``seed % dims`` is set to 1.0 and component ``(seed + 1) % dims``
    to 0.5 (indices wrap around); every other component is 0.0. The vector is
    then divided by its L2 norm. The same ``(dims, seed)`` always yields the
    same vector.

    Args:
        dims: Number of components; must be at least 1.
        seed: Selects which pair of adjacent axes is populated.

    Returns:
        A list of ``dims`` floats with unit L2 norm.

    Raises:
        ValueError: If ``dims`` is less than 1.
    """
    if dims < 1:
        raise ValueError(f"dims must be >= 1, got {dims}")
    v = [0.0] * dims
    v[seed % dims] = 1.0
    # With dims == 1 both indices coincide, so this overwrites the 1.0 above;
    # normalisation below still yields [1.0].
    v[(seed + 1) % dims] = 0.5
    norm = math.sqrt(sum(x * x for x in v)) or 1.0
    return [x / norm for x in v]
def shift_vec(base, dims=384, jitter_dims=10, scale=0.9):
    """Return a re-normalised copy of ``base`` with its leading components scaled.

    The first ``jitter_dims`` components are multiplied by ``scale`` and the
    result is divided by its L2 norm (left unscaled if that norm is zero).
    ``base`` itself is not modified.

    Caveat: the direction only changes when the scaled prefix contains some,
    but not all, of the non-zero components of ``base``. Scaling only zeros,
    or scaling every non-zero component by the same factor, is undone by the
    re-normalisation and returns (numerically) the same direction as ``base``.

    Args:
        base: Sequence of floats to perturb.
        dims: Unused; kept so existing call sites keep working.
        jitter_dims: How many leading components to scale. Values larger than
            ``len(base)`` are clamped instead of raising ``IndexError``.
        scale: Multiplier applied to each of the leading components.

    Returns:
        A new list of floats with the same length as ``base``.
    """
    v = list(base)
    # Clamp so an oversized jitter_dims cannot index past the end of the vector.
    for i in range(min(jitter_dims, len(v))):
        v[i] *= scale
    norm = math.sqrt(sum(x * x for x in v)) or 1.0
    return [x / norm for x in v]
# ─── Unit tests: SQL/cosine logic via real pgvector ─────────────────────────
# These run against the live `lore-postgres-pgvector` container (port 5433).
# CI can be configured to skip them if TEST_PG_PGVECTOR_URL is unset; when it
# is unset the code itself falls back to the default URL defined just below.
# Connection URL for the pgvector test database. The TEST_PG_PGVECTOR_URL
# environment variable overrides the local default.
_DEFAULT_PG_PGVECTOR_URL = "postgresql://lore:***@localhost:5433/lore"
PG_PGVECTOR_URL = os.environ.get("TEST_PG_PGVECTOR_URL", _DEFAULT_PG_PGVECTOR_URL)
@pytest.fixture(scope="module")
def pg_conn():
    """Module-scoped psycopg2 connection to the pgvector test database.

    Setup connects to ``PG_PGVECTOR_URL`` and makes sure the ``vector``
    extension and the ``image_manifest`` / ``image_embedding`` tables exist.
    Teardown deletes every row from both tables and closes the connection.

    NOTE: teardown is destructive — it empties both tables in whatever
    database ``PG_PGVECTOR_URL`` points at.

    Yields:
        An open psycopg2 connection shared by all tests in this module.
    """
    import psycopg2

    conn = psycopg2.connect(PG_PGVECTOR_URL)
    try:
        # Ensure schema
        with conn.cursor() as cur:
            cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")
            cur.execute("""
                CREATE TABLE IF NOT EXISTS image_manifest (
                    id BIGSERIAL PRIMARY KEY,
                    image_id TEXT NOT NULL UNIQUE,
                    object_key TEXT NOT NULL,
                    entity_id TEXT,
                    entity_type TEXT,
                    caption TEXT NOT NULL,
                    tags TEXT[],
                    era TEXT,
                    uploaded_at TIMESTAMPTZ NOT NULL DEFAULT now(),
                    width INT,
                    height INT,
                    bytes BIGINT
                );
            """)
            cur.execute("""
                CREATE TABLE IF NOT EXISTS image_embedding (
                    image_id TEXT PRIMARY KEY,
                    embedding vector(384) NOT NULL,
                    embedded_at TIMESTAMPTZ NOT NULL DEFAULT now()
                );
            """)
        conn.commit()
    except Exception:
        # Do not leak the connection when schema setup fails.
        conn.close()
        raise

    yield conn

    # Cleanup
    try:
        # A test may have left the shared connection inside an open or aborted
        # transaction; roll it back so the DELETEs below are not rejected.
        conn.rollback()
        with conn.cursor() as cur:
            cur.execute("DELETE FROM image_embedding;")
            cur.execute("DELETE FROM image_manifest;")
        conn.commit()
    finally:
        # Close even if the cleanup statements themselves fail.
        conn.close()
@pytest.fixture
def clean_tables(pg_conn):
    """Empty ``image_embedding`` and ``image_manifest`` before a test runs.

    Args:
        pg_conn: The shared module-scoped database connection.

    Yields:
        Nothing; the fixture exists only for its setup side effect.
    """
    # The connection is shared across tests. If an earlier test failed on a
    # SQL error, the transaction is aborted and every later statement would be
    # rejected until it is rolled back — reset it so one failure cannot cascade.
    pg_conn.rollback()
    with pg_conn.cursor() as cur:
        cur.execute("DELETE FROM image_embedding;")
        cur.execute("DELETE FROM image_manifest;")
    pg_conn.commit()
    yield
def test_image_embedding_table_accepts_vector(pg_conn, clean_tables):
    """RED→GREEN: the table stores 384-dim vectors and they round-trip.

    Inserts one manifest row and one embedding, reads the embedding back and
    checks that the stored components match what was written.
    """
    v = make_vec(seed=1)
    with pg_conn.cursor() as cur:
        cur.execute("""
            INSERT INTO image_manifest
                (image_id, object_key, caption)
            VALUES ('t1', 'k1', 'cap1')
            ON CONFLICT (image_id) DO NOTHING;
        """)
        cur.execute(
            "INSERT INTO image_embedding (image_id, embedding) VALUES (%s, %s::vector);",
            ("t1", v),
        )
    pg_conn.commit()

    with pg_conn.cursor() as cur:
        cur.execute("SELECT embedding FROM image_embedding WHERE image_id = 't1';")
        (raw,) = cur.fetchone()
    # End the read transaction so the shared connection is left idle.
    pg_conn.commit()

    # pgvector returns a string like '[0.1,0.2,...]'
    assert raw.startswith("[") and raw.endswith("]"), raw[:50]
    # Check the actual contents, not just the brackets. The tolerance allows
    # for pgvector storing components in single precision.
    stored = [float(part) for part in raw[1:-1].split(",")]
    assert len(stored) == len(v) == 384
    assert stored == pytest.approx(v, abs=1e-6)
def test_cosine_distance_orders_by_similarity(pg_conn, clean_tables):
    """The top-k query orders by `<=>` (cosine distance), not L2 or L1.

    Four embeddings are stored at strictly increasing angles from the query
    vector, and the search must return them in that order. The closest one
    (aldric) is stored with a larger magnitude than the others, so it has
    cosine distance ~0 to the query but the largest Euclidean distance; a
    Euclidean ordering would therefore rank it last instead of first.
    """
    from plugins.embeddings import _search_by_vector

    def tilt(vec, weight, axis=0):
        """Unit vector of ``vec`` plus ``weight`` along ``axis``.

        For a unit ``vec`` with ``vec[axis] == 0`` the cosine similarity to
        ``vec`` is ``1 / sqrt(1 + weight**2)``.
        """
        out = list(vec)
        out[axis] += weight
        norm = math.sqrt(sum(x * x for x in out)) or 1.0
        return [x / norm for x in out]

    # make_vec(seed=42) is non-zero only at indices 42 and 43, so axis 0 is
    # orthogonal to it and tilting along axis 0 really changes the angle.
    base = make_vec(seed=42)
    embeddings_by_id = [
        ("aldric", [3.0 * x for x in base]),  # same direction, cos ~ 1.00
        ("vex", tilt(base, 0.5)),             # cos ~ 0.89
        ("thornwall", tilt(base, 1.5)),       # cos ~ 0.55
        ("battle", tilt(base, 10.0)),         # cos ~ 0.10, almost orthogonal
    ]

    with pg_conn.cursor() as cur:
        for img_id, vec in embeddings_by_id:
            cur.execute(
                "INSERT INTO image_manifest (image_id, object_key, caption) VALUES (%s,%s,%s) ON CONFLICT DO NOTHING;",
                (img_id, f"k/{img_id}", f"caption for {img_id}"),
            )
            cur.execute(
                "INSERT INTO image_embedding (image_id, embedding) VALUES (%s, %s::vector);",
                (img_id, vec),
            )
    pg_conn.commit()

    out = _search_by_vector(base, limit=4, pg_url=PG_PGVECTOR_URL)
    ids = [r["image_id"] for r in out]
    assert ids[0] == "aldric", f"expected aldric first, got {ids}"
    # Aldric should beat vex, vex should beat thornwall, etc.
    assert ids == ["aldric", "vex", "thornwall", "battle"], ids
# ─── Unit tests: embed_images dedupes by image_id ───────────────────────────
def test_embed_images_only_embeds_missing(pg_conn, clean_tables, monkeypatch):
    """embed_images should only compute embeddings for rows that don't have one yet.

    Three manifest rows are inserted, one of which ('a') already has an
    embedding. The stubbed encoder records every text it is asked to encode,
    so the test can check that only the two missing captions were encoded and
    that a second run finds nothing left to do.
    """
    from plugins import embeddings

    with pg_conn.cursor() as cur:
        for img_id in ["a", "b", "c"]:
            cur.execute(
                "INSERT INTO image_manifest (image_id, object_key, caption) VALUES (%s,%s,%s) ON CONFLICT DO NOTHING;",
                (img_id, f"k/{img_id}", f"cap {img_id}"),
            )
        # 'a' already has an embedding
        cur.execute(
            "INSERT INTO image_embedding (image_id, embedding) VALUES (%s, %s::vector);",
            ("a", make_vec(seed=99)),
        )
    pg_conn.commit()

    called_with = []

    def fake_encode(texts, **kwargs):
        called_with.extend(texts)
        # Return a vector per text. The seed is derived from the text bytes
        # rather than hash(), which is salted per process and would make the
        # stored vectors differ from run to run.
        return [make_vec(seed=sum(t.encode("utf-8")) % 384) for t in texts]

    class _StubModel:
        """Stand-in for the model: exposes only ``encode``."""

        encode = staticmethod(fake_encode)

    # Patch the lazy model loader
    monkeypatch.setattr(embeddings, "_get_model", lambda: _StubModel())

    count = embeddings._do_embed_images(limit=10, pg_url=PG_PGVECTOR_URL)
    assert count == 2, f"expected 2 new embeddings, got {count}"
    # 'a' should NOT have been re-embedded. The encoder receives captions, so
    # the check has to look for a's caption, not its id.
    assert "cap a" not in called_with
    assert set(called_with) == {"cap b", "cap c"}

    # Subsequent call should be a no-op
    count2 = embeddings._do_embed_images(limit=10, pg_url=PG_PGVECTOR_URL)
    assert count2 == 0
# ─── Semantic test: stub embedder, mock-world 4 images ─────────────────────
def test_semantic_search_with_stub_embedder(pg_conn, clean_tables, monkeypatch):
    """Check the top-1 semantic hit for two queries using a keyword stub model.

    Each of the 4 mock images is stored with a one-hot embedding on its own
    axis. The stubbed encoder turns a query into a vector with 1.0 on every
    axis whose keywords appear in the text, so each query below should land
    on exactly one image.
    """
    from plugins import embeddings

    total_dims = 384
    # image_id -> leading components of its stored embedding (rest are zero).
    axis_prefix = {
        "aldric": [1, 0, 0, 0],     # noble lord, scar
        "vex": [0, 1, 0, 0],        # sneaky thief, hood
        "thornwall": [0, 0, 1, 0],  # keep, dawn
        "battle": [0, 0, 0, 1],     # battle, banners
    }
    # Keyword -> axis lookup used by the stub encoder.
    keyword_axis = {
        "noble": 0, "lord": 0, "scar": 0,
        "sneaky": 1, "thief": 1, "hood": 1,
        "keep": 2, "dawn": 2,
        "battle": 3, "banners": 3,
    }

    def pad(prefix):
        """Extend ``prefix`` with zeros up to the full embedding width."""
        return [float(x) for x in prefix] + [0.0] * (total_dims - len(prefix))

    with pg_conn.cursor() as cur:
        for image_id, prefix in axis_prefix.items():
            manifest_row = (image_id, f"k/{image_id}", image_id)
            cur.execute(
                "INSERT INTO image_manifest (image_id, object_key, caption) VALUES (%s,%s,%s) ON CONFLICT DO NOTHING;",
                manifest_row,
            )
            cur.execute(
                "INSERT INTO image_embedding (image_id, embedding) VALUES (%s, %s::vector);",
                (image_id, pad(prefix)),
            )
    pg_conn.commit()

    def stub_encode(texts, **kwargs):
        """Deterministic encoder: one vector per text, built from keyword hits."""
        vectors = []
        for text in texts:
            lowered = text.lower()
            hit_axes = {axis for word, axis in keyword_axis.items() if word in lowered}
            if not hit_axes:
                hit_axes = {0}  # default
            vectors.append([1.0 if i in hit_axes else 0.0 for i in range(total_dims)])
        return vectors

    class _StubModel:
        """Stand-in for the model: exposes only ``encode``."""

        encode = staticmethod(stub_encode)

    # Replace the lazy model loader so no real model is loaded.
    monkeypatch.setattr(embeddings, "_get_model", lambda: _StubModel())

    expected_top = [
        ("a noble lord with a scar", "aldric"),
        ("a sneaky thief in a hood", "vex"),
    ]
    for query, image_id in expected_top:
        result = embeddings._do_search_semantic(query, limit=1, pg_url=PG_PGVECTOR_URL)
        assert result["images"][0]["image_id"] == image_id, result