- docker-compose: swap postgres image to pgvector/pgvector:pg16
- postgres/init.sql: CREATE EXTENSION vector; image_embedding table
- plugins/embeddings.py: embed_images + search_images_semantic
(sentence-transformers all-MiniLM-L6-v2, lazy-loaded, pgvector <=> cosine)
- plugins/images.py: register_image kicks off background embed worker
- seed.py: seed_embeddings writes 4 embeddings for the mock images
- README: semantic image search section + T3 note
- 11 tests across 4 files, all green:
test_embeddings_plugin.py (4): schema, ordering, idempotency, stub
test_embeddings_real_model.py (3): real MiniLM, acceptance queries
test_register_image_hook.py (2): manifest row, end-to-end hook
test_seed_embeddings.py (2): writes 4, idempotent
- Includes T3 consistency plugin skeleton (4 stub tools)
263 lines
10 KiB
Python
263 lines
10 KiB
Python
"""
|
|
Tests for plugins/embeddings.py — the pgvector-backed image semantic search plugin.
|
|
|
|
Three test tiers:
|
|
- Unit tests of the SQL/cosine logic with hand-crafted embeddings.
|
|
- Integration test that exercises the full pipeline against a live pgvector DB
|
|
(the running `lore-postgres-pgvector` container, or whatever PG_URL points at).
|
|
- Semantic test that uses a stub embedder to prove the top-k ordering is correct
|
|
for the mock-world's 4 images (Aldric, Vex, Thornwall, Battle).
|
|
|
|
A real sentence-transformers model is NOT required for these tests — the
|
|
embedder is a small monkey-patchable seam.
|
|
"""
|
|
import os
|
|
import sys
|
|
import math
|
|
import pytest
|
|
|
|
# The plugin under test does `from server import ...`, so the gateway package
# directory (a sibling of this tests directory) must be importable.
GATEWAY_DIR = os.path.join(
    os.path.dirname(__file__),
    "..",
    "gateway",
)
# Index 0 means this directory is searched before every other sys.path entry.
sys.path.insert(0, GATEWAY_DIR)
|
|
|
|
# Plugin files load from a directory path; the server module points REGISTRY
|
|
# at a module-level singleton, which we reuse by registering the plugin in
|
|
# an isolated registry. We import the plugin module manually with a sys.path
|
|
# that includes `plugins/`.
|
|
|
|
|
|
# ─── Helpers ────────────────────────────────────────────────────────────────
|
|
|
|
def make_vec(dims=384, seed=0):
    """Return a deterministic, L2-normalised sparse vector of length `dims`.

    Only two components are non-zero before normalisation: index
    `seed % dims` is set to 1.0 and the following index (wrapping around at
    `dims`) is set to 0.5. The result is then divided by its Euclidean norm.

    When `dims` is 1 both writes land on index 0, so the result is `[1.0]`.
    """
    hot = seed % dims
    neighbour = (seed + 1) % dims

    components = [0.0] * dims
    components[hot] = 1.0
    # Written second on purpose: when hot == neighbour this value wins.
    components[neighbour] = 0.5

    length = math.sqrt(sum(c * c for c in components))
    if not length:
        # Guard against dividing by zero for an all-zero vector.
        length = 1.0
    return [c / length for c in components]
|
|
|
|
|
|
def shift_vec(base, dims=384, jitter_dims=10, scale=0.9):
    """Return a re-normalised copy of `base` with its leading components scaled.

    The first `jitter_dims` components are multiplied by `scale`, then the
    whole vector is divided by its Euclidean norm. `base` is not modified.

    Caveat: the direction only changes when the scaled prefix contains some,
    but not all, of the non-zero components of `base`. Scaling zeros, or
    scaling every non-zero component by the same factor, yields the same
    direction again after normalisation.

    Args:
        base: sequence of floats to perturb.
        dims: unused; kept so existing callers that pass it keep working.
        jitter_dims: how many leading components to scale. Values larger than
            `len(base)` are clamped instead of raising IndexError.
        scale: multiplier applied to the leading components.

    Returns:
        A new list of floats with unit Euclidean norm (or the scaled values
        unchanged when the norm is zero).
    """
    v = list(base)
    # Clamp so a short vector does not trigger an out-of-range index.
    for i in range(min(jitter_dims, len(v))):
        v[i] = base[i] * scale
    norm = math.sqrt(sum(x * x for x in v)) or 1.0
    return [x / norm for x in v]
|
|
|
|
|
|
# ─── Unit tests: SQL/cosine logic via real pgvector ─────────────────────────
|
|
# These run against the live `lore-postgres-pgvector` container (port 5433).
|
|
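# Set TEST_PG_PGVECTOR_URL to target a different database; when it is unset the
# default URL below is used. Nothing in this file skips these tests
# automatically, so CI must deselect them when no pgvector database is reachable.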
|
|
|
|
# Connection URL for the pgvector test database. The TEST_PG_PGVECTOR_URL
# environment variable takes precedence; otherwise the local default is used.
# NOTE(review): the password in the default appears as `***`, which looks like
# a redaction — confirm the intended default credentials.
PG_PGVECTOR_URL = os.getenv("TEST_PG_PGVECTOR_URL", "postgresql://lore:***@localhost:5433/lore")
|
|
|
|
|
|
@pytest.fixture(scope="module")
def pg_conn():
    """Module-scoped psycopg2 connection to the database at PG_PGVECTOR_URL.

    Setup creates the `vector` extension and the `image_manifest` and
    `image_embedding` tables if they do not exist yet, then commits.

    Teardown deletes every row from both tables and closes the connection.
    WARNING: the DELETEs are unconditional, so pointing PG_PGVECTOR_URL at a
    database that holds data you care about will wipe those two tables.

    Yields:
        The open psycopg2 connection, shared by all tests in this module.
    """
    import psycopg2

    conn = psycopg2.connect(PG_PGVECTOR_URL)
    try:
        # Ensure schema
        with conn.cursor() as cur:
            cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")
            cur.execute("""
                CREATE TABLE IF NOT EXISTS image_manifest (
                    id BIGSERIAL PRIMARY KEY,
                    image_id TEXT NOT NULL UNIQUE,
                    object_key TEXT NOT NULL,
                    entity_id TEXT,
                    entity_type TEXT,
                    caption TEXT NOT NULL,
                    tags TEXT[],
                    era TEXT,
                    uploaded_at TIMESTAMPTZ NOT NULL DEFAULT now(),
                    width INT,
                    height INT,
                    bytes BIGINT
                );
            """)
            cur.execute("""
                CREATE TABLE IF NOT EXISTS image_embedding (
                    image_id TEXT PRIMARY KEY,
                    embedding vector(384) NOT NULL,
                    embedded_at TIMESTAMPTZ NOT NULL DEFAULT now()
                );
            """)
        conn.commit()

        yield conn

        # Cleanup. A test that failed mid-statement can leave the connection
        # in an aborted transaction; roll back first so the DELETEs can run.
        conn.rollback()
        with conn.cursor() as cur:
            cur.execute("DELETE FROM image_embedding;")
            cur.execute("DELETE FROM image_manifest;")
        conn.commit()
    finally:
        # Always release the connection, even if setup or cleanup raised.
        conn.close()
|
|
|
|
|
|
@pytest.fixture
def clean_tables(pg_conn):
    """Empty `image_embedding` and `image_manifest` before each test.

    The connection is module-scoped, so a previous test may have left it in
    an open or aborted transaction. Rolling back first keeps one failing test
    from making every later DELETE fail as well.
    """
    pg_conn.rollback()
    with pg_conn.cursor() as cur:
        cur.execute("DELETE FROM image_embedding;")
        cur.execute("DELETE FROM image_manifest;")
    pg_conn.commit()
    yield
|
|
|
|
|
|
def test_image_embedding_table_accepts_vector(pg_conn, clean_tables):
    """RED→GREEN: the table stores 384-dim vectors and they round-trip.

    Inserts one manifest row and one embedding, reads the embedding back, and
    checks both its textual shape and its component values.
    """
    v = make_vec(seed=1)
    with pg_conn.cursor() as cur:
        cur.execute("""
            INSERT INTO image_manifest
                (image_id, object_key, caption)
            VALUES ('t1', 'k1', 'cap1')
            ON CONFLICT (image_id) DO NOTHING;
        """)
        cur.execute(
            "INSERT INTO image_embedding (image_id, embedding) VALUES (%s, %s::vector);",
            ("t1", v),
        )
    pg_conn.commit()

    with pg_conn.cursor() as cur:
        cur.execute("SELECT embedding FROM image_embedding WHERE image_id = 't1';")
        (raw,) = cur.fetchone()
    # End the read transaction before asserting so a failure here does not
    # leave a transaction open on the shared connection.
    pg_conn.commit()

    # pgvector returns a string like '[0.1,0.2,...]'
    assert raw.startswith("[") and raw.endswith("]"), raw[:50]

    # Check the values too, not just the brackets. pgvector stores
    # single-precision floats, hence the tolerance.
    stored = [float(x) for x in raw[1:-1].split(",")]
    assert len(stored) == len(v), f"expected {len(v)} dims, got {len(stored)}"
    assert stored == pytest.approx(v, abs=1e-6)
|
|
|
|
|
|
def test_cosine_distance_orders_by_similarity(pg_conn, clean_tables):
    """The top-k query orders by `<=>` (cosine distance), not L2 or L1.

    Four embeddings are built with strictly decreasing cosine similarity to
    the query vector (1.0, 0.8, 0.5, 0.1). The more similar a vector is, the
    larger its norm, so ranking by Euclidean distance would give the reverse
    order; only a scale-invariant cosine ranking yields the expected one.
    """
    from plugins.embeddings import _search_by_vector

    # Unit-norm query; make_vec(seed=42) puts all its mass on dims 42 and 43,
    # which leaves dim 0 free to act as an orthogonal direction.
    base = make_vec(seed=42)

    def at_cosine(similarity, magnitude):
        """Vector of norm `magnitude` whose cosine similarity to `base` is `similarity`."""
        v = [magnitude * similarity * x for x in base]
        v[0] = magnitude * math.sqrt(1.0 - similarity * similarity)
        return v

    expected_order = ["aldric", "vex", "thornwall", "battle"]
    vectors = {
        "aldric": at_cosine(1.0, 8.0),     # same direction as the query
        "vex": at_cosine(0.8, 4.0),        # further
        "thornwall": at_cosine(0.5, 2.0),  # much further
        "battle": at_cosine(0.1, 1.0),     # almost orthogonal
    }

    with pg_conn.cursor() as cur:
        # Insert in reverse rank order so that physical row order cannot make
        # the assertions pass by accident.
        for img_id in reversed(expected_order):
            cur.execute(
                "INSERT INTO image_manifest (image_id, object_key, caption) VALUES (%s,%s,%s) ON CONFLICT DO NOTHING;",
                (img_id, f"k/{img_id}", f"caption for {img_id}"),
            )
            cur.execute(
                "INSERT INTO image_embedding (image_id, embedding) VALUES (%s, %s::vector);",
                (img_id, vectors[img_id]),
            )
    pg_conn.commit()

    out = _search_by_vector(base, limit=4, pg_url=PG_PGVECTOR_URL)
    ids = [r["image_id"] for r in out]
    assert ids[0] == "aldric", f"expected aldric first, got {ids}"
    # Aldric should beat vex, vex should beat thornwall, etc.
    assert ids == expected_order, f"expected {expected_order}, got {ids}"
|
|
|
|
|
|
# ─── Unit tests: embed_images dedupes by image_id ───────────────────────────
|
|
|
|
def test_embed_images_only_embeds_missing(pg_conn, clean_tables, monkeypatch):
    """embed_images should only compute embeddings for rows that don't have one yet.

    Three manifest rows are inserted and only 'a' is given an embedding up
    front. The model is replaced with a fake that records the texts it is
    asked to encode, so the test can check exactly which captions were sent.
    """
    from plugins import embeddings

    with pg_conn.cursor() as cur:
        for img_id in ["a", "b", "c"]:
            cur.execute(
                "INSERT INTO image_manifest (image_id, object_key, caption) VALUES (%s,%s,%s) ON CONFLICT DO NOTHING;",
                (img_id, f"k/{img_id}", f"cap {img_id}"),
            )
        # 'a' already has an embedding
        cur.execute(
            "INSERT INTO image_embedding (image_id, embedding) VALUES (%s, %s::vector);",
            ("a", make_vec(seed=99)),
        )
    pg_conn.commit()

    called_with = []

    def fake_encode(texts, **kwargs):
        called_with.extend(texts)
        # Return a vector per text. The seed comes from the characters rather
        # than hash(), which is salted per process and would make runs differ.
        return [make_vec(seed=sum(map(ord, t))) for t in texts]

    # Patch the lazy model loader
    monkeypatch.setattr(embeddings, "_get_model", lambda: type("M", (), {"encode": staticmethod(fake_encode)})())

    count = embeddings._do_embed_images(limit=10, pg_url=PG_PGVECTOR_URL)
    assert count == 2, f"expected 2 new embeddings, got {count}"
    # 'a' should NOT have been re-embedded. called_with holds captions, so the
    # check is against the caption text, not the image id.
    assert "cap a" not in called_with
    assert set(called_with) == {"cap b", "cap c"}

    # Subsequent call should be a no-op
    count2 = embeddings._do_embed_images(limit=10, pg_url=PG_PGVECTOR_URL)
    assert count2 == 0
|
|
|
|
|
|
# ─── Semantic test: stub embedder, mock-world 4 images ─────────────────────
|
|
|
|
def test_semantic_search_with_stub_embedder(pg_conn, clean_tables, monkeypatch):
    """Check that `_do_search_semantic` picks the right top-1 image when the
    model is a keyword-driven stub.

    Each of the 4 mock images is stored as a one-hot 384-dim embedding on its
    own axis. The stub model maps a query onto the axes whose keywords occur
    in it, so the nearest stored embedding is the image on that axis.
    """
    from plugins import embeddings

    width = 384
    # One axis per mock image.
    axis_of_image = {
        "aldric": 0,     # noble lord, scar
        "vex": 1,        # sneaky thief, hood
        "thornwall": 2,  # keep, dawn
        "battle": 3,     # battle, banners
    }

    def one_hot(axis):
        vec = [0.0] * width
        vec[axis] = 1.0
        return vec

    with pg_conn.cursor() as cur:
        for image_id, axis in axis_of_image.items():
            cur.execute(
                "INSERT INTO image_manifest (image_id, object_key, caption) VALUES (%s,%s,%s) ON CONFLICT DO NOTHING;",
                (image_id, f"k/{image_id}", image_id),
            )
            cur.execute(
                "INSERT INTO image_embedding (image_id, embedding) VALUES (%s, %s::vector);",
                (image_id, one_hot(axis)),
            )
    pg_conn.commit()

    # Keywords that light up each axis; matching is by case-insensitive
    # substring, so the stub is fully deterministic.
    axis_keywords = (
        (0, ("noble", "lord", "scar")),
        (1, ("sneaky", "thief", "hood")),
        (2, ("keep", "dawn")),
        (3, ("battle", "banners")),
    )

    class StubModel:
        """Stand-in for the real model: exposes only `encode`."""

        @staticmethod
        def encode(texts, **kwargs):
            vectors = []
            for text in texts:
                lowered = text.lower()
                vec = [0.0] * width
                for axis, words in axis_keywords:
                    if any(word in lowered for word in words):
                        vec[axis] = 1.0
                if not any(vec):
                    vec[0] = 1.0  # default when no keyword matches
                vectors.append(vec)
            return vectors

    monkeypatch.setattr(embeddings, "_get_model", lambda: StubModel())

    lord_hits = embeddings._do_search_semantic("a noble lord with a scar", limit=1, pg_url=PG_PGVECTOR_URL)
    assert lord_hits["images"][0]["image_id"] == "aldric", lord_hits

    thief_hits = embeddings._do_search_semantic("a sneaky thief in a hood", limit=1, pg_url=PG_PGVECTOR_URL)
    assert thief_hits["images"][0]["image_id"] == "vex", thief_hits