Files
lore-engine-poc/tests/test_embeddings_real_model.py
Hermes add264eb04 T2: pgvector image embeddings — plugin, schema, seed, hook, tests
- docker-compose: swap postgres image to pgvector/pgvector:pg16
- postgres/init.sql: CREATE EXTENSION vector; image_embedding table
- plugins/embeddings.py: embed_images + search_images_semantic
  (sentence-transformers all-MiniLM-L6-v2, lazy-loaded, pgvector <=> cosine)
- plugins/images.py: register_image kicks off background embed worker
- seed.py: seed_embeddings writes 4 embeddings for the mock images
- README: semantic image search section + T3 note
- 11 tests across 4 files, all green:
    test_embeddings_plugin.py (4): schema, ordering, idempotency, stub
    test_embeddings_real_model.py (3): real MiniLM, acceptance queries
    test_register_image_hook.py (2): manifest row, end-to-end hook
    test_seed_embeddings.py (2): writes 4, idempotent
- Includes T3 consistency plugin skeleton (4 stub tools)
2026-06-16 14:30:10 +00:00

110 lines
4.2 KiB
Python

"""
Integration test: real sentence-transformers model against the live pgvector DB.
This is the "does it actually work" test — it loads all-MiniLM-L6-v2, encodes
the 4 mock-world image captions, and asserts that natural-language queries
rank the right image first.
Skipped automatically if sentence-transformers is not importable.
"""
import os
import sys
import math
import pytest
# Project root is the directory one level above the one holding this file.
ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
GATEWAY, PLUGINS = (os.path.join(ROOT, name) for name in ("gateway", "plugins"))

# Put the gateway and plugin directories at the front of the import path,
# skipping any that are already listed.
for p in (GATEWAY, PLUGINS):
    if p in sys.path:
        continue
    sys.path.insert(0, p)

# Connection string for the pgvector-enabled Postgres used by these tests;
# the TEST_PG_PGVECTOR_URL environment variable overrides the default.
PG_PGVECTOR_URL = os.getenv(
    "TEST_PG_PGVECTOR_URL",
    "postgresql://lore:***@localhost:5433/lore",
)
# Skip this entire module (reported as skipped rather than failed) if
# sentence-transformers cannot be imported. This must run at import time,
# before any test is collected; on success the imported module is bound here.
sentence_transformers = pytest.importorskip("sentence_transformers")
# (image_id, caption) pairs for the four mock-world images that the
# seeded_pg fixture upserts into image_manifest.
CAPTIONS = [
    (
        "img_aldric_portrait",
        "Portrait of Aldric Raventhorne, Lord of Thornwall. "
        "Middle-aged, dark hair, a scar above the left eye.",
    ),
    (
        "img_vex_portrait",
        "Vex the Silent, a hooded thief from the alleys of Mardsville. "
        "Face mostly in shadow.",
    ),
    (
        "img_thornwall",
        "Thornwall Keep at dawn. "
        "The banners of House Vyr fly from the battlements.",
    ),
    (
        "img_battle",
        "The Battle of Black Spire, where Aldric defeated General Kael. "
        "House Vyr's banners hold the ridge.",
    ),
]
@pytest.fixture(scope="module")
def seeded_pg():
    """Bring the live pgvector DB to a known state with the 4 mock images.

    Connects to ``PG_PGVECTOR_URL``, creates the ``vector`` extension and the
    ``image_manifest`` / ``image_embedding`` tables if they are missing,
    upserts one manifest row per entry in ``CAPTIONS``, and deletes every row
    from ``image_embedding`` so the tests have to re-encode the captions.

    Yields:
        The open psycopg2 connection. It is closed on teardown and also when
        setup fails part-way through.
    """
    import psycopg2

    conn = psycopg2.connect(PG_PGVECTOR_URL)
    # try/finally rather than a bare close after the yield: a failure in any
    # setup statement would otherwise leave the connection open.
    try:
        with conn.cursor() as cur:
            cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")
            cur.execute("""
                CREATE TABLE IF NOT EXISTS image_manifest (
                    id BIGSERIAL PRIMARY KEY,
                    image_id TEXT NOT NULL UNIQUE,
                    object_key TEXT NOT NULL,
                    entity_id TEXT,
                    entity_type TEXT,
                    caption TEXT NOT NULL,
                    tags TEXT[],
                    era TEXT,
                    uploaded_at TIMESTAMPTZ NOT NULL DEFAULT now(),
                    width INT,
                    height INT,
                    bytes BIGINT
                );
            """)
            cur.execute("""
                CREATE TABLE IF NOT EXISTS image_embedding (
                    image_id TEXT PRIMARY KEY,
                    embedding vector(384) NOT NULL,
                    embedded_at TIMESTAMPTZ NOT NULL DEFAULT now()
                );
            """)
            # Upsert so a re-run refreshes the caption instead of failing on
            # the UNIQUE(image_id) constraint.
            # NOTE(review): manifest rows with other image_ids are left in
            # place; tests that assert an exact embed count presumably rely on
            # the table holding only these four rows — confirm.
            for image_id, caption in CAPTIONS:
                cur.execute(
                    "INSERT INTO image_manifest (image_id, object_key, caption) VALUES (%s,%s,%s) ON CONFLICT (image_id) DO UPDATE SET caption = EXCLUDED.caption;",
                    (image_id, f"k/{image_id}", caption),
                )
            # Wipe embeddings so the test re-encodes
            cur.execute("DELETE FROM image_embedding;")
        conn.commit()
        yield conn
    finally:
        conn.close()
def test_real_model_ranks_aldric_first(seeded_pg):
    """Headline acceptance check: 'a noble lord with a scar' → Aldric.

    Runs the plugin's embed step against the seeded database, requires that
    exactly four images were embedded, then checks that a top-1 semantic
    search for the query returns the Aldric portrait.
    """
    from plugins import embeddings

    query = "a noble lord with a scar"

    n = embeddings._do_embed_images(limit=100, pg_url=PG_PGVECTOR_URL)
    assert n == 4, f"expected to embed 4 images, got {n}"

    result = embeddings._do_search_semantic(query, limit=1, pg_url=PG_PGVECTOR_URL)
    assert result["count"] >= 1
    top_hit = result["images"][0]
    # The full result is attached as the failure message to aid debugging.
    assert top_hit["image_id"] == "img_aldric_portrait", result
def test_real_model_ranks_vex_first(seeded_pg):
    """The second acceptance criterion: 'a sneaky thief in a hood' → Vex.

    Runs the embed step itself before searching, so the test passes when run
    alone or in any order instead of relying on another test having filled
    image_embedding first.
    """
    from plugins import embeddings

    # The seeded_pg fixture empties image_embedding, so make sure embeddings
    # exist before querying. The returned count is not asserted because it
    # depends on whether another test in this module already embedded them.
    # NOTE(review): assumes _do_embed_images is safe to call repeatedly — confirm.
    embeddings._do_embed_images(limit=100, pg_url=PG_PGVECTOR_URL)

    r = embeddings._do_search_semantic("a sneaky thief in a hood", limit=1, pg_url=PG_PGVECTOR_URL)
    assert r["count"] >= 1
    assert r["images"][0]["image_id"] == "img_vex_portrait", r
def test_real_model_top4_against_all(seeded_pg):
    """Both acceptance queries still rank the expected image first at limit=2.

    Each query is run with ``limit=2`` and only the first hit is checked; the
    second-ranked image is not asserted.
    """
    from plugins import embeddings

    # The seeded_pg fixture empties image_embedding, so embed here rather than
    # depending on another test having run first. The count is not asserted
    # because it depends on whether embeddings were already written.
    # NOTE(review): assumes _do_embed_images is safe to call repeatedly — confirm.
    embeddings._do_embed_images(limit=100, pg_url=PG_PGVECTOR_URL)

    r1 = embeddings._do_search_semantic("a noble lord with a scar", limit=2, pg_url=PG_PGVECTOR_URL)
    # Guard the index below so an empty result fails with the payload shown
    # instead of an IndexError.
    assert r1["count"] >= 1, r1
    assert r1["images"][0]["image_id"] == "img_aldric_portrait", r1

    r2 = embeddings._do_search_semantic("a sneaky thief in a hood", limit=2, pg_url=PG_PGVECTOR_URL)
    assert r2["count"] >= 1, r2
    assert r2["images"][0]["image_id"] == "img_vex_portrait", r2