- docker-compose: swap postgres image to pgvector/pgvector:pg16
- postgres/init.sql: CREATE EXTENSION vector; image_embedding table
- plugins/embeddings.py: embed_images + search_images_semantic
(sentence-transformers all-MiniLM-L6-v2, lazy-loaded, pgvector <=> cosine)
- plugins/images.py: register_image kicks off background embed worker
- seed.py: seed_embeddings writes 4 embeddings for the mock images
- README: semantic image search section + T3 note
- 11 tests across 4 files, all green:
test_embeddings_plugin.py (4): schema, ordering, idempotency, stub
test_embeddings_real_model.py (3): real MiniLM, acceptance queries
test_register_image_hook.py (2): manifest row, end-to-end hook
test_seed_embeddings.py (2): writes 4, idempotent
- Includes T3 consistency plugin skeleton (4 stub tools)
110 lines
4.2 KiB
Python
110 lines
4.2 KiB
Python
"""
|
|
Integration test: real sentence-transformers model against the live pgvector DB.
|
|
|
|
This is the "does it actually work" test — it loads all-MiniLM-L6-v2, encodes
|
|
the 4 mock-world image captions, and asserts that natural-language queries
|
|
rank the right image first.
|
|
|
|
Skipped automatically if sentence-transformers is not importable.
|
|
"""
|
|
import os
|
|
import sys
|
|
import math
|
|
import pytest
|
|
|
|
# ROOT is the parent of the directory that contains this test file;
# GATEWAY and PLUGINS are its "gateway" and "plugins" subdirectories.
ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
GATEWAY = os.path.join(ROOT, "gateway")
PLUGINS = os.path.join(ROOT, "plugins")

# Put both directories at the front of sys.path unless they are already there.
for p in (GATEWAY, PLUGINS):
    if p in sys.path:
        continue
    sys.path.insert(0, p)

# Connection URL for the pgvector database used by these tests; the
# TEST_PG_PGVECTOR_URL environment variable overrides the default.
PG_PGVECTOR_URL = os.environ.get(
    "TEST_PG_PGVECTOR_URL", "postgresql://lore:***@localhost:5433/lore"
)

# Skip this entire module if sentence-transformers is not installed.
sentence_transformers = pytest.importorskip("sentence_transformers")
|
|
|
|
# (image_id, caption) pairs for the four mock-world images used by this module.
CAPTIONS = [
    (
        "img_aldric_portrait",
        "Portrait of Aldric Raventhorne, Lord of Thornwall. Middle-aged, dark hair, a scar above the left eye.",
    ),
    (
        "img_vex_portrait",
        "Vex the Silent, a hooded thief from the alleys of Mardsville. Face mostly in shadow.",
    ),
    (
        "img_thornwall",
        "Thornwall Keep at dawn. The banners of House Vyr fly from the battlements.",
    ),
    (
        "img_battle",
        "The Battle of Black Spire, where Aldric defeated General Kael. House Vyr's banners hold the ridge.",
    ),
]
|
|
|
|
|
|
@pytest.fixture(scope="module")
def seeded_pg():
    """Yield a psycopg2 connection to the pgvector DB in a known state.

    Setup (once per module, because of ``scope="module"``):

    * creates the ``vector`` extension and the ``image_manifest`` and
      ``image_embedding`` tables if they do not exist yet;
    * upserts one ``image_manifest`` row per entry in ``CAPTIONS`` with
      object key ``k/<image_id>``, refreshing the caption on conflict;
    * deletes every row from ``image_embedding`` so the tests re-encode.

    The connection is closed on teardown and also when setup itself raises,
    so a failed CREATE/INSERT does not leave a connection open.
    """
    import psycopg2

    upsert_manifest_sql = (
        "INSERT INTO image_manifest (image_id, object_key, caption) VALUES (%s,%s,%s) ON CONFLICT (image_id) DO UPDATE SET caption = EXCLUDED.caption;"
    )

    conn = psycopg2.connect(PG_PGVECTOR_URL)
    try:
        with conn.cursor() as cur:
            cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")
            cur.execute("""
                CREATE TABLE IF NOT EXISTS image_manifest (
                    id BIGSERIAL PRIMARY KEY,
                    image_id TEXT NOT NULL UNIQUE,
                    object_key TEXT NOT NULL,
                    entity_id TEXT,
                    entity_type TEXT,
                    caption TEXT NOT NULL,
                    tags TEXT[],
                    era TEXT,
                    uploaded_at TIMESTAMPTZ NOT NULL DEFAULT now(),
                    width INT,
                    height INT,
                    bytes BIGINT
                );
            """)
            cur.execute("""
                CREATE TABLE IF NOT EXISTS image_embedding (
                    image_id TEXT PRIMARY KEY,
                    embedding vector(384) NOT NULL,
                    embedded_at TIMESTAMPTZ NOT NULL DEFAULT now()
                );
            """)
            for image_id, caption in CAPTIONS:
                cur.execute(
                    upsert_manifest_sql,
                    (image_id, f"k/{image_id}", caption),
                )
            # Wipe embeddings so the test re-encodes
            cur.execute("DELETE FROM image_embedding;")
        conn.commit()
        yield conn
    finally:
        # Runs after the last test of the module and on any setup failure.
        conn.close()
|
|
|
|
|
|
def test_real_model_ranks_aldric_first(seeded_pg):
    """Headline acceptance check: 'a noble lord with a scar' must rank Aldric first.

    Calls ``_do_embed_images`` and expects it to report exactly 4 embedded
    images, then runs a top-1 semantic search against the same database.
    """
    from plugins import embeddings

    embed_kwargs = {"limit": 100, "pg_url": PG_PGVECTOR_URL}
    n = embeddings._do_embed_images(**embed_kwargs)
    assert n == 4, f"expected to embed 4 images, got {n}"

    query = "a noble lord with a scar"
    r = embeddings._do_search_semantic(query, limit=1, pg_url=PG_PGVECTOR_URL)
    assert r["count"] >= 1
    top_hit = r["images"][0]
    assert top_hit["image_id"] == "img_aldric_portrait", r
|
|
|
|
|
|
def test_real_model_ranks_vex_first(seeded_pg):
    """The second acceptance criterion: 'a sneaky thief in a hood' → Vex.

    The ``seeded_pg`` fixture deletes every row from ``image_embedding``, so
    this test populates the embeddings itself rather than relying on another
    test having run first. That keeps it passing when selected on its own
    (e.g. ``pytest -k vex``).
    """
    from plugins import embeddings

    # NOTE(review): assumes _do_embed_images is safe to call when embeddings
    # already exist; its return value is deliberately not asserted here.
    # Confirm against plugins/embeddings.py.
    embeddings._do_embed_images(limit=100, pg_url=PG_PGVECTOR_URL)

    r = embeddings._do_search_semantic(
        "a sneaky thief in a hood", limit=1, pg_url=PG_PGVECTOR_URL
    )
    assert r["count"] >= 1
    assert r["images"][0]["image_id"] == "img_vex_portrait", r
|
|
|
|
|
|
def test_real_model_top4_against_all(seeded_pg):
    """With ``limit=2``, each acceptance query must still rank its image first.

    Only the first hit of each result is checked; the second hit is not
    asserted. The ``seeded_pg`` fixture deletes every row from
    ``image_embedding``, so the embeddings are populated here instead of
    relying on an earlier test.
    """
    from plugins import embeddings

    # NOTE(review): assumes _do_embed_images is safe to call when embeddings
    # already exist; its return value is deliberately not asserted here.
    # Confirm against plugins/embeddings.py.
    embeddings._do_embed_images(limit=100, pg_url=PG_PGVECTOR_URL)

    r1 = embeddings._do_search_semantic(
        "a noble lord with a scar", limit=2, pg_url=PG_PGVECTOR_URL
    )
    # Guard first so an empty result fails with the payload, not an IndexError.
    assert r1["count"] >= 1, r1
    assert r1["images"][0]["image_id"] == "img_aldric_portrait", r1

    r2 = embeddings._do_search_semantic(
        "a sneaky thief in a hood", limit=2, pg_url=PG_PGVECTOR_URL
    )
    assert r2["count"] >= 1, r2
    assert r2["images"][0]["image_id"] == "img_vex_portrait", r2
|