- docker-compose: swap postgres image to pgvector/pgvector:pg16
- postgres/init.sql: CREATE EXTENSION vector; image_embedding table
- plugins/embeddings.py: embed_images + search_images_semantic
(sentence-transformers all-MiniLM-L6-v2, lazy-loaded, pgvector <=> cosine)
- plugins/images.py: register_image kicks off background embed worker
- seed.py: seed_embeddings writes 4 embeddings for the mock images
- README: semantic image search section + T3 note
- 11 tests across 4 files, all green:
test_embeddings_plugin.py (4): schema, ordering, idempotency, stub
test_embeddings_real_model.py (3): real MiniLM, acceptance queries
test_register_image_hook.py (2): manifest row, end-to-end hook
test_seed_embeddings.py (2): writes 4, idempotent
- Includes T3 consistency plugin skeleton (4 stub tools)
104 lines
3.9 KiB
Python
104 lines
3.9 KiB
Python
"""
|
|
Tests for seed.py's embedding step. Verifies the seed function is idempotent
|
|
and writes the expected 4 embeddings against a live pgvector DB.
|
|
"""
|
|
import os
|
|
import sys
|
|
import pytest
|
|
import psycopg2
|
|
|
|
# Repository root: the parent of the directory that holds this test file.
ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))

# Put the gateway and plugin source directories at the front of the import
# path, leaving sys.path untouched for any directory that is already listed.
for _subdir in ("gateway", "plugins"):
    _candidate = os.path.join(ROOT, _subdir)
    if _candidate in sys.path:
        continue
    sys.path.insert(0, _candidate)

# seed.py is a plain script at the repository root, not part of a package,
# so the root itself has to be importable for `import seed` to work.
sys.path.insert(0, ROOT)
|
|
|
|
# Skip every test in this module when sentence-transformers is not installed.
pytest.importorskip("sentence_transformers")
|
|
|
|
# Connection string for the pgvector-enabled test database. Set the
# TEST_PG_PGVECTOR_URL environment variable to point at a different instance.
PG_PGVECTOR_URL = os.getenv("TEST_PG_PGVECTOR_URL", "postgresql://lore:***@localhost:5433/lore")
|
|
|
|
# (image_id, caption) pairs for the four mock images; the seed_pg fixture
# upserts one image_manifest row per entry.
CAPTIONS = [
    (
        "img_aldric_portrait",
        "Portrait of Aldric Raventhorne, Lord of Thornwall. "
        "Middle-aged, dark hair, a scar above the left eye.",
    ),
    (
        "img_vex_portrait",
        "Vex the Silent, a hooded thief from the alleys of Mardsville. "
        "Face mostly in shadow.",
    ),
    (
        "img_thornwall",
        "Thornwall Keep at dawn. "
        "The banners of House Vyr fly from the battlements.",
    ),
    (
        "img_battle",
        "The Battle of Black Spire, where Aldric defeated General Kael. "
        "House Vyr's banners hold the ridge.",
    ),
]
|
|
|
|
|
|
@pytest.fixture(scope="module")
def seed_pg():
    """Yield a psycopg2 connection to a database prepared for the seed tests.

    Setup ensures the ``vector`` extension and the ``image_manifest`` and
    ``image_embedding`` tables exist, upserts one manifest row per entry in
    ``CAPTIONS``, and commits. The same connection is shared by every test in
    this module and is closed at teardown.

    Yields:
        An open psycopg2 connection to ``PG_PGVECTOR_URL``.
    """
    conn = psycopg2.connect(PG_PGVECTOR_URL)
    # try/finally so the connection is released even when schema setup or
    # the manifest upsert raises before the fixture ever reaches ``yield``.
    try:
        with conn.cursor() as cur:
            cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")
            cur.execute("""
                CREATE TABLE IF NOT EXISTS image_manifest (
                    id BIGSERIAL PRIMARY KEY,
                    image_id TEXT NOT NULL UNIQUE,
                    object_key TEXT NOT NULL,
                    entity_id TEXT, entity_type TEXT,
                    caption TEXT NOT NULL, tags TEXT[],
                    era TEXT, uploaded_at TIMESTAMPTZ NOT NULL DEFAULT now(),
                    width INT, height INT, bytes BIGINT
                );
            """)
            cur.execute("""
                CREATE TABLE IF NOT EXISTS image_embedding (
                    image_id TEXT PRIMARY KEY,
                    embedding vector(384) NOT NULL,
                    embedded_at TIMESTAMPTZ NOT NULL DEFAULT now()
                );
            """)
            # Upsert so a manifest row left by an earlier run picks up the
            # current caption instead of violating the UNIQUE constraint.
            for image_id, caption in CAPTIONS:
                cur.execute(
                    "INSERT INTO image_manifest (image_id, object_key, caption) VALUES (%s,%s,%s) ON CONFLICT (image_id) DO UPDATE SET caption = EXCLUDED.caption;",
                    (image_id, f"k/{image_id}", caption),
                )
        conn.commit()
        yield conn
    finally:
        conn.close()
|
|
|
|
|
|
def test_seed_embeddings_writes_four(seed_pg):
    """Seeding an emptied embedding table embeds all four mock images."""
    from seed import seed_embeddings

    # Empty the table first so the full write path is what gets exercised.
    with seed_pg.cursor() as wipe_cur:
        wipe_cur.execute("DELETE FROM image_embedding;")
    seed_pg.commit()

    seed_embeddings(seed_pg)

    expected = ['img_aldric_portrait', 'img_battle', 'img_thornwall', 'img_vex_portrait']
    with seed_pg.cursor() as cur:
        # Restrict the check to the four mock images: other tests may have
        # left additional manifest rows behind.
        cur.execute("""
            SELECT image_id FROM image_embedding
            WHERE image_id IN ('img_aldric_portrait','img_vex_portrait','img_thornwall','img_battle')
            ORDER BY image_id
        """)
        found = [image_id for (image_id,) in cur.fetchall()]
    assert found == expected, found
|
|
|
|
|
|
def test_seed_embeddings_is_idempotent(seed_pg):
    """Re-running seed_embeddings leaves exactly one row per mock image.

    seed_embeddings is called twice inside this test so the re-run path is
    exercised even when the test runs on its own or before the other seed
    test; a single call would only test a re-run if an earlier test had
    already seeded the table.
    """
    from seed import seed_embeddings

    seed_embeddings(seed_pg)
    # Second run over rows that now already have embeddings.
    seed_embeddings(seed_pg)

    with seed_pg.cursor() as cur:
        # The 4 mock images should each have exactly one embedding row.
        cur.execute("""
            SELECT image_id, count(*) FROM image_embedding
            WHERE image_id IN ('img_aldric_portrait','img_vex_portrait','img_thornwall','img_battle')
            GROUP BY image_id
        """)
        counts = dict(cur.fetchall())
    assert len(counts) == 4, counts
    assert all(c == 1 for c in counts.values()), counts
|