Files
lore-engine-poc/workers/entity-extractor/main.go
Hermes adbb6f0cce feat(substrate): Phase 1 merge — Redis + 8 Go workers + nsc plugin
Ports the GraphMCP-Example substrate into lore-engine-poc:

- 8 Go workers under workers/ (discord-connector, discord-filter, lore-watcher, ingestion-worker, entity-extractor, lore-extractor, encounter-processor, mcp-server), each with Dockerfile + go.mod

- 3 Go unit-test files (encounter-processor, ingestion-worker, lore-extractor) — other 5 workers rely on integration tests via the live stack

- plugins/nsc.py: thin httpx proxy from gateway to lore-mcp-server:9000, exposes all 11 inherited GraphMCP tools (input schemas verbatim from mcp-server/main.go)

- docker-compose.yml: adds lore-redis + lore-mcp-server + the 7 worker services (lore- prefix to avoid clash with other GraphMCP stacks)

- verify-merge.sh (171 LOC, 7 pass conditions) + docs/VERIFICATION.md

- tests/contract/test_graphmcp_tool_contracts.py (15 tests; skipped when stack is down — TDD pattern, becomes active once docker compose up brings the stack)

- README.md + test.sh updated for the merged service inventory

Leader notes (2026-06-27 03:50):

- Worker self-blocked review-required after 2 runs (run #7 hit 120/120 iteration budget; run #8 staged 40 files and reported shippable).

- Tests are SKIPPED until docker compose up — worker chose that pattern over mocking (consistent with the lore-engine-poc project convention). To activate, run `docker compose up -d --build && pytest tests/contract/`.

- File Scope reconciliation: story said gateway/plugins/nsc/__init__.py; worker shipped plugins/nsc.py (flat file). Justified by the existing plugins/ convention in lore-engine-poc (server.py glob("*.py")). A future PR could split nsc into a package once server.py learns __init__.py discovery.

- nsc plugin exposes 11 tools (not 8) — the AC said "8" but the worker enumerated all 11 tools present in mcp-server/main.go. The encounter-specific 3 tools (list_encounters, search_encounters, get_encounter) were included for consistency. Story AC #2 reads "≥ 8 GraphMCP tools" so this exceeds AC.

Refs: S2-phase-1-substrate-merge, milestone #64 P1 — Substrate merge
2026-06-27 03:48:54 +00:00

568 lines
18 KiB
Go

package main
import (
"bytes"
"context"
"encoding/json"
"fmt"
"log/slog"
"net/http"
"os"
"regexp"
"strings"
"time"
"github.com/neo4j/neo4j-go-driver/v5/neo4j"
"github.com/redis/go-redis/v9"
)
// httpClient is the shared HTTP client used for the LLM chat-completion
// requests; its 30-second timeout bounds each request end to end.
var httpClient = &http.Client{Timeout: 30 * time.Second}
// ── Config ────────────────────────────────────────────────────────────────────
// Config holds the worker's runtime settings. configFromEnv fills it from
// environment variables, falling back to built-in defaults.
type Config struct {
	// Redis stream consumption.
	RedisURL string // connection URL, parsed with redis.ParseURL
	Stream   string // stream key the worker reads from
	Group    string // consumer group name
	Consumer string // this worker's consumer name inside the group

	// Neo4j connection.
	Neo4jURL  string
	Neo4jUser string
	Neo4jPass string

	// LLM endpoint.
	LLMURL   string // base URL; "/v1/chat/completions" is appended per request
	LLMModel string // model name sent in the chat request

	// Extraction tuning.
	PromptFile         string // path to a text file; overrides the default system prompt if set
	SupersedeRelations string // comma-separated relation types that supersede prior edges from the same source, e.g. "PREFERS,LIKES"
}
// configFromEnv builds a Config from environment variables. Every setting has
// a default that applies when the variable is unset or empty (see getEnv).
func configFromEnv() Config {
	var cfg Config

	// Redis stream consumption.
	cfg.RedisURL = getEnv("REDIS_URL", "redis://redis:6379")
	cfg.Stream = getEnv("REDIS_STREAM", "raw.messages")
	cfg.Group = getEnv("REDIS_GROUP", "extraction")
	cfg.Consumer = getEnv("CONSUMER_NAME", "entity-extractor-1")

	// Neo4j connection.
	cfg.Neo4jURL = getEnv("NEO4J_URL", "bolt://neo4j:7687")
	cfg.Neo4jUser = getEnv("NEO4J_USER", "neo4j")
	cfg.Neo4jPass = getEnv("NEO4J_PASSWORD", "changeme")

	// LLM endpoint and extraction tuning.
	cfg.LLMURL = getEnv("LLM_URL", "http://ollama-cpu:11435")
	cfg.LLMModel = getEnv("LLM_MODEL", "qwen2.5:3b")
	cfg.PromptFile = getEnv("PROMPT_FILE", "")
	cfg.SupersedeRelations = getEnv("SUPERSEDE_RELATIONS", "ALLIED_WITH,ENEMY_OF")

	return cfg
}
// parseRelationSet turns a comma-separated list of relation type names into a
// membership set. Names are upper-cased and trimmed of surrounding whitespace;
// empty items are dropped. The returned map is never nil.
func parseRelationSet(s string) map[string]bool {
	items := strings.Split(s, ",")
	members := make(map[string]bool, len(items))
	for i := range items {
		name := strings.TrimSpace(strings.ToUpper(items[i]))
		if name == "" {
			continue
		}
		members[name] = true
	}
	return members
}
// ── System prompt ─────────────────────────────────────────────────────────────
//
// defaultSystemPrompt is the built-in system prompt for entity extraction.
// loadPrompt returns it unless PROMPT_FILE names a readable text file (e.g.
// prompt.txt), which allows retuning without recompiling.
// The entity/relation types listed here are what the LLM is asked to use —
// change them here to change the entire extraction schema.
const defaultSystemPrompt = `You are a narrative entity extraction engine for a D&D campaign knowledge graph. Given a Discord message and its author, extract named entities from the D&D world and the relationships between them.
Return ONLY valid JSON in this exact shape, no other text:
{
"entities": [
{"name": "Theron Ashveil", "type": "Person"},
{"name": "The Iron Council", "type": "Faction"},
{"name": "Thornwall Keep", "type": "Location"},
{"name": "Siege of Thornwall", "type": "Event"},
{"name": "Sword of Eventide", "type": "Item"}
],
"relations": [
{"from": "Theron Ashveil", "to": "The Iron Council", "rel": "MEMBER_OF"},
{"from": "Theron Ashveil", "to": "Siege of Thornwall", "rel": "PARTICIPATED_IN"},
{"from": "Siege of Thornwall", "to": "Thornwall Keep", "rel": "OCCURRED_AT"}
]
}
Entity types (use exactly these labels):
Person — a named character, player character, NPC, deity, or historical figure in the story
Location — a named place, dungeon, city, region, landmark, or realm in the game world
Event — a named battle, encounter, ceremony, quest milestone, or significant occurrence
Faction — a guild, kingdom, order, cult, party, or named group of people
Item — a named weapon, artifact, magical item, relic, or significant object
Creature — a named or typed monster, beast, or non-person entity (e.g. "Ancient Red Dragon", "The Pale Worm")
Relation types (use exactly these labels):
PARTICIPATED_IN — Person or Faction took part in an Event
OCCURRED_AT — Event took place at a Location
LOCATED_AT — Person, Faction, or Item is currently at or in a Location
RULES — Person or Faction governs or controls a Location or Faction
MEMBER_OF — Person belongs to a Faction
ALLIED_WITH — Person or Faction is allied with another Person or Faction
ENEMY_OF — Person or Faction is opposed to another Person or Faction
POSSESSES — Person or Faction holds or owns an Item
SEEKS — Person or Faction is actively looking for a Person, Item, or Location
KNOWS — two Persons have a relationship or acquaintance
RELATED_TO — two entities are connected but no specific relation applies
Rules:
- Always include the message author as a Person entity.
- Use proper nouns only — do not extract generic words like "sword" or "city", only named ones.
- Normalise names to title case (e.g. "theron" → "Theron Ashveil" if the full name is known from context).
- Omit entities or relations you are not confident about.
- If the message is out-of-character (e.g. rules questions, scheduling, meta discussion), extract no entities and return {"entities": [], "relations": []}.`
// loadPrompt returns the system prompt to use for extraction: the contents of
// cfg.PromptFile when that path is set and readable, otherwise the built-in
// defaultSystemPrompt. A read failure is logged as a warning, not returned.
func loadPrompt(cfg Config) string {
	path := cfg.PromptFile
	if path != "" {
		contents, readErr := os.ReadFile(path)
		if readErr == nil {
			slog.Info("loaded system prompt from file", "file", path)
			return string(contents)
		}
		slog.Warn("could not read PROMPT_FILE, using default", "file", path, "err", readErr)
	}
	return defaultSystemPrompt
}
// ── LLM entity extraction ─────────────────────────────────────────────────────
// Wire types for the chat-completions endpoint called by extractEntities.
type (
	// chatMessage is one role-tagged message of the conversation.
	chatMessage struct {
		Role    string `json:"role"`
		Content string `json:"content"`
	}

	// chatRequest is the JSON request body posted to the LLM.
	chatRequest struct {
		Model    string        `json:"model"`
		Messages []chatMessage `json:"messages"`
		Stream   bool          `json:"stream"`
	}

	// chatResponse decodes only the part of the reply the worker reads: the
	// message content of each returned choice.
	chatResponse struct {
		Choices []struct {
			Message struct {
				Content string `json:"content"`
			} `json:"message"`
		} `json:"choices"`
	}
)
// Result types produced by the extraction step and consumed by the graph
// writer.
type (
	// Entity is one named thing found in a message, with its type label.
	Entity struct {
		Name string `json:"name"`
		Type string `json:"type"`
	}

	// ExtractedRelation is a typed edge between two entity names.
	ExtractedRelation struct {
		From string `json:"from"`
		To   string `json:"to"`
		Rel  string `json:"rel"`
	}

	// ExtractionResult bundles the entities and relations found in one message.
	ExtractionResult struct {
		Entities  []Entity            `json:"entities"`
		Relations []ExtractedRelation `json:"relations"`
	}
)
// trailingCommaRe matches either a complete JSON string literal or a comma
// that is followed (after optional whitespace) by a closing '}' or ']'.
// Matching string literals first lets repairJSON step over them untouched, so
// text such as "a, ]" inside a value is never edited.
var trailingCommaRe = regexp.MustCompile(`(?s)"(?:[^"\\]|\\.)*"|,\s*[}\]]`)

// repairJSON removes trailing commas — a comma directly before a closing brace
// or bracket, optionally separated by whitespace — which LLMs commonly emit and
// encoding/json rejects. The comma and the whitespace after it are dropped;
// string literals are passed through byte-for-byte.
func repairJSON(s string) string {
	return trailingCommaRe.ReplaceAllStringFunc(s, func(m string) string {
		if m[0] == '"' {
			// A string literal: keep it exactly as written.
			return m
		}
		// A trailing comma: keep only the closing brace/bracket.
		return m[len(m)-1:]
	})
}
// fixUnicodeEscapes removes malformed \uXXXX escapes from raw LLM output so the
// text can be parsed as JSON. Well-formed escapes (\u followed by exactly four
// hex digits) are kept verbatim. For a malformed escape, the "\u" and whatever
// hex digits follow it are dropped, so stray digits do not end up inside
// entity names; every other byte — including a quote or brace that directly
// follows the broken escape — is preserved.
//
// Backslash escapes are consumed as pairs, so the 'u' in `\\u` (an escaped
// backslash followed by a literal 'u') is not mistaken for a unicode escape.
func fixUnicodeEscapes(s string) string {
	var buf strings.Builder
	buf.Grow(len(s))
	for i := 0; i < len(s); {
		// Ordinary byte, or a lone backslash at the very end: copy through.
		if s[i] != '\\' || i+1 >= len(s) {
			buf.WriteByte(s[i])
			i++
			continue
		}
		// Any escape other than \u is copied as a two-byte unit.
		if s[i+1] != 'u' {
			buf.WriteString(s[i : i+2])
			i += 2
			continue
		}
		// Count the hex digits that follow "\u", up to the four a valid escape needs.
		digits := 0
		for digits < 4 && i+2+digits < len(s) && isHexByte(s[i+2+digits]) {
			digits++
		}
		if digits == 4 {
			buf.WriteString(s[i : i+6])
		}
		// Valid or not, step past "\u" and the digits that were examined.
		i += 2 + digits
	}
	return buf.String()
}

// isHexByte reports whether c is an ASCII hexadecimal digit.
func isHexByte(c byte) bool {
	switch {
	case c >= '0' && c <= '9':
		return true
	case c >= 'a' && c <= 'f':
		return true
	case c >= 'A' && c <= 'F':
		return true
	}
	return false
}
// stripFences removes a Markdown code fence wrapped around the LLM's reply and
// returns the trimmed content.
//
// Handled shapes:
//   - no fence at all: the trimmed input is returned;
//   - reply starts with "```json" or "```": the opener is removed and
//     everything from the last fence onwards is cut;
//   - a single fence that is not at the start: treated as a closing fence and
//     everything from it onwards is cut;
//   - prose before the opening fence (e.g. "Here is the JSON:"): the content
//     between the first and the last fence is returned, so the prose and the
//     opener no longer leak into the JSON.
func stripFences(s string) string {
	const fence = "```"
	s = strings.TrimSpace(s)

	first := strings.Index(s, fence)
	if first < 0 {
		return s
	}
	// Exactly one fence, and not at the start: it can only be a closer.
	if first > 0 && first == strings.LastIndex(s, fence) {
		return strings.TrimSpace(s[:first])
	}

	// The first fence is the opener; drop it together with an optional "json" tag.
	body := strings.TrimPrefix(s[first+len(fence):], "json")
	if end := strings.LastIndex(body, fence); end != -1 {
		body = body[:end]
	}
	return strings.TrimSpace(body)
}
// salvageEntities is the fallback for replies whose JSON does not parse. It
// cuts the text at the first occurrence of the "relations" key, closes the
// object with an empty relations array and tries to parse again. It returns
// the recovered entities (with no relations), or nil when the key is absent,
// the patched text still does not parse, or no entities were recovered.
func salvageEntities(raw string) *ExtractionResult {
	cut := strings.Index(raw, `"relations"`)
	if cut < 0 {
		return nil
	}

	// Drop the separator before the key, then close the object ourselves.
	head := strings.TrimRight(raw[:cut], ", \t\n\r")
	patched := head + `,"relations":[]}`

	var parsed rawExtractionResult
	parseErr := json.Unmarshal([]byte(patched), &parsed)
	if parseErr != nil || len(parsed.Entities) == 0 {
		return nil
	}

	slog.Warn("salvaged entities only — relations were malformed", "entities", len(parsed.Entities))
	return &ExtractionResult{Entities: parsed.Entities}
}
// rawRelation is a relation as first decoded from the LLM reply. The endpoints
// are kept as raw JSON because they are not always plain strings; coerceString
// converts them afterwards.
type rawRelation struct {
	From json.RawMessage `json:"from"` // source endpoint, undecoded
	To   json.RawMessage `json:"to"`   // target endpoint, undecoded
	Rel  string          `json:"rel"`  // relation type label
}
// coerceString extracts a name from a relation endpoint that may be encoded in
// several ways. It tries, in order:
//
//  1. a JSON string — returned as is;
//  2. an object with a non-empty "name" field — returned with surrounding
//     '*', '_' and spaces trimmed;
//  3. a non-empty array — the first element is coerced recursively.
//
// Anything else yields the empty string.
func coerceString(raw json.RawMessage) string {
	var text string
	if err := json.Unmarshal(raw, &text); err == nil {
		return text
	}

	var named struct {
		Name string `json:"name"`
	}
	if err := json.Unmarshal(raw, &named); err == nil && named.Name != "" {
		return strings.Trim(named.Name, "*_ ")
	}

	var items []json.RawMessage
	if err := json.Unmarshal(raw, &items); err != nil || len(items) == 0 {
		return ""
	}
	return coerceString(items[0])
}
// rawExtractionResult is the shape the LLM reply is first decoded into:
// entities in their final form, relations with still-undecoded endpoints.
type rawExtractionResult struct {
	Entities  []Entity      `json:"entities"`
	Relations []rawRelation `json:"relations"`
}
// extractEntities asks the LLM to extract entities and relations from one
// message.
//
// It posts systemPrompt plus an "Author/Message" user message to
// cfg.LLMURL+"/v1/chat/completions" and parses the first choice's content as
// JSON after stripping code fences, dropping malformed unicode escapes and
// removing trailing commas.
//
// Returns an error when the request cannot be built or sent, the endpoint
// answers with a non-2xx status, the response envelope cannot be decoded, or
// it contains no choices. When only the model's content is unparsable, the
// entities are salvaged if possible; otherwise an empty result is returned
// with a nil error so the caller can still record the message. Relations whose
// from/to/rel cannot be resolved to non-empty strings are dropped.
func extractEntities(ctx context.Context, cfg Config, systemPrompt, author, content string) (*ExtractionResult, error) {
	// Give the LLM the author alongside the text so it can attribute the
	// message to the right Person.
	userMsg := fmt.Sprintf("Author: %s\nMessage: %s", author, content)
	payload := chatRequest{
		Model: cfg.LLMModel,
		Messages: []chatMessage{
			{Role: "system", Content: systemPrompt},
			{Role: "user", Content: userMsg},
		},
		Stream: false,
	}
	body, err := json.Marshal(payload)
	if err != nil {
		return nil, fmt.Errorf("encode chat request: %w", err)
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodPost,
		cfg.LLMURL+"/v1/chat/completions", bytes.NewReader(body))
	if err != nil {
		return nil, fmt.Errorf("build chat request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := httpClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("send chat request: %w", err)
	}
	defer resp.Body.Close()

	// An error status means the body is not a completion; report the status
	// instead of a confusing decode or "empty response" error.
	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		return nil, fmt.Errorf("LLM returned status %s", resp.Status)
	}

	var cr chatResponse
	if err := json.NewDecoder(resp.Body).Decode(&cr); err != nil {
		return nil, fmt.Errorf("decode chat response: %w", err)
	}
	if len(cr.Choices) == 0 {
		return nil, fmt.Errorf("empty LLM response")
	}

	// Clean up the usual LLM formatting slips before parsing.
	raw := cr.Choices[0].Message.Content
	raw = stripFences(raw)
	raw = fixUnicodeEscapes(raw)
	raw = repairJSON(raw)

	var rr rawExtractionResult
	if err := json.Unmarshal([]byte(raw), &rr); err != nil {
		if result := salvageEntities(raw); result != nil {
			return result, nil
		}
		// Unparsable content is not retried: return an empty result so the
		// message is still recorded and acknowledged.
		slog.Warn("LLM returned non-JSON", "raw", cr.Choices[0].Message.Content)
		return &ExtractionResult{}, nil
	}

	result := ExtractionResult{Entities: rr.Entities}
	for _, r := range rr.Relations {
		from, to := coerceString(r.From), coerceString(r.To)
		if from != "" && to != "" && r.Rel != "" {
			result.Relations = append(result.Relations, ExtractedRelation{From: from, To: to, Rel: r.Rel})
		}
	}
	return &result, nil
}
// ── Neo4j write ───────────────────────────────────────────────────────────────
// mergeEntities records the entities extracted from one message.
//
// Parameters: $msgID (message id) and $entities (list of maps with "name" and
// "type" keys). The Message node is merged by id. Each entity node is merged
// on name alone — the MERGE pattern carries no label — and type/source are set
// only when the node is created. Nodes already labelled LoreDocument,
// LoreChunk, Chunk or Encounter are filtered out; for the rest ent.type is
// added as a label via the APOC procedure apoc.create.addLabels and a MENTIONS
// edge from the message is merged.
const mergeEntities = `
MERGE (m:Message {id: $msgID})
WITH m
UNWIND $entities AS ent
MERGE (e {name: ent.name})
ON CREATE SET e.type = ent.type, e.source = "discord"
WITH m, e, ent
WHERE NOT (e:LoreDocument OR e:LoreChunk OR e:Chunk OR e:Encounter)
CALL apoc.create.addLabels(e, [ent.type]) YIELD node
MERGE (m)-[:MENTIONS]->(node)
`
// mergeAuthor links the known author (from the stream fields) directly to the
// message, independent of whatever the LLM extracted.
//
// Parameters: $authorID, $authorName, $msgID. The Person is merged on id and
// its name is set only when the node is first created; the Message is merged
// on id and connected with a POSTED edge.
const mergeAuthor = `
MERGE (p:Person {id: $authorID})
ON CREATE SET p.name = $authorName
MERGE (m:Message {id: $msgID})
MERGE (p)-[:POSTED]->(m)
`
// mergeRelation writes one extracted relation between two existing nodes.
//
// Parameters: $from and $to (node names), $rel (relationship type),
// $timestamp and $msgID. Both endpoints are looked up by name with MATCH, so
// nothing is written when either node is missing. The relationship is merged
// by type only (no identity properties), then stamped with the message
// timestamp and id on every write, so a repeated assertion re-uses the
// existing edge and refreshes since/msg_id.
//
// Because the existing edge is re-used, any superseded/superseded_by markers
// left on it by supersedeExisting are removed here. Without that, a
// re-asserted exclusive relation would stay flagged as superseded even though
// it is the most recent statement.
const mergeRelation = `
MATCH (a {name: $from})
MATCH (b {name: $to})
WITH a, b
CALL apoc.merge.relationship(a, $rel, {}, {}, b) YIELD rel
SET rel.since = $timestamp,
rel.msg_id = $msgID
REMOVE rel.superseded, rel.superseded_by
RETURN rel
`
// supersedeExisting is run before mergeRelation for exclusive relation types,
// i.e. those listed in SUPERSEDE_RELATIONS (e.g. ALLIED_WITH). It marks every
// not-yet-superseded outgoing edge of type $rel from the node named $from —
// whatever its target — as superseded and records the superseding message id
// in superseded_by.
// type(r) is a built-in Cypher function and can be compared to a parameter,
// so no APOC is needed here.
const supersedeExisting = `
MATCH (a {name: $from})-[r]->()
WHERE type(r) = $rel
AND NOT coalesce(r.superseded, false)
SET r.superseded = true,
r.superseded_by = $msgID
`
// writeToGraph persists one processed message to Neo4j.
//
// Steps, each in its own managed write transaction:
//  1. link the author to the message (always, even with no extraction result);
//  2. merge the extracted entities and their MENTIONS edges;
//  3. for every relation, supersede earlier edges first when its type is in
//     supersedeSet, then merge the relation itself.
//
// Entities with a blank name or type are skipped with a warning. A blank type
// cannot be used as a label and would fail the whole entity write, leaving the
// message unacknowledged; a blank name would merge a meaningless node.
//
// Returns an error only when the author link or the entity merge fails.
// Relation and supersession failures are best-effort: they are logged and the
// remaining relations are still attempted.
func writeToGraph(ctx context.Context, session neo4j.SessionWithContext,
	msgID, authorID, authorName, timestamp string,
	supersedeSet map[string]bool,
	result *ExtractionResult) error {
	// run executes a single Cypher statement in its own write transaction.
	run := func(query string, params map[string]any) error {
		_, err := session.ExecuteWrite(ctx, func(tx neo4j.ManagedTransaction) (any, error) {
			_, err := tx.Run(ctx, query, params)
			return nil, err
		})
		return err
	}

	// Always write the author→message link regardless of extraction results.
	err := run(mergeAuthor, map[string]any{
		"authorID":   authorID,
		"authorName": authorName,
		"msgID":      msgID,
	})
	if err != nil {
		return fmt.Errorf("merge author: %w", err)
	}

	entities := make([]map[string]any, 0, len(result.Entities))
	for _, e := range result.Entities {
		if strings.TrimSpace(e.Name) == "" || strings.TrimSpace(e.Type) == "" {
			slog.Warn("skipped entity with empty name or type", "name", e.Name, "type", e.Type)
			continue
		}
		entities = append(entities, map[string]any{"name": e.Name, "type": e.Type})
	}
	if len(entities) == 0 {
		return nil
	}

	err = run(mergeEntities, map[string]any{
		"msgID":    msgID,
		"entities": entities,
	})
	if err != nil {
		return fmt.Errorf("merge entities: %w", err)
	}

	for _, rel := range result.Relations {
		if supersedeSet[rel.Rel] {
			err := run(supersedeExisting, map[string]any{
				"from":  rel.From,
				"rel":   rel.Rel,
				"msgID": msgID,
			})
			if err != nil {
				slog.Warn("supersession failed", "from", rel.From, "rel", rel.Rel, "err", err)
			}
		}
		err := run(mergeRelation, map[string]any{
			"from":      rel.From,
			"to":        rel.To,
			"rel":       rel.Rel,
			"timestamp": timestamp,
			"msgID":     msgID,
		})
		if err != nil {
			slog.Warn("skipped relation", "from", rel.From, "to", rel.To, "err", err)
		}
	}
	return nil
}
// ── Main ──────────────────────────────────────────────────────────────────────
// main wires the worker together: it loads configuration and the system
// prompt, connects to Redis and Neo4j, ensures the consumer group exists,
// retries messages left pending by a previous run, and then consumes new
// stream messages forever. A message is acknowledged only after it has been
// processed successfully; failures are logged and the message stays pending.
func main() {
	cfg := configFromEnv()
	slog.SetDefault(slog.New(slog.NewJSONHandler(os.Stdout, nil)))
	systemPrompt := loadPrompt(cfg)
	supersedeSet := parseRelationSet(cfg.SupersedeRelations)
	slog.Info("supersede-on-write", "relations", cfg.SupersedeRelations)

	ctx := context.Background()

	rOpts, err := redis.ParseURL(cfg.RedisURL)
	if err != nil {
		slog.Error("invalid redis URL", "err", err)
		os.Exit(1)
	}
	rdb := redis.NewClient(rOpts)

	// Creating a group that already exists fails with BUSYGROUP, which is the
	// normal case on restart. Anything else is worth surfacing, but the worker
	// keeps going as before so a slow-starting Redis does not crash it.
	if err := rdb.XGroupCreateMkStream(ctx, cfg.Stream, cfg.Group, "0").Err(); err != nil &&
		!strings.Contains(err.Error(), "BUSYGROUP") {
		slog.Warn("could not create consumer group", "stream", cfg.Stream, "group", cfg.Group, "err", err)
	}

	driver, err := neo4j.NewDriverWithContext(cfg.Neo4jURL,
		neo4j.BasicAuth(cfg.Neo4jUser, cfg.Neo4jPass, ""))
	if err != nil {
		slog.Error("neo4j driver error", "err", err)
		os.Exit(1)
	}
	defer driver.Close(ctx)

	slog.Info("entity-extractor started", "stream", cfg.Stream, "group", cfg.Group)

	// handle processes one message and acknowledges it on success. On failure
	// the message is left un-ACK'd so it remains in the pending list.
	handle := func(msg redis.XMessage, failureLog string) {
		if err := processMessage(ctx, cfg, systemPrompt, supersedeSet, driver, msg); err != nil {
			slog.Error(failureLog, "id", msg.ID, "err", err)
			return
		}
		if err := rdb.XAck(ctx, cfg.Stream, cfg.Group, msg.ID).Err(); err != nil {
			slog.Warn("ack failed, message stays pending", "id", msg.ID, "err", err)
		}
	}

	// Reclaim any messages delivered but not ACK'd before last shutdown.
	// Bounded to maxRecoveryPasses so a persistently failing message
	// (e.g. LLM not yet ready) does not block the live loop on startup.
	const maxRecoveryPasses = 5
	for pass := 0; pass < maxRecoveryPasses; pass++ {
		results, err := rdb.XReadGroup(ctx, &redis.XReadGroupArgs{
			Group:    cfg.Group,
			Consumer: cfg.Consumer,
			Streams:  []string{cfg.Stream, "0"}, // "0" = this consumer's pending entries
			Count:    5,
		}).Result()
		if err != nil {
			if err != redis.Nil {
				slog.Warn("pending read failed, skipping recovery", "err", err)
			}
			break
		}
		if len(results) == 0 || len(results[0].Messages) == 0 {
			break
		}
		for _, msg := range results[0].Messages {
			slog.Info("reprocessing pending message", "id", msg.ID)
			handle(msg, "extraction failed (pending)")
		}
	}

	// Live loop: block up to five seconds waiting for new messages.
	for {
		results, err := rdb.XReadGroup(ctx, &redis.XReadGroupArgs{
			Group:    cfg.Group,
			Consumer: cfg.Consumer,
			Streams:  []string{cfg.Stream, ">"}, // ">" = messages never delivered to this group
			Count:    5,
			Block:    5 * time.Second,
		}).Result()
		if err == redis.Nil {
			// Block timeout with nothing to read.
			continue
		}
		if err != nil {
			slog.Error("redis read error", "err", err)
			time.Sleep(2 * time.Second)
			continue
		}
		for _, stream := range results {
			for _, msg := range stream.Messages {
				handle(msg, "extraction failed")
			}
		}
	}
}
// processMessage handles one stream message end to end: it reads the message
// fields, runs LLM extraction on the content and writes the outcome to Neo4j
// through a session opened for this message.
//
// A message with no content is treated as done (nil error, nothing written).
// Any returned error is wrapped with the failing stage.
func processMessage(ctx context.Context, cfg Config, systemPrompt string,
	supersedeSet map[string]bool, driver neo4j.DriverWithContext, msg redis.XMessage) error {
	fields := msg.Values

	text := strVal(fields, "content", "")
	if text == "" {
		return nil
	}

	// Missing fields fall back to the stream entry id / placeholder values.
	var (
		id         = strVal(fields, "id", msg.ID)
		authorName = strVal(fields, "author", "unknown")
		authorID   = strVal(fields, "author_id", "")
		ts         = strVal(fields, "timestamp", "")
	)

	extracted, err := extractEntities(ctx, cfg, systemPrompt, authorName, text)
	if err != nil {
		return fmt.Errorf("LLM extraction: %w", err)
	}

	session := driver.NewSession(ctx, neo4j.SessionConfig{})
	defer session.Close(ctx)

	err = writeToGraph(ctx, session, id, authorID, authorName, ts, supersedeSet, extracted)
	if err != nil {
		return fmt.Errorf("write to graph: %w", err)
	}

	slog.Info("processed message", "id", id, "author", authorName,
		"entities", len(extracted.Entities), "relations", len(extracted.Relations))
	return nil
}
// ── Helpers ───────────────────────────────────────────────────────────────────
// getEnv returns the value of the environment variable key, or fallback when
// the variable is unset or set to the empty string.
func getEnv(key, fallback string) string {
	value := os.Getenv(key)
	if value == "" {
		return fallback
	}
	return value
}
// strVal returns m[key] when it is present and holds a string (even an empty
// one); otherwise it returns fallback.
func strVal(m map[string]any, key, fallback string) string {
	// A missing key yields a nil interface, for which the assertion reports false.
	s, isString := m[key].(string)
	if !isString {
		return fallback
	}
	return s
}