Files
lore-engine-poc/workers/lore-extractor/main.go
Hermes adbb6f0cce feat(substrate): Phase 1 merge — Redis + 8 Go workers + nsc plugin
Ports the GraphMCP-Example substrate into lore-engine-poc:

- 8 Go workers under workers/ (discord-connector, discord-filter, lore-watcher, ingestion-worker, entity-extractor, lore-extractor, encounter-processor, mcp-server), each with Dockerfile + go.mod

- 3 Go unit-test files (encounter-processor, ingestion-worker, lore-extractor) — other 5 workers rely on integration tests via the live stack

- plugins/nsc.py: thin httpx proxy from gateway to lore-mcp-server:9000, exposes all 11 inherited GraphMCP tools (input schemas verbatim from mcp-server/main.go)

- docker-compose.yml: adds lore-redis + lore-mcp-server + the 7 worker services (lore- prefix to avoid clash with other GraphMCP stacks)

- verify-merge.sh (171 LOC, 7 pass conditions) + docs/VERIFICATION.md

- tests/contract/test_graphmcp_tool_contracts.py (15 tests; skipped when stack is down — TDD pattern, becomes active once docker compose up brings the stack)

- README.md + test.sh updated for the merged service inventory

Leader notes (2026-06-27 03:50):

- Worker self-blocked review-required after 2 runs (run #7 hit 120/120 iteration budget; run #8 staged 40 files and reported shippable).

- Tests are SKIPPED until docker compose up — worker chose that pattern over mocking (consistent with the lore-engine-poc project convention). To activate, run `docker compose up -d --build && pytest tests/contract/`.

- File Scope reconciliation: story said gateway/plugins/nsc/__init__.py; worker shipped plugins/nsc.py (flat file). Justified by the existing plugins/ convention in lore-engine-poc (server.py glob("*.py")). A future PR could split nsc into a package once server.py learns __init__.py discovery.

- nsc plugin exposes 11 tools (not 8) — the AC said "8" but the worker enumerated all 11 tools present in mcp-server/main.go. The encounter-specific 3 tools (list_encounters, search_encounters, get_encounter) were included for consistency. Story AC #2 reads "≥ 8 GraphMCP tools" so this exceeds AC.

Refs: S2-phase-1-substrate-merge, milestone #64 P1 — Substrate merge
2026-06-27 03:48:54 +00:00

657 lines
21 KiB
Go

package main
import (
"bytes"
"context"
"encoding/json"
"fmt"
"log/slog"
"net/http"
"os"
"regexp"
"strings"
"time"
"github.com/neo4j/neo4j-go-driver/v5/neo4j"
"github.com/redis/go-redis/v9"
)
// httpClient is the shared HTTP client used for requests to the LLM endpoint.
// Its 30-second timeout bounds every request made through it.
var httpClient = &http.Client{Timeout: 30 * time.Second}
// ── Config ────────────────────────────────────────────────────────────────────
// Config holds the worker's runtime settings. configFromEnv fills it from
// environment variables.
type Config struct {
// Redis connection URL plus the stream, consumer group and consumer name
// used when reading messages.
RedisURL string
Stream string
Group string
Consumer string
// Neo4j connection URL and basic-auth credentials.
Neo4jURL string
Neo4jUser string
Neo4jPass string
// Base URL and model name for the chat-completions LLM endpoint.
LLMURL string
LLMModel string
// Optional path to a file whose contents replace the built-in system prompt.
PromptFile string
}
// configFromEnv assembles a Config from environment variables. Every variable
// that is unset or empty is replaced by the built-in default shown here.
func configFromEnv() Config {
	var cfg Config

	// Redis stream consumer settings.
	cfg.RedisURL = getEnv("REDIS_URL", "redis://redis:6379")
	cfg.Stream = getEnv("REDIS_STREAM", "raw.lore")
	cfg.Group = getEnv("REDIS_GROUP", "lore-extraction")
	cfg.Consumer = getEnv("CONSUMER_NAME", "lore-extractor-1")

	// Graph database connection.
	cfg.Neo4jURL = getEnv("NEO4J_URL", "bolt://neo4j:7687")
	cfg.Neo4jUser = getEnv("NEO4J_USER", "neo4j")
	cfg.Neo4jPass = getEnv("NEO4J_PASSWORD", "changeme")

	// LLM endpoint and optional prompt override.
	cfg.LLMURL = getEnv("LLM_URL", "http://ollama-cpu:11435")
	cfg.LLMModel = getEnv("LLM_MODEL", "qwen2.5:3b")
	cfg.PromptFile = getEnv("PROMPT_FILE", "")

	return cfg
}
// ── System prompt ─────────────────────────────────────────────────────────────
// defaultSystemPrompt is the built-in system prompt sent to the LLM. It spells
// out the JSON shape the model must return, the allowed entity and relation
// labels, and the extraction rules. loadPrompt returns it when no prompt file
// is configured or the configured file cannot be read.
const defaultSystemPrompt = `You are a lore entity extraction engine for a D&D campaign knowledge graph. Given a passage of lore text (a biography, story, history, or worldbuilding document), extract named entities and the relationships between them.
Return ONLY valid JSON in this exact shape, no other text:
{
"entities": [
{"name": "Theron Ashveil", "type": "Person"},
{"name": "The Iron Council", "type": "Faction"},
{"name": "Thornwall Keep", "type": "Location"},
{"name": "Siege of Thornwall", "type": "Event", "temporal_hint": "Year 340 of the Third Age"},
{"name": "Sword of Eventide", "type": "Item"},
{"name": "Ancient Red Dragon", "type": "Creature"}
],
"relations": [
{"from": "Theron Ashveil", "to": "The Iron Council", "rel": "MEMBER_OF"},
{"from": "Siege of Thornwall", "to": "Thornwall Keep", "rel": "OCCURRED_AT"},
{"from": "Theron Ashveil", "to": "Sword of Eventide", "rel": "POSSESSES"}
]
}
Entity types (use exactly these labels):
Person — a named character, NPC, deity, or historical figure in the lore
Location — a named place, dungeon, city, region, landmark, realm, or geographic feature
Event — a named historical event, battle, ceremony, meeting, or significant occurrence
Faction — a named guild, kingdom, order, cult, party, or group of people
Item — a named weapon, artifact, magical item, relic, or significant object
Creature — a named or typed monster, beast, or non-person entity (e.g. "Ancient Red Dragon", "The Pale Worm")
Relation types (use exactly these labels):
PARTICIPATED_IN — Person or Faction took part in an Event
OCCURRED_AT — Event took place at a Location
LOCATED_AT — Person, Faction, Item, or Creature is found at or in a Location
RULES — Person or Faction governs or controls a Location or Faction
MEMBER_OF — Person belongs to a Faction
ALLIED_WITH — Person or Faction is allied with another Person or Faction
ENEMY_OF — Person or Faction is opposed to another Person or Faction
POSSESSES — Person or Faction holds or owns an Item
SEEKS — Person or Faction is actively looking for a Person, Item, or Location
KNOWS — two Persons have a relationship or acquaintance
PRECEDED — this Event preceded another Event chronologically
CREATED_BY — Item or Location was made or founded by a Person or Faction
Rules:
- Only extract entities that are explicitly named in the text.
- Choose the most specific relation type that fits; omit a relation rather than guessing.
- Omit entities or relations you are not confident about.
- Do not invent names or relationships not present in the text.
- temporal_hint: for every Event entity this field is REQUIRED. Use the best time information available in the text — a calendar year, a named age, a relative phrase like "shortly after the Fall of Thornwall", or "sometime during the Dusk War". Write "unknown era" only as an absolute last resort. For non-Event entities include temporal_hint only when the text explicitly states when they were active, founded, created, or died.`
// loadPrompt picks the system prompt to send to the LLM. With no PromptFile
// configured it returns defaultSystemPrompt. Otherwise it returns the file's
// contents, falling back to defaultSystemPrompt (with a warning) when the file
// cannot be read.
func loadPrompt(cfg Config) string {
	path := cfg.PromptFile
	if path == "" {
		return defaultSystemPrompt
	}

	contents, readErr := os.ReadFile(path)
	if readErr == nil {
		slog.Info("loaded system prompt from file", "file", path)
		return string(contents)
	}

	// An unreadable override is not fatal; the built-in prompt is used instead.
	slog.Warn("could not read PROMPT_FILE, using default", "file", path, "err", readErr)
	return defaultSystemPrompt
}
// ── LLM entity extraction ─────────────────────────────────────────────────────
// chatMessage is one entry of the "messages" array in a chat-completions
// request: a role and the text for that role.
type chatMessage struct {
Role string `json:"role"`
Content string `json:"content"`
}
// chatRequest is the JSON body POSTed to the LLM's /v1/chat/completions
// endpoint.
type chatRequest struct {
Model string `json:"model"`
Messages []chatMessage `json:"messages"`
Stream bool `json:"stream"`
}
// chatResponse decodes the part of the chat-completions response this worker
// reads: the message content of each returned choice. All other response
// fields are ignored.
type chatResponse struct {
Choices []struct {
Message struct {
Content string `json:"content"`
} `json:"message"`
} `json:"choices"`
}
// Entity is one extracted entity as it appears in the LLM's JSON output.
// Type is expected to be one of the labels listed in the system prompt;
// TemporalHint is optional and omitted from encoded JSON when empty.
type Entity struct {
Name string `json:"name"`
Type string `json:"type"`
TemporalHint string `json:"temporal_hint,omitempty"`
}
// ExtractedRelation is a normalized relation between two entity names: From
// and To are plain strings and Rel is the relation type label.
type ExtractedRelation struct {
From string `json:"from"`
To string `json:"to"`
Rel string `json:"rel"`
}
// ExtractionResult is the cleaned-up output of one LLM extraction: the
// entities found and the relations between them.
type ExtractionResult struct {
Entities []Entity `json:"entities"`
Relations []ExtractedRelation `json:"relations"`
}
// trailingCommaRe matches a comma, together with any whitespace after it, that
// sits immediately before a closing brace or bracket. The closer is captured
// in group 1 so a replacement can keep it.
var trailingCommaRe = regexp.MustCompile(`,\s*([}\]])`)
// repairJSON fixes a common LLM JSON mistake: a trailing comma before a closing
// brace or bracket. The comma and any whitespace between it and the closer are
// removed; everything else is copied through unchanged.
//
// The scan tracks string literals (including backslash escapes), so a
// comma-then-closer sequence inside a quoted value such as "x, ]" is left
// untouched instead of being rewritten.
func repairJSON(s string) string {
	var out strings.Builder
	out.Grow(len(s))

	inString := false
	for i := 0; i < len(s); i++ {
		c := s[i]

		if inString {
			out.WriteByte(c)
			switch c {
			case '\\':
				// Copy the escaped byte verbatim so an escaped quote does not
				// terminate the string.
				if i+1 < len(s) {
					i++
					out.WriteByte(s[i])
				}
			case '"':
				inString = false
			}
			continue
		}

		switch c {
		case '"':
			inString = true
		case ',':
			// Look past whitespace for a closer; if one follows, this comma is
			// a trailing comma and is dropped along with the whitespace.
			j := i + 1
			for j < len(s) && strings.IndexByte(" \t\n\f\r", s[j]) >= 0 {
				j++
			}
			if j < len(s) && (s[j] == '}' || s[j] == ']') {
				i = j - 1 // the closer itself is written on the next iteration
				continue
			}
		}
		out.WriteByte(c)
	}
	return out.String()
}
// fixUnicodeEscapes removes malformed \uXXXX escapes from s, i.e. ones whose
// four characters are not all hex digits. Models sometimes emit sequences like
// \u201g (g is not hex), which makes Go's JSON parser fail. Well-formed
// escapes and all other text are copied through unchanged.
//
// For a malformed escape, the \u marker and up to four following bytes are
// dropped so stray hex digits are not concatenated into entity names. The skip
// stops early at a double quote, a backslash, or a non-ASCII byte, so it never
// swallows a string delimiter, the start of another escape, or part of a
// multi-byte rune. When fewer than six bytes remain, only the \u marker is
// dropped.
//
// Any other backslash escape (\\, \", \n, ...) is copied as a two-byte unit,
// so the second backslash of "\\u" is never misread as the start of a unicode
// escape.
func fixUnicodeEscapes(s string) string {
	var buf strings.Builder
	buf.Grow(len(s))
	for i := 0; i < len(s); {
		// Ordinary byte, or a lone backslash at the very end of the input.
		if s[i] != '\\' || i+1 >= len(s) {
			buf.WriteByte(s[i])
			i++
			continue
		}
		// A non-unicode escape: keep both bytes together.
		if s[i+1] != 'u' {
			buf.WriteString(s[i : i+2])
			i += 2
			continue
		}
		// Truncated at end of input: drop just the \u marker.
		if i+6 > len(s) {
			i += 2
			continue
		}
		if isHexByte(s[i+2]) && isHexByte(s[i+3]) && isHexByte(s[i+4]) && isHexByte(s[i+5]) {
			buf.WriteString(s[i : i+6])
			i += 6
			continue
		}
		// Malformed escape: skip the marker and up to four payload bytes,
		// stopping before anything that is structurally significant.
		i += 2
		for n := 0; n < 4 && i < len(s); n++ {
			c := s[i]
			if c == '"' || c == '\\' || c >= 0x80 {
				break
			}
			i++
		}
	}
	return buf.String()
}

// isHexByte reports whether c is an ASCII hexadecimal digit (0-9, a-f, A-F).
func isHexByte(c byte) bool {
	return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')
}
// stripFences unwraps JSON that a model has enclosed in a markdown code fence.
// After trimming surrounding whitespace it drops a leading "```json" or "```"
// marker if present, cuts everything from the last "```" onward, and trims
// whitespace again. Input without fences is returned trimmed.
func stripFences(s string) string {
	body := strings.TrimSpace(s)

	// Opening fence: try the language-tagged form first, then the bare one.
	if rest, ok := strings.CutPrefix(body, "```json"); ok {
		body = rest
	} else if rest, ok := strings.CutPrefix(body, "```"); ok {
		body = rest
	}

	// Closing fence: discard it and anything that follows.
	if end := strings.LastIndex(body, "```"); end >= 0 {
		body = body[:end]
	}

	return strings.TrimSpace(body)
}
// salvageEntities recovers the entities array from output whose relations
// section is malformed. It cuts raw at the first occurrence of the "relations"
// key, appends an empty relations array to close the object, and parses that.
// It returns nil when the key is absent, the truncated text still does not
// parse, or no entities were recovered.
func salvageEntities(raw string) *ExtractionResult {
	cut := strings.Index(raw, `"relations"`)
	if cut == -1 {
		return nil
	}

	// Strip the separator left before the key, then close the object by hand.
	head := strings.TrimRight(raw[:cut], ", \t\n\r")
	candidate := []byte(head + `,"relations":[]}`)

	var parsed rawExtractionResult
	if json.Unmarshal(candidate, &parsed) != nil {
		return nil
	}
	if len(parsed.Entities) == 0 {
		return nil
	}

	slog.Warn("salvaged entities only — relations were malformed", "entities", len(parsed.Entities))
	return &ExtractionResult{Entities: parsed.Entities}
}
// rawRelation is a relation exactly as the LLM emitted it. From and To are
// kept as raw JSON because models return them as a string, a {"name":"..."}
// object, or a ["..."] array; coerceString turns each into a plain string.
type rawRelation struct {
From json.RawMessage `json:"from"`
To json.RawMessage `json:"to"`
Rel string `json:"rel"`
}
// coerceString reduces a raw JSON value to a plain string. It accepts, in
// order of preference:
//   - a JSON string, returned as-is (JSON null decodes here too and yields "");
//   - an object with a non-empty "name" field, returned with surrounding
//     '*', '_' and space characters trimmed;
//   - a non-empty array, in which case the first element is coerced
//     recursively.
//
// Anything else yields "".
func coerceString(raw json.RawMessage) string {
	var plain string
	if err := json.Unmarshal(raw, &plain); err == nil {
		return plain
	}

	var named struct {
		Name string `json:"name"`
	}
	if err := json.Unmarshal(raw, &named); err == nil && named.Name != "" {
		return strings.Trim(named.Name, "*_ ")
	}

	var items []json.RawMessage
	if err := json.Unmarshal(raw, &items); err != nil || len(items) == 0 {
		return ""
	}
	return coerceString(items[0])
}
// rawExtractionResult is the decode target for the LLM's JSON output. Entities
// decode directly; relations are kept in their loosely-typed raw form
// (rawRelation) so from/to can be normalized after decoding.
type rawExtractionResult struct {
Entities []Entity `json:"entities"`
Relations []rawRelation `json:"relations"`
}
// loadKnownEntities queries the graph for already-established entity names and
// returns a formatted hint block. Injecting this into the LLM prompt anchors
// extraction to canonical spellings, preventing "the Timeless" vs
// "Gromm The Timeless" duplicates and hallucinated location names.
//
// The query returns at most 100 names, the most-mentioned first, restricted to
// nodes that are featured by a LoreDocument and carry one of the six entity
// labels. Names are grouped by label in a fixed order.
//
// The hint block is optional: on a query failure the error is logged and ""
// is returned so extraction proceeds without hints. "" is also returned when
// the graph holds no matching entities.
func loadKnownEntities(ctx context.Context, driver neo4j.DriverWithContext) string {
	session := driver.NewSession(ctx, neo4j.SessionConfig{})
	defer session.Close(ctx)

	result, err := session.ExecuteRead(ctx, func(tx neo4j.ManagedTransaction) (any, error) {
		res, err := tx.Run(ctx, `
MATCH (d:LoreDocument)-[:FEATURES]->(e)
WHERE e.name IS NOT NULL
WITH e.name AS name,
[l IN labels(e) WHERE l IN ['Person','Location','Faction','Event','Item','Creature']][0] AS etype,
count(d) AS mentions
WHERE etype IS NOT NULL
RETURN name, etype ORDER BY mentions DESC LIMIT 100
`, nil)
		if err != nil {
			return nil, err
		}
		byType := map[string][]string{}
		for res.Next(ctx) {
			name, _ := res.Record().Get("name")
			etype, _ := res.Record().Get("etype")
			n, _ := name.(string)
			t, _ := etype.(string)
			if n != "" && t != "" {
				byType[t] = append(byType[t], n)
			}
		}
		return byType, res.Err()
	})
	if err != nil {
		// Best-effort: hints improve extraction but are not required for it.
		slog.Warn("could not load known entities; extracting without hints", "err", err)
		return ""
	}
	byType, ok := result.(map[string][]string)
	if !ok || len(byType) == 0 {
		return ""
	}

	var sb strings.Builder
	sb.WriteString("\nKnown canonical entity names already in this campaign's graph — use these exact spellings whenever the passage refers to them, even by nickname or title:\n")
	// Fixed label order keeps the prompt deterministic despite map iteration.
	for _, t := range []string{"Person", "Location", "Faction", "Event", "Item", "Creature"} {
		if names := byType[t]; len(names) > 0 {
			fmt.Fprintf(&sb, " %s: %s\n", t, strings.Join(names, ", "))
		}
	}
	return sb.String()
}
// extractEntities sends the document to the LLM's /v1/chat/completions
// endpoint and parses the reply into an ExtractionResult.
//
// The user message carries the title and passage, followed by the
// knownEntities hint block when it is non-empty. The reply is cleaned up
// (code fences stripped, malformed \u escapes removed, trailing commas
// repaired) before parsing.
//
// Returns an error when the request cannot be built or sent, the endpoint
// answers with a non-2xx status, the response envelope cannot be decoded, or
// it contains no choices. When the model's content is not valid JSON the
// function first tries to salvage the entities alone; failing that it logs the
// raw content and returns an empty result with a nil error, so the document is
// treated as processed with nothing extracted.
func extractEntities(ctx context.Context, cfg Config, systemPrompt, title, content, knownEntities string) (*ExtractionResult, error) {
	userMsg := fmt.Sprintf("Document title: %s\n\nPassage:\n%s", title, content)
	if knownEntities != "" {
		userMsg += "\n" + knownEntities
	}
	payload := chatRequest{
		Model: cfg.LLMModel,
		Messages: []chatMessage{
			{Role: "system", Content: systemPrompt},
			{Role: "user", Content: userMsg},
		},
		Stream: false,
	}
	body, err := json.Marshal(payload)
	if err != nil {
		return nil, fmt.Errorf("encode LLM request: %w", err)
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodPost,
		cfg.LLMURL+"/v1/chat/completions", bytes.NewReader(body))
	if err != nil {
		return nil, err
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := httpClient.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	// Check the status before decoding: an error page or error envelope would
	// otherwise surface as a confusing decode failure or "empty LLM response".
	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		return nil, fmt.Errorf("LLM endpoint returned %s", resp.Status)
	}

	var cr chatResponse
	if err := json.NewDecoder(resp.Body).Decode(&cr); err != nil {
		return nil, fmt.Errorf("decode LLM response: %w", err)
	}
	if len(cr.Choices) == 0 {
		return nil, fmt.Errorf("empty LLM response")
	}

	raw := cr.Choices[0].Message.Content
	raw = stripFences(raw)
	raw = fixUnicodeEscapes(raw)
	raw = repairJSON(raw)

	var rr rawExtractionResult
	if err := json.Unmarshal([]byte(raw), &rr); err != nil {
		// Relations are often malformed; try salvaging just entities.
		if result := salvageEntities(raw); result != nil {
			return result, nil
		}
		slog.Warn("LLM returned non-JSON", "raw", cr.Choices[0].Message.Content)
		return &ExtractionResult{}, nil
	}

	// Normalize relations and drop any that are missing an endpoint or a type.
	result := ExtractionResult{Entities: rr.Entities}
	for _, r := range rr.Relations {
		from, to := coerceString(r.From), coerceString(r.To)
		if from != "" && to != "" && r.Rel != "" {
			result.Relations = append(result.Relations, ExtractedRelation{From: from, To: to, Rel: r.Rel})
		}
	}
	return &result, nil
}
// ── Neo4j write ───────────────────────────────────────────────────────────────
// mergeLoreEntities links extracted lore entities to their source
// LoreDocument. For each element of $entities it:
//   - MERGEs a node by name alone (no label), setting type/source/lore_verified
//     on creation and lore_verified on match;
//   - skips nodes labelled LoreDocument, LoreChunk, Chunk, Message or Encounter;
//   - sets temporal_hint on the node whenever the entity supplies a non-empty
//     one (any entity type, not only Events);
//   - adds the entity type as a label via APOC and MERGEs a FEATURES
//     relationship from the document.
//
// NOTE(review): the label filter runs after the MERGE, so the ON CREATE /
// ON MATCH properties have already been written to a filtered-out node by the
// time it is skipped — confirm this is intended.
const mergeLoreEntities = `
MERGE (d:LoreDocument {id: $docID})
WITH d
UNWIND $entities AS ent
MERGE (e {name: ent.name})
ON CREATE SET e.type = ent.type, e.source = "lore", e.lore_verified = true
ON MATCH SET e.lore_verified = true
WITH d, e, ent
WHERE NOT (e:LoreDocument OR e:LoreChunk OR e:Chunk OR e:Message OR e:Encounter)
FOREACH (_ IN CASE WHEN ent.temporal_hint IS NOT NULL AND ent.temporal_hint <> "" THEN [1] ELSE [] END |
SET e.temporal_hint = ent.temporal_hint
)
WITH d, e, ent
CALL apoc.create.addLabels(e, [ent.type]) YIELD node
MERGE (d)-[:FEATURES]->(node)
`
// applyAliasesCypher overwrites the aliases array on every node whose name
// equals $name, excluding nodes labelled LoreDocument, LoreChunk or Chunk. It
// also sets lore_verified in case the entity existed before its lore doc was
// ingested. The caller passes the document's primary entity as $name.
const applyAliasesCypher = `
MATCH (e)
WHERE e.name = $name
AND NOT e:LoreDocument AND NOT e:LoreChunk AND NOT e:Chunk
SET e.aliases = $aliases, e.lore_verified = true
`
// mergeLoreRelation creates or updates a relationship of type $rel between the
// nodes named $from and $to (matched by name only, without a label), using
// apoc.merge.relationship. The relationship's since and doc_id properties are
// set on every run, so they reflect the most recent document that asserted it.
// If either node is missing, the MATCH yields no rows and nothing is written.
const mergeLoreRelation = `
MATCH (a {name: $from})
MATCH (b {name: $to})
WITH a, b
CALL apoc.merge.relationship(a, $rel, {}, {}, b) YIELD rel
SET rel.since = $uploadedAt,
rel.doc_id = $docID
RETURN rel
`
// detectContradictionsQuery finds cases where this document's LOCATED_AT or
// RULES claims conflict with a prior claim about the same subject. It starts
// from relationships stamped with $docID, then looks for a relationship of the
// same type from the same subject that carries a different doc_id and points
// at a differently-named target. For each such pair it MERGEs a Contradiction
// node keyed on subject, predicate, both claims and both doc ids (setting
// detected_at and flagged only on creation), links it to the subject with
// HAS_CONTRADICTION, and returns the count as "total".
const detectContradictionsQuery = `
MATCH (a)-[r1]->(x)
WHERE r1.doc_id = $docID AND type(r1) IN ['LOCATED_AT', 'RULES']
WITH a, type(r1) AS predicate, x.name AS claimA, r1.doc_id AS docA
MATCH (a)-[r2]->(y)
WHERE type(r2) = predicate AND r2.doc_id <> docA AND y.name <> claimA
WITH a, predicate, claimA, docA, y.name AS claimB, r2.doc_id AS docB
MERGE (contra:Contradiction {
subject: a.name,
predicate: predicate,
claim_a: claimA,
doc_a: docA,
claim_b: claimB,
doc_b: docB
})
ON CREATE SET contra.detected_at = $detectedAt, contra.flagged = true
WITH a, contra
MERGE (a)-[:HAS_CONTRADICTION]->(contra)
RETURN count(contra) AS total
`
// writeToGraph persists one extraction result. It does nothing when the result
// has no entities. Otherwise it merges all entities in a single write
// transaction (an error here is returned), then merges each relation in its
// own transaction (a failure is logged and that relation skipped), and finally
// runs contradiction detection for the document. The title parameter is not
// used by the body.
func writeToGraph(ctx context.Context, session neo4j.SessionWithContext,
	docID, title, uploadedAt string, result *ExtractionResult) error {
	if len(result.Entities) == 0 {
		return nil
	}

	// Flatten entities into the plain maps the Cypher UNWIND expects.
	rows := make([]map[string]any, 0, len(result.Entities))
	for _, ent := range result.Entities {
		rows = append(rows, map[string]any{
			"name":          ent.Name,
			"type":          ent.Type,
			"temporal_hint": ent.TemporalHint,
		})
	}

	mergeEntities := func(tx neo4j.ManagedTransaction) (any, error) {
		_, runErr := tx.Run(ctx, mergeLoreEntities, map[string]any{
			"docID":    docID,
			"entities": rows,
		})
		return nil, runErr
	}
	if _, err := session.ExecuteWrite(ctx, mergeEntities); err != nil {
		return fmt.Errorf("merge lore entities: %w", err)
	}

	// One transaction per relation so a single bad relation cannot roll back
	// the others.
	for i := range result.Relations {
		rel := result.Relations[i]
		params := map[string]any{
			"from":       rel.From,
			"to":         rel.To,
			"rel":        rel.Rel,
			"uploadedAt": uploadedAt,
			"docID":      docID,
		}
		_, relErr := session.ExecuteWrite(ctx, func(tx neo4j.ManagedTransaction) (any, error) {
			_, runErr := tx.Run(ctx, mergeLoreRelation, params)
			return nil, runErr
		})
		if relErr != nil {
			slog.Warn("skipped lore relation", "from", rel.From, "to", rel.To, "rel", rel.Rel, "err", relErr)
		}
	}

	flagContradictions(ctx, session, docID, uploadedAt)
	return nil
}
// applyDocumentAliases writes aliases onto the document's primary entity via
// applyAliasesCypher. It is a no-op when the entity name is empty or there are
// no aliases. The outcome is only logged; failures are not returned.
func applyDocumentAliases(ctx context.Context, session neo4j.SessionWithContext, primaryEntity string, aliases []string) {
	if primaryEntity == "" || len(aliases) == 0 {
		return
	}

	setAliases := func(tx neo4j.ManagedTransaction) (any, error) {
		_, runErr := tx.Run(ctx, applyAliasesCypher, map[string]any{
			"name":    primaryEntity,
			"aliases": aliases,
		})
		return nil, runErr
	}

	if _, err := session.ExecuteWrite(ctx, setAliases); err != nil {
		slog.Warn("failed to apply aliases", "entity", primaryEntity, "err", err)
		return
	}
	slog.Info("applied aliases to entity", "entity", primaryEntity, "aliases", aliases)
}
// flagContradictions runs detectContradictionsQuery for docID and logs how
// many contradictions it reported. A query failure is logged as a warning;
// nothing is returned to the caller either way.
func flagContradictions(ctx context.Context, session neo4j.SessionWithContext, docID, detectedAt string) {
	countContradictions := func(tx neo4j.ManagedTransaction) (any, error) {
		rows, runErr := tx.Run(ctx, detectContradictionsQuery, map[string]any{
			"docID":      docID,
			"detectedAt": detectedAt,
		})
		if runErr != nil {
			return int64(0), runErr
		}
		// The query returns a single aggregate row; no row means zero.
		if !rows.Next(ctx) {
			return int64(0), rows.Err()
		}
		total, _ := rows.Record().Get("total")
		return total, rows.Err()
	}

	outcome, err := session.ExecuteWrite(ctx, countContradictions)
	if err != nil {
		slog.Warn("contradiction detection failed", "doc_id", docID, "err", err)
		return
	}

	count, isInt := outcome.(int64)
	if isInt && count > 0 {
		slog.Info("contradictions flagged", "doc_id", docID, "count", count)
	}
}
// ── Main ──────────────────────────────────────────────────────────────────────
// main wires up the worker: it loads configuration and the system prompt,
// connects to Redis and Neo4j, ensures the consumer group exists, makes a
// bounded number of passes over messages left pending by a previous run, and
// then loops forever reading new messages from the stream. A message is
// acknowledged only after processMessage succeeds; failed messages stay
// pending.
func main() {
	cfg := configFromEnv()
	slog.SetDefault(slog.New(slog.NewJSONHandler(os.Stdout, nil)))
	systemPrompt := loadPrompt(cfg)
	ctx := context.Background()

	rOpts, err := redis.ParseURL(cfg.RedisURL)
	if err != nil {
		slog.Error("invalid redis URL", "err", err)
		os.Exit(1)
	}
	rdb := redis.NewClient(rOpts)

	// BUSYGROUP only means the group already exists from an earlier run. Any
	// other failure is surfaced but not fatal: the read loop below keeps
	// retrying and will log its own errors.
	if err := rdb.XGroupCreateMkStream(ctx, cfg.Stream, cfg.Group, "0").Err(); err != nil &&
		!strings.Contains(err.Error(), "BUSYGROUP") {
		slog.Warn("could not create consumer group", "stream", cfg.Stream, "group", cfg.Group, "err", err)
	}

	driver, err := neo4j.NewDriverWithContext(cfg.Neo4jURL,
		neo4j.BasicAuth(cfg.Neo4jUser, cfg.Neo4jPass, ""))
	if err != nil {
		slog.Error("neo4j driver error", "err", err)
		os.Exit(1)
	}
	defer driver.Close(ctx)

	slog.Info("lore-extractor started", "stream", cfg.Stream, "group", cfg.Group)

	// ack acknowledges a processed message. A failed ack leaves the message
	// pending, so it is worth a log line rather than silent loss.
	ack := func(id string) {
		if err := rdb.XAck(ctx, cfg.Stream, cfg.Group, id).Err(); err != nil {
			slog.Warn("failed to ack message; it remains pending", "id", id, "err", err)
		}
	}

	// Reclaim any messages delivered but not ACK'd before last shutdown.
	// Bounded to maxRecoveryPasses so a persistently failing message
	// (e.g. LLM not yet ready) does not block the live loop on startup.
	const maxRecoveryPasses = 5
	for pass := 0; pass < maxRecoveryPasses; pass++ {
		results, err := rdb.XReadGroup(ctx, &redis.XReadGroupArgs{
			Group:    cfg.Group,
			Consumer: cfg.Consumer,
			Streams:  []string{cfg.Stream, "0"},
			Count:    3,
		}).Result()
		if err != nil {
			if err != redis.Nil {
				slog.Warn("pending-message recovery read failed", "err", err)
			}
			break
		}
		if len(results) == 0 || len(results[0].Messages) == 0 {
			break
		}
		for _, msg := range results[0].Messages {
			slog.Info("reprocessing pending message", "id", msg.ID)
			if err := processMessage(ctx, cfg, systemPrompt, driver, msg); err != nil {
				slog.Error("lore extraction failed (pending)", "id", msg.ID, "err", err)
				continue
			}
			ack(msg.ID)
		}
	}

	for {
		results, err := rdb.XReadGroup(ctx, &redis.XReadGroupArgs{
			Group:    cfg.Group,
			Consumer: cfg.Consumer,
			Streams:  []string{cfg.Stream, ">"},
			Count:    3,
			Block:    5 * time.Second,
		}).Result()
		if err == redis.Nil {
			// Block timeout elapsed with no new messages.
			continue
		}
		if err != nil {
			slog.Error("redis read error", "err", err)
			time.Sleep(2 * time.Second)
			continue
		}
		for _, stream := range results {
			for _, msg := range stream.Messages {
				if err := processMessage(ctx, cfg, systemPrompt, driver, msg); err != nil {
					slog.Error("lore extraction failed", "id", msg.ID, "err", err)
					continue
				}
				ack(msg.ID)
			}
		}
	}
}
// processMessage handles one stream message end to end: it reads the document
// fields from the message, asks the LLM for entities and relations, writes
// them to the graph, and applies any aliases to the primary entity.
//
// A message with empty content is treated as successfully processed (nil is
// returned and nothing is written). Missing fields fall back to defaults: the
// message ID for "id", "Untitled" for "title", the current UTC time for
// "uploaded_at", and the title for "primary_entity". Errors from the LLM call
// or the entity merge are returned wrapped.
func processMessage(ctx context.Context, cfg Config, systemPrompt string,
	driver neo4j.DriverWithContext, msg redis.XMessage) error {
	fields := msg.Values

	body := strVal(fields, "content", "")
	if body == "" {
		return nil
	}

	docID := strVal(fields, "id", msg.ID)
	title := strVal(fields, "title", "Untitled")
	uploadedAt := strVal(fields, "uploaded_at", time.Now().UTC().Format(time.RFC3339))
	primaryEntity := strVal(fields, "primary_entity", title)

	// A malformed aliases payload is deliberately tolerated: whatever decoded
	// (possibly nothing) is used as-is.
	var aliases []string
	_ = json.Unmarshal([]byte(strVal(fields, "aliases", "[]")), &aliases)

	hints := loadKnownEntities(ctx, driver)
	extracted, err := extractEntities(ctx, cfg, systemPrompt, title, body, hints)
	if err != nil {
		return fmt.Errorf("LLM extraction: %w", err)
	}

	session := driver.NewSession(ctx, neo4j.SessionConfig{})
	defer session.Close(ctx)

	if err := writeToGraph(ctx, session, docID, title, uploadedAt, extracted); err != nil {
		return fmt.Errorf("write to graph: %w", err)
	}
	applyDocumentAliases(ctx, session, primaryEntity, aliases)

	slog.Info("processed lore document", "doc_id", docID, "title", title,
		"entities", len(extracted.Entities), "relations", len(extracted.Relations))
	return nil
}
// ── Helpers ───────────────────────────────────────────────────────────────────
// getEnv returns the value of environment variable key, or fallback when the
// variable is unset or set to the empty string.
func getEnv(key, fallback string) string {
	value := os.Getenv(key)
	if value == "" {
		return fallback
	}
	return value
}
// strVal returns m[key] when it is present and holds a string; otherwise it
// returns fallback. A present-but-empty string is returned as "", not replaced
// by the fallback.
func strVal(m map[string]any, key, fallback string) string {
	// A missing key yields a nil value, which fails the string assertion just
	// like any non-string value does.
	s, isString := m[key].(string)
	if !isString {
		return fallback
	}
	return s
}