Ports the GraphMCP-Example substrate into lore-engine-poc:

- 8 Go workers under workers/ (discord-connector, discord-filter, lore-watcher, ingestion-worker, entity-extractor, lore-extractor, encounter-processor, mcp-server), each with Dockerfile + go.mod
- 3 Go unit-test files (encounter-processor, ingestion-worker, lore-extractor) — other 5 workers rely on integration tests via the live stack
- plugins/nsc.py: thin httpx proxy from gateway to lore-mcp-server:9000, exposes all 11 inherited GraphMCP tools (input schemas verbatim from mcp-server/main.go)
- docker-compose.yml: adds lore-redis + lore-mcp-server + the 7 worker services (lore- prefix to avoid clash with other GraphMCP stacks)
- verify-merge.sh (171 LOC, 7 pass conditions) + docs/VERIFICATION.md
- tests/contract/test_graphmcp_tool_contracts.py (15 tests; skipped when stack is down — TDD pattern, becomes active once `docker compose up` brings the stack up)
- README.md + test.sh updated for the merged service inventory

Leader notes (2026-06-27 03:50):

- Worker self-blocked review-required after 2 runs (run #7 hit 120/120 iteration budget; run #8 staged 40 files and reported shippable).
- Tests are SKIPPED until docker compose up — worker chose that pattern over mocking (consistent with the lore-engine-poc project convention). To activate, run `docker compose up -d --build && pytest tests/contract/`.
- File Scope reconciliation: story said gateway/plugins/nsc/__init__.py; worker shipped plugins/nsc.py (flat file). Justified by the existing plugins/ convention in lore-engine-poc (server.py glob("*.py")). A future PR could split nsc into a package once server.py learns __init__.py discovery.
- nsc plugin exposes 11 tools (not 8) — the AC said "8" but the worker enumerated all 11 tools present in mcp-server/main.go. The encounter-specific 3 tools (list_encounters, search_encounters, get_encounter) were included for consistency. Story AC #2 reads "≥ 8 GraphMCP tools" so this exceeds AC.
Refs: S2-phase-1-substrate-merge, milestone #64 P1 — Substrate merge
657 lines
21 KiB
Go
657 lines
21 KiB
Go
package main
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"log/slog"
|
|
"net/http"
|
|
"os"
|
|
"regexp"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/neo4j/neo4j-go-driver/v5/neo4j"
|
|
"github.com/redis/go-redis/v9"
|
|
)
|
|
|
|
// httpClient is the shared client used for LLM requests. Each request made
// through it is bounded by a 30-second overall timeout.
var httpClient = &http.Client{
	Timeout: 30 * time.Second,
}
|
|
|
|
// ── Config ────────────────────────────────────────────────────────────────────
|
|
|
|
// Config carries the runtime settings of the lore extractor. configFromEnv
// fills every field from an environment variable.
type Config struct {
	// Redis connection URL plus the stream, consumer group and consumer
	// name used when reading messages.
	RedisURL, Stream, Group, Consumer string

	// Neo4j connection URL and basic-auth credentials.
	Neo4jURL, Neo4jUser, Neo4jPass string

	// Base URL and model name of the LLM endpoint, and an optional path to
	// a file that overrides the built-in system prompt.
	LLMURL, LLMModel, PromptFile string
}
|
|
|
|
func configFromEnv() Config {
|
|
return Config{
|
|
RedisURL: getEnv("REDIS_URL", "redis://redis:6379"),
|
|
Stream: getEnv("REDIS_STREAM", "raw.lore"),
|
|
Group: getEnv("REDIS_GROUP", "lore-extraction"),
|
|
Consumer: getEnv("CONSUMER_NAME", "lore-extractor-1"),
|
|
Neo4jURL: getEnv("NEO4J_URL", "bolt://neo4j:7687"),
|
|
Neo4jUser: getEnv("NEO4J_USER", "neo4j"),
|
|
Neo4jPass: getEnv("NEO4J_PASSWORD", "changeme"),
|
|
LLMURL: getEnv("LLM_URL", "http://ollama-cpu:11435"),
|
|
LLMModel: getEnv("LLM_MODEL", "qwen2.5:3b"),
|
|
PromptFile: getEnv("PROMPT_FILE", ""),
|
|
}
|
|
}
|
|
|
|
// ── System prompt ─────────────────────────────────────────────────────────────
|
|
|
|
// defaultSystemPrompt is the built-in system prompt, used by loadPrompt when
// no prompt file is configured or the file cannot be read. It pins the JSON
// output shape and the entity and relation vocabularies the model is asked to
// use. The text must stay free of stray separator lines: everything between
// the backticks is sent to the LLM verbatim.
const defaultSystemPrompt = `You are a lore entity extraction engine for a D&D campaign knowledge graph. Given a passage of lore text (a biography, story, history, or worldbuilding document), extract named entities and the relationships between them.

Return ONLY valid JSON in this exact shape, no other text:
{
"entities": [
{"name": "Theron Ashveil", "type": "Person"},
{"name": "The Iron Council", "type": "Faction"},
{"name": "Thornwall Keep", "type": "Location"},
{"name": "Siege of Thornwall", "type": "Event", "temporal_hint": "Year 340 of the Third Age"},
{"name": "Sword of Eventide", "type": "Item"},
{"name": "Ancient Red Dragon", "type": "Creature"}
],
"relations": [
{"from": "Theron Ashveil", "to": "The Iron Council", "rel": "MEMBER_OF"},
{"from": "Siege of Thornwall", "to": "Thornwall Keep", "rel": "OCCURRED_AT"},
{"from": "Theron Ashveil", "to": "Sword of Eventide", "rel": "POSSESSES"}
]
}

Entity types (use exactly these labels):
Person — a named character, NPC, deity, or historical figure in the lore
Location — a named place, dungeon, city, region, landmark, realm, or geographic feature
Event — a named historical event, battle, ceremony, meeting, or significant occurrence
Faction — a named guild, kingdom, order, cult, party, or group of people
Item — a named weapon, artifact, magical item, relic, or significant object
Creature — a named or typed monster, beast, or non-person entity (e.g. "Ancient Red Dragon", "The Pale Worm")

Relation types (use exactly these labels):
PARTICIPATED_IN — Person or Faction took part in an Event
OCCURRED_AT — Event took place at a Location
LOCATED_AT — Person, Faction, Item, or Creature is found at or in a Location
RULES — Person or Faction governs or controls a Location or Faction
MEMBER_OF — Person belongs to a Faction
ALLIED_WITH — Person or Faction is allied with another Person or Faction
ENEMY_OF — Person or Faction is opposed to another Person or Faction
POSSESSES — Person or Faction holds or owns an Item
SEEKS — Person or Faction is actively looking for a Person, Item, or Location
KNOWS — two Persons have a relationship or acquaintance
PRECEDED — this Event preceded another Event chronologically
CREATED_BY — Item or Location was made or founded by a Person or Faction

Rules:
- Only extract entities that are explicitly named in the text.
- Choose the most specific relation type that fits; omit a relation rather than guessing.
- Omit entities or relations you are not confident about.
- Do not invent names or relationships not present in the text.
- temporal_hint: for every Event entity this field is REQUIRED. Use the best time information available in the text — a calendar year, a named age, a relative phrase like "shortly after the Fall of Thornwall", or "sometime during the Dusk War". Write "unknown era" only as an absolute last resort. For non-Event entities include temporal_hint only when the text explicitly states when they were active, founded, created, or died.`
|
|
|
|
func loadPrompt(cfg Config) string {
|
|
if cfg.PromptFile == "" {
|
|
return defaultSystemPrompt
|
|
}
|
|
data, err := os.ReadFile(cfg.PromptFile)
|
|
if err != nil {
|
|
slog.Warn("could not read PROMPT_FILE, using default", "file", cfg.PromptFile, "err", err)
|
|
return defaultSystemPrompt
|
|
}
|
|
slog.Info("loaded system prompt from file", "file", cfg.PromptFile)
|
|
return string(data)
|
|
}
|
|
|
|
// ── LLM entity extraction ─────────────────────────────────────────────────────
|
|
|
|
// chatMessage is one message of a chat-completions conversation as it is
// serialised on the wire.
type chatMessage struct {
	Role    string `json:"role"`    // speaker; this file sends "system" and "user"
	Content string `json:"content"` // message text
}
|
|
|
|
type chatRequest struct {
|
|
Model string `json:"model"`
|
|
Messages []chatMessage `json:"messages"`
|
|
Stream bool `json:"stream"`
|
|
}
|
|
|
|
// chatResponse models the only part of the chat-completions reply this worker
// reads: the message content of each returned choice.
type chatResponse struct {
	// Choices lists the completions returned by the model; only the first
	// one is consumed by extractEntities.
	Choices []struct {
		Message struct {
			Content string `json:"content"` // raw model output
		} `json:"message"`
	} `json:"choices"`
}
|
|
|
|
// Entity is one named entity returned by the LLM.
type Entity struct {
	Name string `json:"name"` // entity name as given by the model
	Type string `json:"type"` // entity type label, e.g. "Person" or "Event"

	// TemporalHint is free-text time information; it is omitted from JSON
	// when empty.
	TemporalHint string `json:"temporal_hint,omitempty"`
}
|
|
|
|
// ExtractedRelation is a directed, typed relation between two entities,
// identified by name.
type ExtractedRelation struct {
	From string `json:"from"` // name of the source entity
	To   string `json:"to"`   // name of the target entity
	Rel  string `json:"rel"`  // relation type label, e.g. "MEMBER_OF"
}
|
|
|
|
type ExtractionResult struct {
|
|
Entities []Entity `json:"entities"`
|
|
Relations []ExtractedRelation `json:"relations"`
|
|
}
|
|
|
|
// trailingCommaRe matches commas immediately before a closing brace or bracket.
// repairJSON uses it only as a cheap pre-check; the rewrite itself is done by a
// string-aware scan.
var trailingCommaRe = regexp.MustCompile(`,\s*([}\]])`)

// repairJSON fixes a common LLM JSON mistake: a trailing comma before a
// closing brace or bracket. The comma and any whitespace between it and the
// closer are dropped.
//
// Only commas outside JSON string literals are touched, so a value such as
// "x, ]" keeps its text intact. Input without any candidate trailing comma is
// returned unchanged.
func repairJSON(s string) string {
	if !trailingCommaRe.MatchString(s) {
		return s
	}

	var buf strings.Builder
	buf.Grow(len(s))

	inString, escaped := false, false
	for i := 0; i < len(s); i++ {
		c := s[i]

		if inString {
			buf.WriteByte(c)
			switch {
			case escaped:
				escaped = false
			case c == '\\':
				escaped = true
			case c == '"':
				inString = false
			}
			continue
		}

		switch c {
		case '"':
			inString = true
		case ',':
			// Look past whitespace for a closing brace or bracket.
			j := i + 1
			for j < len(s) {
				switch s[j] {
				case ' ', '\t', '\n', '\r', '\f':
					j++
					continue
				}
				break
			}
			if j < len(s) && (s[j] == '}' || s[j] == ']') {
				i = j - 1 // resume at the closer, skipping comma and whitespace
				continue
			}
		}
		buf.WriteByte(c)
	}
	return buf.String()
}
|
|
|
|
// fixUnicodeEscapes removes \uXXXX sequences where the four chars aren't valid hex.
// Models sometimes emit \u201g (g is not hex) which makes Go's JSON parser fail.
//
// Well-formed \uXXXX escapes and every other backslash pair are copied
// through untouched. In particular an escaped backslash followed by 'u'
// (as in "C:\\users") is not mistaken for a unicode escape.
//
// For a malformed escape the "\u" and up to four following ASCII letters or
// digits are dropped, so the stray digits do not end up inside entity names.
// Removal stops at the first other byte, which keeps a closing quote that
// directly follows a short escape such as \u20".
func fixUnicodeEscapes(s string) string {
	var buf strings.Builder
	buf.Grow(len(s))

	for i := 0; i < len(s); {
		// Ordinary byte, or a lone backslash at the very end.
		if s[i] != '\\' || i+1 >= len(s) {
			buf.WriteByte(s[i])
			i++
			continue
		}

		// Any escape other than \u: consume it as a pair so that the second
		// byte (possibly another backslash) is never re-read as an escape start.
		if s[i+1] != 'u' {
			buf.WriteString(s[i : i+2])
			i += 2
			continue
		}

		if i+6 <= len(s) && isHexByte(s[i+2]) && isHexByte(s[i+3]) && isHexByte(s[i+4]) && isHexByte(s[i+5]) {
			buf.WriteString(s[i : i+6])
			i += 6
			continue
		}

		// Malformed or truncated escape: drop "\u" plus its would-be digits.
		i += 2
		for n := 0; n < 4 && i < len(s) && isAlnumByte(s[i]); n++ {
			i++
		}
	}
	return buf.String()
}

// isHexByte reports whether c is an ASCII hexadecimal digit.
func isHexByte(c byte) bool {
	return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')
}

// isAlnumByte reports whether c is an ASCII letter or digit.
func isAlnumByte(c byte) bool {
	return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
}
|
|
|
|
// stripFences removes markdown code fences that models often wrap JSON in.
//
// The text between the first and the last ``` marker is returned, with an
// optional "json" language tag (any letter case) and surrounding whitespace
// removed. Prose before the opening fence or after the closing fence is
// therefore discarded. With a single marker: at the start it is treated as an
// unclosed opening fence, elsewhere as a stray closing fence. Input without
// any fence is returned trimmed.
func stripFences(s string) string {
	const fence = "```"

	s = strings.TrimSpace(s)
	first := strings.Index(s, fence)
	if first < 0 {
		return s
	}
	last := strings.LastIndex(s, fence)
	if last < first+len(fence) {
		// The markers overlap (e.g. four backticks in a row): only one fence.
		last = first
	}

	var body string
	switch {
	case first != last:
		body = s[first+len(fence) : last]
	case first == 0:
		body = s[len(fence):]
	default:
		// Closing fence only: keep what precedes it.
		return strings.TrimSpace(s[:first])
	}

	if len(body) >= 4 && strings.EqualFold(body[:4], "json") {
		body = body[4:]
	}
	return strings.TrimSpace(body)
}
|
|
|
|
// salvageEntities tries to recover just the entities array when relations are malformed.
|
|
func salvageEntities(raw string) *ExtractionResult {
|
|
idx := strings.Index(raw, `"relations"`)
|
|
if idx < 0 {
|
|
return nil
|
|
}
|
|
truncated := strings.TrimRight(raw[:idx], ", \t\n\r") + `,"relations":[]}`
|
|
var rr rawExtractionResult
|
|
if err := json.Unmarshal([]byte(truncated), &rr); err != nil || len(rr.Entities) == 0 {
|
|
return nil
|
|
}
|
|
slog.Warn("salvaged entities only — relations were malformed", "entities", len(rr.Entities))
|
|
return &ExtractionResult{Entities: rr.Entities}
|
|
}
|
|
|
|
// rawRelation is the lenient wire form of a relation. The from/to endpoints
// are kept as raw JSON because models emit them as a string, a {"name":"..."}
// object, or a ["..."] array; coerceString resolves them to a plain name.
type rawRelation struct {
	From json.RawMessage `json:"from"` // undecoded source endpoint
	To   json.RawMessage `json:"to"`   // undecoded target endpoint
	Rel  string          `json:"rel"`  // relation type label
}
|
|
|
|
// coerceString reduces a lenient JSON endpoint value to a plain name.
//
// Accepted shapes, tried in this order:
//   - a JSON string (or null): returned as decoded, without trimming;
//   - an object with a non-empty "name": the name with leading and trailing
//     '*', '_' and space characters removed;
//   - a non-empty array: its first element, resolved by the same rules.
//
// Anything else yields "".
func coerceString(raw json.RawMessage) string {
	for {
		var text string
		if err := json.Unmarshal(raw, &text); err == nil {
			return text
		}

		var named struct {
			Name string `json:"name"`
		}
		if err := json.Unmarshal(raw, &named); err == nil && named.Name != "" {
			return strings.Trim(named.Name, "*_ ")
		}

		var items []json.RawMessage
		if err := json.Unmarshal(raw, &items); err != nil || len(items) == 0 {
			return ""
		}
		// Descend into the first array element and try again.
		raw = items[0]
	}
}
|
|
|
|
type rawExtractionResult struct {
|
|
Entities []Entity `json:"entities"`
|
|
Relations []rawRelation `json:"relations"`
|
|
}
|
|
|
|
// loadKnownEntities queries the graph for already-established entity names and
|
|
// returns a formatted hint block. Injecting this into the LLM prompt anchors
|
|
// extraction to canonical spellings, preventing "the Timeless" vs
|
|
// "Gromm The Timeless" duplicates and hallucinated location names.
|
|
func loadKnownEntities(ctx context.Context, driver neo4j.DriverWithContext) string {
|
|
session := driver.NewSession(ctx, neo4j.SessionConfig{})
|
|
defer session.Close(ctx)
|
|
|
|
result, err := session.ExecuteRead(ctx, func(tx neo4j.ManagedTransaction) (any, error) {
|
|
res, err := tx.Run(ctx, `
|
|
MATCH (d:LoreDocument)-[:FEATURES]->(e)
|
|
WHERE e.name IS NOT NULL
|
|
WITH e.name AS name,
|
|
[l IN labels(e) WHERE l IN ['Person','Location','Faction','Event','Item','Creature']][0] AS etype,
|
|
count(d) AS mentions
|
|
WHERE etype IS NOT NULL
|
|
RETURN name, etype ORDER BY mentions DESC LIMIT 100
|
|
`, nil)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
byType := map[string][]string{}
|
|
for res.Next(ctx) {
|
|
name, _ := res.Record().Get("name")
|
|
etype, _ := res.Record().Get("etype")
|
|
n, _ := name.(string)
|
|
t, _ := etype.(string)
|
|
if n != "" && t != "" {
|
|
byType[t] = append(byType[t], n)
|
|
}
|
|
}
|
|
return byType, res.Err()
|
|
})
|
|
if err != nil || result == nil {
|
|
return ""
|
|
}
|
|
|
|
byType, ok := result.(map[string][]string)
|
|
if !ok || len(byType) == 0 {
|
|
return ""
|
|
}
|
|
|
|
var sb strings.Builder
|
|
sb.WriteString("\nKnown canonical entity names already in this campaign's graph — use these exact spellings whenever the passage refers to them, even by nickname or title:\n")
|
|
for _, t := range []string{"Person", "Location", "Faction", "Event", "Item", "Creature"} {
|
|
if names, ok := byType[t]; ok && len(names) > 0 {
|
|
sb.WriteString(fmt.Sprintf(" %s: %s\n", t, strings.Join(names, ", ")))
|
|
}
|
|
}
|
|
return sb.String()
|
|
}
|
|
|
|
func extractEntities(ctx context.Context, cfg Config, systemPrompt, title, content, knownEntities string) (*ExtractionResult, error) {
|
|
userMsg := fmt.Sprintf("Document title: %s\n\nPassage:\n%s", title, content)
|
|
if knownEntities != "" {
|
|
userMsg += "\n" + knownEntities
|
|
}
|
|
|
|
payload := chatRequest{
|
|
Model: cfg.LLMModel,
|
|
Messages: []chatMessage{
|
|
{Role: "system", Content: systemPrompt},
|
|
{Role: "user", Content: userMsg},
|
|
},
|
|
Stream: false,
|
|
}
|
|
|
|
body, _ := json.Marshal(payload)
|
|
req, err := http.NewRequestWithContext(ctx, http.MethodPost,
|
|
cfg.LLMURL+"/v1/chat/completions", bytes.NewReader(body))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
req.Header.Set("Content-Type", "application/json")
|
|
|
|
resp, err := httpClient.Do(req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
var cr chatResponse
|
|
if err := json.NewDecoder(resp.Body).Decode(&cr); err != nil {
|
|
return nil, err
|
|
}
|
|
if len(cr.Choices) == 0 {
|
|
return nil, fmt.Errorf("empty LLM response")
|
|
}
|
|
|
|
raw := cr.Choices[0].Message.Content
|
|
raw = stripFences(raw)
|
|
raw = fixUnicodeEscapes(raw)
|
|
raw = repairJSON(raw)
|
|
|
|
var rr rawExtractionResult
|
|
if err := json.Unmarshal([]byte(raw), &rr); err != nil {
|
|
// Relations are often malformed; try salvaging just entities.
|
|
if result := salvageEntities(raw); result != nil {
|
|
return result, nil
|
|
}
|
|
slog.Warn("LLM returned non-JSON", "raw", cr.Choices[0].Message.Content)
|
|
return &ExtractionResult{}, nil
|
|
}
|
|
|
|
result := ExtractionResult{Entities: rr.Entities}
|
|
for _, r := range rr.Relations {
|
|
from, to := coerceString(r.From), coerceString(r.To)
|
|
if from != "" && to != "" && r.Rel != "" {
|
|
result.Relations = append(result.Relations, ExtractedRelation{From: from, To: to, Rel: r.Rel})
|
|
}
|
|
}
|
|
return &result, nil
|
|
}
|
|
|
|
// ── Neo4j write ───────────────────────────────────────────────────────────────
|
|
|
|
// mergeLoreEntities links extracted lore entities to the source LoreDocument.
// For each element of $entities it merges a node by name, marks it
// lore_verified, copies temporal_hint onto the node when the entity carries a
// non-empty one, applies the entity type as a label via APOC, and merges a
// FEATURES relationship from the document. Nodes labelled LoreDocument,
// LoreChunk, Chunk, Message or Encounter are filtered out before labelling
// and linking.
//
// Parameters: $docID, $entities (list of maps with name, type, temporal_hint).
const mergeLoreEntities = `
MERGE (d:LoreDocument {id: $docID})
WITH d
UNWIND $entities AS ent
MERGE (e {name: ent.name})
ON CREATE SET e.type = ent.type, e.source = "lore", e.lore_verified = true
ON MATCH SET e.lore_verified = true
WITH d, e, ent
WHERE NOT (e:LoreDocument OR e:LoreChunk OR e:Chunk OR e:Message OR e:Encounter)
FOREACH (_ IN CASE WHEN ent.temporal_hint IS NOT NULL AND ent.temporal_hint <> "" THEN [1] ELSE [] END |
  SET e.temporal_hint = ent.temporal_hint
)
WITH d, e, ent
CALL apoc.create.addLabels(e, [ent.type]) YIELD node
MERGE (d)-[:FEATURES]->(node)
`
|
|
|
|
// applyAliasesCypher sets the aliases array on the primary entity of a lore
// document (the entity whose name matches the document title). Also sets
// lore_verified in case the entity existed before its lore doc was ingested.
//
// Structural nodes are excluded with the same label list that
// mergeLoreEntities uses (LoreDocument, LoreChunk, Chunk, Message,
// Encounter), so a non-entity node that happens to share the name is never
// stamped as a verified lore entity.
//
// Parameters: $name, $aliases.
const applyAliasesCypher = `
MATCH (e)
WHERE e.name = $name
AND NOT e:LoreDocument AND NOT e:LoreChunk AND NOT e:Chunk
AND NOT e:Message AND NOT e:Encounter
SET e.aliases = $aliases, e.lore_verified = true
`
|
|
|
|
// mergeLoreRelation creates or updates a typed relation between two lore
// entities matched by name. The relationship type is supplied at run time, so
// the merge goes through apoc.merge.relationship. On every run the
// relationship's since and doc_id properties are set to the current
// document's values.
//
// Parameters: $from, $to, $rel, $uploadedAt, $docID.
const mergeLoreRelation = `
MATCH (a {name: $from})
MATCH (b {name: $to})
WITH a, b
CALL apoc.merge.relationship(a, $rel, {}, {}, b) YIELD rel
SET rel.since = $uploadedAt,
    rel.doc_id = $docID
RETURN rel
`
|
|
|
|
// detectContradictionsQuery finds cases where this document's LOCATED_AT or
// RULES claims conflict with a prior document's claim about the same entity,
// and creates an explicit Contradiction node linking all parties. A conflict
// is a relationship of the same type from the same subject, recorded under a
// different doc_id, that points at a differently named target. It returns the
// number of contradiction rows as "total".
//
// Parameters: $docID, $detectedAt.
const detectContradictionsQuery = `
MATCH (a)-[r1]->(x)
WHERE r1.doc_id = $docID AND type(r1) IN ['LOCATED_AT', 'RULES']
WITH a, type(r1) AS predicate, x.name AS claimA, r1.doc_id AS docA
MATCH (a)-[r2]->(y)
WHERE type(r2) = predicate AND r2.doc_id <> docA AND y.name <> claimA
WITH a, predicate, claimA, docA, y.name AS claimB, r2.doc_id AS docB
MERGE (contra:Contradiction {
  subject: a.name,
  predicate: predicate,
  claim_a: claimA,
  doc_a: docA,
  claim_b: claimB,
  doc_b: docB
})
ON CREATE SET contra.detected_at = $detectedAt, contra.flagged = true
WITH a, contra
MERGE (a)-[:HAS_CONTRADICTION]->(contra)
RETURN count(contra) AS total
`
|
|
|
|
func writeToGraph(ctx context.Context, session neo4j.SessionWithContext,
|
|
docID, title, uploadedAt string, result *ExtractionResult) error {
|
|
|
|
if len(result.Entities) == 0 {
|
|
return nil
|
|
}
|
|
|
|
entities := make([]map[string]any, len(result.Entities))
|
|
for i, e := range result.Entities {
|
|
entities[i] = map[string]any{
|
|
"name": e.Name,
|
|
"type": e.Type,
|
|
"temporal_hint": e.TemporalHint,
|
|
}
|
|
}
|
|
|
|
_, err := session.ExecuteWrite(ctx, func(tx neo4j.ManagedTransaction) (any, error) {
|
|
_, err := tx.Run(ctx, mergeLoreEntities, map[string]any{
|
|
"docID": docID,
|
|
"entities": entities,
|
|
})
|
|
return nil, err
|
|
})
|
|
if err != nil {
|
|
return fmt.Errorf("merge lore entities: %w", err)
|
|
}
|
|
|
|
for _, rel := range result.Relations {
|
|
_, err := session.ExecuteWrite(ctx, func(tx neo4j.ManagedTransaction) (any, error) {
|
|
_, err := tx.Run(ctx, mergeLoreRelation, map[string]any{
|
|
"from": rel.From,
|
|
"to": rel.To,
|
|
"rel": rel.Rel,
|
|
"uploadedAt": uploadedAt,
|
|
"docID": docID,
|
|
})
|
|
return nil, err
|
|
})
|
|
if err != nil {
|
|
slog.Warn("skipped lore relation", "from", rel.From, "to", rel.To, "rel", rel.Rel, "err", err)
|
|
}
|
|
}
|
|
|
|
flagContradictions(ctx, session, docID, uploadedAt)
|
|
return nil
|
|
}
|
|
|
|
func applyDocumentAliases(ctx context.Context, session neo4j.SessionWithContext, primaryEntity string, aliases []string) {
|
|
if primaryEntity == "" || len(aliases) == 0 {
|
|
return
|
|
}
|
|
_, err := session.ExecuteWrite(ctx, func(tx neo4j.ManagedTransaction) (any, error) {
|
|
_, err := tx.Run(ctx, applyAliasesCypher, map[string]any{
|
|
"name": primaryEntity,
|
|
"aliases": aliases,
|
|
})
|
|
return nil, err
|
|
})
|
|
if err != nil {
|
|
slog.Warn("failed to apply aliases", "entity", primaryEntity, "err", err)
|
|
} else {
|
|
slog.Info("applied aliases to entity", "entity", primaryEntity, "aliases", aliases)
|
|
}
|
|
}
|
|
|
|
func flagContradictions(ctx context.Context, session neo4j.SessionWithContext, docID, detectedAt string) {
|
|
result, err := session.ExecuteWrite(ctx, func(tx neo4j.ManagedTransaction) (any, error) {
|
|
res, err := tx.Run(ctx, detectContradictionsQuery, map[string]any{
|
|
"docID": docID,
|
|
"detectedAt": detectedAt,
|
|
})
|
|
if err != nil {
|
|
return int64(0), err
|
|
}
|
|
if res.Next(ctx) {
|
|
total, _ := res.Record().Get("total")
|
|
return total, res.Err()
|
|
}
|
|
return int64(0), res.Err()
|
|
})
|
|
if err != nil {
|
|
slog.Warn("contradiction detection failed", "doc_id", docID, "err", err)
|
|
return
|
|
}
|
|
if n, ok := result.(int64); ok && n > 0 {
|
|
slog.Info("contradictions flagged", "doc_id", docID, "count", n)
|
|
}
|
|
}
|
|
|
|
// ── Main ──────────────────────────────────────────────────────────────────────
|
|
|
|
func main() {
|
|
cfg := configFromEnv()
|
|
slog.SetDefault(slog.New(slog.NewJSONHandler(os.Stdout, nil)))
|
|
|
|
systemPrompt := loadPrompt(cfg)
|
|
|
|
ctx := context.Background()
|
|
|
|
rOpts, err := redis.ParseURL(cfg.RedisURL)
|
|
if err != nil {
|
|
slog.Error("invalid redis URL", "err", err)
|
|
os.Exit(1)
|
|
}
|
|
rdb := redis.NewClient(rOpts)
|
|
rdb.XGroupCreateMkStream(ctx, cfg.Stream, cfg.Group, "0").Err()
|
|
|
|
driver, err := neo4j.NewDriverWithContext(cfg.Neo4jURL,
|
|
neo4j.BasicAuth(cfg.Neo4jUser, cfg.Neo4jPass, ""))
|
|
if err != nil {
|
|
slog.Error("neo4j driver error", "err", err)
|
|
os.Exit(1)
|
|
}
|
|
defer driver.Close(ctx)
|
|
|
|
slog.Info("lore-extractor started", "stream", cfg.Stream, "group", cfg.Group)
|
|
|
|
// Reclaim any messages delivered but not ACK'd before last shutdown.
|
|
// Bounded to maxRecoveryPasses so a persistently failing message
|
|
// (e.g. LLM not yet ready) does not block the live loop on startup.
|
|
const maxRecoveryPasses = 5
|
|
for pass := 0; pass < maxRecoveryPasses; pass++ {
|
|
results, err := rdb.XReadGroup(ctx, &redis.XReadGroupArgs{
|
|
Group: cfg.Group,
|
|
Consumer: cfg.Consumer,
|
|
Streams: []string{cfg.Stream, "0"},
|
|
Count: 3,
|
|
}).Result()
|
|
if err != nil || len(results) == 0 || len(results[0].Messages) == 0 {
|
|
break
|
|
}
|
|
for _, msg := range results[0].Messages {
|
|
slog.Info("reprocessing pending message", "id", msg.ID)
|
|
if err := processMessage(ctx, cfg, systemPrompt, driver, msg); err != nil {
|
|
slog.Error("lore extraction failed (pending)", "id", msg.ID, "err", err)
|
|
continue
|
|
}
|
|
rdb.XAck(ctx, cfg.Stream, cfg.Group, msg.ID)
|
|
}
|
|
}
|
|
|
|
for {
|
|
results, err := rdb.XReadGroup(ctx, &redis.XReadGroupArgs{
|
|
Group: cfg.Group,
|
|
Consumer: cfg.Consumer,
|
|
Streams: []string{cfg.Stream, ">"},
|
|
Count: 3,
|
|
Block: 5 * time.Second,
|
|
}).Result()
|
|
|
|
if err == redis.Nil {
|
|
continue
|
|
}
|
|
if err != nil {
|
|
slog.Error("redis read error", "err", err)
|
|
time.Sleep(2 * time.Second)
|
|
continue
|
|
}
|
|
|
|
for _, stream := range results {
|
|
for _, msg := range stream.Messages {
|
|
if err := processMessage(ctx, cfg, systemPrompt, driver, msg); err != nil {
|
|
slog.Error("lore extraction failed", "id", msg.ID, "err", err)
|
|
continue
|
|
}
|
|
rdb.XAck(ctx, cfg.Stream, cfg.Group, msg.ID)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
func processMessage(ctx context.Context, cfg Config, systemPrompt string,
|
|
driver neo4j.DriverWithContext, msg redis.XMessage) error {
|
|
|
|
vals := msg.Values
|
|
docID := strVal(vals, "id", msg.ID)
|
|
title := strVal(vals, "title", "Untitled")
|
|
content := strVal(vals, "content", "")
|
|
uploadedAt := strVal(vals, "uploaded_at", time.Now().UTC().Format(time.RFC3339))
|
|
primaryEntity := strVal(vals, "primary_entity", title)
|
|
aliasesJSON := strVal(vals, "aliases", "[]")
|
|
var aliases []string
|
|
json.Unmarshal([]byte(aliasesJSON), &aliases) //nolint:errcheck
|
|
|
|
if content == "" {
|
|
return nil
|
|
}
|
|
|
|
knownEntities := loadKnownEntities(ctx, driver)
|
|
result, err := extractEntities(ctx, cfg, systemPrompt, title, content, knownEntities)
|
|
if err != nil {
|
|
return fmt.Errorf("LLM extraction: %w", err)
|
|
}
|
|
|
|
session := driver.NewSession(ctx, neo4j.SessionConfig{})
|
|
defer session.Close(ctx)
|
|
|
|
if err := writeToGraph(ctx, session, docID, title, uploadedAt, result); err != nil {
|
|
return fmt.Errorf("write to graph: %w", err)
|
|
}
|
|
|
|
applyDocumentAliases(ctx, session, primaryEntity, aliases)
|
|
|
|
slog.Info("processed lore document", "doc_id", docID, "title", title,
|
|
"entities", len(result.Entities), "relations", len(result.Relations))
|
|
return nil
|
|
}
|
|
|
|
// ── Helpers ───────────────────────────────────────────────────────────────────
|
|
|
|
// getEnv returns the value of environment variable key, or fallback when the
// variable is unset or set to the empty string.
func getEnv(key, fallback string) string {
	value := os.Getenv(key)
	if value == "" {
		return fallback
	}
	return value
}
|
|
|
|
// strVal returns m[key] when it is present and holds a string (including the
// empty string); otherwise it returns fallback.
func strVal(m map[string]any, key, fallback string) string {
	// Asserting on a missing key yields ok == false, same as a non-string value.
	s, isString := m[key].(string)
	if !isString {
		return fallback
	}
	return s
}
|