lore-engine-poc/workers/lore-extractor/main.go

package main

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"log/slog"
	"net/http"
	"os"
	"regexp"
	"strings"
	"time"

	"github.com/neo4j/neo4j-go-driver/v5/neo4j"
	"github.com/redis/go-redis/v9"
)

// httpClient is the shared HTTP client used for LLM requests. Its 30-second
// timeout caps each request, including reading the response body.
var httpClient = &http.Client{Timeout: 30 * time.Second}

// ── Config ────────────────────────────────────────────────────────────────────

// Config holds the worker's runtime settings: the Redis connection and the
// stream / consumer-group / consumer names it reads with, the Neo4j connection
// and credentials, the LLM base URL and model name, and an optional path to a
// file whose contents replace the built-in system prompt.
type Config struct {
	RedisURL   string
	Stream     string
	Group      string
	Consumer   string
	Neo4jURL   string
	Neo4jUser  string
	Neo4jPass  string
	LLMURL     string
	LLMModel   string
	PromptFile string
}

// configFromEnv assembles a Config from environment variables. Every setting
// has a built-in default that applies when its variable is unset or empty;
// PROMPT_FILE defaults to empty, meaning "use the built-in prompt".
func configFromEnv() Config {
	var cfg Config

	// Redis stream consumption.
	cfg.RedisURL = getEnv("REDIS_URL", "redis://redis:6379")
	cfg.Stream = getEnv("REDIS_STREAM", "raw.lore")
	cfg.Group = getEnv("REDIS_GROUP", "lore-extraction")
	cfg.Consumer = getEnv("CONSUMER_NAME", "lore-extractor-1")

	// Graph database.
	cfg.Neo4jURL = getEnv("NEO4J_URL", "bolt://neo4j:7687")
	cfg.Neo4jUser = getEnv("NEO4J_USER", "neo4j")
	cfg.Neo4jPass = getEnv("NEO4J_PASSWORD", "changeme")

	// LLM endpoint and prompt override.
	cfg.LLMURL = getEnv("LLM_URL", "http://ollama-cpu:11435")
	cfg.LLMModel = getEnv("LLM_MODEL", "qwen2.5:3b")
	cfg.PromptFile = getEnv("PROMPT_FILE", "")

	return cfg
}

// ── System prompt ─────────────────────────────────────────────────────────────

// defaultSystemPrompt is the built-in system prompt sent to the LLM. loadPrompt
// returns it when no PROMPT_FILE is configured or the file cannot be read. It
// spells out the JSON output shape, the allowed entity and relation labels,
// and the extraction rules (including when temporal_hint must be present).
const defaultSystemPrompt = `You are a lore entity extraction engine for a D&D campaign knowledge graph. Given a passage of lore text (a biography, story, history, or worldbuilding document), extract named entities and the relationships between them.

Return ONLY valid JSON in this exact shape, no other text:
{
  "entities": [
    {"name": "Theron Ashveil", "type": "Person"},
    {"name": "The Iron Council", "type": "Faction"},
    {"name": "Thornwall Keep", "type": "Location"},
    {"name": "Siege of Thornwall", "type": "Event", "temporal_hint": "Year 340 of the Third Age"},
    {"name": "Sword of Eventide", "type": "Item"},
    {"name": "Ancient Red Dragon", "type": "Creature"}
  ],
  "relations": [
    {"from": "Theron Ashveil", "to": "The Iron Council", "rel": "MEMBER_OF"},
    {"from": "Siege of Thornwall", "to": "Thornwall Keep", "rel": "OCCURRED_AT"},
    {"from": "Theron Ashveil", "to": "Sword of Eventide", "rel": "POSSESSES"}
  ]
}

Entity types (use exactly these labels):
  Person    — a named character, NPC, deity, or historical figure in the lore
  Location  — a named place, dungeon, city, region, landmark, realm, or geographic feature
  Event     — a named historical event, battle, ceremony, meeting, or significant occurrence
  Faction   — a named guild, kingdom, order, cult, party, or group of people
  Item      — a named weapon, artifact, magical item, relic, or significant object
  Creature  — a named or typed monster, beast, or non-person entity (e.g. "Ancient Red Dragon", "The Pale Worm")

Relation types (use exactly these labels):
  PARTICIPATED_IN — Person or Faction took part in an Event
  OCCURRED_AT     — Event took place at a Location
  LOCATED_AT      — Person, Faction, Item, or Creature is found at or in a Location
  RULES           — Person or Faction governs or controls a Location or Faction
  MEMBER_OF       — Person belongs to a Faction
  ALLIED_WITH     — Person or Faction is allied with another Person or Faction
  ENEMY_OF        — Person or Faction is opposed to another Person or Faction
  POSSESSES       — Person or Faction holds or owns an Item
  SEEKS           — Person or Faction is actively looking for a Person, Item, or Location
  KNOWS           — two Persons have a relationship or acquaintance
  PRECEDED        — this Event preceded another Event chronologically
  CREATED_BY      — Item or Location was made or founded by a Person or Faction

Rules:
- Only extract entities that are explicitly named in the text.
- Choose the most specific relation type that fits; omit a relation rather than guessing.
- Omit entities or relations you are not confident about.
- Do not invent names or relationships not present in the text.
- temporal_hint: for every Event entity this field is REQUIRED. Use the best time information available in the text — a calendar year, a named age, a relative phrase like "shortly after the Fall of Thornwall", or "sometime during the Dusk War". Write "unknown era" only as an absolute last resort. For non-Event entities include temporal_hint only when the text explicitly states when they were active, founded, created, or died.`

// loadPrompt returns the system prompt to send to the LLM. With no PromptFile
// configured it returns the built-in prompt; otherwise it returns the file's
// contents verbatim, falling back to the built-in prompt (with a warning) if
// the file cannot be read.
func loadPrompt(cfg Config) string {
	path := cfg.PromptFile
	if path == "" {
		return defaultSystemPrompt
	}

	contents, readErr := os.ReadFile(path)
	if readErr == nil {
		slog.Info("loaded system prompt from file", "file", path)
		return string(contents)
	}

	slog.Warn("could not read PROMPT_FILE, using default", "file", path, "err", readErr)
	return defaultSystemPrompt
}

// ── LLM entity extraction ─────────────────────────────────────────────────────

// chatMessage is a single role-tagged message in a chat completion request.
type chatMessage struct {
	Role    string `json:"role"`
	Content string `json:"content"`
}

// chatRequest is the JSON body posted to the LLM's /v1/chat/completions
// endpoint: the model name, the ordered messages, and the streaming flag.
type chatRequest struct {
	Model    string        `json:"model"`
	Messages []chatMessage `json:"messages"`
	Stream   bool          `json:"stream"`
}

// chatResponse decodes only the part of the completion response this worker
// reads: the message content of each returned choice.
type chatResponse struct {
	Choices []struct {
		Message struct {
			Content string `json:"content"`
		} `json:"message"`
	} `json:"choices"`
}

// Entity is one named entity extracted by the LLM. TemporalHint is optional
// and is omitted from JSON output when empty.
type Entity struct {
	Name         string `json:"name"`
	Type         string `json:"type"`
	TemporalHint string `json:"temporal_hint,omitempty"`
}

// ExtractedRelation is a directed, typed relation between two entity names,
// with both endpoints already normalized to plain strings.
type ExtractedRelation struct {
	From string `json:"from"`
	To   string `json:"to"`
	Rel  string `json:"rel"`
}

// ExtractionResult is the normalized output of one LLM extraction: the
// entities found in a passage and the relations between them.
type ExtractionResult struct {
	Entities  []Entity            `json:"entities"`
	Relations []ExtractedRelation `json:"relations"`
}

// trailingCommaRe matches commas immediately before a closing brace or bracket.
// repairJSON uses it only as a cheap pre-check; the actual repair is
// string-aware.
var trailingCommaRe = regexp.MustCompile(`,\s*([}\]])`)

// repairJSON fixes a common LLM JSON mistake: a trailing comma before a
// closing brace or bracket. The comma and any whitespace between it and the
// closer are removed.
//
// Only commas outside JSON string literals are touched, so a value such as
// "Gate, ]" is left intact. Input without any candidate trailing comma is
// returned unchanged.
func repairJSON(s string) string {
	// Fast path: nothing in the text even resembles a trailing comma.
	if !trailingCommaRe.MatchString(s) {
		return s
	}

	var out strings.Builder
	out.Grow(len(s))

	inString := false // currently inside a "..." literal
	escaped := false  // previous byte inside the literal was a backslash
	for i := 0; i < len(s); i++ {
		c := s[i]

		if inString {
			out.WriteByte(c)
			switch {
			case escaped:
				escaped = false
			case c == '\\':
				escaped = true
			case c == '"':
				inString = false
			}
			continue
		}

		switch c {
		case '"':
			inString = true
		case ',':
			// Look past whitespace (same set as the regexp's \s) for a closer.
			j := i + 1
			for j < len(s) && strings.IndexByte(" \t\n\f\r", s[j]) >= 0 {
				j++
			}
			if j < len(s) && (s[j] == '}' || s[j] == ']') {
				// Drop the comma and the whitespace; resume at the closer.
				i = j - 1
				continue
			}
		}
		out.WriteByte(c)
	}
	return out.String()
}

// fixUnicodeEscapes removes \uXXXX sequences where the four chars aren't valid hex.
// Models sometimes emit \u201g (g is not hex) which makes Go's JSON parser fail.
//
// Well-formed \uXXXX escapes and every other two-byte escape are copied
// through untouched. Escapes are consumed as pairs, so an escaped backslash
// followed by a literal 'u' (as in "C:\\users") is not mistaken for a unicode
// escape. For a malformed escape, the "\u" and up to four following bytes are
// dropped, but the skip stops early at a double quote or backslash so that a
// truncated escape cannot swallow the string terminator or the next escape.
func fixUnicodeEscapes(s string) string {
	var buf strings.Builder
	buf.Grow(len(s))
	for i := 0; i < len(s); {
		// Ordinary byte, or a lone backslash at the very end of the input.
		if s[i] != '\\' || i+1 >= len(s) {
			buf.WriteByte(s[i])
			i++
			continue
		}

		// Any escape other than \u is copied as a pair. Consuming both bytes
		// keeps the second byte of "\\" from being read as the start of an escape.
		if s[i+1] != 'u' {
			buf.WriteString(s[i : i+2])
			i += 2
			continue
		}

		if i+6 <= len(s) && isHexByte(s[i+2]) && isHexByte(s[i+3]) && isHexByte(s[i+4]) && isHexByte(s[i+5]) {
			buf.WriteString(s[i : i+6])
			i += 6
			continue
		}

		// Invalid or incomplete \uXXXX: drop it so stray hex digits don't get
		// concatenated into entity names, without eating structural bytes.
		j := i + 2
		for j < len(s) && j < i+6 && s[j] != '"' && s[j] != '\\' {
			j++
		}
		i = j
	}
	return buf.String()
}

// isHexByte reports whether c is an ASCII hexadecimal digit (0-9, a-f, A-F).
func isHexByte(c byte) bool {
	return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')
}

// stripFences unwraps model output that was placed inside a markdown code
// fence. After trimming surrounding whitespace it drops one leading fence
// marker ("```json" is tried before a bare "```"), then cuts the text at the
// last remaining "```" if there is one, and trims whitespace again.
func stripFences(s string) string {
	s = strings.TrimSpace(s)

	// Leading fence: the longer marker must be checked first.
	switch {
	case strings.HasPrefix(s, "```json"):
		s = strings.TrimPrefix(s, "```json")
	case strings.HasPrefix(s, "```"):
		s = strings.TrimPrefix(s, "```")
	}

	// Trailing fence: everything from the last marker onward is discarded.
	if end := strings.LastIndex(s, "```"); end >= 0 {
		s = s[:end]
	}

	return strings.TrimSpace(s)
}

// salvageEntities is the fallback for output whose relations section does not
// parse. It cuts the text at the first occurrence of the "relations" key,
// strips trailing commas/whitespace, appends an empty relations array plus the
// closing brace, and decodes the result. It returns nil when the key is
// absent, the patched text still fails to decode, or no entities were found.
func salvageEntities(raw string) *ExtractionResult {
	cut := strings.Index(raw, `"relations"`)
	if cut == -1 {
		return nil
	}

	head := strings.TrimRight(raw[:cut], ", \t\n\r")
	patched := []byte(head + `,"relations":[]}`)

	var parsed rawExtractionResult
	if json.Unmarshal(patched, &parsed) != nil {
		return nil
	}
	if len(parsed.Entities) == 0 {
		return nil
	}

	slog.Warn("salvaged entities only — relations were malformed", "entities", len(parsed.Entities))
	return &ExtractionResult{Entities: parsed.Entities}
}

// rawRelation is the lenient wire form of a relation. From and To are kept as
// raw JSON because models emit them as a string, a {"name":"..."} object, or a
// ["..."] array; coerceString turns any of those into a plain name.
type rawRelation struct {
	From json.RawMessage `json:"from"`
	To   json.RawMessage `json:"to"`
	Rel  string          `json:"rel"`
}

// coerceString reduces a relation endpoint to a plain name. It accepts, in
// order of preference:
//   - a JSON string, returned as-is;
//   - an object with a non-empty "name" field, returned with surrounding
//     '*', '_' and space characters trimmed;
//   - a non-empty array, whose first element is coerced by the same rules.
//
// Anything else yields the empty string.
func coerceString(raw json.RawMessage) string {
	// Each pass either resolves the value or descends into the first element
	// of an array.
	for {
		var plain string
		if err := json.Unmarshal(raw, &plain); err == nil {
			return plain
		}

		var named struct {
			Name string `json:"name"`
		}
		if err := json.Unmarshal(raw, &named); err == nil && named.Name != "" {
			return strings.Trim(named.Name, "*_ ")
		}

		var items []json.RawMessage
		if err := json.Unmarshal(raw, &items); err != nil || len(items) == 0 {
			return ""
		}
		raw = items[0]
	}
}

// rawExtractionResult is the decode target for the LLM's JSON output. Entities
// decode directly; relations stay in their lenient rawRelation form until the
// endpoints are normalized.
type rawExtractionResult struct {
	Entities  []Entity      `json:"entities"`
	Relations []rawRelation `json:"relations"`
}

// loadKnownEntities queries the graph for already-established entity names and
// returns a formatted hint block. Injecting this into the LLM prompt anchors
// extraction to canonical spellings, preventing "the Timeless" vs
// "Gromm The Timeless" duplicates and hallucinated location names.
//
// The query considers nodes that some LoreDocument FEATURES, keeps those with
// one of the six known entity labels, and returns at most the 100 most
// frequently featured names, grouped by type in a fixed order.
//
// The lookup is best-effort: on a query failure a warning is logged and the
// empty string is returned, so extraction proceeds without hints. The empty
// string is also returned when the graph has no matching entities.
func loadKnownEntities(ctx context.Context, driver neo4j.DriverWithContext) string {
	session := driver.NewSession(ctx, neo4j.SessionConfig{})
	defer session.Close(ctx)

	result, err := session.ExecuteRead(ctx, func(tx neo4j.ManagedTransaction) (any, error) {
		res, err := tx.Run(ctx, `
			MATCH (d:LoreDocument)-[:FEATURES]->(e)
			WHERE e.name IS NOT NULL
			WITH e.name AS name,
			     [l IN labels(e) WHERE l IN ['Person','Location','Faction','Event','Item','Creature']][0] AS etype,
			     count(d) AS mentions
			WHERE etype IS NOT NULL
			RETURN name, etype ORDER BY mentions DESC LIMIT 100
		`, nil)
		if err != nil {
			return nil, err
		}
		byType := map[string][]string{}
		for res.Next(ctx) {
			// A missing column or non-string value leaves n/t empty, which
			// the check below skips.
			name, _ := res.Record().Get("name")
			etype, _ := res.Record().Get("etype")
			n, _ := name.(string)
			t, _ := etype.(string)
			if n != "" && t != "" {
				byType[t] = append(byType[t], n)
			}
		}
		return byType, res.Err()
	})
	if err != nil {
		// Surface the failure instead of silently dropping the hints.
		slog.Warn("could not load known entities; extracting without canonical-name hints", "err", err)
		return ""
	}

	byType, ok := result.(map[string][]string)
	if !ok || len(byType) == 0 {
		return ""
	}

	var sb strings.Builder
	sb.WriteString("\nKnown canonical entity names already in this campaign's graph — use these exact spellings whenever the passage refers to them, even by nickname or title:\n")
	for _, t := range []string{"Person", "Location", "Faction", "Event", "Item", "Creature"} {
		if names := byType[t]; len(names) > 0 {
			fmt.Fprintf(&sb, "  %s: %s\n", t, strings.Join(names, ", "))
		}
	}
	return sb.String()
}

// extractEntities asks the LLM to extract entities and relations from one
// lore passage and returns the normalized result.
//
// The user message carries the document title, the passage, and (when
// non-empty) the knownEntities hint block. The reply is cleaned up with
// stripFences, fixUnicodeEscapes and repairJSON before decoding. If decoding
// still fails, salvageEntities is tried; if that also fails, the raw reply is
// logged and an empty result is returned with a nil error, so an unparseable
// reply does not count as a failure. Relations with an empty endpoint or type
// are dropped.
//
// An error is returned when the request cannot be built or sent, the server
// answers with a non-2xx status, the response envelope cannot be decoded, or
// it contains no choices.
func extractEntities(ctx context.Context, cfg Config, systemPrompt, title, content, knownEntities string) (*ExtractionResult, error) {
	userMsg := fmt.Sprintf("Document title: %s\n\nPassage:\n%s", title, content)
	if knownEntities != "" {
		userMsg += "\n" + knownEntities
	}

	payload := chatRequest{
		Model: cfg.LLMModel,
		Messages: []chatMessage{
			{Role: "system", Content: systemPrompt},
			{Role: "user", Content: userMsg},
		},
		Stream: false,
	}

	body, err := json.Marshal(payload)
	if err != nil {
		return nil, fmt.Errorf("encode LLM request: %w", err)
	}
	req, err := http.NewRequestWithContext(ctx, http.MethodPost,
		cfg.LLMURL+"/v1/chat/completions", bytes.NewReader(body))
	if err != nil {
		return nil, err
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := httpClient.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	// Without this check an error body (e.g. model not loaded) would decode to
	// zero choices and surface as a misleading "empty LLM response".
	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		return nil, fmt.Errorf("LLM returned HTTP status %s", resp.Status)
	}

	var cr chatResponse
	if err := json.NewDecoder(resp.Body).Decode(&cr); err != nil {
		return nil, fmt.Errorf("decode LLM response: %w", err)
	}
	if len(cr.Choices) == 0 {
		return nil, fmt.Errorf("empty LLM response")
	}

	raw := cr.Choices[0].Message.Content
	raw = stripFences(raw)
	raw = fixUnicodeEscapes(raw)
	raw = repairJSON(raw)

	var rr rawExtractionResult
	if err := json.Unmarshal([]byte(raw), &rr); err != nil {
		// Relations are often malformed; try salvaging just entities.
		if result := salvageEntities(raw); result != nil {
			return result, nil
		}
		slog.Warn("LLM returned non-JSON", "raw", cr.Choices[0].Message.Content)
		return &ExtractionResult{}, nil
	}

	result := ExtractionResult{Entities: rr.Entities}
	for _, r := range rr.Relations {
		from, to := coerceString(r.From), coerceString(r.To)
		if from != "" && to != "" && r.Rel != "" {
			result.Relations = append(result.Relations, ExtractedRelation{From: from, To: to, Rel: r.Rel})
		}
	}
	return &result, nil
}

// ── Neo4j write ───────────────────────────────────────────────────────────────

// mergeLoreEntities upserts the LoreDocument identified by $docID and then,
// for each entry of $entities:
//   - merges a node by name alone (no label), setting type/source on creation
//     and lore_verified on both creation and match;
//   - skips nodes labelled LoreDocument, LoreChunk, Chunk, Message or Encounter;
//   - copies a non-empty temporal_hint onto the node (for any entity type that
//     supplies one, not only Events);
//   - applies ent.type as a label via APOC and links the document to the node
//     with a FEATURES relationship.
//
// NOTE(review): lore_verified is set as part of the MERGE, i.e. before the
// label filter runs, so excluded nodes that match by name still get the flag —
// confirm that is intended.
const mergeLoreEntities = `
MERGE (d:LoreDocument {id: $docID})
WITH d
UNWIND $entities AS ent
  MERGE (e {name: ent.name})
    ON CREATE SET e.type = ent.type, e.source = "lore", e.lore_verified = true
  ON MATCH SET e.lore_verified = true
  WITH d, e, ent
  WHERE NOT (e:LoreDocument OR e:LoreChunk OR e:Chunk OR e:Message OR e:Encounter)
  FOREACH (_ IN CASE WHEN ent.temporal_hint IS NOT NULL AND ent.temporal_hint <> "" THEN [1] ELSE [] END |
    SET e.temporal_hint = ent.temporal_hint
  )
  WITH d, e, ent
  CALL apoc.create.addLabels(e, [ent.type]) YIELD node
  MERGE (d)-[:FEATURES]->(node)
`

// applyAliasesCypher overwrites the aliases array on every node whose name
// equals $name, excluding LoreDocument, LoreChunk and Chunk nodes. It also
// sets lore_verified, in case the entity existed before its lore doc was
// ingested. The caller passes the document's primary entity as $name.
const applyAliasesCypher = `
MATCH (e)
WHERE e.name = $name
  AND NOT e:LoreDocument AND NOT e:LoreChunk AND NOT e:Chunk
SET e.aliases = $aliases, e.lore_verified = true
`

// mergeLoreRelation creates or reuses a relationship of type $rel between the
// nodes named $from and $to (matched by name only, with no label constraint)
// via APOC, then stamps it with since = $uploadedAt and doc_id = $docID. If
// either endpoint does not exist the query matches nothing and writes nothing.
const mergeLoreRelation = `
MATCH (a {name: $from})
MATCH (b {name: $to})
WITH a, b
CALL apoc.merge.relationship(a, $rel, {}, {}, b) YIELD rel
SET rel.since  = $uploadedAt,
    rel.doc_id = $docID
RETURN rel
`

// detectContradictionsQuery finds cases where this document's LOCATED_AT or
// RULES claims conflict with another document's claim about the same entity.
// For each such relationship stamped with $docID it looks for a relationship
// of the same type from the same source node that carries a different doc_id
// and points at a differently named target. Each conflict is MERGEd into a
// Contradiction node keyed by subject, predicate, both claims and both
// document ids (detected_at and flagged are set only on creation), and the
// subject is linked to it with HAS_CONTRADICTION. The query returns the
// number of matched contradictions as "total".
const detectContradictionsQuery = `
MATCH (a)-[r1]->(x)
WHERE r1.doc_id = $docID AND type(r1) IN ['LOCATED_AT', 'RULES']
WITH a, type(r1) AS predicate, x.name AS claimA, r1.doc_id AS docA
MATCH (a)-[r2]->(y)
WHERE type(r2) = predicate AND r2.doc_id <> docA AND y.name <> claimA
WITH a, predicate, claimA, docA, y.name AS claimB, r2.doc_id AS docB
MERGE (contra:Contradiction {
  subject:   a.name,
  predicate: predicate,
  claim_a:   claimA,
  doc_a:     docA,
  claim_b:   claimB,
  doc_b:     docB
})
ON CREATE SET contra.detected_at = $detectedAt, contra.flagged = true
WITH a, contra
MERGE (a)-[:HAS_CONTRADICTION]->(contra)
RETURN count(contra) AS total
`

// writeToGraph persists one extraction result for the document docID.
//
// Entities are written in a single transaction; a failure there is returned
// to the caller. Relations are then written one transaction each, and a
// failing relation is logged and skipped rather than aborting the rest.
// Finally, contradiction detection runs for the document (best-effort).
//
// Entities whose name or type is blank are skipped with a warning. A blank
// name would merge every such entity into one shared node, and a blank type
// cannot serve as a node label; either would otherwise put the whole entity
// batch at risk. When no usable entity remains nothing is written and nil is
// returned. The title parameter is currently unused.
func writeToGraph(ctx context.Context, session neo4j.SessionWithContext,
	docID, title, uploadedAt string, result *ExtractionResult) error {

	entities := make([]map[string]any, 0, len(result.Entities))
	for _, e := range result.Entities {
		if strings.TrimSpace(e.Name) == "" || strings.TrimSpace(e.Type) == "" {
			slog.Warn("skipped lore entity with blank name or type",
				"doc_id", docID, "name", e.Name, "type", e.Type)
			continue
		}
		entities = append(entities, map[string]any{
			"name":          e.Name,
			"type":          e.Type,
			"temporal_hint": e.TemporalHint,
		})
	}
	if len(entities) == 0 {
		return nil
	}

	_, err := session.ExecuteWrite(ctx, func(tx neo4j.ManagedTransaction) (any, error) {
		_, err := tx.Run(ctx, mergeLoreEntities, map[string]any{
			"docID":    docID,
			"entities": entities,
		})
		return nil, err
	})
	if err != nil {
		return fmt.Errorf("merge lore entities: %w", err)
	}

	for _, rel := range result.Relations {
		_, err := session.ExecuteWrite(ctx, func(tx neo4j.ManagedTransaction) (any, error) {
			_, err := tx.Run(ctx, mergeLoreRelation, map[string]any{
				"from":       rel.From,
				"to":         rel.To,
				"rel":        rel.Rel,
				"uploadedAt": uploadedAt,
				"docID":      docID,
			})
			return nil, err
		})
		if err != nil {
			slog.Warn("skipped lore relation", "from", rel.From, "to", rel.To, "rel", rel.Rel, "err", err)
		}
	}

	flagContradictions(ctx, session, docID, uploadedAt)
	return nil
}

// applyDocumentAliases stores aliases on the entity named primaryEntity. It
// does nothing when the name is empty or there are no aliases. The write is
// best-effort: a failure is logged as a warning and not returned.
func applyDocumentAliases(ctx context.Context, session neo4j.SessionWithContext, primaryEntity string, aliases []string) {
	if primaryEntity == "" || len(aliases) == 0 {
		return
	}

	params := map[string]any{
		"name":    primaryEntity,
		"aliases": aliases,
	}
	work := func(tx neo4j.ManagedTransaction) (any, error) {
		_, runErr := tx.Run(ctx, applyAliasesCypher, params)
		return nil, runErr
	}

	if _, err := session.ExecuteWrite(ctx, work); err != nil {
		slog.Warn("failed to apply aliases", "entity", primaryEntity, "err", err)
		return
	}
	slog.Info("applied aliases to entity", "entity", primaryEntity, "aliases", aliases)
}

// flagContradictions runs the contradiction-detection query for docID and
// logs how many contradictions it reported when that number is positive. It
// is best-effort: a query failure is logged as a warning and otherwise
// ignored.
func flagContradictions(ctx context.Context, session neo4j.SessionWithContext, docID, detectedAt string) {
	params := map[string]any{
		"docID":      docID,
		"detectedAt": detectedAt,
	}

	outcome, err := session.ExecuteWrite(ctx, func(tx neo4j.ManagedTransaction) (any, error) {
		rows, runErr := tx.Run(ctx, detectContradictionsQuery, params)
		if runErr != nil {
			return int64(0), runErr
		}
		if !rows.Next(ctx) {
			return int64(0), rows.Err()
		}
		total, _ := rows.Record().Get("total")
		return total, rows.Err()
	})
	if err != nil {
		slog.Warn("contradiction detection failed", "doc_id", docID, "err", err)
		return
	}

	count, isInt := outcome.(int64)
	if isInt && count > 0 {
		slog.Info("contradictions flagged", "doc_id", docID, "count", count)
	}
}

// ── Main ──────────────────────────────────────────────────────────────────────

// main wires up Redis and Neo4j, makes a bounded attempt to reprocess
// messages that were delivered but never acknowledged, and then consumes new
// messages from the stream forever. A message is acknowledged only after
// processMessage succeeds; failures are logged and the message stays pending.
func main() {
	cfg := configFromEnv()
	slog.SetDefault(slog.New(slog.NewJSONHandler(os.Stdout, nil)))

	systemPrompt := loadPrompt(cfg)

	ctx := context.Background()

	rOpts, err := redis.ParseURL(cfg.RedisURL)
	if err != nil {
		slog.Error("invalid redis URL", "err", err)
		os.Exit(1)
	}
	rdb := redis.NewClient(rOpts)

	// BUSYGROUP just means the group already exists. Anything else is worth
	// surfacing, but is not fatal: the read loop below logs and retries.
	if err := rdb.XGroupCreateMkStream(ctx, cfg.Stream, cfg.Group, "0").Err(); err != nil &&
		!strings.HasPrefix(err.Error(), "BUSYGROUP") {
		slog.Warn("could not create consumer group", "stream", cfg.Stream, "group", cfg.Group, "err", err)
	}

	driver, err := neo4j.NewDriverWithContext(cfg.Neo4jURL,
		neo4j.BasicAuth(cfg.Neo4jUser, cfg.Neo4jPass, ""))
	if err != nil {
		slog.Error("neo4j driver error", "err", err)
		os.Exit(1)
	}
	defer driver.Close(ctx)

	slog.Info("lore-extractor started", "stream", cfg.Stream, "group", cfg.Group)

	// Reclaim any messages delivered but not ACK'd before last shutdown.
	// Bounded to maxRecoveryPasses so a persistently failing message
	// (e.g. LLM not yet ready) does not block the live loop on startup.
	//
	// The cursor advances past every message that was attempted, so a failing
	// message is tried once per startup instead of being re-read on every
	// pass, and the pending entries behind it still get their turn.
	const maxRecoveryPasses = 5
	cursor := "0"
	for pass := 0; pass < maxRecoveryPasses; pass++ {
		results, err := rdb.XReadGroup(ctx, &redis.XReadGroupArgs{
			Group:    cfg.Group,
			Consumer: cfg.Consumer,
			Streams:  []string{cfg.Stream, cursor},
			Count:    3,
		}).Result()
		if err != nil && err != redis.Nil {
			slog.Warn("pending-message recovery read failed", "err", err)
		}
		if err != nil || len(results) == 0 || len(results[0].Messages) == 0 {
			break
		}
		for _, msg := range results[0].Messages {
			cursor = msg.ID
			slog.Info("reprocessing pending message", "id", msg.ID)
			if err := processMessage(ctx, cfg, systemPrompt, driver, msg); err != nil {
				slog.Error("lore extraction failed (pending)", "id", msg.ID, "err", err)
				continue
			}
			if err := rdb.XAck(ctx, cfg.Stream, cfg.Group, msg.ID).Err(); err != nil {
				slog.Warn("failed to ack message", "id", msg.ID, "err", err)
			}
		}
	}

	for {
		results, err := rdb.XReadGroup(ctx, &redis.XReadGroupArgs{
			Group:    cfg.Group,
			Consumer: cfg.Consumer,
			Streams:  []string{cfg.Stream, ">"},
			Count:    3,
			Block:    5 * time.Second,
		}).Result()

		// redis.Nil signals that the blocking read timed out with no messages.
		if err == redis.Nil {
			continue
		}
		if err != nil {
			slog.Error("redis read error", "err", err)
			time.Sleep(2 * time.Second)
			continue
		}

		for _, stream := range results {
			for _, msg := range stream.Messages {
				if err := processMessage(ctx, cfg, systemPrompt, driver, msg); err != nil {
					slog.Error("lore extraction failed", "id", msg.ID, "err", err)
					continue
				}
				if err := rdb.XAck(ctx, cfg.Stream, cfg.Group, msg.ID).Err(); err != nil {
					slog.Warn("failed to ack message", "id", msg.ID, "err", err)
				}
			}
		}
	}
}

// processMessage handles one stream message end to end: it reads the document
// fields, asks the LLM for entities and relations (with known-entity hints
// from the graph), writes the result to Neo4j, and applies the document's
// aliases to its primary entity.
//
// Field defaults: id falls back to the stream message ID, title to
// "Untitled", uploaded_at to the current UTC time, primary_entity to the
// title, and aliases to an empty JSON array. A message with empty content is
// treated as successfully handled. An error is returned when LLM extraction
// or the graph write fails.
func processMessage(ctx context.Context, cfg Config, systemPrompt string,
	driver neo4j.DriverWithContext, msg redis.XMessage) error {

	fields := msg.Values

	content := strVal(fields, "content", "")
	if content == "" {
		return nil
	}

	docID := strVal(fields, "id", msg.ID)
	title := strVal(fields, "title", "Untitled")
	uploadedAt := strVal(fields, "uploaded_at", time.Now().UTC().Format(time.RFC3339))
	primaryEntity := strVal(fields, "primary_entity", title)

	// Aliases are optional; an undecodable value is deliberately ignored.
	var aliases []string
	_ = json.Unmarshal([]byte(strVal(fields, "aliases", "[]")), &aliases)

	hints := loadKnownEntities(ctx, driver)
	extracted, err := extractEntities(ctx, cfg, systemPrompt, title, content, hints)
	if err != nil {
		return fmt.Errorf("LLM extraction: %w", err)
	}

	session := driver.NewSession(ctx, neo4j.SessionConfig{})
	defer session.Close(ctx)

	err = writeToGraph(ctx, session, docID, title, uploadedAt, extracted)
	if err != nil {
		return fmt.Errorf("write to graph: %w", err)
	}

	applyDocumentAliases(ctx, session, primaryEntity, aliases)

	slog.Info("processed lore document", "doc_id", docID, "title", title,
		"entities", len(extracted.Entities), "relations", len(extracted.Relations))
	return nil
}

// ── Helpers ───────────────────────────────────────────────────────────────────

// getEnv returns the value of the environment variable key, or fallback when
// the variable is unset or set to the empty string.
func getEnv(key, fallback string) string {
	value, found := os.LookupEnv(key)
	if !found || value == "" {
		return fallback
	}
	return value
}

// strVal returns m[key] when it is present and holds a string; otherwise
// (missing key, nil map, or a non-string value) it returns fallback. A present
// empty string is returned as-is.
func strVal(m map[string]any, key, fallback string) string {
	// The type assertion fails for a missing key too, since the lookup then
	// yields a nil interface value.
	text, isString := m[key].(string)
	if !isString {
		return fallback
	}
	return text
}