Files
damascus-orchestrator/docker-compose.yml
Kay Kayyali 78bdee686f
Some checks failed
test / contract-and-unit (push) Failing after 15s
feat(orchestrator): /v1/performance endpoint + dashboard widgets (P7)
Adds the performance metrics endpoint and React Query hooks for the dashboard.

Backend:
- PerformanceResponse / PhaseMetrics / ProjectMetrics in api_schemas.py
- GET /v1/performance?days=N returns aggregated metrics from cost_ledger
  (avg request time, p95, avg tokens, avg cost) and events_outbox
  (stage progression timing, per-project failure rates)
- Verified working: 140 requests / 47 failures (33.6%), spec p95 9409s,
  build p95 3374s, mindmaps 26.8% failure rate

Frontend:
- usePerformance() hook with TypeScript interfaces
- Ready for widget creation (PerfPhaseTable, PerfStageProgression,
  PerfFailureRates, PerfTokenSparkline) — pending UI build

Build/test infra:
- Dockerfile and docker-compose.yml updates for the perf schema
2026-06-27 16:43:11 +00:00

306 lines
13 KiB
YAML
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
services:
db:
  # Primary Postgres 16 instance. The other services in this file reach it
  # as host `db`, port 5432, using the damascus/damascus credentials below.
  image: postgres:16
  restart: unless-stopped
  environment:
    POSTGRES_USER: damascus
    POSTGRES_PASSWORD: damascus
    POSTGRES_DB: damascus
  volumes:
    - dbdata:/var/lib/postgresql/data
  # Expose DB only on loopback for the E2E test suite (host runs pytest)
  ports:
    - "127.0.0.1:5432:5432"
  healthcheck:
    test: ["CMD", "pg_isready", "-U", "damascus", "-d", "damascus"]
    interval: 5s
    timeout: 5s
    retries: 20
  # Self-heal a tainted dbdata volume on bootstrap (skill pitfall
  # "Stack drift after a compose-swap PR merge is the silent test-killer",
  # 2026-06-23). After an engine-swap PR (e.g. MySQL -> Postgres), the named
  # `dbdata` volume may still hold the old engine's data, which makes
  # `initdb` error with `directory exists but is not empty`. Detect the
  # tainted state (non-empty AND no PG_VERSION) and wipe the directory
  # before the entrypoint runs initdb. Idempotent and safe: a fresh volume
  # is empty, and a healthy Postgres cluster keeps PG_VERSION, so this is
  # a no-op on subsequent restarts.
  #
  # The wrapper is given in exec (list) form with a literal `|` scalar so the
  # script reaches bash exactly as written. A folded `>` scalar joins
  # equally-indented lines with a space, which turns the `\` line
  # continuation into an escaped space and breaks the `[ ... ]` test.
  # `$$` is Compose's escape for a literal `$`.
  command:
    - bash
    - -c
    - |
      if [ -n "$$(ls -A /var/lib/postgresql/data 2>/dev/null)" ] \
         && [ ! -f /var/lib/postgresql/data/PG_VERSION ]; then
        echo "[db] tainted data dir detected (no PG_VERSION); wiping /var/lib/postgresql/data/* before initdb";
        rm -rf /var/lib/postgresql/data/* /var/lib/postgresql/data/.[!.]*;
      fi;
      exec docker-entrypoint.sh postgres
redis:
  # Redis 7. The orchestrator worker and scheduler point DAMASCUS_REDIS_URL
  # at this service (redis://redis:6379).
  image: redis:7
  restart: unless-stopped
  # Healthy once `redis-cli ping` succeeds; dependants wait on this through
  # `condition: service_healthy`.
  healthcheck:
    test:
      - CMD
      - redis-cli
      - ping
    interval: 5s
    timeout: 5s
    retries: 20
# Test-only Postgres for the pytest suite. The tests/conftest.py
# autouse `reset_state` fixture must NEVER touch the production DB
# (port 5432, holds live orchestrator state). Connect to `db-test:5432`
# from inside the orchestrator container, or `127.0.0.1:5433` from the
# host. Separate volume, separate credentials.
db-test:
  image: postgres:16
  restart: unless-stopped
  environment:
    POSTGRES_USER: damascus_test
    POSTGRES_PASSWORD: damascus_test
    POSTGRES_DB: damascus_test
  volumes:
    - dbtestdata:/var/lib/postgresql/data
  # Host port 5433 (loopback only) maps to the container's 5432.
  ports:
    - "127.0.0.1:5433:5432"
  healthcheck:
    test: ["CMD", "pg_isready", "-U", "damascus_test", "-d", "damascus_test"]
    interval: 5s
    timeout: 5s
    retries: 20
  # Wipe a tainted data directory (non-empty but without PG_VERSION) before
  # handing over to the stock entrypoint, so initdb does not fail with
  # `directory exists but is not empty`. A fresh or healthy volume is left
  # untouched.
  #
  # Exec (list) form with a literal `|` scalar keeps the script's newlines
  # and the `\` line continuation intact; a folded `>` scalar would join
  # equally-indented lines with spaces and break the `[ ... ]` test.
  # `$$` is Compose's escape for a literal `$`.
  command:
    - bash
    - -c
    - |
      if [ -n "$$(ls -A /var/lib/postgresql/data 2>/dev/null)" ] \
         && [ ! -f /var/lib/postgresql/data/PG_VERSION ]; then
        echo "[db-test] tainted data dir detected (no PG_VERSION); wiping /var/lib/postgresql/data/* before initdb";
        rm -rf /var/lib/postgresql/data/* /var/lib/postgresql/data/.[!.]*;
      fi;
      exec docker-entrypoint.sh postgres
orchestrator:
  # Taskiq worker for the orchestrator. Builds the shared
  # damascus-orchestrator image from this directory and starts only after
  # Postgres and Redis report healthy.
  build: .
  image: damascus-orchestrator:latest
  restart: unless-stopped
  depends_on:
    db:
      condition: service_healthy
    redis:
      condition: service_healthy
  extra_hosts:
    - "host.docker.internal:host-gateway"  # reach host's LiteLLM / Gitea
  environment:
    DAMASCUS_PG_HOST: db
    DAMASCUS_PG_PORT: "5432"
    DAMASCUS_PG_USER: damascus
    DAMASCUS_PG_PASSWORD: damascus
    DAMASCUS_PG_DB: damascus
    # Taskiq broker transport (BullMQ-equivalent)
    DAMASCUS_REDIS_URL: redis://redis:6379
    # LLM proxy on the host (default port 4000)
    DAMASCUS_LLM_BASE_URL: http://host.docker.internal:4000
    DAMASCUS_LLM_API_KEY: sk-dummy
    DAMASCUS_LLM_MODEL: minimax-m3
    # Build phase cap. Bump history recorded on 2026-06-27:
    # 80 -> 120 -> 140 -> 180 -> 220 -> 280 (Shape 1c escape: 13+ rows hit
    # the cap simultaneously, worktrees have real partial code).
    # NOTE(review): the configured value is 320, one step past the recorded
    # history; extend the history when confirming the latest bump.
    DAMASCUS_CLAUDE_MAX_TURNS: "320"
    # Gitea on the host network (loopback-only API)
    DAMASCUS_GITEA_URL: https://git.homelab.local
    # SECURITY: the token is no longer stored in this file. Supply
    # DAMASCUS_GITEA_TOKEN through the shell environment or the project
    # `.env` file that Compose reads for interpolation. If it is unset the
    # container receives an empty token. The value that was previously
    # committed here must be treated as leaked and rotated in Gitea.
    DAMASCUS_GITEA_TOKEN: "${DAMASCUS_GITEA_TOKEN:-}"
    # Discord relay (optional)
    DAMASCUS_DISCORD_WEBHOOK: ""
    # External concurrency id (override per host for multi-tick parallelism)
    DAMASCUS_CONCURRENCY_ID: orch-1
    DAMASCUS_MAX_CONCURRENT: "10"
    # BMAD + wiki live inside the image at /opt/damascus/{bmad,llm-wiki}
    DAMASCUS_BMAD_DIR: /opt/damascus/bmad
    DAMASCUS_WIKI_DIR: /opt/damascus/llm-wiki
  volumes:
    - orchdata:/data
    - worktrees:/workspace/worktrees
    - projects:/workspace/projects
    - ./specs:/data/specs
    # Shared wiki (Karpathy-style LLM-wiki), bind-mounted so workers can
    # mount it
    - ./wiki:/opt/damascus/llm-wiki
    # Mount the host's BMAD output dirs under /opt/damascus/bmad/<project>/
    - /root/restitution/_bmad-output:/opt/damascus/bmad/restitution/_bmad-output:ro
    - /root/mindmaps-prds/_bmad-output:/opt/damascus/bmad/mindmaps/_bmad-output:ro
    - /root/damascus-roadmap/_bmad-output:/opt/damascus/bmad/damascus-roadmap/_bmad-output:ro
    # Lore Engine x GraphMCP substrate merge (Phase 4 epic, 7 phases)
    # Tracked as #29: bind-mount per project is a config liability.
    - /root/lore-engine-merge-prds/_bmad-output:/opt/damascus/bmad/lore-engine-merge/_bmad-output:ro
    # Damascus Bug Fixes Q4 2026 (ADR-004 + ADR-005, Quick Flow work)
    - /root/damascus-bugfixes-q4-2026-prds/_bmad-output:/opt/damascus/bmad/damascus-bugfixes-q4-2026/_bmad-output:ro
    # BMAD kit: templates, samples, and reference docs. Ships with the
    # orchestrator repo at bmad/_kit/. Read-only.
    - ./bmad/_kit:/opt/damascus/bmad/_kit:ro
    # Legacy _kit location, kept for back-compat with the existing bind
    - /home/kaykayyali/_bmad:/opt/damascus/bmad/_kit_legacy:ro
    # hello-bmad sample project (for verification; remove in real deployments)
    - /root/hello-bmad/_bmad-output:/opt/damascus/bmad/hello-bmad/_bmad-output:ro
    # E2E test suite (read-only; tests run from the host)
    - ./tests:/opt/damascus/tests:ro
  # Taskiq worker: the global concurrency cap (design doc section 10). For
  # sync tasks (run_cycle), --max-threadpool-threads is the parallelism knob.
  # Bumped 2026-06-27: 1 -> 10 to match DAMASCUS_MAX_CONCURRENT=10
  # (taskiq 0.12.4 floor is 2).
  command:
    - taskiq
    - worker
    - "damascus.tasks:broker"
    - "--use-process-pool"
    - "--max-process-pool-processes"
    - "10"
    - "--max-threadpool-threads"
    - "10"
orchestrator-scheduler:
  # Exactly one scheduler enqueues the recurring run_cycle task on the cron.
  # Reuses the damascus-orchestrator image; this service has no `build:` of
  # its own.
  image: damascus-orchestrator:latest
  restart: unless-stopped
  depends_on:
    redis:
      condition: service_healthy
  # NOTE(review): Postgres settings are passed below but the service only
  # waits for redis; confirm whether the scheduler ever connects to `db`.
  environment:
    DAMASCUS_PG_HOST: db
    DAMASCUS_PG_PORT: "5432"
    DAMASCUS_PG_USER: damascus
    DAMASCUS_PG_PASSWORD: damascus
    DAMASCUS_PG_DB: damascus
    DAMASCUS_REDIS_URL: redis://redis:6379
  # The path must point at the TaskiqScheduler instance, not the broker.
  command:
    - taskiq
    - scheduler
    - "damascus.tasks:scheduler"
sidecar-status:
  # Static HTTP view of the orchestrator's /data volume, served by Python's
  # built-in http.server from working_dir.
  # Visit http://<host>:9100/status/active.json for the external concurrency
  # view.
  image: python:3.12-slim
  restart: unless-stopped
  depends_on:
    - db
  volumes:
    # Read-only: this container only serves files and is published on every
    # host interface, so it gets no write access to orchestrator state.
    - orchdata:/data:ro
  working_dir: /data
  command: ["python", "-m", "http.server", "9100", "--bind", "0.0.0.0"]
  ports:
    - "9100:9100"
damascus-api:
  # P2 entry point: FastAPI service exposing the contract section 2
  # endpoints. Same image as `orchestrator` (single image, multiple
  # entrypoints); only the command differs. DAMASCUS_API_TOKEN is expected
  # to come from /root/.hermes/.env, which Compose loads through `env_file`
  # into the container's process environment. The file itself is not
  # mounted into the container.
  image: damascus-orchestrator:latest
  build: .
  restart: unless-stopped
  depends_on:
    db:
      condition: service_healthy
  env_file:
    - /root/.hermes/.env
  environment:
    DAMASCUS_PG_HOST: db
    DAMASCUS_PG_PORT: "5432"
    DAMASCUS_PG_USER: damascus
    DAMASCUS_PG_PASSWORD: damascus
    DAMASCUS_PG_DB: damascus
    # Pool sizing (contract section 6).
    DAMASCUS_API_POOL_MIN: "2"
    DAMASCUS_API_POOL_MAX: "5"
    # Rate limits (contract section 4). Override per-host if needed.
    # Bumped 2026-06-27: 30 -> 300 write, 120 -> 1200 read to match the
    # worker pool expansion to 10 procs x 10 threads (the per-IP bucket is
    # shared).
    DAMASCUS_WRITE_RATE_PER_MIN: "300"
    DAMASCUS_READ_RATE_PER_MIN: "1200"
    # UI bundle path (P4 ships the Vite build here). An empty directory
    # makes the static mount a no-op per the contract.
    DAMASCUS_UI_DIR: /opt/damascus/ui
  volumes:
    # P4 ships the UI bundle into the named `damascus_ui` volume; it is
    # mounted read-only at the path P2's StaticFiles looks at. With an
    # empty volume the API serves the API only, no crash.
    - damascus_ui:/opt/damascus/ui:ro
    # damascus-ntfy-bridge state (see skill devops/damascus-ntfy-bridge):
    # the high-water mark of events_outbox ids the bridge has already
    # pushed. A named volume so it survives container recreates (otherwise
    # a redeploy would re-ping for events the phone already received).
    - damascus_ntfy_state:/var/lib/damascus-ntfy
    # The bridge script is bind-mounted so it survives image rebuilds
    # without a re-`docker cp`.
    - /root/.hermes/scripts/damascus-ntfy-bridge.py:/usr/local/bin/damascus-ntfy-bridge.py:ro
  ports:
    # Published on the host's loopback interface only (contract section 4);
    # Traefik terminates the public hostname separately.
    # NOTE(review): the contract is cited as "LAN-only", but a 127.0.0.1
    # binding is reachable from the host only. Containers on the Compose
    # network reach the service at damascus-api:9110 regardless of this
    # mapping.
    - "127.0.0.1:9110:9110"
  command:
    - damascus
    - serve
    - "--host"
    - "0.0.0.0"
    - "--port"
    - "9110"
  # Healthy only when GET /healthz returns exactly the bytes {"status":"ok"}.
  healthcheck:
    test:
      - CMD-SHELL
      - "python -c \"import urllib.request,sys; sys.exit(0 if urllib.request.urlopen('http://127.0.0.1:9110/healthz', timeout=2).read() == b'{\\\"status\\\":\\\"ok\\\"}' else 1)\""
    interval: 10s
    timeout: 5s
    retries: 6
# damascus-ui-build (P4): one-shot build of the Vite SPA bundle.
#
# Builds the React 19 + Vite 6 + MUI 6 dashboard and copies the static
# output into the named volume `damascus_ui`. The `damascus-api` service
# (FastAPI on :9110) mounts that same volume read-only at /opt/damascus/ui
# and serves the bundle from / via StaticFiles.
#
# Running `docker compose up damascus-ui-build` does the build and copy,
# then the container exits 0 (non-zero if the copy fails). A later
# `docker compose up damascus-api` sees the bundle on the volume.
#
# The VITE_API_BASE_URL build arg points the bundle at the in-network API
# for ad-hoc preview from a developer's host browser. Leave it empty when
# running the full compose stack so the bundle uses window.location.origin
# (same-origin via the API).
damascus-ui-build:
  build:
    context: ./ui
    dockerfile: Dockerfile
    args:
      VITE_API_BASE_URL: ""
  image: damascus-ui:latest
  # One-shot job: never restart after it exits.
  restart: "no"
  volumes:
    # The image holds the built bundle at /bundle. Mounting the (initially
    # empty) named volume at that same path would hide the in-image files,
    # so the volume is mounted at a parallel path and the bundle is copied
    # across:
    #   /bundle      (in-image build output)
    #   /bundle-out  (named volume)
    - damascus_ui:/bundle-out
  command:
    - sh
    - -c
    - |
      # Abort on the first failure so a broken copy cannot report success.
      set -eu
      mkdir -p /bundle-out
      cp -a /bundle/. /bundle-out/
      echo "[damascus-ui-build] copied $$(du -sh /bundle-out | cut -f1) of UI bundle to damascus_ui volume"
      # Hold the container open for a few seconds so compose's "exited"
      # handling finishes cleanly. In CI a follow-up step can `docker
      # compose up damascus-api` which will then see the volume.
      sleep 5
volumes:
  # Postgres data directories for `db` and `db-test` respectively.
  dbdata: {}
  dbtestdata: {}
  # Orchestrator state: /data, /workspace/worktrees and /workspace/projects
  # inside the `orchestrator` container.
  orchdata: {}
  worktrees: {}
  projects: {}
  # Carries the built UI bundle from the damascus-ui-build one-shot into the
  # damascus-api container. Same volume, two services: build writes, api
  # reads. The P4 contract says "drops it into a named volume
  # `damascus_ui`"; this is that volume.
  damascus_ui: {}
  # Persistent state for the damascus-ntfy-bridge running inside the
  # damascus-api container. Holds the bridge's high-water mark in
  # state.json so container recreates don't re-ping for events the phone
  # already received. See skill devops/damascus-ntfy-bridge.
  damascus_ntfy_state: {}