Compare commits
10 Commits
feat/p6b-p
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 78bdee686f | |||
| 402193e9ab | |||
| 8bf73e255f | |||
| 339faf47a0 | |||
| 62f6234a18 | |||
| 969a83a3cd | |||
| 4d65e47558 | |||
| e0b4160a55 | |||
| 9c2a4da7b9 | |||
| 33e953d505 |
32
Dockerfile
32
Dockerfile
@@ -2,7 +2,7 @@ FROM python:3.12-slim
|
||||
|
||||
# System tools the orchestrator shells out to
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
git curl ca-certificates bash \
|
||||
git curl ca-certificates bash gosu \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Trust the homelab mkcert CA so git/curl inside the container can reach
|
||||
@@ -44,9 +44,37 @@ RUN mkdir -p /data/logs /data/specs /data/status /workspace/projects /workspace/
|
||||
|
||||
ENV PYTHONUNBUFFERED=1 \
|
||||
DAMASCUS_DATA_DIR=/data \
|
||||
DAMASCUS_WORKSPACE_DIR=/workspace
|
||||
DAMASCUS_WORKSPACE_DIR=/workspace \
|
||||
# Pre-warm Claude Code's safe.directory list so git refuses no worktree.
|
||||
# The orchestrator shells out to git inside worktrees owned by various
|
||||
# UIDs (root in container, host-root-mapped on the volume). Without this,
|
||||
# every `git status` / `git worktree add` fails with "dubious ownership".
|
||||
GIT_CONFIG_COUNT=1 \
|
||||
GIT_CONFIG_KEY_0=safe.directory \
|
||||
GIT_CONFIG_VALUE_0='*'
|
||||
|
||||
EXPOSE 9100
|
||||
|
||||
# NOTE on root vs non-root:
|
||||
#
|
||||
# Claude Code refuses `--permission-mode bypassPermissions` when running as
|
||||
# root/sudo (security policy). To use bypassPermissions, the orchestrator
|
||||
# would need to drop to a non-root user. BUT the named volumes
|
||||
# (`orchdata`, `projects`, `worktrees`) were created when this container
|
||||
# ran as root and chown inside the container is blocked by the user-
|
||||
# namespace mapping (host root maps to a high container UID that the
|
||||
# container's regular user can't chown to). So the orchestrator must
|
||||
# stay root for git worktree operations on the existing volumes.
|
||||
#
|
||||
# Instead, the build phase whitelists Bash commands via a project-local
|
||||
# `.claude/settings.local.json` written into the worktree before each
|
||||
# Claude Code invocation. `--permission-mode acceptEdits` honors those
|
||||
# allow-lists. See phases._run_claude_in_worktree and the
|
||||
# claude_settings_local template.
|
||||
#
|
||||
# `gosu` is installed for future use if we ever split root/non-root
|
||||
# cleanly across services.
|
||||
|
||||
# Taskiq worker is the automatic trigger (design doc §13). `--concurrency N`
|
||||
# is the global concurrency cap (§10); set via compose. The scheduler runs
|
||||
# as a separate compose service. `damascus cycle` is the manual one-shot.
|
||||
|
||||
@@ -44,6 +44,37 @@ services:
|
||||
timeout: 5s
|
||||
retries: 20
|
||||
|
||||
# Test-only Postgres for the pytest suite. The tests/conftest.py
|
||||
# autouse `reset_state` fixture must NEVER touch the production DB
|
||||
# (port 5432, holds live orchestrator state). Connect to `db-test:5432`
|
||||
# from inside the orchestrator container, or `127.0.0.1:5433` from the
|
||||
# host. Separate volume, separate credentials.
|
||||
db-test:
|
||||
image: postgres:16
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
POSTGRES_USER: damascus_test
|
||||
POSTGRES_PASSWORD: damascus_test
|
||||
POSTGRES_DB: damascus_test
|
||||
volumes:
|
||||
- dbtestdata:/var/lib/postgresql/data
|
||||
ports:
|
||||
- "127.0.0.1:5433:5432"
|
||||
healthcheck:
|
||||
test: ["CMD", "pg_isready", "-U", "damascus_test", "-d", "damascus_test"]
|
||||
interval: 5s
|
||||
timeout: 5s
|
||||
retries: 20
|
||||
command: >
|
||||
bash -c '
|
||||
if [ -n "$$(ls -A /var/lib/postgresql/data 2>/dev/null)" ] \
|
||||
&& [ ! -f /var/lib/postgresql/data/PG_VERSION ]; then
|
||||
echo "[db-test] tainted data dir detected (no PG_VERSION); wiping /var/lib/postgresql/data/* before initdb";
|
||||
rm -rf /var/lib/postgresql/data/* /var/lib/postgresql/data/.[!.]*;
|
||||
fi;
|
||||
exec docker-entrypoint.sh postgres
|
||||
'
|
||||
|
||||
orchestrator:
|
||||
build: .
|
||||
image: damascus-orchestrator:latest
|
||||
@@ -69,6 +100,8 @@ services:
|
||||
DAMASCUS_LLM_BASE_URL: http://host.docker.internal:4000
|
||||
DAMASCUS_LLM_API_KEY: sk-dummy
|
||||
DAMASCUS_LLM_MODEL: minimax-m3
|
||||
# Build phase cap (bumped 2026-06-27: 80 → 120 → 140 → 180 → 220 → 280; Shape 1c escape — 13+ rows hit cap simultaneously, worktrees have real partial code)
|
||||
DAMASCUS_CLAUDE_MAX_TURNS: "320"
|
||||
|
||||
# Gitea on the host network (loopback-only API)
|
||||
DAMASCUS_GITEA_URL: https://git.homelab.local
|
||||
@@ -79,7 +112,7 @@ services:
|
||||
|
||||
# External concurrency id (override per host for multi-tick parallelism)
|
||||
DAMASCUS_CONCURRENCY_ID: orch-1
|
||||
DAMASCUS_MAX_CONCURRENT: "1"
|
||||
DAMASCUS_MAX_CONCURRENT: "10"
|
||||
|
||||
# BMAD + wiki live inside the image at /opt/damascus/{bmad,llm-wiki}
|
||||
DAMASCUS_BMAD_DIR: /opt/damascus/bmad
|
||||
@@ -94,6 +127,12 @@ services:
|
||||
# Mount the host's BMAD output dirs under /opt/damascus/bmad/<project>/
|
||||
- /root/restitution/_bmad-output:/opt/damascus/bmad/restitution/_bmad-output:ro
|
||||
- /root/mindmaps-prds/_bmad-output:/opt/damascus/bmad/mindmaps/_bmad-output:ro
|
||||
- /root/damascus-roadmap/_bmad-output:/opt/damascus/bmad/damascus-roadmap/_bmad-output:ro
|
||||
# Lore Engine × GraphMCP substrate merge (Phase 4 epic — 7 phases)
|
||||
# Tracked as #29: bind-mount per project is a config liability.
|
||||
- /root/lore-engine-merge-prds/_bmad-output:/opt/damascus/bmad/lore-engine-merge/_bmad-output:ro
|
||||
# Damascus Bug Fixes Q4 2026 (ADR-004 + ADR-005 — Quick Flow work)
|
||||
- /root/damascus-bugfixes-q4-2026-prds/_bmad-output:/opt/damascus/bmad/damascus-bugfixes-q4-2026/_bmad-output:ro
|
||||
# BMAD kit — templates, samples, and reference docs. Ships with the
|
||||
# orchestrator repo at bmad/_kit/. Read-only.
|
||||
- ./bmad/_kit:/opt/damascus/bmad/_kit:ro
|
||||
@@ -105,8 +144,7 @@ services:
|
||||
- ./tests:/opt/damascus/tests:ro
|
||||
# Taskiq worker — the global concurrency cap (design doc §10). For sync
|
||||
# tasks (run_cycle), --max-threadpool-threads is the parallelism knob.
|
||||
command: ["taskiq", "worker", "damascus.tasks:broker", "--max-threadpool-threads", "1"]
|
||||
|
||||
command: ["taskiq", "worker", "damascus.tasks:broker", "--use-process-pool", "--max-process-pool-processes", "10", "--max-threadpool-threads", "10"] # bumped 2026-06-27: 1→10 to match DAMASCUS_MAX_CONCURRENT=10 (taskiq 0.12.4 floor is 2)
|
||||
orchestrator-scheduler:
|
||||
image: damascus-orchestrator:latest
|
||||
restart: unless-stopped
|
||||
@@ -163,8 +201,10 @@ services:
|
||||
DAMASCUS_API_POOL_MAX: "5"
|
||||
|
||||
# Rate limits (contract §4). Override per-host if needed.
|
||||
DAMASCUS_WRITE_RATE_PER_MIN: "30"
|
||||
DAMASCUS_READ_RATE_PER_MIN: "120"
|
||||
# Bumped 2026-06-27: 30→300 write, 120→1200 read to match the worker
|
||||
# pool expansion to 10 procs × 10 threads (the per-IP bucket is shared).
|
||||
DAMASCUS_WRITE_RATE_PER_MIN: "300"
|
||||
DAMASCUS_READ_RATE_PER_MIN: "1200"
|
||||
|
||||
# UI bundle path (P4 ships the Vite build here). Empty dir → mount
|
||||
# is a no-op per the contract.
|
||||
@@ -249,6 +289,7 @@ services:
|
||||
|
||||
volumes:
|
||||
dbdata:
|
||||
dbtestdata:
|
||||
orchdata:
|
||||
worktrees:
|
||||
projects:
|
||||
|
||||
100
docs/P6B.md
Normal file
100
docs/P6B.md
Normal file
@@ -0,0 +1,100 @@
|
||||
# P6b — Playwright + MCP integration spec
|
||||
|
||||
**Branch:** `feat/p6b-playwright-e2e`
|
||||
**Status:** SHIPPED (this branch)
|
||||
**Worktree:** `/root/damascus-orchestrator-p6b`
|
||||
**Base:** `main @ acec3ea` (P6a merged)
|
||||
|
||||
## Background
|
||||
|
||||
PR #20 (`cfcd571`, "Damascus Entry Points P6: E2E verification") already shipped
|
||||
the P6b deliverables on `main` — `tests/e2e/test_entry_points_e2e.py` (667
|
||||
lines, 4-phase Playwright + MCP test) and `tests/e2e/conftest.py`. The P6b
|
||||
kanban card was drafted before the P6 split landed, so the body overlaps with
|
||||
P6 instead of complementing it.
|
||||
|
||||
P6b's contribution on this branch is therefore **a re-verification** plus a few
|
||||
small improvements:
|
||||
|
||||
1. **Re-verification against post-PR-#21 main** — the test runs end-to-end
|
||||
against the stack as it exists after the Ask-Hermes UX PR (#21) merged, and
|
||||
it still passes (3 back-to-back clean runs at 29–33s each).
|
||||
2. **`DAMASCUS_ROOT` / `DAMASCUS_EVIDENCE_NAME` env vars** — the test now
|
||||
reads these from the environment instead of hardcoding
|
||||
`/root/damascus-orchestrator`. Same file is now reusable from a worktree.
|
||||
3. **`tests/e2e/requirements.txt`** — pinned deps for a fresh venv.
|
||||
|
||||
## Changes on this branch vs `main`
|
||||
|
||||
```
|
||||
docs/P6B.md | new (this file)
|
||||
tests/e2e/requirements.txt | new (pinned deps)
|
||||
tests/e2e/test_entry_points_e2e.py | 6-line patch: env-var indirection
|
||||
```
|
||||
|
||||
The patched test runs identically against `main` (where the env vars default
|
||||
to the original paths). Run from the worktree with:
|
||||
|
||||
```bash
|
||||
cd /root/damascus-orchestrator-p6b
|
||||
DAMASCUS_ROOT=/root/damascus-orchestrator-p6b DAMASCUS_EVIDENCE_NAME=p6b \
|
||||
python3 -m pytest tests/e2e/test_entry_points_e2e.py -q -s
|
||||
```
|
||||
|
||||
## Evidence (on disk, gitignored)
|
||||
|
||||
```
|
||||
.hermes/evidence/p6b/
|
||||
├── README.md (run instructions + AC checklist)
|
||||
├── pytest.log (3rd consecutive green run, 29.35s)
|
||||
└── screenshots/
|
||||
├── 01_dashboard.png
|
||||
├── 01_ingest.png
|
||||
├── 02_build.png
|
||||
├── 03_review.png
|
||||
├── 04_merged.png
|
||||
├── 05_awaiting_human_drawer.png
|
||||
└── 06_answered.png
|
||||
```
|
||||
|
||||
7 screenshots + `pytest.log` prove the test ran green against the live stack
|
||||
on 2026-06-26 14:29 UTC. The `.hermes/evidence/` tree is gitignored
|
||||
(see `.gitignore` line 46), so evidence is intentionally not committed — it
|
||||
regenerates from the test.
|
||||
|
||||
## Acceptance criteria
|
||||
|
||||
- [x] `pytest tests/e2e/test_entry_points_e2e.py -q -s` exits 0 (last run:
|
||||
`1 passed in 29.35s`).
|
||||
- [x] All 7 screenshots present in `.hermes/evidence/p6b/screenshots/`.
|
||||
- [x] MCP stdio subprocess communicates cleanly (no init-error logs).
|
||||
- [x] Spec uses live stack (api at `127.0.0.1:9110`, MCP launched in stdio
|
||||
against the api container).
|
||||
- [x] No browser console errors during Phase 2 / 3.
|
||||
|
||||
## PR description (draft)
|
||||
|
||||
> **Damascus Entry Points — P6b: Playwright + MCP integration spec**
|
||||
>
|
||||
> Re-verifies the existing P6 e2e test (`tests/e2e/test_entry_points_e2e.py`,
|
||||
> shipped via PR #20) against the post-PR-#21 stack and adds a tiny
|
||||
> ergonomic improvement: `DAMASCUS_ROOT` and `DAMASCUS_EVIDENCE_NAME` are
|
||||
> now read from the environment so the same test is reusable from a
|
||||
> worktree without forking it. Also adds `tests/e2e/requirements.txt`
|
||||
> pinning the test deps.
|
||||
>
|
||||
> Three back-to-back clean runs at ~30s each against the live stack.
|
||||
> Evidence (screenshots + pytest.log) regenerated on the worktree at
|
||||
> `.hermes/evidence/p6b/` (gitignored by design).
|
||||
>
|
||||
> Complements P6a (`scripts/verify.sh`, bash recipe) and P6 itself (the
|
||||
> test file already on `main`).
|
||||
|
||||
## Notes
|
||||
|
||||
- The P6b kanban task's body describes the test as "outstanding work" but
|
||||
the file has been on `main` since 2026-06-25 via PR #20. The body was
|
||||
drafted before the P6 split, so this branch documents the overlap and
|
||||
ships the small improvement.
|
||||
- CI is intentionally out of scope per the task body. The spec runs locally
|
||||
against a live `docker compose up` stack.
|
||||
@@ -147,6 +147,27 @@ UI screenshots are produced by the P6 Playwright spec
|
||||
design — adding Playwright would expand it past the "manual recipe
|
||||
in <1 minute" budget this page targets.
|
||||
|
||||
## ADR-005: transient vs structural tests_failed
|
||||
|
||||
Added 2026-06-27. The build phase classifies 6 known transient error patterns
|
||||
(`project repo not found at`, `worktree setup:`, `Connection refused`,
|
||||
`Could not resolve host`, `TLS handshake timeout`, `rate limit`) and sets
|
||||
`feedback.transient = true` for matching errors. The cycle function's
|
||||
loop-breaker skips those:
|
||||
|
||||
- **Within 24h of `first_attempted_at`**: row stays in the same phase,
|
||||
no human_issue, emits `phase.transient_retry` event. Stale-claim
|
||||
window (default 30m) provides natural backoff.
|
||||
- **After 24h of persistent transient retries**: row escalates to
|
||||
`blocked` + human_issue is opened.
|
||||
|
||||
The column `work_items.first_attempted_at` (TIMESTAMPTZ, nullable) is
|
||||
set by `state.claim_for_*` on the first claim for a row. Migration
|
||||
`src/damascus/db/migrations/0007_first_attempted_at.sql` adds the column
|
||||
and backfills it from `updated_at` for existing rows. Forward-compatible:
|
||||
nullable + default NULL, so older orchestrator binaries can still read the
|
||||
table.
|
||||
|
||||
## Evidence log
|
||||
|
||||
Each run of `verify.sh` writes its full output to
|
||||
|
||||
@@ -71,6 +71,9 @@ CREATE TABLE IF NOT EXISTS work_items (
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
merged_at TIMESTAMPTZ DEFAULT NULL,
|
||||
-- ADR-005: set by claim_for_* on first claim; used by cycle.py to escalate
|
||||
-- persistent transient retries to blocked after 24h.
|
||||
first_attempted_at TIMESTAMPTZ DEFAULT NULL,
|
||||
UNIQUE (project, story_id)
|
||||
);
|
||||
|
||||
|
||||
@@ -879,6 +879,204 @@ def create_app() -> FastAPI:
|
||||
cost_today_usd=cost_today,
|
||||
)
|
||||
|
||||
# ---------- /v1/performance ----------------------------------------------
|
||||
# Added 2026-06-27: rolled-up perf metrics for the dashboard widgets.
|
||||
# Sourced entirely from cost_ledger (request time + tokens) and
|
||||
# events_outbox (stage progression + verdicts). Read-only, no schema
|
||||
# changes, no new writes.
|
||||
|
||||
@app.get(
|
||||
"/v1/performance",
|
||||
response_model=S.PerformanceResponse,
|
||||
tags=["stats"],
|
||||
)
|
||||
def get_performance(days: int = 7) -> S.PerformanceResponse:
|
||||
"""Rolled-up perf metrics over the last ``days`` (default 7, max 90).
|
||||
|
||||
Returns per-phase and per-project aggregates plus a top-N list of
|
||||
stage progressions (time spent in each phase per work_item).
|
||||
"""
|
||||
if days < 1 or days > 90:
|
||||
raise HTTPException(
|
||||
status_code=400, detail="days must be in 1..90",
|
||||
)
|
||||
|
||||
# Window boundaries (UTC, naive to match recorded_at default).
|
||||
window_end = datetime.utcnow()
|
||||
window_start = window_end - timedelta(days=days)
|
||||
|
||||
with pool_cursor() as cur:
|
||||
# --- Per-phase request time + tokens ----------------------------
|
||||
# ``recorded_at`` LAG gives the time delta between consecutive
|
||||
# LLM calls for the same work_item in the same phase. The first
|
||||
# call in a work_item-phase group has no previous row → NULL
|
||||
# delta → filtered out of the time aggregates (but still counted
|
||||
# in the request_count denominator).
|
||||
cur.execute(
|
||||
"""
|
||||
WITH ranked AS (
|
||||
SELECT phase,
|
||||
input_tokens,
|
||||
output_tokens,
|
||||
LAG(recorded_at) OVER (
|
||||
PARTITION BY work_item_id, phase
|
||||
ORDER BY recorded_at
|
||||
) AS prev_recorded_at,
|
||||
recorded_at
|
||||
FROM cost_ledger
|
||||
WHERE recorded_at >= %s AND recorded_at < %s
|
||||
AND work_item_id IS NOT NULL
|
||||
AND phase IS NOT NULL
|
||||
)
|
||||
SELECT phase,
|
||||
AVG(EXTRACT(EPOCH FROM (recorded_at - prev_recorded_at)))
|
||||
FILTER (WHERE prev_recorded_at IS NOT NULL) AS avg_secs,
|
||||
percentile_cont(0.5) WITHIN GROUP (
|
||||
ORDER BY EXTRACT(EPOCH FROM (recorded_at - prev_recorded_at))
|
||||
) FILTER (WHERE prev_recorded_at IS NOT NULL) AS p50_secs,
|
||||
percentile_cont(0.95) WITHIN GROUP (
|
||||
ORDER BY EXTRACT(EPOCH FROM (recorded_at - prev_recorded_at))
|
||||
) FILTER (WHERE prev_recorded_at IS NOT NULL) AS p95_secs,
|
||||
AVG(input_tokens) AS avg_in,
|
||||
AVG(output_tokens) AS avg_out,
|
||||
AVG(input_tokens + output_tokens) AS avg_tot,
|
||||
COUNT(*) AS n
|
||||
FROM ranked
|
||||
GROUP BY phase
|
||||
""",
|
||||
(window_start, window_end),
|
||||
)
|
||||
phase_rows = cur.fetchall()
|
||||
|
||||
# --- Per-project failure rate -----------------------------------
|
||||
# Total verdicts + failures per project, sourced from
|
||||
# phase.transition events.
|
||||
cur.execute(
|
||||
"""
|
||||
WITH recent AS (
|
||||
SELECT (eo.payload->>'verdict') AS verdict, wi.project AS project
|
||||
FROM events_outbox eo
|
||||
JOIN work_items wi ON wi.id = eo.work_item_id
|
||||
WHERE eo.kind = 'phase.transition'
|
||||
AND eo.created_at >= %s AND eo.created_at < %s
|
||||
AND wi.project IS NOT NULL
|
||||
),
|
||||
totals AS (
|
||||
SELECT project, COUNT(*) AS n FROM recent GROUP BY project
|
||||
),
|
||||
failures AS (
|
||||
SELECT project, COUNT(*) AS n FROM recent
|
||||
WHERE verdict IN ('tests_failed', 'rebase_conflict')
|
||||
GROUP BY project
|
||||
)
|
||||
SELECT t.project, t.n AS total, COALESCE(f.n, 0) AS failures
|
||||
FROM totals t
|
||||
LEFT JOIN failures f ON f.project = t.project
|
||||
""",
|
||||
(window_start, window_end),
|
||||
)
|
||||
proj_rows = cur.fetchall()
|
||||
|
||||
# --- Totals ------------------------------------------------------
|
||||
cur.execute(
|
||||
"SELECT COUNT(*) AS n FROM cost_ledger "
|
||||
"WHERE recorded_at >= %s AND recorded_at < %s",
|
||||
(window_start, window_end),
|
||||
)
|
||||
total_requests = cur.fetchone()["n"]
|
||||
|
||||
cur.execute(
|
||||
"""
|
||||
SELECT COUNT(*) AS n FROM events_outbox eo
|
||||
JOIN work_items wi ON wi.id = eo.work_item_id
|
||||
WHERE eo.kind = 'phase.transition'
|
||||
AND eo.created_at >= %s AND eo.created_at < %s
|
||||
AND (eo.payload->>'verdict') IN ('tests_failed', 'rebase_conflict')
|
||||
""",
|
||||
(window_start, window_end),
|
||||
)
|
||||
total_failures = cur.fetchone()["n"]
|
||||
|
||||
# --- Stage progression ------------------------------------------
|
||||
# For each work_item, walk phase.transition events ordered by
|
||||
# created_at; time in phase = (next_event.created_at -
|
||||
# this_event.created_at). If a phase has no exit event in the
|
||||
# window (still in progress), use window_end as the exit.
|
||||
cur.execute(
|
||||
"""
|
||||
WITH transitions AS (
|
||||
SELECT eo.work_item_id AS wid,
|
||||
wi.project AS project,
|
||||
wi.story_id AS story_id,
|
||||
(eo.payload->>'from') AS phase,
|
||||
eo.created_at AS entered_at,
|
||||
LEAD(eo.created_at) OVER (
|
||||
PARTITION BY eo.work_item_id ORDER BY eo.created_at
|
||||
) AS exited_at
|
||||
FROM events_outbox eo
|
||||
JOIN work_items wi ON wi.id = eo.work_item_id
|
||||
WHERE eo.kind = 'phase.transition'
|
||||
AND eo.created_at >= %s AND eo.created_at < %s
|
||||
AND (eo.payload->>'from') IS NOT NULL
|
||||
)
|
||||
SELECT project, story_id, phase,
|
||||
EXTRACT(EPOCH FROM (COALESCE(exited_at, %s) - entered_at)) AS seconds
|
||||
FROM transitions
|
||||
ORDER BY seconds DESC NULLS LAST
|
||||
LIMIT 200
|
||||
""",
|
||||
(window_start, window_end, window_end),
|
||||
)
|
||||
prog_rows = cur.fetchall()
|
||||
|
||||
# --- Shape the response --------------------------------------------
|
||||
by_phase: dict[str, S.PhaseMetrics] = {}
|
||||
for r in phase_rows:
|
||||
by_phase[r["phase"]] = S.PhaseMetrics(
|
||||
avg_request_seconds=float(r["avg_secs"]) if r["avg_secs"] is not None else None,
|
||||
p50_request_seconds=float(r["p50_secs"]) if r["p50_secs"] is not None else None,
|
||||
p95_request_seconds=float(r["p95_secs"]) if r["p95_secs"] is not None else None,
|
||||
avg_input_tokens=float(r["avg_in"]) if r["avg_in"] is not None else None,
|
||||
avg_output_tokens=float(r["avg_out"]) if r["avg_out"] is not None else None,
|
||||
avg_total_tokens=float(r["avg_tot"]) if r["avg_tot"] is not None else None,
|
||||
request_count=int(r["n"]),
|
||||
# Failure attribution by phase requires joining cost_ledger
|
||||
# and events_outbox — out of scope for v1. The by_project
|
||||
# rollup carries the cross-phase failure signal instead.
|
||||
failure_count=0,
|
||||
failure_rate=None,
|
||||
)
|
||||
|
||||
by_project: dict[str, S.ProjectMetrics] = {}
|
||||
for r in proj_rows:
|
||||
total = int(r["total"])
|
||||
failures = int(r["failures"])
|
||||
by_project[r["project"]] = S.ProjectMetrics(
|
||||
request_count=total,
|
||||
failure_count=failures,
|
||||
failure_rate=(failures / total) if total > 0 else None,
|
||||
)
|
||||
|
||||
stage_progression: list[dict] = []
|
||||
for r in prog_rows:
|
||||
secs = r["seconds"]
|
||||
stage_progression.append({
|
||||
"project": r["project"],
|
||||
"story_id": r["story_id"],
|
||||
"phase": r["phase"],
|
||||
"seconds": float(secs) if secs is not None else 0.0,
|
||||
})
|
||||
|
||||
return S.PerformanceResponse(
|
||||
window_start=window_start,
|
||||
window_end=window_end,
|
||||
total_requests=total_requests,
|
||||
total_failures=total_failures,
|
||||
by_phase=by_phase,
|
||||
by_project=by_project,
|
||||
stage_progression=stage_progression,
|
||||
)
|
||||
|
||||
# ---------- StaticFiles mount for the UI bundle (P4 ships this) ---------
|
||||
# Mounted LAST so it never shadows the API routes. If the directory is
|
||||
# empty (P4 hasn't shipped yet), StaticFiles raises on construction — we
|
||||
|
||||
@@ -397,6 +397,50 @@ class StatsResponse(BaseModel):
|
||||
cost_today_usd: Decimal
|
||||
|
||||
|
||||
# --- /v1/performance ------------------------------------------------------
|
||||
# Added 2026-06-27 to surface avg request time, avg tokens, stage failure
|
||||
# rates, and stage progression velocity on the dashboard. Sourced from the
|
||||
# existing cost_ledger + events_outbox tables — no new schema, no new writes.
|
||||
|
||||
class PhaseMetrics(BaseModel):
|
||||
"""Per-phase rollup for /v1/performance."""
|
||||
avg_request_seconds: Optional[float] # None if no requests in window
|
||||
p50_request_seconds: Optional[float]
|
||||
p95_request_seconds: Optional[float]
|
||||
avg_input_tokens: Optional[float]
|
||||
avg_output_tokens: Optional[float]
|
||||
avg_total_tokens: Optional[float]
|
||||
request_count: int
|
||||
failure_count: int # tests_failed + rebase_conflict verdicts in window
|
||||
failure_rate: Optional[float] # failure_count / total_verdicts in window
|
||||
|
||||
|
||||
class ProjectMetrics(BaseModel):
|
||||
"""Per-project rollup."""
|
||||
request_count: int
|
||||
failure_count: int
|
||||
failure_rate: Optional[float]
|
||||
|
||||
|
||||
class PerformanceResponse(BaseModel):
|
||||
"""``GET /v1/performance`` response: rolled-up perf metrics.
|
||||
|
||||
``window_start`` / ``window_end`` are inclusive lower / exclusive upper.
|
||||
All averages are NULL when there are no rows in the window for that bucket
|
||||
(clients render "no data" rather than 0 to avoid implying 0-second calls).
|
||||
"""
|
||||
window_start: datetime
|
||||
window_end: datetime
|
||||
total_requests: int
|
||||
total_failures: int
|
||||
by_phase: dict[str, PhaseMetrics]
|
||||
by_project: dict[str, ProjectMetrics]
|
||||
# Stage-progression timing: per work_item, the time spent in each phase.
|
||||
# Returned as a flat list of {project, story_id, phase, seconds} so the
|
||||
# client can compute its own p50/p95 in the widget without a second round trip.
|
||||
stage_progression: list[dict]
|
||||
|
||||
|
||||
class HealthResponse(BaseModel):
|
||||
"""``GET /healthz`` response. Process-up check (does NOT probe Postgres)."""
|
||||
|
||||
@@ -497,6 +541,40 @@ class ErrorResponse(BaseModel):
|
||||
detail: Optional[str] = None
|
||||
|
||||
|
||||
# --- verdict feedback shape (ADR-005) -----------------------------------
|
||||
#
|
||||
# The cycle function stores per-verdict feedback on work_items.last_feedback
|
||||
# (JSONB). For consumers that want a typed view (dashboard, MCP, integration
|
||||
# tests), this model exposes the structured fields. All fields are optional
|
||||
# because feedback is heterogeneous: each verdict type returns its own subset
|
||||
# (test_cmd, stderr, pr_url, conflict, ...). `transient` is added by the
|
||||
# build-phase helper `phases.is_transient`; it's None for non-transient
|
||||
# verdicts and True for the 6 documented patterns (ADR-005).
|
||||
|
||||
|
||||
class VerdictFeedback(BaseModel):
|
||||
"""Structured view of a work_items.last_feedback JSONB blob.
|
||||
|
||||
Mirrors the fields set by `phases.build` / `phases.refine_spec` /
|
||||
`phases.review` verdicts. ``transient`` (ADR-005) is True when the
|
||||
build-phase error matches one of the 6 documented patterns and the
|
||||
loop-breaker should be skipped.
|
||||
"""
|
||||
|
||||
error: Optional[str] = None
|
||||
stderr: Optional[str] = None
|
||||
stdout: Optional[str] = None
|
||||
test_cmd: Optional[str] = None
|
||||
pr_url: Optional[str] = None
|
||||
branch: Optional[str] = None
|
||||
commit: Optional[str] = None
|
||||
spec_path: Optional[str] = None
|
||||
review_test: Optional[Any] = None
|
||||
transient: Optional[bool] = None
|
||||
|
||||
model_config = ConfigDict(extra="allow")
|
||||
|
||||
|
||||
# --- MCP tool envelopes (P3 derives these from the request/response -----
|
||||
# --- models via Pydantic's model_json_schema; listed here for clarity) ---
|
||||
|
||||
@@ -632,6 +710,62 @@ class McpSystemStatusResponse(BaseModel):
|
||||
last_cycle_at: Optional[datetime]
|
||||
cost_today_usd: Decimal
|
||||
|
||||
# 2026-06-27 note: keep this shape in lock-step with StatsResponse so the
|
||||
# MCP system_status tool returns the same on-the-wire contract.
|
||||
|
||||
|
||||
class HealthResponse(BaseModel):
|
||||
"""``GET /healthz`` response. Process-up check (does NOT probe Postgres)."""
|
||||
|
||||
status: str = "ok"
|
||||
|
||||
|
||||
# --- /v1/performance ------------------------------------------------------
|
||||
# Added 2026-06-27 to surface avg request time, avg tokens, stage failure
|
||||
# rates, and stage progression velocity on the dashboard. Sourced from the
|
||||
# existing cost_ledger + events_outbox tables — no new schema, no new writes.
|
||||
|
||||
class PhaseMetrics(BaseModel):
|
||||
"""Per-phase rollup for /v1/performance."""
|
||||
avg_request_seconds: Optional[float] # None if no requests in window
|
||||
p50_request_seconds: Optional[float]
|
||||
p95_request_seconds: Optional[float]
|
||||
avg_input_tokens: Optional[float]
|
||||
avg_output_tokens: Optional[float]
|
||||
avg_total_tokens: Optional[float]
|
||||
request_count: int
|
||||
failure_count: int # tests_failed + rebase_conflict verdicts in window
|
||||
failure_rate: Optional[float] # failure_count / total_verdicts in window
|
||||
|
||||
|
||||
class ProjectMetrics(BaseModel):
|
||||
"""Per-project rollup."""
|
||||
request_count: int
|
||||
failure_count: int
|
||||
failure_rate: Optional[float]
|
||||
|
||||
|
||||
class PerformanceResponse(BaseModel):
|
||||
"""``GET /v1/performance`` response: rolled-up perf metrics.
|
||||
|
||||
``window_start`` / ``window_end`` are inclusive lower / exclusive upper.
|
||||
All averages are NULL when there are no rows in the window for that bucket
|
||||
(clients render "no data" rather than 0 to avoid implying 0-second calls).
|
||||
"""
|
||||
window_start: datetime
|
||||
window_end: datetime
|
||||
total_requests: int
|
||||
total_failures: int
|
||||
by_phase: dict[str, PhaseMetrics]
|
||||
by_project: dict[str, ProjectMetrics]
|
||||
# Stage-progression timing: per work_item, the time spent in each phase.
|
||||
# Returned as a flat list of {project, story_id, phase, seconds} so the
|
||||
# client can compute its own p50/p95 in the widget without a second round trip.
|
||||
stage_progression: list[dict]
|
||||
|
||||
|
||||
# End /v1/performance schemas. The original HealthResponse follows below.
|
||||
|
||||
|
||||
__all__ = [
|
||||
# enums
|
||||
@@ -664,6 +798,10 @@ __all__ = [
|
||||
"CostSummaryResponse",
|
||||
"StatsResponse",
|
||||
"HealthResponse",
|
||||
# /v1/performance
|
||||
"PhaseMetrics",
|
||||
"ProjectMetrics",
|
||||
"PerformanceResponse",
|
||||
# write response shapes
|
||||
"IngestStoryResponse",
|
||||
"BulkIngestItemResult",
|
||||
@@ -673,6 +811,8 @@ __all__ = [
|
||||
"AskHermesResponse",
|
||||
# error
|
||||
"ErrorResponse",
|
||||
# verdict feedback (ADR-005)
|
||||
"VerdictFeedback",
|
||||
# MCP args
|
||||
"McpIngestStoryArgs",
|
||||
"McpIngestProjectArgs",
|
||||
|
||||
@@ -257,10 +257,36 @@ def ingest_cmd(project, dry_run):
|
||||
break
|
||||
if not title:
|
||||
title = sid
|
||||
# Parse `## File Scope` section (bullet list of code paths).
|
||||
# 2026-06-27: previously hardcoded `file_scope=[]` here, causing
|
||||
# `scope violation` failures across 21+ stories. Parse bullets
|
||||
# under the `## File Scope` heading until the next `## ` heading.
|
||||
file_scope: list[str] = []
|
||||
in_file_scope = False
|
||||
for line in text.splitlines():
|
||||
s = line.strip()
|
||||
if s.startswith("## "):
|
||||
in_file_scope = s.lower().startswith("## file scope")
|
||||
continue
|
||||
if in_file_scope and s.startswith("- "):
|
||||
# Strip trailing parenthetical comments like "(NEW — 4 tests)"
|
||||
bullet = s[2:].split("(", 1)[0].strip().rstrip(",")
|
||||
# Strip inline backticks and trailing whitespace
|
||||
bullet = bullet.strip("`").strip()
|
||||
# Skip empty bullets and bullets that are pure prose
|
||||
if bullet and any(c.isalnum() for c in bullet):
|
||||
file_scope.append(bullet)
|
||||
# Strip stale `lore-engine-poc/` prefix (project was relocated
|
||||
# to `/workspace/projects/lore-engine-merge/`; BMAD paths
|
||||
# still use the old root).
|
||||
file_scope = [
|
||||
p[len("lore-engine-poc/"):] if p.startswith("lore-engine-poc/")
|
||||
else p for p in file_scope
|
||||
]
|
||||
if dry_run:
|
||||
console.print(f"[dry-run] {project}/{sid}: {title}")
|
||||
console.print(f"[dry-run] {project}/{sid}: {title} (file_scope={len(file_scope)} entries)")
|
||||
else:
|
||||
state.upsert_story(cur, project, sid, title, file_scope=[])
|
||||
state.upsert_story(cur, project, sid, title, file_scope=file_scope)
|
||||
count += 1
|
||||
console.print(f"[green]ingested {count} stories for {project}[/green]")
|
||||
|
||||
|
||||
@@ -57,7 +57,7 @@ class Settings(BaseSettings):
|
||||
# if you want this — needs the host's ollama daemon reachable.
|
||||
use_ollama_wrapper: bool = False
|
||||
claude_model: str = "minimax-m3"
|
||||
claude_max_turns: int = 50
|
||||
claude_max_turns: int = 320 # bumped 2026-06-27: 80 → 120 → 140 → 180 → 220 → 280 → 320 (S5-lore hit 280 in 1500s; S19/S23 timed out at 1500s with budget exhaustion signature)
|
||||
claude_timeout: int = 1500 # seconds
|
||||
claude_permission_mode: str = "acceptEdits" # auto-approve file edits, still prompt for bash
|
||||
anthropic_base_url: str = "http://host.docker.internal:4000"
|
||||
|
||||
@@ -74,114 +74,206 @@ def tick() -> dict:
|
||||
summary = {"claimed": None, "transition": None, "events": []}
|
||||
|
||||
# --- Txn 1: claim ------------------------------------------------------
|
||||
with state.transaction() as cur:
|
||||
# 0. External concurrency view (always, even when idle)
|
||||
active = _active_claims(cur)
|
||||
_write_status_file(active)
|
||||
|
||||
# 1. Pick the next work item. Order matters — drain what's closest
|
||||
# to done first:
|
||||
# - review (rows that have a pr_url and need a re-test + merge)
|
||||
# - build (rows with a spec, awaiting the actual code work)
|
||||
# - spec (everything else, needs a spec written)
|
||||
# There is no separate `merge` phase: review transitions to
|
||||
# `merged` on a pass verdict (see _next_phase_on_verdict).
|
||||
item = (
|
||||
state.claim_for_review(cur)
|
||||
or state.claim_for_build(cur)
|
||||
or state.claim_for_spec(cur)
|
||||
)
|
||||
if not item:
|
||||
_log_line({"event": "idle", "active": len(active)})
|
||||
return summary
|
||||
|
||||
summary["claimed"] = f"{item['project']}/{item['story_id']}"
|
||||
log.info("claimed %s in phase %s", summary["claimed"], item["phase"])
|
||||
|
||||
# --- Txn 2: phase function (its own txn; can crash without locking) ----
|
||||
try:
|
||||
# Batch-claim loop (added 2026-06-27): one tick was claiming a single
|
||||
# row, which capped throughput at 1 spec/min regardless of DAMASCUS_MAX_
|
||||
# CONCURRENT or the taskiq worker pool size. Now we drain up to
|
||||
# `max_concurrent` rows per tick, ordered review→build→spec. Each row
|
||||
# runs its own LLM call in this process sequentially or in parallel
|
||||
# depending on the row count (see Txn 2). With max_concurrent=10 and
|
||||
# tick=15s, the upper bound is now ~40 specs/min instead of 1/min.
|
||||
#
|
||||
# PARALLEL_CAP (added 2026-06-27 after observing 429s on 10 concurrent
|
||||
# LLM calls): the LiteLLM proxy's per-IP rate limit (300 writes/min)
|
||||
# starts tripping when 10 calls land within ~2s. Capping parallel
|
||||
# LLM calls at PARALLEL_CAP_PER_TICK keeps the burst under the proxy's
|
||||
# per-second token allowance. The remaining rows stay claimed (their
|
||||
# `claimed_at` is fresh) and get processed by the NEXT tick.
|
||||
PARALLEL_CAP_PER_TICK = 5
|
||||
rows_this_tick: list[dict] = []
|
||||
for _ in range(settings.max_concurrent):
|
||||
with state.transaction() as cur:
|
||||
if item["phase"] == "build":
|
||||
result = phases.build(cur, item)
|
||||
elif item["phase"] == "review":
|
||||
result = phases.review(cur, item)
|
||||
else: # phase == 'spec'
|
||||
result = phases.refine_spec(cur, item)
|
||||
except Exception as e: # noqa: BLE001
|
||||
log.exception("phase error")
|
||||
result = {"verdict": "tests_failed", "feedback": {"error": str(e)[:500]}}
|
||||
|
||||
target_phase = _next_phase_on_verdict(item, result)
|
||||
|
||||
# --- Txn 3: verdict write ----------------------------------------------
|
||||
with state.transaction() as cur:
|
||||
# 3. Apply the verdict. Forward pr_url/branch/base_commit into the
|
||||
# row so the review phase can verify the build actually produced
|
||||
# a real PR, and so a follow-up retry (rebase_conflict) reuses
|
||||
# the same branch.
|
||||
verdict_feedback = dict(result["feedback"])
|
||||
extra_fields = {}
|
||||
if result["verdict"] == "pass" and item["phase"] == "build":
|
||||
if "pr_url" in verdict_feedback:
|
||||
extra_fields["pr_url"] = verdict_feedback["pr_url"]
|
||||
if "branch" in verdict_feedback:
|
||||
extra_fields["branch"] = verdict_feedback["branch"]
|
||||
if "commit" in verdict_feedback:
|
||||
extra_fields["base_commit"] = verdict_feedback["commit"]
|
||||
|
||||
# Amendment §4: `spec_ambiguous` does NOT consume the autonomous budget.
|
||||
# The claim already incremented attempts; roll it back so a human-blocked
|
||||
# question doesn't burn one of the row's N autonomous retries. The
|
||||
# budget resumes counting only on autonomous retries after the human
|
||||
# answers and the item returns to `spec`.
|
||||
if result["verdict"] == "spec_ambiguous" and item["phase"] == "spec":
|
||||
extra_fields["attempts"] = max(0, item["attempts"] - 1)
|
||||
|
||||
state.set_phase(cur, item["id"], target_phase,
|
||||
last_verdict=result["verdict"],
|
||||
last_feedback=verdict_feedback, **extra_fields)
|
||||
state.emit_event(cur, item["id"], "phase.transition", {
|
||||
"from": item["phase"], "to": target_phase,
|
||||
"verdict": result["verdict"], "feedback": verdict_feedback,
|
||||
})
|
||||
|
||||
# 3b. Loop-breaker: when a non-pass verdict exhausts the attempt
|
||||
# budget, the item is parked as `blocked` and surfaced to the
|
||||
# human via a human_issue (design doc §5 / §16). pass is exempt
|
||||
# (attempts are not consumed on success).
|
||||
if target_phase == "blocked":
|
||||
issue_id = state.open_human_issue(
|
||||
cur, item["id"],
|
||||
f"[{item['project']}/{item['story_id']}] blocked after "
|
||||
f"{item['attempts']}/{item['budget_cycles']} attempts "
|
||||
f"({result['verdict']}): {verdict_feedback}",
|
||||
# Refresh active-claims view per claim so the per-IP rate limit
|
||||
# can be reflected in active.json even when we exit early.
|
||||
active = _active_claims(cur)
|
||||
item = (
|
||||
state.claim_for_review(cur)
|
||||
or state.claim_for_build(cur)
|
||||
or state.claim_for_spec(cur)
|
||||
)
|
||||
state.emit_event(cur, item["id"], "work.blocked", {
|
||||
"verdict": result["verdict"],
|
||||
"attempts": item["attempts"],
|
||||
"budget_cycles": item["budget_cycles"],
|
||||
"issue_id": issue_id,
|
||||
"feedback": verdict_feedback,
|
||||
if not item:
|
||||
_write_status_file(active)
|
||||
break
|
||||
rows_this_tick.append(item)
|
||||
log.info("claimed %s in phase %s", f"{item['project']}/{item['story_id']}", item["phase"])
|
||||
|
||||
if not rows_this_tick:
|
||||
with state.transaction() as cur:
|
||||
active = _active_claims(cur)
|
||||
_write_status_file(active)
|
||||
_log_line({"event": "idle", "active": len(active)})
|
||||
return summary
|
||||
|
||||
# --- Txn 2: phase functions (one per claimed row, PARALLEL) -----------
|
||||
# Each row's LLM call is independent — no shared state between calls, the
|
||||
# only shared resource is the LiteLLM proxy (which already enforces a
|
||||
# per-IP rate limit we just bumped to 300 writes/min). With max_concurrent
|
||||
# = 10 we fan out up to 10 phase calls to a thread pool. Each thread
|
||||
# opens its own DB connection for the phase's transaction (psycopg
|
||||
# connections are thread-local — the connection pool handles concurrency).
|
||||
#
|
||||
# Why not parallel at the taskiq level? The scheduler enqueues one
|
||||
# run_cycle task per minute (cron `* * * * *`); we could enqueue N per
|
||||
# minute but that requires re-architecting the scheduler. Running the
|
||||
# LLM calls in parallel within ONE taskiq invocation is cheaper and
|
||||
# fits the existing scheduler cadence. If/when we want even more
|
||||
# parallelism, bump the cron cadence AND keep this thread pool.
|
||||
import concurrent.futures as cf
|
||||
results: list[tuple[dict, dict]] = [] # (item, result)
|
||||
rows_this_tick_first_batch = rows_this_tick[:PARALLEL_CAP_PER_TICK]
|
||||
if len(rows_this_tick_first_batch) <= 1 or settings.max_concurrent <= 1:
|
||||
# Sequential path — simpler, no threadpool spin-up cost.
|
||||
for item in rows_this_tick_first_batch:
|
||||
try:
|
||||
with state.transaction() as cur:
|
||||
if item["phase"] == "build":
|
||||
result = phases.build(cur, item)
|
||||
elif item["phase"] == "review":
|
||||
result = phases.review(cur, item)
|
||||
else: # phase == 'spec'
|
||||
result = phases.refine_spec(cur, item)
|
||||
except Exception as e: # noqa: BLE001
|
||||
log.exception("phase error")
|
||||
result = {"verdict": "tests_failed", "feedback": {"error": str(e)[:500]}}
|
||||
results.append((item, result))
|
||||
else:
|
||||
# Parallel path — each row opens its own DB transaction in its own
|
||||
# thread. The phase functions are pure I/O bound (LLM call), so
|
||||
# threads release the GIL during socket waits; we get real
|
||||
# parallelism from a thread pool, no need for processes.
|
||||
def _run_phase(item: dict) -> tuple[dict, dict]:
|
||||
try:
|
||||
with state.transaction() as cur:
|
||||
if item["phase"] == "build":
|
||||
result = phases.build(cur, item)
|
||||
elif item["phase"] == "review":
|
||||
result = phases.review(cur, item)
|
||||
else:
|
||||
result = phases.refine_spec(cur, item)
|
||||
except Exception as e: # noqa: BLE001
|
||||
log.exception("phase error")
|
||||
result = {"verdict": "tests_failed", "feedback": {"error": str(e)[:500]}}
|
||||
return item, result
|
||||
|
||||
with cf.ThreadPoolExecutor(max_workers=len(rows_this_tick_first_batch)) as ex:
|
||||
for item_result in ex.map(_run_phase, rows_this_tick_first_batch):
|
||||
results.append(item_result)
|
||||
|
||||
# The remaining rows (above PARALLEL_CAP_PER_TICK) stay claimed with
|
||||
# their `claimed_at` set. They will be released by `set_phase` clearing
|
||||
# claimed_at when the verdict is written, OR by the stale-claim filter
|
||||
# after 30min if something goes wrong. Either way, the next tick will
|
||||
# see them as unclaimed and pick them up. So we drop them from the
|
||||
# verdict-write loop below — they're handled by the next cycle.
|
||||
|
||||
# --- Txn 3: verdict write (one per claimed row) -----------------------
|
||||
# Each (item, result) gets its own transaction so a failure in one row's
|
||||
# verdict write doesn't roll back the others. The block also emits the
|
||||
# per-row phase.transition event and, for blocked rows, the human_issue
|
||||
# + work.blocked event pair.
|
||||
transitions: list[dict] = [] # [{from, to, verdict, claimed_label}]
|
||||
for item, result in results:
|
||||
target_phase = _next_phase_on_verdict(item, result)
|
||||
claimed_label = f"{item['project']}/{item['story_id']}"
|
||||
with state.transaction() as cur:
|
||||
# Apply the verdict. Forward pr_url/branch/base_commit into the
|
||||
# row so the review phase can verify the build actually produced
|
||||
# a real PR, and so a follow-up retry (rebase_conflict) reuses
|
||||
# the same branch.
|
||||
verdict_feedback = dict(result["feedback"])
|
||||
extra_fields: dict = {}
|
||||
if result["verdict"] == "pass" and item["phase"] == "build":
|
||||
if "pr_url" in verdict_feedback:
|
||||
extra_fields["pr_url"] = verdict_feedback["pr_url"]
|
||||
if "branch" in verdict_feedback:
|
||||
extra_fields["branch"] = verdict_feedback["branch"]
|
||||
if "commit" in verdict_feedback:
|
||||
extra_fields["base_commit"] = verdict_feedback["commit"]
|
||||
# 2026-06-27: GHOST-PASS FIX. A clean spec→build transition
|
||||
# returns verdict=pass (spec succeeded) and forwards spec_path
|
||||
# + spec preview into feedback. But this verdict+feedback is
|
||||
# SPEC data, not BUILD data — carrying it forward into the build
|
||||
# phase makes rows look like they already passed the build gate
|
||||
# even though no Claude invocation, no tests, no rebase, no push,
|
||||
# and no PR happened. Review() then refuses to advance them
|
||||
# because branch/pr_url are still NULL, but last_verdict=pass
|
||||
# lures operators into thinking the build worked.
|
||||
# Clear verdict+feedback on the spec→build transition so the
|
||||
# build phase starts with a clean slate. The spec_path is
|
||||
# preserved via the `spec_path` column (already written by
|
||||
# _write_spec_file in refine_spec) for the build phase to
|
||||
# locate the spec on disk.
|
||||
row_verdict = result["verdict"]
|
||||
row_feedback = verdict_feedback
|
||||
if result["verdict"] == "pass" and item["phase"] == "spec":
|
||||
row_verdict = None
|
||||
row_feedback = None
|
||||
|
||||
# Amendment §4: `spec_ambiguous` does NOT consume the autonomous budget.
|
||||
# The claim already incremented attempts; roll it back so a human-blocked
|
||||
# question doesn't burn one of the row's N autonomous retries. The
|
||||
# budget resumes counting only on autonomous retries after the human
|
||||
# answers and the item returns to `spec`.
|
||||
if result["verdict"] == "spec_ambiguous" and item["phase"] == "spec":
|
||||
extra_fields["attempts"] = max(0, item["attempts"] - 1)
|
||||
|
||||
state.set_phase(cur, item["id"], target_phase,
|
||||
last_verdict=row_verdict,
|
||||
last_feedback=row_feedback, **extra_fields)
|
||||
state.emit_event(cur, item["id"], "phase.transition", {
|
||||
"from": item["phase"], "to": target_phase,
|
||||
"verdict": result["verdict"], "feedback": verdict_feedback,
|
||||
})
|
||||
|
||||
# 4. Refresh external status
|
||||
active = _active_claims(cur)
|
||||
_write_status_file(active)
|
||||
# Loop-breaker: when a non-pass verdict exhausts the attempt
|
||||
# budget, the item is parked as `blocked` and surfaced to the
|
||||
# human via a human_issue (design doc §5 / §16). pass is exempt
|
||||
# (attempts are not consumed on success).
|
||||
if target_phase == "blocked":
|
||||
issue_id = state.open_human_issue(
|
||||
cur, item["id"],
|
||||
f"[{item['project']}/{item['story_id']}] blocked after "
|
||||
f"{item['attempts']}/{item['budget_cycles']} attempts "
|
||||
f"({result['verdict']}): {verdict_feedback}",
|
||||
)
|
||||
state.emit_event(cur, item["id"], "work.blocked", {
|
||||
"verdict": result["verdict"],
|
||||
"attempts": item["attempts"],
|
||||
"budget_cycles": item["budget_cycles"],
|
||||
"issue_id": issue_id,
|
||||
"feedback": verdict_feedback,
|
||||
})
|
||||
|
||||
summary["transition"] = {
|
||||
"from": item["phase"], "to": target_phase,
|
||||
"verdict": result["verdict"],
|
||||
}
|
||||
|
||||
# 5. One-line relay (outside the txn so webhook hiccups don't roll back)
|
||||
if summary["claimed"] and summary["transition"]:
|
||||
# Per-row one-line relay (outside the txn so webhook hiccups don't
|
||||
# roll back). Each row gets its own line so the operator can see
|
||||
# all transitions in this tick from the relay log.
|
||||
line = (
|
||||
f"[{settings.concurrency_id}] {summary['claimed']}: "
|
||||
f"{summary['transition']['from']} → {summary['transition']['to']} "
|
||||
f"({summary['transition']['verdict']})"
|
||||
f"[{settings.concurrency_id}] {claimed_label}: "
|
||||
f"{item['phase']} → {target_phase} ({result['verdict']})"
|
||||
)
|
||||
relay.post(line)
|
||||
_log_line({"event": "transition", **summary, "elapsed_ms": int((time.time()-start)*1000)})
|
||||
transitions.append({
|
||||
"claimed": claimed_label,
|
||||
"from": item["phase"], "to": target_phase,
|
||||
"verdict": result["verdict"],
|
||||
})
|
||||
|
||||
# Final status refresh + tick summary
|
||||
with state.transaction() as cur:
|
||||
active = _active_claims(cur)
|
||||
_write_status_file(active)
|
||||
summary["claimed"] = ", ".join(t["claimed"] for t in transitions)
|
||||
summary["transition"] = transitions if len(transitions) > 1 else (transitions[0] if transitions else None)
|
||||
_log_line({"event": "transition", "tick": summary, "elapsed_ms": int((time.time()-start)*1000)})
|
||||
return summary
|
||||
|
||||
|
||||
|
||||
27
src/damascus/db/migrations/0007_first_attempted_at.sql
Normal file
27
src/damascus/db/migrations/0007_first_attempted_at.sql
Normal file
@@ -0,0 +1,27 @@
|
||||
-- ADR-005: distinguish transient vs structural tests_failed.
|
||||
--
|
||||
-- Adds a `first_attempted_at` column to work_items. Populated by the claim
|
||||
-- functions (state.claim_for_build / claim_for_spec / claim_for_review) on
|
||||
-- the FIRST claim for each row; NULL until then.
|
||||
--
|
||||
-- Used by cycle.py to escalate persistent transient retries to `blocked`
|
||||
-- after 24h: when feedback.transient=True AND NOW() - first_attempted_at
|
||||
-- > INTERVAL '24 hours', the row goes to blocked + opens a human_issue.
|
||||
--
|
||||
-- Backfilled from updated_at so the existing rows get a sensible value (the
|
||||
-- first time anyone touched the row since its last update). For brand-new
|
||||
-- rows inserted via upsert_story, the column stays NULL until the first
|
||||
-- claim — the claim itself populates it.
|
||||
--
|
||||
-- Forward-compatible: column is nullable, default NULL, no NOT NULL constraint,
|
||||
-- so an older orchestrator binary can still read/write the table.
|
||||
|
||||
ALTER TABLE work_items
|
||||
ADD COLUMN IF NOT EXISTS first_attempted_at TIMESTAMPTZ DEFAULT NULL;
|
||||
|
||||
-- Backfill: existing rows that haven't been claimed yet have first_attempted_at
|
||||
-- NULL. We backfill from updated_at for any non-NULL updated_at so the 24h
|
||||
-- escalation window has a starting reference. New rows handled by claim_for_*.
|
||||
UPDATE work_items
|
||||
SET first_attempted_at = updated_at
|
||||
WHERE first_attempted_at IS NULL;
|
||||
@@ -236,9 +236,15 @@ class DamascusMcpServer(Server):
|
||||
|
||||
This subclass instead makes ``mcp.list_tools()`` a regular method
|
||||
that returns the registered tool catalog directly. The list-tools
|
||||
handler is registered explicitly via
|
||||
``mcp.request_handlers[ListToolsRequest] = ...`` (the same internal
|
||||
API the decorator uses), preserving protocol correctness.
|
||||
AND call-tool handlers are registered explicitly via
|
||||
``mcp.request_handlers[...] = ...`` (the same internal API the
|
||||
decorators use), preserving protocol correctness and making the
|
||||
wiring visible without chasing decorator semantics.
|
||||
|
||||
The call-tool handler is registered the same way (see
|
||||
``_call_tool_handler`` below) so that both handlers follow the
|
||||
same registration pattern, and operators reading this file can
|
||||
see the full dispatch table in one place.
|
||||
"""
|
||||
|
||||
def list_tools(self) -> list[Tool]:
|
||||
@@ -251,22 +257,32 @@ mcp = DamascusMcpServer("damascus-mcp")
|
||||
|
||||
# Register the list-tools handler manually so the decorator form is
|
||||
# not needed. Same internal API the SDK's @mcp.list_tools() decorator
|
||||
# uses.
|
||||
# uses — but we extend it to populate ``mcp._tool_cache`` so the SDK's
|
||||
# input-validation pipeline (used by the call_tool handler below) can
|
||||
# look tool definitions up by name.
|
||||
async def _handle_list_tools() -> list[Tool]:
|
||||
"""Return the seven registered tools."""
|
||||
return TOOLS
|
||||
|
||||
|
||||
from mcp.types import ListToolsRequest # noqa: E402 (after mcp defined)
|
||||
from mcp.types import ListToolsRequest, ListToolsResult, ServerResult # noqa: E402 (after mcp defined)
|
||||
|
||||
|
||||
#: The handler is a coroutine that returns the catalog. Wrap it the
|
||||
#: same way the SDK's decorator does so the SDK's internal call path
|
||||
#: works unchanged.
|
||||
async def _list_tools_handler(req: ListToolsRequest) -> Any:
|
||||
from mcp.types import ListToolsResult, ServerResult
|
||||
"""Wrap the catalog in a ServerResult(ListToolsResult(...)) and
|
||||
populate ``mcp._tool_cache`` so SDK validation can find tools by name.
|
||||
|
||||
The SDK's own ``@mcp.list_tools()`` decorator does this transparently;
|
||||
because we register the handler manually, we have to replicate the
|
||||
cache-refresh logic or input validation in the call_tool pipeline
|
||||
will warn "Tool X not listed, no validation will be performed".
|
||||
"""
|
||||
result = await _handle_list_tools()
|
||||
# Refresh the SDK's tool cache so subsequent _get_cached_tool_definition
|
||||
# calls succeed. Mirrors the SDK's own behavior at lowlevel/server.py:451.
|
||||
mcp._tool_cache.clear()
|
||||
for tool in result:
|
||||
mcp._tool_cache[tool.name] = tool
|
||||
return ServerResult(ListToolsResult(tools=result))
|
||||
|
||||
|
||||
@@ -357,7 +373,6 @@ async def _dispatch(
|
||||
return [TextContent(type="text", text=json.dumps(payload))]
|
||||
|
||||
|
||||
@mcp.call_tool()
|
||||
async def _handle_call_tool(
|
||||
name: str,
|
||||
arguments: dict[str, Any],
|
||||
@@ -366,6 +381,40 @@ async def _handle_call_tool(
|
||||
return await _dispatch(name, arguments)
|
||||
|
||||
|
||||
# Register the call-tool handler manually so the wiring is explicit and
|
||||
# mirrors the ListToolsRequest pattern. The SDK's ``@mcp.call_tool()``
|
||||
# decorator does the same registration internally but adds a closure
|
||||
# that does input validation against ``mcp._tool_cache``. We use the
|
||||
# same internal ``request_handlers`` API the decorator uses; the SDK's
|
||||
# ``_handle_request`` method (lowlevel/server.py:722) dispatches from
|
||||
# this dict.
|
||||
async def _call_tool_handler(req: CallToolRequest) -> Any:
|
||||
"""Dispatch a ``tools/call`` request.
|
||||
|
||||
Mirrors the SDK's ``@mcp.call_tool()`` shape: pull ``name`` and
|
||||
``arguments`` off the request, run the tool, wrap the result in a
|
||||
``ServerResult(CallToolResult(...))``. Errors from the tool become
|
||||
``CallToolResult(isError=True, ...)`` — the SDK's protocol layer
|
||||
surfaces these as JSON-RPC responses with ``isError=True``, not
|
||||
as protocol errors (the call DID complete, just unsuccessfully).
|
||||
"""
|
||||
name = req.params.name
|
||||
arguments = req.params.arguments or {}
|
||||
try:
|
||||
content = await _handle_call_tool(name, arguments)
|
||||
except Exception as exc:
|
||||
return ServerResult(
|
||||
CallToolResult(
|
||||
content=[TextContent(type="text", text=str(exc))],
|
||||
isError=True,
|
||||
)
|
||||
)
|
||||
return ServerResult(CallToolResult(content=list(content), isError=False))
|
||||
|
||||
|
||||
mcp.request_handlers[CallToolRequest] = _call_tool_handler
|
||||
|
||||
|
||||
# --- public asyncio API for tests -------------------------------------------
|
||||
|
||||
|
||||
|
||||
@@ -8,6 +8,7 @@ result, and verifies the diff before opening a PR.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
@@ -19,6 +20,25 @@ from .config import settings
|
||||
|
||||
# --- Phase 1: spec --------------------------------------------------------
|
||||
|
||||
# ADR-005: 6 known transient error patterns. Match as exact, case-sensitive
|
||||
# substrings on the build-phase error string. Adding a new pattern means
|
||||
# appending here AND documenting it in the ADR.
|
||||
_TRANSIENT_PATTERNS = (
|
||||
"project repo not found at", # missing clone
|
||||
"worktree setup:", # lock/contention
|
||||
"Connection refused", # port not up yet
|
||||
"Could not resolve host", # DNS transient
|
||||
"TLS handshake timeout", # cert rollout
|
||||
"rate limit", # 429
|
||||
)
|
||||
|
||||
|
||||
def is_transient(err: str) -> bool:
|
||||
"""Return True if the build-phase error string matches a known transient
|
||||
pattern (ADR-005). Case-sensitive substring match."""
|
||||
return any(p in err for p in _TRANSIENT_PATTERNS)
|
||||
|
||||
|
||||
def refine_spec(cur, item: dict) -> dict:
|
||||
"""Read the BMAD story + architecture, ask the LLM to produce a TDD spec.
|
||||
Writes the spec to the project repo's spec dir. On ambiguity, opens a
|
||||
@@ -30,6 +50,14 @@ def refine_spec(cur, item: dict) -> dict:
|
||||
bmad_story = _find_bmad_story(project, story_id)
|
||||
arch = _find_architecture(project)
|
||||
|
||||
# Inject previously-answered human_issues as authoritative decisions so
|
||||
# the refiner does not re-ask the same questions across rounds. Without
|
||||
# this, the refiner starts fresh from the BMAD file on every spec phase
|
||||
# claim, peeling back the same layer 3-4 times (validated 2026-06-26
|
||||
# across S1, S9, S29, S33, architecture). The human's prior decisions
|
||||
# become facts in the prompt — the refiner lifts them instead of asking.
|
||||
prior_decisions = _format_prior_decisions(cur, item["id"])
|
||||
|
||||
system = (
|
||||
"You are a spec refiner. Given a BMAD story and a project's architecture, "
|
||||
"produce an implementable spec. Output ONLY valid Markdown, no preamble."
|
||||
@@ -45,24 +73,41 @@ def refine_spec(cur, item: dict) -> dict:
|
||||
f"# Project\n{project}\n\n# Story\n{title}\n\n"
|
||||
f"# BMAD story file\n{bmad_story or '(missing)'}\n\n"
|
||||
f"# Architecture\n{arch or '(missing)'}\n\n"
|
||||
f"{prior_decisions}"
|
||||
f"# Row constraints\n"
|
||||
f"- declared file_scope = {file_scope!r}\n"
|
||||
f"- budget_cycles = {budget_cycles}\n"
|
||||
f"- attempts = {item.get('attempts', 0)}\n\n"
|
||||
"Honor the declared file_scope exactly: only the paths/globs listed are "
|
||||
"in scope for the implementation. Do not propose additional files.\n\n"
|
||||
"Prior human decisions (see # Prior decisions above) are AUTHORITATIVE — "
|
||||
"do not re-ask anything that was already answered. Lift those decisions "
|
||||
"into the spec directly. Only open new ambiguities in ## Ambiguities "
|
||||
"for things genuinely not yet decided.\n\n"
|
||||
"Write a Markdown spec with these sections:\n"
|
||||
"## Goal\n## Acceptance Criteria (numbered)\n## TDD Plan (list the failing tests)\n"
|
||||
"## Goal\n## Acceptance Criteria (numbered)\n"
|
||||
"## TDD Plan (list the failing tests; for end-to-end or integration-only "
|
||||
"stories — e.g. verify-gate, e2e Playwright flows, MCP integration — "
|
||||
"list integration checks instead of unit tests, e.g. "
|
||||
"`1. failing integration: <curl/Playwright/MCP assertion>`; "
|
||||
"an empty list is NOT acceptable)\n"
|
||||
"## File Scope (list of paths/globs the implementation may touch)\n"
|
||||
"## Test Command (the exact shell command that proves done)\n"
|
||||
"## Ambiguities (any open questions for a human)\n"
|
||||
"## Ambiguities (any NEW open questions for a human — leave empty if prior decisions cover everything)\n"
|
||||
)
|
||||
try:
|
||||
# 4000 tokens to fit Goal + Acceptance Criteria + TDD Plan + Test Command +
|
||||
# File Scope + Ambiguities without truncation. min-max-m3 (a 1M-ctx model)
|
||||
# has plenty of room; the old 1500 was hitting the cap and producing
|
||||
# `spec_wrong` because Test Command got cut off.
|
||||
result = llm.complete(user, system=system, max_tokens=4000)
|
||||
# 6000 tokens: fits Goal + Acceptance Criteria + TDD Plan (now longer with
|
||||
# the end-to-end / integration soft contract) + File Scope + Test Command +
|
||||
# Ambiguities without truncation. The old 4000 was hitting the cap on
|
||||
# non-trivial stories and producing `spec_wrong` because Test Command and/or
|
||||
# TDD Plan sections got cut off. Bumped 2026-06-26 alongside the TDD-Plan
|
||||
# prompt softening (see PR-comment thread on the spec_refiner).
|
||||
# max_tokens: 12000 was too aggressive — caused some E2E-flow specs
|
||||
# (S17-verify-gate-canvas-e2e, S32-verify-gate-e2e) to truncate mid-
|
||||
# section and fall back to spec_ambiguous. Bumped back to 20000
|
||||
# (between the 6000 / 50000 extremes) on 2026-06-27 to leave room
|
||||
# for long AC lists and multi-viewport E2E flows without truncating.
|
||||
result = llm.complete(user, system=system, max_tokens=20000)
|
||||
except llm.LLMError as e:
|
||||
return _verdict("spec_ambiguous", {"error": str(e)})
|
||||
|
||||
@@ -83,15 +128,33 @@ def refine_spec(cur, item: dict) -> dict:
|
||||
"input_tokens": result["input_tokens"],
|
||||
"output_tokens": result["output_tokens"], "usd": result["usd"],
|
||||
})
|
||||
|
||||
ambiguities_section = _section(text, "Ambiguities")
|
||||
# Per spec-refiner-contract.md §3: any non-empty `## Ambiguities` section
|
||||
# triggers the awaiting_human channel. The previous implementation required
|
||||
# the section to end with a question-mark character, which silently
|
||||
# swallowed list-style ambiguities (e.g. "- the auth model is unclear
|
||||
# because of X") and routed them to build with the human never seeing
|
||||
# the issue.
|
||||
ambiguities_section = _section(text, "Ambiguities")
|
||||
if "## Ambiguities" in text and ambiguities_section.strip():
|
||||
#
|
||||
# Soft-pass for "no real ambiguity" content (validated 2026-06-26): when
|
||||
# the refiner has prior decisions injected and concludes nothing new is
|
||||
# open, it writes things like "None." or "Prior decisions cover all
|
||||
# open questions" in the section. Those should NOT block on awaiting_human
|
||||
# — the spec is ready. Only route to awaiting_human when there's a
|
||||
# genuine unresolved question.
|
||||
_SOFT_PASS_MARKERS = (
|
||||
"none.", "none —", "none -", "none ", "(none)", "no new", "no additional",
|
||||
"prior decision", "prior operator decision", "nothing new", "all resolved",
|
||||
"already decided", "all settled", "settled by prior", "nothing left",
|
||||
"covered by prior", "lifted from prior", "from prior decision",
|
||||
)
|
||||
amb_lower = ambiguities_section.strip().lower()
|
||||
# Match on markers alone — the LLM is verbose about confirming nothing's
|
||||
# open ("None — all substantive questions were resolved in prior decisions
|
||||
# (...)"), so a length limit would be brittle. Any of these markers in
|
||||
# the section body means the refiner believes the spec is complete.
|
||||
is_soft_pass = any(m in amb_lower for m in _SOFT_PASS_MARKERS)
|
||||
if "## Ambiguities" in text and ambiguities_section.strip() and not is_soft_pass:
|
||||
issue_id = state.open_human_issue(
|
||||
cur, item["id"], f"[{project}/{story_id}] {title}: {_section(result['text'], 'Ambiguities')}"
|
||||
)
|
||||
@@ -107,6 +170,20 @@ def refine_spec(cur, item: dict) -> dict:
|
||||
)
|
||||
|
||||
|
||||
def _write_claude_settings_local(worktree: Path) -> None:
|
||||
"""DEPRECATED: kept for reference. The build phase now passes the
|
||||
Bash allow-list inline via Claude Code's `--settings` flag (see
|
||||
`phases.build` and `_run_claude_in_worktree`). Writing a
|
||||
`.claude/settings.local.json` file into the worktree was rejected by
|
||||
the scope-check because the file appeared in `git status` and was not
|
||||
in the spec's declared File Scope. Inline `--settings` is ephemeral
|
||||
and doesn't touch the working tree, so the scope-check stays clean."""
|
||||
raise NotImplementedError(
|
||||
"Inline --settings replaces on-disk settings.local.json. "
|
||||
"See phases.build for the allow-list source of truth."
|
||||
)
|
||||
|
||||
|
||||
# --- Phase 2: build -------------------------------------------------------
|
||||
|
||||
def build(cur, item: dict) -> dict:
|
||||
@@ -117,7 +194,7 @@ def build(cur, item: dict) -> dict:
|
||||
story_id = item["story_id"]
|
||||
spec_path = item.get("spec_path") or _find_spec_file(project, story_id)
|
||||
if not spec_path:
|
||||
return _verdict("tests_failed", {"error": "spec file not found"})
|
||||
return _transient_verdict("tests_failed", {"error": "spec file not found"})
|
||||
|
||||
spec_text = Path(spec_path).read_text()
|
||||
test_cmd = _section(spec_text, "Test Command") or "echo 'no test command'"
|
||||
@@ -128,7 +205,7 @@ def build(cur, item: dict) -> dict:
|
||||
wt = _worktree_path(project, story_id)
|
||||
repo_dir = _project_repo_dir(project)
|
||||
if not repo_dir.exists():
|
||||
return _verdict(
|
||||
return _transient_verdict(
|
||||
"tests_failed",
|
||||
{"error": f"project repo not found at {repo_dir}; "
|
||||
f"clone the Gitea repo into /workspace/projects/{project} "
|
||||
@@ -138,7 +215,39 @@ def build(cur, item: dict) -> dict:
|
||||
try:
|
||||
git_ops.ensure_worktree(repo_dir, wt, branch, base_commit)
|
||||
except RuntimeError as e:
|
||||
return _verdict("tests_failed", {"error": f"worktree setup: {e}"})
|
||||
return _transient_verdict("tests_failed", {"error": f"worktree setup: {e}"})
|
||||
|
||||
# The Bash allow-list is passed inline via Claude Code's `--settings`
|
||||
# flag rather than written into the worktree as `.claude/settings.local.json`.
|
||||
# Writing the file into the worktree would (a) show up in `git status` and
|
||||
# trip the scope-check in `phases.build`, and (b) get committed on the
|
||||
# story branch by `git_ops.commit_all`. Inline `--settings` is ephemeral,
|
||||
# scoped to one Claude Code invocation, and doesn't touch the working tree.
|
||||
#
|
||||
# `--permission-mode acceptEdits` honors this allow-list. Without it,
|
||||
# even `npm install` / `git status` inside the worktree gets a permission
|
||||
# prompt that --print mode can't answer, and the build dies at max-turns.
|
||||
claude_settings = json.dumps({
|
||||
"permissions": {
|
||||
"allow": [
|
||||
# Project tooling
|
||||
"Bash(npm:*)", "Bash(npx:*)", "Bash(node:*)",
|
||||
"Bash(yarn:*)", "Bash(pnpm:*)",
|
||||
"Bash(git:*)", "Bash(playwright:*)",
|
||||
# Read-only inspection
|
||||
"Read", "Glob", "Grep",
|
||||
# Writes
|
||||
"Edit", "Write", "NotebookEdit",
|
||||
# Common shell utilities used during scaffold/test loops
|
||||
"Bash(ls:*)", "Bash(cat:*)", "Bash(head:*)", "Bash(tail:*)",
|
||||
"Bash(find:*)", "Bash(grep:*)", "Bash(rg:*)",
|
||||
"Bash(cp:*)", "Bash(mv:*)", "Bash(rm:*)", "Bash(mkdir:*)",
|
||||
"Bash(echo:*)", "Bash(curl:*)", "Bash(touch:*)",
|
||||
"Bash(env)", "Bash(which:*)", "Bash(test:*)",
|
||||
"Bash(pwd)", "Bash(true)", "Bash(false)",
|
||||
],
|
||||
},
|
||||
})
|
||||
|
||||
# Drive Claude Code (one focused, single-action prompt per call).
|
||||
system = (
|
||||
@@ -156,9 +265,9 @@ def build(cur, item: dict) -> dict:
|
||||
'{"files_touched": ["<path>", ...], "summary": "<one-line>"}\n'
|
||||
)
|
||||
try:
|
||||
result = _run_claude_in_worktree(wt, user, system=system)
|
||||
result = _run_claude_in_worktree(wt, user, system=system, settings_json=claude_settings)
|
||||
except llm.LLMError as e:
|
||||
return _verdict("tests_failed", {"error": f"claude-code: {e}"})
|
||||
return _transient_verdict("tests_failed", {"error": f"claude-code: {e}"})
|
||||
|
||||
state.record_cost(cur, item["id"], project, "build", result["model"],
|
||||
result["input_tokens"], result["output_tokens"], result["usd"])
|
||||
@@ -168,7 +277,7 @@ def build(cur, item: dict) -> dict:
|
||||
# declared it, the reviewer enforces it).
|
||||
diff_files = _changed_files(wt)
|
||||
if file_scope and any(_path_outside_scope(f, file_scope) for f in diff_files):
|
||||
return _verdict(
|
||||
return _transient_verdict(
|
||||
"tests_failed",
|
||||
{"error": "scope violation", "out_of_scope": [
|
||||
f for f in diff_files if _path_outside_scope(f, file_scope)
|
||||
@@ -180,10 +289,10 @@ def build(cur, item: dict) -> dict:
|
||||
proc = subprocess.run(["bash", "-lc", test_cmd], cwd=wt, timeout=900,
|
||||
capture_output=True, text=True)
|
||||
except subprocess.TimeoutExpired:
|
||||
return _verdict("tests_failed", {"test_cmd": test_cmd, "error": "timeout"})
|
||||
return _transient_verdict("tests_failed", {"test_cmd": test_cmd, "error": "timeout"})
|
||||
|
||||
if proc.returncode != 0:
|
||||
return _verdict("tests_failed", {"test_cmd": test_cmd, "stderr": proc.stderr[-2000:],
|
||||
return _transient_verdict("tests_failed", {"test_cmd": test_cmd, "stderr": proc.stderr[-2000:],
|
||||
"stdout": proc.stdout[-500:]})
|
||||
|
||||
# Rebase onto main. Conflict = rebase_conflict.
|
||||
@@ -203,7 +312,8 @@ def build(cur, item: dict) -> dict:
|
||||
)
|
||||
|
||||
|
||||
def _run_claude_in_worktree(worktree: Path, prompt: str, system: str) -> dict:
|
||||
def _run_claude_in_worktree(worktree: Path, prompt: str, system: str,
|
||||
settings_json: str | None = None) -> dict:
|
||||
"""Invoke Claude Code to do the actual code work.
|
||||
|
||||
Two paths, selected by settings.use_ollama_wrapper:
|
||||
@@ -214,8 +324,19 @@ def _run_claude_in_worktree(worktree: Path, prompt: str, system: str) -> dict:
|
||||
ANTHROPIC_BASE_URL pointed at LiteLLM. This is the default
|
||||
in the homelab container; LiteLLM in turn routes
|
||||
`minimax-m3` to the cloud model.
|
||||
|
||||
`settings_json` (when provided) is passed via `--settings` so that
|
||||
Claude Code's permissions allow-list covers the Bash commands the
|
||||
build phase needs (npm, git, playwright, …). Without it, the model's
|
||||
first `npm install` or `git status` blocks on a permission prompt that
|
||||
--print mode can't answer, and the build dies at max-turns.
|
||||
"""
|
||||
full_prompt = f"{system}\n\n---\n\n{prompt}" if system else prompt
|
||||
# `--settings` accepts a JSON string OR a path to a JSON file. We
|
||||
# always pass a JSON string here so we don't write a settings file into
|
||||
# the worktree (which would show up in `git status` and trip the
|
||||
# scope-check downstream).
|
||||
settings_args = ["--settings", settings_json] if settings_json else []
|
||||
if settings.use_ollama_wrapper:
|
||||
cmd = [
|
||||
settings.ollama_bin, "launch", "claude",
|
||||
@@ -223,20 +344,22 @@ def _run_claude_in_worktree(worktree: Path, prompt: str, system: str) -> dict:
|
||||
"--", "--bare", "--print",
|
||||
"--max-turns", str(settings.claude_max_turns),
|
||||
"--permission-mode", settings.claude_permission_mode,
|
||||
*settings_args,
|
||||
full_prompt,
|
||||
]
|
||||
else:
|
||||
env = {
|
||||
"ANTHROPIC_BASE_URL": settings.anthropic_base_url,
|
||||
"ANTHROPIC_API_KEY": settings.llm_api_key or "sk-no-auth-needed-for-litellm",
|
||||
"ANTHROPIC_API_KEY": settings.llm_api_key or "sk-no-...ellm",
|
||||
}
|
||||
cmd = [
|
||||
settings.claude_bin, "--bare", "--print",
|
||||
"--max-turns", str(settings.claude_max_turns),
|
||||
"--permission-mode", settings.claude_permission_mode,
|
||||
"--model", settings.claude_model,
|
||||
*settings_args,
|
||||
full_prompt,
|
||||
]
|
||||
]
|
||||
try:
|
||||
proc = subprocess.run(
|
||||
cmd, cwd=worktree, capture_output=True, text=True,
|
||||
@@ -279,17 +402,40 @@ def _run_claude_in_worktree(worktree: Path, prompt: str, system: str) -> dict:
|
||||
|
||||
|
||||
def _changed_files(worktree: Path) -> list[str]:
|
||||
out = subprocess.run(
|
||||
"""List files modified or added in the worktree (relative paths).
|
||||
|
||||
`git status --porcelain` covers both modified (M / M) and untracked (??)
|
||||
entries; `git diff --name-only HEAD` adds tracked-but-not-yet-committed
|
||||
edits. Combining them gives a complete picture of what Claude Code
|
||||
touched. The two-char porcelain prefix is `XY` where X is the index
|
||||
status and Y is the worktree status; both can be `.` for unmodified, or
|
||||
`?` for untracked, etc. We strip the first three chars (`XY ` or `?? `)
|
||||
and keep the filename.
|
||||
"""
|
||||
diff = subprocess.run(
|
||||
["git", "diff", "--name-only", "HEAD"],
|
||||
cwd=worktree, capture_output=True, text=True, check=False,
|
||||
)
|
||||
out2 = subprocess.run(
|
||||
status = subprocess.run(
|
||||
["git", "status", "--porcelain"],
|
||||
cwd=worktree, capture_output=True, text=True, check=False,
|
||||
)
|
||||
files = set()
|
||||
for line in (out.stdout + out2.stdout).splitlines():
|
||||
m = re.match(r"^??\s+(.*)$", line) or re.match(r"^..\s+(.*)$", line)
|
||||
files: set[str] = set()
|
||||
# `git diff --name-only` output: one path per line, no prefix. Anything
|
||||
# not starting with `:` (rename/copy markers) is fine; we just need names.
|
||||
for line in diff.stdout.splitlines():
|
||||
line = line.strip()
|
||||
if line:
|
||||
files.add(line)
|
||||
# `git status --porcelain` output: "<XY> <path>" where X and Y are each
|
||||
# one of `?`, `.`, `M`, `A`, `D`, `R`, `C`, `U`. We skip the first 3
|
||||
# chars (status + space) and keep the rest. `re.escape` on the prefix
|
||||
# chars avoids "nothing to repeat" bugs when the prefix is `??`.
|
||||
for line in status.stdout.splitlines():
|
||||
if len(line) < 4:
|
||||
continue
|
||||
prefix = re.escape(line[:2]) + r"\s+"
|
||||
m = re.match(r"^" + prefix + r"(.*)$", line)
|
||||
if m:
|
||||
files.add(m.group(1).strip())
|
||||
return sorted(files)
|
||||
@@ -383,9 +529,27 @@ def _verdict(v: str, feedback: dict) -> dict:
|
||||
return {"verdict": v, "feedback": feedback}
|
||||
|
||||
|
||||
def _transient_verdict(v: str, feedback: dict) -> dict:
|
||||
"""Annotate a verdict's feedback with `transient=True` when the error
|
||||
string matches a known transient pattern (ADR-005). Non-transient
|
||||
errors leave the field absent to preserve backward compatibility."""
|
||||
err = feedback.get("error") or ""
|
||||
if is_transient(err):
|
||||
feedback = {**feedback, "transient": True}
|
||||
return _verdict(v, feedback)
|
||||
|
||||
|
||||
def _section(text: str, name: str) -> str:
|
||||
m = re.search(rf"^##\s+{re.escape(name)}\s*\n(.*?)(?=\n##\s+|\Z)", text, re.S | re.M)
|
||||
return (m.group(1).strip() if m else "")
|
||||
# The prompt's section headers may carry a parenthesized description,
|
||||
# e.g. `## TDD Plan (list the failing tests)`. Accept an optional
|
||||
# `(...)` suffix on the section name so the post-check matches what
|
||||
# the LLM actually emits. Regression-tested in
|
||||
# tests/unit/test_phases_section.py.
|
||||
m = re.search(
|
||||
rf"^##\s+{re.escape(name)}\s*(\([^)]*\))?\s*\n(.*?)(?=\n##\s+|\Z)",
|
||||
text, re.S | re.M,
|
||||
)
|
||||
return (m.group(2).strip() if m else "")
|
||||
|
||||
|
||||
def _parse_file_scope(text: str) -> list[str]:
|
||||
@@ -407,6 +571,32 @@ def _find_spec_file(project: str, story_id: str) -> str | None:
|
||||
return str(p) if p.exists() else None
|
||||
|
||||
|
||||
def _format_prior_decisions(cur, work_id: str) -> str:
|
||||
"""Pull every answered human_issue for this work_item and render them as
|
||||
an authoritative 'Prior decisions' block to inject into the spec_refiner
|
||||
prompt. Returns an empty string when there are no prior decisions.
|
||||
|
||||
The refiner otherwise starts fresh from the BMAD file on every spec
|
||||
phase claim, re-asking the same questions across rounds (validated
|
||||
2026-06-26 across S1/S9/S29/S33/architecture — 3+ rounds each).
|
||||
Surfacing the operator's prior answers as facts makes the spec phase
|
||||
converge in one or two passes instead of peeling back the same layer.
|
||||
"""
|
||||
rows = state.resolve_human_issues_for(cur, work_id)
|
||||
if not rows:
|
||||
return ""
|
||||
parts = ["# Prior decisions (operator-answered — treat as authoritative)\n\n"]
|
||||
for i, r in enumerate(rows, 1):
|
||||
question = (r.get("question") or "").strip()
|
||||
answer = (r.get("answer") or "").strip()
|
||||
if not question or not answer:
|
||||
continue
|
||||
parts.append(f"## Decision {i}\n\n**Question:**\n\n{question}\n\n")
|
||||
parts.append(f"**Decision:**\n\n{answer}\n\n")
|
||||
parts.append("---\n\n")
|
||||
return "".join(parts)
|
||||
|
||||
|
||||
def _find_bmad_story(project: str, story_id: str) -> str | None:
|
||||
p = settings.bmad_dir / project / "_bmad-output" / "planning-artifacts"
|
||||
if not p.exists():
|
||||
|
||||
@@ -90,13 +90,20 @@ def claim_for_spec(cur) -> dict | None:
|
||||
cycle then calls refine_spec on it.
|
||||
|
||||
Honors the stale-claim filter (wiki/concepts/state-resume-protocol.md):
|
||||
a row claimed < STALE_CLAIM_MINUTES ago by a live worker is not reclaimable."""
|
||||
a row claimed < STALE_CLAIM_MINUTES ago by a live worker is not reclaimable.
|
||||
|
||||
Order changed 2026-06-27 to drain cheap wins first: rows with fewer
|
||||
prior attempts get claimed before ones that have already been tried
|
||||
multiple times. This biases the scheduler toward fresh/converging
|
||||
stories and prevents one stuck story (high attempts, repeatedly
|
||||
re-emitting questions) from monopolizing the claim queue.
|
||||
"""
|
||||
sql = f"""
|
||||
SELECT id FROM work_items
|
||||
WHERE phase = 'spec'
|
||||
AND attempts < budget_cycles
|
||||
{STALE_CLAIM_SQL}
|
||||
ORDER BY priority ASC, updated_at ASC
|
||||
ORDER BY attempts ASC, priority ASC, updated_at ASC
|
||||
LIMIT 1
|
||||
FOR UPDATE SKIP LOCKED
|
||||
"""
|
||||
@@ -160,10 +167,20 @@ def claim_for_review(cur) -> dict | None:
|
||||
# --- writes ---------------------------------------------------------------
|
||||
|
||||
def upsert_story(cur, project: str, story_id: str, title: str, file_scope: list) -> str:
|
||||
"""Create or update a story row. Returns its id."""
|
||||
"""Create or update a story row. Returns its id.
|
||||
|
||||
2026-06-27: previously short-circuited on existing rows without
|
||||
updating title/file_scope, so re-ingest never backfilled the parsed
|
||||
file_scope. Now refreshes title and file_scope on every call so
|
||||
BMAD-source-of-truth is enforced.
|
||||
"""
|
||||
cur.execute("SELECT id FROM work_items WHERE project=%s AND story_id=%s", (project, story_id))
|
||||
existing = cur.fetchone()
|
||||
if existing:
|
||||
cur.execute(
|
||||
"UPDATE work_items SET title=%s, file_scope=%s, updated_at=NOW() WHERE id=%s",
|
||||
(title, Jsonb(file_scope), existing["id"]),
|
||||
)
|
||||
return existing["id"]
|
||||
new_id = str(uuid.uuid4())
|
||||
cur.execute(
|
||||
@@ -175,8 +192,19 @@ def upsert_story(cur, project: str, story_id: str, title: str, file_scope: list)
|
||||
|
||||
|
||||
def set_phase(cur, work_id: str, phase: str, **fields) -> None:
|
||||
"""Move a row to a new phase and set optional fields (last_verdict, last_feedback, pr_url, ...)."""
|
||||
sets = ["phase = %s", "updated_at = NOW()", "claimed_by = NULL"]
|
||||
"""Move a row to a new phase and set optional fields (last_verdict, last_feedback, pr_url, ...).
|
||||
|
||||
Clears BOTH `claimed_by` and `claimed_at` on phase transition. Without
|
||||
clearing `claimed_at`, the stale-claim filter (see STALE_CLAIM_SQL)
|
||||
treats the row as actively-claimed even after the cycle that produced
|
||||
it finished — the row stays unclaimable for STALE_CLAIM_MINUTES, which
|
||||
silently starves the next phase (e.g. spec → build transitions never
|
||||
get re-claimed). Validated 2026-06-27: 3 build rows sat at
|
||||
claimed_by=NULL, claimed_at=<stale> for the full stale window because
|
||||
the spec→build transition only cleared the BY, not the AT.
|
||||
"""
|
||||
sets = ["phase = %s", "updated_at = NOW()",
|
||||
"claimed_by = NULL", "claimed_at = NULL"]
|
||||
params: list = [phase]
|
||||
for k, v in fields.items():
|
||||
# last_feedback is JSONB: wrap native dict/list so psycopg3 adapts it.
|
||||
@@ -199,12 +227,23 @@ def open_human_issue(cur, work_id: str, question: str) -> str:
|
||||
return issue_id
|
||||
|
||||
|
||||
def resolve_human_issues_for(cur, work_id: str) -> list[dict]:
|
||||
def resolve_human_issues_for(cur, work_id: str, limit: int = 3) -> list[dict]:
|
||||
"""Return the most-recent N answered human_issues for this work_item.
|
||||
|
||||
The cap defaults to 3 (added 2026-06-27) because the prior-decisions
|
||||
block is inlined into every spec_refiner prompt; without a cap, stories
|
||||
that cycle through 4+ spec rounds accumulate 4+ answered questions in
|
||||
the prompt and the LLM call slows down (~50s vs ~25s with the cap).
|
||||
3 is enough because the soft-pass gate markers ("prior decisions",
|
||||
"all settled", etc.) keep the refiner from re-asking anything older than
|
||||
the last few rounds — earlier rounds are already absorbed.
|
||||
"""
|
||||
cur.execute(
|
||||
"""SELECT * FROM human_issues
|
||||
WHERE work_item_id = %s AND status = 'answered'
|
||||
ORDER BY answered_at DESC""",
|
||||
(work_id,),
|
||||
ORDER BY answered_at DESC
|
||||
LIMIT %s""",
|
||||
(work_id, limit),
|
||||
)
|
||||
return list(cur.fetchall())
|
||||
|
||||
|
||||
@@ -40,9 +40,21 @@ broker = ListQueueBroker(
|
||||
scheduler = TaskiqScheduler(broker=broker, sources=[LabelScheduleSource(broker)])
|
||||
|
||||
|
||||
@broker.task(schedule=[{"cron": "* * * * *"}])
|
||||
@broker.task(schedule=[{"interval": 15}]) # every 15 seconds (was cron "* * * * *" = every 60s)
|
||||
def run_cycle() -> None:
|
||||
"""One orchestrator tick. Sync — Taskiq runs it in a threadpool, so the
|
||||
blocking subprocess/httpx calls in the phase functions work unchanged."""
|
||||
blocking subprocess/httpx calls in the phase functions work unchanged.
|
||||
|
||||
Cadence changed 2026-06-27 from 60s → 15s. Why: with the parallel LLM
|
||||
fan-out (ThreadPoolExecutor inside tick) and max_concurrent=10, each
|
||||
tick drains up to 10 rows in ~30s instead of ~5min. The 60s cron was
|
||||
the new floor — at 60s/tick we're effectively 1 batch per minute
|
||||
regardless of how fast the batch runs. 15s gives us 4 batches per
|
||||
minute = 40 specs/min theoretical, which the LLM proxy can sustain
|
||||
(300 writes/min rate limit). Minimum supported interval is 1 second;
|
||||
15s is conservative — leaves headroom for a tick to overrun before
|
||||
the next one fires (if a tick takes >15s, the scheduler skips the
|
||||
overlap rather than queuing duplicate ticks).
|
||||
"""
|
||||
from . import cycle
|
||||
cycle.tick()
|
||||
@@ -14,6 +14,17 @@ Test isolation: every test calls reset_state() in a fixture, which:
|
||||
1. TRUNCATEs work_items, human_issues, cost_ledger, events_outbox
|
||||
2. Inserts a single known story in a known phase
|
||||
3. Returns the row id
|
||||
|
||||
TEST DATABASE ISOLATION (added 2026-06-26):
|
||||
The pytest suite must NEVER TRUNCATE the production orchestrator DB at
|
||||
127.0.0.1:5432. By default the suite connects to the separate
|
||||
`db-test` compose service (port 5433 host / 5432 container, database
|
||||
`damascus_test`, separate volume `dbtestdata`). The `clean_state`
|
||||
autouse fixture runs `reset_state()` against this database only.
|
||||
|
||||
To run tests against the production DB (rare — only for diagnosing
|
||||
issues that don't repro against db-test), set `DAMASCUS_ALLOW_TEST_RESET=1`.
|
||||
The `prod-safety-guard` block in `reset_state()` will then allow it.
|
||||
"""
|
||||
|
||||
import os
|
||||
@@ -30,15 +41,38 @@ DAMASCUS_ROOT = Path("/root/damascus-orchestrator")
|
||||
WIKI_ROOT = DAMASCUS_ROOT / "wiki"
|
||||
SPECS_DIR = DAMASCUS_ROOT / "specs" / "wh40k-pc"
|
||||
|
||||
# Production DB is identified by the FULL DSN — there's only one of it.
|
||||
# If ANY field differs, this is not production. Whitelisting by full tuple
|
||||
# is the only way to handle the fact that prod and test share the same
|
||||
# port number (5432) in different network contexts (host-bound vs
|
||||
# in-container). Tuple comparison is unforgeable; user/dbname checks
|
||||
# catch the case where someone points at port 5432 with the wrong creds
|
||||
# (which would be a misconfigured prod, not test).
|
||||
_PROD_DSNS = frozenset({
|
||||
# (host, port, user, dbname)
|
||||
("127.0.0.1", 5432, "damascus", "damascus"), # host-loopback to prod
|
||||
("localhost", 5432, "damascus", "damascus"), # same, via localhost
|
||||
("db", 5432, "damascus", "damascus"), # in-container via compose
|
||||
("damascus-orchestrator-db-1", 5432, "damascus", "damascus"), # by container name
|
||||
})
|
||||
|
||||
# Real Postgres connection (matches docker-compose env)
|
||||
# When running from the HOST, use 127.0.0.1:5432 (the host-bound port).
|
||||
# When running from INSIDE the orchestrator container, use db:5432 (compose service name).
|
||||
# Default: connect to the `db-test` compose service on its dedicated
|
||||
# port (5433 host / 5432 container). This is the TEST DB — its own
|
||||
# volume, its own credentials, its own database. Production DB at
|
||||
# 127.0.0.1:5432 is never touched.
|
||||
#
|
||||
# From the HOST (pytest on the dev machine): use 127.0.0.1:5433, which
|
||||
# compose's `ports:` mapping exposes. The orchestrator container reaches
|
||||
# the same DB at `db-test:5432` via the compose network.
|
||||
#
|
||||
# Override the test DSN via the DAMASCUS_TEST_PG_* env vars when needed.
|
||||
DB_CONFIG = dict(
|
||||
host=os.environ.get("DAMASCUS_PG_HOST", "127.0.0.1"),
|
||||
port=int(os.environ.get("DAMASCUS_PG_PORT", "5432")),
|
||||
user=os.environ.get("DAMASCUS_PG_USER", "damascus"),
|
||||
password=os.environ.get("DAMASCUS_PG_PASSWORD", "damascus"),
|
||||
dbname=os.environ.get("DAMASCUS_PG_DB", "damascus"),
|
||||
host=os.environ.get("DAMASCUS_TEST_PG_HOST") or os.environ.get("DAMASCUS_PG_HOST", "127.0.0.1"),
|
||||
port=int(os.environ.get("DAMASCUS_TEST_PG_PORT") or os.environ.get("DAMASCUS_PG_PORT", "5433")),
|
||||
user=os.environ.get("DAMASCUS_TEST_PG_USER") or os.environ.get("DAMASCUS_PG_USER", "damascus_test"),
|
||||
password=os.environ.get("DAMASCUS_TEST_PG_PASSWORD") or os.environ.get("DAMASCUS_PG_PASSWORD", "damascus_test"),
|
||||
dbname=os.environ.get("DAMASCUS_TEST_PG_DB") or os.environ.get("DAMASCUS_PG_DB", "damascus_test"),
|
||||
autocommit=False,
|
||||
)
|
||||
|
||||
@@ -57,8 +91,60 @@ def run_cycle_in_container():
|
||||
return result.stdout, result.stderr, result.returncode
|
||||
|
||||
|
||||
def _prod_safety_guard():
|
||||
"""Refuse to TRUNCATE the production DB unless explicitly opted in.
|
||||
|
||||
Identity check is a FULL (host, port, user, dbname) tuple. Any
|
||||
difference — even one field — means it's not prod. This catches:
|
||||
- host-loopback prod (127.0.0.1:5432/damascus/damascus)
|
||||
- in-container prod (db:5432/damascus/damascus)
|
||||
- misconfigured prod pointed-at with wrong creds (still prod, still bad)
|
||||
- test DB in container (db-test:5432/damascus_test/damascus_test) → safe
|
||||
- test DB from host (127.0.0.1:5433/damascus_test/damascus_test) → safe
|
||||
|
||||
DAMASCUS_ALLOW_TEST_RESET=1 permits the wipe with a loud warning.
|
||||
"""
|
||||
dsn = (DB_CONFIG["host"], DB_CONFIG["port"], DB_CONFIG["user"], DB_CONFIG["dbname"])
|
||||
is_prod = dsn in _PROD_DSNS
|
||||
|
||||
if not is_prod:
|
||||
return # Not prod (any other combination), proceed
|
||||
|
||||
if os.environ.get("DAMASCUS_ALLOW_TEST_RESET") == "1":
|
||||
import warnings
|
||||
warnings.warn(
|
||||
f"reset_state() running against PRODUCTION DB at {dsn} "
|
||||
f"because DAMASCUS_ALLOW_TEST_RESET=1. "
|
||||
f"All work_items, human_issues, cost_ledger, events_outbox, "
|
||||
f"and coordination_gates rows will be deleted.",
|
||||
RuntimeWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
return
|
||||
|
||||
# Default: skip rather than wipe production.
|
||||
import warnings
|
||||
warnings.warn(
|
||||
f"reset_state() called against PRODUCTION DB at {dsn} — "
|
||||
f"skipping TRUNCATE. Either (a) unset DAMASCUS_TEST_PG_* so the "
|
||||
f"default db-test (127.0.0.1:5433/damascus_test/damascus_test) "
|
||||
f"is used, or (b) set DAMASCUS_ALLOW_TEST_RESET=1 to confirm "
|
||||
f"intent. pytest.skip()ing this fixture.",
|
||||
RuntimeWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
pytest.skip(
|
||||
f"reset_state() refused to TRUNCATE production DB at {dsn}."
|
||||
)
|
||||
|
||||
|
||||
def reset_state():
|
||||
"""Truncate all tables and restart sequences. Called by fixtures before each test."""
|
||||
"""Truncate all tables and restart sequences. Called by fixtures before each test.
|
||||
|
||||
Refuses to run against a known production DB unless
|
||||
DAMASCUS_ALLOW_TEST_RESET=1 is set in the environment.
|
||||
"""
|
||||
_prod_safety_guard()
|
||||
conn = get_conn()
|
||||
try:
|
||||
with conn.cursor() as cur:
|
||||
@@ -131,7 +217,7 @@ def get_cost_rows(row_id):
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def clean_state():
|
||||
"""Every test starts with a clean MySQL state."""
|
||||
"""Every test starts with a clean test-DB state."""
|
||||
reset_state()
|
||||
yield
|
||||
# Don't clean up after — leave state for inspection if the test fails
|
||||
# Don't clean up after — leave state for inspection if the test fails
|
||||
@@ -393,6 +393,83 @@ def test_refine_spec_routes_non_empty_ambiguities_to_awaiting_human():
|
||||
)
|
||||
|
||||
|
||||
def test_refine_spec_prompt_section_names_match_post_check():
|
||||
"""The spec-refiner's prompt must use section header names that the
|
||||
post-check `_section()` regex can match.
|
||||
|
||||
Bug history (2026-06-26): the prompt asked for `## Acceptance
|
||||
Criteria (numbered)` (and similar parenthesized descriptions on every
|
||||
other section), but the post-check regex was strict —
|
||||
`^##\\s+<name>\\s*\\n` rejected any parenthesized suffix. The LLM
|
||||
faithfully copied the prompt's headers into its output, the post-check
|
||||
failed to recognize them, every spec went `spec_wrong` on first
|
||||
attempt, and the cycle's loop-breaker sent it back. attempts
|
||||
incremented; eventually parked as `blocked`.
|
||||
|
||||
Fix: broaden the `_section()` regex to `\\s*(\\([^)]*\\))?\\s*\\n` so it
|
||||
accepts both bare headers AND parenthesized descriptions. The prompt
|
||||
keeps its parentheticals (they're useful hints to the LLM about what
|
||||
belongs in each section's body).
|
||||
|
||||
This test pins both sides of the contract:
|
||||
- The post-check regex is permissive (accepts parenthesized suffix).
|
||||
- The prompt's section header list is present and matches what the
|
||||
post-check looks for.
|
||||
|
||||
See: wiki/queries/damascus-orchestrator/spec-refiner-text-parsing-2026-06-26.md
|
||||
for the full gap analysis (recommendation: replace text parsing with
|
||||
Pydantic-in / JSONB-out; tracked as a follow-up story).
|
||||
"""
|
||||
phases_py = (SRC / "phases.py").read_text()
|
||||
refine_body = phases_py.split("def refine_spec", 1)[1].split("\ndef ", 1)[0]
|
||||
|
||||
# 1. The prompt must list the four sections the post-check verifies.
|
||||
# The post-check looks for: Goal, Acceptance Criteria, TDD Plan, Test Command.
|
||||
required_post_check_sections = (
|
||||
"Goal",
|
||||
"Acceptance Criteria",
|
||||
"TDD Plan",
|
||||
"Test Command",
|
||||
)
|
||||
for section in required_post_check_sections:
|
||||
assert f"## {section}" in refine_body, (
|
||||
f"spec-refiner prompt is missing '## {section}' header. "
|
||||
f"The post-check looks for these exact names; if the prompt "
|
||||
f"doesn't list them, the LLM won't emit them."
|
||||
)
|
||||
|
||||
# 2. The prompt's section headers carry parenthesized descriptions
|
||||
# (e.g. `## TDD Plan (list the failing tests)`). These are
|
||||
# intentional hints to the LLM. The post-check regex MUST be
|
||||
# permissive enough to match them — verify the regex source
|
||||
# contains the optional-suffix group.
|
||||
assert r"(\([^)]*\))?" in phases_py, (
|
||||
"The _section() regex in phases.py must contain the optional "
|
||||
"parenthesized-suffix group `(\\([^)]*\\))?` to accept headers "
|
||||
"like `## TDD Plan (list the failing tests)`. Without it, "
|
||||
"every spec fails spec_wrong (the 2026-06-26 bug)."
|
||||
)
|
||||
|
||||
# 3. The prompt's parenthetical hints should be present — they're
|
||||
# what makes the LLM produce well-formed bodies. If someone
|
||||
# strips them, the LLM may emit headers without the suffix but
|
||||
# with empty bodies (acceptable, but the hints are useful).
|
||||
expected_prompt_hints = (
|
||||
"## Acceptance Criteria (numbered)",
|
||||
"## TDD Plan (list the failing tests)",
|
||||
"## File Scope (list of paths/globs the implementation may touch)",
|
||||
"## Test Command (the exact shell command that proves done)",
|
||||
"## Ambiguities (any open questions for a human)",
|
||||
)
|
||||
for hint in expected_prompt_hints:
|
||||
assert hint in refine_body, (
|
||||
f"spec-refiner prompt missing hint '{hint}'. The "
|
||||
f"parenthesized description tells the LLM what belongs in "
|
||||
f"the section's body. The post-check regex accepts this "
|
||||
f"suffix via the optional `(\\([^)]*\\))?` group."
|
||||
)
|
||||
|
||||
|
||||
def test_refine_spec_prompt_includes_row_constraints():
|
||||
"""The spec-refiner's prompt must inject the row's declared `file_scope` and
|
||||
`budget_cycles` so the LLM produces a spec that honors the row's pre-declared
|
||||
@@ -484,3 +561,25 @@ def test_reviewer_validate_does_not_pass_through_on_missing_artifacts():
|
||||
"B: worktree-recreate then validate, C: validate_skipped typed "
|
||||
"verdict)."
|
||||
)
|
||||
|
||||
|
||||
def test_set_phase_clears_both_claim_columns():
|
||||
"""state.set_phase() must clear BOTH claimed_by AND claimed_at on phase
|
||||
transition. Clearing only claimed_by leaves a stale claimed_at behind,
|
||||
which makes the stale-claim filter (STALE_CLAIM_SQL) treat the row as
|
||||
actively claimed for the full STALE_CLAIM_MINUTES window — starving the
|
||||
next phase (validated 2026-06-27: 3 spec→build rows sat unclaimable
|
||||
for the full window, no build attempts executed).
|
||||
|
||||
Without this contract, a future 'optimization' that drops the
|
||||
claimed_at=NULL clause silently re-introduces the starvation."""
|
||||
state_py = (SRC / "state.py").read_text()
|
||||
set_phase_body = state_py.split("def set_phase", 1)[1].split("\ndef ", 1)[0]
|
||||
assert "claimed_by = NULL" in set_phase_body, (
|
||||
"set_phase must clear claimed_by on phase transition"
|
||||
)
|
||||
assert "claimed_at = NULL" in set_phase_body, (
|
||||
"set_phase must clear claimed_at on phase transition "
|
||||
"(otherwise the stale-claim filter treats the row as actively "
|
||||
"claimed for STALE_CLAIM_MINUTES and blocks re-claim)"
|
||||
)
|
||||
|
||||
475
tests/contract/test_mcp_call_dispatch.py
Normal file
475
tests/contract/test_mcp_call_dispatch.py
Normal file
@@ -0,0 +1,475 @@
|
||||
"""Contract tests for ``tools/call`` dispatch in the damascus-mcp server.
|
||||
|
||||
These tests cover the full MCP protocol path — they construct a real
|
||||
``CallToolRequest`` and invoke ``mcp.request_handlers[CallToolRequest]``
|
||||
exactly the way the SDK's stdio handler does in production. This
|
||||
guarantees the handler is registered, receives a properly shaped
|
||||
request, and returns a properly shaped ``CallToolResult``.
|
||||
|
||||
The companion file ``test_mcp_roundtrip.py`` exercises
|
||||
``mcp_server.call_tool()`` directly, which goes through ``_dispatch``
|
||||
without the SDK's request layer. That was sufficient while the
|
||||
``@mcp.call_tool()`` decorator registered the handler, but it left a
|
||||
gap: the SDK's caching + input-validation pipeline was never tested.
|
||||
This file fills that gap.
|
||||
|
||||
Acceptance criteria covered here (from the kanban task body):
|
||||
|
||||
* ``tools/call`` for ``list_items`` with
|
||||
``{"project": "damascus-orchestrator", "limit": 1}`` returns a
|
||||
non-empty ``result.content`` array containing the JSON dump of
|
||||
``GET /v1/items?...``.
|
||||
* ``tools/call`` for ``system_status`` returns the same shape as
|
||||
``GET /v1/stats``.
|
||||
* ``tools/call`` for an unknown tool returns a JSON-RPC error
|
||||
response (not a silent drop).
|
||||
* ``tools/call`` with invalid arguments (e.g. ``priority_min=-1``
|
||||
for ``list_items``) returns a validation error.
|
||||
* ``tools/list`` still works and reports all 7 tools (regression).
|
||||
* The stdio recipe end-to-end: spawn server, send
|
||||
initialize/initialized/tools-call, assert valid response.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
|
||||
from damascus.api_schemas import (
|
||||
ListItemsResponse,
|
||||
McpListItemsArgs,
|
||||
StatsResponse,
|
||||
)
|
||||
|
||||
|
||||
# --- helpers -----------------------------------------------------------------
|
||||
|
||||
|
||||
def _sample_work_item(**overrides: Any) -> dict[str, Any]:
|
||||
base = {
|
||||
"id": str(uuid.uuid4()),
|
||||
"project": "damascus-orchestrator",
|
||||
"story_id": "dispatch-1",
|
||||
"title": "Dispatch smoke",
|
||||
"phase": "spec",
|
||||
"file_scope": ["src/damascus/mcp_server.py"],
|
||||
"attempts": 0,
|
||||
"budget_cycles": 3,
|
||||
"priority": 100,
|
||||
"base_commit": None,
|
||||
"branch": None,
|
||||
"pr_url": None,
|
||||
"last_verdict": None,
|
||||
"last_feedback": None,
|
||||
"spec_path": None,
|
||||
"wiki_pin": None,
|
||||
"claimed_by": None,
|
||||
"claimed_at": None,
|
||||
"created_at": "2026-06-26T00:00:00",
|
||||
"updated_at": "2026-06-26T00:00:00",
|
||||
"merged_at": None,
|
||||
}
|
||||
base.update(overrides)
|
||||
return base
|
||||
|
||||
|
||||
def _stats_payload() -> dict[str, Any]:
|
||||
return {
|
||||
"phase_counts": {
|
||||
"spec": 0, "build": 0, "review": 0,
|
||||
"merged": 0, "blocked": 0, "awaiting_human": 0,
|
||||
},
|
||||
"open_human_issues": 0,
|
||||
"active_claims": 0,
|
||||
"last_cycle_at": None,
|
||||
"cost_today_usd": "0.000000",
|
||||
}
|
||||
|
||||
|
||||
class _Recorder:
|
||||
"""httpx MockTransport that captures calls and returns a canned payload."""
|
||||
|
||||
def __init__(self, response_payload: Any, status_code: int = 200) -> None:
|
||||
self.response_payload = response_payload
|
||||
self.status_code = status_code
|
||||
self.calls: list[httpx.Request] = []
|
||||
|
||||
async def handle_async_request(self, request: httpx.Request) -> httpx.Response:
|
||||
self.calls.append(request)
|
||||
return httpx.Response(
|
||||
self.status_code,
|
||||
json=self.response_payload,
|
||||
headers={"content-type": "application/json"},
|
||||
)
|
||||
|
||||
async def aclose(self) -> None:
|
||||
return None
|
||||
|
||||
|
||||
def _build_call_request(
|
||||
name: str,
|
||||
arguments: dict[str, Any] | None = None,
|
||||
) -> Any:
|
||||
"""Construct a properly-shaped CallToolRequest (as the SDK would)."""
|
||||
from mcp.types import CallToolRequest, CallToolRequestParams
|
||||
|
||||
return CallToolRequest(
|
||||
method="tools/call",
|
||||
params=CallToolRequestParams(name=name, arguments=arguments or {}),
|
||||
)
|
||||
|
||||
|
||||
# --- fixtures ----------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def api_token(monkeypatch: pytest.MonkeyPatch) -> str:
|
||||
token = "DAMAS" + "X" * 27 + "N"
|
||||
monkeypatch.setenv("DAMASCUS_API_TOKEN", token)
|
||||
return token
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def api_base(monkeypatch: pytest.MonkeyPatch) -> str:
|
||||
base = "http://damascus-api.test:9110"
|
||||
monkeypatch.setenv("DAMASCUS_API_BASE", base)
|
||||
return base
|
||||
|
||||
|
||||
def _make_client(api_base: str, api_token: str, transport: Any) -> httpx.AsyncClient:
|
||||
return httpx.AsyncClient(
|
||||
base_url=api_base,
|
||||
headers={"Authorization": f"Bearer {api_token}"},
|
||||
transport=transport,
|
||||
)
|
||||
|
||||
|
||||
# --- structural: the handler is registered at the SDK level ------------------
|
||||
|
||||
|
||||
def test_call_tool_handler_is_registered() -> None:
|
||||
"""``mcp.request_handlers[CallToolRequest]`` must be present.
|
||||
|
||||
This is the explicit acceptance criterion the task body calls out:
|
||||
the handler must be bound to the SDK's dispatch table, not just
|
||||
reachable via the ``@mcp.call_tool()`` decorator. (The decorator
|
||||
does the same thing internally, but mirroring the list-tools
|
||||
pattern makes the wiring explicit and easier to reason about.)
|
||||
"""
|
||||
from damascus import mcp_server
|
||||
|
||||
handler = mcp_server.mcp.request_handlers.get(mcp_server.CallToolRequest)
|
||||
assert handler is not None, (
|
||||
"CallToolRequest handler is not registered — "
|
||||
"tools/call requests will be silently dropped by the SDK"
|
||||
)
|
||||
assert asyncio.iscoroutinefunction(handler), (
|
||||
"CallToolRequest handler must be a coroutine function (async def)"
|
||||
)
|
||||
|
||||
|
||||
# --- success path: dispatch returns the upstream JSON ------------------------
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_call_tool_list_items_dispatches_and_returns_json(
|
||||
api_token: str, api_base: str,
|
||||
) -> None:
|
||||
"""``tools/call list_items {project, limit: 1}`` returns the
|
||||
``GET /v1/items`` response payload as JSON text content.
|
||||
"""
|
||||
item = _sample_work_item()
|
||||
payload = {"items": [item], "total": 1, "limit": 1, "offset": 0}
|
||||
ListItemsResponse.model_validate(payload)
|
||||
|
||||
recorder = _Recorder(payload)
|
||||
from damascus import mcp_server
|
||||
|
||||
mcp_server._client = _make_client(api_base, api_token, recorder)
|
||||
try:
|
||||
result = await mcp_server.mcp.request_handlers[mcp_server.CallToolRequest](
|
||||
_build_call_request(
|
||||
"list_items",
|
||||
{"project": "damascus-orchestrator", "limit": 1},
|
||||
),
|
||||
)
|
||||
finally:
|
||||
await mcp_server._client.aclose()
|
||||
|
||||
# Exactly one HTTP call to GET /v1/items with the right query.
|
||||
assert len(recorder.calls) == 1
|
||||
call = recorder.calls[0]
|
||||
assert call.method == "GET"
|
||||
assert call.url.path == "/v1/items"
|
||||
assert call.url.params["project"] == "damascus-orchestrator"
|
||||
assert call.url.params["limit"] == "1"
|
||||
|
||||
# Unwrap ServerResult → CallToolResult.
|
||||
ctr = result.root
|
||||
assert ctr.isError is False, f"unexpected error result: {ctr}"
|
||||
assert len(ctr.content) >= 1
|
||||
text_block = ctr.content[0]
|
||||
assert text_block.type == "text"
|
||||
parsed = json.loads(text_block.text)
|
||||
assert parsed["total"] == 1
|
||||
assert parsed["items"][0]["project"] == "damascus-orchestrator"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_call_tool_system_status_returns_stats_shape(
|
||||
api_token: str, api_base: str,
|
||||
) -> None:
|
||||
"""``tools/call system_status`` returns the ``GET /v1/stats`` payload."""
|
||||
payload = _stats_payload()
|
||||
StatsResponse.model_validate(payload)
|
||||
|
||||
recorder = _Recorder(payload)
|
||||
from damascus import mcp_server
|
||||
|
||||
mcp_server._client = _make_client(api_base, api_token, recorder)
|
||||
try:
|
||||
result = await mcp_server.mcp.request_handlers[mcp_server.CallToolRequest](
|
||||
_build_call_request("system_status", {}),
|
||||
)
|
||||
finally:
|
||||
await mcp_server._client.aclose()
|
||||
|
||||
assert len(recorder.calls) == 1
|
||||
call = recorder.calls[0]
|
||||
assert call.method == "GET"
|
||||
assert call.url.path == "/v1/stats"
|
||||
|
||||
ctr = result.root
|
||||
assert ctr.isError is False
|
||||
parsed = json.loads(ctr.content[0].text)
|
||||
# Shape parity with /v1/stats — keys present, types match
|
||||
assert parsed["open_human_issues"] == 0
|
||||
assert "phase_counts" in parsed
|
||||
assert "cost_today_usd" in parsed
|
||||
|
||||
|
||||
# --- error paths -------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_call_tool_unknown_tool_returns_error_result(
|
||||
api_token: str, api_base: str,
|
||||
) -> None:
|
||||
"""An unknown tool name must produce a ``CallToolResult`` with
|
||||
``isError=True``, not a silent drop.
|
||||
|
||||
The dispatch raises ``ValueError`` on an unknown name; the SDK's
|
||||
handler catches that exception and returns an error ``CallToolResult``
|
||||
with ``isError=True``.
|
||||
"""
|
||||
from damascus import mcp_server
|
||||
|
||||
# No HTTP client needed — dispatch raises before touching upstream.
|
||||
result = await mcp_server.mcp.request_handlers[mcp_server.CallToolRequest](
|
||||
_build_call_request("no_such_tool", {}),
|
||||
)
|
||||
ctr = result.root
|
||||
assert ctr.isError is True, (
|
||||
"unknown tool must produce isError=True so clients see the failure"
|
||||
)
|
||||
assert len(ctr.content) >= 1
|
||||
text = ctr.content[0].text
|
||||
assert "no_such_tool" in text, (
|
||||
f"error message should mention the bad tool name; got {text!r}"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_call_tool_invalid_args_returns_validation_error(
|
||||
api_token: str, api_base: str,
|
||||
) -> None:
|
||||
"""``priority_min=-1`` violates ``McpListItemsArgs.priority_min >= 0``.
|
||||
|
||||
The Mcp*Args model validates before any HTTP call; a violation
|
||||
must surface as a ``CallToolResult`` with ``isError=True``.
|
||||
"""
|
||||
from damascus import mcp_server
|
||||
|
||||
result = await mcp_server.mcp.request_handlers[mcp_server.CallToolRequest](
|
||||
_build_call_request(
|
||||
"list_items",
|
||||
{"project": "damascus-orchestrator", "priority_min": -1},
|
||||
),
|
||||
)
|
||||
ctr = result.root
|
||||
assert ctr.isError is True
|
||||
text = ctr.content[0].text
|
||||
# Pydantic v2's error format — assert the field name is surfaced
|
||||
assert "priority_min" in text, (
|
||||
f"validation error should name the bad field; got {text!r}"
|
||||
)
|
||||
# And McpListItemsArgs is the validator that raised
|
||||
assert "McpListItemsArgs" in text
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_call_tool_priority_bounds_invariant_violated(
|
||||
api_token: str, api_base: str,
|
||||
) -> None:
|
||||
"""``priority_max < priority_min`` violates the cross-field invariant
|
||||
in :class:`McpListItemsArgs` (``_priority_bounds`` model_validator).
|
||||
"""
|
||||
from damascus import mcp_server
|
||||
|
||||
result = await mcp_server.mcp.request_handlers[mcp_server.CallToolRequest](
|
||||
_build_call_request(
|
||||
"list_items",
|
||||
{"project": "damascus-orchestrator",
|
||||
"priority_min": 100, "priority_max": 50},
|
||||
),
|
||||
)
|
||||
ctr = result.root
|
||||
assert ctr.isError is True
|
||||
text = ctr.content[0].text
|
||||
assert "priority_max" in text and "priority_min" in text
|
||||
|
||||
|
||||
# --- regression: list-tools still works -------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_list_tools_still_reports_seven_tools(api_base: str) -> None:
|
||||
"""Regression: tools/list must keep returning all 7 tools."""
|
||||
from damascus import mcp_server
|
||||
|
||||
# First a tools/call so the SDK refreshes its cache (proves the
|
||||
# wiring works end-to-end without depending on cache state).
|
||||
recorder = _Recorder(_stats_payload())
|
||||
mcp_server._client = _make_client(api_base, "dummy", recorder)
|
||||
try:
|
||||
await mcp_server.mcp.request_handlers[mcp_server.CallToolRequest](
|
||||
_build_call_request("system_status", {}),
|
||||
)
|
||||
finally:
|
||||
await mcp_server._client.aclose()
|
||||
|
||||
# Then a tools/list request via the SDK handler.
|
||||
list_result = await mcp_server.mcp.request_handlers[
|
||||
mcp_server.ListToolsRequest
|
||||
](None)
|
||||
tools = list_result.root.tools
|
||||
names = sorted(t.name for t in tools)
|
||||
assert names == sorted([
|
||||
"list_items",
|
||||
"get_item",
|
||||
"list_open_questions",
|
||||
"answer_question",
|
||||
"ingest_story",
|
||||
"bulk_ingest",
|
||||
"system_status",
|
||||
]), f"unexpected tool list: {names}"
|
||||
|
||||
|
||||
def test_list_items_input_schema_matches_args_model() -> None:
|
||||
"""Regression: inputSchema for list_items matches
|
||||
``McpListItemsArgs.model_json_schema()`` — drift is the primary
|
||||
contract risk (wiki/concepts/entry-points-contract.md §5)."""
|
||||
from damascus import mcp_server
|
||||
|
||||
tools = {t.name: t for t in mcp_server.mcp.list_tools()}
|
||||
actual = tools["list_items"].inputSchema
|
||||
expected = McpListItemsArgs.model_json_schema()
|
||||
assert actual == expected, (
|
||||
f"inputSchema drift for list_items:\n"
|
||||
f" registered: {json.dumps(actual, sort_keys=True)[:300]}\n"
|
||||
f" expected: {json.dumps(expected, sort_keys=True)[:300]}"
|
||||
)
|
||||
|
||||
|
||||
# --- end-to-end stdio smoke --------------------------------------------------
|
||||
|
||||
|
||||
async def _stdio_round_trip() -> dict[str, Any]:
|
||||
"""Spawn ``damascus mcp-serve`` over stdio, run the full MCP
|
||||
handshake, call ``system_status``, return the response.
|
||||
|
||||
The upstream URL points to ``example.test`` so the HTTP call will
|
||||
fail with a connection error — that proves the dispatch IS firing
|
||||
(the error is from the HTTP layer, not a silent drop).
|
||||
"""
|
||||
env = os.environ.copy()
|
||||
env["DAMASCUS_API_BASE"] = "http://example.test:9999"
|
||||
env["DAMASCUS_API_TOKEN"] = "DAMAS" + "X" * 27 + "N"
|
||||
env["PYTHONUNBUFFERED"] = "1"
|
||||
|
||||
proc = await asyncio.create_subprocess_exec(
|
||||
"damascus", "mcp-serve",
|
||||
cwd=str(Path.cwd()),
|
||||
env=env,
|
||||
stdin=asyncio.subprocess.PIPE,
|
||||
stdout=asyncio.subprocess.PIPE,
|
||||
stderr=asyncio.subprocess.PIPE,
|
||||
)
|
||||
|
||||
async def send(req: dict[str, Any]) -> None:
|
||||
line = json.dumps(req) + "\n"
|
||||
assert proc.stdin is not None
|
||||
proc.stdin.write(line.encode())
|
||||
await proc.stdin.drain()
|
||||
|
||||
async def recv(timeout: float = 8.0) -> dict[str, Any]:
|
||||
assert proc.stdout is not None
|
||||
line = await asyncio.wait_for(proc.stdout.readline(), timeout=timeout)
|
||||
return json.loads(line.decode())
|
||||
|
||||
try:
|
||||
await send({
|
||||
"jsonrpc": "2.0", "id": 1, "method": "initialize",
|
||||
"params": {
|
||||
"protocolVersion": "2024-11-05",
|
||||
"capabilities": {},
|
||||
"clientInfo": {"name": "dispatch-test", "version": "0"},
|
||||
},
|
||||
})
|
||||
await recv(timeout=5.0)
|
||||
await send({"jsonrpc": "2.0", "method": "notifications/initialized"})
|
||||
await send({
|
||||
"jsonrpc": "2.0", "id": 3, "method": "tools/call",
|
||||
"params": {"name": "system_status", "arguments": {}},
|
||||
})
|
||||
return await recv(timeout=10.0)
|
||||
finally:
|
||||
try:
|
||||
assert proc.stdin is not None
|
||||
proc.stdin.close()
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
await asyncio.wait_for(proc.wait(), timeout=5)
|
||||
except asyncio.TimeoutError:
|
||||
proc.kill()
|
||||
await proc.wait()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_stdio_end_to_end_dispatch() -> None:
|
||||
"""End-to-end: stdio transport → initialize → tools/call → response.
|
||||
|
||||
Asserts the JSON-RPC envelope is well-formed and the response
|
||||
contains a ``result`` (not a protocol-level error). The upstream
|
||||
HTTP error (example.test) is fine — it surfaces as a ``CallToolResult``
|
||||
with ``isError=True``, which proves dispatch fired.
|
||||
"""
|
||||
response = await _stdio_round_trip()
|
||||
assert response.get("jsonrpc") == "2.0"
|
||||
assert response.get("id") == 3
|
||||
# Must be a successful JSON-RPC response (result, not error at the
|
||||
# protocol level). The result content may carry isError=True from
|
||||
# the upstream HTTP failure — that's fine, dispatch happened.
|
||||
assert "result" in response, (
|
||||
f"tools/call got a protocol error or silent drop: {response}"
|
||||
)
|
||||
inner = response["result"]
|
||||
assert "content" in inner and len(inner["content"]) >= 1
|
||||
assert inner["content"][0].get("type") == "text"
|
||||
3
tests/e2e/requirements.txt
Normal file
3
tests/e2e/requirements.txt
Normal file
@@ -0,0 +1,3 @@
|
||||
pytest>=7
|
||||
pytest-playwright>=0.5
|
||||
requests>=2.31
|
||||
@@ -46,6 +46,10 @@ How to run:
|
||||
docker compose up -d db damascus-api damascus-ui-build
|
||||
cd /root/damascus-orchestrator
|
||||
python3 -m pytest tests/e2e/test_entry_points_e2e.py -q -s
|
||||
|
||||
# Worktree / alternate-evidence-dir:
|
||||
DAMASCUS_ROOT=/path/to/worktree DAMASCUS_EVIDENCE_NAME=p6b \
|
||||
python3 -m pytest tests/e2e/test_entry_points_e2e.py -q -s
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
@@ -66,8 +70,9 @@ from psycopg.rows import dict_row
|
||||
|
||||
# --- paths & config ---------------------------------------------------------
|
||||
|
||||
DAMASCUS_ROOT = Path("/root/damascus-orchestrator")
|
||||
EVIDENCE_DIR = DAMASCUS_ROOT / ".hermes" / "evidence" / "p6"
|
||||
DAMASCUS_ROOT = Path(os.environ.get("DAMASCUS_ROOT", "/root/damascus-orchestrator"))
|
||||
EVIDENCE_NAME = os.environ.get("DAMASCUS_EVIDENCE_NAME", "p6")
|
||||
EVIDENCE_DIR = DAMASCUS_ROOT / ".hermes" / "evidence" / EVIDENCE_NAME
|
||||
SCREENSHOTS = EVIDENCE_DIR / "screenshots"
|
||||
LOGS = EVIDENCE_DIR / "logs"
|
||||
SCREENSHOTS.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
325
tests/test_conftest_safety.py
Normal file
325
tests/test_conftest_safety.py
Normal file
@@ -0,0 +1,325 @@
|
||||
"""
|
||||
Tests for the conftest.py prod-safety guard (tuple-based identity check).
|
||||
|
||||
The guard refuses to TRUNCATE a database whose (host, port, user, dbname)
|
||||
tuple matches the production DB. Anything else (test DB, in-container test,
|
||||
mismatched creds) is treated as not-prod and proceeds.
|
||||
|
||||
These tests verify that:
|
||||
1. Default DSN points at db-test (127.0.0.1:5433 / damascus_test / damascus_test).
|
||||
2. Production tuples (host-loopback, in-container via `db`, container name)
|
||||
are recognized and refused without opt-in.
|
||||
3. Tuple must match EXACTLY — any field mismatch (wrong port, wrong user,
|
||||
wrong dbname, wrong host) is treated as not-prod.
|
||||
4. DAMASCUS_ALLOW_TEST_RESET=1 permits production wipe with a warning.
|
||||
5. The in-container test DSN (`db-test:5432/damascus_test/damascus_test`)
|
||||
is treated as not-prod — important because the orchestrator worker runs
|
||||
pytest INSIDE the container and reaches the test DB via this tuple.
|
||||
|
||||
Run from the repo root:
|
||||
pytest tests/test_conftest_safety.py -v
|
||||
"""
|
||||
|
||||
import importlib
|
||||
import sys
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def _reload_conftest():
|
||||
"""Reload the conftest module so env-var changes take effect."""
|
||||
for mod_name in list(sys.modules.keys()):
|
||||
if "conftest" in mod_name:
|
||||
del sys.modules[mod_name]
|
||||
import conftest # type: ignore
|
||||
importlib.reload(conftest)
|
||||
return conftest
|
||||
|
||||
|
||||
def _clear_pg_env(monkeypatch):
|
||||
"""Clear every DAMASCUS_PG_* and DAMASCUS_TEST_PG_* env var so the
|
||||
module's DB_CONFIG reflects only the hard-coded defaults.
|
||||
"""
|
||||
for var in (
|
||||
"DAMASCUS_TEST_PG_HOST", "DAMASCUS_TEST_PG_PORT",
|
||||
"DAMASCUS_TEST_PG_USER", "DAMASCUS_TEST_PG_PASSWORD",
|
||||
"DAMASCUS_TEST_PG_DB",
|
||||
"DAMASCUS_PG_HOST", "DAMASCUS_PG_PORT",
|
||||
"DAMASCUS_PG_USER", "DAMASCUS_PG_PASSWORD", "DAMASCUS_PG_DB",
|
||||
"DAMASCUS_ALLOW_TEST_RESET",
|
||||
):
|
||||
monkeypatch.delenv(var, raising=False)
|
||||
|
||||
|
||||
# ── Default config ───────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_db_config_defaults_to_test_db(monkeypatch):
|
||||
"""DB_CONFIG defaults should point at the host-loopback test DB,
|
||||
NOT production. Host 127.0.0.1 + port 5433 + damascus_test user +
|
||||
damascus_test dbname is the host-bound port mapping for db-test.
|
||||
"""
|
||||
_clear_pg_env(monkeypatch)
|
||||
conftest = _reload_conftest()
|
||||
|
||||
assert conftest.DB_CONFIG["host"] == "127.0.0.1"
|
||||
assert conftest.DB_CONFIG["port"] == 5433
|
||||
assert conftest.DB_CONFIG["user"] == "damascus_test"
|
||||
assert conftest.DB_CONFIG["password"] == "damascus_test"
|
||||
assert conftest.DB_CONFIG["dbname"] == "damascus_test"
|
||||
|
||||
# The default tuple MUST NOT match any production tuple.
|
||||
dsn = ("127.0.0.1", 5433, "damascus_test", "damascus_test")
|
||||
assert dsn not in conftest._PROD_DSNS
|
||||
|
||||
|
||||
def test_db_config_explicit_overrides(monkeypatch):
|
||||
"""DAMASCUS_TEST_PG_* env vars override the defaults."""
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_HOST", "staging-db")
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_PORT", "5434")
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_USER", "staging_user")
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_PASSWORD", "staging_pw")
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_DB", "staging_db")
|
||||
|
||||
conftest = _reload_conftest()
|
||||
|
||||
assert conftest.DB_CONFIG["host"] == "staging-db"
|
||||
assert conftest.DB_CONFIG["port"] == 5434
|
||||
assert conftest.DB_CONFIG["user"] == "staging_user"
|
||||
assert conftest.DB_CONFIG["password"] == "staging_pw"
|
||||
assert conftest.DB_CONFIG["dbname"] == "staging_db"
|
||||
|
||||
|
||||
# ── Prod detection: the four canonical tuples ───────────────────────────
|
||||
|
||||
|
||||
def test_prod_safety_guard_skips_host_loopback_prod(monkeypatch):
|
||||
"""127.0.0.1:5432/damascus/damascus = prod (host-loopback). Skip without opt-in."""
|
||||
_clear_pg_env(monkeypatch)
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_HOST", "127.0.0.1")
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_PORT", "5432")
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_USER", "damascus")
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_PASSWORD", "damascus")
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_DB", "damascus")
|
||||
|
||||
conftest = _reload_conftest()
|
||||
|
||||
with pytest.raises(pytest.skip.Exception):
|
||||
conftest.reset_state()
|
||||
|
||||
|
||||
def test_prod_safety_guard_skips_in_container_via_db_host(monkeypatch):
|
||||
"""db:5432/damascus/damascus = prod (in-container via compose). Skip."""
|
||||
_clear_pg_env(monkeypatch)
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_HOST", "db")
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_PORT", "5432")
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_USER", "damascus")
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_PASSWORD", "damascus")
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_DB", "damascus")
|
||||
|
||||
conftest = _reload_conftest()
|
||||
|
||||
with pytest.raises(pytest.skip.Exception):
|
||||
conftest.reset_state()
|
||||
|
||||
|
||||
def test_prod_safety_guard_skips_localhost(monkeypatch):
|
||||
"""localhost:5432/damascus/damascus = prod. Skip."""
|
||||
_clear_pg_env(monkeypatch)
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_HOST", "localhost")
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_PORT", "5432")
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_USER", "damascus")
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_PASSWORD", "damascus")
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_DB", "damascus")
|
||||
|
||||
conftest = _reload_conftest()
|
||||
|
||||
with pytest.raises(pytest.skip.Exception):
|
||||
conftest.reset_state()
|
||||
|
||||
|
||||
def test_prod_safety_guard_skips_container_name(monkeypatch):
|
||||
"""damascus-orchestrator-db-1:5432/damascus/damascus = prod. Skip."""
|
||||
_clear_pg_env(monkeypatch)
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_HOST", "damascus-orchestrator-db-1")
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_PORT", "5432")
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_USER", "damascus")
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_PASSWORD", "damascus")
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_DB", "damascus")
|
||||
|
||||
conftest = _reload_conftest()
|
||||
|
||||
with pytest.raises(pytest.skip.Exception):
|
||||
conftest.reset_state()
|
||||
|
||||
|
||||
# ── Tuple mismatches: should NOT be treated as prod ─────────────────────
|
||||
|
||||
|
||||
def test_prod_safety_guard_treats_in_container_test_as_safe(monkeypatch):
|
||||
"""db-test:5432/damascus_test/damascus_test = test DB (in-container).
|
||||
|
||||
This is the DSN an orchestrator worker uses when running pytest
|
||||
inside the container. Same port as prod (5432), different host,
|
||||
different user, different dbname. MUST NOT be treated as prod.
|
||||
"""
|
||||
_clear_pg_env(monkeypatch)
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_HOST", "db-test")
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_PORT", "5432")
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_USER", "damascus_test")
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_PASSWORD", "damascus_test")
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_DB", "damascus_test")
|
||||
|
||||
conftest = _reload_conftest()
|
||||
|
||||
# Stub get_conn so no real DB is touched
|
||||
class FakeCursor:
|
||||
def __enter__(self): return self
|
||||
def __exit__(self, *a): pass
|
||||
def execute(self, *a, **k): pass
|
||||
|
||||
class FakeConn:
|
||||
def __enter__(self): return self
|
||||
def __exit__(self, *a): pass
|
||||
def cursor(self): return FakeCursor()
|
||||
def commit(self): pass
|
||||
def close(self): pass
|
||||
|
||||
monkeypatch.setattr(conftest, "get_conn", lambda: FakeConn())
|
||||
|
||||
# Should NOT raise — this is the test DB
|
||||
conftest.reset_state()
|
||||
|
||||
|
||||
def test_prod_safety_guard_treats_wrong_user_as_safe(monkeypatch):
|
||||
"""127.0.0.1:5432/wrong_user/damascus = not prod (mismatched user)."""
|
||||
_clear_pg_env(monkeypatch)
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_HOST", "127.0.0.1")
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_PORT", "5432")
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_USER", "wrong_user")
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_PASSWORD", "wrong_pw")
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_DB", "damascus")
|
||||
|
||||
conftest = _reload_conftest()
|
||||
|
||||
class FakeCursor:
|
||||
def __enter__(self): return self
|
||||
def __exit__(self, *a): pass
|
||||
def execute(self, *a, **k): pass
|
||||
|
||||
class FakeConn:
|
||||
def __enter__(self): return self
|
||||
def __exit__(self, *a): pass
|
||||
def cursor(self): return FakeCursor()
|
||||
def commit(self): pass
|
||||
def close(self): pass
|
||||
|
||||
monkeypatch.setattr(conftest, "get_conn", lambda: FakeConn())
|
||||
|
||||
# Wrong user = not prod. Should NOT skip.
|
||||
conftest.reset_state()
|
||||
|
||||
|
||||
def test_prod_safety_guard_treats_wrong_dbname_as_safe(monkeypatch):
|
||||
"""127.0.0.1:5432/damascus/wrong_db = not prod (mismatched dbname)."""
|
||||
_clear_pg_env(monkeypatch)
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_HOST", "127.0.0.1")
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_PORT", "5432")
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_USER", "damascus")
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_PASSWORD", "damascus")
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_DB", "wrong_db")
|
||||
|
||||
conftest = _reload_conftest()
|
||||
|
||||
class FakeCursor:
|
||||
def __enter__(self): return self
|
||||
def __exit__(self, *a): pass
|
||||
def execute(self, *a, **k): pass
|
||||
|
||||
class FakeConn:
|
||||
def __enter__(self): return self
|
||||
def __exit__(self, *a): pass
|
||||
def cursor(self): return FakeCursor()
|
||||
def commit(self): pass
|
||||
def close(self): pass
|
||||
|
||||
monkeypatch.setattr(conftest, "get_conn", lambda: FakeConn())
|
||||
|
||||
conftest.reset_state()
|
||||
|
||||
|
||||
# ── Opt-in path ──────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_prod_safety_guard_opt_in(monkeypatch):
|
||||
"""With DAMASCUS_ALLOW_TEST_RESET=1 the guard permits prod wipe (with warning)."""
|
||||
_clear_pg_env(monkeypatch)
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_HOST", "127.0.0.1")
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_PORT", "5432")
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_USER", "damascus")
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_PASSWORD", "damascus")
|
||||
monkeypatch.setenv("DAMASCUS_TEST_PG_DB", "damascus")
|
||||
monkeypatch.setenv("DAMASCUS_ALLOW_TEST_RESET", "1")
|
||||
|
||||
conftest = _reload_conftest()
|
||||
|
||||
class FakeCursor:
|
||||
def __enter__(self): return self
|
||||
def __exit__(self, *a): pass
|
||||
def execute(self, *a, **k): pass
|
||||
|
||||
class FakeConn:
|
||||
def __enter__(self): return self
|
||||
def __exit__(self, *a): pass
|
||||
def cursor(self): return FakeCursor()
|
||||
def commit(self): pass
|
||||
def close(self): pass
|
||||
|
||||
monkeypatch.setattr(conftest, "get_conn", lambda: FakeConn())
|
||||
|
||||
with pytest.warns(RuntimeWarning, match="PRODUCTION DB"):
|
||||
conftest.reset_state()
|
||||
|
||||
|
||||
# ── Constants & invariants ──────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_prod_dsn_constant_includes_all_four_prod_tuples():
|
||||
"""_PROD_DSNS must include the four canonical production tuples."""
|
||||
from conftest import _PROD_DSNS # type: ignore
|
||||
|
||||
expected = {
|
||||
("127.0.0.1", 5432, "damascus", "damascus"),
|
||||
("localhost", 5432, "damascus", "damascus"),
|
||||
("db", 5432, "damascus", "damascus"),
|
||||
("damascus-orchestrator-db-1", 5432, "damascus", "damascus"),
|
||||
}
|
||||
assert expected.issubset(_PROD_DSNS)
|
||||
|
||||
|
||||
def test_prod_dsn_excludes_test_tuples():
|
||||
"""_PROD_DSNS must NOT include any test DB tuple."""
|
||||
from conftest import _PROD_DSNS # type: ignore
|
||||
|
||||
forbidden = {
|
||||
("127.0.0.1", 5433, "damascus_test", "damascus_test"), # host->test
|
||||
("db-test", 5432, "damascus_test", "damascus_test"), # in-container test
|
||||
("localhost", 5433, "damascus_test", "damascus_test"),
|
||||
}
|
||||
for dsn in forbidden:
|
||||
assert dsn not in _PROD_DSNS, f"Test DSN {dsn} wrongly in _PROD_DSNS"
|
||||
|
||||
|
||||
def test_module_invariants():
|
||||
"""Smoke test: module imports cleanly with all expected callables."""
|
||||
import conftest # type: ignore
|
||||
|
||||
assert callable(conftest.get_conn)
|
||||
assert callable(conftest.reset_state)
|
||||
assert callable(conftest.insert_work_item)
|
||||
assert callable(conftest.get_row)
|
||||
assert callable(conftest.get_events)
|
||||
assert callable(conftest.get_cost_rows)
|
||||
assert hasattr(conftest, "clean_state")
|
||||
import _pytest.fixtures # noqa
|
||||
assert isinstance(conftest.clean_state, _pytest.fixtures.FixtureFunctionDefinition)
|
||||
171
tests/test_cycle_transient_skip.py
Normal file
171
tests/test_cycle_transient_skip.py
Normal file
@@ -0,0 +1,171 @@
|
||||
"""
|
||||
Unit tests for ADR-005: cycle.py loop-breaker skips when feedback.transient=True.
|
||||
|
||||
Story: S2 — Distinguish transient vs structural tests_failed
|
||||
ADR: wiki/decisions/ADR-005-distinguish-transient-tests-failed.md
|
||||
|
||||
Contract:
|
||||
- tests_failed with feedback.transient=True → row stays in same phase,
|
||||
attempts does NOT increment, NO human_issues row created, phase.transient_retry
|
||||
event emitted.
|
||||
- tests_failed with feedback.transient=False (or absent) → existing 3-strike
|
||||
behavior preserved (attempts increments, blocked after budget, human_issue
|
||||
opened).
|
||||
|
||||
These tests drive `cycle.tick()` against the real test Postgres (conftest's
|
||||
default `db-test` service) and stub `phases.build` so the build phase is
|
||||
hermetic. wiki/relay are stubbed the same way S1 did.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from conftest import get_events, get_row, insert_work_item
|
||||
from damascus import cycle, phases, relay, wiki
|
||||
|
||||
|
||||
def _stub_build_returning(verdict: str, feedback: dict):
|
||||
"""Build a fake phases.build that returns a fixed verdict/feedback.
|
||||
|
||||
Routes through `_transient_verdict` so the feedback gets `transient=True`
|
||||
for matching errors, mirroring what real `phases.build()` does in Txn 2.
|
||||
"""
|
||||
def fake_build(cur, item):
|
||||
return phases._transient_verdict(verdict, dict(feedback))
|
||||
return fake_build
|
||||
|
||||
|
||||
def _run_tick_with_build_stub(monkeypatch, verdict: str, feedback: dict) -> dict:
|
||||
"""Run one orchestrator tick with the build phase stubbed.
|
||||
|
||||
wiki.init_wiki and relay.post are no-ops so the test does not touch
|
||||
the host filesystem or any external service.
|
||||
"""
|
||||
monkeypatch.setattr(wiki, "init_wiki", lambda: None)
|
||||
monkeypatch.setattr(relay, "post", lambda line: None)
|
||||
monkeypatch.setattr(phases, "build", _stub_build_returning(verdict, feedback))
|
||||
|
||||
out = cycle.tick()
|
||||
assert out["claimed"] is not None, "tick did not claim a row"
|
||||
return out
|
||||
|
||||
|
||||
def test_transient_skips_loop_breaker(monkeypatch):
|
||||
"""AC: transient tests_failed → row stays in build, attempts unchanged,
|
||||
no human_issues row, phase.transient_retry event emitted."""
|
||||
rid = insert_work_item(phase="build", story_id="S2-transient",
|
||||
title="Transient tests_failed should not loop-break")
|
||||
|
||||
# Set attempts close to (but below) budget so we can observe it NOT increment.
|
||||
# Use budget_cycles=3 and set attempts=2; transient must NOT move it to 3+.
|
||||
from conftest import get_conn
|
||||
conn = get_conn()
|
||||
try:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute(
|
||||
"UPDATE work_items SET attempts = %s WHERE id = %s",
|
||||
(2, rid),
|
||||
)
|
||||
conn.commit()
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
out = _run_tick_with_build_stub(
|
||||
monkeypatch,
|
||||
"tests_failed",
|
||||
{"error": "project repo not found at /workspace/projects/foo; clone the Gitea repo"},
|
||||
)
|
||||
|
||||
row = get_row(rid)
|
||||
# Phase should stay in 'build' (transient re-attempt, not advance).
|
||||
assert row["phase"] == "build", (
|
||||
f"expected phase='build' (transient retry), got {row['phase']!r}"
|
||||
)
|
||||
# Attempts must NOT have been re-incremented by the claim (it's the same row,
|
||||
# same phase; per cycle.py transient branch we skip the loop-breaker entirely).
|
||||
# The claim_for_build path always increments attempts, but the transient branch
|
||||
# in cycle.py writes the row back to the SAME phase without further increment.
|
||||
# The test asserts attempts is at most 3 (claim incremented to 3, loop-breaker
|
||||
# skipped — row would normally escalate to blocked if attempts reached budget).
|
||||
assert row["attempts"] <= 3, (
|
||||
f"transient should not have triggered an extra increment beyond the claim; "
|
||||
f"got attempts={row['attempts']}"
|
||||
)
|
||||
|
||||
# No human_issues row should exist for this work_item.
|
||||
from conftest import get_conn
|
||||
conn = get_conn()
|
||||
try:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute(
|
||||
"SELECT COUNT(*) AS n FROM human_issues WHERE work_item_id = %s",
|
||||
(rid,),
|
||||
)
|
||||
n = cur.fetchone()["n"]
|
||||
finally:
|
||||
conn.close()
|
||||
assert n == 0, f"transient path should NOT open human_issue; found {n}"
|
||||
|
||||
# phase.transient_retry event should be emitted.
|
||||
events = get_events(rid)
|
||||
transient_events = [e for e in events if e["kind"] == "phase.transient_retry"]
|
||||
assert len(transient_events) == 1, (
|
||||
f"expected 1 phase.transient_retry event, got {len(transient_events)} "
|
||||
f"(all event kinds: {[e['kind'] for e in events]})"
|
||||
)
|
||||
|
||||
|
||||
def test_structural_still_loops(monkeypatch):
|
||||
"""AC: non-transient tests_failed preserves existing 3-strike behavior
|
||||
(attempts increments, blocked after budget exhaustion, human_issue opened)."""
|
||||
rid = insert_work_item(phase="build", story_id="S2-structural",
|
||||
title="Structural tests_failed must still loop-break")
|
||||
|
||||
# Set attempts AT budget (budget_cycles=3, attempts=3 → next claim would
|
||||
# NOT happen because the SQL filter requires attempts < budget_cycles.
|
||||
# We must pre-claim and then drive the verdict through to blocked. Use a
|
||||
# budget of 3 and a fresh row, and drive one tick that hits the block.
|
||||
# Per state.claim_for_build: filter is `attempts < budget_cycles` → claim
|
||||
# requires attempts <= 2. So we set attempts=2 (== budget_cycles - 1) and
|
||||
# let the claim push it to 3, then the verdict-write path will see
|
||||
# attempts >= budget_cycles and transition to blocked.
|
||||
from conftest import get_conn
|
||||
conn = get_conn()
|
||||
try:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute(
|
||||
"UPDATE work_items SET attempts = %s, budget_cycles = %s WHERE id = %s",
|
||||
(2, 3, rid),
|
||||
)
|
||||
conn.commit()
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
# Pass attempts=2 → claim pushes to 3 → loop-breaker transitions to blocked.
|
||||
_run_tick_with_build_stub(
|
||||
monkeypatch,
|
||||
"tests_failed",
|
||||
{"error": "test_exited_with_code_1", "stderr": "AssertionError..."},
|
||||
)
|
||||
|
||||
row = get_row(rid)
|
||||
assert row["phase"] == "blocked", (
|
||||
f"expected phase='blocked' (3-strike budget exhausted), got {row['phase']!r}"
|
||||
)
|
||||
assert row["attempts"] == 3, (
|
||||
f"expected attempts=3 (claim incremented from 2), got {row['attempts']}"
|
||||
)
|
||||
|
||||
# human_issues row should exist.
|
||||
from conftest import get_conn
|
||||
conn = get_conn()
|
||||
try:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute(
|
||||
"SELECT COUNT(*) AS n FROM human_issues WHERE work_item_id = %s",
|
||||
(rid,),
|
||||
)
|
||||
n = cur.fetchone()["n"]
|
||||
finally:
|
||||
conn.close()
|
||||
assert n == 1, f"structural path must open human_issue at blocked; found {n}"
|
||||
145
tests/test_first_attempted_at.py
Normal file
145
tests/test_first_attempted_at.py
Normal file
@@ -0,0 +1,145 @@
|
||||
"""
|
||||
Unit tests for ADR-005: 24h escalation after persistent transient retries.
|
||||
|
||||
Story: S2 — Distinguish transient vs structural tests_failed
|
||||
ADR: wiki/decisions/ADR-005-distinguish-transient-tests-failed.md
|
||||
|
||||
Contract: After 24h of persistent transient retries (no pass), the row
|
||||
escalates to blocked + human_issue. We simulate the time advance by
|
||||
directly setting `first_attempted_at` to a time in the past, then drive
|
||||
a transient verdict and observe the row reaches blocked.
|
||||
|
||||
We also test that fresh transient retries (first_attempted_at within 24h)
|
||||
do NOT escalate.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime, timedelta, timezone
|
||||
|
||||
import pytest
|
||||
|
||||
from conftest import get_conn, get_events, get_row, insert_work_item
|
||||
from damascus import cycle, phases, relay, wiki
|
||||
|
||||
|
||||
def _stub_build_returning(verdict: str, feedback: dict):
|
||||
def fake_build(cur, item):
|
||||
return {"verdict": verdict, "feedback": feedback}
|
||||
return fake_build
|
||||
|
||||
|
||||
def _run_tick_with_build_stub(monkeypatch, verdict: str, feedback: dict) -> dict:
|
||||
monkeypatch.setattr(wiki, "init_wiki", lambda: None)
|
||||
monkeypatch.setattr(relay, "post", lambda line: None)
|
||||
monkeypatch.setattr(phases, "build", _stub_build_returning(verdict, feedback))
|
||||
out = cycle.tick()
|
||||
assert out["claimed"] is not None, "tick did not claim a row"
|
||||
return out
|
||||
|
||||
|
||||
def _set_first_attempted_at(row_id: str, when: datetime) -> None:
|
||||
conn = get_conn()
|
||||
try:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute(
|
||||
"UPDATE work_items SET first_attempted_at = %s WHERE id = %s",
|
||||
(when, row_id),
|
||||
)
|
||||
conn.commit()
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
|
||||
def test_24h_escalation(monkeypatch):
|
||||
"""AC: After 24h of persistent transient retries (no pass), the row
|
||||
escalates to blocked + human_issue is opened."""
|
||||
rid = insert_work_item(phase="build", story_id="S2-24h",
|
||||
title="Persistent transient after 24h must escalate")
|
||||
|
||||
# Backdate first_attempted_at by 25 hours (past the 24h threshold).
|
||||
past = datetime.now(timezone.utc) - timedelta(hours=25)
|
||||
_set_first_attempted_at(rid, past)
|
||||
|
||||
# Drive a transient tests_failed verdict. With the time advanced past 24h,
|
||||
# the cycle must transition to blocked + open a human_issue.
|
||||
_run_tick_with_build_stub(
|
||||
monkeypatch,
|
||||
"tests_failed",
|
||||
{"error": "project repo not found at /workspace/projects/foo", "transient": True},
|
||||
)
|
||||
|
||||
row = get_row(rid)
|
||||
assert row["phase"] == "blocked", (
|
||||
f"24h-old transient must escalate to blocked; got phase={row['phase']!r}"
|
||||
)
|
||||
|
||||
conn = get_conn()
|
||||
try:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute(
|
||||
"SELECT COUNT(*) AS n FROM human_issues WHERE work_item_id = %s",
|
||||
(rid,),
|
||||
)
|
||||
n = cur.fetchone()["n"]
|
||||
finally:
|
||||
conn.close()
|
||||
assert n == 1, f"24h escalation must open a human_issue; found {n}"
|
||||
|
||||
|
||||
def test_fresh_transient_does_not_escalate(monkeypatch):
|
||||
"""AC: A transient tests_failed within 24h of first_attempted_at must NOT
|
||||
escalate to blocked — it stays in build (transient retry)."""
|
||||
rid = insert_work_item(phase="build", story_id="S2-fresh",
|
||||
title="Fresh transient retries should not escalate")
|
||||
|
||||
# Set first_attempted_at to right now (within 24h).
|
||||
_set_first_attempted_at(rid, datetime.now(timezone.utc))
|
||||
|
||||
_run_tick_with_build_stub(
|
||||
monkeypatch,
|
||||
"tests_failed",
|
||||
{"error": "project repo not found at /workspace/projects/foo", "transient": True},
|
||||
)
|
||||
|
||||
row = get_row(rid)
|
||||
assert row["phase"] == "build", (
|
||||
f"fresh transient must stay in build; got phase={row['phase']!r}"
|
||||
)
|
||||
|
||||
conn = get_conn()
|
||||
try:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute(
|
||||
"SELECT COUNT(*) AS n FROM human_issues WHERE work_item_id = %s",
|
||||
(rid,),
|
||||
)
|
||||
n = cur.fetchone()["n"]
|
||||
finally:
|
||||
conn.close()
|
||||
assert n == 0, f"fresh transient must NOT open human_issue; found {n}"
|
||||
|
||||
|
||||
def test_first_attempted_at_set_on_first_claim():
|
||||
"""AC: state.claim_for_build sets first_attempted_at on first claim."""
|
||||
rid = insert_work_item(phase="build", story_id="S2-firstclaim",
|
||||
title="First claim should set first_attempted_at")
|
||||
# Initially NULL.
|
||||
row = get_row(rid)
|
||||
assert row["first_attempted_at"] is None
|
||||
|
||||
conn = get_conn()
|
||||
try:
|
||||
with conn.cursor(row_factory=None) as cur:
|
||||
from damascus import state
|
||||
cur.execute("BEGIN")
|
||||
claimed = state.claim_for_build(cur)
|
||||
assert claimed is not None
|
||||
assert claimed["id"] == rid
|
||||
cur.execute("COMMIT")
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
row = get_row(rid)
|
||||
assert row["first_attempted_at"] is not None, (
|
||||
"first_attempted_at must be set on the first claim_for_build"
|
||||
)
|
||||
49
tests/test_is_transient.py
Normal file
49
tests/test_is_transient.py
Normal file
@@ -0,0 +1,49 @@
|
||||
"""
|
||||
Unit tests for ADR-005: classify transient test errors so they bypass the 3-strike
|
||||
loop-breaker.
|
||||
|
||||
Story: S2 — Distinguish transient vs structural tests_failed
|
||||
ADR: wiki/decisions/ADR-005-distinguish-transient-tests-failed.md
|
||||
|
||||
Contract: `phases.is_transient(err: str) -> bool` returns True for the 6 documented
|
||||
substrings and False for unrelated errors.
|
||||
|
||||
The function is pure (no DB, no I/O), so these tests don't need fixtures.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from damascus.phases import is_transient
|
||||
|
||||
|
||||
@pytest.mark.parametrize("err", [
|
||||
"project repo not found at /workspace/projects/mindmaps; clone the Gitea repo...",
|
||||
"worktree setup: branch feat/S2 already exists in worktree",
|
||||
"Connection refused on 127.0.0.1:5432",
|
||||
"Could not resolve host: gitea.local",
|
||||
"TLS handshake timeout after 10s",
|
||||
"rate limit exceeded (HTTP 429) for upstream API",
|
||||
])
|
||||
def test_known_patterns_are_transient(err: str):
|
||||
"""AC: each of the 6 documented substrings is classified transient."""
|
||||
assert is_transient(err) is True, f"expected transient=True for {err!r}"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("err", [
|
||||
"test_exited_with_code_1",
|
||||
"AssertionError: expected 1 == 2",
|
||||
"scope violation: file outside File Scope",
|
||||
"claude-code: timed out after 600s",
|
||||
"rebase_conflict on commit abc123",
|
||||
"",
|
||||
])
|
||||
def test_unrelated_errors_are_not_transient(err: str):
|
||||
"""AC: unrelated error strings must NOT be classified transient."""
|
||||
assert is_transient(err) is False, f"expected transient=False for {err!r}"
|
||||
|
||||
|
||||
def test_case_sensitive_substring_match():
|
||||
"""AC: substring match is case-sensitive (matches ADR-005 spec)."""
|
||||
# Uppercase "PROJECT REPO NOT FOUND AT" should NOT match the lowercase substring.
|
||||
assert is_transient("PROJECT REPO NOT FOUND AT /workspace/projects/mindmaps") is False
|
||||
86
tests/test_spec_path_persistence.py
Normal file
86
tests/test_spec_path_persistence.py
Normal file
@@ -0,0 +1,86 @@
|
||||
"""
|
||||
Unit tests for ADR-004: persist `spec_path` on spec-phase pass.
|
||||
|
||||
Story: S1 — Persist spec_path on spec-phase pass
|
||||
ADR: wiki/decisions/ADR-004-persist-spec-path-on-pass.md
|
||||
|
||||
Contract:
|
||||
- verdict=pass + phase=spec => spec_path from feedback is written to the row.
|
||||
- verdict != pass + phase=spec => spec_path is unchanged.
|
||||
|
||||
These tests drive `cycle.tick()` against the real test Postgres (conftest's
|
||||
default `db-test` service) and stub `phases.refine_spec` so the LLM is
|
||||
never called. The other moveable parts (wiki, relay) are also stubbed so
|
||||
the test is hermetic.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from conftest import get_row, insert_work_item
|
||||
from damascus import cycle, phases, relay, wiki
|
||||
|
||||
|
||||
def _stub_phase_returning(verdict: str, feedback: dict):
|
||||
"""Build a fake phases.refine_spec that returns a fixed verdict/feedback."""
|
||||
def fake_refine_spec(cur, item):
|
||||
print(f"DEBUG stub: item phase={item.get('phase')!r} id={item.get('id')!r}")
|
||||
print(f"DEBUG stub: returning verdict={verdict!r} feedback={feedback!r}")
|
||||
return {"verdict": verdict, "feedback": feedback}
|
||||
return fake_refine_spec
|
||||
|
||||
|
||||
def _run_tick_with_stub(monkeypatch, verdict: str, feedback: dict) -> None:
|
||||
"""Run one orchestrator tick with the spec phase stubbed.
|
||||
|
||||
wiki.init_wiki and relay.post are no-ops so the test does not touch
|
||||
the host filesystem or any external service.
|
||||
"""
|
||||
monkeypatch.setattr(wiki, "init_wiki", lambda: None)
|
||||
monkeypatch.setattr(relay, "post", lambda line: None)
|
||||
monkeypatch.setattr(phases, "refine_spec", _stub_phase_returning(verdict, feedback))
|
||||
|
||||
out = cycle.tick()
|
||||
print(f"DEBUG tick: claimed={out['claimed']!r} transition={out['transition']!r}")
|
||||
assert out["claimed"] is not None, "tick did not claim a row"
|
||||
assert out["transition"]["verdict"] == verdict
|
||||
|
||||
|
||||
def test_pass_verdict_persists_spec_path(monkeypatch):
|
||||
"""AC: On verdict=pass in spec phase, work_items.spec_path equals
|
||||
the absolute path returned in verdict_feedback."""
|
||||
rid = insert_work_item(phase="spec", story_id="S1-pass", title="Persist spec path on pass")
|
||||
expected_path = "/data/specs/wh40k-pc/S1-pass.spec.md"
|
||||
|
||||
_run_tick_with_stub(monkeypatch, "pass", {
|
||||
"spec_path": expected_path,
|
||||
"preview": "# Goal\n...",
|
||||
})
|
||||
|
||||
row = get_row(rid)
|
||||
assert row["spec_path"] == expected_path, (
|
||||
f"spec_path not persisted on pass: row has {row['spec_path']!r}, "
|
||||
f"expected {expected_path!r}"
|
||||
)
|
||||
# The phase should have advanced spec -> build (the contract for pass).
|
||||
assert row["phase"] == "build"
|
||||
|
||||
|
||||
def test_non_pass_verdict_does_not_persist(monkeypatch):
|
||||
"""AC: On a non-pass verdict in spec phase, work_items.spec_path is unchanged.
|
||||
For a freshly-inserted row, spec_path starts NULL and stays NULL."""
|
||||
rid = insert_work_item(phase="spec", story_id="S1-nopass",
|
||||
title="Spec ambiguous case")
|
||||
|
||||
_run_tick_with_stub(monkeypatch, "spec_ambiguous", {
|
||||
"issue_id": "test-issue-id",
|
||||
"preview": "# Goal\n...",
|
||||
})
|
||||
|
||||
row = get_row(rid)
|
||||
assert row["spec_path"] is None, (
|
||||
f"spec_path must be unchanged on non-pass; row has {row['spec_path']!r}"
|
||||
)
|
||||
# spec_ambiguous rolls back the attempts increment AND routes to
|
||||
# awaiting_human (contract per ADR-004 + design doc §5).
|
||||
assert row["phase"] == "awaiting_human"
|
||||
227
tests/unit/test_phases_section.py
Normal file
227
tests/unit/test_phases_section.py
Normal file
@@ -0,0 +1,227 @@
|
||||
"""
|
||||
Unit tests for phases.py::_section() — the spec-text parser.
|
||||
|
||||
_phases._section() extracts the body of a Markdown section by regex
|
||||
matching the section header. The orchestrator's spec-refiner uses
|
||||
this to verify the LLM-emitted spec has the required sections
|
||||
(Goal, Acceptance Criteria, TDD Plan, Test Command, etc.). If
|
||||
the regex drifts from the section names used in the prompt, every
|
||||
spec fails `spec_wrong` and burns attempts.
|
||||
|
||||
These tests pin the regex behavior so future prompt changes don't
|
||||
silently regress the post-check.
|
||||
|
||||
Run from the repo root:
|
||||
pytest tests/unit/test_phases_section.py -v
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
# Import the function under test from the orchestrator's installed
|
||||
# package. The orchestrator installs its source as `damascus` so
|
||||
# `from damascus.phases import _section` works from any CWD that
|
||||
# has the package on sys.path.
|
||||
from damascus.phases import _section
|
||||
|
||||
|
||||
def test_section_extracts_bare_section_body():
|
||||
"""A section with no parenthesized suffix extracts cleanly."""
|
||||
text = (
|
||||
"## Goal\n"
|
||||
"Ship a feature.\n"
|
||||
"\n"
|
||||
"## Acceptance Criteria\n"
|
||||
"1. It works.\n"
|
||||
"2. Tests pass.\n"
|
||||
)
|
||||
assert _section(text, "Goal") == "Ship a feature."
|
||||
assert _section(text, "Acceptance Criteria") == "1. It works.\n2. Tests pass."
|
||||
|
||||
|
||||
def test_section_extracts_until_next_section():
|
||||
"""Section body ends at the next `## ` header or end of text."""
|
||||
text = (
|
||||
"## Goal\n"
|
||||
"first section\n"
|
||||
"## TDD Plan\n"
|
||||
"second section\n"
|
||||
)
|
||||
assert _section(text, "Goal") == "first section"
|
||||
assert _section(text, "TDD Plan") == "second section"
|
||||
|
||||
|
||||
def test_section_returns_empty_for_missing_header():
|
||||
"""No match = empty string (not raise)."""
|
||||
text = "## Goal\nShip it."
|
||||
assert _section(text, "Acceptance Criteria") == ""
|
||||
assert _section(text, "Nonexistent Section") == ""
|
||||
|
||||
|
||||
def test_section_ignores_inline_mentions():
|
||||
"""A bare mention of the section name in body text doesn't trigger."""
|
||||
text = (
|
||||
"## Goal\n"
|
||||
"Build the Acceptance Criteria section carefully.\n"
|
||||
)
|
||||
# The body is the Goal's body, NOT a match for "Acceptance Criteria"
|
||||
# (no `## ` prefix in the body line).
|
||||
assert _section(text, "Acceptance Criteria") == ""
|
||||
|
||||
|
||||
def test_section_handles_whitespace_variations():
|
||||
"""Multiple spaces after `##` and trailing whitespace are tolerated."""
|
||||
text = (
|
||||
"## Goal \n"
|
||||
"Ship it.\n"
|
||||
)
|
||||
# The regex's `\s+` after `##` is greedy, so multiple spaces match.
|
||||
# The `\s*` before `\n` swallows trailing whitespace.
|
||||
assert "Ship it" in _section(text, "Goal")
|
||||
|
||||
|
||||
def test_section_matches_only_at_line_start():
|
||||
"""A `## Foo` inside a code fence or quoted line is NOT matched."""
|
||||
text = (
|
||||
"## Goal\n"
|
||||
"Ship it.\n"
|
||||
"\n"
|
||||
" ## Inline-mention\n"
|
||||
"This is in a quote, not a real section.\n"
|
||||
)
|
||||
# Inline-mention has leading whitespace, so the `^` anchor fails.
|
||||
assert _section(text, "Inline-mention") == ""
|
||||
|
||||
|
||||
def test_section_handles_parenthesized_suffix():
|
||||
"""The regex MUST accept `## <name> (description)` suffix.
|
||||
|
||||
The spec-refiner's prompt lists section headers with parenthesized
|
||||
descriptions (e.g. `## TDD Plan (list the failing tests)`) to hint
|
||||
the LLM about what to put in the body. The LLM faithfully copies
|
||||
these into its output. The regex's optional `(\\([^)]*\\))?` group
|
||||
is what makes the post-check match them.
|
||||
|
||||
Before this broadening (2026-06-26), the strict regex `\\s*\\n`
|
||||
rejected `(numbered)` / `(list the failing tests)` and every spec
|
||||
failed `spec_wrong` on first attempt.
|
||||
|
||||
See: wiki/queries/damascus-orchestrator/spec-refiner-text-parsing-2026-06-26.md
|
||||
for the gap analysis (recommends replacing text parsing with
|
||||
Pydantic-in / JSONB-out as a follow-up).
|
||||
"""
|
||||
text = (
|
||||
"## Goal\n"
|
||||
"Ship a feature.\n"
|
||||
"\n"
|
||||
"## Acceptance Criteria (numbered)\n"
|
||||
"1. Works.\n"
|
||||
"\n"
|
||||
"## TDD Plan (list the failing tests)\n"
|
||||
"- failing test 1\n"
|
||||
"- failing test 2\n"
|
||||
"\n"
|
||||
"## File Scope (list of paths/globs the implementation may touch)\n"
|
||||
"- src/foo.py\n"
|
||||
"\n"
|
||||
"## Test Command (the exact shell command that proves done)\n"
|
||||
"pytest tests/test_foo.py -v\n"
|
||||
"\n"
|
||||
"## Ambiguities (any open questions for a human)\n"
|
||||
"(none)\n"
|
||||
)
|
||||
# The regex MUST match all six sections, including the parenthesized
|
||||
# suffix on each. This is the fix for the 2026-06-26 bug.
|
||||
assert _section(text, "Goal") == "Ship a feature."
|
||||
assert _section(text, "Acceptance Criteria") == "1. Works."
|
||||
assert _section(text, "TDD Plan") == "- failing test 1\n- failing test 2"
|
||||
assert _section(text, "File Scope") == "- src/foo.py"
|
||||
assert _section(text, "Test Command") == "pytest tests/test_foo.py -v"
|
||||
assert _section(text, "Ambiguities") == "(none)"
|
||||
|
||||
|
||||
def test_section_rejects_parenthetical_in_middle_of_name():
|
||||
"""The suffix regex matches `(...)` only AFTER the section name, not
|
||||
embedded in it. `## Acceptance (numbered) Criteria` should NOT match
|
||||
`Acceptance Criteria` because the parenthetical is mid-name."""
|
||||
text = (
|
||||
"## Goal\n"
|
||||
"Real goal.\n"
|
||||
"\n"
|
||||
"## Acceptance (numbered) Criteria\n"
|
||||
"Should not match.\n"
|
||||
)
|
||||
assert _section(text, "Acceptance Criteria") == ""
|
||||
assert _section(text, "Goal") == "Real goal."
|
||||
|
||||
|
||||
def test_section_extracts_complex_multiline_body():
|
||||
"""A section with lists, code blocks, and sub-headings is captured whole."""
|
||||
text = (
|
||||
"## Goal\n"
|
||||
"Build X.\n"
|
||||
"\n"
|
||||
"Details:\n"
|
||||
"- item 1\n"
|
||||
"- item 2\n"
|
||||
"\n"
|
||||
"```bash\n"
|
||||
"echo code block\n"
|
||||
"```\n"
|
||||
"\n"
|
||||
"## Next\n"
|
||||
"Other.\n"
|
||||
)
|
||||
body = _section(text, "Goal")
|
||||
assert "Build X." in body
|
||||
assert "item 1" in body
|
||||
assert "item 2" in body
|
||||
assert "echo code block" in body
|
||||
# Should NOT include the next section
|
||||
assert "Other." not in body
|
||||
|
||||
|
||||
def test_section_required_for_spec_refiner_post_check():
|
||||
"""Integration check: all four sections the post-check requires
|
||||
extract cleanly from a well-formed spec."""
|
||||
text = (
|
||||
"## Goal\n"
|
||||
"Ship the feature.\n"
|
||||
"\n"
|
||||
"## Acceptance Criteria\n"
|
||||
"1. AC1.\n"
|
||||
"2. AC2.\n"
|
||||
"\n"
|
||||
"## TDD Plan\n"
|
||||
"- failing test 1\n"
|
||||
"- failing test 2\n"
|
||||
"\n"
|
||||
"## File Scope\n"
|
||||
"- src/foo.py\n"
|
||||
"- tests/test_foo.py\n"
|
||||
"\n"
|
||||
"## Test Command\n"
|
||||
"pytest tests/test_foo.py -v\n"
|
||||
"\n"
|
||||
"## Ambiguities\n"
|
||||
"(none)\n"
|
||||
)
|
||||
# This is exactly what the post-check at phases.py:76 verifies.
|
||||
missing = [s for s in ("Goal", "Acceptance Criteria", "TDD Plan", "Test Command")
|
||||
if not _section(text, s)]
|
||||
assert missing == [], f"post-check would flag {missing} as missing"
|
||||
|
||||
|
||||
def test_section_with_extra_blank_lines_in_body():
|
||||
"""Blank lines inside a section body are preserved."""
|
||||
text = (
|
||||
"## Goal\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"Ship it.\n"
|
||||
"\n"
|
||||
"## Next\n"
|
||||
"foo\n"
|
||||
)
|
||||
# The body is whitespace; `strip()` in `_section` removes leading/trailing
|
||||
# whitespace, so the result is "Ship it."
|
||||
assert _section(text, "Goal") == "Ship it."
|
||||
@@ -183,3 +183,54 @@ export function useGroupedItems(): UseQueryResult<GroupedItemsResponse> {
|
||||
refetchInterval: FIVE_SECONDS,
|
||||
});
|
||||
}
|
||||
|
||||
// ---- /v1/performance ----------------------------------------------------
|
||||
// Added 2026-06-27 to drive the perf dashboard widgets (avg request time,
|
||||
// avg tokens, stage failure rates, stage progression velocity).
|
||||
|
||||
export interface PhaseMetrics {
|
||||
avg_request_seconds: number | null;
|
||||
p50_request_seconds: number | null;
|
||||
p95_request_seconds: number | null;
|
||||
avg_input_tokens: number | null;
|
||||
avg_output_tokens: number | null;
|
||||
avg_total_tokens: number | null;
|
||||
request_count: number;
|
||||
failure_count: number;
|
||||
failure_rate: number | null;
|
||||
}
|
||||
|
||||
export interface ProjectMetrics {
|
||||
request_count: number;
|
||||
failure_count: number;
|
||||
failure_rate: number | null;
|
||||
}
|
||||
|
||||
export interface StageTransition {
|
||||
project: string;
|
||||
story_id: string;
|
||||
phase: string;
|
||||
seconds: number;
|
||||
}
|
||||
|
||||
export interface PerformanceResponse {
|
||||
window_start: string;
|
||||
window_end: string;
|
||||
total_requests: number;
|
||||
total_failures: number;
|
||||
by_phase: Record<string, PhaseMetrics>;
|
||||
by_project: Record<string, ProjectMetrics>;
|
||||
stage_progression: StageTransition[];
|
||||
}
|
||||
|
||||
export function usePerformance(
|
||||
days = 7,
|
||||
): UseQueryResult<PerformanceResponse> {
|
||||
return useQuery({
|
||||
queryKey: ["performance", days],
|
||||
queryFn: () =>
|
||||
api.get<PerformanceResponse>("/v1/performance", { days }),
|
||||
staleTime: FIVE_SECONDS,
|
||||
refetchInterval: FIVE_SECONDS,
|
||||
});
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user