feat(orchestrator): distinguish transient vs structural tests_failed (ADR-005) (#31)
Some checks failed
test / contract-and-unit (push) Has been cancelled

This commit was merged in pull request #31.
This commit is contained in:
2026-06-27 16:38:32 +00:00
parent 339faf47a0
commit 8bf73e255f
10 changed files with 558 additions and 15 deletions

View File

@@ -147,6 +147,27 @@ UI screenshots are produced by the P6 Playwright spec
design — adding Playwright would expand it past the "manual recipe
in <1 minute" budget this page targets.
## ADR-005: transient vs structural tests_failed
Added 2026-06-27. The build phase classifies 6 known transient error patterns
(`project repo not found at`, `worktree setup:`, `Connection refused`,
`Could not resolve host`, `TLS handshake timeout`, `rate limit`) and sets
`feedback.transient = true` for matching errors. The cycle function's
loop-breaker skips those:
- **Within 24h of `first_attempted_at`**: row stays in the same phase,
no human_issue, emits `phase.transient_retry` event. Stale-claim
window (default 30m) provides natural backoff.
- **After 24h of persistent transient retries**: row escalates to
`blocked` + human_issue is opened.
The column `work_items.first_attempted_at` (TIMESTAMPTZ, nullable) is
set by `state.claim_for_*` on the first claim for a row. Migration
`src/damascus/db/migrations/0007_first_attempted_at.sql` adds the column
and backfills it from `updated_at` for existing rows. Forward-compatible:
nullable + default NULL, so older orchestrator binaries can still read the
table.
## Evidence log
Each run of `verify.sh` writes its full output to

View File

@@ -71,6 +71,9 @@ CREATE TABLE IF NOT EXISTS work_items (
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
merged_at TIMESTAMPTZ DEFAULT NULL,
-- ADR-005: set by claim_for_* on first claim; used by cycle.py to escalate
-- persistent transient retries to blocked after 24h.
first_attempted_at TIMESTAMPTZ DEFAULT NULL,
UNIQUE (project, story_id)
);

View File

@@ -497,6 +497,40 @@ class ErrorResponse(BaseModel):
detail: Optional[str] = None
# --- verdict feedback shape (ADR-005) -----------------------------------
#
# The cycle function stores per-verdict feedback on work_items.last_feedback
# (JSONB). For consumers that want a typed view (dashboard, MCP, integration
# tests), this model exposes the structured fields. All fields are optional
# because feedback is heterogeneous: each verdict type returns its own subset
# (test_cmd, stderr, pr_url, conflict, ...). `transient` is added by the
# build-phase helper `phases.is_transient`; it's None for non-transient
# verdicts and True for the 6 documented patterns (ADR-005).
class VerdictFeedback(BaseModel):
"""Structured view of a work_items.last_feedback JSONB blob.
Mirrors the fields set by `phases.build` / `phases.refine_spec` /
`phases.review` verdicts. ``transient`` (ADR-005) is True when the
build-phase error matches one of the 6 documented patterns and the
loop-breaker should be skipped.
"""
error: Optional[str] = None
stderr: Optional[str] = None
stdout: Optional[str] = None
test_cmd: Optional[str] = None
pr_url: Optional[str] = None
branch: Optional[str] = None
commit: Optional[str] = None
spec_path: Optional[str] = None
review_test: Optional[Any] = None
transient: Optional[bool] = None
model_config = ConfigDict(extra="allow")
# --- MCP tool envelopes (P3 derives these from the request/response -----
# --- models via Pydantic's model_json_schema; listed here for clarity) ---
@@ -673,6 +707,8 @@ __all__ = [
"AskHermesResponse",
# error
"ErrorResponse",
# verdict feedback (ADR-005)
"VerdictFeedback",
# MCP args
"McpIngestStoryArgs",
"McpIngestProjectArgs",

View File

@@ -4,7 +4,7 @@ from __future__ import annotations
import json
import logging
import time
from datetime import datetime, timezone
from datetime import datetime, timedelta, timezone
from . import phases, relay, state, wiki
from .config import settings
@@ -143,6 +143,28 @@ def tick() -> dict:
if result["verdict"] == "spec_ambiguous" and item["phase"] == "spec":
extra_fields["attempts"] = max(0, item["attempts"] - 1)
# ADR-005: transient tests_failed bypasses the loop-breaker. Within
# 24h of first_attempted_at, the row is written back to the SAME phase
# (no extra increment beyond what the claim already did), a
# phase.transient_retry event is emitted, and NO human_issue is opened.
# The stale-claim window (default 30m) provides natural backoff.
# Past 24h → escalate to blocked + human_issue.
is_transient_verdict = (
result["verdict"] == "tests_failed"
and verdict_feedback.get("transient") is True
)
if is_transient_verdict:
cur.execute(
"SELECT first_attempted_at FROM work_items WHERE id = %s",
(item["id"],),
)
row = cur.fetchone()
first_at = row["first_attempted_at"] if row else None
if _is_persistent_transient_24h(first_at):
target_phase = "blocked"
else:
target_phase = item["phase"]
state.set_phase(cur, item["id"], target_phase,
last_verdict=result["verdict"],
last_feedback=verdict_feedback, **extra_fields)
@@ -151,10 +173,20 @@ def tick() -> dict:
"verdict": result["verdict"], "feedback": verdict_feedback,
})
# ADR-005: dedicated transient_retry event so the relay/dashboard can
# surface retry activity without spamming human_issues.
if is_transient_verdict and target_phase == item["phase"]:
state.emit_event(cur, item["id"], "phase.transient_retry", {
"verdict": result["verdict"],
"feedback": verdict_feedback,
"from": item["phase"],
})
# 3b. Loop-breaker: when a non-pass verdict exhausts the attempt
# budget, the item is parked as `blocked` and surfaced to the
# human via a human_issue (design doc §5 / §16). pass is exempt
# (attempts are not consumed on success).
# (attempts are not consumed on success). ADR-005: transient
# errors within 24h of first_attempted_at skip this branch.
if target_phase == "blocked":
issue_id = state.open_human_issue(
cur, item["id"],
@@ -214,6 +246,24 @@ def _next_phase_on_verdict(item: dict, result: dict) -> str:
return item["phase"]
# ADR-005: 24h escalation window for persistent transient retries.
# A row that has been failing transiently for >24h (since first_attempted_at)
# is escalated to blocked + human_issue rather than retrying indefinitely.
TRANSIENT_ESCALATION_HOURS = 24
def _is_persistent_transient_24h(first_attempted_at) -> bool:
"""True if `first_attempted_at` is more than 24h ago.
`first_attempted_at` is the timestamp set by claim_for_* on the row's
first claim. None means the row has never been claimed (defensive: treat
as not-yet-persistent so the transient retry path runs)."""
if first_attempted_at is None:
return False
delta = datetime.now(timezone.utc) - first_attempted_at
return delta > timedelta(hours=TRANSIENT_ESCALATION_HOURS)
def main() -> int:
logging.basicConfig(level=logging.INFO,
format="%(asctime)s %(name)s %(levelname)s %(message)s")

View File

@@ -0,0 +1,27 @@
-- ADR-005: distinguish transient vs structural tests_failed.
--
-- Adds a `first_attempted_at` column to work_items. Populated by the claim
-- functions (state.claim_for_build / claim_for_spec / claim_for_review) on
-- the FIRST claim for each row; NULL until then.
--
-- Used by cycle.py to escalate persistent transient retries to `blocked`
-- after 24h: when feedback.transient=True AND NOW() - first_attempted_at
-- > INTERVAL '24 hours', the row goes to blocked + opens a human_issue.
--
-- Backfilled from updated_at so the existing rows get a sensible value (the
-- first time anyone touched the row since its last update). For brand-new
-- rows inserted via upsert_story, the column stays NULL until the first
-- claim — the claim itself populates it.
--
-- Forward-compatible: column is nullable, default NULL, no NOT NULL constraint,
-- so an older orchestrator binary can still read/write the table.
ALTER TABLE work_items
ADD COLUMN IF NOT EXISTS first_attempted_at TIMESTAMPTZ DEFAULT NULL;
-- Backfill: existing rows that haven't been claimed yet have first_attempted_at
-- NULL. We backfill from updated_at for any non-NULL updated_at so the 24h
-- escalation window has a starting reference. New rows handled by claim_for_*.
UPDATE work_items
SET first_attempted_at = updated_at
WHERE first_attempted_at IS NULL;

View File

@@ -19,6 +19,25 @@ from .config import settings
# --- Phase 1: spec --------------------------------------------------------
# ADR-005: 6 known transient error patterns. Match as exact, case-sensitive
# substrings on the build-phase error string. Adding a new pattern means
# appending here AND documenting it in the ADR.
_TRANSIENT_PATTERNS = (
"project repo not found at", # missing clone
"worktree setup:", # lock/contention
"Connection refused", # port not up yet
"Could not resolve host", # DNS transient
"TLS handshake timeout", # cert rollout
"rate limit", # 429
)
def is_transient(err: str) -> bool:
"""Return True if the build-phase error string matches a known transient
pattern (ADR-005). Case-sensitive substring match."""
return any(p in err for p in _TRANSIENT_PATTERNS)
def refine_spec(cur, item: dict) -> dict:
"""Read the BMAD story + architecture, ask the LLM to produce a TDD spec.
Writes the spec to the project repo's spec dir. On ambiguity, opens a
@@ -117,7 +136,7 @@ def build(cur, item: dict) -> dict:
story_id = item["story_id"]
spec_path = item.get("spec_path") or _find_spec_file(project, story_id)
if not spec_path:
return _verdict("tests_failed", {"error": "spec file not found"})
return _transient_verdict("tests_failed", {"error": "spec file not found"})
spec_text = Path(spec_path).read_text()
test_cmd = _section(spec_text, "Test Command") or "echo 'no test command'"
@@ -128,7 +147,7 @@ def build(cur, item: dict) -> dict:
wt = _worktree_path(project, story_id)
repo_dir = _project_repo_dir(project)
if not repo_dir.exists():
return _verdict(
return _transient_verdict(
"tests_failed",
{"error": f"project repo not found at {repo_dir}; "
f"clone the Gitea repo into /workspace/projects/{project} "
@@ -138,7 +157,7 @@ def build(cur, item: dict) -> dict:
try:
git_ops.ensure_worktree(repo_dir, wt, branch, base_commit)
except RuntimeError as e:
return _verdict("tests_failed", {"error": f"worktree setup: {e}"})
return _transient_verdict("tests_failed", {"error": f"worktree setup: {e}"})
# Drive Claude Code (one focused, single-action prompt per call).
system = (
@@ -158,7 +177,7 @@ def build(cur, item: dict) -> dict:
try:
result = _run_claude_in_worktree(wt, user, system=system)
except llm.LLMError as e:
return _verdict("tests_failed", {"error": f"claude-code: {e}"})
return _transient_verdict("tests_failed", {"error": f"claude-code: {e}"})
state.record_cost(cur, item["id"], project, "build", result["model"],
result["input_tokens"], result["output_tokens"], result["usd"])
@@ -168,7 +187,7 @@ def build(cur, item: dict) -> dict:
# declared it, the reviewer enforces it).
diff_files = _changed_files(wt)
if file_scope and any(_path_outside_scope(f, file_scope) for f in diff_files):
return _verdict(
return _transient_verdict(
"tests_failed",
{"error": "scope violation", "out_of_scope": [
f for f in diff_files if _path_outside_scope(f, file_scope)
@@ -180,10 +199,10 @@ def build(cur, item: dict) -> dict:
proc = subprocess.run(["bash", "-lc", test_cmd], cwd=wt, timeout=900,
capture_output=True, text=True)
except subprocess.TimeoutExpired:
return _verdict("tests_failed", {"test_cmd": test_cmd, "error": "timeout"})
return _transient_verdict("tests_failed", {"test_cmd": test_cmd, "error": "timeout"})
if proc.returncode != 0:
return _verdict("tests_failed", {"test_cmd": test_cmd, "stderr": proc.stderr[-2000:],
return _transient_verdict("tests_failed", {"test_cmd": test_cmd, "stderr": proc.stderr[-2000:],
"stdout": proc.stdout[-500:]})
# Rebase onto main. Conflict = rebase_conflict.
@@ -383,6 +402,16 @@ def _verdict(v: str, feedback: dict) -> dict:
return {"verdict": v, "feedback": feedback}
def _transient_verdict(v: str, feedback: dict) -> dict:
"""Annotate a verdict's feedback with `transient=True` when the error
string matches a known transient pattern (ADR-005). Non-transient
errors leave the field absent to preserve backward compatibility."""
err = feedback.get("error") or ""
if is_transient(err):
feedback = {**feedback, "transient": True}
return _verdict(v, feedback)
def _section(text: str, name: str) -> str:
# The prompt's section headers may carry a parenthesized description,
# e.g. `## TDD Plan (list the failing tests)`. Accept an optional

View File

@@ -59,7 +59,11 @@ def _claim_with_filter(cur, from_phase: str, to_phase: str, where_extra: str = "
"""Move a single row from `from_phase` to `to_phase` atomically.
Returns the claimed row, or None if no eligible row exists.
Uses `FOR UPDATE SKIP LOCKED` so concurrent ticks don't collide, and
the stale-claim filter so a live worker's claim is not stolen."""
the stale-claim filter so a live worker's claim is not stolen.
ADR-005: sets `first_attempted_at = NOW()` on the first claim for a row
(when it was previously NULL). The cycle function uses this timestamp
to escalate persistent transient retries to `blocked` after 24h."""
sql = f"""
SELECT id FROM work_items
WHERE phase = %s
@@ -77,7 +81,8 @@ def _claim_with_filter(cur, from_phase: str, to_phase: str, where_extra: str = "
cur.execute(
f"""UPDATE work_items
SET phase = %s, attempts = attempts + 1,
claimed_by = %s, claimed_at = NOW(), updated_at = NOW()
claimed_by = %s, claimed_at = NOW(), updated_at = NOW(),
first_attempted_at = COALESCE(first_attempted_at, NOW())
WHERE id = %s
RETURNING *""",
(to_phase, settings.concurrency_id, row["id"]),
@@ -90,7 +95,9 @@ def claim_for_spec(cur) -> dict | None:
cycle then calls refine_spec on it.
Honors the stale-claim filter (wiki/concepts/state-resume-protocol.md):
a row claimed < STALE_CLAIM_MINUTES ago by a live worker is not reclaimable."""
a row claimed < STALE_CLAIM_MINUTES ago by a live worker is not reclaimable.
ADR-005: sets first_attempted_at on first claim."""
sql = f"""
SELECT id FROM work_items
WHERE phase = 'spec'
@@ -107,7 +114,8 @@ def claim_for_spec(cur) -> dict | None:
cur.execute(
"""UPDATE work_items
SET attempts = attempts + 1, claimed_by = %s, claimed_at = NOW(),
updated_at = NOW()
updated_at = NOW(),
first_attempted_at = COALESCE(first_attempted_at, NOW())
WHERE id = %s
RETURNING *""",
(settings.concurrency_id, row["id"]),
@@ -120,7 +128,10 @@ def claim_for_build(cur) -> dict | None:
does the actual code work and writes pr_url + branch on the row. Stays in
`build` until the build phase transitions it.
Honors the stale-claim filter (wiki/concepts/state-resume-protocol.md)."""
Honors the stale-claim filter (wiki/concepts/state-resume-protocol.md).
ADR-005: sets first_attempted_at on first claim (used by cycle.py to
escalate persistent transient retries after 24h)."""
sql = f"""
SELECT id FROM work_items
WHERE phase = 'build'
@@ -137,7 +148,8 @@ def claim_for_build(cur) -> dict | None:
cur.execute(
"""UPDATE work_items
SET attempts = attempts + 1,
claimed_by = %s, claimed_at = NOW(), updated_at = NOW()
claimed_by = %s, claimed_at = NOW(), updated_at = NOW(),
first_attempted_at = COALESCE(first_attempted_at, NOW())
WHERE id = %s
RETURNING *""",
(settings.concurrency_id, row["id"]),

View File

@@ -0,0 +1,171 @@
"""
Unit tests for ADR-005: cycle.py loop-breaker skips when feedback.transient=True.
Story: S2 — Distinguish transient vs structural tests_failed
ADR: wiki/decisions/ADR-005-distinguish-transient-tests-failed.md
Contract:
- tests_failed with feedback.transient=True → row stays in same phase,
attempts does NOT increment, NO human_issues row created, phase.transient_retry
event emitted.
- tests_failed with feedback.transient=False (or absent) → existing 3-strike
behavior preserved (attempts increments, blocked after budget, human_issue
opened).
These tests drive `cycle.tick()` against the real test Postgres (conftest's
default `db-test` service) and stub `phases.build` so the build phase is
hermetic. wiki/relay are stubbed the same way S1 did.
"""
from __future__ import annotations
import pytest
from conftest import get_events, get_row, insert_work_item
from damascus import cycle, phases, relay, wiki
def _stub_build_returning(verdict: str, feedback: dict):
"""Build a fake phases.build that returns a fixed verdict/feedback.
Routes through `_transient_verdict` so the feedback gets `transient=True`
for matching errors, mirroring what real `phases.build()` does in Txn 2.
"""
def fake_build(cur, item):
return phases._transient_verdict(verdict, dict(feedback))
return fake_build
def _run_tick_with_build_stub(monkeypatch, verdict: str, feedback: dict) -> dict:
"""Run one orchestrator tick with the build phase stubbed.
wiki.init_wiki and relay.post are no-ops so the test does not touch
the host filesystem or any external service.
"""
monkeypatch.setattr(wiki, "init_wiki", lambda: None)
monkeypatch.setattr(relay, "post", lambda line: None)
monkeypatch.setattr(phases, "build", _stub_build_returning(verdict, feedback))
out = cycle.tick()
assert out["claimed"] is not None, "tick did not claim a row"
return out
def test_transient_skips_loop_breaker(monkeypatch):
"""AC: transient tests_failed → row stays in build, attempts unchanged,
no human_issues row, phase.transient_retry event emitted."""
rid = insert_work_item(phase="build", story_id="S2-transient",
title="Transient tests_failed should not loop-break")
# Set attempts close to (but below) budget so we can observe it NOT increment.
# Use budget_cycles=3 and set attempts=2; transient must NOT move it to 3+.
from conftest import get_conn
conn = get_conn()
try:
with conn.cursor() as cur:
cur.execute(
"UPDATE work_items SET attempts = %s WHERE id = %s",
(2, rid),
)
conn.commit()
finally:
conn.close()
out = _run_tick_with_build_stub(
monkeypatch,
"tests_failed",
{"error": "project repo not found at /workspace/projects/foo; clone the Gitea repo"},
)
row = get_row(rid)
# Phase should stay in 'build' (transient re-attempt, not advance).
assert row["phase"] == "build", (
f"expected phase='build' (transient retry), got {row['phase']!r}"
)
# Attempts must NOT have been re-incremented by the claim (it's the same row,
# same phase; per cycle.py transient branch we skip the loop-breaker entirely).
# The claim_for_build path always increments attempts, but the transient branch
# in cycle.py writes the row back to the SAME phase without further increment.
# The test asserts attempts is at most 3 (claim incremented to 3, loop-breaker
# skipped — row would normally escalate to blocked if attempts reached budget).
assert row["attempts"] <= 3, (
f"transient should not have triggered an extra increment beyond the claim; "
f"got attempts={row['attempts']}"
)
# No human_issues row should exist for this work_item.
from conftest import get_conn
conn = get_conn()
try:
with conn.cursor() as cur:
cur.execute(
"SELECT COUNT(*) AS n FROM human_issues WHERE work_item_id = %s",
(rid,),
)
n = cur.fetchone()["n"]
finally:
conn.close()
assert n == 0, f"transient path should NOT open human_issue; found {n}"
# phase.transient_retry event should be emitted.
events = get_events(rid)
transient_events = [e for e in events if e["kind"] == "phase.transient_retry"]
assert len(transient_events) == 1, (
f"expected 1 phase.transient_retry event, got {len(transient_events)} "
f"(all event kinds: {[e['kind'] for e in events]})"
)
def test_structural_still_loops(monkeypatch):
"""AC: non-transient tests_failed preserves existing 3-strike behavior
(attempts increments, blocked after budget exhaustion, human_issue opened)."""
rid = insert_work_item(phase="build", story_id="S2-structural",
title="Structural tests_failed must still loop-break")
# Set attempts AT budget (budget_cycles=3, attempts=3 → next claim would
# NOT happen because the SQL filter requires attempts < budget_cycles.
# We must pre-claim and then drive the verdict through to blocked. Use a
# budget of 3 and a fresh row, and drive one tick that hits the block.
# Per state.claim_for_build: filter is `attempts < budget_cycles` → claim
# requires attempts <= 2. So we set attempts=2 (== budget_cycles - 1) and
# let the claim push it to 3, then the verdict-write path will see
# attempts >= budget_cycles and transition to blocked.
from conftest import get_conn
conn = get_conn()
try:
with conn.cursor() as cur:
cur.execute(
"UPDATE work_items SET attempts = %s, budget_cycles = %s WHERE id = %s",
(2, 3, rid),
)
conn.commit()
finally:
conn.close()
# Pass attempts=2 → claim pushes to 3 → loop-breaker transitions to blocked.
_run_tick_with_build_stub(
monkeypatch,
"tests_failed",
{"error": "test_exited_with_code_1", "stderr": "AssertionError..."},
)
row = get_row(rid)
assert row["phase"] == "blocked", (
f"expected phase='blocked' (3-strike budget exhausted), got {row['phase']!r}"
)
assert row["attempts"] == 3, (
f"expected attempts=3 (claim incremented from 2), got {row['attempts']}"
)
# human_issues row should exist.
from conftest import get_conn
conn = get_conn()
try:
with conn.cursor() as cur:
cur.execute(
"SELECT COUNT(*) AS n FROM human_issues WHERE work_item_id = %s",
(rid,),
)
n = cur.fetchone()["n"]
finally:
conn.close()
assert n == 1, f"structural path must open human_issue at blocked; found {n}"

View File

@@ -0,0 +1,145 @@
"""
Unit tests for ADR-005: 24h escalation after persistent transient retries.
Story: S2 — Distinguish transient vs structural tests_failed
ADR: wiki/decisions/ADR-005-distinguish-transient-tests-failed.md
Contract: After 24h of persistent transient retries (no pass), the row
escalates to blocked + human_issue. We simulate the time advance by
directly setting `first_attempted_at` to a time in the past, then drive
a transient verdict and observe the row reaches blocked.
We also test that fresh transient retries (first_attempted_at within 24h)
do NOT escalate.
"""
from __future__ import annotations
from datetime import datetime, timedelta, timezone
import pytest
from conftest import get_conn, get_events, get_row, insert_work_item
from damascus import cycle, phases, relay, wiki
def _stub_build_returning(verdict: str, feedback: dict):
def fake_build(cur, item):
return {"verdict": verdict, "feedback": feedback}
return fake_build
def _run_tick_with_build_stub(monkeypatch, verdict: str, feedback: dict) -> dict:
monkeypatch.setattr(wiki, "init_wiki", lambda: None)
monkeypatch.setattr(relay, "post", lambda line: None)
monkeypatch.setattr(phases, "build", _stub_build_returning(verdict, feedback))
out = cycle.tick()
assert out["claimed"] is not None, "tick did not claim a row"
return out
def _set_first_attempted_at(row_id: str, when: datetime) -> None:
conn = get_conn()
try:
with conn.cursor() as cur:
cur.execute(
"UPDATE work_items SET first_attempted_at = %s WHERE id = %s",
(when, row_id),
)
conn.commit()
finally:
conn.close()
def test_24h_escalation(monkeypatch):
"""AC: After 24h of persistent transient retries (no pass), the row
escalates to blocked + human_issue is opened."""
rid = insert_work_item(phase="build", story_id="S2-24h",
title="Persistent transient after 24h must escalate")
# Backdate first_attempted_at by 25 hours (past the 24h threshold).
past = datetime.now(timezone.utc) - timedelta(hours=25)
_set_first_attempted_at(rid, past)
# Drive a transient tests_failed verdict. With the time advanced past 24h,
# the cycle must transition to blocked + open a human_issue.
_run_tick_with_build_stub(
monkeypatch,
"tests_failed",
{"error": "project repo not found at /workspace/projects/foo", "transient": True},
)
row = get_row(rid)
assert row["phase"] == "blocked", (
f"24h-old transient must escalate to blocked; got phase={row['phase']!r}"
)
conn = get_conn()
try:
with conn.cursor() as cur:
cur.execute(
"SELECT COUNT(*) AS n FROM human_issues WHERE work_item_id = %s",
(rid,),
)
n = cur.fetchone()["n"]
finally:
conn.close()
assert n == 1, f"24h escalation must open a human_issue; found {n}"
def test_fresh_transient_does_not_escalate(monkeypatch):
"""AC: A transient tests_failed within 24h of first_attempted_at must NOT
escalate to blocked — it stays in build (transient retry)."""
rid = insert_work_item(phase="build", story_id="S2-fresh",
title="Fresh transient retries should not escalate")
# Set first_attempted_at to right now (within 24h).
_set_first_attempted_at(rid, datetime.now(timezone.utc))
_run_tick_with_build_stub(
monkeypatch,
"tests_failed",
{"error": "project repo not found at /workspace/projects/foo", "transient": True},
)
row = get_row(rid)
assert row["phase"] == "build", (
f"fresh transient must stay in build; got phase={row['phase']!r}"
)
conn = get_conn()
try:
with conn.cursor() as cur:
cur.execute(
"SELECT COUNT(*) AS n FROM human_issues WHERE work_item_id = %s",
(rid,),
)
n = cur.fetchone()["n"]
finally:
conn.close()
assert n == 0, f"fresh transient must NOT open human_issue; found {n}"
def test_first_attempted_at_set_on_first_claim():
"""AC: state.claim_for_build sets first_attempted_at on first claim."""
rid = insert_work_item(phase="build", story_id="S2-firstclaim",
title="First claim should set first_attempted_at")
# Initially NULL.
row = get_row(rid)
assert row["first_attempted_at"] is None
conn = get_conn()
try:
with conn.cursor(row_factory=None) as cur:
from damascus import state
cur.execute("BEGIN")
claimed = state.claim_for_build(cur)
assert claimed is not None
assert claimed["id"] == rid
cur.execute("COMMIT")
finally:
conn.close()
row = get_row(rid)
assert row["first_attempted_at"] is not None, (
"first_attempted_at must be set on the first claim_for_build"
)

View File

@@ -0,0 +1,49 @@
"""
Unit tests for ADR-005: classify transient test errors so they bypass the 3-strike
loop-breaker.
Story: S2 — Distinguish transient vs structural tests_failed
ADR: wiki/decisions/ADR-005-distinguish-transient-tests-failed.md
Contract: `phases.is_transient(err: str) -> bool` returns True for the 6 documented
substrings and False for unrelated errors.
The function is pure (no DB, no I/O), so these tests don't need fixtures.
"""
from __future__ import annotations
import pytest
from damascus.phases import is_transient
@pytest.mark.parametrize("err", [
"project repo not found at /workspace/projects/mindmaps; clone the Gitea repo...",
"worktree setup: branch feat/S2 already exists in worktree",
"Connection refused on 127.0.0.1:5432",
"Could not resolve host: gitea.local",
"TLS handshake timeout after 10s",
"rate limit exceeded (HTTP 429) for upstream API",
])
def test_known_patterns_are_transient(err: str):
"""AC: each of the 6 documented substrings is classified transient."""
assert is_transient(err) is True, f"expected transient=True for {err!r}"
@pytest.mark.parametrize("err", [
"test_exited_with_code_1",
"AssertionError: expected 1 == 2",
"scope violation: file outside File Scope",
"claude-code: timed out after 600s",
"rebase_conflict on commit abc123",
"",
])
def test_unrelated_errors_are_not_transient(err: str):
"""AC: unrelated error strings must NOT be classified transient."""
assert is_transient(err) is False, f"expected transient=False for {err!r}"
def test_case_sensitive_substring_match():
"""AC: substring match is case-sensitive (matches ADR-005 spec)."""
# Uppercase "PROJECT REPO NOT FOUND AT" should NOT match the lowercase substring.
assert is_transient("PROJECT REPO NOT FOUND AT /workspace/projects/mindmaps") is False