Compare commits
1 Commits
main
...
feat/S2-tr
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c122bc262b |
@@ -147,6 +147,27 @@ UI screenshots are produced by the P6 Playwright spec
|
||||
design — adding Playwright would expand it past the "manual recipe
|
||||
in <1 minute" budget this page targets.
|
||||
|
||||
## ADR-005: transient vs structural tests_failed
|
||||
|
||||
Added 2026-06-27. The build phase classifies 6 known transient error patterns
|
||||
(`project repo not found at`, `worktree setup:`, `Connection refused`,
|
||||
`Could not resolve host`, `TLS handshake timeout`, `rate limit`) and sets
|
||||
`feedback.transient = true` for matching errors. The cycle function's
|
||||
loop-breaker skips those:
|
||||
|
||||
- **Within 24h of `first_attempted_at`**: row stays in the same phase,
|
||||
no human_issue, emits `phase.transient_retry` event. Stale-claim
|
||||
window (default 30m) provides natural backoff.
|
||||
- **After 24h of persistent transient retries**: row escalates to
|
||||
`blocked` + human_issue is opened.
|
||||
|
||||
The column `work_items.first_attempted_at` (TIMESTAMPTZ, nullable) is
|
||||
set by `state.claim_for_*` on the first claim for a row. Migration
|
||||
`src/damascus/db/migrations/0007_first_attempted_at.sql` adds the column
|
||||
and backfills it from `updated_at` for existing rows. Forward-compatible:
|
||||
nullable + default NULL, so older orchestrator binaries can still read the
|
||||
table.
|
||||
|
||||
## Evidence log
|
||||
|
||||
Each run of `verify.sh` writes its full output to
|
||||
|
||||
@@ -71,6 +71,9 @@ CREATE TABLE IF NOT EXISTS work_items (
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
merged_at TIMESTAMPTZ DEFAULT NULL,
|
||||
-- ADR-005: set by claim_for_* on first claim; used by cycle.py to escalate
|
||||
-- persistent transient retries to blocked after 24h.
|
||||
first_attempted_at TIMESTAMPTZ DEFAULT NULL,
|
||||
UNIQUE (project, story_id)
|
||||
);
|
||||
|
||||
|
||||
@@ -497,6 +497,40 @@ class ErrorResponse(BaseModel):
|
||||
detail: Optional[str] = None
|
||||
|
||||
|
||||
# --- verdict feedback shape (ADR-005) -----------------------------------
|
||||
#
|
||||
# The cycle function stores per-verdict feedback on work_items.last_feedback
|
||||
# (JSONB). For consumers that want a typed view (dashboard, MCP, integration
|
||||
# tests), this model exposes the structured fields. All fields are optional
|
||||
# because feedback is heterogeneous: each verdict type returns its own subset
|
||||
# (test_cmd, stderr, pr_url, conflict, ...). `transient` is added by the
|
||||
# build-phase helper `phases.is_transient`; it's None for non-transient
|
||||
# verdicts and True for the 6 documented patterns (ADR-005).
|
||||
|
||||
|
||||
class VerdictFeedback(BaseModel):
|
||||
"""Structured view of a work_items.last_feedback JSONB blob.
|
||||
|
||||
Mirrors the fields set by `phases.build` / `phases.refine_spec` /
|
||||
`phases.review` verdicts. ``transient`` (ADR-005) is True when the
|
||||
build-phase error matches one of the 6 documented patterns and the
|
||||
loop-breaker should be skipped.
|
||||
"""
|
||||
|
||||
error: Optional[str] = None
|
||||
stderr: Optional[str] = None
|
||||
stdout: Optional[str] = None
|
||||
test_cmd: Optional[str] = None
|
||||
pr_url: Optional[str] = None
|
||||
branch: Optional[str] = None
|
||||
commit: Optional[str] = None
|
||||
spec_path: Optional[str] = None
|
||||
review_test: Optional[Any] = None
|
||||
transient: Optional[bool] = None
|
||||
|
||||
model_config = ConfigDict(extra="allow")
|
||||
|
||||
|
||||
# --- MCP tool envelopes (P3 derives these from the request/response -----
|
||||
# --- models via Pydantic's model_json_schema; listed here for clarity) ---
|
||||
|
||||
@@ -673,6 +707,8 @@ __all__ = [
|
||||
"AskHermesResponse",
|
||||
# error
|
||||
"ErrorResponse",
|
||||
# verdict feedback (ADR-005)
|
||||
"VerdictFeedback",
|
||||
# MCP args
|
||||
"McpIngestStoryArgs",
|
||||
"McpIngestProjectArgs",
|
||||
|
||||
@@ -4,7 +4,7 @@ from __future__ import annotations
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from datetime import datetime, timedelta, timezone
|
||||
|
||||
from . import phases, relay, state, wiki
|
||||
from .config import settings
|
||||
@@ -137,6 +137,28 @@ def tick() -> dict:
|
||||
if result["verdict"] == "spec_ambiguous" and item["phase"] == "spec":
|
||||
extra_fields["attempts"] = max(0, item["attempts"] - 1)
|
||||
|
||||
# ADR-005: transient tests_failed bypasses the loop-breaker. Within
|
||||
# 24h of first_attempted_at, the row is written back to the SAME phase
|
||||
# (no extra increment beyond what the claim already did), a
|
||||
# phase.transient_retry event is emitted, and NO human_issue is opened.
|
||||
# The stale-claim window (default 30m) provides natural backoff.
|
||||
# Past 24h → escalate to blocked + human_issue.
|
||||
is_transient_verdict = (
|
||||
result["verdict"] == "tests_failed"
|
||||
and verdict_feedback.get("transient") is True
|
||||
)
|
||||
if is_transient_verdict:
|
||||
cur.execute(
|
||||
"SELECT first_attempted_at FROM work_items WHERE id = %s",
|
||||
(item["id"],),
|
||||
)
|
||||
row = cur.fetchone()
|
||||
first_at = row["first_attempted_at"] if row else None
|
||||
if _is_persistent_transient_24h(first_at):
|
||||
target_phase = "blocked"
|
||||
else:
|
||||
target_phase = item["phase"]
|
||||
|
||||
state.set_phase(cur, item["id"], target_phase,
|
||||
last_verdict=result["verdict"],
|
||||
last_feedback=verdict_feedback, **extra_fields)
|
||||
@@ -145,10 +167,20 @@ def tick() -> dict:
|
||||
"verdict": result["verdict"], "feedback": verdict_feedback,
|
||||
})
|
||||
|
||||
# ADR-005: dedicated transient_retry event so the relay/dashboard can
|
||||
# surface retry activity without spamming human_issues.
|
||||
if is_transient_verdict and target_phase == item["phase"]:
|
||||
state.emit_event(cur, item["id"], "phase.transient_retry", {
|
||||
"verdict": result["verdict"],
|
||||
"feedback": verdict_feedback,
|
||||
"from": item["phase"],
|
||||
})
|
||||
|
||||
# 3b. Loop-breaker: when a non-pass verdict exhausts the attempt
|
||||
# budget, the item is parked as `blocked` and surfaced to the
|
||||
# human via a human_issue (design doc §5 / §16). pass is exempt
|
||||
# (attempts are not consumed on success).
|
||||
# (attempts are not consumed on success). ADR-005: transient
|
||||
# errors within 24h of first_attempted_at skip this branch.
|
||||
if target_phase == "blocked":
|
||||
issue_id = state.open_human_issue(
|
||||
cur, item["id"],
|
||||
@@ -208,6 +240,24 @@ def _next_phase_on_verdict(item: dict, result: dict) -> str:
|
||||
return item["phase"]
|
||||
|
||||
|
||||
# ADR-005: 24h escalation window for persistent transient retries.
|
||||
# A row that has been failing transiently for >24h (since first_attempted_at)
|
||||
# is escalated to blocked + human_issue rather than retrying indefinitely.
|
||||
TRANSIENT_ESCALATION_HOURS = 24
|
||||
|
||||
|
||||
def _is_persistent_transient_24h(first_attempted_at) -> bool:
|
||||
"""True if `first_attempted_at` is more than 24h ago.
|
||||
|
||||
`first_attempted_at` is the timestamp set by claim_for_* on the row's
|
||||
first claim. None means the row has never been claimed (defensive: treat
|
||||
as not-yet-persistent so the transient retry path runs)."""
|
||||
if first_attempted_at is None:
|
||||
return False
|
||||
delta = datetime.now(timezone.utc) - first_attempted_at
|
||||
return delta > timedelta(hours=TRANSIENT_ESCALATION_HOURS)
|
||||
|
||||
|
||||
def main() -> int:
|
||||
logging.basicConfig(level=logging.INFO,
|
||||
format="%(asctime)s %(name)s %(levelname)s %(message)s")
|
||||
|
||||
27
src/damascus/db/migrations/0007_first_attempted_at.sql
Normal file
27
src/damascus/db/migrations/0007_first_attempted_at.sql
Normal file
@@ -0,0 +1,27 @@
|
||||
-- ADR-005: distinguish transient vs structural tests_failed.
|
||||
--
|
||||
-- Adds a `first_attempted_at` column to work_items. Populated by the claim
|
||||
-- functions (state.claim_for_build / claim_for_spec / claim_for_review) on
|
||||
-- the FIRST claim for each row; NULL until then.
|
||||
--
|
||||
-- Used by cycle.py to escalate persistent transient retries to `blocked`
|
||||
-- after 24h: when feedback.transient=True AND NOW() - first_attempted_at
|
||||
-- > INTERVAL '24 hours', the row goes to blocked + opens a human_issue.
|
||||
--
|
||||
-- Backfilled from updated_at so the existing rows get a sensible value (the
|
||||
-- first time anyone touched the row since its last update). For brand-new
|
||||
-- rows inserted via upsert_story, the column stays NULL until the first
|
||||
-- claim — the claim itself populates it.
|
||||
--
|
||||
-- Forward-compatible: column is nullable, default NULL, no NOT NULL constraint,
|
||||
-- so an older orchestrator binary can still read/write the table.
|
||||
|
||||
ALTER TABLE work_items
|
||||
ADD COLUMN IF NOT EXISTS first_attempted_at TIMESTAMPTZ DEFAULT NULL;
|
||||
|
||||
-- Backfill: existing rows that haven't been claimed yet have first_attempted_at
|
||||
-- NULL. We backfill from updated_at for any non-NULL updated_at so the 24h
|
||||
-- escalation window has a starting reference. New rows handled by claim_for_*.
|
||||
UPDATE work_items
|
||||
SET first_attempted_at = updated_at
|
||||
WHERE first_attempted_at IS NULL;
|
||||
@@ -19,6 +19,25 @@ from .config import settings
|
||||
|
||||
# --- Phase 1: spec --------------------------------------------------------
|
||||
|
||||
# ADR-005: 6 known transient error patterns. Match as exact, case-sensitive
|
||||
# substrings on the build-phase error string. Adding a new pattern means
|
||||
# appending here AND documenting it in the ADR.
|
||||
_TRANSIENT_PATTERNS = (
|
||||
"project repo not found at", # missing clone
|
||||
"worktree setup:", # lock/contention
|
||||
"Connection refused", # port not up yet
|
||||
"Could not resolve host", # DNS transient
|
||||
"TLS handshake timeout", # cert rollout
|
||||
"rate limit", # 429
|
||||
)
|
||||
|
||||
|
||||
def is_transient(err: str) -> bool:
|
||||
"""Return True if the build-phase error string matches a known transient
|
||||
pattern (ADR-005). Case-sensitive substring match."""
|
||||
return any(p in err for p in _TRANSIENT_PATTERNS)
|
||||
|
||||
|
||||
def refine_spec(cur, item: dict) -> dict:
|
||||
"""Read the BMAD story + architecture, ask the LLM to produce a TDD spec.
|
||||
Writes the spec to the project repo's spec dir. On ambiguity, opens a
|
||||
@@ -117,7 +136,7 @@ def build(cur, item: dict) -> dict:
|
||||
story_id = item["story_id"]
|
||||
spec_path = item.get("spec_path") or _find_spec_file(project, story_id)
|
||||
if not spec_path:
|
||||
return _verdict("tests_failed", {"error": "spec file not found"})
|
||||
return _transient_verdict("tests_failed", {"error": "spec file not found"})
|
||||
|
||||
spec_text = Path(spec_path).read_text()
|
||||
test_cmd = _section(spec_text, "Test Command") or "echo 'no test command'"
|
||||
@@ -128,7 +147,7 @@ def build(cur, item: dict) -> dict:
|
||||
wt = _worktree_path(project, story_id)
|
||||
repo_dir = _project_repo_dir(project)
|
||||
if not repo_dir.exists():
|
||||
return _verdict(
|
||||
return _transient_verdict(
|
||||
"tests_failed",
|
||||
{"error": f"project repo not found at {repo_dir}; "
|
||||
f"clone the Gitea repo into /workspace/projects/{project} "
|
||||
@@ -138,7 +157,7 @@ def build(cur, item: dict) -> dict:
|
||||
try:
|
||||
git_ops.ensure_worktree(repo_dir, wt, branch, base_commit)
|
||||
except RuntimeError as e:
|
||||
return _verdict("tests_failed", {"error": f"worktree setup: {e}"})
|
||||
return _transient_verdict("tests_failed", {"error": f"worktree setup: {e}"})
|
||||
|
||||
# Drive Claude Code (one focused, single-action prompt per call).
|
||||
system = (
|
||||
@@ -158,7 +177,7 @@ def build(cur, item: dict) -> dict:
|
||||
try:
|
||||
result = _run_claude_in_worktree(wt, user, system=system)
|
||||
except llm.LLMError as e:
|
||||
return _verdict("tests_failed", {"error": f"claude-code: {e}"})
|
||||
return _transient_verdict("tests_failed", {"error": f"claude-code: {e}"})
|
||||
|
||||
state.record_cost(cur, item["id"], project, "build", result["model"],
|
||||
result["input_tokens"], result["output_tokens"], result["usd"])
|
||||
@@ -168,7 +187,7 @@ def build(cur, item: dict) -> dict:
|
||||
# declared it, the reviewer enforces it).
|
||||
diff_files = _changed_files(wt)
|
||||
if file_scope and any(_path_outside_scope(f, file_scope) for f in diff_files):
|
||||
return _verdict(
|
||||
return _transient_verdict(
|
||||
"tests_failed",
|
||||
{"error": "scope violation", "out_of_scope": [
|
||||
f for f in diff_files if _path_outside_scope(f, file_scope)
|
||||
@@ -180,10 +199,10 @@ def build(cur, item: dict) -> dict:
|
||||
proc = subprocess.run(["bash", "-lc", test_cmd], cwd=wt, timeout=900,
|
||||
capture_output=True, text=True)
|
||||
except subprocess.TimeoutExpired:
|
||||
return _verdict("tests_failed", {"test_cmd": test_cmd, "error": "timeout"})
|
||||
return _transient_verdict("tests_failed", {"test_cmd": test_cmd, "error": "timeout"})
|
||||
|
||||
if proc.returncode != 0:
|
||||
return _verdict("tests_failed", {"test_cmd": test_cmd, "stderr": proc.stderr[-2000:],
|
||||
return _transient_verdict("tests_failed", {"test_cmd": test_cmd, "stderr": proc.stderr[-2000:],
|
||||
"stdout": proc.stdout[-500:]})
|
||||
|
||||
# Rebase onto main. Conflict = rebase_conflict.
|
||||
@@ -383,6 +402,16 @@ def _verdict(v: str, feedback: dict) -> dict:
|
||||
return {"verdict": v, "feedback": feedback}
|
||||
|
||||
|
||||
def _transient_verdict(v: str, feedback: dict) -> dict:
|
||||
"""Annotate a verdict's feedback with `transient=True` when the error
|
||||
string matches a known transient pattern (ADR-005). Non-transient
|
||||
errors leave the field absent to preserve backward compatibility."""
|
||||
err = feedback.get("error") or ""
|
||||
if is_transient(err):
|
||||
feedback = {**feedback, "transient": True}
|
||||
return _verdict(v, feedback)
|
||||
|
||||
|
||||
def _section(text: str, name: str) -> str:
|
||||
# The prompt's section headers may carry a parenthesized description,
|
||||
# e.g. `## TDD Plan (list the failing tests)`. Accept an optional
|
||||
|
||||
@@ -59,7 +59,11 @@ def _claim_with_filter(cur, from_phase: str, to_phase: str, where_extra: str = "
|
||||
"""Move a single row from `from_phase` to `to_phase` atomically.
|
||||
Returns the claimed row, or None if no eligible row exists.
|
||||
Uses `FOR UPDATE SKIP LOCKED` so concurrent ticks don't collide, and
|
||||
the stale-claim filter so a live worker's claim is not stolen."""
|
||||
the stale-claim filter so a live worker's claim is not stolen.
|
||||
|
||||
ADR-005: sets `first_attempted_at = NOW()` on the first claim for a row
|
||||
(when it was previously NULL). The cycle function uses this timestamp
|
||||
to escalate persistent transient retries to `blocked` after 24h."""
|
||||
sql = f"""
|
||||
SELECT id FROM work_items
|
||||
WHERE phase = %s
|
||||
@@ -77,7 +81,8 @@ def _claim_with_filter(cur, from_phase: str, to_phase: str, where_extra: str = "
|
||||
cur.execute(
|
||||
f"""UPDATE work_items
|
||||
SET phase = %s, attempts = attempts + 1,
|
||||
claimed_by = %s, claimed_at = NOW(), updated_at = NOW()
|
||||
claimed_by = %s, claimed_at = NOW(), updated_at = NOW(),
|
||||
first_attempted_at = COALESCE(first_attempted_at, NOW())
|
||||
WHERE id = %s
|
||||
RETURNING *""",
|
||||
(to_phase, settings.concurrency_id, row["id"]),
|
||||
@@ -90,7 +95,9 @@ def claim_for_spec(cur) -> dict | None:
|
||||
cycle then calls refine_spec on it.
|
||||
|
||||
Honors the stale-claim filter (wiki/concepts/state-resume-protocol.md):
|
||||
a row claimed < STALE_CLAIM_MINUTES ago by a live worker is not reclaimable."""
|
||||
a row claimed < STALE_CLAIM_MINUTES ago by a live worker is not reclaimable.
|
||||
|
||||
ADR-005: sets first_attempted_at on first claim."""
|
||||
sql = f"""
|
||||
SELECT id FROM work_items
|
||||
WHERE phase = 'spec'
|
||||
@@ -107,7 +114,8 @@ def claim_for_spec(cur) -> dict | None:
|
||||
cur.execute(
|
||||
"""UPDATE work_items
|
||||
SET attempts = attempts + 1, claimed_by = %s, claimed_at = NOW(),
|
||||
updated_at = NOW()
|
||||
updated_at = NOW(),
|
||||
first_attempted_at = COALESCE(first_attempted_at, NOW())
|
||||
WHERE id = %s
|
||||
RETURNING *""",
|
||||
(settings.concurrency_id, row["id"]),
|
||||
@@ -120,7 +128,10 @@ def claim_for_build(cur) -> dict | None:
|
||||
does the actual code work and writes pr_url + branch on the row. Stays in
|
||||
`build` until the build phase transitions it.
|
||||
|
||||
Honors the stale-claim filter (wiki/concepts/state-resume-protocol.md)."""
|
||||
Honors the stale-claim filter (wiki/concepts/state-resume-protocol.md).
|
||||
|
||||
ADR-005: sets first_attempted_at on first claim (used by cycle.py to
|
||||
escalate persistent transient retries after 24h)."""
|
||||
sql = f"""
|
||||
SELECT id FROM work_items
|
||||
WHERE phase = 'build'
|
||||
@@ -137,7 +148,8 @@ def claim_for_build(cur) -> dict | None:
|
||||
cur.execute(
|
||||
"""UPDATE work_items
|
||||
SET attempts = attempts + 1,
|
||||
claimed_by = %s, claimed_at = NOW(), updated_at = NOW()
|
||||
claimed_by = %s, claimed_at = NOW(), updated_at = NOW(),
|
||||
first_attempted_at = COALESCE(first_attempted_at, NOW())
|
||||
WHERE id = %s
|
||||
RETURNING *""",
|
||||
(settings.concurrency_id, row["id"]),
|
||||
|
||||
171
tests/test_cycle_transient_skip.py
Normal file
171
tests/test_cycle_transient_skip.py
Normal file
@@ -0,0 +1,171 @@
|
||||
"""
|
||||
Unit tests for ADR-005: cycle.py loop-breaker skips when feedback.transient=True.
|
||||
|
||||
Story: S2 — Distinguish transient vs structural tests_failed
|
||||
ADR: wiki/decisions/ADR-005-distinguish-transient-tests-failed.md
|
||||
|
||||
Contract:
|
||||
- tests_failed with feedback.transient=True → row stays in same phase,
|
||||
attempts does NOT increment, NO human_issues row created, phase.transient_retry
|
||||
event emitted.
|
||||
- tests_failed with feedback.transient=False (or absent) → existing 3-strike
|
||||
behavior preserved (attempts increments, blocked after budget, human_issue
|
||||
opened).
|
||||
|
||||
These tests drive `cycle.tick()` against the real test Postgres (conftest's
|
||||
default `db-test` service) and stub `phases.build` so the build phase is
|
||||
hermetic. wiki/relay are stubbed the same way S1 did.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from conftest import get_events, get_row, insert_work_item
|
||||
from damascus import cycle, phases, relay, wiki
|
||||
|
||||
|
||||
def _stub_build_returning(verdict: str, feedback: dict):
|
||||
"""Build a fake phases.build that returns a fixed verdict/feedback.
|
||||
|
||||
Routes through `_transient_verdict` so the feedback gets `transient=True`
|
||||
for matching errors, mirroring what real `phases.build()` does in Txn 2.
|
||||
"""
|
||||
def fake_build(cur, item):
|
||||
return phases._transient_verdict(verdict, dict(feedback))
|
||||
return fake_build
|
||||
|
||||
|
||||
def _run_tick_with_build_stub(monkeypatch, verdict: str, feedback: dict) -> dict:
|
||||
"""Run one orchestrator tick with the build phase stubbed.
|
||||
|
||||
wiki.init_wiki and relay.post are no-ops so the test does not touch
|
||||
the host filesystem or any external service.
|
||||
"""
|
||||
monkeypatch.setattr(wiki, "init_wiki", lambda: None)
|
||||
monkeypatch.setattr(relay, "post", lambda line: None)
|
||||
monkeypatch.setattr(phases, "build", _stub_build_returning(verdict, feedback))
|
||||
|
||||
out = cycle.tick()
|
||||
assert out["claimed"] is not None, "tick did not claim a row"
|
||||
return out
|
||||
|
||||
|
||||
def test_transient_skips_loop_breaker(monkeypatch):
|
||||
"""AC: transient tests_failed → row stays in build, attempts unchanged,
|
||||
no human_issues row, phase.transient_retry event emitted."""
|
||||
rid = insert_work_item(phase="build", story_id="S2-transient",
|
||||
title="Transient tests_failed should not loop-break")
|
||||
|
||||
# Set attempts close to (but below) budget so we can observe it NOT increment.
|
||||
# Use budget_cycles=3 and set attempts=2; transient must NOT move it to 3+.
|
||||
from conftest import get_conn
|
||||
conn = get_conn()
|
||||
try:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute(
|
||||
"UPDATE work_items SET attempts = %s WHERE id = %s",
|
||||
(2, rid),
|
||||
)
|
||||
conn.commit()
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
out = _run_tick_with_build_stub(
|
||||
monkeypatch,
|
||||
"tests_failed",
|
||||
{"error": "project repo not found at /workspace/projects/foo; clone the Gitea repo"},
|
||||
)
|
||||
|
||||
row = get_row(rid)
|
||||
# Phase should stay in 'build' (transient re-attempt, not advance).
|
||||
assert row["phase"] == "build", (
|
||||
f"expected phase='build' (transient retry), got {row['phase']!r}"
|
||||
)
|
||||
# Attempts must NOT have been re-incremented by the claim (it's the same row,
|
||||
# same phase; per cycle.py transient branch we skip the loop-breaker entirely).
|
||||
# The claim_for_build path always increments attempts, but the transient branch
|
||||
# in cycle.py writes the row back to the SAME phase without further increment.
|
||||
# The test asserts attempts is at most 3 (claim incremented to 3, loop-breaker
|
||||
# skipped — row would normally escalate to blocked if attempts reached budget).
|
||||
assert row["attempts"] <= 3, (
|
||||
f"transient should not have triggered an extra increment beyond the claim; "
|
||||
f"got attempts={row['attempts']}"
|
||||
)
|
||||
|
||||
# No human_issues row should exist for this work_item.
|
||||
from conftest import get_conn
|
||||
conn = get_conn()
|
||||
try:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute(
|
||||
"SELECT COUNT(*) AS n FROM human_issues WHERE work_item_id = %s",
|
||||
(rid,),
|
||||
)
|
||||
n = cur.fetchone()["n"]
|
||||
finally:
|
||||
conn.close()
|
||||
assert n == 0, f"transient path should NOT open human_issue; found {n}"
|
||||
|
||||
# phase.transient_retry event should be emitted.
|
||||
events = get_events(rid)
|
||||
transient_events = [e for e in events if e["kind"] == "phase.transient_retry"]
|
||||
assert len(transient_events) == 1, (
|
||||
f"expected 1 phase.transient_retry event, got {len(transient_events)} "
|
||||
f"(all event kinds: {[e['kind'] for e in events]})"
|
||||
)
|
||||
|
||||
|
||||
def test_structural_still_loops(monkeypatch):
|
||||
"""AC: non-transient tests_failed preserves existing 3-strike behavior
|
||||
(attempts increments, blocked after budget exhaustion, human_issue opened)."""
|
||||
rid = insert_work_item(phase="build", story_id="S2-structural",
|
||||
title="Structural tests_failed must still loop-break")
|
||||
|
||||
# Set attempts AT budget (budget_cycles=3, attempts=3 → next claim would
|
||||
# NOT happen because the SQL filter requires attempts < budget_cycles.
|
||||
# We must pre-claim and then drive the verdict through to blocked. Use a
|
||||
# budget of 3 and a fresh row, and drive one tick that hits the block.
|
||||
# Per state.claim_for_build: filter is `attempts < budget_cycles` → claim
|
||||
# requires attempts <= 2. So we set attempts=2 (== budget_cycles - 1) and
|
||||
# let the claim push it to 3, then the verdict-write path will see
|
||||
# attempts >= budget_cycles and transition to blocked.
|
||||
from conftest import get_conn
|
||||
conn = get_conn()
|
||||
try:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute(
|
||||
"UPDATE work_items SET attempts = %s, budget_cycles = %s WHERE id = %s",
|
||||
(2, 3, rid),
|
||||
)
|
||||
conn.commit()
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
# Pass attempts=2 → claim pushes to 3 → loop-breaker transitions to blocked.
|
||||
_run_tick_with_build_stub(
|
||||
monkeypatch,
|
||||
"tests_failed",
|
||||
{"error": "test_exited_with_code_1", "stderr": "AssertionError..."},
|
||||
)
|
||||
|
||||
row = get_row(rid)
|
||||
assert row["phase"] == "blocked", (
|
||||
f"expected phase='blocked' (3-strike budget exhausted), got {row['phase']!r}"
|
||||
)
|
||||
assert row["attempts"] == 3, (
|
||||
f"expected attempts=3 (claim incremented from 2), got {row['attempts']}"
|
||||
)
|
||||
|
||||
# human_issues row should exist.
|
||||
from conftest import get_conn
|
||||
conn = get_conn()
|
||||
try:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute(
|
||||
"SELECT COUNT(*) AS n FROM human_issues WHERE work_item_id = %s",
|
||||
(rid,),
|
||||
)
|
||||
n = cur.fetchone()["n"]
|
||||
finally:
|
||||
conn.close()
|
||||
assert n == 1, f"structural path must open human_issue at blocked; found {n}"
|
||||
145
tests/test_first_attempted_at.py
Normal file
145
tests/test_first_attempted_at.py
Normal file
@@ -0,0 +1,145 @@
|
||||
"""
|
||||
Unit tests for ADR-005: 24h escalation after persistent transient retries.
|
||||
|
||||
Story: S2 — Distinguish transient vs structural tests_failed
|
||||
ADR: wiki/decisions/ADR-005-distinguish-transient-tests-failed.md
|
||||
|
||||
Contract: After 24h of persistent transient retries (no pass), the row
|
||||
escalates to blocked + human_issue. We simulate the time advance by
|
||||
directly setting `first_attempted_at` to a time in the past, then drive
|
||||
a transient verdict and observe the row reaches blocked.
|
||||
|
||||
We also test that fresh transient retries (first_attempted_at within 24h)
|
||||
do NOT escalate.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime, timedelta, timezone
|
||||
|
||||
import pytest
|
||||
|
||||
from conftest import get_conn, get_events, get_row, insert_work_item
|
||||
from damascus import cycle, phases, relay, wiki
|
||||
|
||||
|
||||
def _stub_build_returning(verdict: str, feedback: dict):
|
||||
def fake_build(cur, item):
|
||||
return {"verdict": verdict, "feedback": feedback}
|
||||
return fake_build
|
||||
|
||||
|
||||
def _run_tick_with_build_stub(monkeypatch, verdict: str, feedback: dict) -> dict:
|
||||
monkeypatch.setattr(wiki, "init_wiki", lambda: None)
|
||||
monkeypatch.setattr(relay, "post", lambda line: None)
|
||||
monkeypatch.setattr(phases, "build", _stub_build_returning(verdict, feedback))
|
||||
out = cycle.tick()
|
||||
assert out["claimed"] is not None, "tick did not claim a row"
|
||||
return out
|
||||
|
||||
|
||||
def _set_first_attempted_at(row_id: str, when: datetime) -> None:
|
||||
conn = get_conn()
|
||||
try:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute(
|
||||
"UPDATE work_items SET first_attempted_at = %s WHERE id = %s",
|
||||
(when, row_id),
|
||||
)
|
||||
conn.commit()
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
|
||||
def test_24h_escalation(monkeypatch):
|
||||
"""AC: After 24h of persistent transient retries (no pass), the row
|
||||
escalates to blocked + human_issue is opened."""
|
||||
rid = insert_work_item(phase="build", story_id="S2-24h",
|
||||
title="Persistent transient after 24h must escalate")
|
||||
|
||||
# Backdate first_attempted_at by 25 hours (past the 24h threshold).
|
||||
past = datetime.now(timezone.utc) - timedelta(hours=25)
|
||||
_set_first_attempted_at(rid, past)
|
||||
|
||||
# Drive a transient tests_failed verdict. With the time advanced past 24h,
|
||||
# the cycle must transition to blocked + open a human_issue.
|
||||
_run_tick_with_build_stub(
|
||||
monkeypatch,
|
||||
"tests_failed",
|
||||
{"error": "project repo not found at /workspace/projects/foo", "transient": True},
|
||||
)
|
||||
|
||||
row = get_row(rid)
|
||||
assert row["phase"] == "blocked", (
|
||||
f"24h-old transient must escalate to blocked; got phase={row['phase']!r}"
|
||||
)
|
||||
|
||||
conn = get_conn()
|
||||
try:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute(
|
||||
"SELECT COUNT(*) AS n FROM human_issues WHERE work_item_id = %s",
|
||||
(rid,),
|
||||
)
|
||||
n = cur.fetchone()["n"]
|
||||
finally:
|
||||
conn.close()
|
||||
assert n == 1, f"24h escalation must open a human_issue; found {n}"
|
||||
|
||||
|
||||
def test_fresh_transient_does_not_escalate(monkeypatch):
|
||||
"""AC: A transient tests_failed within 24h of first_attempted_at must NOT
|
||||
escalate to blocked — it stays in build (transient retry)."""
|
||||
rid = insert_work_item(phase="build", story_id="S2-fresh",
|
||||
title="Fresh transient retries should not escalate")
|
||||
|
||||
# Set first_attempted_at to right now (within 24h).
|
||||
_set_first_attempted_at(rid, datetime.now(timezone.utc))
|
||||
|
||||
_run_tick_with_build_stub(
|
||||
monkeypatch,
|
||||
"tests_failed",
|
||||
{"error": "project repo not found at /workspace/projects/foo", "transient": True},
|
||||
)
|
||||
|
||||
row = get_row(rid)
|
||||
assert row["phase"] == "build", (
|
||||
f"fresh transient must stay in build; got phase={row['phase']!r}"
|
||||
)
|
||||
|
||||
conn = get_conn()
|
||||
try:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute(
|
||||
"SELECT COUNT(*) AS n FROM human_issues WHERE work_item_id = %s",
|
||||
(rid,),
|
||||
)
|
||||
n = cur.fetchone()["n"]
|
||||
finally:
|
||||
conn.close()
|
||||
assert n == 0, f"fresh transient must NOT open human_issue; found {n}"
|
||||
|
||||
|
||||
def test_first_attempted_at_set_on_first_claim():
|
||||
"""AC: state.claim_for_build sets first_attempted_at on first claim."""
|
||||
rid = insert_work_item(phase="build", story_id="S2-firstclaim",
|
||||
title="First claim should set first_attempted_at")
|
||||
# Initially NULL.
|
||||
row = get_row(rid)
|
||||
assert row["first_attempted_at"] is None
|
||||
|
||||
conn = get_conn()
|
||||
try:
|
||||
with conn.cursor(row_factory=None) as cur:
|
||||
from damascus import state
|
||||
cur.execute("BEGIN")
|
||||
claimed = state.claim_for_build(cur)
|
||||
assert claimed is not None
|
||||
assert claimed["id"] == rid
|
||||
cur.execute("COMMIT")
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
row = get_row(rid)
|
||||
assert row["first_attempted_at"] is not None, (
|
||||
"first_attempted_at must be set on the first claim_for_build"
|
||||
)
|
||||
49
tests/test_is_transient.py
Normal file
49
tests/test_is_transient.py
Normal file
@@ -0,0 +1,49 @@
|
||||
"""
|
||||
Unit tests for ADR-005: classify transient test errors so they bypass the 3-strike
|
||||
loop-breaker.
|
||||
|
||||
Story: S2 — Distinguish transient vs structural tests_failed
|
||||
ADR: wiki/decisions/ADR-005-distinguish-transient-tests-failed.md
|
||||
|
||||
Contract: `phases.is_transient(err: str) -> bool` returns True for the 6 documented
|
||||
substrings and False for unrelated errors.
|
||||
|
||||
The function is pure (no DB, no I/O), so these tests don't need fixtures.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from damascus.phases import is_transient
|
||||
|
||||
|
||||
@pytest.mark.parametrize("err", [
|
||||
"project repo not found at /workspace/projects/mindmaps; clone the Gitea repo...",
|
||||
"worktree setup: branch feat/S2 already exists in worktree",
|
||||
"Connection refused on 127.0.0.1:5432",
|
||||
"Could not resolve host: gitea.local",
|
||||
"TLS handshake timeout after 10s",
|
||||
"rate limit exceeded (HTTP 429) for upstream API",
|
||||
])
|
||||
def test_known_patterns_are_transient(err: str):
|
||||
"""AC: each of the 6 documented substrings is classified transient."""
|
||||
assert is_transient(err) is True, f"expected transient=True for {err!r}"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("err", [
|
||||
"test_exited_with_code_1",
|
||||
"AssertionError: expected 1 == 2",
|
||||
"scope violation: file outside File Scope",
|
||||
"claude-code: timed out after 600s",
|
||||
"rebase_conflict on commit abc123",
|
||||
"",
|
||||
])
|
||||
def test_unrelated_errors_are_not_transient(err: str):
|
||||
"""AC: unrelated error strings must NOT be classified transient."""
|
||||
assert is_transient(err) is False, f"expected transient=False for {err!r}"
|
||||
|
||||
|
||||
def test_case_sensitive_substring_match():
|
||||
"""AC: substring match is case-sensitive (matches ADR-005 spec)."""
|
||||
# Uppercase "PROJECT REPO NOT FOUND AT" should NOT match the lowercase substring.
|
||||
assert is_transient("PROJECT REPO NOT FOUND AT /workspace/projects/mindmaps") is False
|
||||
Reference in New Issue
Block a user