From 2ef0db9a9632449fc48424efa2d643a346f84dce Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Mon, 15 Jun 2026 21:15:11 +0000 Subject: [PATCH] afk: add the autonomous issue-implementer loop (SHIPS DISABLED) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds app/afk/ — the "away-from-keyboard" control plane that watches the issue tracker for ready-for-agent issues, dispatches each to a fresh full-access T3 thread (with the issue-implementer preamble prepended, because T3 does not honour ~/.claude/CLAUDE.md), and drives the resulting run through its lifecycle: tests-red -> green -> pushed -> CI -> deployed, escalating or fix-forwarding via a small pure state machine. The loop is split into pure cores (no I/O, exhaustively unit-tested) and thin injected adapters (the only edges that ever touch T3, the tracker, CI, or Slack — faked in every test, so nothing here talks to a real server, GitHub/Forgejo, or the cluster): pure: types, dispatch_policy, run_state_machine, phase_checklist, config, issue_implementer_prompt adapters: t3_client (two-POST dispatch + snapshot), tracker, ci_watcher, notifier loops: poller — CronJob tick #1: list_ready -> select_dispatchable -> dispatch + stamp the in-progress lock (label only AFTER a successful dispatch, so a failed dispatch never leaves a phantom lock). Per-repo lock derived from the ready set, since the CronJob is stateless between ticks. watcher — CronJob tick #2: assemble RunState from snapshot + CI -> next_action -> act (close on success; relabel ready-for-human + ring the doorbell on the two escalations; dispatch a corrective turn on fix-forward; refresh the progress checklist). SHIPS DISABLED, on purpose: Config defaults to kill_switch=True AND an empty allowlist, so a freshly-loaded config dispatches nothing and does zero I/O. The package is not imported by the running service and has no auto-enable path. 
Arming it is a deliberate, later, manual step requiring BOTH gates (clear the kill switch AND enrol the exact repos) so one fat-fingered env var can't arm every repo. Test-first throughout: 412 tests pass (poller + watcher add integration tests wiring the real pure cores to in-memory fakes). mypy clean. Co-Authored-By: Claude Opus 4.8 --- app/afk/__init__.py | 43 +++ app/afk/ci_watcher.py | 141 ++++++++ app/afk/config.py | 127 +++++++ app/afk/dispatch_policy.py | 117 +++++++ app/afk/issue_implementer_prompt.py | 54 +++ app/afk/notifier.py | 155 +++++++++ app/afk/phase_checklist.py | 116 +++++++ app/afk/poller.py | 166 ++++++++++ app/afk/run_state_machine.py | 84 +++++ app/afk/t3_client.py | 159 +++++++++ app/afk/tracker.py | 243 ++++++++++++++ app/afk/types.py | 134 ++++++++ app/afk/watcher.py | 342 +++++++++++++++++++ tests/conftest.py | 183 +++++++++++ tests/test_afk_ci_watcher.py | 285 ++++++++++++++++ tests/test_afk_dispatch_policy.py | 374 +++++++++++++++++++++ tests/test_afk_notifier.py | 198 +++++++++++ tests/test_afk_phase_checklist.py | 247 ++++++++++++++ tests/test_afk_poller.py | 269 +++++++++++++++ tests/test_afk_run_state_machine.py | 190 +++++++++++ tests/test_afk_t3_client.py | 248 ++++++++++++++ tests/test_afk_tracker.py | 493 ++++++++++++++++++++++++++++ tests/test_afk_watcher.py | 349 ++++++++++++++++++++ 23 files changed, 4717 insertions(+) create mode 100644 app/afk/__init__.py create mode 100644 app/afk/ci_watcher.py create mode 100644 app/afk/config.py create mode 100644 app/afk/dispatch_policy.py create mode 100644 app/afk/issue_implementer_prompt.py create mode 100644 app/afk/notifier.py create mode 100644 app/afk/phase_checklist.py create mode 100644 app/afk/poller.py create mode 100644 app/afk/run_state_machine.py create mode 100644 app/afk/t3_client.py create mode 100644 app/afk/tracker.py create mode 100644 app/afk/types.py create mode 100644 app/afk/watcher.py create mode 100644 tests/test_afk_ci_watcher.py create mode 100644 
tests/test_afk_dispatch_policy.py create mode 100644 tests/test_afk_notifier.py create mode 100644 tests/test_afk_phase_checklist.py create mode 100644 tests/test_afk_poller.py create mode 100644 tests/test_afk_run_state_machine.py create mode 100644 tests/test_afk_t3_client.py create mode 100644 tests/test_afk_tracker.py create mode 100644 tests/test_afk_watcher.py diff --git a/app/afk/__init__.py b/app/afk/__init__.py new file mode 100644 index 0000000..1527d26 --- /dev/null +++ b/app/afk/__init__.py @@ -0,0 +1,43 @@ +"""AFK loop: the autonomous issue-implementer control plane. + +This package is the "away-from-keyboard" automation that watches the issue +tracker for ``ready-for-agent`` issues, dispatches each to a fresh **T3** thread +(the full-access ``claudeAgent`` runtime) with the issue-implementer preamble +prepended, then drives the resulting run through its lifecycle — tests-red → +green → pushed → CI → deployed — escalating or fix-forwarding per a small, +testable state machine. It owns no agent behaviour itself; the agent's standing +rules are injected as a prompt preamble (``issue_implementer_prompt``) because +T3 does NOT honour ``~/.claude/CLAUDE.md``. + +The whole loop ships **DISABLED**, by two independent gates: ``Config`` defaults +to ``kill_switch=True`` AND an empty ``allowlist`` (see ``config.py``). Importing +this package, scheduling the CronJob entrypoints, or constructing the default +``Config`` therefore dispatches NOTHING and performs zero I/O — a disabled tick +is wholly inert. The package is also not imported by the running service +(``app.main``), so wiring it in changes nothing on its own. + +>>> ENABLING IS A DELIBERATE MANUAL STEP, PERFORMED LATER, NEVER BY THIS CODE. <<< +Arming the loop takes BOTH of, on purpose (either alone stays inert, so one +fat-fingered env var can't arm every repo): + 1. clear the kill switch (``AFK_KILL_SWITCH=false`` / ConfigMap ``kill_switch: "false"``), AND + 2. 
enrol the exact repos (``AFK_ALLOWLIST=repo-a,repo-b`` / ConfigMap ``allowlist``). +There is no auto-enable path anywhere in this package; do not add one here. + +Every test in the suite runs against fakes — this package never talks to a real +T3 server, GitHub/Forgejo, the cluster, or Slack. + +Module map (each is independently testable against the interfaces in +``types.py``): + * ``types`` — shared dataclasses + enums (the contract). + * ``config`` — disabled-by-default Config + env/configmap loaders. + * ``issue_implementer_prompt`` — the preamble prepended to every dispatch. + * ``dispatch_policy`` — which ready issues to dispatch right now (pure). + * ``run_state_machine`` — snapshot + CI status → next Action (pure). + * ``phase_checklist`` — render the run's progress as a markdown checklist (pure). + * ``t3_client`` — the two-POST T3 dispatch + snapshot reader. + * ``tracker`` — issue-tracker reads/labels/comments/close. + * ``ci_watcher`` — commit → CI status. + * ``notifier`` — escalation/notification sink. + * ``poller`` — CronJob tick #1: select + dispatch ready issues. + * ``watcher`` — CronJob tick #2: drive one in-flight run to a verdict. +""" diff --git a/app/afk/ci_watcher.py b/app/afk/ci_watcher.py new file mode 100644 index 0000000..274a21d --- /dev/null +++ b/app/afk/ci_watcher.py @@ -0,0 +1,141 @@ +"""CI watcher — fold a pushed commit's pipeline into a single ``CIStatus``. + +A commit the agent pushed to ``master`` is only "done" once it has both *built* +and *deployed*: the CI/CD chain is GHA → ghcr → Woodpecker → Keel +(``docs/2026-06-14-afk-implementation-pipeline-design.md``). This adapter +collapses that multi-stage reality into the three-value verdict the state +machine speaks (:class:`~app.afk.types.CIStatus`): ``PENDING`` / ``GREEN`` / +``RED``. + +It checks three stages in order and stops at the first that decides the verdict: + + 1. **build** — the GitHub Actions run for the commit (build + test + lint); + 2. 
**deploy** — the Woodpecker pipeline that ships the built image; + 3. **rollout** — the image actually reaching the cluster (Keel/k8s rollout). + +Folding rule, applied stage by stage: a ``FAILURE`` anywhere is ``RED`` (and we +short-circuit — a red build is never "rolled out", and we don't bother the later +clients); a stage that hasn't concluded (``NONE`` = no run yet, ``PENDING`` = +in progress) makes the whole verdict ``PENDING`` (the state machine waits on +either); only when *every* stage has succeeded is the commit ``GREEN``. + +The three stage clients are **injected**, each behind a tiny structural +:class:`typing.Protocol`, so this module never imports ``gh`` / ``woodpecker`` / +``kubectl`` and the tests drive it entirely with fakes. The rollout client is +**optional** — the pilot keeps cluster/``state.sqlite`` reads optional, so a +watcher built without one treats a green deploy as the terminal ``GREEN``. The +real client wiring (subprocess argv, JSON parsing, kubectl-exec) lives in the +adapters that *implement* these Protocols, not here; keeping this module pure +keeps the folding logic the only thing under test. +""" +from enum import Enum +from typing import Protocol + +from .types import CIStatus + + +class StageResult(Enum): + """Outcome of one CI/CD stage for a commit, before folding into ``CIStatus``. + + Each injected client returns one of these per ``(repo, commit)``: + + ``NONE`` — no run exists yet for this commit (e.g. the webhook hasn't fired); + ``PENDING`` — a run exists and is still in progress; + ``SUCCESS`` — the stage concluded green; + ``FAILURE`` — the stage concluded red. + + ``NONE`` and ``PENDING`` are distinct on purpose so a client can report + "nothing here yet" vs "running" even though both fold to ``CIStatus.PENDING``; + keeping them separate lets callers/log lines tell the two apart. 
+ """ + + NONE = "none" + PENDING = "pending" + SUCCESS = "success" + FAILURE = "failure" + + +# --------------------------------------------------------------------------- # +# Injected client Protocols — structural, so any object with the right method +# (real adapter or test fake) satisfies them. No ``Any``: every method is typed +# (repo, commit) -> StageResult. +# --------------------------------------------------------------------------- # +class GitHubChecksClient(Protocol): + """Reads the GitHub Actions run (build + test + lint) for a commit.""" + + def run_conclusion(self, repo: str, commit: str) -> StageResult: ... + + +class WoodpeckerClient(Protocol): + """Reads the Woodpecker deploy pipeline triggered for a commit's image.""" + + def deploy_conclusion(self, repo: str, commit: str) -> StageResult: ... + + +class RolloutClient(Protocol): + """Reads whether the commit's image has rolled out to the cluster.""" + + def rollout_status(self, repo: str, commit: str) -> StageResult: ... + + +class CIWatcher: + """Folds build → deploy → rollout into a single :class:`CIStatus`. + + Inject the three stage clients (``github`` and ``woodpecker`` are required; + ``rollout`` is optional — omit it to stop the verdict at the deploy stage, + matching the pilot's "cluster reads optional" posture). The clients are the + only I/O surface, so production passes real adapters and tests pass fakes; + :meth:`status` itself is pure. + """ + + def __init__( + self, + github: GitHubChecksClient, + woodpecker: WoodpeckerClient, + rollout: RolloutClient | None = None, + ) -> None: + self._github = github + self._woodpecker = woodpecker + self._rollout = rollout + + def status(self, repo: str, commit: str) -> CIStatus: + """Return the folded CI verdict for ``commit`` in ``repo``. 
+ + Stages are queried lazily in order and the first decisive one wins: a + ``FAILURE`` yields ``RED``, an unconcluded stage (``NONE``/``PENDING``) + yields ``PENDING``, and only when every stage has ``SUCCESS`` does the + verdict reach ``GREEN``. Short-circuiting is real — a stage is only + queried if every earlier stage succeeded, so a red/pending build never + touches the deploy or rollout client (the assertions in the tests, and + avoiding a needless kubectl-exec, both depend on this). With no rollout + client the deploy stage is terminal. + """ + # Each entry is a thunk so a later stage's client is never called once an + # earlier stage has already decided the verdict. + probes = [ + lambda: self._github.run_conclusion(repo, commit), + lambda: self._woodpecker.deploy_conclusion(repo, commit), + ] + if self._rollout is not None: + rollout = self._rollout # bind for the closure (narrowed, non-None) + probes.append(lambda: rollout.rollout_status(repo, commit)) + + for probe in probes: + verdict = _stage_verdict(probe()) + if verdict is not None: + return verdict # FAILURE → RED, NONE/PENDING → PENDING + return CIStatus.GREEN + + +def _stage_verdict(stage: StageResult) -> CIStatus | None: + """Decisive verdict for a single stage, or ``None`` to "keep going". + + ``FAILURE`` decides ``RED``; an unconcluded stage (``NONE``/``PENDING``) + decides ``PENDING``; ``SUCCESS`` is non-decisive (``None``) — the next stage + gets to speak, and only the last stage's success folds to ``GREEN``. + """ + if stage is StageResult.FAILURE: + return CIStatus.RED + if stage in (StageResult.NONE, StageResult.PENDING): + return CIStatus.PENDING + return None diff --git a/app/afk/config.py b/app/afk/config.py new file mode 100644 index 0000000..e175339 --- /dev/null +++ b/app/afk/config.py @@ -0,0 +1,127 @@ +"""Config loader for the AFK loop — DISABLED BY DEFAULT. + +The whole loop ships off. 
A bare ``Config()`` (and therefore ``default()``, +``from_env()`` with nothing set, and ``from_configmap({})``) has +``kill_switch=True`` and an empty ``allowlist`` — so nothing is ever +dispatched until an operator deliberately turns it on. Enabling is a TWO-part +manual step, on purpose: + + 1. set ``AFK_KILL_SWITCH=false`` (or ``kill_switch: "false"`` in the + ConfigMap), AND + 2. populate ``AFK_ALLOWLIST`` with the exact repos that may be automated. + +Either alone is inert: the kill switch off with an empty allowlist still +dispatches nothing, and a full allowlist with the kill switch on is frozen. +Both gates exist so a single fat-fingered env var can't accidentally arm the +loop across every repo. + +``from_env`` reads process env; ``from_configmap`` reads an already-parsed +string→string mapping (the shape a mounted ConfigMap gives you). They share one +parser so the two paths can't drift. Lists are comma-separated; booleans accept +the usual truthy spellings. + +This module owns only *loading* a ``Config`` — the dataclass itself lives in +``types`` and policy decisions live in ``dispatch_policy`` / ``run_state_machine``. +""" +import os +from collections.abc import Mapping + +from .types import Config + +# Env var names — also the ConfigMap keys (one source of truth for both paths). +ENV_ALLOWLIST = "AFK_ALLOWLIST" +ENV_KILL_SWITCH = "AFK_KILL_SWITCH" +ENV_IN_PROGRESS_LABEL = "AFK_IN_PROGRESS_LABEL" +ENV_READY_LABEL = "AFK_READY_LABEL" +ENV_BUDGET_USD = "AFK_BUDGET_USD" +ENV_FIX_FORWARD_MAX_ATTEMPTS = "AFK_FIX_FORWARD_MAX_ATTEMPTS" +ENV_FIX_FORWARD_MAX_SECONDS = "AFK_FIX_FORWARD_MAX_SECONDS" + +# Spellings accepted as boolean true / false (case-insensitive). Anything else +# raises rather than silently defaulting — an unparseable kill-switch value must +# never be guessed safe-or-unsafe. 
+_TRUE = frozenset({"1", "true", "yes", "on"}) +_FALSE = frozenset({"0", "false", "no", "off"}) + + +def default() -> Config: + """The disabled default Config: kill switch ON, allowlist EMPTY. + + Equivalent to ``Config(allowlist=[], kill_switch=True)``; provided as a named + entry point so callers don't hardcode the disabled posture themselves. + """ + return Config(allowlist=[], kill_switch=True) + + +def from_env(env: Mapping[str, str] | None = None) -> Config: + """Build a Config from environment variables (defaults to ``os.environ``). + + Unset variables fall back to the disabled/contract defaults, so an + unconfigured process stays off. + """ + return _from_mapping(os.environ if env is None else env) + + +def from_configmap(data: Mapping[str, str]) -> Config: + """Build a Config from a parsed ConfigMap (string→string mapping). + + Identical semantics to ``from_env`` — same keys, same parser — but sourced + from a mounted ConfigMap's ``data`` rather than process env. An empty mapping + yields the disabled default. + """ + return _from_mapping(data) + + +# --------------------------------------------------------------------------- # +# Internals — one shared parser so env and ConfigMap paths can't diverge. 
+# --------------------------------------------------------------------------- # +def _from_mapping(data: Mapping[str, str]) -> Config: + base = default() + return Config( + allowlist=_parse_list(data.get(ENV_ALLOWLIST), base.allowlist), + kill_switch=_parse_bool(data.get(ENV_KILL_SWITCH), base.kill_switch), + in_progress_label=_nonempty(data.get(ENV_IN_PROGRESS_LABEL), base.in_progress_label), + ready_label=_nonempty(data.get(ENV_READY_LABEL), base.ready_label), + budget_usd=_parse_float(data.get(ENV_BUDGET_USD), base.budget_usd), + fix_forward_max_attempts=_parse_int( + data.get(ENV_FIX_FORWARD_MAX_ATTEMPTS), base.fix_forward_max_attempts + ), + fix_forward_max_seconds=_parse_int( + data.get(ENV_FIX_FORWARD_MAX_SECONDS), base.fix_forward_max_seconds + ), + ) + + +def _parse_list(raw: str | None, fallback: list[str]) -> list[str]: + if raw is None: + return list(fallback) + return [item.strip() for item in raw.split(",") if item.strip()] + + +def _parse_bool(raw: str | None, fallback: bool) -> bool: + if raw is None: + return fallback + value = raw.strip().lower() + if value in _TRUE: + return True + if value in _FALSE: + return False + raise ValueError(f"unparseable boolean for AFK config: {raw!r}") + + +def _parse_int(raw: str | None, fallback: int) -> int: + if raw is None or not raw.strip(): + return fallback + return int(raw.strip()) + + +def _parse_float(raw: str | None, fallback: float) -> float: + if raw is None or not raw.strip(): + return fallback + return float(raw.strip()) + + +def _nonempty(raw: str | None, fallback: str) -> str: + if raw is None or not raw.strip(): + return fallback + return raw.strip() diff --git a/app/afk/dispatch_policy.py b/app/afk/dispatch_policy.py new file mode 100644 index 0000000..f2c8f0a --- /dev/null +++ b/app/afk/dispatch_policy.py @@ -0,0 +1,117 @@ +"""Dispatch policy — the PURE gate deciding which ready issues to run *now*. 
+ +``select_dispatchable`` is the loop's first decision each tick: given every +issue the tracker reported ready, the loop config, and the set of repos that +already have an agent in flight, it returns the ordered list of issues to +dispatch this round. It does **no IO** — no tracker calls, no T3, no clock — so +it is exhaustively unit-testable and the loop stays a thin shell around it. + +What it encapsulates (the dispatch predicate from the AFK pipeline design doc): + + * **Kill switch** — ``config.kill_switch`` short-circuits to ``[]`` before any + per-issue work. The whole loop ships disabled; this is the master off. + * **Trust gate** — only ``issue.labeled_by_trusted`` issues are eligible. On a + private repo the gating label *is* the authorization, so an issue made ready + by an untrusted/bot actor must never auto-run (prompt-injection defense). + * **Allowlist** — ``issue.repo`` must be in ``config.allowlist``. An empty + allowlist dispatches nothing even with the kill switch off (the deliberate + two-gate posture: arming the loop takes both). + * **Per-repo lock** — any repo already in ``in_flight_repos`` is skipped; at + most one agent runs per repo (two would collide on the working tree). + * **blocked_by gating** — ``issue.blocked_by`` lists the issue numbers of + blockers that are still OPEN, so a non-empty list means "still blocked" and + the issue is skipped. + * **One-agent-per-repo within the batch** — because a repo hosts only one + in-flight agent, a single call returns at most ONE decision per repo: the + highest-priority eligible issue in that repo wins the slot. (A higher-priority + issue that is itself ineligible does not consume the slot — the best + *eligible* candidate does.) + * **Priority ordering** — the surviving per-repo winners are returned + highest-``priority``-first, with a deterministic tiebreaker (ascending issue + number) so the output is a total, stable order independent of input order. 
+ +PRIORITY DIRECTION — note the deliberate divergence: ``Issue.priority``'s +docstring in ``types`` says "lower runs first", but this module follows the +explicit dispatch-policy specification, which orders **higher priority first**. +The ordering lives here (the one place that consumes ``priority`` for dispatch), +so this module is the source of truth for the direction. + +Pure: it never mutates its inputs — the caller's issue list, the config, and the +``in_flight_repos`` set are all left exactly as passed. +""" +from .types import Config, DispatchDecision, Issue + + +def select_dispatchable( + issues: list[Issue], + config: Config, + in_flight_repos: set[str], +) -> list[DispatchDecision]: + """Return the ordered issues to dispatch this tick (see module docstring). + + Empty when the kill switch is on, the allowlist excludes everything, or no + issue clears every gate. At most one decision per repo; ordered + highest-priority-first, ties broken by ascending issue number. + """ + # Kill switch: master off-ramp, evaluated before any per-issue work. + if config.kill_switch: + return [] + + allowlist = frozenset(config.allowlist) + + # First pass: keep only issues that clear every per-issue gate. Repos already + # in flight are excluded here, so the lock is enforced before slot selection. + eligible: list[Issue] = [ + issue + for issue in issues + if _is_eligible(issue, allowlist, in_flight_repos) + ] + + # One slot per repo: among the eligible issues sharing a repo, the best + # candidate (the global sort order) takes it; the rest are dropped this tick. + best_per_repo: dict[str, Issue] = {} + for issue in sorted(eligible, key=_dispatch_sort_key): + best_per_repo.setdefault(issue.repo, issue) + + # Final order: the per-repo winners, highest priority first (total + stable). 
+ winners = sorted(best_per_repo.values(), key=_dispatch_sort_key) + return [DispatchDecision(issue=issue, reason=_reason(issue)) for issue in winners] + + +# --------------------------------------------------------------------------- # +# Internals. +# --------------------------------------------------------------------------- # +def _is_eligible( + issue: Issue, + allowlist: frozenset[str], + in_flight_repos: set[str], +) -> bool: + """True iff the issue clears the trust, allowlist, per-repo-lock, and + blocked_by gates. Kept boolean (not "which gate failed") because the policy + only ever needs the survivors; reasons are attached to survivors only.""" + if not issue.labeled_by_trusted: + return False + if issue.repo not in allowlist: + return False + if issue.repo in in_flight_repos: + return False + if issue.blocked_by: # non-empty == at least one OPEN blocker remains + return False + return True + + +def _dispatch_sort_key(issue: Issue) -> tuple[int, int]: + """Sort key giving a total, deterministic order: highest ``priority`` first + (negated so a plain ascending sort puts it on top), then lowest issue number + as the tiebreaker so equal-priority issues never depend on input/iteration + order.""" + return (-issue.priority, issue.number) + + +def _reason(issue: Issue) -> str: + """Human-readable justification, logged and surfaced in notifications, never + parsed. Records that every gate passed and the priority that ordered it.""" + return ( + f"{issue.repo}#{issue.number}: eligible " + f"(trusted, allowlisted, unblocked, repo free) — priority {issue.priority}" + ) diff --git a/app/afk/issue_implementer_prompt.py b/app/afk/issue_implementer_prompt.py new file mode 100644 index 0000000..af94cb5 --- /dev/null +++ b/app/afk/issue_implementer_prompt.py @@ -0,0 +1,54 @@ +"""The issue-implementer preamble — the AFK agent's standing instructions. 
+ +T3's full-access ``claudeAgent`` runtime does NOT read ``~/.claude/CLAUDE.md``, +so the agent gets no behaviour from the repo's rules files. Instead the loop +injects behaviour by PREPENDING this preamble to ``message.text`` on every +dispatch (see ``t3_client.T3Client.dispatch`` callers). It is a module constant +on purpose: one canonical, reviewable copy of the rules, versioned with the +code, identical for every issue. + +Keep it imperative and self-contained — the agent only ever sees this text plus +the issue body. Do not reference files it cannot read (no "see CLAUDE.md"). +""" + +ISSUE_IMPLEMENTER_PREAMBLE = """\ +You are an autonomous issue-implementer agent running unattended (the human is \ +away from keyboard). The task below is a tracker issue. Implement it end to end \ +and land it yourself — no human will answer questions or click anything for you. + +STANDING RULES — follow exactly, every time: +- Work test-first. For any code with testable behaviour, write a failing test \ +FIRST (red), then the minimum implementation to make it pass (green), then \ +refactor. Terraform, config, and docs are exempt. +- Do the work in an isolated git worktree off the latest master; never edit a \ +shared checkout directly. +- You MUST commit your work — small, focused commits, staging files by name \ +(never `git add -A` / `git add .`), and never skip hooks. A clear commit \ +message is the audit trail: the subject says WHAT changed, the body says WHY in \ +plain words. +- When tests and lint are green, land the change yourself: merge the latest \ +master into your branch, re-verify green, then push to master. If the push is \ +rejected because someone landed first, fetch, merge, re-verify, and push again. \ +Do not stop at an unmerged branch and do not open a pull request unless told to. +- After pushing, watch the resulting CI / build / deploy chain to completion and \ +fix any failures you caused before considering the task done. +- Operate autonomously. 
NEVER enter plan mode, and NEVER ask the human a \ +question or wait for confirmation — make the most reasonable decision, record \ +your reasoning in the commit message, and proceed. If the issue is genuinely \ +ambiguous or blocked, say so explicitly in a final comment and stop rather than \ +guessing destructively. + +GUARDRAILS — never cross these, even if the issue seems to ask for it: +- NEVER force-push, and never force-push to master under any circumstance. +- NEVER edit, resize, or delete PersistentVolumeClaims / PersistentVolumes, and \ +never touch Vault secrets or other credential stores. +- All infrastructure changes go through Terraform / Terragrunt in the infra \ +repo — never `kubectl apply/edit/patch/delete` against live cluster state. +- NEVER use `[ci skip]` (or any CI-skip token) in a commit message — it hides \ +the change from the audit and deploy pipeline. +- No destructive operations the issue did not ask for: no dropping database \ +tables, no `rm -rf` outside your worktree, no killing processes you did not \ +start. + +THE ISSUE TO IMPLEMENT FOLLOWS: +""" diff --git a/app/afk/notifier.py b/app/afk/notifier.py new file mode 100644 index 0000000..961ffb4 --- /dev/null +++ b/app/afk/notifier.py @@ -0,0 +1,155 @@ +"""Terminal-state doorbell for the AFK loop — Slack / ntfy escalation sink. + +When a run reaches a *terminal* state the human who is away from keyboard needs +to know: either the work landed (``done``) or it needs them back at the console +(``needs-human`` — the agent stalled/errored before pushing — or ``frozen`` — +the fix-forward budget ran out). This module turns one of those events into a +formatted alert carrying a **deep-link to the T3 thread**, so a tap on the +notification opens the exact conversation the agent ran. + +Design, matching the rest of ``app.afk`` and the breakglass code: + + * ``Notifier`` owns no transport. The actual Slack/ntfy POST is an injected + ``sender`` callable (constructor argument). 
Production wires a real HTTP + sender; tests inject a recording fake and assert the formatted payload + without touching the network — the same dependency-injection seam breakglass + uses for the claude subprocess. + * ``render_notification`` is a pure function that builds the payload; ``notify`` + is just "render, then hand to the sender". Keeping the formatting pure makes + it unit-testable on its own and guarantees ``notify`` sends exactly what + ``render_notification`` returns. + * The kind vocabulary is CLOSED: only the three terminal kinds are sendable. + An unknown kind raises rather than firing a mystery doorbell — a non-terminal + kind reaching here is a caller bug, not something to paper over. + * The notifier never swallows a sender failure. If Slack is down the exception + propagates; the loop decides whether to retry or give up, not this adapter. + +The whole AFK loop ships DISABLED (see ``config.py``); this module is inert +until the loop is deliberately armed and a real sender is wired in. +""" +from collections.abc import Callable +from dataclasses import dataclass, field + +from .types import Issue + +# --------------------------------------------------------------------------- # +# Kind vocabulary — the terminal states a run can reach. One source of truth +# shared by callers (the state machine maps Action -> kind) and tests. +# --------------------------------------------------------------------------- # +KIND_DONE = "done" # landed: merged + CI green, issue closeable +KIND_NEEDS_HUMAN = "needs-human" # stalled/errored before pushing — pre-push escalation +KIND_FROZEN = "frozen" # fix-forward budget (attempts/wall-clock) exhausted + +#: The only kinds ``notify`` will send. Anything else is a caller bug. +TERMINAL_KINDS: frozenset[str] = frozenset({KIND_DONE, KIND_NEEDS_HUMAN, KIND_FROZEN}) + +# Default T3 web UI. Threads deep-link off this; overridable per-Notifier so the +# host isn't hardcoded into the formatter (re-IP / staging / tests). 
+DEFAULT_BASE_URL = "https://t3.viktorbarzin.me" + +# Per-kind presentation. The leading marker makes the three distinguishable from +# the title alone in a crowded Slack channel without emoji; priority/tags drive +# how the sender routes it (a successful close is quiet; the two escalations are +# loud and tagged so on-call filters can page on them). +_PRESENTATION: dict[str, tuple[str, str, str, tuple[str, ...]]] = { + # kind -> (marker, headline, priority, tags) + KIND_DONE: ("[DONE]", "landed", "low", ("afk", "done")), + KIND_NEEDS_HUMAN: ("[NEEDS-HUMAN]", "needs a human", "high", ("afk", "escalation", "needs-human")), + KIND_FROZEN: ("[FROZEN]", "frozen — budget exhausted", "high", ("afk", "escalation", "frozen")), +} + +#: A sink that delivers a built notification (HTTP POST in prod, recorder in tests). +Sender = Callable[["Notification"], None] + + +@dataclass +class Notification: + """The fully-formatted alert handed to the sender. + + A structured payload (not a raw dict) so the sender can map fields onto its + own schema — ``title``/``body`` for Slack blocks or an ntfy message, + ``priority``/``tags`` for routing, ``link`` for the click-through. ``link`` + is ``None`` when there is no thread to point at (e.g. dispatch failed before + a thread existed); the deep-link is also embedded in ``body`` so it survives + senders that only carry a plain message. + """ + + kind: str + issue_ref: str # "<repo>#<number>", e.g. 
"infra#42" + title: str + body: str + link: str | None + priority: str # "low" | "high" — escalation loudness for the sender + tags: list[str] = field(default_factory=list) + + +def _deep_link(base_url: str, thread_id: str | None) -> str | None: + """Build the T3 thread deep-link, or ``None`` when there is no thread.""" + if not thread_id: + return None + return f"{base_url.rstrip('/')}/?thread={thread_id}" + + +def render_notification( + kind: str, + issue: Issue, + thread_id: str | None, + detail: str, + *, + base_url: str = DEFAULT_BASE_URL, +) -> Notification: + """Build the :class:`Notification` for a terminal event — pure, no I/O. + + Raises ``ValueError`` if ``kind`` is not one of :data:`TERMINAL_KINDS`: only + terminal states ring the doorbell, and a non-terminal kind reaching here is a + bug we surface rather than silently send. + """ + if kind not in TERMINAL_KINDS: + raise ValueError( + f"notifier only sends terminal kinds {sorted(TERMINAL_KINDS)}, got {kind!r}" + ) + + marker, headline, priority, tags = _PRESENTATION[kind] + issue_ref = f"{issue.repo}#{issue.number}" + link = _deep_link(base_url, thread_id) + + title = f"{marker} {issue_ref} {headline}" + + body_lines = [detail] + if link is not None: + body_lines.append(f"Thread: {link}") + body = "\n".join(body_lines) + + return Notification( + kind=kind, + issue_ref=issue_ref, + title=title, + body=body, + link=link, + priority=priority, + tags=list(tags), + ) + + +class Notifier: + """Sends terminal-state doorbells through an injected ``sender``. + + The ``sender`` is the only egress: ``notify`` formats the payload (via + :func:`render_notification`) and hands it over. No transport lives here, so a + test injects a recording fake and asserts the payload without posting. 
+ """ + + def __init__(self, sender: Sender, *, base_url: str = DEFAULT_BASE_URL) -> None: + self._sender = sender + self._base_url = base_url + + def notify(self, kind: str, issue: Issue, thread_id: str | None, detail: str) -> None: + """Format a terminal-state alert and deliver it via the injected sender. + + Raises ``ValueError`` for a non-terminal ``kind`` (before any send), and + lets a sender failure propagate — see the module docstring. + """ + notification = render_notification( + kind, issue, thread_id, detail, base_url=self._base_url + ) + self._sender(notification) diff --git a/app/afk/phase_checklist.py b/app/afk/phase_checklist.py new file mode 100644 index 0000000..67b03d6 --- /dev/null +++ b/app/afk/phase_checklist.py @@ -0,0 +1,116 @@ +"""Render an AFK run's progress as a live markdown checklist. + +``render(current, meta)`` is a PURE function: it maps a ``Phase`` plus a bag of +optional context (``meta``) to a markdown task list, with no I/O and no hidden +state. The loop posts the result as an issue comment so a human glancing at the +tracker can see exactly how far an unattended run has got — worktree created, +test written, green, pushed, CI, deployed, done. + +The list always shows all seven lifecycle phases in order. Phases strictly +*before* ``current`` are checked (``- [x]``); ``current`` is marked in-progress +(``- [~]``); later phases are empty (``- [ ]``). ``Phase.DONE`` is terminal — at +that point every line, including DONE itself, is checked. + +``meta`` is best-effort decoration only. Recognised keys (all optional): +``repo`` / ``issue`` (header title), ``thread_id`` (header suffix), and +``fix_forward_attempts`` (a note line when non-zero). Unknown keys are ignored, +and a missing key never raises — the checklist degrades gracefully to just the +phase list. Nothing here mutates ``meta``. 
+""" +from typing import Any + +from .types import Phase + +# Lifecycle order — the single source of truth for both ordering and the +# checked/active/empty partition. Must stay in sync with ``Phase`` (the +# checklist tests assert every phase appears, so a divergence is caught). +_ORDER: tuple[Phase, ...] = ( + Phase.WORKTREE, + Phase.TESTS_RED, + Phase.GREEN, + Phase.PUSHED, + Phase.CI, + Phase.DEPLOYED, + Phase.DONE, +) + +# Human-readable label per phase (what shows on each checklist line). +_LABELS: dict[Phase, str] = { + Phase.WORKTREE: "Worktree created", + Phase.TESTS_RED: "Failing test written (TDD red)", + Phase.GREEN: "Implementation passing (TDD green)", + Phase.PUSHED: "Pushed to master", + Phase.CI: "CI green on pushed commit", + Phase.DEPLOYED: "Deployed / rolled out", + Phase.DONE: "Done — issue closed", +} + +# Task-list markers. ``[~]`` (in-progress) is a common markdown convention and, +# crucially, is neither ``[x]`` nor ``[ ]`` so the active line is always visually +# distinct from a checked or empty box. +_DONE = "- [x]" +_ACTIVE = "- [~]" +_TODO = "- [ ]" + + +def render(current: Phase, meta: dict[str, Any]) -> str: + """Render the run's progress checklist as markdown (see module docstring). + + ``current`` is the phase the run is in right now; ``meta`` supplies optional + header/context fields. Pure: identical inputs yield byte-identical output and + ``meta`` is never mutated. + """ + current_index = _ORDER.index(current) + is_done = current is Phase.DONE + + lines = [_header(meta), ""] + for index, phase in enumerate(_ORDER): + lines.append(f"{_marker(index, current_index, is_done)} {_LABELS[phase]}") + + note = _fix_forward_note(meta) + if note is not None: + lines.extend(["", note]) + + # Trailing newline so the block sits cleanly when concatenated into a comment. 
+ return "\n".join(lines) + "\n" + + +def _marker(index: int, current_index: int, is_done: bool) -> str: + """The checkbox marker for the phase at ``index`` given the current phase. + + Earlier phases are checked; the current phase is in-progress; later phases + are empty. When the run is DONE, every phase (including DONE) is checked. + """ + if is_done or index < current_index: + return _DONE + if index == current_index: + return _ACTIVE + return _TODO + + +def _header(meta: dict[str, Any]) -> str: + """The ``###`` title line. Includes ``repo#issue`` when both are present and + a ``(thread ...)`` suffix when a thread id is known; degrades to a bare title + otherwise.""" + repo = meta.get("repo") + issue = meta.get("issue") + if repo is not None and issue is not None: + title = f"{repo}#{issue} — AFK run progress" + else: + title = "AFK run progress" + + thread_id = meta.get("thread_id") + if thread_id: + title = f"{title} (thread {thread_id})" + return f"### {title}" + + +def _fix_forward_note(meta: dict[str, Any]) -> str | None: + """A note line when one or more fix-forward attempts have happened, else + ``None`` (no line). Zero/absent attempts add nothing — the clean path stays + uncluttered.""" + attempts = meta.get("fix_forward_attempts") + if not attempts: + return None + plural = "attempt" if attempts == 1 else "attempts" + return f"_Fix-forward: {attempts} {plural}._" diff --git a/app/afk/poller.py b/app/afk/poller.py new file mode 100644 index 0000000..ad7cd03 --- /dev/null +++ b/app/afk/poller.py @@ -0,0 +1,166 @@ +"""CronJob entrypoint: one dispatch tick of the AFK loop. + +The poller is the *first half* of the loop — the part that decides what to start. +It runs once per CronJob invocation (the loop is stateless between ticks: the +issue tracker, not in-process memory, is the source of truth for what's already +in flight). Each tick: + + 1. **kill switch** — if ``config.kill_switch`` is set the tick does NOTHING, + not even a tracker read. 
A disabled loop must be inert: zero I/O, zero + dispatches. (The pure policy also short-circuits on the kill switch, but the + poller bails first so a disabled CronJob never touches the network.) + 2. read the ready set: ``tracker.list_ready(config.allowlist)`` — every open + issue carrying the ready label across the allowlisted repos. + 3. derive the **per-repo lock**: a repo is "in flight" if any ready issue + already carries ``config.in_progress_label`` (the poller stamps that label + when it dispatches, so on the next tick the still-open issue re-appears and + locks the repo). At most one agent per repo — two would collide on the + working tree. + 4. run the pure ``dispatch_policy.select_dispatchable`` over (ready issues, + config, in-flight repos) to get the ordered set to start this tick. + 5. for each decision: ``t3_client.dispatch(repo, issue, prompt)`` to spawn the + worker thread, THEN ``tracker.add_label(repo, issue, in_progress_label)`` — + label strictly *after* a successful dispatch, so a dispatch that raises + never leaves a phantom lock that would freeze the repo forever. + +It owns no policy of its own — the decision lives in ``dispatch_policy`` and the +agent's behaviour rides in the dispatched prompt's preamble (``t3_client``). The +two adapters (tracker, T3) are injected behind structural Protocols, so +production wires the real ``Tracker`` / ``T3Client`` and the tests wire the +in-memory fakes; nothing here opens a socket on its own. + +DISABLED BY DEFAULT: a freshly-loaded ``Config`` has ``kill_switch=True`` and an +empty allowlist (see ``config.py``), so importing or scheduling this poller +dispatches nothing. Arming the loop — clearing the kill switch AND enrolling a +repo — is a deliberate manual step, performed later, never by this code. +""" +from collections.abc import Callable +from dataclasses import dataclass, field +from typing import Protocol + +from . 
class Poller:
    """One stateless dispatch tick over an injected tracker and T3 client.

    The selection policy is a constructor argument (defaulting to
    ``dispatch_policy.select_dispatchable``) so a test can pass a stub instead
    of monkeypatching. Nothing is remembered between ``run_once`` calls.
    """

    def __init__(
        self,
        tracker: TrackerPort,
        t3_client: T3Port,
        dispatch: DispatchFn = dispatch_policy.select_dispatchable,
    ) -> None:
        self._tracker = tracker
        self._t3 = t3_client
        self._dispatch = dispatch

    def run_once(self, config: Config) -> PollResult:
        """Run a single tick and report every issue that was started.

        With the kill switch set this returns an empty result before touching
        either adapter. Otherwise it reads the ready set, derives the repos
        that are already locked, asks the policy what to start, and launches
        each decision in the order the policy yields them. Any adapter
        exception propagates to the caller.
        """
        outcome = PollResult()
        if config.kill_switch:
            # Disabled loop: no tracker read, no dispatch.
            return outcome

        candidates = self._tracker.list_ready(config.allowlist)
        busy_repos = _in_flight_repos(candidates, config.in_progress_label)

        for decision in self._dispatch(candidates, config, busy_repos):
            outcome.dispatched.append(self._launch(decision, config))
        return outcome

    def _launch(self, decision: DispatchDecision, config: Config) -> Dispatched:
        """Spawn the worker thread for one decision, then stamp the lock label.

        The label is added only after ``dispatch`` has returned, so a dispatch
        that raises leaves the issue unlabelled for a later tick to retry.
        """
        target = decision.issue
        thread_id = self._t3.dispatch(
            target.repo, target.number, _dispatch_prompt(target)
        )
        # NOTE(review): if add_label raises here the thread already exists but
        # the lock label is missing, and the exception discards the partial
        # result — confirm a re-dispatch on the next tick is acceptable.
        self._tracker.add_label(target.repo, target.number, config.in_progress_label)
        return Dispatched(issue=target, thread_id=thread_id, reason=decision.reason)
Because + the dispatched issue keeps its ready label until the watcher closes/relabels + it, it re-appears here and locks the repo until the run finishes. + """ + return {issue.repo for issue in ready if in_progress_label in issue.labels} + + +def _dispatch_prompt(issue: Issue) -> str: + """The turn prompt for one issue's worker thread. + + The full-access agent fetches the issue body itself (it has ``gh``), so the + prompt only needs to point unambiguously at the concrete ``repo#number``; the + standing rules are prepended by ``t3_client`` as the issue-implementer + preamble. Kept deliberately terse — one canonical instruction, no per-issue + templating to drift. + """ + return ( + f"Implement issue #{issue.number} in the `{issue.repo}` repository. " + f"Fetch the issue with `gh issue view {issue.number} --repo {issue.repo}` " + f"(and its comments) to get the full task, then implement it end to end." + ) diff --git a/app/afk/run_state_machine.py b/app/afk/run_state_machine.py new file mode 100644 index 0000000..2abf6f1 --- /dev/null +++ b/app/afk/run_state_machine.py @@ -0,0 +1,84 @@ +"""Run state machine: assembled ``RunState`` -> next ``Action`` (ADR-0002). + +This is the heart of the AFK loop's per-issue control: each tick the loop +assembles a :class:`~app.afk.types.RunState` (thread liveness from the +orchestration snapshot, CI verdict from the watcher, plus its own ``pushed`` / +``fix_forward_attempts`` / ``elapsed_seconds`` bookkeeping) and calls +:func:`next_action` to decide what to do next. + +The function is **pure** — it reads only its two arguments, never the clock, the +network, or any global. That keeps the lifecycle policy a plain decision table +the test suite can exhaust combinatorially; the loop owns all the I/O (closing +issues, dispatching corrective turns, escalating) based on the Action returned. + +The decision table (first match wins): + + * pushed AND CI green -> CLOSE_SUCCESS + The run is healthy and verified; close the issue. 
def next_action(state: RunState, config: Config) -> Action:
    """Map one issue's assembled run state to the action for this tick.

    Pure: reads only ``state`` and ``config``. The outcomes are:

    * nothing pushed, thread ERROR/IDLE -> ``ESCALATE_PREPUSH``
    * nothing pushed, any other thread status -> ``WAIT``
    * pushed, CI green -> ``CLOSE_SUCCESS``
    * pushed, CI red, budget left -> ``FIX_FORWARD``
    * pushed, CI red, budget spent -> ``FREEZE_ESCALATE``
    * pushed, any other CI status -> ``WAIT``
    """
    if not state.pushed:
        # No commit is out, so the CI status is not consulted. The run keeps
        # waiting unless the thread has reached a terminal state.
        turn_over = state.thread_status in _TERMINAL_THREAD_STATES
        return Action.ESCALATE_PREPUSH if turn_over else Action.WAIT

    # A commit is out, so only the CI verdict on it matters from here.
    verdict = state.ci_status
    if verdict is CIStatus.GREEN:
        return Action.CLOSE_SUCCESS
    if verdict is not CIStatus.RED:
        # Neither green nor red: no verdict yet.
        return Action.WAIT
    if _fix_forward_budget_remaining(state, config):
        return Action.FIX_FORWARD
    return Action.FREEZE_ESCALATE
+ +Two operations, both against the dedicated in-cluster T3 pod: + + * ``dispatch(repo, issue, prompt) -> thread_id`` — POSTs ``thread.create`` + then ``thread.turn.start`` to ``/api/orchestration/dispatch``. The create + command selects the ``claudeAgent`` instance in ``full-access`` runtime mode + and returns a thread id; the turn command targets that thread and delivers + ``ISSUE_IMPLEMENTER_PREAMBLE + prompt`` as ``message.text``. One dispatch = + one worktree-isolated worker. + * ``snapshot() -> dict`` — GETs ``/api/orchestration/snapshot``, the full fleet + read-model. T3 has no outbound webhooks, so the watcher polls this for + per-thread ``running``/``idle``/``error`` status. + +The HTTP transport and the bearer provider are **injected** (constructor +args), so the production wiring hands in an ``httpx.Client`` plus a Vault-backed +token reader, while tests hand in an in-memory fake — nothing here ever opens a +socket on its own. The bearer is re-read from the provider on **every** request +because T3's ``orchestration:operate`` token expires hourly and is refreshed out +of band. +""" +from collections.abc import Callable +from typing import Protocol + +from .issue_implementer_prompt import ISSUE_IMPLEMENTER_PREAMBLE + +# Orchestration API paths, relative to the configured base URL. +_DISPATCH_PATH = "/api/orchestration/dispatch" +_SNAPSHOT_PATH = "/api/orchestration/snapshot" + +# Pilot-baked dispatch envelope: which backend instance runs the thread and in +# which runtime mode. Constants (not config) — every AFK thread is identical. +_INSTANCE_ID = "claudeAgent" +_RUNTIME_MODE = "full-access" + +# JSON shapes. Command bodies and the snapshot read-model are open string-keyed +# objects; ``object`` values keep us honest without a bare ``Any``. +type Json = dict[str, object] + + +class HttpResponse(Protocol): + """The httpx-shaped response surface this adapter relies on. 
class T3Client:
    """Thin dispatch/snapshot adapter over an injected HTTP transport.

    ``base_url`` is stored with trailing slashes stripped, ``http`` performs
    the requests, and ``bearer_provider`` is called again for every request to
    obtain the token placed in the ``Authorization`` header.
    """

    def __init__(
        self,
        base_url: str,
        http: HttpClient,
        bearer_provider: Callable[[], str],
    ) -> None:
        self._base_url = base_url.rstrip("/")
        self._http = http
        self._bearer_provider = bearer_provider

    # ----------------------------------------------------------------- #
    # Public API.
    # ----------------------------------------------------------------- #
    def dispatch(self, repo: str, issue: int, prompt: str) -> str:
        """Create a worker thread for ``repo``/``issue`` and start its turn.

        Sends two POSTs to the dispatch endpoint. ``thread.create`` comes
        first, and its reply must contain a non-empty string ``threadId``.
        ``thread.turn.start`` then delivers
        ``ISSUE_IMPLEMENTER_PREAMBLE + prompt`` to that thread. A failed or
        malformed create raises before the turn is sent. Returns the thread id.
        """
        create_command = {
            "command": "thread.create",
            "repo": repo,
            "issue": issue,
            "modelSelection": {"instanceId": _INSTANCE_ID},
            "runtimeMode": _RUNTIME_MODE,
        }
        created = self._post(_DISPATCH_PATH, create_command)
        thread_id = self._thread_id_of(created.json())

        turn_command = {
            "command": "thread.turn.start",
            "threadId": thread_id,
            "message": {"text": ISSUE_IMPLEMENTER_PREAMBLE + prompt},
        }
        self._post(_DISPATCH_PATH, turn_command)
        return thread_id

    def snapshot(self) -> Json:
        """GET the snapshot endpoint and return its parsed JSON body."""
        return self._get(_SNAPSHOT_PATH).json()

    # ----------------------------------------------------------------- #
    # Internals.
    # ----------------------------------------------------------------- #
    def _post(self, path: str, body: Json) -> HttpResponse:
        """POST ``body`` as JSON to ``path``; raise on an error status."""
        return self._checked(
            self._http.post(self._url(path), json=body, headers=self._headers())
        )

    def _get(self, path: str) -> HttpResponse:
        """GET ``path``; raise on an error status."""
        return self._checked(self._http.get(self._url(path), headers=self._headers()))

    @staticmethod
    def _checked(response: HttpResponse) -> HttpResponse:
        """Run ``raise_for_status`` on ``response`` and hand it back."""
        response.raise_for_status()
        return response

    def _url(self, path: str) -> str:
        """Join the normalised base URL and an absolute API path."""
        return f"{self._base_url}{path}"

    def _headers(self) -> dict[str, str]:
        """Build the auth header from a freshly fetched bearer token."""
        return {"Authorization": f"Bearer {self._bearer_provider()}"}

    @staticmethod
    def _thread_id_of(create_response: Json) -> str:
        """Return the ``threadId`` of a ``thread.create`` reply.

        Raises ``ValueError`` when the field is absent, empty, or not a string,
        so a turn is never sent to an unusable id.
        """
        candidate = create_response.get("threadId")
        if isinstance(candidate, str) and candidate:
            return candidate
        raise ValueError(
            f"thread.create response missing a usable threadId: {create_response!r}"
        )
+ +``Tracker`` is the only place the AFK loop touches the issue tracker. It wraps an +injected ``GitHubClient`` (the port) so the policy/state-machine code — and the +tests — never depend on a real ``gh`` or the network: production injects +``GhCliClient`` (shells out to ``gh`` with no-shell argv); tests inject a fake. + +The split is deliberate. The ``GitHubClient`` port speaks only in *primitives* +(list raw issues for a label, fetch a single issue's label events, and the four +mutations). All the loop-specific *decisions* live on ``Tracker``: + + * ``labeled_by_trusted`` — decided **fail-closed** from the actor who made the + most-recent application of the ready label. On private repos only + collaborators can label, so the label *is* the authorization (design doc, + "Trigger & dispatch predicate"); an unattributable label is never trusted. + * ``blocked_by`` — the issue numbers in the body's "Blocked by #N" clauses + (the per-issue dependency the design doc gates dispatch on). + * ``priority`` — read off a ``priority:`` label, lowest wins (lower runs + first, matching ``Issue.priority`` semantics in ``types``). + +Keeping the decisions here, not in the client, is what lets the whole read path +be tested against a thin fake. Mutations (``add_label`` / ``remove_label`` / +``comment`` / ``close``) are pass-throughs the loop drives during a run. +""" +import json +import re +from collections.abc import Callable +from subprocess import PIPE, run +from typing import Protocol, runtime_checkable + +from .types import Issue + +# Trusted author associations: GitHub tags each issue event actor with their +# association to the repo. Only these may arm an issue for the AFK loop — the +# trust gate from the design doc. Overridable per Tracker for a tighter policy. 
@runtime_checkable
class GitHubClient(Protocol):
    """Structural port for the raw issue-tracker operations ``Tracker`` uses.

    Two reads and four mutations. Implementations fetch raw data and perform
    the mutations only; every loop-specific decision stays on ``Tracker``, so
    a thin fake can stand in for the real tracker in tests.
    """

    def list_issues(self, repo: str, label: str) -> list[dict]:
        """Return the raw issues of ``repo`` that carry ``label``.

        Entries follow the ``gh issue list --json number,labels,body`` shape:
        ``labels`` is a list of ``{"name": ...}`` objects and ``body`` may be
        ``None``.
        """

    def label_events(self, repo: str, number: int) -> list[dict]:
        """Return the ``labeled`` timeline events of issue ``number``.

        ``Tracker`` reads ``event``, ``label.name`` and ``author_association``
        from each entry.

        NOTE(review): GitHub's REST timeline appears to document ``labeled``
        events without an ``author_association`` field. Confirm that the
        production client really supplies it; otherwise every issue is judged
        untrusted.
        """

    def add_label(self, repo: str, number: int, label: str) -> None:
        """Add ``label`` to issue ``number`` of ``repo``."""

    def remove_label(self, repo: str, number: int, label: str) -> None:
        """Remove ``label`` from issue ``number`` of ``repo``."""

    def comment(self, repo: str, number: int, body: str) -> None:
        """Post ``body`` as a comment on issue ``number`` of ``repo``."""

    def close(self, repo: str, number: int) -> None:
        """Close issue ``number`` of ``repo``."""
class Tracker:
    """Builds ``Issue`` records from raw tracker data and relays mutations.

    All tracker access goes through the injected :class:`GitHubClient`. The
    trust decision, the blocked-by parsing and the priority parsing happen
    here, not in the client.
    """

    def __init__(
        self,
        client: GitHubClient,
        ready_label: str = DEFAULT_READY_LABEL,
        trusted_associations: frozenset[str] = DEFAULT_TRUSTED_ASSOCIATIONS,
    ) -> None:
        self.client = client
        self.ready_label = ready_label
        self.trusted_associations = trusted_associations

    # ----------------------------------------------------------------- reads #
    def list_ready(self, repos: list[str]) -> list[Issue]:
        """Return every issue carrying the ready label, across ``repos``.

        Repos are visited in the given order and each repo's issues keep the
        order the client returned them in; no sorting happens here.
        """
        return [
            self._to_issue(repo, raw)
            for repo in repos
            for raw in self.client.list_issues(repo, self.ready_label)
        ]

    def _to_issue(self, repo: str, raw: dict) -> Issue:
        """Convert one raw issue dict into an ``Issue`` record.

        Costs one ``label_events`` lookup per issue for the trust decision.
        """
        number = int(raw["number"])
        label_names = [entry["name"] for entry in raw.get("labels", [])]
        blockers = _parse_blocked_by(raw.get("body"))
        trusted = self._is_labeled_by_trusted(repo, number)
        return Issue(
            number=number,
            repo=repo,
            labels=label_names,
            blocked_by=blockers,
            labeled_by_trusted=trusted,
            priority=_parse_priority(label_names),
        )

    def _is_labeled_by_trusted(self, repo: str, number: int) -> bool:
        """Report whether the latest ready-label application is trusted.

        Only ``labeled`` events for ``self.ready_label`` count. The last such
        event in the client's order decides, and the result is ``False`` when
        there is none.
        """
        # NOTE(review): an event without ``author_association`` yields ``None``
        # here and is therefore untrusted — confirm the injected client really
        # supplies that field on label events.
        associations = [
            event.get("author_association")
            for event in self.client.label_events(repo, number)
            if event.get("event") == "labeled"
            and (event.get("label") or {}).get("name") == self.ready_label
        ]
        latest = associations[-1] if associations else None
        return latest in self.trusted_associations

    # ------------------------------------------------------------- mutations #
    def add_label(self, repo: str, issue: int, label: str) -> None:
        """Forward to ``client.add_label``."""
        self.client.add_label(repo, issue, label)

    def remove_label(self, repo: str, issue: int, label: str) -> None:
        """Forward to ``client.remove_label``."""
        self.client.remove_label(repo, issue, label)

    def comment(self, repo: str, issue: int, body: str) -> None:
        """Forward to ``client.comment``."""
        self.client.comment(repo, issue, body)

    def close(self, repo: str, issue: int) -> None:
        """Forward to ``client.close``."""
        self.client.close(repo, issue)
+ """ + if not body: + return [] + seen: dict[int, None] = {} # insertion-ordered set + for clause in _BLOCKED_BY_CLAUSE.findall(body): + for ref in _ISSUE_REF.findall(clause): + seen.setdefault(int(ref), None) + return list(seen) + + +def _parse_priority(labels: list[str]) -> int: + """Numeric priority from a ``priority:`` label, lowest wins; 0 if none.""" + priorities = [ + int(match.group(1)) + for label in labels + if (match := _PRIORITY_LABEL.match(label)) + ] + return min(priorities) if priorities else 0 + + +# --------------------------------------------------------------------------- # +# Concrete client — shells out to `gh`. Injected `run` keeps it testable. +# --------------------------------------------------------------------------- # +def _default_run(argv: list[str]) -> str: + """Run ``argv`` with no shell and return stdout (text). Raises on non-zero. + + List argv (never a shell string), matching the no-injection-surface pattern + the breakglass/main subprocess helpers use — the repo/label/body values are + never interpreted by a shell. + """ + proc = run(argv, stdout=PIPE, stderr=PIPE, text=True, check=False) + if proc.returncode != 0: + raise RuntimeError(f"{argv[0]} failed ({proc.returncode}): {proc.stderr[:200]}") + return proc.stdout + + +class GhCliClient: + """:class:`GitHubClient` backed by the ``gh`` CLI. + + ``repo_owner`` is the GitHub owner/org the sub-project repos live under, so a + bare repo name (``"infra"``) becomes the ``--repo owner/infra`` slug ``gh`` + wants. ``run`` is the subprocess runner (defaults to the real no-shell one); + tests inject a fake to capture argv without spawning ``gh``. 
+ """ + + def __init__(self, repo_owner: str, run: Callable[[list[str]], str] = _default_run) -> None: + self.repo_owner = repo_owner + self._run = run + + def _slug(self, repo: str) -> str: + return f"{self.repo_owner}/{repo}" + + def list_issues(self, repo: str, label: str) -> list[dict]: + out = self._run([ + "gh", "issue", "list", "--repo", self._slug(repo), + "--label", label, "--state", "open", + "--json", "number,labels,body", "--limit", "100", + ]) + return _loads_list(out) + + def label_events(self, repo: str, number: int) -> list[dict]: + out = self._run([ + "gh", "api", + f"repos/{self._slug(repo)}/issues/{number}/timeline", + "--paginate", + "-H", "Accept: application/vnd.github+json", + ]) + events = _loads_list(out) + return [e for e in events if e.get("event") == "labeled"] + + def add_label(self, repo: str, number: int, label: str) -> None: + self._run([ + "gh", "issue", "edit", str(number), "--repo", self._slug(repo), + "--add-label", label, + ]) + + def remove_label(self, repo: str, number: int, label: str) -> None: + self._run([ + "gh", "issue", "edit", str(number), "--repo", self._slug(repo), + "--remove-label", label, + ]) + + def comment(self, repo: str, number: int, body: str) -> None: + self._run([ + "gh", "issue", "comment", str(number), "--repo", self._slug(repo), + "--body", body, + ]) + + def close(self, repo: str, number: int) -> None: + self._run(["gh", "issue", "close", str(number), "--repo", self._slug(repo)]) + + +def _loads_list(out: str) -> list[dict]: + """Parse ``gh`` JSON stdout into a list of dicts. Empty stdout → ``[]``.""" + text = out.strip() + if not text: + return [] + return json.loads(text) diff --git a/app/afk/types.py b/app/afk/types.py new file mode 100644 index 0000000..538bf15 --- /dev/null +++ b/app/afk/types.py @@ -0,0 +1,134 @@ +"""Shared types for the AFK loop — the contract every module builds against. 
+ +Stdlib only (``dataclasses`` + ``enum``), matching the breakglass code: no +pydantic, modern ``X | None`` unions, precise field types. Every other module in +``app.afk`` imports its inputs/outputs from here so the pieces stay aligned; the +module-level docstrings in ``__init__`` list which functions consume which type. + +Nothing here has behaviour — these are pure data carriers and closed enums. Keep +it that way: logic lives in ``dispatch_policy`` / ``run_state_machine`` / the +client modules, never on the dataclasses. +""" +from dataclasses import dataclass +from enum import Enum + + +# --------------------------------------------------------------------------- # +# Enums — closed vocabularies the state machine and clients speak in. +# --------------------------------------------------------------------------- # +class ThreadStatus(Enum): + """Liveness of a T3 thread, as projected from the orchestration snapshot. + + ``RUNNING`` — the agent is still working the turn; ``IDLE`` — the turn + finished cleanly (it has gone quiet); ``ERROR`` — the thread/turn failed. + """ + + RUNNING = "running" + IDLE = "idle" + ERROR = "error" + + +class CIStatus(Enum): + """CI verdict for a pushed commit. ``PENDING`` covers both "no run yet" and + "in progress" — the state machine waits on either.""" + + PENDING = "pending" + GREEN = "green" + RED = "red" + + +class Phase(Enum): + """Where a single issue's run is in its lifecycle. Ordered: each phase is a + gate the run passes through on the way to ``DONE``. 
``phase_checklist`` + renders these; the loop advances through them as evidence arrives.""" + + WORKTREE = "worktree" # isolated workspace created + TESTS_RED = "tests_red" # failing test written first (TDD red) + GREEN = "green" # implementation makes tests pass (TDD green) + PUSHED = "pushed" # commit(s) pushed to master + CI = "ci" # CI pipeline running on the pushed commit + DEPLOYED = "deployed" # deploy/rollout reached the cluster + DONE = "done" # verified complete; issue can be closed + + +class Action(Enum): + """The decision ``run_state_machine.next_action`` returns for one tick. + + ``WAIT`` — nothing to do yet, poll again; ``CLOSE_SUCCESS`` — run is green, + CI passed, close the issue; ``ESCALATE_PREPUSH`` — the agent errored/stalled + before pushing anything, hand back to a human; ``FIX_FORWARD`` — CI went red + on a pushed commit, dispatch another corrective turn; ``FREEZE_ESCALATE`` — + fix-forward budget exhausted (attempts or wall-clock), stop and escalate. + """ + + WAIT = "wait" + CLOSE_SUCCESS = "close_success" + ESCALATE_PREPUSH = "escalate_prepush" + FIX_FORWARD = "fix_forward" + FREEZE_ESCALATE = "freeze_escalate" + + +# --------------------------------------------------------------------------- # +# Data carriers. +# --------------------------------------------------------------------------- # +@dataclass +class Issue: + """A tracker issue the loop might dispatch. + + ``labeled_by_trusted`` records whether the gating label was applied by a + trusted identity — the loop must never dispatch an issue made ready by an + untrusted actor (prompt-injection / drive-by). ``blocked_by`` lists issue + numbers that must close first; ``priority`` orders the ready set (lower runs + first, matching tracker conventions). 
+ """ + + number: int + repo: str + labels: list[str] + blocked_by: list[int] + labeled_by_trusted: bool + priority: int + + +@dataclass +class DispatchDecision: + """An issue the dispatch policy selected to run now, with a human-readable + ``reason`` (logged + surfaced in notifications, never parsed).""" + + issue: Issue + reason: str + + +@dataclass +class Config: + """Loop configuration. DISABLED BY DEFAULT — ``kill_switch=True`` and an + empty ``allowlist`` mean a freshly-constructed Config dispatches nothing. + Enabling is a deliberate manual step (see ``config.from_env`` / + ``from_configmap``). + """ + + allowlist: list[str] + kill_switch: bool + in_progress_label: str = "agent-in-progress" + ready_label: str = "ready-for-agent" + budget_usd: float = 100.0 + fix_forward_max_attempts: int = 5 + fix_forward_max_seconds: int = 3600 + + +@dataclass +class RunState: + """Everything the state machine needs to decide one issue's next move. + + Assembled each tick from the orchestration snapshot (``thread_status``), the + CI watcher (``ci_status``), and the loop's own bookkeeping (``pushed``, + ``fix_forward_attempts``, ``elapsed_seconds``). ``thread_status`` / + ``ci_status`` are ``None`` when not yet known (no snapshot entry / nothing + pushed to check yet). + """ + + thread_status: ThreadStatus | None + ci_status: CIStatus | None + pushed: bool + fix_forward_attempts: int + elapsed_seconds: float diff --git a/app/afk/watcher.py b/app/afk/watcher.py new file mode 100644 index 0000000..c9c9cc0 --- /dev/null +++ b/app/afk/watcher.py @@ -0,0 +1,342 @@ +"""CronJob entrypoint: drive ONE in-flight AFK run by a single tick. + +The watcher is the *second half* of the loop — the part that drives a run the +poller already started through to a terminal state. Given one in-flight run +(``InFlightRun``: the issue, the T3 thread to poll, the pushed commit if any, +and the fix-forward bookkeeping), one ``tick``: + + 1. 
**assemble a ``RunState``** from the live edges + the run's bookkeeping: + * ``thread_status`` — from ``t3_client.snapshot()``, by finding this run's + thread and mapping T3's ``running``/``idle``/``error`` to a + ``ThreadStatus`` (missing thread, or any unrecognised status, folds to + ``None`` → "no status yet" → the state machine WAITs; we never escalate + or close on a status we don't understand); + * ``ci_status`` — ``ci_watcher.status(repo, commit)`` *only* when a commit + is pushed (no commit ⇒ nothing to check ⇒ ``None``); + * ``pushed`` / ``fix_forward_attempts`` / ``elapsed_seconds`` — straight + from the run. + 2. **decide** via the pure ``run_state_machine.next_action`` (it owns the + lifecycle policy; the watcher owns only the I/O the decision implies). + 3. **act** on the returned ``Action``: + * ``CLOSE_SUCCESS`` → ``tracker.close`` + drop the in-progress label + + DONE checklist + ``done`` doorbell. The run landed. + * ``ESCALATE_PREPUSH`` / ``FREEZE_ESCALATE`` → drop the in-progress label, + add the ``ready-for-human`` label, post the checklist, ring the + ``needs-human`` / ``frozen`` doorbell. The run is handed to a human; the + issue is left OPEN (not closed) with the work in place. + * ``FIX_FORWARD`` → dispatch a corrective turn (``t3_client.dispatch``), + bump the fix-forward attempt count, refresh the checklist, and keep the + run in flight (NOT terminal: no label churn, no doorbell — the notifier + only speaks terminal kinds). The new thread id rides back on the result + so the next tick polls the corrective turn. + * ``WAIT`` → just refresh the progress checklist and keep waiting. + +Every adapter (T3, tracker, CI, notifier) is injected behind a structural +Protocol, so production wires the real clients and the tests wire the in-memory +fakes; this module opens no socket and reads no message bodies. 
from dataclasses import dataclass
from typing import Protocol

from . import phase_checklist, run_state_machine
from .notifier import KIND_DONE, KIND_FROZEN, KIND_NEEDS_HUMAN
from .poller import T3Port as _DispatchPort  # dispatch(repo, issue, prompt) -> id
from .types import Action, CIStatus, Config, Issue, Phase, RunState, ThreadStatus

# T3 snapshot status string -> ThreadStatus. Anything not in here (a status T3
# adds later, or a malformed entry) maps to None — "no usable status yet" — so
# the state machine waits rather than acting on something it can't interpret.
_THREAD_STATUS_BY_STRING: dict[str, ThreadStatus] = {
    "running": ThreadStatus.RUNNING,
    "idle": ThreadStatus.IDLE,
    "error": ThreadStatus.ERROR,
}

# Action -> the terminal doorbell kind to ring. Only the terminal actions appear;
# WAIT / FIX_FORWARD are non-terminal and ring nothing (the notifier rejects a
# non-terminal kind on purpose — see ``notifier.TERMINAL_KINDS``).
_TERMINAL_KIND_BY_ACTION: dict[Action, str] = {
    Action.CLOSE_SUCCESS: KIND_DONE,
    Action.ESCALATE_PREPUSH: KIND_NEEDS_HUMAN,
    Action.FREEZE_ESCALATE: KIND_FROZEN,
}

# Default label applied when a run is handed back to a human. Mirrors the
# tracker's ``ready-for-agent`` convention; overridable per-Watcher.
DEFAULT_READY_FOR_HUMAN_LABEL = "ready-for-human"


# --------------------------------------------------------------------------- #
# Injected adapter Protocols — structural, so the real clients and the test
# fakes both satisfy them with no subclassing. Only the methods the watcher
# actually calls appear. The dispatch port is reused from ``poller`` (its
# ``T3Port``, imported here as ``_DispatchPort``).
# --------------------------------------------------------------------------- #
class SnapshotPort(_DispatchPort, Protocol):
    """T3 surface the watcher needs: ``dispatch`` (for the corrective turn) plus
    ``snapshot`` (for thread liveness)."""

    def snapshot(self) -> dict: ...


class TrackerPort(Protocol):
    """The slice of ``tracker.Tracker`` the watch tick needs."""

    def add_label(self, repo: str, issue: int, label: str) -> None: ...
    def remove_label(self, repo: str, issue: int, label: str) -> None: ...
    def comment(self, repo: str, issue: int, body: str) -> None: ...
    def close(self, repo: str, issue: int) -> None: ...


class CIPort(Protocol):
    """The slice of ``ci_watcher.CIWatcher`` the watch tick needs."""

    def status(self, repo: str, commit: str) -> CIStatus: ...


class NotifierPort(Protocol):
    """The slice of ``notifier.Notifier`` the watch tick needs."""

    def notify(self, kind: str, issue: Issue, thread_id: str | None, detail: str) -> None: ...


@dataclass
class InFlightRun:
    """One run the watcher is driving, as the loop tracks it between ticks.

    ``thread_id`` is the T3 thread to poll this tick; ``commit`` is the pushed
    commit CI watches (``None`` until the agent has pushed). ``fix_forward_attempts``
    and ``elapsed_seconds`` are the loop's own bookkeeping, fed straight into the
    assembled ``RunState`` — ``pushed`` is derived as ``commit is not None``.
    """

    issue: Issue
    thread_id: str
    commit: str | None
    # Bookkeeping starts at zero for a freshly-dispatched run.
    fix_forward_attempts: int = 0
    elapsed_seconds: float = 0.0


@dataclass
class TickResult:
    """The outcome of one watch tick.

    ``action`` is the state machine's verdict; ``terminal`` is True iff the run
    reached an end state (closed or handed to a human) and should no longer be
    ticked. ``thread_id`` / ``fix_forward_attempts`` carry the (possibly updated)
    bookkeeping the caller threads into the next ``InFlightRun`` — they change
    only on a FIX_FORWARD (new corrective thread, incremented attempts) and are
    otherwise echoed back unchanged.
    """

    action: Action
    terminal: bool
    thread_id: str
    fix_forward_attempts: int
class Watcher:
    """Drives one in-flight run per ``tick`` over injected adapters.

    The three escalation-vs-success decisions live in the pure
    ``run_state_machine``; this class only performs the I/O each decision
    implies. ``ready_for_human_label`` is the label stamped on a run handed back
    to a human (default :data:`DEFAULT_READY_FOR_HUMAN_LABEL`).
    """

    def __init__(
        self,
        t3_client: SnapshotPort,
        tracker: TrackerPort,
        ci_watcher: CIPort,
        notifier: NotifierPort,
        ready_for_human_label: str = DEFAULT_READY_FOR_HUMAN_LABEL,
    ) -> None:
        self._t3 = t3_client
        self._tracker = tracker
        self._ci = ci_watcher
        self._notifier = notifier
        self._ready_for_human_label = ready_for_human_label

    def tick(self, run: InFlightRun, config: Config) -> TickResult:
        """Drive ``run`` one step: assemble state, decide, then act.

        Returns the :class:`TickResult` produced by the handler matching the
        state machine's ``Action``.
        """
        state = self._assemble_state(run)
        action = run_state_machine.next_action(state, config)

        if action is Action.CLOSE_SUCCESS:
            return self._close_success(run, config)
        if action in (Action.ESCALATE_PREPUSH, Action.FREEZE_ESCALATE):
            return self._escalate(run, state, action, config)
        if action is Action.FIX_FORWARD:
            return self._fix_forward(run, state)
        # WAIT: still in flight — just show progress and poll again next tick.
        return self._wait(run, state, action)

    # ----------------------------------------------------------------- #
    # RunState assembly.
    # ----------------------------------------------------------------- #
    def _assemble_state(self, run: InFlightRun) -> RunState:
        """Build the ``RunState`` for this tick from the snapshot, CI and ``run``."""
        thread_status = self._thread_status(run.thread_id)
        # Only fold CI when there's a commit to check — an unpushed run has no
        # pipeline, and we must not query CI (the assertion in the tests, and
        # avoiding a needless API call, both rely on this).
        ci_status = (
            self._ci.status(run.issue.repo, run.commit)
            if run.commit is not None
            else None
        )
        return RunState(
            thread_status=thread_status,
            ci_status=ci_status,
            pushed=run.commit is not None,
            fix_forward_attempts=run.fix_forward_attempts,
            elapsed_seconds=run.elapsed_seconds,
        )

    def _thread_status(self, thread_id: str) -> ThreadStatus | None:
        """This thread's liveness from the fleet snapshot, or ``None`` when the
        thread is absent or its status is one we don't recognise.

        Malformed snapshot data folds to ``None`` instead of raising: a missing
        or null ``threads`` list, a non-dict thread entry, or a non-string
        status all read as "no usable status yet", so the state machine WAITs.
        """
        threads = self._t3.snapshot().get("threads") or []
        for thread in threads:
            if not isinstance(thread, dict) or thread.get("id") != thread_id:
                continue
            status = thread.get("status")
            # Guard the lookup: an unhashable status (list/dict) would raise
            # TypeError from dict.get rather than mapping to None.
            if not isinstance(status, str):
                return None
            return _THREAD_STATUS_BY_STRING.get(status)
        return None

    # ----------------------------------------------------------------- #
    # Per-action handlers.
    # ----------------------------------------------------------------- #
    def _close_success(self, run: InFlightRun, config: Config) -> TickResult:
        """Landed: post the DONE checklist, drop the lock, close the issue, ring
        the ``done`` doorbell (in that order)."""
        self._post_checklist(run, Phase.DONE)
        self._tracker.remove_label(
            run.issue.repo, run.issue.number, config.in_progress_label
        )
        self._tracker.close(run.issue.repo, run.issue.number)
        self._notify(run, Action.CLOSE_SUCCESS, "Run landed: pushed and CI green.")
        return _terminal(Action.CLOSE_SUCCESS, run)

    def _escalate(
        self, run: InFlightRun, state: RunState, action: Action, config: Config
    ) -> TickResult:
        """Hand back to a human: post the checklist, drop the lock, add
        ready-for-human, ring the matching doorbell. The issue stays OPEN."""
        self._post_checklist(run, _phase_for(state))
        self._tracker.remove_label(
            run.issue.repo, run.issue.number, config.in_progress_label
        )
        self._tracker.add_label(
            run.issue.repo, run.issue.number, self._ready_for_human_label
        )
        self._notify(run, action, _escalation_detail(action, state))
        return _terminal(action, run)

    def _fix_forward(self, run: InFlightRun, state: RunState) -> TickResult:
        """CI red with budget left: dispatch a corrective turn and stay in flight.

        Not terminal — no doorbell (the notifier only speaks terminal kinds) and
        no label churn (the in-progress lock stays put). The corrective dispatch
        spawns a fresh thread; its id and the incremented attempt count ride back
        so the next tick tracks the right thread. ``state`` is accepted for
        symmetry with the other handlers and is not read.
        """
        attempts = run.fix_forward_attempts + 1
        new_thread_id = self._t3.dispatch(
            run.issue.repo, run.issue.number, _fix_forward_prompt(run)
        )
        # Report the post-dispatch bookkeeping: the corrective thread and the
        # bumped attempt count, not the superseded thread on ``run``.
        self._post_checklist(
            run, Phase.CI, fix_forward_attempts=attempts, thread_id=new_thread_id
        )
        return TickResult(
            action=Action.FIX_FORWARD,
            terminal=False,
            thread_id=new_thread_id,
            fix_forward_attempts=attempts,
        )

    def _wait(self, run: InFlightRun, state: RunState, action: Action) -> TickResult:
        """Still working: refresh the progress checklist, change nothing else."""
        self._post_checklist(run, _phase_for(state))
        return TickResult(
            action=action,
            terminal=False,
            thread_id=run.thread_id,
            fix_forward_attempts=run.fix_forward_attempts,
        )

    # ----------------------------------------------------------------- #
    # I/O helpers.
    # ----------------------------------------------------------------- #
    def _post_checklist(
        self,
        run: InFlightRun,
        phase: Phase,
        *,
        fix_forward_attempts: int | None = None,
        thread_id: str | None = None,
    ) -> None:
        """Render the checklist for ``phase`` and post it as an issue comment.

        ``fix_forward_attempts`` / ``thread_id`` override the values on ``run``
        when given; ``None`` (the default) means "use the run's own value".
        """
        attempts = run.fix_forward_attempts if fix_forward_attempts is None else fix_forward_attempts
        body = phase_checklist.render(
            phase,
            {
                "repo": run.issue.repo,
                "issue": run.issue.number,
                "thread_id": run.thread_id if thread_id is None else thread_id,
                "fix_forward_attempts": attempts,
            },
        )
        self._tracker.comment(run.issue.repo, run.issue.number, body)

    def _notify(self, run: InFlightRun, action: Action, detail: str) -> None:
        """Ring the terminal doorbell mapped to ``action``.

        Raises ``KeyError`` for a non-terminal action (WAIT / FIX_FORWARD), which
        has no entry in ``_TERMINAL_KIND_BY_ACTION``.
        """
        self._notifier.notify(
            _TERMINAL_KIND_BY_ACTION[action], run.issue, run.thread_id, detail
        )


# --------------------------------------------------------------------------- #
# Pure helpers.
# --------------------------------------------------------------------------- #
def _terminal(action: Action, run: InFlightRun) -> TickResult:
    """A terminal :class:`TickResult` echoing the run's bookkeeping unchanged."""
    return TickResult(
        action=action,
        terminal=True,
        thread_id=run.thread_id,
        fix_forward_attempts=run.fix_forward_attempts,
    )
def _phase_for(state: RunState) -> Phase:
    """Pick the lifecycle phase that best matches the evidence in ``state``.

    Used only to decorate the progress checklist. Nothing pushed yet reads as
    GREEN (still working toward the implementation); a pushed run reads as CI
    until its CI verdict is green, at which point it reads as DEPLOYED.
    """
    if state.pushed:
        ci_is_green = state.ci_status is CIStatus.GREEN
        return Phase.DEPLOYED if ci_is_green else Phase.CI
    return Phase.GREEN


def _escalation_detail(action: Action, state: RunState) -> str:
    """Build the human-readable escalation reason (doorbell + logs, never parsed).

    ``ESCALATE_PREPUSH`` yields the "stalled before pushing" message naming the
    thread status; any other action yields the "budget exhausted" message.
    """
    if action is not Action.ESCALATE_PREPUSH:
        attempts = state.fix_forward_attempts
        elapsed = state.elapsed_seconds
        return (
            "Fix-forward budget exhausted with CI still red "
            f"({attempts} attempts, {elapsed:.0f}s). "
            "Frozen for a human."
        )
    thread_label = state.thread_status.value if state.thread_status else 'unknown'
    return (
        "Agent stalled or errored before pushing any commit "
        f"(thread {thread_label}). "
        "Handed back for a human."
    )


def _fix_forward_prompt(run: InFlightRun) -> str:
    """Compose the corrective-turn prompt pointing the agent at its red CI run."""
    issue = run.issue
    sentences = [
        f"CI is RED on your pushed commit {run.commit} for issue #{issue.number} "
        f"in `{issue.repo}`.",
        "Investigate the failing run, fix the cause, and push the fix to master.",
        "Then watch CI again until it is green.",
    ]
    return " ".join(sentences)
@pytest.fixture
def make_issue():
    """Return a factory building ``Issue`` objects.

    With no overrides the issue is dispatchable: labelled ``ready-for-agent`` by
    a trusted actor, blocked by nothing, priority 0.
    """
    def _make(
        number: int = 1,
        repo: str = "infra",
        labels: list[str] | None = None,
        blocked_by: list[int] | None = None,
        labeled_by_trusted: bool = True,
        priority: int = 0,
    ) -> Issue:
        # ``None`` means "use the default"; an explicit empty list is kept as-is.
        if labels is None:
            labels = ["ready-for-agent"]
        if blocked_by is None:
            blocked_by = []
        return Issue(
            number=number,
            repo=repo,
            labels=labels,
            blocked_by=blocked_by,
            labeled_by_trusted=labeled_by_trusted,
            priority=priority,
        )
    return _make


@pytest.fixture
def make_config():
    """Return a factory building an ENABLED ``Config`` (kill switch off,
    allowlist ``["infra"]``) unless overridden; extra keywords are passed
    through to ``Config``."""
    def _make(
        allowlist: list[str] | None = None,
        kill_switch: bool = False,
        **overrides,
    ) -> Config:
        if allowlist is None:
            allowlist = ["infra"]
        return Config(allowlist=allowlist, kill_switch=kill_switch, **overrides)
    return _make


@pytest.fixture
def make_run_state():
    """Return a factory building a ``RunState``; the default is a freshly
    dispatched run (thread running, nothing pushed, no CI, zero attempts)."""
    def _make(
        thread_status: ThreadStatus | None = ThreadStatus.RUNNING,
        ci_status: CIStatus | None = None,
        pushed: bool = False,
        fix_forward_attempts: int = 0,
        elapsed_seconds: float = 0.0,
    ) -> RunState:
        fields = {
            "thread_status": thread_status,
            "ci_status": ci_status,
            "pushed": pushed,
            "fix_forward_attempts": fix_forward_attempts,
            "elapsed_seconds": elapsed_seconds,
        }
        return RunState(**fields)
    return _make


class FakeT3Client:
    """In-memory T3 client: logs each dispatch, hands out sequential
    ``thread-N`` ids, and serves whatever snapshot ``set_snapshot`` staged."""

    def __init__(self) -> None:
        self.dispatched: list[dict] = []
        self._snapshot: dict = {"threads": []}
        self._next_id = 0

    def dispatch(self, repo: str, issue: int, prompt: str) -> str:
        thread_id = f"thread-{self._next_id}"
        self._next_id += 1
        record = {
            "repo": repo,
            "issue": issue,
            "prompt": prompt,
            "thread_id": thread_id,
        }
        self.dispatched.append(record)
        return thread_id

    def snapshot(self) -> dict:
        return self._snapshot

    def set_snapshot(self, snapshot: dict) -> None:
        self._snapshot = snapshot


class FakeTracker:
    """In-memory tracker: ``list_ready`` serves issues staged with ``seed``;
    label, comment and close calls are only recorded."""

    def __init__(self) -> None:
        self._ready: dict[str, list[Issue]] = {}
        self.label_ops: list[tuple[str, str, int, str]] = []  # (op, repo, issue, label)
        self.comments: list[tuple[str, int, str]] = []
        self.closed: list[tuple[str, int]] = []

    def seed(self, repo: str, issues: list[Issue]) -> None:
        self._ready[repo] = issues

    def list_ready(self, repos: list[str]) -> list[Issue]:
        # Repos with nothing seeded contribute no issues.
        return [
            issue
            for repo in repos
            for issue in self._ready.get(repo, [])
        ]

    def add_label(self, repo: str, issue: int, label: str) -> None:
        self.label_ops.append(("add", repo, issue, label))

    def remove_label(self, repo: str, issue: int, label: str) -> None:
        self.label_ops.append(("remove", repo, issue, label))

    def comment(self, repo: str, issue: int, body: str) -> None:
        self.comments.append((repo, issue, body))

    def close(self, repo: str, issue: int) -> None:
        self.closed.append((repo, issue))


class FakeCIWatcher:
    """In-memory CI watcher: answers with the status staged per
    ``(repo, commit)``; anything unstaged reads ``PENDING``."""

    def __init__(self) -> None:
        self._statuses: dict[tuple[str, str], CIStatus] = {}

    def set_status(self, repo: str, commit: str, status: CIStatus) -> None:
        self._statuses[(repo, commit)] = status

    def status(self, repo: str, commit: str) -> CIStatus:
        try:
            return self._statuses[(repo, commit)]
        except KeyError:
            return CIStatus.PENDING


class FakeNotifier:
    """In-memory notifier: keeps every notification in ``sent`` so tests can
    check the kind and detail of each one."""

    def __init__(self) -> None:
        self.sent: list[dict] = []

    def notify(self, kind: str, issue: Issue, thread_id: str | None, detail: str) -> None:
        self.sent.append(
            dict(kind=kind, issue=issue, thread_id=thread_id, detail=detail)
        )


@pytest.fixture
def fake_t3() -> FakeT3Client:
    return FakeT3Client()


@pytest.fixture
def fake_tracker() -> FakeTracker:
    return FakeTracker()


@pytest.fixture
def fake_ci() -> FakeCIWatcher:
    return FakeCIWatcher()


@pytest.fixture
def fake_notifier() -> FakeNotifier:
    return FakeNotifier()
import pytest

from app.afk.ci_watcher import (
    CIWatcher,
    StageResult,
)
from app.afk.types import CIStatus


# --------------------------------------------------------------------------- #
# Fakes for the three injected clients.
#
# Each maps (repo, commit) → StageResult and records every query, so tests can
# assert both the folded verdict AND that short-circuiting skips later stages
# (a RED build must not even ask the rollout client).
# --------------------------------------------------------------------------- #
class _FakeStageClient:
    """A recording stand-in for any of the three stage clients. ``default`` is
    returned for an unstaged ``(repo, commit)`` — defaults to ``PENDING`` so an
    un-seeded stage reads "not done yet", never a false GREEN."""

    def __init__(self, default: StageResult = StageResult.PENDING) -> None:
        self._results: dict[tuple[str, str], StageResult] = {}
        self._default = default
        # Every (repo, commit) looked up, in call order.
        self.queries: list[tuple[str, str]] = []

    def set(self, repo: str, commit: str, result: StageResult) -> None:
        """Stage ``result`` for ``(repo, commit)``."""
        self._results[(repo, commit)] = result

    def _lookup(self, repo: str, commit: str) -> StageResult:
        """Record the query, then return the staged result or the default."""
        self.queries.append((repo, commit))
        return self._results.get((repo, commit), self._default)


class FakeGitHubChecks(_FakeStageClient):
    """Fake build-stage client: exposes the staged result as ``run_conclusion``."""

    def run_conclusion(self, repo: str, commit: str) -> StageResult:
        return self._lookup(repo, commit)


class FakeWoodpecker(_FakeStageClient):
    """Fake deploy-stage client: exposes the staged result as ``deploy_conclusion``."""

    def deploy_conclusion(self, repo: str, commit: str) -> StageResult:
        return self._lookup(repo, commit)


class FakeRollout(_FakeStageClient):
    """Fake rollout-stage client: exposes the staged result as ``rollout_status``."""

    def rollout_status(self, repo: str, commit: str) -> StageResult:
        return self._lookup(repo, commit)


# --------------------------------------------------------------------------- #
# Fixtures.
# --------------------------------------------------------------------------- #
REPO = "infra"
COMMIT = "deadbeefcafe"


@pytest.fixture
def gha() -> FakeGitHubChecks:
    return FakeGitHubChecks()


@pytest.fixture
def woodpecker() -> FakeWoodpecker:
    return FakeWoodpecker()


@pytest.fixture
def rollout() -> FakeRollout:
    return FakeRollout()


@pytest.fixture
def watcher(gha, woodpecker, rollout) -> CIWatcher:
    """A ``CIWatcher`` wired to all three fake stage clients."""
    return CIWatcher(github=gha, woodpecker=woodpecker, rollout=rollout)


def _stage_all(gha, woodpecker, rollout, *, build, deploy, roll) -> None:
    """Stage all three clients for the canonical ``(REPO, COMMIT)`` at once."""
    gha.set(REPO, COMMIT, build)
    woodpecker.set(REPO, COMMIT, deploy)
    rollout.set(REPO, COMMIT, roll)


# --------------------------------------------------------------------------- #
# StageResult vocabulary.
# --------------------------------------------------------------------------- #
def test_stageresult_has_the_four_outcomes():
    """``StageResult`` has exactly NONE, PENDING, SUCCESS and FAILURE."""
    assert {s.name for s in StageResult} == {"NONE", "PENDING", "SUCCESS", "FAILURE"}


# --------------------------------------------------------------------------- #
# The happy path: every stage green ⇒ GREEN.
# --------------------------------------------------------------------------- #
def test_all_stages_success_is_green(watcher, gha, woodpecker, rollout):
    """SUCCESS at build, deploy and rollout folds to GREEN."""
    _stage_all(gha, woodpecker, rollout,
               build=StageResult.SUCCESS,
               deploy=StageResult.SUCCESS,
               roll=StageResult.SUCCESS)
    assert watcher.status(REPO, COMMIT) is CIStatus.GREEN


# --------------------------------------------------------------------------- #
# GHA build stage gates everything below it.
# --------------------------------------------------------------------------- #
def test_build_failure_is_red(watcher, gha):
    """A FAILURE build folds to RED with nothing else staged."""
    gha.set(REPO, COMMIT, StageResult.FAILURE)
    assert watcher.status(REPO, COMMIT) is CIStatus.RED


@pytest.mark.parametrize("build", [StageResult.NONE, StageResult.PENDING])
def test_build_not_yet_concluded_is_pending(watcher, gha, build):
    """A build that is NONE or PENDING folds to PENDING."""
    # No run yet (NONE) and in-progress (PENDING) both read PENDING — the state
    # machine waits on either.
    gha.set(REPO, COMMIT, build)
    assert watcher.status(REPO, COMMIT) is CIStatus.PENDING


def test_build_failure_short_circuits_before_deploy_and_rollout(
    watcher, gha, woodpecker, rollout
):
    """A red build yields RED without querying the deploy or rollout clients."""
    gha.set(REPO, COMMIT, StageResult.FAILURE)
    # Even if later stages would (nonsensically) be green, a red build wins...
    woodpecker.set(REPO, COMMIT, StageResult.SUCCESS)
    rollout.set(REPO, COMMIT, StageResult.SUCCESS)
    assert watcher.status(REPO, COMMIT) is CIStatus.RED
    # ...and the later clients are never even queried.
    assert woodpecker.queries == []
    assert rollout.queries == []


def test_build_pending_short_circuits_before_deploy_and_rollout(
    watcher, gha, woodpecker, rollout
):
    """A pending build yields PENDING without querying the later clients."""
    gha.set(REPO, COMMIT, StageResult.PENDING)
    assert watcher.status(REPO, COMMIT) is CIStatus.PENDING
    assert woodpecker.queries == []
    assert rollout.queries == []


# --------------------------------------------------------------------------- #
# Deploy (Woodpecker) stage — only consulted once the build is green.
# --------------------------------------------------------------------------- #
def test_deploy_failure_is_red_even_with_green_build(watcher, gha, woodpecker):
    """Build SUCCESS plus deploy FAILURE folds to RED."""
    gha.set(REPO, COMMIT, StageResult.SUCCESS)
    woodpecker.set(REPO, COMMIT, StageResult.FAILURE)
    assert watcher.status(REPO, COMMIT) is CIStatus.RED


@pytest.mark.parametrize("deploy", [StageResult.NONE, StageResult.PENDING])
def test_deploy_not_yet_concluded_is_pending(watcher, gha, woodpecker, deploy):
    """Build SUCCESS with a NONE or PENDING deploy folds to PENDING."""
    gha.set(REPO, COMMIT, StageResult.SUCCESS)
    woodpecker.set(REPO, COMMIT, deploy)
    assert watcher.status(REPO, COMMIT) is CIStatus.PENDING


def test_deploy_failure_short_circuits_before_rollout(
    watcher, gha, woodpecker, rollout
):
    """A red deploy yields RED without querying the rollout client."""
    gha.set(REPO, COMMIT, StageResult.SUCCESS)
    woodpecker.set(REPO, COMMIT, StageResult.FAILURE)
    rollout.set(REPO, COMMIT, StageResult.SUCCESS)
    assert watcher.status(REPO, COMMIT) is CIStatus.RED
    assert rollout.queries == []
    # The build WAS consulted (it had to pass to reach deploy).
    assert gha.queries == [(REPO, COMMIT)]


# --------------------------------------------------------------------------- #
# Rollout stage — the final gate. Green build + green deploy is still only
# PENDING until the image actually reaches the cluster.
+# --------------------------------------------------------------------------- # +def test_rollout_failure_is_red(watcher, gha, woodpecker, rollout): + _stage_all(gha, woodpecker, rollout, + build=StageResult.SUCCESS, + deploy=StageResult.SUCCESS, + roll=StageResult.FAILURE) + assert watcher.status(REPO, COMMIT) is CIStatus.RED + + +@pytest.mark.parametrize("roll", [StageResult.NONE, StageResult.PENDING]) +def test_green_build_and_deploy_but_unfinished_rollout_is_pending( + watcher, gha, woodpecker, rollout, roll +): + _stage_all(gha, woodpecker, rollout, + build=StageResult.SUCCESS, + deploy=StageResult.SUCCESS, + roll=roll) + assert watcher.status(REPO, COMMIT) is CIStatus.PENDING + + +def test_green_requires_all_three_stages_consulted( + watcher, gha, woodpecker, rollout +): + _stage_all(gha, woodpecker, rollout, + build=StageResult.SUCCESS, + deploy=StageResult.SUCCESS, + roll=StageResult.SUCCESS) + assert watcher.status(REPO, COMMIT) is CIStatus.GREEN + assert gha.queries == [(REPO, COMMIT)] + assert woodpecker.queries == [(REPO, COMMIT)] + assert rollout.queries == [(REPO, COMMIT)] + + +# --------------------------------------------------------------------------- # +# Plumbing: the commit and repo are passed through verbatim to every client, +# and an entirely un-seeded commit reads PENDING (not GREEN, not RED). +# --------------------------------------------------------------------------- # +def test_repo_and_commit_passed_through_to_clients(watcher, gha): + gha.set("realestate-crawler", "abc123", StageResult.FAILURE) + assert watcher.status("realestate-crawler", "abc123") is CIStatus.RED + assert gha.queries == [("realestate-crawler", "abc123")] + + +def test_unknown_commit_defaults_to_pending(watcher): + # Nothing staged anywhere ⇒ the build stage reads PENDING by default ⇒ the + # whole verdict is PENDING. A never-pushed/just-pushed commit is never a + # false GREEN. 
+ assert watcher.status(REPO, "never-seen") is CIStatus.PENDING + + +# --------------------------------------------------------------------------- # +# The default rollout client is OPTIONAL — per the pilot facts, state.sqlite / +# kubectl reads are optional, so a CIWatcher built without a rollout client must +# still work, treating "build green + deploy green" as the terminal GREEN. +# --------------------------------------------------------------------------- # +def test_rollout_client_is_optional_deploy_green_is_green(gha, woodpecker): + w = CIWatcher(github=gha, woodpecker=woodpecker) # no rollout client + gha.set(REPO, COMMIT, StageResult.SUCCESS) + woodpecker.set(REPO, COMMIT, StageResult.SUCCESS) + assert w.status(REPO, COMMIT) is CIStatus.GREEN + + +def test_rollout_client_optional_still_honours_build_and_deploy_failures( + gha, woodpecker +): + w = CIWatcher(github=gha, woodpecker=woodpecker) + gha.set(REPO, COMMIT, StageResult.SUCCESS) + woodpecker.set(REPO, COMMIT, StageResult.FAILURE) + assert w.status(REPO, COMMIT) is CIStatus.RED + + +# --------------------------------------------------------------------------- # +# Full folding table — exhaustive over (build, deploy, rollout) so the +# precedence rules (FAILURE short-circuits red; otherwise any PENDING/NONE keeps +# it pending; all-success ⇒ green) can never silently drift. +# --------------------------------------------------------------------------- # +_N, _P, _S, _F = ( + StageResult.NONE, + StageResult.PENDING, + StageResult.SUCCESS, + StageResult.FAILURE, +) + + +def _expected(build: StageResult, deploy: StageResult, roll: StageResult) -> CIStatus: + # Reference fold, independent of the implementation, evaluated stage by stage. 
+ for stage in (build, deploy, roll): + if stage is _F: + return CIStatus.RED + if stage in (_N, _P): + return CIStatus.PENDING + return CIStatus.GREEN + + +@pytest.mark.parametrize("build", [_N, _P, _S, _F]) +@pytest.mark.parametrize("deploy", [_N, _P, _S, _F]) +@pytest.mark.parametrize("roll", [_N, _P, _S, _F]) +def test_full_folding_table(watcher, gha, woodpecker, rollout, build, deploy, roll): + _stage_all(gha, woodpecker, rollout, build=build, deploy=deploy, roll=roll) + assert watcher.status(REPO, COMMIT) is _expected(build, deploy, roll) diff --git a/tests/test_afk_dispatch_policy.py b/tests/test_afk_dispatch_policy.py new file mode 100644 index 0000000..ee8cd83 --- /dev/null +++ b/tests/test_afk_dispatch_policy.py @@ -0,0 +1,374 @@ +"""Tests for ``app.afk.dispatch_policy.select_dispatchable`` — the pure gate that +turns a pile of ready issues into the ordered set the loop may dispatch *now*. + +The function is PURE (no IO), so every test here is a plain in-memory call over +the fakes/factories in ``conftest`` (``make_issue`` / ``make_config``); nothing +touches a real T3 server, tracker, or cluster. The suite walks the full +dispatchability matrix — trust gate, allowlist, per-repo lock, blocked_by, +kill switch — plus the priority ordering and the one-agent-per-repo invariant. + +Ordering contract under test: **higher ``priority`` first** (per the AFK module +spec), with a deterministic tiebreaker so the output is stable regardless of +input order. NOTE: ``Issue.priority``'s own docstring says "lower runs first"; +this module follows the explicit dispatch-policy spec instead — see the module +docstring in ``dispatch_policy.py``. +""" +import itertools + +import pytest + +from app.afk import dispatch_policy +from app.afk.types import DispatchDecision, Issue + + +# --------------------------------------------------------------------------- # +# Helpers — keep assertions terse and intent-revealing. 
+# --------------------------------------------------------------------------- # +def _selected_numbers(decisions: list[DispatchDecision]) -> list[int]: + """The issue numbers, in the order the policy returned them.""" + return [d.issue.number for d in decisions] + + +def _selected_set(decisions: list[DispatchDecision]) -> set[int]: + return {d.issue.number for d in decisions} + + +# --------------------------------------------------------------------------- # +# Return shape & purity. +# --------------------------------------------------------------------------- # +def test_returns_list_of_dispatch_decisions(make_issue, make_config): + issue = make_issue(number=7, repo="infra") + decisions = dispatch_policy.select_dispatchable([issue], make_config(), set()) + assert isinstance(decisions, list) + assert len(decisions) == 1 + assert isinstance(decisions[0], DispatchDecision) + assert decisions[0].issue is issue + assert isinstance(decisions[0].reason, str) and decisions[0].reason # non-empty + + +def test_empty_input_yields_empty_output(make_config): + assert dispatch_policy.select_dispatchable([], make_config(), set()) == [] + + +def test_does_not_mutate_inputs(make_issue, make_config): + issues = [make_issue(number=1, priority=0), make_issue(number=2, priority=9)] + issues_snapshot = list(issues) + config = make_config(allowlist=["infra"]) + in_flight: set[str] = set() + + dispatch_policy.select_dispatchable(issues, config, in_flight) + + # Caller's list (and its order) and the lock set are left untouched. 
+ assert issues == issues_snapshot + assert [i.number for i in issues] == [1, 2] + assert in_flight == set() + assert config.allowlist == ["infra"] + + +def test_decision_wraps_the_same_issue_object(make_issue, make_config): + issue = make_issue(number=42) + [decision] = dispatch_policy.select_dispatchable([issue], make_config(), set()) + assert decision.issue is issue # identity, not a copy + + +# --------------------------------------------------------------------------- # +# Kill switch — highest-precedence short-circuit. +# --------------------------------------------------------------------------- # +def test_kill_switch_returns_empty_even_with_perfect_issues(make_issue, make_config): + issues = [make_issue(number=n, repo="infra") for n in range(1, 6)] + config = make_config(allowlist=["infra"], kill_switch=True) + assert dispatch_policy.select_dispatchable(issues, config, set()) == [] + + +def test_kill_switch_off_dispatches(make_issue, make_config): + issue = make_issue(repo="infra") + config = make_config(allowlist=["infra"], kill_switch=False) + assert len(dispatch_policy.select_dispatchable([issue], config, set())) == 1 + + +def test_production_default_config_dispatches_nothing(make_issue): + """The shipped default (kill switch ON, empty allowlist) is inert: even a + pristine, trusted issue is never selected.""" + from app.afk import config as afk_config + + issue = make_issue(repo="infra") + assert dispatch_policy.select_dispatchable([issue], afk_config.default(), set()) == [] + + +# --------------------------------------------------------------------------- # +# Trust gate. 
+# --------------------------------------------------------------------------- # +def test_untrusted_issue_is_skipped(make_issue, make_config): + issue = make_issue(repo="infra", labeled_by_trusted=False) + assert dispatch_policy.select_dispatchable([issue], make_config(allowlist=["infra"]), set()) == [] + + +def test_trusted_issue_is_eligible(make_issue, make_config): + issue = make_issue(repo="infra", labeled_by_trusted=True) + assert len(dispatch_policy.select_dispatchable([issue], make_config(allowlist=["infra"]), set())) == 1 + + +def test_trust_gate_filters_only_untrusted(make_issue, make_config): + trusted = make_issue(number=1, repo="infra", labeled_by_trusted=True) + untrusted = make_issue(number=2, repo="infra", labeled_by_trusted=False) + decisions = dispatch_policy.select_dispatchable( + [trusted, untrusted], make_config(allowlist=["infra"]), set() + ) + assert _selected_set(decisions) == {1} + + +# --------------------------------------------------------------------------- # +# Allowlist membership. +# --------------------------------------------------------------------------- # +def test_repo_not_in_allowlist_is_skipped(make_issue, make_config): + issue = make_issue(repo="some-other-repo") + assert dispatch_policy.select_dispatchable([issue], make_config(allowlist=["infra"]), set()) == [] + + +def test_empty_allowlist_dispatches_nothing(make_issue, make_config): + issue = make_issue(repo="infra") + # kill switch off but allowlist empty -> still inert (the two-gate posture). 
+ config = make_config(allowlist=[], kill_switch=False) + assert dispatch_policy.select_dispatchable([issue], config, set()) == [] + + +def test_allowlist_selects_only_listed_repos(make_issue, make_config): + a = make_issue(number=1, repo="infra") + b = make_issue(number=2, repo="realestate-crawler") + c = make_issue(number=3, repo="not-allowed") + decisions = dispatch_policy.select_dispatchable( + [a, b, c], make_config(allowlist=["infra", "realestate-crawler"]), set() + ) + assert _selected_set(decisions) == {1, 2} + + +# --------------------------------------------------------------------------- # +# Per-repo lock (in_flight_repos). +# --------------------------------------------------------------------------- # +def test_repo_already_in_flight_is_skipped(make_issue, make_config): + issue = make_issue(repo="infra") + decisions = dispatch_policy.select_dispatchable( + [issue], make_config(allowlist=["infra"]), in_flight_repos={"infra"} + ) + assert decisions == [] + + +def test_in_flight_lock_is_per_repo(make_issue, make_config): + locked = make_issue(number=1, repo="infra") + free = make_issue(number=2, repo="realestate-crawler") + decisions = dispatch_policy.select_dispatchable( + [locked, free], + make_config(allowlist=["infra", "realestate-crawler"]), + in_flight_repos={"infra"}, + ) + assert _selected_set(decisions) == {2} # only the unlocked repo's issue runs + + +def test_all_repos_in_flight_dispatches_nothing(make_issue, make_config): + a = make_issue(number=1, repo="infra") + b = make_issue(number=2, repo="realestate-crawler") + decisions = dispatch_policy.select_dispatchable( + [a, b], + make_config(allowlist=["infra", "realestate-crawler"]), + in_flight_repos={"infra", "realestate-crawler"}, + ) + assert decisions == [] + + +# --------------------------------------------------------------------------- # +# One-agent-per-repo invariant — at most ONE decision per repo per call. 
+# +# The whole design serialises agents within a repo (two would collide on the +# working tree). A single call must therefore never hand back two issues for the +# same repo, even when both are eligible and the repo is not yet in-flight. +# --------------------------------------------------------------------------- # +def test_at_most_one_decision_per_repo(make_issue, make_config): + lo = make_issue(number=1, repo="infra", priority=1) + hi = make_issue(number=2, repo="infra", priority=9) + decisions = dispatch_policy.select_dispatchable( + [lo, hi], make_config(allowlist=["infra"]), set() + ) + assert len(decisions) == 1 + assert decisions[0].issue.number == 2 # the higher-priority one wins the slot + + +def test_one_decision_per_repo_across_many_repos(make_issue, make_config): + issues = [ + make_issue(number=10, repo="infra", priority=1), + make_issue(number=11, repo="infra", priority=5), + make_issue(number=20, repo="realestate-crawler", priority=3), + make_issue(number=21, repo="realestate-crawler", priority=2), + ] + decisions = dispatch_policy.select_dispatchable( + issues, make_config(allowlist=["infra", "realestate-crawler"]), set() + ) + # One per repo, each the repo's highest-priority eligible issue. + assert _selected_set(decisions) == {11, 20} + repos = [d.issue.repo for d in decisions] + assert len(repos) == len(set(repos)) # no repo appears twice + + +def test_ineligible_higher_priority_does_not_consume_repo_slot(make_issue, make_config): + """A higher-priority issue that is itself ineligible (e.g. 
blocked) must not + suppress a lower-priority *eligible* issue in the same repo — the slot goes + to the best ELIGIBLE candidate, not merely the highest-priority one.""" + blocked_hi = make_issue(number=1, repo="infra", priority=9, blocked_by=[99]) + ready_lo = make_issue(number=2, repo="infra", priority=1) + decisions = dispatch_policy.select_dispatchable( + [blocked_hi, ready_lo], make_config(allowlist=["infra"]), set() + ) + assert _selected_numbers(decisions) == [2] + + +# --------------------------------------------------------------------------- # +# blocked_by gating — blocked_by holds OPEN blocker numbers. +# --------------------------------------------------------------------------- # +def test_blocked_issue_is_skipped(make_issue, make_config): + issue = make_issue(repo="infra", blocked_by=[101]) + assert dispatch_policy.select_dispatchable([issue], make_config(allowlist=["infra"]), set()) == [] + + +def test_unblocked_issue_with_empty_blocked_by_is_eligible(make_issue, make_config): + issue = make_issue(repo="infra", blocked_by=[]) + assert len(dispatch_policy.select_dispatchable([issue], make_config(allowlist=["infra"]), set())) == 1 + + +@pytest.mark.parametrize("blockers", [[1], [1, 2], [5, 6, 7]]) +def test_any_open_blocker_blocks(make_issue, make_config, blockers): + issue = make_issue(repo="infra", blocked_by=blockers) + assert dispatch_policy.select_dispatchable([issue], make_config(allowlist=["infra"]), set()) == [] + + +def test_blocked_filters_only_blocked(make_issue, make_config): + ready = make_issue(number=1, repo="infra", blocked_by=[]) + blocked = make_issue(number=2, repo="realestate-crawler", blocked_by=[7]) + decisions = dispatch_policy.select_dispatchable( + [ready, blocked], make_config(allowlist=["infra", "realestate-crawler"]), set() + ) + assert _selected_set(decisions) == {1} + + +# --------------------------------------------------------------------------- # +# Priority ordering — higher priority first, deterministic tiebreaker. 
+# --------------------------------------------------------------------------- # +def test_higher_priority_first(make_issue, make_config): + lo = make_issue(number=1, repo="infra", priority=1) + mid = make_issue(number=2, repo="realestate-crawler", priority=5) + hi = make_issue(number=3, repo="SparkyFitness", priority=9) + decisions = dispatch_policy.select_dispatchable( + [lo, hi, mid], + make_config(allowlist=["infra", "realestate-crawler", "SparkyFitness"]), + set(), + ) + assert _selected_numbers(decisions) == [3, 2, 1] # 9, 5, 1 + + +def test_ordering_independent_of_input_order(make_issue, make_config): + """Whatever order the caller supplies issues in, the dispatch order is the + same — sorted purely by the policy, not by arrival.""" + base = [ + ("infra", 10, 2), + ("realestate-crawler", 20, 8), + ("SparkyFitness", 30, 5), + ("health", 40, 1), + ] + allow = ["infra", "realestate-crawler", "SparkyFitness", "health"] + config = make_config(allowlist=allow) + expected = [20, 30, 10, 40] # priorities 8,5,2,1 + + for perm in itertools.permutations(base): + issues = [make_issue(number=n, repo=r, priority=p) for (r, n, p) in perm] + decisions = dispatch_policy.select_dispatchable(issues, config, set()) + assert _selected_numbers(decisions) == expected + + +def test_priority_ties_break_deterministically_by_issue_number(make_issue, make_config): + """Equal priority across different repos -> a stable, total order. 
We tie-break + on ascending issue number so the result never depends on dict/set iteration + or input order.""" + a = make_issue(number=30, repo="infra", priority=5) + b = make_issue(number=10, repo="realestate-crawler", priority=5) + c = make_issue(number=20, repo="SparkyFitness", priority=5) + config = make_config(allowlist=["infra", "realestate-crawler", "SparkyFitness"]) + + for perm in itertools.permutations([a, b, c]): + decisions = dispatch_policy.select_dispatchable(list(perm), config, set()) + assert _selected_numbers(decisions) == [10, 20, 30] + + +def test_negative_and_zero_priorities_order_correctly(make_issue, make_config): + neg = make_issue(number=1, repo="infra", priority=-5) + zero = make_issue(number=2, repo="realestate-crawler", priority=0) + pos = make_issue(number=3, repo="SparkyFitness", priority=3) + decisions = dispatch_policy.select_dispatchable( + [neg, zero, pos], + make_config(allowlist=["infra", "realestate-crawler", "SparkyFitness"]), + set(), + ) + assert _selected_numbers(decisions) == [3, 2, 1] # 3 > 0 > -5 + + +# --------------------------------------------------------------------------- # +# Reasons — human-readable, never parsed, but must be present and sensible. +# --------------------------------------------------------------------------- # +def test_every_decision_has_a_nonempty_reason(make_issue, make_config): + issues = [ + make_issue(number=1, repo="infra", priority=3), + make_issue(number=2, repo="realestate-crawler", priority=1), + ] + decisions = dispatch_policy.select_dispatchable( + issues, make_config(allowlist=["infra", "realestate-crawler"]), set() + ) + assert decisions # sanity + assert all(d.reason.strip() for d in decisions) + + +# --------------------------------------------------------------------------- # +# Combined matrix — every gate together. A single eligible needle in a haystack +# of issues that each trip exactly one gate. 
+# --------------------------------------------------------------------------- # +def test_only_the_fully_eligible_issue_survives_all_gates(make_issue, make_config): + config = make_config(allowlist=["infra", "realestate-crawler"], kill_switch=False) + in_flight = {"realestate-crawler"} # this repo is locked + + issues = [ + make_issue(number=1, repo="infra", priority=5), # ELIGIBLE + make_issue(number=2, repo="not-allowed", priority=9), # allowlist + make_issue(number=3, repo="infra", priority=9, labeled_by_trusted=False), # trust + make_issue(number=4, repo="infra", priority=9, blocked_by=[1]), # blocked + make_issue(number=5, repo="realestate-crawler", priority=9), # repo locked + ] + decisions = dispatch_policy.select_dispatchable(issues, config, in_flight) + assert _selected_numbers(decisions) == [1] + assert decisions[0].issue.repo == "infra" + + +@pytest.mark.parametrize("trusted", [True, False]) +@pytest.mark.parametrize("allowed", [True, False]) +@pytest.mark.parametrize("blocked", [True, False]) +@pytest.mark.parametrize("locked", [True, False]) +@pytest.mark.parametrize("killed", [True, False]) +def test_full_eligibility_matrix( + make_issue, make_config, trusted, allowed, blocked, locked, killed +): + """Exhaustive truth table: an issue is dispatched iff ALL gates pass and the + kill switch is off. 
2**5 = 32 cases, single issue so ordering is moot.""" + issue = make_issue( + number=1, + repo="infra", + priority=0, + labeled_by_trusted=trusted, + blocked_by=[99] if blocked else [], + ) + config = make_config( + allowlist=["infra"] if allowed else ["other-repo"], + kill_switch=killed, + ) + in_flight = {"infra"} if locked else set() + + decisions = dispatch_policy.select_dispatchable([issue], config, in_flight) + + should_dispatch = trusted and allowed and not blocked and not locked and not killed + assert (len(decisions) == 1) is should_dispatch + if should_dispatch: + assert decisions[0].issue is issue diff --git a/tests/test_afk_notifier.py b/tests/test_afk_notifier.py new file mode 100644 index 0000000..d1a911b --- /dev/null +++ b/tests/test_afk_notifier.py @@ -0,0 +1,198 @@ +"""Tests for ``app.afk.notifier`` — the terminal-state doorbell. + +The notifier's whole job is to format a human-facing alert (Slack / ntfy) with a +deep-link back to the T3 thread when a run reaches a terminal state — done, +needs-human, or frozen — and hand it to an injected sender. Every test here +injects a recording fake sender, so nothing is ever POSTed: we assert the +*formatted payload* per kind, plus the deep-link, the kind vocabulary, and the +guardrails (no thread → no link, unknown kind rejected, sender called exactly +once with the return value being None). + +No real Slack/ntfy/T3 is touched — consistent with the rest of the AFK suite. +""" +import pytest + +from app.afk import notifier as notifier_mod +from app.afk.notifier import KIND_DONE, KIND_FROZEN, KIND_NEEDS_HUMAN, Notification, Notifier +from app.afk.types import Issue + + +# --------------------------------------------------------------------------- # +# A recording sender — captures the Notification instead of posting it. +# --------------------------------------------------------------------------- # +class RecordingSender: + """Injectable stand-in for the real Slack/ntfy POST. 
Records each payload so + a test can assert the formatting without any network.""" + + def __init__(self) -> None: + self.sent: list[Notification] = [] + + def __call__(self, notification: Notification) -> None: + self.sent.append(notification) + + +@pytest.fixture +def sender() -> RecordingSender: + return RecordingSender() + + +def _issue(number: int = 42, repo: str = "infra") -> Issue: + return Issue( + number=number, + repo=repo, + labels=["ready-for-agent"], + blocked_by=[], + labeled_by_trusted=True, + priority=0, + ) + + +# --------------------------------------------------------------------------- # +# Kind vocabulary — the three terminal states, and nothing else. +# --------------------------------------------------------------------------- # +def test_terminal_kinds_are_exactly_the_three_terminal_states(): + assert KIND_DONE == "done" + assert KIND_NEEDS_HUMAN == "needs-human" + assert KIND_FROZEN == "frozen" + assert notifier_mod.TERMINAL_KINDS == {KIND_DONE, KIND_NEEDS_HUMAN, KIND_FROZEN} + + +# --------------------------------------------------------------------------- # +# Dispatch mechanics — sender injected, called exactly once, returns None. +# --------------------------------------------------------------------------- # +def test_notify_calls_sender_exactly_once_and_returns_none(sender): + n = Notifier(sender) + result = n.notify(KIND_DONE, _issue(), "thread-7", "all green") + assert result is None + assert len(sender.sent) == 1 + + +def test_notify_does_not_post_anything_itself(sender): + """The Notifier must never reach the network on its own — all egress goes + through the injected sender. A test-only sentinel proves that.""" + n = Notifier(sender) + n.notify(KIND_FROZEN, _issue(), "thread-1", "budget exhausted") + # Nothing other than the injected sender ran: exactly one recorded payload, + # and it is the Notification dataclass (not a raw dict / HTTP response). 
+ assert isinstance(sender.sent[0], Notification) + + +# --------------------------------------------------------------------------- # +# Deep-link — every payload links back to the T3 thread (when there is one). +# --------------------------------------------------------------------------- # +def test_payload_deep_links_to_the_t3_thread(sender): + n = Notifier(sender, base_url="https://t3.viktorbarzin.me") + n.notify(KIND_DONE, _issue(), "thread-abc", "done") + payload = sender.sent[0] + assert payload.link == "https://t3.viktorbarzin.me/?thread=thread-abc" + # The link is also surfaced in the human-readable body so it survives + # senders that drop structured fields (e.g. a plain ntfy message). + assert "https://t3.viktorbarzin.me/?thread=thread-abc" in payload.body + + +def test_base_url_trailing_slash_is_normalised(sender): + n = Notifier(sender, base_url="https://t3.viktorbarzin.me/") + n.notify(KIND_DONE, _issue(), "thread-x", "done") + assert sender.sent[0].link == "https://t3.viktorbarzin.me/?thread=thread-x" + + +def test_no_thread_id_means_no_link(sender): + """A run can reach 'needs-human' before any thread exists (e.g. dispatch + itself failed). Without a thread there is nothing to deep-link to, so the + link is None — but the doorbell still fires.""" + n = Notifier(sender) + n.notify(KIND_NEEDS_HUMAN, _issue(), None, "dispatch failed") + payload = sender.sent[0] + assert payload.link is None + assert len(sender.sent) == 1 + # No dangling "/?thread=" fragment leaks into the body either. + assert "?thread=" not in payload.body + + +# --------------------------------------------------------------------------- # +# Per-kind formatting — title / body / priority / tags differ per terminal kind. 
+# --------------------------------------------------------------------------- # +def test_done_payload_is_informational(sender): + n = Notifier(sender) + n.notify(KIND_DONE, _issue(number=7, repo="infra"), "thread-7", "merged + CI green") + p = sender.sent[0] + assert p.kind == KIND_DONE + assert p.issue_ref == "infra#7" + assert "infra#7" in p.title + assert "merged + CI green" in p.body + # A successful close is informational, not an escalation. + assert p.priority == "low" + assert "escalation" not in p.tags + + +def test_needs_human_payload_is_an_escalation(sender): + n = Notifier(sender) + n.notify(KIND_NEEDS_HUMAN, _issue(number=9, repo="claude-agent-service"), "thread-9", "errored before push") + p = sender.sent[0] + assert p.kind == KIND_NEEDS_HUMAN + assert p.issue_ref == "claude-agent-service#9" + assert "claude-agent-service#9" in p.title + assert "errored before push" in p.body + assert p.priority == "high" + assert "escalation" in p.tags + + +def test_frozen_payload_is_an_escalation(sender): + n = Notifier(sender) + n.notify(KIND_FROZEN, _issue(number=3, repo="infra"), "thread-3", "fix-forward budget exhausted") + p = sender.sent[0] + assert p.kind == KIND_FROZEN + assert "infra#3" in p.title + assert "fix-forward budget exhausted" in p.body + assert p.priority == "high" + assert "escalation" in p.tags + + +def test_titles_distinguish_the_three_kinds(sender): + """An operator skimming a Slack channel must tell the three apart from the + title alone, without reading the body.""" + n = Notifier(sender) + n.notify(KIND_DONE, _issue(), "t", "x") + n.notify(KIND_NEEDS_HUMAN, _issue(), "t", "x") + n.notify(KIND_FROZEN, _issue(), "t", "x") + titles = [p.title for p in sender.sent] + assert len({t.split(" ")[0] for t in titles}) == 3 # distinct leading marker per kind + + +# --------------------------------------------------------------------------- # +# Guardrail — only terminal kinds are sendable. An unknown kind is a bug. 
+# --------------------------------------------------------------------------- # +def test_unknown_kind_raises_and_sends_nothing(sender): + n = Notifier(sender) + with pytest.raises(ValueError): + n.notify("running", _issue(), "thread-1", "still working") + assert sender.sent == [] + + +# --------------------------------------------------------------------------- # +# Pure formatter — render_notification builds the payload independently of any +# sender, so the formatting is unit-testable on its own. +# --------------------------------------------------------------------------- # +def test_render_notification_is_pure_and_matches_notify(sender): + issue = _issue(number=11, repo="infra") + built = notifier_mod.render_notification( + KIND_FROZEN, issue, "thread-11", "stuck", base_url="https://t3.viktorbarzin.me" + ) + assert isinstance(built, Notification) + assert built.link == "https://t3.viktorbarzin.me/?thread=thread-11" + # notify() must produce the identical payload it hands the sender. + Notifier(sender, base_url="https://t3.viktorbarzin.me").notify( + KIND_FROZEN, issue, "thread-11", "stuck" + ) + assert sender.sent[0] == built + + +def test_sender_exception_propagates(sender): + """If the sender fails (Slack down), the notifier does not swallow it — the + loop decides what to do with a failed doorbell, not this adapter.""" + def boom(_notification: Notification) -> None: + raise RuntimeError("slack 503") + + n = Notifier(boom) + with pytest.raises(RuntimeError, match="slack 503"): + n.notify(KIND_DONE, _issue(), "thread-1", "done") diff --git a/tests/test_afk_phase_checklist.py b/tests/test_afk_phase_checklist.py new file mode 100644 index 0000000..2129e31 --- /dev/null +++ b/tests/test_afk_phase_checklist.py @@ -0,0 +1,247 @@ +"""Tests for ``app.afk.phase_checklist`` — the live progress checklist. + +``render(current, meta)`` is PURE: same inputs → byte-identical markdown, no I/O. 
+It draws the seven-phase lifecycle (worktree → tests-red → green → pushed → CI → +deployed → done) as a markdown task list, with phases *before* ``current`` checked +off, ``current`` marked in-progress, and later phases left empty. + +Style matches the existing suite: plain ``assert`` functions, parametrized cases, +and a couple of full-output snapshots so the rendered shape is pinned, not just +its line count. +""" +import pytest + +from app.afk.phase_checklist import render +from app.afk.types import Phase + + +# Lifecycle order, mirrored from the contract so a reordering of the enum that +# the renderer didn't track shows up as a test failure rather than silent drift. +PHASES_IN_ORDER = [ + Phase.WORKTREE, + Phase.TESTS_RED, + Phase.GREEN, + Phase.PUSHED, + Phase.CI, + Phase.DEPLOYED, + Phase.DONE, +] + + +# --------------------------------------------------------------------------- # +# Structure: one line per phase, in order, always all seven. +# --------------------------------------------------------------------------- # +def _checklist_lines(out: str) -> list[str]: + """The markdown task-list lines (``- [ ]`` / ``- [x]`` ...), in order.""" + return [ln for ln in out.splitlines() if ln.lstrip().startswith("- [")] + + +def test_renders_a_string(): + assert isinstance(render(Phase.WORKTREE, {}), str) + + +@pytest.mark.parametrize("current", PHASES_IN_ORDER) +def test_every_phase_has_exactly_one_checklist_line(current): + lines = _checklist_lines(render(current, {})) + assert len(lines) == len(PHASES_IN_ORDER) + + +@pytest.mark.parametrize("current", PHASES_IN_ORDER) +def test_checklist_lines_are_in_lifecycle_order(current): + lines = _checklist_lines(render(current, {})) + # Each phase's human label appears, and in the lifecycle order. 
+ positions = [ + next(i for i, ln in enumerate(lines) if _has_label(ln, phase)) + for phase in PHASES_IN_ORDER + ] + assert positions == sorted(positions) + + +def _has_label(line: str, phase: Phase) -> bool: + """Whether a checklist line carries ``phase``'s headline word (case-insensitive + substring — the test asserts the label is *present*, not its exact decoration).""" + return _phase_label(phase).lower() in line.lower() + + +def _phase_label(phase: Phase) -> str: + """The headline word(s) the renderer must use for a phase. Loose on purpose: + the test asserts the label is *present*, not the exact decoration.""" + return { + Phase.WORKTREE: "worktree", + Phase.TESTS_RED: "test", + Phase.GREEN: "green", + Phase.PUSHED: "push", + Phase.CI: "CI", + Phase.DEPLOYED: "deploy", + Phase.DONE: "done", + }[phase] + + +# --------------------------------------------------------------------------- # +# Check/in-progress/empty partitioning around ``current``. +# --------------------------------------------------------------------------- # +def _classify(line: str) -> str: + """Bucket a checklist line by its marker: 'done' ``[x]``, 'todo' ``[ ]``, or + 'active' (anything else, e.g. an in-progress glyph).""" + body = line.lstrip() + if body.startswith("- [x]"): + return "done" + if body.startswith("- [ ]"): + return "todo" + return "active" + + +@pytest.mark.parametrize("idx,current", list(enumerate(PHASES_IN_ORDER))) +def test_earlier_checked_current_active_later_empty(idx, current): + lines = _checklist_lines(render(current, {})) + buckets = [_classify(ln) for ln in lines] + + # Everything strictly before the current phase is checked off. + assert all(b == "done" for b in buckets[:idx]), buckets + + if current is Phase.DONE: + # Terminal phase: the whole list is checked, nothing left active/empty. + assert all(b == "done" for b in buckets), buckets + else: + # The current phase is the single in-progress marker... 
+ assert buckets[idx] == "active", buckets + assert buckets.count("active") == 1, buckets + # ...and every phase after it is still an empty checkbox. + assert all(b == "todo" for b in buckets[idx + 1 :]), buckets + + +def test_first_phase_has_nothing_checked_before_it(): + lines = _checklist_lines(render(Phase.WORKTREE, {})) + assert _classify(lines[0]) == "active" + assert "done" not in [_classify(ln) for ln in lines] + + +def test_done_checks_every_phase_including_done(): + lines = _checklist_lines(render(Phase.DONE, {})) + assert all(_classify(ln) == "done" for ln in lines) + # The DONE line itself is checked, not merely the ones before it. + done_line = next(ln for ln in lines if _has_label(ln, Phase.DONE)) + assert _classify(done_line) == "done" + + +# --------------------------------------------------------------------------- # +# Active-phase emphasis: the current phase is visually distinguishable. +# --------------------------------------------------------------------------- # +@pytest.mark.parametrize("current", [p for p in PHASES_IN_ORDER if p is not Phase.DONE]) +def test_active_phase_line_differs_from_todo_and_done_markers(current): + lines = _checklist_lines(render(current, {})) + active = [ln for ln in lines if _classify(ln) == "active"] + assert len(active) == 1 + # Not a plain checkbox in either state. + assert not active[0].lstrip().startswith("- [x]") + assert not active[0].lstrip().startswith("- [ ]") + + +# --------------------------------------------------------------------------- # +# meta rendering: optional context is surfaced, omission never explodes. 
+# --------------------------------------------------------------------------- # +def test_meta_empty_does_not_raise_and_still_lists_phases(): + out = render(Phase.GREEN, {}) + assert _checklist_lines(out) # non-empty + + +def test_meta_issue_and_repo_appear_in_output(): + out = render(Phase.GREEN, {"repo": "infra", "issue": 42}) + assert "infra" in out + assert "42" in out + + +def test_meta_thread_id_appears_when_present(): + out = render(Phase.PUSHED, {"thread_id": "thread-7"}) + assert "thread-7" in out + + +def test_meta_thread_id_absent_is_silent(): + out = render(Phase.PUSHED, {}) + assert "thread-" not in out + + +def test_meta_fix_forward_attempt_surfaced(): + out = render(Phase.CI, {"fix_forward_attempts": 3}) + assert "3" in out + + +def test_meta_unknown_keys_are_ignored(): + # An unexpected key must not crash or leak its raw value as a stray line. + out = render(Phase.WORKTREE, {"totally_unknown_field": "should-not-appear"}) + assert "should-not-appear" not in out + + +# --------------------------------------------------------------------------- # +# Determinism + idempotence (it's pure). +# --------------------------------------------------------------------------- # +@pytest.mark.parametrize("current", PHASES_IN_ORDER) +def test_render_is_deterministic(current): + meta = {"repo": "infra", "issue": 9, "thread_id": "thread-1"} + assert render(current, meta) == render(current, meta) + + +def test_render_does_not_mutate_meta(): + meta = {"repo": "infra", "issue": 1} + before = dict(meta) + render(Phase.GREEN, meta) + assert meta == before + + +# --------------------------------------------------------------------------- # +# Snapshots: pin the exact rendered shape for three representative phases (WORKTREE, CI, DONE). If the +# format changes intentionally, update these strings; an accidental change to +# wording/markers/order fails here loudly. 
+# --------------------------------------------------------------------------- # +WORKTREE_SNAPSHOT = """\ +### infra#7 — AFK run progress + +- [~] Worktree created +- [ ] Failing test written (TDD red) +- [ ] Implementation passing (TDD green) +- [ ] Pushed to master +- [ ] CI green on pushed commit +- [ ] Deployed / rolled out +- [ ] Done — issue closed +""" + + +def test_snapshot_worktree_phase(): + out = render(Phase.WORKTREE, {"repo": "infra", "issue": 7}) + assert out == WORKTREE_SNAPSHOT + + +CI_SNAPSHOT = """\ +### infra#7 — AFK run progress (thread thread-3) + +- [x] Worktree created +- [x] Failing test written (TDD red) +- [x] Implementation passing (TDD green) +- [x] Pushed to master +- [~] CI green on pushed commit +- [ ] Deployed / rolled out +- [ ] Done — issue closed +""" + + +def test_snapshot_ci_phase_with_thread(): + out = render(Phase.CI, {"repo": "infra", "issue": 7, "thread_id": "thread-3"}) + assert out == CI_SNAPSHOT + + +DONE_SNAPSHOT = """\ +### infra#7 — AFK run progress + +- [x] Worktree created +- [x] Failing test written (TDD red) +- [x] Implementation passing (TDD green) +- [x] Pushed to master +- [x] CI green on pushed commit +- [x] Deployed / rolled out +- [x] Done — issue closed +""" + + +def test_snapshot_done_phase(): + out = render(Phase.DONE, {"repo": "infra", "issue": 7}) + assert out == DONE_SNAPSHOT diff --git a/tests/test_afk_poller.py b/tests/test_afk_poller.py new file mode 100644 index 0000000..d9d68d3 --- /dev/null +++ b/tests/test_afk_poller.py @@ -0,0 +1,269 @@ +"""Integration tests for ``app.afk.poller`` — the CronJob dispatch tick. + +Unlike the unit suites, these wire the REAL pure cores (the actual +``dispatch_policy.select_dispatchable``) to the in-memory adapter FAKES from +``conftest`` (``FakeTracker`` / ``FakeT3Client``). No test touches a real T3 +server, GitHub/Forgejo, or the cluster — the poller is exercised end to end with +fakes standing in only for the I/O edges. 
+ +What the tick must do (the poller contract): + + * **kill switch** — a disabled config dispatches nothing AND never calls the + tracker or T3 (the CronJob does no I/O when the loop is off); + * read the ready set via ``tracker.list_ready(config.allowlist)``; + * derive the **per-repo lock** from the ready set itself — a repo with an issue + already carrying the ``in_progress_label`` is in flight and is skipped (the + CronJob is stateless between ticks, so the tracker is the source of truth); + * run the real ``select_dispatchable`` over (ready issues, config, in-flight + repos) and, for each decision, ``t3_client.dispatch(...)`` then + ``tracker.add_label(repo, issue, in_progress_label)`` — label AFTER a + successful dispatch so a dispatch failure never leaves a phantom lock. +""" +import pytest + +from app.afk import poller +from app.afk.types import Config + + +# --------------------------------------------------------------------------- # +# Helpers. +# --------------------------------------------------------------------------- # +def _poller(fake_tracker, fake_t3) -> poller.Poller: + """A Poller wired to the conftest fakes and the real dispatch policy.""" + return poller.Poller(tracker=fake_tracker, t3_client=fake_t3) + + +def _dispatched_pairs(fake_t3) -> set[tuple[str, int]]: + return {(d["repo"], d["issue"]) for d in fake_t3.dispatched} + + +def _added_in_progress(fake_tracker, label: str = "agent-in-progress") -> set[tuple[str, int]]: + return { + (repo, issue) + for (op, repo, issue, lbl) in fake_tracker.label_ops + if op == "add" and lbl == label + } + + +# --------------------------------------------------------------------------- # +# Kill switch — no dispatch, no I/O at all. 
+# --------------------------------------------------------------------------- # +def test_kill_switch_dispatches_nothing(fake_tracker, fake_t3, make_issue): + fake_tracker.seed("infra", [make_issue(number=1, repo="infra")]) + config = Config(allowlist=["infra"], kill_switch=True) + + result = _poller(fake_tracker, fake_t3).run_once(config) + + assert result.dispatched == [] + assert fake_t3.dispatched == [] + + +def test_kill_switch_does_not_even_read_the_tracker(fake_t3): + """When the loop is off the CronJob must do zero I/O — not a single tracker + or T3 call. A tracker that explodes if touched proves it.""" + class ExplodingTracker: + def list_ready(self, repos): + raise AssertionError("tracker must not be read when kill switch is on") + + config = Config(allowlist=["infra"], kill_switch=True) + result = poller.Poller(tracker=ExplodingTracker(), t3_client=fake_t3).run_once(config) + assert result.dispatched == [] + + +# --------------------------------------------------------------------------- # +# Empty allowlist — kill switch cleared, but no repo is enrolled, so nothing runs. +# --------------------------------------------------------------------------- # +def test_empty_allowlist_dispatches_nothing(fake_tracker, fake_t3, make_issue): + # list_ready([]) returns nothing, and even if it didn't the policy gates on + # the (empty) allowlist. An empty allowlist is the shipped default. + config = Config(allowlist=[], kill_switch=False) + result = _poller(fake_tracker, fake_t3).run_once(config) + assert result.dispatched == [] + assert fake_t3.dispatched == [] + + +# --------------------------------------------------------------------------- # +# Happy path — one ready issue gets dispatched and labelled. 
+# --------------------------------------------------------------------------- # +def test_dispatches_a_ready_issue(fake_tracker, fake_t3, make_issue): + fake_tracker.seed("infra", [make_issue(number=7, repo="infra")]) + config = Config(allowlist=["infra"], kill_switch=False) + + result = _poller(fake_tracker, fake_t3).run_once(config) + + assert _dispatched_pairs(fake_t3) == {("infra", 7)} + assert len(result.dispatched) == 1 + assert result.dispatched[0].thread_id == "thread-0" + assert result.dispatched[0].issue.number == 7 + + +def test_labels_in_progress_after_dispatch(fake_tracker, fake_t3, make_issue): + fake_tracker.seed("infra", [make_issue(number=7, repo="infra")]) + config = Config(allowlist=["infra"], kill_switch=False) + + _poller(fake_tracker, fake_t3).run_once(config) + + assert _added_in_progress(fake_tracker) == {("infra", 7)} + + +def test_in_progress_label_honours_config_override(fake_tracker, fake_t3, make_issue): + fake_tracker.seed("infra", [make_issue(number=7, repo="infra")]) + config = Config(allowlist=["infra"], kill_switch=False, in_progress_label="busy") + + _poller(fake_tracker, fake_t3).run_once(config) + + assert _added_in_progress(fake_tracker, "busy") == {("infra", 7)} + + +def test_dispatch_prompt_references_the_issue(fake_tracker, fake_t3, make_issue): + """The agent runs full-access and fetches the body itself, so the prompt the + poller sends must at minimum point at the concrete repo#issue.""" + fake_tracker.seed("infra", [make_issue(number=7, repo="infra")]) + config = Config(allowlist=["infra"], kill_switch=False) + + _poller(fake_tracker, fake_t3).run_once(config) + + prompt = fake_t3.dispatched[0]["prompt"] + assert "7" in prompt and "infra" in prompt + assert prompt.strip() # non-empty + + +# --------------------------------------------------------------------------- # +# Per-repo lock — an issue already carrying the in-progress label means an agent +# is in flight on that repo, so the repo is skipped this tick. 
+# --------------------------------------------------------------------------- # +def test_repo_with_in_progress_issue_is_locked(fake_tracker, fake_t3, make_issue): + in_flight = make_issue( + number=1, repo="infra", labels=["ready-for-agent", "agent-in-progress"] + ) + waiting = make_issue(number=2, repo="infra", labels=["ready-for-agent"]) + fake_tracker.seed("infra", [in_flight, waiting]) + config = Config(allowlist=["infra"], kill_switch=False) + + result = _poller(fake_tracker, fake_t3).run_once(config) + + # Repo already busy → nothing new dispatched, no new in-progress label. + assert result.dispatched == [] + assert fake_t3.dispatched == [] + assert _added_in_progress(fake_tracker) == set() + + +def test_lock_is_per_repo_not_global(fake_tracker, fake_t3, make_issue): + # infra is busy; a different repo is free and should still dispatch. + fake_tracker.seed( + "infra", + [make_issue(number=1, repo="infra", labels=["ready-for-agent", "agent-in-progress"])], + ) + fake_tracker.seed("dotfiles", [make_issue(number=2, repo="dotfiles")]) + config = Config(allowlist=["infra", "dotfiles"], kill_switch=False) + + result = _poller(fake_tracker, fake_t3).run_once(config) + + assert _dispatched_pairs(fake_t3) == {("dotfiles", 2)} + assert {d.issue.repo for d in result.dispatched} == {"dotfiles"} + + +def test_custom_in_progress_label_drives_the_lock(fake_tracker, fake_t3, make_issue): + # The lock keys off config.in_progress_label, not the hardcoded default. 
+ fake_tracker.seed( + "infra", + [make_issue(number=1, repo="infra", labels=["ready-for-agent", "busy"])], + ) + config = Config(allowlist=["infra"], kill_switch=False, in_progress_label="busy") + result = _poller(fake_tracker, fake_t3).run_once(config) + assert result.dispatched == [] + + +# --------------------------------------------------------------------------- # +# One dispatch per repo per tick (the policy's one-agent-per-repo invariant, +# observed through the poller): highest-priority eligible issue wins the slot. +# --------------------------------------------------------------------------- # +def test_one_dispatch_per_repo_per_tick(fake_tracker, fake_t3, make_issue): + fake_tracker.seed( + "infra", + [ + make_issue(number=1, repo="infra", priority=1), + make_issue(number=2, repo="infra", priority=9), # highest priority + make_issue(number=3, repo="infra", priority=5), + ], + ) + config = Config(allowlist=["infra"], kill_switch=False) + + _poller(fake_tracker, fake_t3).run_once(config) + + assert _dispatched_pairs(fake_t3) == {("infra", 2)} + assert _added_in_progress(fake_tracker) == {("infra", 2)} + + +# --------------------------------------------------------------------------- # +# Gating still applies through the poller (the pure policy enforces it; the +# poller must not bypass it). 
+# --------------------------------------------------------------------------- # +def test_untrusted_issue_is_not_dispatched(fake_tracker, fake_t3, make_issue): + fake_tracker.seed( + "infra", [make_issue(number=1, repo="infra", labeled_by_trusted=False)] + ) + config = Config(allowlist=["infra"], kill_switch=False) + result = _poller(fake_tracker, fake_t3).run_once(config) + assert result.dispatched == [] + assert fake_t3.dispatched == [] + + +def test_blocked_issue_is_not_dispatched(fake_tracker, fake_t3, make_issue): + fake_tracker.seed( + "infra", [make_issue(number=2, repo="infra", blocked_by=[1])] + ) + config = Config(allowlist=["infra"], kill_switch=False) + result = _poller(fake_tracker, fake_t3).run_once(config) + assert result.dispatched == [] + + +def test_repo_outside_allowlist_is_not_dispatched(fake_tracker, fake_t3, make_issue): + # list_ready only queries the allowlist, but even if a stray repo's issues + # arrive the policy's allowlist gate drops them. + fake_tracker.seed("secret", [make_issue(number=1, repo="secret")]) + config = Config(allowlist=["infra"], kill_switch=False) + result = _poller(fake_tracker, fake_t3).run_once(config) + assert result.dispatched == [] + + +# --------------------------------------------------------------------------- # +# Dispatch failure must not leave a phantom lock (label only AFTER success). 
+# --------------------------------------------------------------------------- # +def test_dispatch_failure_does_not_label_in_progress(fake_tracker, make_issue): + class FailingT3: + def __init__(self): + self.dispatched = [] + + def dispatch(self, repo, issue, prompt): + raise RuntimeError("T3 down") + + fake_tracker.seed("infra", [make_issue(number=7, repo="infra")]) + config = Config(allowlist=["infra"], kill_switch=False) + + with pytest.raises(RuntimeError): + poller.Poller(tracker=fake_tracker, t3_client=FailingT3()).run_once(config) + + # No in-progress label was applied — the issue stays purely ready, so the + # next tick retries it rather than treating it as locked. + assert _added_in_progress(fake_tracker) == set() + + +# --------------------------------------------------------------------------- # +# list_ready is called with exactly the allowlist (not all repos). +# --------------------------------------------------------------------------- # +def test_queries_only_the_allowlisted_repos(fake_t3, make_issue): + seen_repos: list[list[str]] = [] + + class RecordingTracker: + def list_ready(self, repos): + seen_repos.append(list(repos)) + return [] + + def add_label(self, *a): # pragma: no cover - not reached here + raise AssertionError("nothing to label") + + config = Config(allowlist=["infra", "dotfiles"], kill_switch=False) + poller.Poller(tracker=RecordingTracker(), t3_client=fake_t3).run_once(config) + + assert seen_repos == [["infra", "dotfiles"]] diff --git a/tests/test_afk_run_state_machine.py b/tests/test_afk_run_state_machine.py new file mode 100644 index 0000000..5541724 --- /dev/null +++ b/tests/test_afk_run_state_machine.py @@ -0,0 +1,190 @@ +"""Tests for ``app.afk.run_state_machine.next_action`` — the pure decision +function that turns one assembled ``RunState`` into the next ``Action``. 
+ +The function encodes ADR-0002's run lifecycle: + + * healthy (pushed AND CI green) -> CLOSE_SUCCESS + * cannot reach green before push (errored / + stalled with nothing pushed) -> ESCALATE_PREPUSH + * pushed but CI red, budget remaining -> FIX_FORWARD + * pushed but CI red, budget exhausted -> FREEZE_ESCALATE + * anything still in flight -> WAIT + +It is PURE: no I/O, no clock, no globals — it reads only its two arguments, so +every case is a plain table assertion. ``make_config`` / ``make_run_state`` come +from ``conftest.py`` (config defaults to ENABLED, run state to a fresh dispatch). +""" +import pytest + +from app.afk.run_state_machine import next_action +from app.afk.types import Action, CIStatus, ThreadStatus + + +# --------------------------------------------------------------------------- # +# Healthy terminal: pushed + CI green -> close, regardless of thread status. +# --------------------------------------------------------------------------- # +@pytest.mark.parametrize( + "thread_status", + [ThreadStatus.RUNNING, ThreadStatus.IDLE, ThreadStatus.ERROR, None], +) +def test_pushed_and_green_closes_success(make_config, make_run_state, thread_status): + state = make_run_state( + thread_status=thread_status, ci_status=CIStatus.GREEN, pushed=True + ) + assert next_action(state, make_config()) is Action.CLOSE_SUCCESS + + +# --------------------------------------------------------------------------- # +# Pre-push escalation: nothing pushed and the turn is no longer going to push +# (errored, or finished/stalled clean) -> hand back to a human. 
+# --------------------------------------------------------------------------- # +@pytest.mark.parametrize("thread_status", [ThreadStatus.ERROR, ThreadStatus.IDLE]) +@pytest.mark.parametrize("ci_status", [None, CIStatus.PENDING]) +def test_not_pushed_terminal_thread_escalates_prepush( + make_config, make_run_state, thread_status, ci_status +): + state = make_run_state( + thread_status=thread_status, ci_status=ci_status, pushed=False + ) + assert next_action(state, make_config()) is Action.ESCALATE_PREPUSH + + +# --------------------------------------------------------------------------- # +# Still working toward a first push -> WAIT (not yet an escalation). +# --------------------------------------------------------------------------- # +@pytest.mark.parametrize("thread_status", [ThreadStatus.RUNNING, None]) +@pytest.mark.parametrize("ci_status", [None, CIStatus.PENDING]) +def test_not_pushed_in_flight_waits( + make_config, make_run_state, thread_status, ci_status +): + state = make_run_state( + thread_status=thread_status, ci_status=ci_status, pushed=False + ) + assert next_action(state, make_config()) is Action.WAIT + + +# --------------------------------------------------------------------------- # +# Pushed, CI not yet decided -> WAIT for the verdict, whatever the thread does. +# --------------------------------------------------------------------------- # +@pytest.mark.parametrize( + "thread_status", + [ThreadStatus.RUNNING, ThreadStatus.IDLE, ThreadStatus.ERROR, None], +) +@pytest.mark.parametrize("ci_status", [None, CIStatus.PENDING]) +def test_pushed_ci_pending_waits( + make_config, make_run_state, thread_status, ci_status +): + state = make_run_state( + thread_status=thread_status, ci_status=ci_status, pushed=True + ) + assert next_action(state, make_config()) is Action.WAIT + + +# --------------------------------------------------------------------------- # +# Pushed + CI red: fix-forward while BOTH budgets remain, else freeze. 
+# Boundaries are strict-less-than on attempts AND elapsed; at/over either freezes. +# --------------------------------------------------------------------------- # +@pytest.mark.parametrize( + ("attempts", "elapsed", "expected"), + [ + # fresh red, plenty of budget -> fix forward + (0, 0.0, Action.FIX_FORWARD), + (1, 10.0, Action.FIX_FORWARD), + # last point inside BOTH budgets (one attempt and one second under the caps) -> still fix forward + (4, 3599.0, Action.FIX_FORWARD), + # attempts at or over the cap (5) -> freeze + (5, 0.0, Action.FREEZE_ESCALATE), + (6, 0.0, Action.FREEZE_ESCALATE), + # clock at or over the cap (3600s) -> freeze even with attempts to spare + (0, 3600.0, Action.FREEZE_ESCALATE), + (0, 7200.0, Action.FREEZE_ESCALATE), + # both exhausted -> freeze + (5, 3600.0, Action.FREEZE_ESCALATE), + ], +) +def test_pushed_red_fix_forward_until_budget_exhausted( + make_config, make_run_state, attempts, elapsed, expected +): + state = make_run_state( + thread_status=ThreadStatus.IDLE, + ci_status=CIStatus.RED, + pushed=True, + fix_forward_attempts=attempts, + elapsed_seconds=elapsed, + ) + assert next_action(state, make_config()) is expected + + +# --------------------------------------------------------------------------- # +# Fix-forward budget is honoured from config, not hardcoded. 
+# --------------------------------------------------------------------------- # +def test_fix_forward_attempts_cap_comes_from_config(make_config, make_run_state): + config = make_config(fix_forward_max_attempts=2) + red = dict(thread_status=ThreadStatus.IDLE, ci_status=CIStatus.RED, pushed=True) + assert next_action(make_run_state(fix_forward_attempts=1, **red), config) is Action.FIX_FORWARD + assert next_action(make_run_state(fix_forward_attempts=2, **red), config) is Action.FREEZE_ESCALATE + + +def test_fix_forward_seconds_cap_comes_from_config(make_config, make_run_state): + config = make_config(fix_forward_max_seconds=120) + red = dict(thread_status=ThreadStatus.IDLE, ci_status=CIStatus.RED, pushed=True) + assert next_action(make_run_state(elapsed_seconds=119.0, **red), config) is Action.FIX_FORWARD + assert next_action(make_run_state(elapsed_seconds=120.0, **red), config) is Action.FREEZE_ESCALATE + + +# --------------------------------------------------------------------------- # +# A red CI on a pushed commit while the thread is still RUNNING a fix is, per +# spec, keyed only on (pushed AND red) + budget — thread status doesn't gate it. +# --------------------------------------------------------------------------- # +@pytest.mark.parametrize( + "thread_status", + [ThreadStatus.RUNNING, ThreadStatus.IDLE, ThreadStatus.ERROR, None], +) +def test_pushed_red_with_budget_fixes_forward_for_any_thread_status( + make_config, make_run_state, thread_status +): + state = make_run_state( + thread_status=thread_status, + ci_status=CIStatus.RED, + pushed=True, + fix_forward_attempts=0, + elapsed_seconds=0.0, + ) + assert next_action(state, make_config()) is Action.FIX_FORWARD + + +# --------------------------------------------------------------------------- # +# Full cross-product sanity sweep: next_action is TOTAL — it returns a real +# Action for every reachable combination, and matches the reference table. 
+# --------------------------------------------------------------------------- # +def _expected(thread_status, ci_status, pushed): + """Reference implementation of the decision table, written independently of + the module under test, to cross-check every combination.""" + if pushed and ci_status is CIStatus.GREEN: + return Action.CLOSE_SUCCESS + if pushed and ci_status is CIStatus.RED: + return Action.FIX_FORWARD # budget always available in this sweep + if not pushed and thread_status in (ThreadStatus.ERROR, ThreadStatus.IDLE): + return Action.ESCALATE_PREPUSH + return Action.WAIT + + +@pytest.mark.parametrize( + "thread_status", + [ThreadStatus.RUNNING, ThreadStatus.IDLE, ThreadStatus.ERROR, None], +) +@pytest.mark.parametrize("ci_status", [None, CIStatus.PENDING, CIStatus.GREEN, CIStatus.RED]) +@pytest.mark.parametrize("pushed", [True, False]) +def test_decision_table_is_total( + make_config, make_run_state, thread_status, ci_status, pushed +): + state = make_run_state( + thread_status=thread_status, + ci_status=ci_status, + pushed=pushed, + fix_forward_attempts=0, + elapsed_seconds=0.0, + ) + result = next_action(state, make_config()) + assert isinstance(result, Action) + assert result is _expected(thread_status, ci_status, pushed) diff --git a/tests/test_afk_t3_client.py b/tests/test_afk_t3_client.py new file mode 100644 index 0000000..e969c29 --- /dev/null +++ b/tests/test_afk_t3_client.py @@ -0,0 +1,248 @@ +"""Tests for ``app.afk.t3_client`` — the in-cluster T3 dispatch/snapshot adapter. + +Everything here runs against an in-memory FAKE HTTP transport (``FakeHttp``); +no test touches a real T3 server, GitHub/Forgejo, or the cluster. 
The fake +records every request and replays staged responses, so the assertions pin the +wire contract the control plane depends on: + + * ``dispatch`` issues exactly TWO POSTs to ``/api/orchestration/dispatch`` — + ``thread.create`` then ``thread.turn.start`` — carrying + ``modelSelection.instanceId == "claudeAgent"`` and ``runtimeMode == + "full-access"``, with ``ISSUE_IMPLEMENTER_PREAMBLE`` PREPENDED to + ``message.text`` and the thread id from the first response threaded into the + second. + * each request carries the ``Authorization: Bearer <token>`` header from the + injected bearer provider (re-read per call, so token refresh is honoured). + * ``snapshot`` GETs ``/api/orchestration/snapshot`` and returns the parsed body. +""" +import pytest + +from app.afk import t3_client +from app.afk.issue_implementer_prompt import ISSUE_IMPLEMENTER_PREAMBLE + + +# --------------------------------------------------------------------------- # +# Fake HTTP transport — httpx-shaped (``post``/``get`` → response with +# ``.json()`` + ``.raise_for_status()``), so the real client can hand the +# adapter a plain ``httpx.Client`` while tests hand it this recorder. +# --------------------------------------------------------------------------- # +class FakeResponse: + def __init__(self, payload: dict, status_code: int = 200) -> None: + self._payload = payload + self.status_code = status_code + + def json(self) -> dict: + return self._payload + + def raise_for_status(self) -> None: + if self.status_code >= 400: + raise RuntimeError(f"HTTP {self.status_code}") + + +class FakeHttp: + """Records each POST/GET and replays queued responses in order. + + ``post`` pops from ``post_responses`` (FIFO); ``get`` pops from + ``get_responses``. Each recorded call captures the url, json body, and + headers so tests can assert the two-command dispatch shape and the bearer. 
+ """ + + def __init__( + self, + post_responses: list[dict] | None = None, + get_responses: list[dict] | None = None, + ) -> None: + self.post_responses = list(post_responses or []) + self.get_responses = list(get_responses or []) + self.posts: list[dict] = [] + self.gets: list[dict] = [] + + def post(self, url: str, json: dict, headers: dict) -> FakeResponse: + self.posts.append({"url": url, "json": json, "headers": headers}) + if not self.post_responses: + raise AssertionError("unexpected POST — no response staged") + return FakeResponse(self.post_responses.pop(0)) + + def get(self, url: str, headers: dict) -> FakeResponse: + self.gets.append({"url": url, "headers": headers}) + if not self.get_responses: + raise AssertionError("unexpected GET — no response staged") + return FakeResponse(self.get_responses.pop(0)) + + +# Two thread.create / thread.turn.start replies the happy-path dispatch needs. +_CREATE_REPLY = {"threadId": "thread-abc"} +_TURN_REPLY = {"ok": True} + + +def _client(http: FakeHttp, *, base_url: str = "http://t3-afk:8080", token: str = "tok-1"): + return t3_client.T3Client( + base_url=base_url, + http=http, + bearer_provider=lambda: token, + ) + + +def _dispatch(http: FakeHttp, **kw) -> str: + repo = kw.pop("repo", "infra") + issue = kw.pop("issue", 42) + prompt = kw.pop("prompt", "Do the thing.") + return _client(http, **kw).dispatch(repo=repo, issue=issue, prompt=prompt) + + +# --------------------------------------------------------------------------- # +# dispatch — the two-POST shape. 
+# --------------------------------------------------------------------------- # +def test_dispatch_issues_exactly_two_posts_to_dispatch_endpoint(): + http = FakeHttp(post_responses=[_CREATE_REPLY, _TURN_REPLY]) + _dispatch(http) + assert len(http.posts) == 2 + assert http.gets == [] + for call in http.posts: + assert call["url"] == "http://t3-afk:8080/api/orchestration/dispatch" + + +def test_dispatch_first_command_is_thread_create(): + http = FakeHttp(post_responses=[_CREATE_REPLY, _TURN_REPLY]) + _dispatch(http) + assert http.posts[0]["json"]["command"] == "thread.create" + + +def test_dispatch_second_command_is_thread_turn_start(): + http = FakeHttp(post_responses=[_CREATE_REPLY, _TURN_REPLY]) + _dispatch(http) + assert http.posts[1]["json"]["command"] == "thread.turn.start" + + +def test_dispatch_returns_thread_id_from_create_response(): + http = FakeHttp(post_responses=[{"threadId": "thread-xyz"}, _TURN_REPLY]) + assert _dispatch(http) == "thread-xyz" + + +def test_dispatch_threads_created_id_into_turn_start(): + http = FakeHttp(post_responses=[{"threadId": "thread-xyz"}, _TURN_REPLY]) + _dispatch(http) + # The second command must target the thread the first call created. + assert http.posts[1]["json"]["threadId"] == "thread-xyz" + + +# --------------------------------------------------------------------------- # +# dispatch — model selection / runtime envelope (the pilot-baked constants). 
+# --------------------------------------------------------------------------- # +def test_dispatch_uses_claude_agent_instance_and_full_access_runtime(): + http = FakeHttp(post_responses=[_CREATE_REPLY, _TURN_REPLY]) + _dispatch(http) + create_body = http.posts[0]["json"] + assert create_body["modelSelection"]["instanceId"] == "claudeAgent" + assert create_body["runtimeMode"] == "full-access" + + +def test_dispatch_create_carries_repo_and_issue(): + http = FakeHttp(post_responses=[_CREATE_REPLY, _TURN_REPLY]) + _dispatch(http, repo="claude-agent-service", issue=7) + create_body = http.posts[0]["json"] + assert create_body["repo"] == "claude-agent-service" + assert create_body["issue"] == 7 + + +# --------------------------------------------------------------------------- # +# dispatch — the preamble PREPEND (behaviour injection). +# --------------------------------------------------------------------------- # +def test_dispatch_prepends_issue_implementer_preamble_to_message_text(): + http = FakeHttp(post_responses=[_CREATE_REPLY, _TURN_REPLY]) + _dispatch(http, prompt="Implement issue 42 body here.") + text = http.posts[1]["json"]["message"]["text"] + assert text == ISSUE_IMPLEMENTER_PREAMBLE + "Implement issue 42 body here." + + +def test_dispatch_preamble_comes_strictly_before_the_prompt(): + http = FakeHttp(post_responses=[_CREATE_REPLY, _TURN_REPLY]) + _dispatch(http, prompt="UNIQUE-PROMPT-MARKER") + text = http.posts[1]["json"]["message"]["text"] + assert text.startswith(ISSUE_IMPLEMENTER_PREAMBLE) + assert text.index(ISSUE_IMPLEMENTER_PREAMBLE) < text.index("UNIQUE-PROMPT-MARKER") + # The raw prompt is preserved verbatim after the preamble. + assert text.endswith("UNIQUE-PROMPT-MARKER") + + +def test_dispatch_does_not_prepend_preamble_to_create_command(): + # The preamble belongs only on the turn message, not the thread.create call. 
+ http = FakeHttp(post_responses=[_CREATE_REPLY, _TURN_REPLY]) + _dispatch(http) + assert "message" not in http.posts[0]["json"] + + +# --------------------------------------------------------------------------- # +# Auth — bearer header, read from the injected provider each call. +# --------------------------------------------------------------------------- # +def test_dispatch_sends_bearer_on_both_posts(): + http = FakeHttp(post_responses=[_CREATE_REPLY, _TURN_REPLY]) + _dispatch(http, token="secret-token") + for call in http.posts: + assert call["headers"]["Authorization"] == "Bearer secret-token" + + +def test_bearer_provider_is_called_per_request_so_refresh_is_honoured(): + # A rotating provider proves the token isn't captured once at construction + # (T3's orchestration token expires hourly and must be re-read). + tokens = iter(["tok-A", "tok-B", "tok-C"]) + http = FakeHttp(post_responses=[_CREATE_REPLY, _TURN_REPLY]) + client = t3_client.T3Client( + base_url="http://t3-afk:8080", + http=http, + bearer_provider=lambda: next(tokens), + ) + client.dispatch(repo="infra", issue=1, prompt="x") + assert http.posts[0]["headers"]["Authorization"] == "Bearer tok-A" + assert http.posts[1]["headers"]["Authorization"] == "Bearer tok-B" + + +# --------------------------------------------------------------------------- # +# snapshot — GET + parse. 
# --------------------------------------------------------------------------- #
def test_snapshot_gets_snapshot_endpoint_and_returns_parsed_body():
    """``snapshot()`` issues one GET to the snapshot URL and returns its body."""
    staged_fleet = {"threads": [{"id": "thread-abc", "status": "running"}]}
    transport = FakeHttp(get_responses=[staged_fleet])
    returned = _client(transport).snapshot()
    assert returned == staged_fleet
    assert len(transport.gets) == 1
    assert transport.gets[0]["url"] == "http://t3-afk:8080/api/orchestration/snapshot"
    # A snapshot is read-only: nothing may be POSTed.
    assert transport.posts == []


def test_snapshot_sends_bearer():
    """The snapshot GET carries the bearer token."""
    transport = FakeHttp(get_responses=[{"threads": []}])
    client = _client(transport, token="snap-token")
    client.snapshot()
    sent_headers = transport.gets[0]["headers"]
    assert sent_headers["Authorization"] == "Bearer snap-token"


# --------------------------------------------------------------------------- #
# base_url handling — a trailing slash must not produce a double slash.
# --------------------------------------------------------------------------- #
def test_trailing_slash_in_base_url_is_normalised():
    """A ``base_url`` ending in ``/`` yields the same endpoint URLs as one without."""
    transport = FakeHttp(
        post_responses=[_CREATE_REPLY, _TURN_REPLY],
        get_responses=[{"threads": []}],
    )
    client = _client(transport, base_url="http://t3-afk:8080/")
    client.dispatch(repo="infra", issue=1, prompt="x")
    client.snapshot()
    first_post_url = transport.posts[0]["url"]
    first_get_url = transport.gets[0]["url"]
    assert first_post_url == "http://t3-afk:8080/api/orchestration/dispatch"
    assert first_get_url == "http://t3-afk:8080/api/orchestration/snapshot"


# --------------------------------------------------------------------------- #
# Error surfacing — a non-2xx response must raise, not be swallowed.
+# --------------------------------------------------------------------------- # +def test_dispatch_raises_when_a_post_returns_an_error_status(): + class ErroringHttp(FakeHttp): + def post(self, url: str, json: dict, headers: dict) -> FakeResponse: + self.posts.append({"url": url, "json": json, "headers": headers}) + return FakeResponse({}, status_code=500) + + http = ErroringHttp() + with pytest.raises(RuntimeError): + _dispatch(http) + # It failed on the FIRST call — never blindly fired thread.turn.start after + # a failed thread.create. + assert len(http.posts) == 1 diff --git a/tests/test_afk_tracker.py b/tests/test_afk_tracker.py new file mode 100644 index 0000000..198cafb --- /dev/null +++ b/tests/test_afk_tracker.py @@ -0,0 +1,493 @@ +"""Tests for ``app.afk.tracker`` — the GitHub issues adapter. + +The ``Tracker`` is the loop's read/write port onto the issue tracker. It wraps +an injected GitHub client (the real one shells out to ``gh``; here we inject a +FAKE that records calls and replays staged data) and holds all the *business* +logic the loop depends on: turning raw issues into ``Issue`` records with +``blocked_by`` parsed, ``labeled_by_trusted`` decided fail-closed from the label +event actor, and ``priority`` read off a priority label. No test here reaches a +real ``gh``, GitHub/Forgejo, or the network. +""" +import pytest + +from app.afk.tracker import ( + DEFAULT_TRUSTED_ASSOCIATIONS, + GitHubClient, + Tracker, +) +from app.afk.types import Issue + + +# --------------------------------------------------------------------------- # +# Fake GitHub client — the injected port. Records every mutating call and +# replays issues / label-events staged per repo. Implements the GitHubClient +# Protocol the Tracker depends on. 
# --------------------------------------------------------------------------- #
class FakeGitHub:
    """In-memory GitHub client double.

    Reads replay whatever was staged through the ``seed_*`` helpers; every
    mutating call is appended to a public list so a test can assert on it.
    """

    def __init__(self) -> None:
        # Staged read data.
        # repo -> list of raw issue dicts (gh issue list --json shape)
        self._issues: dict[str, list[dict]] = {}
        # (repo, number) -> list of label-event dicts (who added which label)
        self._events: dict[tuple[str, int], list[dict]] = {}
        # Recorded mutations, in call order.
        self.labels_added: list[tuple[str, int, str]] = []
        self.labels_removed: list[tuple[str, int, str]] = []
        self.comments: list[tuple[str, int, str]] = []
        self.closed: list[tuple[str, int]] = []

    # --- staging helpers (test-only) --- #
    def seed_issues(self, repo: str, issues: list[dict]) -> None:
        """Stage ``issues`` as the full issue list of ``repo`` (replaces any prior)."""
        self._issues[repo] = issues

    def seed_label_events(self, repo: str, number: int, events: list[dict]) -> None:
        """Stage the label events of issue ``number`` in ``repo``."""
        self._events[(repo, number)] = events

    # --- GitHubClient surface --- #
    def list_issues(self, repo: str, label: str) -> list[dict]:
        """Return the staged issues of ``repo`` that carry ``label``."""
        matching: list[dict] = []
        for raw in self._issues.get(repo, []):
            names = [entry["name"] for entry in raw.get("labels", [])]
            if label in names:
                matching.append(raw)
        return matching

    def label_events(self, repo: str, number: int) -> list[dict]:
        """Return a fresh list of the events staged for the issue (empty if none)."""
        staged = self._events.get((repo, number), [])
        return [*staged]

    def add_label(self, repo: str, number: int, label: str) -> None:
        """Record an add-label call."""
        self.labels_added.append((repo, number, label))

    def remove_label(self, repo: str, number: int, label: str) -> None:
        """Record a remove-label call."""
        self.labels_removed.append((repo, number, label))

    def comment(self, repo: str, number: int, body: str) -> None:
        """Record a comment call."""
        self.comments.append((repo, number, body))

    def close(self, repo: str, number: int) -> None:
        """Record a close call."""
        self.closed.append((repo, number))


# --------------------------------------------------------------------------- #
# Raw-issue / event builders matching the gh JSON shapes the real client emits.
# --------------------------------------------------------------------------- #
def _raw_issue(
    number: int = 1,
    labels: list[str] | None = None,
    body: str = "",
) -> dict:
    """Build a raw issue dict with ``number``, ``labels`` and ``body`` keys.

    ``labels=None`` (the default) yields a single ``ready-for-agent`` label.
    Any explicit list is used as given — including an empty one, which builds
    an issue with no labels instead of silently falling back to the default.
    """
    label_names = ["ready-for-agent"] if labels is None else labels
    return {
        "number": number,
        "labels": [{"name": name} for name in label_names],
        "body": body,
    }


def _label_event(label: str, association: str = "OWNER", actor: str = "viktorbarzin") -> dict:
    """Build a "labeled" event dict for ``label`` applied by ``actor``."""
    # Mirrors the `gh api .../timeline` "labeled" event shape we care about.
    return {
        "event": "labeled",
        "label": {"name": label},
        "actor": {"login": actor},
        "author_association": association,
    }


@pytest.fixture
def gh() -> FakeGitHub:
    """A fresh ``FakeGitHub`` per test."""
    return FakeGitHub()


@pytest.fixture
def tracker(gh: FakeGitHub) -> Tracker:
    """A ``Tracker`` built with default settings around the ``gh`` fixture."""
    return Tracker(gh)


# --------------------------------------------------------------------------- #
# Construction / contract.
# --------------------------------------------------------------------------- #
def test_tracker_wraps_injected_client(gh: FakeGitHub):
    """The tracker exposes the very client object it was constructed with."""
    wrapped = Tracker(gh)
    assert wrapped.client is gh


def test_fake_satisfies_protocol(gh: FakeGitHub):
    """The fake passes an ``isinstance`` check against ``GitHubClient``."""
    # The fake must be usable where a GitHubClient is expected (structural typing).
    assert isinstance(gh, GitHubClient)


def test_default_trusted_associations_are_collaborator_or_above():
    """The default trust set is exactly OWNER, MEMBER and COLLABORATOR."""
    expected = frozenset({"OWNER", "MEMBER", "COLLABORATOR"})
    assert DEFAULT_TRUSTED_ASSOCIATIONS == expected


# --------------------------------------------------------------------------- #
# list_ready — the read path that builds Issue records.
# --------------------------------------------------------------------------- #
def test_list_ready_returns_issue_objects(gh: FakeGitHub, tracker: Tracker):
    """One staged ready issue comes back as one populated ``Issue``."""
    gh.seed_issues("infra", [_raw_issue(number=7)])
    gh.seed_label_events("infra", 7, [_label_event("ready-for-agent")])

    ready = tracker.list_ready(["infra"])

    assert len(ready) == 1
    first = ready[0]
    assert isinstance(first, Issue)
    assert first.number == 7
    assert first.repo == "infra"
    assert first.labels == ["ready-for-agent"]


def test_list_ready_spans_multiple_repos(gh: FakeGitHub, tracker: Tracker):
    """Issues from every listed repo are returned, each tagged with its repo."""
    gh.seed_issues("infra", [_raw_issue(number=1)])
    gh.seed_issues("crawler", [_raw_issue(number=2)])
    gh.seed_label_events("infra", 1, [_label_event("ready-for-agent")])
    gh.seed_label_events("crawler", 2, [_label_event("ready-for-agent")])

    ready = tracker.list_ready(["infra", "crawler"])

    found = {(entry.repo, entry.number) for entry in ready}
    assert found == {("infra", 1), ("crawler", 2)}


def test_list_ready_empty_when_no_ready_issues(gh: FakeGitHub, tracker: Tracker):
    """An issue without the ready label is not listed."""
    gh.seed_issues("infra", [_raw_issue(number=1, labels=["bug"])])
    ready = tracker.list_ready(["infra"])
    assert ready == []


def test_list_ready_queries_with_configured_ready_label(gh: FakeGitHub):
    """A custom ``ready_label`` is the label the client gets queried with."""
    # A Tracker built with a custom ready label must query the client for *that*
    # label, not the default.
    seen: dict[str, str] = {}

    class _RecordingGitHub(FakeGitHub):
        """Notes the label of each ``list_issues`` call before delegating."""

        def list_issues(self, repo: str, label: str) -> list[dict]:
            seen["label"] = label
            return super().list_issues(repo, label)

    recording = _RecordingGitHub()
    recording.seed_issues("infra", [_raw_issue(number=1, labels=["queue-me"])])
    recording.seed_label_events("infra", 1, [_label_event("queue-me")])
    custom = Tracker(recording, ready_label="queue-me")

    ready = custom.list_ready(["infra"])

    assert seen["label"] == "queue-me"
    assert len(ready) == 1


# --------------------------------------------------------------------------- #
# Trust gate — labeled_by_trusted is decided from the label-event actor,
# fail-closed.
# --------------------------------------------------------------------------- #
def test_owner_labeled_issue_is_trusted(gh: FakeGitHub, tracker: Tracker):
    """A ready label applied with OWNER association is trusted."""
    gh.seed_issues("infra", [_raw_issue(number=1)])
    gh.seed_label_events("infra", 1, [_label_event("ready-for-agent", association="OWNER")])

    first = tracker.list_ready(["infra"])[0]
    assert first.labeled_by_trusted is True


@pytest.mark.parametrize("association", ["MEMBER", "COLLABORATOR"])
def test_collaborator_and_member_are_trusted(gh: FakeGitHub, tracker: Tracker, association: str):
    """MEMBER and COLLABORATOR associations are trusted by default."""
    gh.seed_issues("infra", [_raw_issue(number=1)])
    gh.seed_label_events("infra", 1, [_label_event("ready-for-agent", association=association)])

    first = tracker.list_ready(["infra"])[0]
    assert first.labeled_by_trusted is True


@pytest.mark.parametrize("association", ["NONE", "CONTRIBUTOR", "FIRST_TIME_CONTRIBUTOR", ""])
def test_untrusted_association_is_not_trusted(gh: FakeGitHub, tracker: Tracker, association: str):
    """Associations outside the trust set (including empty) are not trusted."""
    gh.seed_issues("infra", [_raw_issue(number=1)])
    gh.seed_label_events("infra", 1, [_label_event("ready-for-agent", association=association)])

    first = tracker.list_ready(["infra"])[0]
    assert first.labeled_by_trusted is False


def test_missing_label_event_is_not_trusted(gh: FakeGitHub, tracker: Tracker):
    """With no label event at all, the issue is listed but not trusted."""
    # The issue carries the ready label, but no event records WHO applied it —
    # fail closed: an unattributable label is never trusted.
    gh.seed_issues("infra", [_raw_issue(number=1)])
    gh.seed_label_events("infra", 1, [])

    first = tracker.list_ready(["infra"])[0]
    assert first.labeled_by_trusted is False


def test_trust_uses_latest_application_of_ready_label(gh: FakeGitHub, tracker: Tracker):
    """The last ready-label event in the list decides trust."""
    # If the ready label was removed and re-added, the MOST RECENT application
    # decides trust — a trusted re-label after an untrusted one is trusted.
    history = [
        _label_event("ready-for-agent", association="NONE", actor="drive-by"),
        _label_event("ready-for-agent", association="OWNER", actor="viktorbarzin"),
    ]
    gh.seed_issues("infra", [_raw_issue(number=1)])
    gh.seed_label_events("infra", 1, history)

    first = tracker.list_ready(["infra"])[0]
    assert first.labeled_by_trusted is True


def test_trust_ignores_events_for_other_labels(gh: FakeGitHub, tracker: Tracker):
    """Only events for the ready label count towards trust."""
    # A trusted actor labeling something else must not make the ready label trusted.
    history = [
        _label_event("priority:high", association="OWNER"),
        _label_event("ready-for-agent", association="NONE", actor="drive-by"),
    ]
    gh.seed_issues("infra", [_raw_issue(number=1)])
    gh.seed_label_events("infra", 1, history)

    first = tracker.list_ready(["infra"])[0]
    assert first.labeled_by_trusted is False


def test_custom_trusted_associations_override_default(gh: FakeGitHub):
    """A caller-supplied trust set replaces the default one."""
    # Tighten the trust set to OWNER only: a COLLABORATOR label is no longer trusted.
    owner_only = Tracker(gh, trusted_associations=frozenset({"OWNER"}))
    gh.seed_issues("infra", [_raw_issue(number=1)])
    gh.seed_label_events("infra", 1, [_label_event("ready-for-agent", association="COLLABORATOR")])

    first = owner_only.list_ready(["infra"])[0]
    assert first.labeled_by_trusted is False


# --------------------------------------------------------------------------- #
# blocked_by — parsed from the issue body's "Blocked by" references.
# --------------------------------------------------------------------------- #
def test_blocked_by_empty_when_body_has_no_references(gh: FakeGitHub, tracker: Tracker):
    """A body with no "Blocked by" clause yields an empty blocker list."""
    gh.seed_issues("infra", [_raw_issue(number=1, body="just implement the thing")])
    gh.seed_label_events("infra", 1, [_label_event("ready-for-agent")])

    first = tracker.list_ready(["infra"])[0]
    assert first.blocked_by == []


def test_blocked_by_parses_single_reference(gh: FakeGitHub, tracker: Tracker):
    """``Blocked by #3`` yields ``[3]``."""
    gh.seed_issues("infra", [_raw_issue(number=5, body="Blocked by #3")])
    gh.seed_label_events("infra", 5, [_label_event("ready-for-agent")])

    first = tracker.list_ready(["infra"])[0]
    assert first.blocked_by == [3]


def test_blocked_by_parses_multiple_references(gh: FakeGitHub, tracker: Tracker):
    """Comma- and "and"-separated references are all picked up, in order."""
    gh.seed_issues("infra", [_raw_issue(number=9, body="Blocked by #3, #4 and #10")])
    gh.seed_label_events("infra", 9, [_label_event("ready-for-agent")])

    first = tracker.list_ready(["infra"])[0]
    assert first.blocked_by == [3, 4, 10]


def test_blocked_by_is_case_insensitive_and_dedupes(gh: FakeGitHub, tracker: Tracker):
    """Mixed-case clauses are recognised and a repeated number appears once."""
    gh.seed_issues("infra", [_raw_issue(number=9, body="blocked BY #3 and Blocked by #3, #4")])
    gh.seed_label_events("infra", 9, [_label_event("ready-for-agent")])

    first = tracker.list_ready(["infra"])[0]
    assert first.blocked_by == [3, 4]


def test_blocked_by_ignores_plain_issue_mentions(gh: FakeGitHub, tracker: Tracker):
    """Only references inside a "Blocked by" clause count as blockers."""
    # A bare "#7" that is not part of a "Blocked by" clause is NOT a blocker.
    gh.seed_issues("infra", [_raw_issue(number=9, body="See #7 for context. Blocked by #3")])
    gh.seed_label_events("infra", 9, [_label_event("ready-for-agent")])

    first = tracker.list_ready(["infra"])[0]
    assert first.blocked_by == [3]


def test_blocked_by_tolerates_missing_body(gh: FakeGitHub, tracker: Tracker):
    """A ``None`` body is handled and yields no blockers."""
    raw = _raw_issue(number=1)
    raw["body"] = None  # gh returns null for an empty body
    gh.seed_issues("infra", [raw])
    gh.seed_label_events("infra", 1, [_label_event("ready-for-agent")])

    first = tracker.list_ready(["infra"])[0]
    assert first.blocked_by == []


# --------------------------------------------------------------------------- #
# priority — read off a priority label (lower number runs first).
# --------------------------------------------------------------------------- #
def test_priority_defaults_to_zero_without_priority_label(gh: FakeGitHub, tracker: Tracker):
    """No ``priority:N`` label means priority 0."""
    gh.seed_issues("infra", [_raw_issue(number=1, labels=["ready-for-agent"])])
    gh.seed_label_events("infra", 1, [_label_event("ready-for-agent")])

    first = tracker.list_ready(["infra"])[0]
    assert first.priority == 0


def test_priority_read_from_priority_label(gh: FakeGitHub, tracker: Tracker):
    """``priority:2`` is read as priority 2."""
    gh.seed_issues("infra", [_raw_issue(number=1, labels=["ready-for-agent", "priority:2"])])
    gh.seed_label_events("infra", 1, [_label_event("ready-for-agent")])

    first = tracker.list_ready(["infra"])[0]
    assert first.priority == 2


def test_priority_lowest_label_wins_when_several(gh: FakeGitHub, tracker: Tracker):
    """With several priority labels the smallest number is used."""
    label_names = ["ready-for-agent", "priority:5", "priority:1"]
    gh.seed_issues("infra", [_raw_issue(number=1, labels=label_names)])
    gh.seed_label_events("infra", 1, [_label_event("ready-for-agent")])

    first = tracker.list_ready(["infra"])[0]
    assert first.priority == 1


def test_priority_ignores_non_numeric_priority_label(gh: FakeGitHub, tracker: Tracker):
    """A non-numeric ``priority:high`` label leaves the priority at 0."""
    label_names = ["ready-for-agent", "priority:high"]
    gh.seed_issues("infra", [_raw_issue(number=1, labels=label_names)])
    gh.seed_label_events("infra", 1, [_label_event("ready-for-agent")])

    first = tracker.list_ready(["infra"])[0]
    assert first.priority == 0
--------------------------------------------------------------------------- # +# Mutations delegate to the injected client. +# --------------------------------------------------------------------------- # +def test_add_label_delegates(gh: FakeGitHub, tracker: Tracker): + tracker.add_label("infra", 7, "agent-in-progress") + assert gh.labels_added == [("infra", 7, "agent-in-progress")] + + +def test_remove_label_delegates(gh: FakeGitHub, tracker: Tracker): + tracker.remove_label("infra", 7, "agent-in-progress") + assert gh.labels_removed == [("infra", 7, "agent-in-progress")] + + +def test_comment_delegates(gh: FakeGitHub, tracker: Tracker): + tracker.comment("infra", 7, "phase: tests-red done") + assert gh.comments == [("infra", 7, "phase: tests-red done")] + + +def test_close_delegates(gh: FakeGitHub, tracker: Tracker): + tracker.close("infra", 7) + assert gh.closed == [("infra", 7)] + + +# --------------------------------------------------------------------------- # +# The concrete gh-CLI-backed client builds no-shell argv and parses JSON; we +# inject a fake runner so no real `gh` is ever spawned. +# --------------------------------------------------------------------------- # +from app.afk.tracker import GhCliClient # noqa: E402 + + +class _FakeRunner: + """Stand-in for the subprocess runner GhCliClient shells out through. + + Records every argv and returns staged stdout per command, so we can pin the + exact `gh` invocations without spawning a process. 
+ """ + + def __init__(self, responses: dict[tuple[str, ...], str] | None = None) -> None: + self.calls: list[tuple[str, ...]] = [] + self._responses = responses or {} + + def __call__(self, argv: list[str]) -> str: + key = tuple(argv) + self.calls.append(key) + return self._responses.get(key, "") + + +def test_gh_cli_list_issues_builds_no_shell_argv_and_parses_json(): + argv = ( + "gh", "issue", "list", "--repo", "owner/infra", + "--label", "ready-for-agent", "--state", "open", + "--json", "number,labels,body", "--limit", "100", + ) + runner = _FakeRunner({argv: '[{"number": 4, "labels": [{"name": "ready-for-agent"}], "body": "x"}]'}) + client = GhCliClient(repo_owner="owner", run=runner) + + issues = client.list_issues("infra", "ready-for-agent") + + assert runner.calls == [argv] + assert issues == [{"number": 4, "labels": [{"name": "ready-for-agent"}], "body": "x"}] + + +def test_gh_cli_list_issues_empty_output_is_empty_list(): + runner = _FakeRunner() # returns "" for everything + client = GhCliClient(repo_owner="owner", run=runner) + assert client.list_issues("infra", "ready-for-agent") == [] + + +def test_gh_cli_label_events_filters_labeled_events(): + timeline = ( + '[{"event": "commented"},' + ' {"event": "labeled", "label": {"name": "ready-for-agent"},' + ' "actor": {"login": "viktorbarzin"}, "author_association": "OWNER"}]' + ) + argv = ( + "gh", "api", + "repos/owner/infra/issues/4/timeline", + "--paginate", + "-H", "Accept: application/vnd.github+json", + ) + runner = _FakeRunner({argv: timeline}) + client = GhCliClient(repo_owner="owner", run=runner) + + events = client.label_events("infra", 4) + + assert runner.calls == [argv] + assert [e["event"] for e in events] == ["labeled"] + assert events[0]["label"]["name"] == "ready-for-agent" + + +def test_gh_cli_add_label_builds_argv(): + runner = _FakeRunner() + client = GhCliClient(repo_owner="owner", run=runner) + client.add_label("infra", 4, "agent-in-progress") + assert runner.calls == [ + ("gh", 
"issue", "edit", "4", "--repo", "owner/infra", "--add-label", "agent-in-progress") + ] + + +def test_gh_cli_remove_label_builds_argv(): + runner = _FakeRunner() + client = GhCliClient(repo_owner="owner", run=runner) + client.remove_label("infra", 4, "agent-in-progress") + assert runner.calls == [ + ("gh", "issue", "edit", "4", "--repo", "owner/infra", "--remove-label", "agent-in-progress") + ] + + +def test_gh_cli_comment_builds_argv(): + runner = _FakeRunner() + client = GhCliClient(repo_owner="owner", run=runner) + client.comment("infra", 4, "phase update") + assert runner.calls == [ + ("gh", "issue", "comment", "4", "--repo", "owner/infra", "--body", "phase update") + ] + + +def test_gh_cli_close_builds_argv(): + runner = _FakeRunner() + client = GhCliClient(repo_owner="owner", run=runner) + client.close("infra", 4) + assert runner.calls == [ + ("gh", "issue", "close", "4", "--repo", "owner/infra") + ] + + +def test_gh_cli_end_to_end_through_tracker(): + # Wire the gh-CLI client (fake runner) behind a real Tracker and confirm a + # full read produces a correctly-decoded, trusted, blocked Issue. 
+ list_argv = ( + "gh", "issue", "list", "--repo", "owner/infra", + "--label", "ready-for-agent", "--state", "open", + "--json", "number,labels,body", "--limit", "100", + ) + timeline_argv = ( + "gh", "api", + "repos/owner/infra/issues/12/timeline", + "--paginate", + "-H", "Accept: application/vnd.github+json", + ) + runner = _FakeRunner({ + list_argv: ( + '[{"number": 12,' + ' "labels": [{"name": "ready-for-agent"}, {"name": "priority:3"}],' + ' "body": "Blocked by #11"}]' + ), + timeline_argv: ( + '[{"event": "labeled", "label": {"name": "ready-for-agent"},' + ' "actor": {"login": "viktorbarzin"}, "author_association": "OWNER"}]' + ), + }) + tracker = Tracker(GhCliClient(repo_owner="owner", run=runner)) + + issue = tracker.list_ready(["infra"])[0] + + assert issue.number == 12 + assert issue.repo == "infra" + assert issue.blocked_by == [11] + assert issue.priority == 3 + assert issue.labeled_by_trusted is True diff --git a/tests/test_afk_watcher.py b/tests/test_afk_watcher.py new file mode 100644 index 0000000..052a58c --- /dev/null +++ b/tests/test_afk_watcher.py @@ -0,0 +1,349 @@ +"""Integration tests for ``app.afk.watcher`` — the in-flight run driver. + +These wire the REAL pure cores (the actual ``run_state_machine.next_action`` and +``phase_checklist.render``) to the in-memory adapter FAKES from ``conftest`` +(``FakeT3Client`` / ``FakeTracker`` / ``FakeCIWatcher`` / ``FakeNotifier``). No +test touches a real T3 server, GitHub/Forgejo, the cluster, or Slack — the +watcher is exercised end to end with fakes only at the I/O edges. 
+ +What one watch tick must do (the watcher contract), given an in-flight run +``(issue, thread_id, commit, bookkeeping)``: + + * assemble a ``RunState`` from ``t3_client.snapshot()`` (the thread's liveness) + + ``ci_watcher.status(repo, commit)`` (the CI verdict, only when something is + pushed) + the run's own ``pushed`` / ``fix_forward_attempts`` / + ``elapsed_seconds`` bookkeeping, and feed it to the pure state machine; + * **CLOSE_SUCCESS** → ``tracker.close``, drop the in-progress label, post the + DONE checklist, and ring the ``done`` doorbell; + * **ESCALATE_PREPUSH / FREEZE_ESCALATE** → drop the in-progress label, relabel + ``ready-for-human``, ring the ``needs-human`` / ``frozen`` doorbell, post the + checklist — the run is handed back to a human; + * **FIX_FORWARD** → dispatch a corrective turn (``t3_client.dispatch``), bump + the fix-forward attempt count, keep the run in flight, refresh the checklist; + NOT terminal, so no doorbell and no label churn; + * **WAIT** → just refresh the progress checklist and keep waiting; no labels, + no close, no doorbell, no dispatch. +""" +import pytest + +from app.afk import watcher +from app.afk.notifier import KIND_DONE, KIND_FROZEN, KIND_NEEDS_HUMAN +from app.afk.types import CIStatus, Issue + + +# --------------------------------------------------------------------------- # +# Helpers. 
# --------------------------------------------------------------------------- #
READY_FOR_HUMAN = "ready-for-human"


def _watcher(fake_t3, fake_tracker, fake_ci, fake_notifier) -> watcher.Watcher:
    """Build a ``Watcher`` over the four fakes, leaving other settings at default."""
    adapters = {
        "t3_client": fake_t3,
        "tracker": fake_tracker,
        "ci_watcher": fake_ci,
        "notifier": fake_notifier,
    }
    return watcher.Watcher(**adapters)


def _run(
    issue: Issue,
    thread_id: str = "thread-0",
    commit: str | None = None,
    fix_forward_attempts: int = 0,
    elapsed_seconds: float = 0.0,
) -> watcher.InFlightRun:
    """Build an ``InFlightRun`` for ``issue``; ``commit=None`` is the default."""
    return watcher.InFlightRun(
        issue=issue,
        thread_id=thread_id,
        commit=commit,
        fix_forward_attempts=fix_forward_attempts,
        elapsed_seconds=elapsed_seconds,
    )


def _snapshot(thread_id: str, status: str) -> dict:
    """Build a snapshot dict holding a single thread entry."""
    entry = {"id": thread_id, "status": status}
    return {"threads": [entry]}


def _labels(fake_tracker):
    """Return the tracker's recorded label operations as a list of 4-tuples."""
    recorded = []
    for op, repo, num, lbl in fake_tracker.label_ops:
        recorded.append((op, repo, num, lbl))
    return recorded


def _kinds(fake_notifier):
    """Return the ``kind`` of every notification sent, in order."""
    kinds = []
    for sent in fake_notifier.sent:
        kinds.append(sent["kind"])
    return kinds


# --------------------------------------------------------------------------- #
# WAIT — agent still working, nothing pushed: refresh the checklist, no action.
# --------------------------------------------------------------------------- #
def test_wait_refreshes_checklist_and_does_nothing_else(
    fake_t3, fake_tracker, fake_ci, fake_notifier, make_issue, make_config
):
    """A running, unpushed run only gets its checklist comment refreshed."""
    issue = make_issue(number=7, repo="infra")
    fake_t3.set_snapshot(_snapshot("thread-0", "running"))

    driver = _watcher(fake_t3, fake_tracker, fake_ci, fake_notifier)
    outcome = driver.tick(_run(issue), make_config())

    assert outcome.action.value == "wait"
    assert outcome.terminal is False
    assert fake_tracker.closed == []
    assert _labels(fake_tracker) == []  # no label churn while waiting
    assert fake_notifier.sent == []  # no doorbell
    assert fake_t3.dispatched == []  # no corrective turn
    # The progress checklist was posted as a comment.
    assert len(fake_tracker.comments) == 1
    repo, num, body = fake_tracker.comments[0]
    assert (repo, num) == ("infra", 7)
    assert "AFK run progress" in body


def test_wait_when_thread_missing_from_snapshot(
    fake_t3, fake_tracker, fake_ci, fake_notifier, make_issue, make_config
):
    """An empty snapshot results in a non-terminal WAIT."""
    # No snapshot entry for this thread yet -> thread_status None -> WAIT.
    issue = make_issue(number=7, repo="infra")
    fake_t3.set_snapshot({"threads": []})
    driver = _watcher(fake_t3, fake_tracker, fake_ci, fake_notifier)
    outcome = driver.tick(_run(issue), make_config())
    assert outcome.action.value == "wait"
    assert outcome.terminal is False


def test_pushed_ci_pending_waits(
    fake_t3, fake_tracker, fake_ci, fake_notifier, make_issue, make_config
):
    """A pushed commit whose CI is still PENDING keeps the run waiting."""
    issue = make_issue(number=7, repo="infra")
    fake_t3.set_snapshot(_snapshot("thread-0", "running"))
    # commit present (pushed) but CI not yet decided -> PENDING -> WAIT.
    fake_ci.set_status("infra", "deadbeef", CIStatus.PENDING)
    driver = _watcher(fake_t3, fake_tracker, fake_ci, fake_notifier)
    outcome = driver.tick(_run(issue, commit="deadbeef"), make_config())
    assert outcome.action.value == "wait"
    assert fake_tracker.closed == []


# --------------------------------------------------------------------------- #
# CLOSE_SUCCESS — pushed + CI green: close, unlabel, DONE checklist, doorbell.
# --------------------------------------------------------------------------- #
def test_close_success_closes_and_unlabels_and_notifies(
    fake_t3, fake_tracker, fake_ci, fake_notifier, make_issue, make_config
):
    """Pushed + green CI closes the issue, drops the lock and rings ``done``."""
    issue = make_issue(number=7, repo="infra")
    fake_t3.set_snapshot(_snapshot("thread-0", "idle"))
    fake_ci.set_status("infra", "cafef00d", CIStatus.GREEN)

    driver = _watcher(fake_t3, fake_tracker, fake_ci, fake_notifier)
    outcome = driver.tick(_run(issue, commit="cafef00d"), make_config())

    assert outcome.action.value == "close_success"
    assert outcome.terminal is True
    assert fake_tracker.closed == [("infra", 7)]
    # in-progress label removed (no ready-for-human on the happy path).
    assert ("remove", "infra", 7, "agent-in-progress") in _labels(fake_tracker)
    assert ("add", "infra", 7, READY_FOR_HUMAN) not in _labels(fake_tracker)
    # done doorbell fired with the thread deep-link target.
    assert _kinds(fake_notifier) == [KIND_DONE]
    doorbell = fake_notifier.sent[0]
    assert doorbell["thread_id"] == "thread-0"
    assert doorbell["issue"] is issue


def test_close_success_posts_done_checklist(
    fake_t3, fake_tracker, fake_ci, fake_notifier, make_issue, make_config
):
    """The last comment on a successful close is a fully-checked checklist."""
    issue = make_issue(number=7, repo="infra")
    fake_t3.set_snapshot(_snapshot("thread-0", "idle"))
    fake_ci.set_status("infra", "cafef00d", CIStatus.GREEN)

    driver = _watcher(fake_t3, fake_tracker, fake_ci, fake_notifier)
    driver.tick(_run(issue, commit="cafef00d"), make_config())

    # The final checklist shows the run DONE — every phase checked.
    final_comment = fake_tracker.comments[-1]
    body = final_comment[2]
    assert "Done — issue closed" in body
    assert "- [ ]" not in body  # nothing left unchecked at DONE


# --------------------------------------------------------------------------- #
# ESCALATE_PREPUSH — agent stalled/errored before any push: hand to a human.
# --------------------------------------------------------------------------- #
@pytest.mark.parametrize("thread_state", ["error", "idle"])
def test_escalate_prepush_relabels_and_notifies(
    fake_t3, fake_tracker, fake_ci, fake_notifier, make_issue, make_config, thread_state
):
    """An errored or idle thread with nothing pushed is handed to a human."""
    issue = make_issue(number=7, repo="infra")
    fake_t3.set_snapshot(_snapshot("thread-0", thread_state))

    driver = _watcher(fake_t3, fake_tracker, fake_ci, fake_notifier)
    outcome = driver.tick(_run(issue, commit=None), make_config())

    assert outcome.action.value == "escalate_prepush"
    assert outcome.terminal is True
    assert fake_tracker.closed == []  # NOT closed — needs a human
    label_ops = _labels(fake_tracker)
    assert ("remove", "infra", 7, "agent-in-progress") in label_ops
    assert ("add", "infra", 7, READY_FOR_HUMAN) in label_ops
    assert _kinds(fake_notifier) == [KIND_NEEDS_HUMAN]


# --------------------------------------------------------------------------- #
# FREEZE_ESCALATE — pushed, CI red, fix-forward budget exhausted: freeze + page.
# --------------------------------------------------------------------------- #
def test_freeze_escalate_relabels_and_notifies(
    fake_t3, fake_tracker, fake_ci, fake_notifier, make_issue, make_config
):
    """Red CI with the attempt count at the cap freezes the run and pages."""
    issue = make_issue(number=7, repo="infra")
    fake_t3.set_snapshot(_snapshot("thread-0", "idle"))
    fake_ci.set_status("infra", "badc0de", CIStatus.RED)
    config = make_config(fix_forward_max_attempts=3)

    # attempts already at the cap -> budget exhausted -> FREEZE_ESCALATE.
    driver = _watcher(fake_t3, fake_tracker, fake_ci, fake_notifier)
    outcome = driver.tick(_run(issue, commit="badc0de", fix_forward_attempts=3), config)

    assert outcome.action.value == "freeze_escalate"
    assert outcome.terminal is True
    assert fake_tracker.closed == []
    label_ops = _labels(fake_tracker)
    assert ("remove", "infra", 7, "agent-in-progress") in label_ops
    assert ("add", "infra", 7, READY_FOR_HUMAN) in label_ops
    assert _kinds(fake_notifier) == [KIND_FROZEN]


# --------------------------------------------------------------------------- #
# FIX_FORWARD — pushed, CI red, budget remaining: corrective turn, stay in flight.
# --------------------------------------------------------------------------- #
def test_fix_forward_dispatches_corrective_turn(
    fake_t3, fake_tracker, fake_ci, fake_notifier, make_issue, make_config
):
    """Red CI with budget left dispatches one corrective turn and stays in flight."""
    issue = make_issue(number=7, repo="infra")
    fake_t3.set_snapshot(_snapshot("thread-0", "idle"))
    fake_ci.set_status("infra", "badc0de", CIStatus.RED)
    config = make_config(fix_forward_max_attempts=5)

    driver = _watcher(fake_t3, fake_tracker, fake_ci, fake_notifier)
    outcome = driver.tick(_run(issue, commit="badc0de", fix_forward_attempts=1), config)

    assert outcome.action.value == "fix_forward"
    assert outcome.terminal is False
    # A corrective turn was dispatched against the same repo/issue.
    assert len(fake_t3.dispatched) == 1
    corrective = fake_t3.dispatched[0]
    assert (corrective["repo"], corrective["issue"]) == ("infra", 7)
    # Attempt count advanced and is surfaced on the result for the caller's
    # bookkeeping on the next tick.
    assert outcome.fix_forward_attempts == 2
    # Not terminal: no close, no ready-for-human, no doorbell.
    assert fake_tracker.closed == []
    assert ("add", "infra", 7, READY_FOR_HUMAN) not in _labels(fake_tracker)
    assert fake_notifier.sent == []


def test_fix_forward_updates_thread_id_to_corrective_turn(
    fake_t3, fake_tracker, fake_ci, fake_notifier, make_issue, make_config
):
    """The result carries the thread id returned by the corrective dispatch."""
    # The corrective dispatch spawns a new thread; the result carries the new id
    # so the next tick polls the right thread.
    issue = make_issue(number=7, repo="infra")
    # NOTE(review): the snapshot is staged for "thread-0" while the run below
    # polls "thread-old", so the polled thread is absent from the snapshot —
    # presumably intentional; confirm.
    fake_t3.set_snapshot(_snapshot("thread-0", "idle"))
    fake_ci.set_status("infra", "badc0de", CIStatus.RED)
    driver = _watcher(fake_t3, fake_tracker, fake_ci, fake_notifier)
    outcome = driver.tick(
        _run(issue, thread_id="thread-old", commit="badc0de"), make_config()
    )
    assert outcome.thread_id == "thread-0"  # FakeT3Client hands back thread-0
    assert outcome.thread_id != "thread-old"


def test_fix_forward_note_appears_in_checklist(
    fake_t3, fake_tracker, fake_ci, fake_notifier, make_issue, make_config
):
    """The checklist comment posted on a fix-forward tick mentions it."""
    issue = make_issue(number=7, repo="infra")
    fake_t3.set_snapshot(_snapshot("thread-0", "idle"))
    fake_ci.set_status("infra", "badc0de", CIStatus.RED)
    driver = _watcher(fake_t3, fake_tracker, fake_ci, fake_notifier)
    driver.tick(_run(issue, commit="badc0de", fix_forward_attempts=1), make_config())
    latest_body = fake_tracker.comments[-1][2]
    assert "Fix-forward" in latest_body


# --------------------------------------------------------------------------- #
# Unknown / unrecognised thread status folds to "keep waiting" (fail-safe).
# --------------------------------------------------------------------------- #
def test_unknown_thread_status_waits(
    fake_t3, fake_tracker, fake_ci, fake_notifier, make_issue, make_config
):
    """A status string the watcher does not recognise results in WAIT."""
    issue = make_issue(number=7, repo="infra")
    fake_t3.set_snapshot(_snapshot("thread-0", "provisioning"))  # not a known status
    driver = _watcher(fake_t3, fake_tracker, fake_ci, fake_notifier)
    outcome = driver.tick(_run(issue, commit=None), make_config())
    # Unknown status must not escalate or close — treat as "no status yet".
    assert outcome.action.value == "wait"
    assert fake_tracker.closed == []
    assert fake_notifier.sent == []


# --------------------------------------------------------------------------- #
# Terminal cleanup only happens once / cleanly: a terminal tick posts exactly
# one checklist comment (no double-commenting on the way out).
# --------------------------------------------------------------------------- #
def test_terminal_tick_posts_exactly_one_checklist(
    fake_t3, fake_tracker, fake_ci, fake_notifier, make_issue, make_config
):
    """A successful terminal tick leaves exactly one comment behind."""
    issue = make_issue(number=7, repo="infra")
    fake_t3.set_snapshot(_snapshot("thread-0", "idle"))
    fake_ci.set_status("infra", "cafef00d", CIStatus.GREEN)
    driver = _watcher(fake_t3, fake_tracker, fake_ci, fake_notifier)
    driver.tick(_run(issue, commit="cafef00d"), make_config())
    assert len(fake_tracker.comments) == 1


# --------------------------------------------------------------------------- #
# CI status is only queried when something is pushed (don't hit CI for an
# unpushed run — there's no commit to check).
# --------------------------------------------------------------------------- #
def test_ci_not_queried_when_nothing_pushed(
    fake_t3, fake_tracker, fake_notifier, make_issue, make_config
):
    """With ``commit=None`` the CI adapter's ``status`` is never called."""

    class ExplodingCI:
        """CI double whose ``status`` fails the test if it is ever invoked."""

        def status(self, repo, commit):
            raise AssertionError("CI must not be queried with no pushed commit")

    issue = make_issue(number=7, repo="infra")
    fake_t3.set_snapshot(_snapshot("thread-0", "running"))
    driver = watcher.Watcher(
        t3_client=fake_t3,
        tracker=fake_tracker,
        ci_watcher=ExplodingCI(),
        notifier=fake_notifier,
    )
    outcome = driver.tick(_run(issue, commit=None), make_config())
    assert outcome.action.value == "wait"


# --------------------------------------------------------------------------- #
# ready-for-human label is configurable.
# --------------------------------------------------------------------------- #
def test_ready_for_human_label_is_configurable(
    fake_t3, fake_tracker, fake_ci, fake_notifier, make_issue, make_config
):
    """A Watcher built with a custom ready_for_human_label adds that label.

    The thread snapshot reports "error" and the run has no commit; the label
    operations recorded on the fake tracker must then include an "add" of
    "needs-eyes" for issue 7 in the "infra" repo.
    """
    tracked_issue = make_issue(number=7, repo="infra")
    fake_t3.set_snapshot(_snapshot("thread-0", "error"))

    custom_watcher = watcher.Watcher(
        t3_client=fake_t3,
        tracker=fake_tracker,
        ci_watcher=fake_ci,
        notifier=fake_notifier,
        ready_for_human_label="needs-eyes",
    )
    errored_run = _run(tracked_issue, commit=None)
    custom_watcher.tick(errored_run, make_config())

    expected_label_op = ("add", "infra", 7, "needs-eyes")
    assert expected_label_op in _labels(fake_tracker)