afk: add the autonomous issue-implementer loop (SHIPS DISABLED)

Adds app/afk/ — the "away-from-keyboard" control plane that watches the
issue tracker for ready-for-agent issues, dispatches each to a fresh
full-access T3 thread (with the issue-implementer preamble prepended,
because T3 does not honour ~/.claude/CLAUDE.md), and drives the resulting
run through its lifecycle: tests-red -> green -> pushed -> CI -> deployed,
escalating or fix-forwarding via a small pure state machine.

The loop is split into pure cores (no I/O, exhaustively unit-tested) and
thin injected adapters (the only edges that ever touch T3, the tracker,
CI, or Slack — faked in every test, so nothing here talks to a real
server, GitHub/Forgejo, or the cluster):

  pure:     types, dispatch_policy, run_state_machine, phase_checklist,
            config, issue_implementer_prompt
  adapters: t3_client (two-POST dispatch + snapshot), tracker, ci_watcher,
            notifier
  loops:    poller  — CronJob tick #1: list_ready -> select_dispatchable
                      -> dispatch + stamp the in-progress lock (label only
                      AFTER a successful dispatch, so a failed dispatch
                      never leaves a phantom lock). Per-repo lock derived
                      from the ready set, since the CronJob is stateless
                      between ticks.
            watcher — CronJob tick #2: assemble RunState from snapshot +
                      CI -> next_action -> act (close on success; relabel
                      ready-for-human + ring the doorbell on the two
                      escalations; dispatch a corrective turn on
                      fix-forward; refresh the progress checklist).

SHIPS DISABLED, on purpose: Config defaults to kill_switch=True AND an
empty allowlist, so a freshly-loaded config dispatches nothing and does
zero I/O. The package is not imported by the running service and has no
auto-enable path. Arming it is a deliberate, later, manual step requiring
BOTH gates (clear the kill switch AND enrol the exact repos) so one
fat-fingered env var can't arm every repo.

Test-first throughout: 412 tests pass (poller + watcher add integration
tests wiring the real pure cores to in-memory fakes). mypy clean.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Viktor Barzin 2026-06-15 21:15:11 +00:00
parent 171857da6b
commit 2ef0db9a96
23 changed files with 4717 additions and 0 deletions

View file

@ -43,3 +43,186 @@ def drain():
break
await asyncio.sleep(0.01)
return _drain
# --------------------------------------------------------------------------- #
# AFK loop fixtures.
#
# Shared factories + in-memory fakes for the app.afk modules. EVERYTHING the AFK
# tests touch is faked here — no test ever reaches a real T3 server, GitHub /
# Forgejo, or the cluster. The fakes implement the module interfaces from the
# contract and record their calls so tests can assert on them.
# --------------------------------------------------------------------------- #
from app.afk.types import ( # noqa: E402 (after the env setup above, like app_main)
CIStatus,
Config,
Issue,
RunState,
ThreadStatus,
)
@pytest.fixture
def make_issue():
    """Return a factory that builds ``Issue`` objects for tests.

    Called with no arguments, the factory yields issue #1 in ``infra`` carrying
    the ``ready-for-agent`` label, labelled by a trusted user, with no blockers
    and priority 0. Every field can be overridden per call.
    """

    def _make(
        number: int = 1,
        repo: str = "infra",
        labels: list[str] | None = None,
        blocked_by: list[int] | None = None,
        labeled_by_trusted: bool = True,
        priority: int = 0,
    ) -> Issue:
        # ``None`` means "use the default"; the lists are created per call so
        # two issues never share one mutable default.
        if labels is None:
            labels = ["ready-for-agent"]
        if blocked_by is None:
            blocked_by = []
        return Issue(
            number=number,
            repo=repo,
            labels=labels,
            blocked_by=blocked_by,
            labeled_by_trusted=labeled_by_trusted,
            priority=priority,
        )

    return _make
@pytest.fixture
def make_config():
    """Return a factory that builds ``Config`` objects for tests.

    The factory defaults to an ENABLED config (kill switch off, allowlist
    ``["infra"]``) so policy/state-machine tests exercise real behaviour; the
    disabled production default is covered separately in the config tests.
    Extra keyword arguments are forwarded to ``Config`` unchanged.
    """

    def _make(
        allowlist: list[str] | None = None,
        kill_switch: bool = False,
        **overrides,
    ) -> Config:
        # Only ``None`` triggers the one-repo default; an explicit ``[]`` is
        # passed through as-is.
        if allowlist is None:
            allowlist = ["infra"]
        return Config(allowlist=allowlist, kill_switch=kill_switch, **overrides)

    return _make
@pytest.fixture
def make_run_state():
    """Return a factory that builds ``RunState`` objects for tests.

    The defaults describe a freshly-dispatched run: thread running, nothing
    pushed, no CI status, zero fix-forward attempts, zero elapsed seconds.
    """

    def _make(
        thread_status: ThreadStatus | None = ThreadStatus.RUNNING,
        ci_status: CIStatus | None = None,
        pushed: bool = False,
        fix_forward_attempts: int = 0,
        elapsed_seconds: float = 0.0,
    ) -> RunState:
        fields = {
            "thread_status": thread_status,
            "ci_status": ci_status,
            "pushed": pushed,
            "fix_forward_attempts": fix_forward_attempts,
            "elapsed_seconds": elapsed_seconds,
        }
        return RunState(**fields)

    return _make
class FakeT3Client:
    """In-memory stand-in for ``t3_client.T3Client``.

    Every ``dispatch`` call is appended to ``dispatched`` and answered with a
    deterministic id (``thread-0``, ``thread-1``, ...). ``snapshot`` returns
    whatever was last staged through ``set_snapshot``.
    """

    def __init__(self) -> None:
        self.dispatched: list[dict] = []
        self._snapshot: dict = {"threads": []}
        self._next_id = 0

    def dispatch(self, repo: str, issue: int, prompt: str) -> str:
        """Record the dispatch and return the newly allocated thread id."""
        allocated = f"thread-{self._next_id}"
        self._next_id += 1
        record = {
            "repo": repo,
            "issue": issue,
            "prompt": prompt,
            "thread_id": allocated,
        }
        self.dispatched.append(record)
        return allocated

    def snapshot(self) -> dict:
        """Return the currently staged snapshot (the same object, not a copy)."""
        return self._snapshot

    def set_snapshot(self, snapshot: dict) -> None:
        """Stage the snapshot that later ``snapshot`` calls will return."""
        self._snapshot = snapshot
class FakeTracker:
    """In-memory stand-in for ``tracker.Tracker``.

    ``list_ready`` serves the issues staged via ``seed``; the label, comment
    and close methods only record their calls for later assertions.
    """

    def __init__(self) -> None:
        self._ready: dict[str, list[Issue]] = {}
        # Each entry is (op, repo, issue, label) with op "add" or "remove".
        self.label_ops: list[tuple[str, str, int, str]] = []
        self.comments: list[tuple[str, int, str]] = []
        self.closed: list[tuple[str, int]] = []

    def seed(self, repo: str, issues: list[Issue]) -> None:
        """Stage ``issues`` as the ready set for ``repo`` (replaces any prior set)."""
        self._ready[repo] = issues

    def list_ready(self, repos: list[str]) -> list[Issue]:
        """Return the staged issues of ``repos``, concatenated in the given repo order."""
        return [
            issue
            for repo in repos
            for issue in self._ready.get(repo, [])
        ]

    def add_label(self, repo: str, issue: int, label: str) -> None:
        """Record an "add" label operation."""
        entry = ("add", repo, issue, label)
        self.label_ops.append(entry)

    def remove_label(self, repo: str, issue: int, label: str) -> None:
        """Record a "remove" label operation."""
        entry = ("remove", repo, issue, label)
        self.label_ops.append(entry)

    def comment(self, repo: str, issue: int, body: str) -> None:
        """Record a comment as ``(repo, issue, body)``."""
        entry = (repo, issue, body)
        self.comments.append(entry)

    def close(self, repo: str, issue: int) -> None:
        """Record a close as ``(repo, issue)``."""
        entry = (repo, issue)
        self.closed.append(entry)
class FakeCIWatcher:
    """In-memory stand-in for ``ci_watcher.CIWatcher``.

    Statuses are staged per ``(repo, commit)`` through ``set_status``; a pair
    that was never staged reads as ``CIStatus.PENDING``.
    """

    def __init__(self) -> None:
        self._statuses: dict[tuple[str, str], CIStatus] = {}

    def set_status(self, repo: str, commit: str, status: CIStatus) -> None:
        """Stage ``status`` for the given ``(repo, commit)`` pair."""
        key = (repo, commit)
        self._statuses[key] = status

    def status(self, repo: str, commit: str) -> CIStatus:
        """Return the staged status, or ``PENDING`` when nothing was staged."""
        key = (repo, commit)
        return self._statuses.get(key, CIStatus.PENDING)
class FakeNotifier:
    """In-memory stand-in for ``notifier.Notifier``.

    Each notification is appended to ``sent`` as a dict, so tests can assert
    that escalations fired with the right kind/detail.
    """

    def __init__(self) -> None:
        self.sent: list[dict] = []

    def notify(self, kind: str, issue: Issue, thread_id: str | None, detail: str) -> None:
        """Record one notification; nothing is actually delivered."""
        payload = {
            "kind": kind,
            "issue": issue,
            "thread_id": thread_id,
            "detail": detail,
        }
        self.sent.append(payload)
@pytest.fixture
def fake_t3() -> FakeT3Client:
    """Provide a fresh ``FakeT3Client`` for each test."""
    client = FakeT3Client()
    return client
@pytest.fixture
def fake_tracker() -> FakeTracker:
    """Provide a fresh ``FakeTracker`` for each test."""
    tracker = FakeTracker()
    return tracker
@pytest.fixture
def fake_ci() -> FakeCIWatcher:
    """Provide a fresh ``FakeCIWatcher`` for each test."""
    ci = FakeCIWatcher()
    return ci
@pytest.fixture
def fake_notifier() -> FakeNotifier:
    """Provide a fresh ``FakeNotifier`` for each test."""
    notifier = FakeNotifier()
    return notifier

View file

@ -0,0 +1,285 @@
"""Tests for ``app.afk.ci_watcher`` — the commit → ``CIStatus`` adapter.
The watcher folds two independent signals into one verdict the state machine
reads: the **GHA run** for a pushed commit (build/test/lint) and the
**deploy/rollout** that reaches the cluster (Woodpecker pipeline → Keel/k8s
rollout). The CI/CD chain is GHA → ghcr → Woodpecker → Keel
(``docs/2026-06-14-afk-implementation-pipeline-design.md``), so a commit is only
truly GREEN once *both* the build passed AND its image actually rolled out.
Every test injects FAKE clients — no test ever shells out to ``gh``,
``woodpecker``, or ``kubectl``, or reaches the network. The fakes implement the
``ci_watcher`` client Protocols and return staged ``StageResult`` values per
``(repo, commit)``; the watcher's only job is to query them and fold the result,
so the folding table is what these tests pin.
"""
import pytest
from app.afk.ci_watcher import (
CIWatcher,
StageResult,
)
from app.afk.types import CIStatus
# --------------------------------------------------------------------------- #
# Fakes for the three injected clients.
#
# Each maps (repo, commit) → StageResult and records every query, so tests can
# assert both the folded verdict AND that short-circuiting skips later stages
# (a RED build must not even ask the rollout client).
# --------------------------------------------------------------------------- #
class _FakeStageClient:
    """A recording stand-in for any of the three stage clients.

    ``default`` is returned for an unstaged ``(repo, commit)`` — it defaults to
    ``PENDING`` so an un-seeded stage reads "not done yet", never a false GREEN.
    Every lookup is appended to ``queries``.
    """

    def __init__(self, default: StageResult = StageResult.PENDING) -> None:
        self._default = default
        self._results: dict[tuple[str, str], StageResult] = {}
        self.queries: list[tuple[str, str]] = []

    def set(self, repo: str, commit: str, result: StageResult) -> None:
        """Stage ``result`` for the given ``(repo, commit)`` pair."""
        key = (repo, commit)
        self._results[key] = result

    def _lookup(self, repo: str, commit: str) -> StageResult:
        """Record the query, then return the staged result or the default."""
        key = (repo, commit)
        self.queries.append(key)
        return self._results.get(key, self._default)
class FakeGitHubChecks(_FakeStageClient):
    """Fake build-stage client: ``run_conclusion`` is a recorded lookup."""

    def run_conclusion(self, repo: str, commit: str) -> StageResult:
        conclusion = self._lookup(repo, commit)
        return conclusion
class FakeWoodpecker(_FakeStageClient):
    """Fake deploy-stage client: ``deploy_conclusion`` is a recorded lookup."""

    def deploy_conclusion(self, repo: str, commit: str) -> StageResult:
        conclusion = self._lookup(repo, commit)
        return conclusion
class FakeRollout(_FakeStageClient):
    """Fake rollout-stage client: ``rollout_status`` is a recorded lookup."""

    def rollout_status(self, repo: str, commit: str) -> StageResult:
        outcome = self._lookup(repo, commit)
        return outcome
# --------------------------------------------------------------------------- #
# Fixtures.
# --------------------------------------------------------------------------- #
# Canonical (repo, commit) pair that most tests in this module stage and query.
REPO, COMMIT = "infra", "deadbeefcafe"
@pytest.fixture
def gha() -> FakeGitHubChecks:
    """Provide a fresh fake build-stage client (unstaged commits read PENDING)."""
    client = FakeGitHubChecks()
    return client
@pytest.fixture
def woodpecker() -> FakeWoodpecker:
    """Provide a fresh fake deploy-stage client (unstaged commits read PENDING)."""
    client = FakeWoodpecker()
    return client
@pytest.fixture
def rollout() -> FakeRollout:
    """Provide a fresh fake rollout-stage client (unstaged commits read PENDING)."""
    client = FakeRollout()
    return client
@pytest.fixture
def watcher(gha, woodpecker, rollout) -> CIWatcher:
    """Provide a ``CIWatcher`` wired to the three fake stage clients."""
    clients = {"github": gha, "woodpecker": woodpecker, "rollout": rollout}
    return CIWatcher(**clients)
def _stage_all(gha, woodpecker, rollout, *, build, deploy, roll) -> None:
    """Stage all three clients for the canonical ``(REPO, COMMIT)`` at once.

    ``build`` goes to ``gha``, ``deploy`` to ``woodpecker`` and ``roll`` to
    ``rollout``, in that order.
    """
    staged = ((gha, build), (woodpecker, deploy), (rollout, roll))
    for client, result in staged:
        client.set(REPO, COMMIT, result)
# --------------------------------------------------------------------------- #
# StageResult vocabulary.
# --------------------------------------------------------------------------- #
def test_stageresult_has_the_four_outcomes():
    """``StageResult`` exposes exactly NONE, PENDING, SUCCESS and FAILURE."""
    member_names = {member.name for member in StageResult}
    assert member_names == {"NONE", "PENDING", "SUCCESS", "FAILURE"}
# --------------------------------------------------------------------------- #
# The happy path: every stage green ⇒ GREEN.
# --------------------------------------------------------------------------- #
def test_all_stages_success_is_green(watcher, gha, woodpecker, rollout):
    """Build, deploy and rollout all SUCCESS folds to GREEN."""
    ok = StageResult.SUCCESS
    _stage_all(gha, woodpecker, rollout, build=ok, deploy=ok, roll=ok)

    verdict = watcher.status(REPO, COMMIT)

    assert verdict is CIStatus.GREEN
# --------------------------------------------------------------------------- #
# GHA build stage gates everything below it.
# --------------------------------------------------------------------------- #
def test_build_failure_is_red(watcher, gha):
    """A failed build alone is enough for a RED verdict."""
    gha.set(REPO, COMMIT, StageResult.FAILURE)

    verdict = watcher.status(REPO, COMMIT)

    assert verdict is CIStatus.RED
@pytest.mark.parametrize("build", [StageResult.NONE, StageResult.PENDING])
def test_build_not_yet_concluded_is_pending(watcher, gha, build):
    """A build with no run yet (NONE) or still in progress (PENDING) reads
    PENDING — the state machine waits on either."""
    gha.set(REPO, COMMIT, build)

    verdict = watcher.status(REPO, COMMIT)

    assert verdict is CIStatus.PENDING
def test_build_failure_short_circuits_before_deploy_and_rollout(
    watcher, gha, woodpecker, rollout
):
    """A red build wins over later stages, which are never even queried."""
    gha.set(REPO, COMMIT, StageResult.FAILURE)
    # The later stages are (nonsensically) staged green to prove they cannot
    # override a red build.
    for later_stage in (woodpecker, rollout):
        later_stage.set(REPO, COMMIT, StageResult.SUCCESS)

    verdict = watcher.status(REPO, COMMIT)

    assert verdict is CIStatus.RED
    # Neither later client was consulted.
    assert woodpecker.queries == []
    assert rollout.queries == []
def test_build_pending_short_circuits_before_deploy_and_rollout(
    watcher, gha, woodpecker, rollout
):
    """A pending build yields PENDING without querying deploy or rollout."""
    gha.set(REPO, COMMIT, StageResult.PENDING)

    verdict = watcher.status(REPO, COMMIT)

    assert verdict is CIStatus.PENDING
    assert woodpecker.queries == []
    assert rollout.queries == []
# --------------------------------------------------------------------------- #
# Deploy (Woodpecker) stage — only consulted once the build is green.
# --------------------------------------------------------------------------- #
def test_deploy_failure_is_red_even_with_green_build(watcher, gha, woodpecker):
    """A green build does not rescue a failed deploy: the verdict is RED."""
    gha.set(REPO, COMMIT, StageResult.SUCCESS)
    woodpecker.set(REPO, COMMIT, StageResult.FAILURE)

    verdict = watcher.status(REPO, COMMIT)

    assert verdict is CIStatus.RED
@pytest.mark.parametrize("deploy", [StageResult.NONE, StageResult.PENDING])
def test_deploy_not_yet_concluded_is_pending(watcher, gha, woodpecker, deploy):
    """Green build plus a deploy that is NONE or PENDING reads PENDING."""
    gha.set(REPO, COMMIT, StageResult.SUCCESS)
    woodpecker.set(REPO, COMMIT, deploy)

    verdict = watcher.status(REPO, COMMIT)

    assert verdict is CIStatus.PENDING
def test_deploy_failure_short_circuits_before_rollout(
    watcher, gha, woodpecker, rollout
):
    """A failed deploy is RED and the rollout client is never queried."""
    _stage_all(
        gha,
        woodpecker,
        rollout,
        build=StageResult.SUCCESS,
        deploy=StageResult.FAILURE,
        roll=StageResult.SUCCESS,
    )

    verdict = watcher.status(REPO, COMMIT)

    assert verdict is CIStatus.RED
    assert rollout.queries == []
    # The build WAS consulted (it had to pass to reach deploy).
    assert gha.queries == [(REPO, COMMIT)]
# --------------------------------------------------------------------------- #
# Rollout stage — the final gate. Green build + green deploy is still only
# PENDING until the image actually reaches the cluster.
# --------------------------------------------------------------------------- #
def test_rollout_failure_is_red(watcher, gha, woodpecker, rollout):
    """Green build and deploy with a failed rollout folds to RED."""
    ok = StageResult.SUCCESS
    _stage_all(gha, woodpecker, rollout, build=ok, deploy=ok, roll=StageResult.FAILURE)

    verdict = watcher.status(REPO, COMMIT)

    assert verdict is CIStatus.RED
@pytest.mark.parametrize("roll", [StageResult.NONE, StageResult.PENDING])
def test_green_build_and_deploy_but_unfinished_rollout_is_pending(
    watcher, gha, woodpecker, rollout, roll
):
    """Green build and deploy stay PENDING while the rollout is NONE/PENDING."""
    ok = StageResult.SUCCESS
    _stage_all(gha, woodpecker, rollout, build=ok, deploy=ok, roll=roll)

    verdict = watcher.status(REPO, COMMIT)

    assert verdict is CIStatus.PENDING
def test_green_requires_all_three_stages_consulted(
    watcher, gha, woodpecker, rollout
):
    """A GREEN verdict queries each of the three clients exactly once."""
    ok = StageResult.SUCCESS
    _stage_all(gha, woodpecker, rollout, build=ok, deploy=ok, roll=ok)

    verdict = watcher.status(REPO, COMMIT)

    assert verdict is CIStatus.GREEN
    single_query = [(REPO, COMMIT)]
    assert gha.queries == single_query
    assert woodpecker.queries == single_query
    assert rollout.queries == single_query
# --------------------------------------------------------------------------- #
# Plumbing: the commit and repo are passed through verbatim to every client,
# and an entirely un-seeded commit reads PENDING (not GREEN, not RED).
# --------------------------------------------------------------------------- #
def test_repo_and_commit_passed_through_to_clients(watcher, gha):
    """The watcher hands the repo and commit to the build client verbatim."""
    repo, commit = "realestate-crawler", "abc123"
    gha.set(repo, commit, StageResult.FAILURE)

    verdict = watcher.status(repo, commit)

    assert verdict is CIStatus.RED
    assert gha.queries == [("realestate-crawler", "abc123")]
def test_unknown_commit_defaults_to_pending(watcher):
    """With nothing staged anywhere, the verdict is PENDING.

    The fake build stage reads PENDING by default, so a never-pushed or
    just-pushed commit is never a false GREEN.
    """
    verdict = watcher.status(REPO, "never-seen")

    assert verdict is CIStatus.PENDING
# --------------------------------------------------------------------------- #
# The default rollout client is OPTIONAL — per the pilot facts, state.sqlite /
# kubectl reads are optional, so a CIWatcher built without a rollout client must
# still work, treating "build green + deploy green" as the terminal GREEN.
# --------------------------------------------------------------------------- #
def test_rollout_client_is_optional_deploy_green_is_green(gha, woodpecker):
    """Without a rollout client, green build + green deploy is terminal GREEN."""
    gha.set(REPO, COMMIT, StageResult.SUCCESS)
    woodpecker.set(REPO, COMMIT, StageResult.SUCCESS)
    # Deliberately constructed with no rollout client.
    two_stage_watcher = CIWatcher(github=gha, woodpecker=woodpecker)

    verdict = two_stage_watcher.status(REPO, COMMIT)

    assert verdict is CIStatus.GREEN
def test_rollout_client_optional_still_honours_build_and_deploy_failures(
    gha, woodpecker
):
    """Without a rollout client, a failed deploy is still RED."""
    gha.set(REPO, COMMIT, StageResult.SUCCESS)
    woodpecker.set(REPO, COMMIT, StageResult.FAILURE)
    two_stage_watcher = CIWatcher(github=gha, woodpecker=woodpecker)

    verdict = two_stage_watcher.status(REPO, COMMIT)

    assert verdict is CIStatus.RED
# --------------------------------------------------------------------------- #
# Full folding table — exhaustive over (build, deploy, rollout) so the
# precedence rules (FAILURE short-circuits red; otherwise any PENDING/NONE keeps
# it pending; all-success ⇒ green) can never silently drift.
# --------------------------------------------------------------------------- #
# One-letter aliases for the four stage outcomes, used by the folding table.
_N = StageResult.NONE
_P = StageResult.PENDING
_S = StageResult.SUCCESS
_F = StageResult.FAILURE
def _expected(build: StageResult, deploy: StageResult, roll: StageResult) -> CIStatus:
    """Reference fold, independent of the implementation.

    Stages are walked in order build, deploy, rollout; the first FAILURE gives
    RED, the first NONE/PENDING gives PENDING, and GREEN is returned only when
    no stage triggers either rule.
    """
    unfinished = (_N, _P)
    for stage in (build, deploy, roll):
        if stage is _F:
            return CIStatus.RED
        elif stage in unfinished:
            return CIStatus.PENDING
    return CIStatus.GREEN
@pytest.mark.parametrize("build", [_N, _P, _S, _F])
@pytest.mark.parametrize("deploy", [_N, _P, _S, _F])
@pytest.mark.parametrize("roll", [_N, _P, _S, _F])
def test_full_folding_table(watcher, gha, woodpecker, rollout, build, deploy, roll):
    """Every (build, deploy, rollout) combination matches the reference fold."""
    _stage_all(gha, woodpecker, rollout, build=build, deploy=deploy, roll=roll)

    verdict = watcher.status(REPO, COMMIT)

    assert verdict is _expected(build, deploy, roll)

View file

@ -0,0 +1,374 @@
"""Tests for ``app.afk.dispatch_policy.select_dispatchable`` — the pure gate that
turns a pile of ready issues into the ordered set the loop may dispatch *now*.
The function is PURE (no IO), so every test here is a plain in-memory call over
the fakes/factories in ``conftest`` (``make_issue`` / ``make_config``); nothing
touches a real T3 server, tracker, or cluster. The suite walks the full
dispatchability matrix — trust gate, allowlist, per-repo lock, blocked_by,
kill switch — plus the priority ordering and the one-agent-per-repo invariant.
Ordering contract under test: **higher ``priority`` first** (per the AFK module
spec), with a deterministic tiebreaker so the output is stable regardless of
input order. NOTE: ``Issue.priority``'s own docstring says "lower runs first";
this module follows the explicit dispatch-policy spec instead — see the module
docstring in ``dispatch_policy.py``.
"""
import itertools
import pytest
from app.afk import dispatch_policy
from app.afk.types import DispatchDecision, Issue
# --------------------------------------------------------------------------- #
# Helpers — keep assertions terse and intent-revealing.
# --------------------------------------------------------------------------- #
def _selected_numbers(decisions: list[DispatchDecision]) -> list[int]:
"""The issue numbers, in the order the policy returned them."""
return [d.issue.number for d in decisions]
def _selected_set(decisions: list[DispatchDecision]) -> set[int]:
return {d.issue.number for d in decisions}
# --------------------------------------------------------------------------- #
# Return shape & purity.
# --------------------------------------------------------------------------- #
def test_returns_list_of_dispatch_decisions(make_issue, make_config):
    """The policy returns a list of ``DispatchDecision`` wrapping the issue."""
    issue = make_issue(number=7, repo="infra")

    decisions = dispatch_policy.select_dispatchable([issue], make_config(), set())

    assert isinstance(decisions, list)
    assert len(decisions) == 1
    only = decisions[0]
    assert isinstance(only, DispatchDecision)
    assert only.issue is issue
    # The reason must be a non-empty string.
    assert isinstance(only.reason, str)
    assert only.reason
def test_empty_input_yields_empty_output(make_config):
    """No issues in, no decisions out."""
    decisions = dispatch_policy.select_dispatchable([], make_config(), set())

    assert decisions == []
def test_does_not_mutate_inputs(make_issue, make_config):
    """Calling the policy leaves the issue list, lock set and config as given."""
    first = make_issue(number=1, priority=0)
    second = make_issue(number=2, priority=9)
    issues = [first, second]
    issues_before = list(issues)
    config = make_config(allowlist=["infra"])
    in_flight: set[str] = set()

    dispatch_policy.select_dispatchable(issues, config, in_flight)

    # Caller's list (and its order) and the lock set are left untouched.
    assert issues == issues_before
    assert [issue.number for issue in issues] == [1, 2]
    assert in_flight == set()
    assert config.allowlist == ["infra"]
def test_decision_wraps_the_same_issue_object(make_issue, make_config):
    """The decision references the caller's issue object, not a copy."""
    issue = make_issue(number=42)

    # Unpacking also enforces that exactly one decision came back.
    (decision,) = dispatch_policy.select_dispatchable([issue], make_config(), set())

    assert decision.issue is issue
# --------------------------------------------------------------------------- #
# Kill switch — highest-precedence short-circuit.
# --------------------------------------------------------------------------- #
def test_kill_switch_returns_empty_even_with_perfect_issues(make_issue, make_config):
    """With the kill switch on, five otherwise-eligible issues select nothing."""
    config = make_config(allowlist=["infra"], kill_switch=True)
    issues = [make_issue(number=number, repo="infra") for number in range(1, 6)]

    decisions = dispatch_policy.select_dispatchable(issues, config, set())

    assert decisions == []
def test_kill_switch_off_dispatches(make_issue, make_config):
    """With the kill switch off, an eligible issue is selected."""
    issue = make_issue(repo="infra")
    config = make_config(allowlist=["infra"], kill_switch=False)

    decisions = dispatch_policy.select_dispatchable([issue], config, set())

    assert len(decisions) == 1
def test_production_default_config_dispatches_nothing(make_issue):
    """The shipped default (kill switch ON, empty allowlist) is inert: even a
    pristine, trusted issue is never selected."""
    from app.afk import config as afk_config

    shipped_default = afk_config.default()
    issue = make_issue(repo="infra")

    decisions = dispatch_policy.select_dispatchable([issue], shipped_default, set())

    assert decisions == []
# --------------------------------------------------------------------------- #
# Trust gate.
# --------------------------------------------------------------------------- #
def test_untrusted_issue_is_skipped(make_issue, make_config):
    """An issue not labelled by a trusted user is never selected."""
    issue = make_issue(repo="infra", labeled_by_trusted=False)
    config = make_config(allowlist=["infra"])

    decisions = dispatch_policy.select_dispatchable([issue], config, set())

    assert decisions == []
def test_trusted_issue_is_eligible(make_issue, make_config):
    """An issue labelled by a trusted user passes the trust gate."""
    issue = make_issue(repo="infra", labeled_by_trusted=True)
    config = make_config(allowlist=["infra"])

    decisions = dispatch_policy.select_dispatchable([issue], config, set())

    assert len(decisions) == 1
def test_trust_gate_filters_only_untrusted(make_issue, make_config):
    """Of a trusted and an untrusted issue, only the trusted one is selected."""
    issues = [
        make_issue(number=1, repo="infra", labeled_by_trusted=True),
        make_issue(number=2, repo="infra", labeled_by_trusted=False),
    ]
    config = make_config(allowlist=["infra"])

    decisions = dispatch_policy.select_dispatchable(issues, config, set())

    assert _selected_set(decisions) == {1}
# --------------------------------------------------------------------------- #
# Allowlist membership.
# --------------------------------------------------------------------------- #
def test_repo_not_in_allowlist_is_skipped(make_issue, make_config):
    """An issue from a repo outside the allowlist is never selected."""
    issue = make_issue(repo="some-other-repo")
    config = make_config(allowlist=["infra"])

    decisions = dispatch_policy.select_dispatchable([issue], config, set())

    assert decisions == []
def test_empty_allowlist_dispatches_nothing(make_issue, make_config):
    """Kill switch off but allowlist empty is still inert (the two-gate posture)."""
    config = make_config(allowlist=[], kill_switch=False)
    issue = make_issue(repo="infra")

    decisions = dispatch_policy.select_dispatchable([issue], config, set())

    assert decisions == []
def test_allowlist_selects_only_listed_repos(make_issue, make_config):
    """Only issues whose repo is on the allowlist are selected."""
    issues = [
        make_issue(number=1, repo="infra"),
        make_issue(number=2, repo="realestate-crawler"),
        make_issue(number=3, repo="not-allowed"),
    ]
    config = make_config(allowlist=["infra", "realestate-crawler"])

    decisions = dispatch_policy.select_dispatchable(issues, config, set())

    assert _selected_set(decisions) == {1, 2}
# --------------------------------------------------------------------------- #
# Per-repo lock (in_flight_repos).
# --------------------------------------------------------------------------- #
def test_repo_already_in_flight_is_skipped(make_issue, make_config):
    """An issue whose repo is already in flight is not selected."""
    issue = make_issue(repo="infra")
    config = make_config(allowlist=["infra"])

    decisions = dispatch_policy.select_dispatchable(
        [issue], config, in_flight_repos={"infra"}
    )

    assert decisions == []
def test_in_flight_lock_is_per_repo(make_issue, make_config):
    """Locking one repo does not stop another repo's issue from running."""
    issues = [
        make_issue(number=1, repo="infra"),  # locked repo
        make_issue(number=2, repo="realestate-crawler"),  # free repo
    ]
    config = make_config(allowlist=["infra", "realestate-crawler"])

    decisions = dispatch_policy.select_dispatchable(
        issues, config, in_flight_repos={"infra"}
    )

    # Only the unlocked repo's issue runs.
    assert _selected_set(decisions) == {2}
def test_all_repos_in_flight_dispatches_nothing(make_issue, make_config):
    """When every allowlisted repo is in flight, nothing is selected."""
    issues = [
        make_issue(number=1, repo="infra"),
        make_issue(number=2, repo="realestate-crawler"),
    ]
    config = make_config(allowlist=["infra", "realestate-crawler"])

    decisions = dispatch_policy.select_dispatchable(
        issues, config, in_flight_repos={"infra", "realestate-crawler"}
    )

    assert decisions == []
# --------------------------------------------------------------------------- #
# One-agent-per-repo invariant — at most ONE decision per repo per call.
#
# The whole design serialises agents within a repo (two would collide on the
# working tree). A single call must therefore never hand back two issues for the
# same repo, even when both are eligible and the repo is not yet in-flight.
# --------------------------------------------------------------------------- #
def test_at_most_one_decision_per_repo(make_issue, make_config):
    """Two eligible issues in one repo yield a single decision."""
    issues = [
        make_issue(number=1, repo="infra", priority=1),
        make_issue(number=2, repo="infra", priority=9),
    ]
    config = make_config(allowlist=["infra"])

    decisions = dispatch_policy.select_dispatchable(issues, config, set())

    assert len(decisions) == 1
    # The higher-priority issue wins the repo's slot.
    winner = decisions[0].issue
    assert winner.number == 2
def test_one_decision_per_repo_across_many_repos(make_issue, make_config):
    """Across two repos, each contributes only its highest-priority issue."""
    config = make_config(allowlist=["infra", "realestate-crawler"])
    issues = [
        make_issue(number=10, repo="infra", priority=1),
        make_issue(number=11, repo="infra", priority=5),
        make_issue(number=20, repo="realestate-crawler", priority=3),
        make_issue(number=21, repo="realestate-crawler", priority=2),
    ]

    decisions = dispatch_policy.select_dispatchable(issues, config, set())

    # One per repo, each the repo's highest-priority eligible issue.
    assert _selected_set(decisions) == {11, 20}
    # No repo appears twice.
    chosen_repos = [decision.issue.repo for decision in decisions]
    assert len(chosen_repos) == len(set(chosen_repos))
def test_ineligible_higher_priority_does_not_consume_repo_slot(make_issue, make_config):
    """A higher-priority issue that is itself ineligible (e.g. blocked) must not
    suppress a lower-priority *eligible* issue in the same repo — the slot goes
    to the best ELIGIBLE candidate, not merely the highest-priority one."""
    issues = [
        make_issue(number=1, repo="infra", priority=9, blocked_by=[99]),  # blocked
        make_issue(number=2, repo="infra", priority=1),  # ready
    ]
    config = make_config(allowlist=["infra"])

    decisions = dispatch_policy.select_dispatchable(issues, config, set())

    assert _selected_numbers(decisions) == [2]
# --------------------------------------------------------------------------- #
# blocked_by gating — blocked_by holds OPEN blocker numbers.
# --------------------------------------------------------------------------- #
def test_blocked_issue_is_skipped(make_issue, make_config):
    """An issue with an open blocker is never selected."""
    issue = make_issue(repo="infra", blocked_by=[101])
    config = make_config(allowlist=["infra"])

    decisions = dispatch_policy.select_dispatchable([issue], config, set())

    assert decisions == []
def test_unblocked_issue_with_empty_blocked_by_is_eligible(make_issue, make_config):
    """An explicitly empty ``blocked_by`` passes the blocker gate."""
    issue = make_issue(repo="infra", blocked_by=[])
    config = make_config(allowlist=["infra"])

    decisions = dispatch_policy.select_dispatchable([issue], config, set())

    assert len(decisions) == 1
@pytest.mark.parametrize("blockers", [[1], [1, 2], [5, 6, 7]])
def test_any_open_blocker_blocks(make_issue, make_config, blockers):
    """One, two or three open blockers all keep the issue from being selected."""
    issue = make_issue(repo="infra", blocked_by=blockers)
    config = make_config(allowlist=["infra"])

    decisions = dispatch_policy.select_dispatchable([issue], config, set())

    assert decisions == []
def test_blocked_filters_only_blocked(make_issue, make_config):
    """A blocked issue in one repo does not affect a ready issue in another."""
    issues = [
        make_issue(number=1, repo="infra", blocked_by=[]),  # ready
        make_issue(number=2, repo="realestate-crawler", blocked_by=[7]),  # blocked
    ]
    config = make_config(allowlist=["infra", "realestate-crawler"])

    decisions = dispatch_policy.select_dispatchable(issues, config, set())

    assert _selected_set(decisions) == {1}
# --------------------------------------------------------------------------- #
# Priority ordering — higher priority first, deterministic tiebreaker.
# --------------------------------------------------------------------------- #
def test_higher_priority_first(make_issue, make_config):
    """Decisions come back ordered by descending priority."""
    low = make_issue(number=1, repo="infra", priority=1)
    middle = make_issue(number=2, repo="realestate-crawler", priority=5)
    high = make_issue(number=3, repo="SparkyFitness", priority=9)
    config = make_config(allowlist=["infra", "realestate-crawler", "SparkyFitness"])

    # Supplied deliberately out of priority order.
    decisions = dispatch_policy.select_dispatchable([low, high, middle], config, set())

    # Priorities 9, 5, 1.
    assert _selected_numbers(decisions) == [3, 2, 1]
def test_ordering_independent_of_input_order(make_issue, make_config):
    """Whatever order the caller supplies issues in, the dispatch order is the
    same — sorted purely by the policy, not by arrival."""
    # Each spec is (repo, issue number, priority).
    specs = [
        ("infra", 10, 2),
        ("realestate-crawler", 20, 8),
        ("SparkyFitness", 30, 5),
        ("health", 40, 1),
    ]
    config = make_config(
        allowlist=["infra", "realestate-crawler", "SparkyFitness", "health"]
    )
    # Priorities 8, 5, 2, 1.
    expected_order = [20, 30, 10, 40]

    for ordering in itertools.permutations(specs):
        issues = [
            make_issue(number=number, repo=repo, priority=priority)
            for repo, number, priority in ordering
        ]
        decisions = dispatch_policy.select_dispatchable(issues, config, set())
        assert _selected_numbers(decisions) == expected_order
def test_priority_ties_break_deterministically_by_issue_number(make_issue, make_config):
    """Equal priority across different repos -> a stable, total order. We tie-break
    on ascending issue number so the result never depends on dict/set iteration
    or input order."""
    config = make_config(allowlist=["infra", "realestate-crawler", "SparkyFitness"])
    tied = [
        make_issue(number=30, repo="infra", priority=5),
        make_issue(number=10, repo="realestate-crawler", priority=5),
        make_issue(number=20, repo="SparkyFitness", priority=5),
    ]

    for ordering in itertools.permutations(tied):
        decisions = dispatch_policy.select_dispatchable(list(ordering), config, set())
        assert _selected_numbers(decisions) == [10, 20, 30]
def test_negative_and_zero_priorities_order_correctly(make_issue, make_config):
    """Negative and zero priorities sort numerically below positive ones."""
    issues = [
        make_issue(number=1, repo="infra", priority=-5),
        make_issue(number=2, repo="realestate-crawler", priority=0),
        make_issue(number=3, repo="SparkyFitness", priority=3),
    ]
    config = make_config(allowlist=["infra", "realestate-crawler", "SparkyFitness"])

    decisions = dispatch_policy.select_dispatchable(issues, config, set())

    # 3 > 0 > -5
    assert _selected_numbers(decisions) == [3, 2, 1]
# --------------------------------------------------------------------------- #
# Reasons — human-readable, never parsed, but must be present and sensible.
# --------------------------------------------------------------------------- #
def test_every_decision_has_a_nonempty_reason(make_issue, make_config):
    """Each returned decision carries a reason that is not blank."""
    config = make_config(allowlist=["infra", "realestate-crawler"])
    ready = [
        make_issue(number=1, repo="infra", priority=3),
        make_issue(number=2, repo="realestate-crawler", priority=1),
    ]
    decisions = dispatch_policy.select_dispatchable(ready, config, set())
    # Guard against a vacuous pass: ``all`` over an empty list is True.
    assert decisions
    for decision in decisions:
        assert decision.reason.strip()
# --------------------------------------------------------------------------- #
# Combined matrix — every gate together. A single eligible needle in a haystack
# of issues that each trip exactly one gate.
# --------------------------------------------------------------------------- #
def test_only_the_fully_eligible_issue_survives_all_gates(make_issue, make_config):
    """One eligible issue among four that each trip a different gate is the only pick."""
    config = make_config(allowlist=["infra", "realestate-crawler"], kill_switch=False)
    locked_repos = {"realestate-crawler"}
    eligible = make_issue(number=1, repo="infra", priority=5)
    off_allowlist = make_issue(number=2, repo="not-allowed", priority=9)
    untrusted = make_issue(number=3, repo="infra", priority=9, labeled_by_trusted=False)
    blocked = make_issue(number=4, repo="infra", priority=9, blocked_by=[1])
    repo_locked = make_issue(number=5, repo="realestate-crawler", priority=9)

    decisions = dispatch_policy.select_dispatchable(
        [eligible, off_allowlist, untrusted, blocked, repo_locked],
        config,
        locked_repos,
    )

    assert _selected_numbers(decisions) == [1]
    assert decisions[0].issue.repo == "infra"
@pytest.mark.parametrize("trusted", [True, False])
@pytest.mark.parametrize("allowed", [True, False])
@pytest.mark.parametrize("blocked", [True, False])
@pytest.mark.parametrize("locked", [True, False])
@pytest.mark.parametrize("killed", [True, False])
def test_full_eligibility_matrix(
    make_issue, make_config, trusted, allowed, blocked, locked, killed
):
    """Truth table over five boolean gates (2**5 = 32 cases) for a single issue.

    The issue is expected to be dispatched only when it is trusted, its repo is
    allowlisted, it is not blocked, its repo is not in flight, and the kill
    switch is off. With one issue, ordering plays no part.
    """
    blockers = [99] if blocked else []
    issue = make_issue(
        number=1,
        repo="infra",
        priority=0,
        labeled_by_trusted=trusted,
        blocked_by=blockers,
    )
    allowlist = ["infra"] if allowed else ["other-repo"]
    config = make_config(allowlist=allowlist, kill_switch=killed)
    in_flight = {"infra"} if locked else set()

    decisions = dispatch_policy.select_dispatchable([issue], config, in_flight)

    gates_open = trusted and allowed and not blocked and not locked
    should_dispatch = gates_open and not killed
    assert (len(decisions) == 1) is should_dispatch
    if should_dispatch:
        assert decisions[0].issue is issue

198
tests/test_afk_notifier.py Normal file
View file

@ -0,0 +1,198 @@
"""Tests for ``app.afk.notifier`` — the terminal-state doorbell.
The notifier's whole job is to format a human-facing alert (Slack / ntfy) with a
deep-link back to the T3 thread when a run reaches a terminal state — done,
needs-human, or frozen — and hand it to an injected sender. Every test here
injects a recording fake sender, so nothing is ever POSTed: we assert the
*formatted payload* per kind, plus the deep-link, the kind vocabulary, and the
guardrails (no thread -> no link, unknown kind rejected, sender called exactly
once with the return value being None).
No real Slack/ntfy/T3 is touched — consistent with the rest of the AFK suite.
"""
import pytest
from app.afk import notifier as notifier_mod
from app.afk.notifier import KIND_DONE, KIND_FROZEN, KIND_NEEDS_HUMAN, Notification, Notifier
from app.afk.types import Issue
# --------------------------------------------------------------------------- #
# A recording sender — captures the Notification instead of posting it.
# --------------------------------------------------------------------------- #
class RecordingSender:
    """Test double for the real Slack/ntfy POST.

    Calling an instance appends the notification to ``sent`` instead of sending
    it, so tests can inspect the formatted payload without any network.
    """

    def __init__(self) -> None:
        # Payloads in the order they were handed over.
        self.sent: list[Notification] = []

    def __call__(self, notification: Notification) -> None:
        self.sent += [notification]
@pytest.fixture
def sender() -> RecordingSender:
    """Give each requesting test its own empty ``RecordingSender``."""
    recorder = RecordingSender()
    return recorder
def _issue(number: int = 42, repo: str = "infra") -> Issue:
    """Build an unblocked, trusted-labelled, priority-0 ready-for-agent ``Issue``."""
    fields = {
        "number": number,
        "repo": repo,
        "labels": ["ready-for-agent"],
        "blocked_by": [],
        "labeled_by_trusted": True,
        "priority": 0,
    }
    return Issue(**fields)
# --------------------------------------------------------------------------- #
# Kind vocabulary — the three terminal states, and nothing else.
# --------------------------------------------------------------------------- #
def test_terminal_kinds_are_exactly_the_three_terminal_states():
    """Pin each kind constant's literal value and the full ``TERMINAL_KINDS`` set."""
    expected_literals = [
        (KIND_DONE, "done"),
        (KIND_NEEDS_HUMAN, "needs-human"),
        (KIND_FROZEN, "frozen"),
    ]
    for constant, literal in expected_literals:
        assert constant == literal
    assert notifier_mod.TERMINAL_KINDS == {KIND_DONE, KIND_NEEDS_HUMAN, KIND_FROZEN}
# --------------------------------------------------------------------------- #
# Dispatch mechanics — sender injected, called exactly once, returns None.
# --------------------------------------------------------------------------- #
def test_notify_calls_sender_exactly_once_and_returns_none(sender):
    """``notify`` returns ``None`` and hands exactly one payload to the sender."""
    notifier = Notifier(sender)
    outcome = notifier.notify(KIND_DONE, _issue(), "thread-7", "all green")
    assert outcome is None
    assert len(sender.sent) == 1
def test_notify_does_not_post_anything_itself(sender):
    """All egress goes through the injected sender, as a ``Notification``.

    The recording sender must end up with exactly one payload, and that payload
    must be the ``Notification`` dataclass rather than a raw dict or an HTTP
    response object.
    """
    n = Notifier(sender)
    n.notify(KIND_FROZEN, _issue(), "thread-1", "budget exhausted")
    # Exactly one recorded payload: previously stated in a comment but never
    # asserted, so a notifier that called the sender twice would have passed.
    assert len(sender.sent) == 1
    assert isinstance(sender.sent[0], Notification)
# --------------------------------------------------------------------------- #
# Deep-link — every payload links back to the T3 thread (when there is one).
# --------------------------------------------------------------------------- #
def test_payload_deep_links_to_the_t3_thread(sender):
    """The thread deep-link appears both as ``payload.link`` and inside the body."""
    expected_link = "https://t3.viktorbarzin.me/?thread=thread-abc"
    notifier = Notifier(sender, base_url="https://t3.viktorbarzin.me")
    notifier.notify(KIND_DONE, _issue(), "thread-abc", "done")
    payload = sender.sent[0]
    assert payload.link == expected_link
    # The body repeats the link so it survives senders that drop structured
    # fields (e.g. a plain ntfy message).
    assert expected_link in payload.body
def test_base_url_trailing_slash_is_normalised(sender):
    """A trailing slash on ``base_url`` does not produce a double slash in the link."""
    notifier = Notifier(sender, base_url="https://t3.viktorbarzin.me/")
    notifier.notify(KIND_DONE, _issue(), "thread-x", "done")
    produced = sender.sent[0].link
    assert produced == "https://t3.viktorbarzin.me/?thread=thread-x"
def test_no_thread_id_means_no_link(sender):
    """With ``thread_id=None`` the doorbell still fires, but without a link.

    A run can need a human before any thread exists (e.g. the dispatch itself
    failed), so there is nothing to deep-link to.
    """
    Notifier(sender).notify(KIND_NEEDS_HUMAN, _issue(), None, "dispatch failed")
    payload = sender.sent[0]
    assert payload.link is None
    assert len(sender.sent) == 1
    # The body must not carry a dangling "/?thread=" fragment either.
    assert "?thread=" not in payload.body
# --------------------------------------------------------------------------- #
# Per-kind formatting — title / body / priority / tags differ per terminal kind.
# --------------------------------------------------------------------------- #
def test_done_payload_is_informational(sender):
    """A ``done`` payload names the issue, carries the summary, and is low priority."""
    issue = _issue(number=7, repo="infra")
    Notifier(sender).notify(KIND_DONE, issue, "thread-7", "merged + CI green")
    payload = sender.sent[0]
    assert payload.kind == KIND_DONE
    assert payload.issue_ref == "infra#7"
    assert "infra#7" in payload.title
    assert "merged + CI green" in payload.body
    # A successful close is informational, not an escalation.
    assert payload.priority == "low"
    assert "escalation" not in payload.tags
def test_needs_human_payload_is_an_escalation(sender):
    """A ``needs-human`` payload is high priority and tagged as an escalation."""
    issue = _issue(number=9, repo="claude-agent-service")
    Notifier(sender).notify(KIND_NEEDS_HUMAN, issue, "thread-9", "errored before push")
    payload = sender.sent[0]
    assert payload.kind == KIND_NEEDS_HUMAN
    assert payload.issue_ref == "claude-agent-service#9"
    assert "claude-agent-service#9" in payload.title
    assert "errored before push" in payload.body
    assert payload.priority == "high"
    assert "escalation" in payload.tags
def test_frozen_payload_is_an_escalation(sender):
    """A ``frozen`` payload is high priority and tagged as an escalation."""
    n = Notifier(sender)
    n.notify(KIND_FROZEN, _issue(number=3, repo="infra"), "thread-3", "fix-forward budget exhausted")
    p = sender.sent[0]
    assert p.kind == KIND_FROZEN
    # Added for parity with the done / needs-human tests, which both pin
    # ``issue_ref`` in the same ``repo#number`` form.
    assert p.issue_ref == "infra#3"
    assert "infra#3" in p.title
    assert "fix-forward budget exhausted" in p.body
    assert p.priority == "high"
    assert "escalation" in p.tags
def test_titles_distinguish_the_three_kinds(sender):
    """Each kind's title starts with a different first space-delimited token.

    An operator skimming a channel should be able to tell the three kinds apart
    from the title alone.
    """
    notifier = Notifier(sender)
    for kind in (KIND_DONE, KIND_NEEDS_HUMAN, KIND_FROZEN):
        notifier.notify(kind, _issue(), "t", "x")
    leading_markers = {payload.title.split(" ")[0] for payload in sender.sent}
    assert len(leading_markers) == 3  # distinct leading marker per kind
# --------------------------------------------------------------------------- #
# Guardrail — only terminal kinds are sendable. An unknown kind is a bug.
# --------------------------------------------------------------------------- #
def test_unknown_kind_raises_and_sends_nothing(sender):
    """A non-terminal kind raises ``ValueError`` and the sender is never called."""
    notifier = Notifier(sender)
    with pytest.raises(ValueError):
        notifier.notify("running", _issue(), "thread-1", "still working")
    assert not sender.sent and sender.sent == []
# --------------------------------------------------------------------------- #
# Pure formatter — render_notification builds the payload independently of any
# sender, so the formatting is unit-testable on its own.
# --------------------------------------------------------------------------- #
def test_render_notification_is_pure_and_matches_notify(sender):
    """``render_notification`` alone builds the same payload ``notify`` sends."""
    base = "https://t3.viktorbarzin.me"
    issue = _issue(number=11, repo="infra")

    rendered = notifier_mod.render_notification(
        KIND_FROZEN, issue, "thread-11", "stuck", base_url=base
    )
    assert isinstance(rendered, Notification)
    assert rendered.link == "https://t3.viktorbarzin.me/?thread=thread-11"

    # The payload notify() hands the sender must equal the rendered one.
    notifier = Notifier(sender, base_url=base)
    notifier.notify(KIND_FROZEN, issue, "thread-11", "stuck")
    assert sender.sent[0] == rendered
def test_sender_exception_propagates(sender):
    """An exception raised by the sender surfaces from ``notify`` unchanged.

    The notifier does not swallow a failed doorbell; the calling loop decides
    what to do with it.
    """

    def failing_sender(_notification: Notification) -> None:
        raise RuntimeError("slack 503")

    notifier = Notifier(failing_sender)
    with pytest.raises(RuntimeError, match="slack 503"):
        notifier.notify(KIND_DONE, _issue(), "thread-1", "done")

View file

@ -0,0 +1,247 @@
"""Tests for ``app.afk.phase_checklist`` — the live progress checklist.
``render(current, meta)`` is PURE: same inputs -> byte-identical markdown, no I/O.
It draws the seven-phase lifecycle (worktree -> tests-red -> green -> pushed ->
CI -> deployed -> done) as a markdown task list, with phases *before*
``current`` checked off, ``current`` marked in-progress, and later phases left
empty.
Style matches the existing suite: plain ``assert`` functions, parametrized cases,
and a couple of full-output snapshots so the rendered shape is pinned, not just
its line count.
"""
import pytest
from app.afk.phase_checklist import render
from app.afk.types import Phase
# Lifecycle order, mirrored from the contract so a reordering of the enum that
# the renderer didn't track shows up as a test failure rather than silent drift.
# Spelled out by hand on purpose: the parametrized tests below index into this
# list, so it must stay an explicit literal rather than be derived from the enum.
PHASES_IN_ORDER = [
    Phase.WORKTREE,
    Phase.TESTS_RED,
    Phase.GREEN,
    Phase.PUSHED,
    Phase.CI,
    Phase.DEPLOYED,
    Phase.DONE,
]
# --------------------------------------------------------------------------- #
# Structure: one line per phase, in order, always all seven.
# --------------------------------------------------------------------------- #
def _checklist_lines(out: str) -> list[str]:
    """Return, in order, the lines of ``out`` that begin a markdown task item.

    A line qualifies when it starts with ``- [`` after leading whitespace is
    ignored; the returned lines keep their original indentation.
    """
    task_lines = []
    for raw in out.splitlines():
        if raw.lstrip().startswith("- ["):
            task_lines.append(raw)
    return task_lines
def test_renders_a_string():
    """``render`` returns a ``str`` even with empty meta."""
    rendered = render(Phase.WORKTREE, {})
    assert isinstance(rendered, str)
@pytest.mark.parametrize("current", PHASES_IN_ORDER)
def test_every_phase_has_exactly_one_checklist_line(current):
    """The checklist has as many task lines as there are phases, for any current phase."""
    task_line_count = len(_checklist_lines(render(current, {})))
    assert task_line_count == len(PHASES_IN_ORDER)
@pytest.mark.parametrize("current", PHASES_IN_ORDER)
def test_checklist_lines_are_in_lifecycle_order(current):
    """Each phase's label appears on a task line, and the labels follow lifecycle order.

    For every phase the index of the first task line carrying its label is
    recorded; those indices must already be sorted.
    """
    lines = _checklist_lines(render(current, {}))
    positions = []
    for phase in PHASES_IN_ORDER:
        # A default of None turns a missing label into a readable assertion
        # failure instead of a bare StopIteration escaping from next().
        hit = next((i for i, ln in enumerate(lines) if _has_label(ln, phase)), None)
        assert hit is not None, f"no checklist line carries the label for {phase!r}: {lines}"
        positions.append(hit)
    assert positions == sorted(positions)
def _has_label(line: str, phase: Phase) -> bool:
    """Report whether ``line`` contains ``phase``'s headline word.

    The match is a case-insensitive substring test — it checks that the label is
    *present*, not how the line is decorated.
    """
    needle = _phase_label(phase).lower()
    haystack = line.lower()
    return needle in haystack
def _phase_label(phase: Phase) -> str:
    """Map a phase to the headline word its checklist line is expected to contain.

    Deliberately loose: callers only test for the word's presence, not for the
    exact wording around it. Raises ``KeyError`` for a phase not listed here.
    """
    headline_words = {
        Phase.WORKTREE: "worktree",
        Phase.TESTS_RED: "test",
        Phase.GREEN: "green",
        Phase.PUSHED: "push",
        Phase.CI: "CI",
        Phase.DEPLOYED: "deploy",
        Phase.DONE: "done",
    }
    return headline_words[phase]
# --------------------------------------------------------------------------- #
# Check/in-progress/empty partitioning around ``current``.
# --------------------------------------------------------------------------- #
def _classify(line: str) -> str:
    """Bucket a checklist line by its marker, ignoring leading whitespace.

    Returns ``"done"`` for ``- [x]``, ``"todo"`` for ``- [ ]``, and ``"active"``
    for anything else (e.g. an in-progress glyph).
    """
    stripped = line.lstrip()
    for prefix, bucket in (("- [x]", "done"), ("- [ ]", "todo")):
        if stripped.startswith(prefix):
            return bucket
    return "active"
@pytest.mark.parametrize("idx,current", list(enumerate(PHASES_IN_ORDER)))
def test_earlier_checked_current_active_later_empty(idx, current):
    """Lines before ``current`` are done, ``current`` is active, later ones are todo.

    ``Phase.DONE`` is the exception: there the whole list is checked off.
    """
    buckets = [_classify(ln) for ln in _checklist_lines(render(current, {}))]
    before, after = buckets[:idx], buckets[idx + 1 :]

    # Everything strictly before the current phase is checked off.
    assert all(b == "done" for b in before), buckets

    if current is Phase.DONE:
        # Terminal phase: nothing is left active or empty.
        assert all(b == "done" for b in buckets), buckets
        return

    # The current phase is the single in-progress marker...
    assert buckets[idx] == "active", buckets
    assert buckets.count("active") == 1, buckets
    # ...and every phase after it is still an empty checkbox.
    assert all(b == "todo" for b in after), buckets
def test_first_phase_has_nothing_checked_before_it():
    """At ``WORKTREE`` the first line is active and no line is checked off."""
    buckets = [_classify(ln) for ln in _checklist_lines(render(Phase.WORKTREE, {}))]
    assert buckets[0] == "active"
    assert "done" not in buckets
def test_done_checks_every_phase_including_done():
    """At ``DONE`` every line is checked, including the DONE line itself."""
    lines = _checklist_lines(render(Phase.DONE, {}))
    for ln in lines:
        assert _classify(ln) == "done"
    # Look the DONE line up by label so it is checked explicitly, not just by
    # virtue of being somewhere in the list.
    final_line = next(ln for ln in lines if _has_label(ln, Phase.DONE))
    assert _classify(final_line) == "done"
# --------------------------------------------------------------------------- #
# Active-phase emphasis: the current phase is visually distinguishable.
# --------------------------------------------------------------------------- #
@pytest.mark.parametrize("current", [p for p in PHASES_IN_ORDER if p is not Phase.DONE])
def test_active_phase_line_differs_from_todo_and_done_markers(current):
    """For every non-terminal phase exactly one line is active, and it is not a plain checkbox."""
    active_lines = [
        ln for ln in _checklist_lines(render(current, {})) if _classify(ln) == "active"
    ]
    assert len(active_lines) == 1
    marker_area = active_lines[0].lstrip()
    # Neither the checked nor the empty checkbox form.
    assert not marker_area.startswith("- [x]")
    assert not marker_area.startswith("- [ ]")
# --------------------------------------------------------------------------- #
# meta rendering: optional context is surfaced, omission never explodes.
# --------------------------------------------------------------------------- #
def test_meta_empty_does_not_raise_and_still_lists_phases():
    """Rendering with empty meta succeeds and still yields task lines."""
    task_lines = _checklist_lines(render(Phase.GREEN, {}))
    assert task_lines  # non-empty
def test_meta_issue_and_repo_appear_in_output():
    """The ``repo`` and ``issue`` meta values both show up in the rendered text."""
    rendered = render(Phase.GREEN, {"repo": "infra", "issue": 42})
    for fragment in ("infra", "42"):
        assert fragment in rendered
def test_meta_thread_id_appears_when_present():
    """A supplied ``thread_id`` is surfaced in the rendered text."""
    meta = {"thread_id": "thread-7"}
    assert "thread-7" in render(Phase.PUSHED, meta)
def test_meta_thread_id_absent_is_silent():
    """Without a ``thread_id`` the output contains no ``thread-`` fragment."""
    rendered = render(Phase.PUSHED, {})
    assert "thread-" not in rendered
def test_meta_fix_forward_attempt_surfaced():
    """The ``fix_forward_attempts`` count appears somewhere in the rendered text."""
    meta = {"fix_forward_attempts": 3}
    assert "3" in render(Phase.CI, meta)
def test_meta_unknown_keys_are_ignored():
    """An unexpected meta key neither crashes ``render`` nor leaks its value."""
    stray_meta = {"totally_unknown_field": "should-not-appear"}
    rendered = render(Phase.WORKTREE, stray_meta)
    assert "should-not-appear" not in rendered
# --------------------------------------------------------------------------- #
# Determinism + idempotence (it's pure).
# --------------------------------------------------------------------------- #
@pytest.mark.parametrize("current", PHASES_IN_ORDER)
def test_render_is_deterministic(current):
    """Two renders of the same phase and meta produce equal output."""
    meta = {"repo": "infra", "issue": 9, "thread_id": "thread-1"}
    first = render(current, meta)
    second = render(current, meta)
    assert first == second
def test_render_does_not_mutate_meta():
    """``render`` leaves the caller's meta dict equal to what was passed in."""
    meta = {"repo": "infra", "issue": 1}
    original = meta.copy()
    render(Phase.GREEN, meta)
    assert meta == original
# --------------------------------------------------------------------------- #
# Snapshots: pin the exact rendered shape for two representative phases. If the
# format changes intentionally, update these strings; an accidental change to
# wording/markers/order fails here loudly.
# --------------------------------------------------------------------------- #
# Expected output for the first phase with repo + issue meta and no thread id:
# the first line is the in-progress ``[~]`` marker, every later line is empty.
WORKTREE_SNAPSHOT = """\
### infra#7 — AFK run progress
- [~] Worktree created
- [ ] Failing test written (TDD red)
- [ ] Implementation passing (TDD green)
- [ ] Pushed to master
- [ ] CI green on pushed commit
- [ ] Deployed / rolled out
- [ ] Done issue closed
"""
def test_snapshot_worktree_phase():
    """The WORKTREE render must equal ``WORKTREE_SNAPSHOT`` exactly."""
    out = render(Phase.WORKTREE, {"repo": "infra", "issue": 7})
    assert out == WORKTREE_SNAPSHOT
# Expected output mid-lifecycle with a thread id: the heading gains a
# "(thread ...)" suffix, four lines are checked, and the CI line is in progress.
CI_SNAPSHOT = """\
### infra#7 — AFK run progress (thread thread-3)
- [x] Worktree created
- [x] Failing test written (TDD red)
- [x] Implementation passing (TDD green)
- [x] Pushed to master
- [~] CI green on pushed commit
- [ ] Deployed / rolled out
- [ ] Done issue closed
"""
def test_snapshot_ci_phase_with_thread():
    """The CI render with a thread id must equal ``CI_SNAPSHOT`` exactly."""
    out = render(Phase.CI, {"repo": "infra", "issue": 7, "thread_id": "thread-3"})
    assert out == CI_SNAPSHOT
# Expected output for the terminal phase: all seven lines checked, none active.
DONE_SNAPSHOT = """\
### infra#7 — AFK run progress
- [x] Worktree created
- [x] Failing test written (TDD red)
- [x] Implementation passing (TDD green)
- [x] Pushed to master
- [x] CI green on pushed commit
- [x] Deployed / rolled out
- [x] Done issue closed
"""
def test_snapshot_done_phase():
    """The DONE render must equal ``DONE_SNAPSHOT`` exactly."""
    out = render(Phase.DONE, {"repo": "infra", "issue": 7})
    assert out == DONE_SNAPSHOT

269
tests/test_afk_poller.py Normal file
View file

@ -0,0 +1,269 @@
"""Integration tests for ``app.afk.poller`` — the CronJob dispatch tick.
Unlike the unit suites, these wire the REAL pure cores (the actual
``dispatch_policy.select_dispatchable``) to the in-memory adapter FAKES from
``conftest`` (``FakeTracker`` / ``FakeT3Client``). No test touches a real T3
server, GitHub/Forgejo, or the cluster — the poller is exercised end to end with
fakes standing in only for the I/O edges.
What the tick must do (the poller contract):
* **kill switch** — a disabled config dispatches nothing AND never calls the
  tracker or T3 (the CronJob does no I/O when the loop is off);
* read the ready set via ``tracker.list_ready(config.allowlist)``;
* derive the **per-repo lock** from the ready set itself — a repo with an issue
  already carrying the ``in_progress_label`` is in flight and is skipped (the
  CronJob is stateless between ticks, so the tracker is the source of truth);
* run the real ``select_dispatchable`` over (ready issues, config, in-flight
  repos) and, for each decision, ``t3_client.dispatch(...)`` then
  ``tracker.add_label(repo, issue, in_progress_label)`` — label AFTER a
  successful dispatch so a dispatch failure never leaves a phantom lock.
"""
import pytest
from app.afk import poller
from app.afk.types import Config
# --------------------------------------------------------------------------- #
# Helpers.
# --------------------------------------------------------------------------- #
def _poller(fake_tracker, fake_t3) -> poller.Poller:
    """Construct a ``Poller`` from the given tracker and T3 fakes."""
    wiring = {"tracker": fake_tracker, "t3_client": fake_t3}
    return poller.Poller(**wiring)
def _dispatched_pairs(fake_t3) -> set[tuple[str, int]]:
    """Collapse the fake T3 client's ``dispatched`` log to a set of ``(repo, issue)`` pairs."""
    pairs = set()
    for record in fake_t3.dispatched:
        pairs.add((record["repo"], record["issue"]))
    return pairs
def _added_in_progress(fake_tracker, label: str = "agent-in-progress") -> set[tuple[str, int]]:
    """Return the ``(repo, issue)`` pairs to which ``label`` was added.

    Reads ``fake_tracker.label_ops``, a sequence of ``(op, repo, issue, label)``
    tuples, and keeps only the ``"add"`` operations for the requested label.
    """
    added = set()
    for op, repo, issue, lbl in fake_tracker.label_ops:
        if op != "add" or lbl != label:
            continue
        added.add((repo, issue))
    return added
# --------------------------------------------------------------------------- #
# Kill switch — no dispatch, no I/O at all.
# --------------------------------------------------------------------------- #
def test_kill_switch_dispatches_nothing(fake_tracker, fake_t3, make_issue):
    """With the kill switch on, a ready allowlisted issue is still not dispatched."""
    ready = make_issue(number=1, repo="infra")
    fake_tracker.seed("infra", [ready])
    disabled = Config(allowlist=["infra"], kill_switch=True)

    result = _poller(fake_tracker, fake_t3).run_once(disabled)

    assert result.dispatched == []
    assert fake_t3.dispatched == []
def test_kill_switch_does_not_even_read_the_tracker(fake_t3):
    """With the kill switch on the tick does zero I/O: no tracker read, no T3 call.

    The tracker stand-in raises if ``list_ready`` is touched, and the T3 fake's
    dispatch log is checked to be empty.
    """

    class ExplodingTracker:
        def list_ready(self, repos):
            raise AssertionError("tracker must not be read when kill switch is on")

    config = Config(allowlist=["infra"], kill_switch=True)
    result = poller.Poller(tracker=ExplodingTracker(), t3_client=fake_t3).run_once(config)
    assert result.dispatched == []
    # The T3 half of the "zero I/O" promise, previously left unchecked.
    assert fake_t3.dispatched == []
# --------------------------------------------------------------------------- #
# Empty allowlist — armed kill switch but nothing to run.
# --------------------------------------------------------------------------- #
def test_empty_allowlist_dispatches_nothing(fake_tracker, fake_t3, make_issue):
    """An armed loop (kill switch off) with an empty allowlist dispatches nothing.

    A ready issue is seeded so the assertions are not vacuous: previously the
    tracker was empty, so the test passed regardless of how the allowlist was
    handled.
    """
    fake_tracker.seed("infra", [make_issue(number=1, repo="infra")])
    # list_ready([]) is expected to return nothing, and even if it did not, the
    # policy gates on the (empty) allowlist.
    config = Config(allowlist=[], kill_switch=False)
    result = _poller(fake_tracker, fake_t3).run_once(config)
    assert result.dispatched == []
    assert fake_t3.dispatched == []
# --------------------------------------------------------------------------- #
# Happy path — one ready issue gets dispatched and labelled.
# --------------------------------------------------------------------------- #
def test_dispatches_a_ready_issue(fake_tracker, fake_t3, make_issue):
    """A single ready, allowlisted issue is dispatched and reported in the result."""
    fake_tracker.seed("infra", [make_issue(number=7, repo="infra")])
    armed = Config(allowlist=["infra"], kill_switch=False)

    result = _poller(fake_tracker, fake_t3).run_once(armed)

    assert _dispatched_pairs(fake_t3) == {("infra", 7)}
    assert len(result.dispatched) == 1
    only = result.dispatched[0]
    assert only.thread_id == "thread-0"
    assert only.issue.number == 7
def test_labels_in_progress_after_dispatch(fake_tracker, fake_t3, make_issue):
    """The dispatched issue receives the default in-progress label."""
    fake_tracker.seed("infra", [make_issue(number=7, repo="infra")])
    armed = Config(allowlist=["infra"], kill_switch=False)
    _poller(fake_tracker, fake_t3).run_once(armed)
    labelled = _added_in_progress(fake_tracker)
    assert labelled == {("infra", 7)}
def test_in_progress_label_honours_config_override(fake_tracker, fake_t3, make_issue):
    """A custom ``in_progress_label`` in the config is the label that gets added."""
    fake_tracker.seed("infra", [make_issue(number=7, repo="infra")])
    armed = Config(allowlist=["infra"], kill_switch=False, in_progress_label="busy")
    _poller(fake_tracker, fake_t3).run_once(armed)
    labelled = _added_in_progress(fake_tracker, "busy")
    assert labelled == {("infra", 7)}
def test_dispatch_prompt_references_the_issue(fake_tracker, fake_t3, make_issue):
    """The dispatched prompt is non-empty and mentions both the repo and the issue number.

    The agent fetches the issue body itself, so the prompt must at least point
    at the concrete repo#issue.
    """
    fake_tracker.seed("infra", [make_issue(number=7, repo="infra")])
    armed = Config(allowlist=["infra"], kill_switch=False)
    _poller(fake_tracker, fake_t3).run_once(armed)

    prompt = fake_t3.dispatched[0]["prompt"]
    assert "7" in prompt
    assert "infra" in prompt
    assert prompt.strip()  # non-empty
# --------------------------------------------------------------------------- #
# Per-repo lock — an issue already carrying the in-progress label means an agent
# is in flight on that repo, so the repo is skipped this tick.
# --------------------------------------------------------------------------- #
def test_repo_with_in_progress_issue_is_locked(fake_tracker, fake_t3, make_issue):
    """A repo that already has an in-progress issue gets no new dispatch this tick."""
    busy_labels = ["ready-for-agent", "agent-in-progress"]
    already_running = make_issue(number=1, repo="infra", labels=busy_labels)
    queued = make_issue(number=2, repo="infra", labels=["ready-for-agent"])
    fake_tracker.seed("infra", [already_running, queued])
    armed = Config(allowlist=["infra"], kill_switch=False)

    result = _poller(fake_tracker, fake_t3).run_once(armed)

    # Repo already busy → nothing new dispatched, no new in-progress label.
    assert result.dispatched == []
    assert fake_t3.dispatched == []
    assert _added_in_progress(fake_tracker) == set()
def test_lock_is_per_repo_not_global(fake_tracker, fake_t3, make_issue):
    """A busy repo does not stop a different, free repo from dispatching."""
    busy = make_issue(
        number=1, repo="infra", labels=["ready-for-agent", "agent-in-progress"]
    )
    fake_tracker.seed("infra", [busy])
    fake_tracker.seed("dotfiles", [make_issue(number=2, repo="dotfiles")])
    armed = Config(allowlist=["infra", "dotfiles"], kill_switch=False)

    result = _poller(fake_tracker, fake_t3).run_once(armed)

    assert _dispatched_pairs(fake_t3) == {("dotfiles", 2)}
    dispatched_repos = {d.issue.repo for d in result.dispatched}
    assert dispatched_repos == {"dotfiles"}
def test_custom_in_progress_label_drives_the_lock(fake_tracker, fake_t3, make_issue):
    """The per-repo lock keys off ``config.in_progress_label``, not the default label.

    An issue carrying the custom label ``busy`` must lock its repo when the
    config names ``busy`` as the in-progress label.
    """
    fake_tracker.seed(
        "infra",
        [make_issue(number=1, repo="infra", labels=["ready-for-agent", "busy"])],
    )
    config = Config(allowlist=["infra"], kill_switch=False, in_progress_label="busy")
    result = _poller(fake_tracker, fake_t3).run_once(config)
    assert result.dispatched == []
    # Check the I/O edges as well as the result summary, matching the
    # default-label lock test.
    assert fake_t3.dispatched == []
    assert _added_in_progress(fake_tracker, "busy") == set()
# --------------------------------------------------------------------------- #
# One dispatch per repo per tick (the policy's one-agent-per-repo invariant,
# observed through the poller): highest-priority eligible issue wins the slot.
# --------------------------------------------------------------------------- #
def test_one_dispatch_per_repo_per_tick(fake_tracker, fake_t3, make_issue):
    """Of three ready issues in one repo, only the highest-priority one is dispatched and labelled."""
    low = make_issue(number=1, repo="infra", priority=1)
    top = make_issue(number=2, repo="infra", priority=9)  # highest priority
    middle = make_issue(number=3, repo="infra", priority=5)
    fake_tracker.seed("infra", [low, top, middle])
    armed = Config(allowlist=["infra"], kill_switch=False)

    _poller(fake_tracker, fake_t3).run_once(armed)

    winner = {("infra", 2)}
    assert _dispatched_pairs(fake_t3) == winner
    assert _added_in_progress(fake_tracker) == winner
# --------------------------------------------------------------------------- #
# Gating still applies through the poller (the pure policy enforces it; the
# poller must not bypass it).
# --------------------------------------------------------------------------- #
def test_untrusted_issue_is_not_dispatched(fake_tracker, fake_t3, make_issue):
    """An issue not labelled by a trusted user is never dispatched through the poller."""
    untrusted = make_issue(number=1, repo="infra", labeled_by_trusted=False)
    fake_tracker.seed("infra", [untrusted])
    armed = Config(allowlist=["infra"], kill_switch=False)

    result = _poller(fake_tracker, fake_t3).run_once(armed)

    assert result.dispatched == []
    assert fake_t3.dispatched == []
def test_blocked_issue_is_not_dispatched(fake_tracker, fake_t3, make_issue):
    """An issue with a non-empty ``blocked_by`` is never dispatched through the poller."""
    fake_tracker.seed(
        "infra", [make_issue(number=2, repo="infra", blocked_by=[1])]
    )
    config = Config(allowlist=["infra"], kill_switch=False)
    result = _poller(fake_tracker, fake_t3).run_once(config)
    assert result.dispatched == []
    # Also confirm the T3 edge was not touched, as the untrusted-issue test does.
    assert fake_t3.dispatched == []
def test_repo_outside_allowlist_is_not_dispatched(fake_tracker, fake_t3, make_issue):
    """A ready issue in a repo that is not on the allowlist is never dispatched."""
    # list_ready only queries the allowlist, but even if a stray repo's issues
    # arrive the policy's allowlist gate drops them.
    fake_tracker.seed("secret", [make_issue(number=1, repo="secret")])
    config = Config(allowlist=["infra"], kill_switch=False)
    result = _poller(fake_tracker, fake_t3).run_once(config)
    assert result.dispatched == []
    # Also confirm the T3 edge was not touched, as the untrusted-issue test does.
    assert fake_t3.dispatched == []
# --------------------------------------------------------------------------- #
# Dispatch failure must not leave a phantom lock (label only AFTER success).
# --------------------------------------------------------------------------- #
def test_dispatch_failure_does_not_label_in_progress(fake_tracker, make_issue):
    """When ``dispatch`` raises, the error propagates and no in-progress label is added."""

    class FailingT3:
        """T3 stand-in whose ``dispatch`` always raises."""

        def __init__(self):
            self.dispatched = []

        def dispatch(self, repo, issue, prompt):
            raise RuntimeError("T3 down")

    fake_tracker.seed("infra", [make_issue(number=7, repo="infra")])
    armed = Config(allowlist=["infra"], kill_switch=False)
    broken_poller = poller.Poller(tracker=fake_tracker, t3_client=FailingT3())

    with pytest.raises(RuntimeError):
        broken_poller.run_once(armed)

    # The issue stays purely ready, so the next tick retries it rather than
    # treating it as locked.
    assert _added_in_progress(fake_tracker) == set()
# --------------------------------------------------------------------------- #
# list_ready is called with exactly the allowlist (not all repos).
# --------------------------------------------------------------------------- #
def test_queries_only_the_allowlisted_repos(fake_t3, make_issue):
    """``list_ready`` is called exactly once, with the allowlist in config order."""
    calls: list[list[str]] = []

    class RecordingTracker:
        """Tracker stand-in that logs each ``list_ready`` argument and returns no issues."""

        def list_ready(self, repos):
            calls.append(list(repos))
            return []

        def add_label(self, *a):  # pragma: no cover - not reached here
            raise AssertionError("nothing to label")

    armed = Config(allowlist=["infra", "dotfiles"], kill_switch=False)
    poller.Poller(tracker=RecordingTracker(), t3_client=fake_t3).run_once(armed)

    assert calls == [["infra", "dotfiles"]]

View file

@ -0,0 +1,190 @@
"""Tests for ``app.afk.run_state_machine.next_action`` — the pure decision
function that turns one assembled ``RunState`` into the next ``Action``.
The function encodes ADR-0002's run lifecycle:
* healthy (pushed AND CI green) -> CLOSE_SUCCESS
* cannot reach green before push (errored /
stalled with nothing pushed) -> ESCALATE_PREPUSH
* pushed but CI red, budget remaining -> FIX_FORWARD
* pushed but CI red, budget exhausted -> FREEZE_ESCALATE
* anything still in flight -> WAIT
It is PURE: no I/O, no clock, no globals — it reads only its two arguments, so
every case is a plain table assertion. ``make_config`` / ``make_run_state`` come
from ``conftest.py`` (config defaults to ENABLED, run state to a fresh dispatch).
"""
import pytest
from app.afk.run_state_machine import next_action
from app.afk.types import Action, CIStatus, ThreadStatus
# --------------------------------------------------------------------------- #
# Healthy terminal: pushed + CI green -> close, regardless of thread status.
# --------------------------------------------------------------------------- #
@pytest.mark.parametrize(
    "thread_status",
    [ThreadStatus.RUNNING, ThreadStatus.IDLE, ThreadStatus.ERROR, None],
)
def test_pushed_and_green_closes_success(make_config, make_run_state, thread_status):
    """Pushed with green CI yields ``CLOSE_SUCCESS`` for every thread status."""
    state = make_run_state(
        thread_status=thread_status,
        ci_status=CIStatus.GREEN,
        pushed=True,
    )
    action = next_action(state, make_config())
    assert action is Action.CLOSE_SUCCESS
# --------------------------------------------------------------------------- #
# Pre-push escalation: nothing pushed and the turn is no longer going to push
# (errored, or finished/stalled clean) -> hand back to a human.
# --------------------------------------------------------------------------- #
@pytest.mark.parametrize("thread_status", [ThreadStatus.ERROR, ThreadStatus.IDLE])
@pytest.mark.parametrize("ci_status", [None, CIStatus.PENDING])
def test_not_pushed_terminal_thread_escalates_prepush(
    make_config, make_run_state, thread_status, ci_status
):
    """Nothing pushed and the thread is ERROR or IDLE yields ``ESCALATE_PREPUSH``."""
    state = make_run_state(
        thread_status=thread_status,
        ci_status=ci_status,
        pushed=False,
    )
    action = next_action(state, make_config())
    assert action is Action.ESCALATE_PREPUSH
# --------------------------------------------------------------------------- #
# Still working toward a first push -> WAIT (not yet an escalation).
# --------------------------------------------------------------------------- #
@pytest.mark.parametrize("thread_status", [ThreadStatus.RUNNING, None])
@pytest.mark.parametrize("ci_status", [None, CIStatus.PENDING])
def test_not_pushed_in_flight_waits(
    make_config, make_run_state, thread_status, ci_status
):
    """Nothing pushed yet, thread running or unknown: the run is in flight, so WAIT."""
    in_flight = make_run_state(
        pushed=False,
        ci_status=ci_status,
        thread_status=thread_status,
    )
    decision = next_action(in_flight, make_config())
    assert decision is Action.WAIT
# --------------------------------------------------------------------------- #
# Pushed, CI not yet decided -> WAIT for the verdict, whatever the thread does.
# --------------------------------------------------------------------------- #
@pytest.mark.parametrize(
    "thread_status",
    [ThreadStatus.RUNNING, ThreadStatus.IDLE, ThreadStatus.ERROR, None],
)
@pytest.mark.parametrize("ci_status", [None, CIStatus.PENDING])
def test_pushed_ci_pending_waits(
    make_config, make_run_state, thread_status, ci_status
):
    """Pushed but CI has no verdict (absent or pending): WAIT for any thread status."""
    awaiting_ci = make_run_state(
        pushed=True,
        ci_status=ci_status,
        thread_status=thread_status,
    )
    decision = next_action(awaiting_ci, make_config())
    assert decision is Action.WAIT
# --------------------------------------------------------------------------- #
# Pushed + CI red: fix-forward while BOTH budgets remain, else freeze.
# Boundaries are strict-less-than on attempts AND elapsed; at/over either freezes.
# --------------------------------------------------------------------------- #
@pytest.mark.parametrize(
    ("attempts", "elapsed", "expected"),
    [
        # Budget untouched or barely used: keep fixing forward.
        (0, 0.0, Action.FIX_FORWARD),
        (1, 10.0, Action.FIX_FORWARD),
        # Last values under each cap (4 attempts, 3599 s): still fix forward.
        (4, 3599.0, Action.FIX_FORWARD),
        # Attempts at or past the cap of 5: freeze.
        (5, 0.0, Action.FREEZE_ESCALATE),
        (6, 0.0, Action.FREEZE_ESCALATE),
        # Elapsed at or past the 3600 s cap: freeze despite spare attempts.
        (0, 3600.0, Action.FREEZE_ESCALATE),
        (0, 7200.0, Action.FREEZE_ESCALATE),
        # Both caps reached together: freeze.
        (5, 3600.0, Action.FREEZE_ESCALATE),
    ],
)
def test_pushed_red_fix_forward_until_budget_exhausted(
    make_config, make_run_state, attempts, elapsed, expected
):
    """Pushed + red CI fixes forward only while both budgets are strictly under cap."""
    red_run = make_run_state(
        pushed=True,
        ci_status=CIStatus.RED,
        thread_status=ThreadStatus.IDLE,
        elapsed_seconds=elapsed,
        fix_forward_attempts=attempts,
    )
    decision = next_action(red_run, make_config())
    assert decision is expected
# --------------------------------------------------------------------------- #
# Fix-forward budget is honoured from config, not hardcoded.
# --------------------------------------------------------------------------- #
def test_fix_forward_attempts_cap_comes_from_config(make_config, make_run_state):
    """With the attempts cap configured to 2, attempt 1 fixes forward and attempt 2 freezes."""
    capped = make_config(fix_forward_max_attempts=2)
    red_kwargs = {
        "thread_status": ThreadStatus.IDLE,
        "ci_status": CIStatus.RED,
        "pushed": True,
    }
    below_cap = make_run_state(fix_forward_attempts=1, **red_kwargs)
    assert next_action(below_cap, capped) is Action.FIX_FORWARD
    at_cap = make_run_state(fix_forward_attempts=2, **red_kwargs)
    assert next_action(at_cap, capped) is Action.FREEZE_ESCALATE
def test_fix_forward_seconds_cap_comes_from_config(make_config, make_run_state):
    """With the clock cap configured to 120 s, 119 s fixes forward and 120 s freezes."""
    capped = make_config(fix_forward_max_seconds=120)
    red_kwargs = {
        "thread_status": ThreadStatus.IDLE,
        "ci_status": CIStatus.RED,
        "pushed": True,
    }
    below_cap = make_run_state(elapsed_seconds=119.0, **red_kwargs)
    assert next_action(below_cap, capped) is Action.FIX_FORWARD
    at_cap = make_run_state(elapsed_seconds=120.0, **red_kwargs)
    assert next_action(at_cap, capped) is Action.FREEZE_ESCALATE
# --------------------------------------------------------------------------- #
# A red CI on a pushed commit while the thread is still RUNNING a fix is, per
# spec, keyed only on (pushed AND red) + budget — thread status doesn't gate it.
# --------------------------------------------------------------------------- #
@pytest.mark.parametrize(
    "thread_status",
    [ThreadStatus.RUNNING, ThreadStatus.IDLE, ThreadStatus.ERROR, None],
)
def test_pushed_red_with_budget_fixes_forward_for_any_thread_status(
    make_config, make_run_state, thread_status
):
    """Pushed + red CI with a fresh budget fixes forward whatever the thread status."""
    fresh_red = make_run_state(
        pushed=True,
        ci_status=CIStatus.RED,
        thread_status=thread_status,
        elapsed_seconds=0.0,
        fix_forward_attempts=0,
    )
    decision = next_action(fresh_red, make_config())
    assert decision is Action.FIX_FORWARD
# --------------------------------------------------------------------------- #
# Full cross-product sanity sweep: next_action is TOTAL — it returns a real
# Action for every reachable combination, and matches the reference table.
# --------------------------------------------------------------------------- #
def _expected(thread_status, ci_status, pushed):
    """Independent reference for the decision table used by the total sweep.

    Assumes the fix-forward budget is never exhausted, so a pushed red run
    always maps to FIX_FORWARD.
    """
    if pushed:
        # Once pushed, only the CI verdict matters.
        if ci_status is CIStatus.GREEN:
            return Action.CLOSE_SUCCESS
        if ci_status is CIStatus.RED:
            return Action.FIX_FORWARD
        return Action.WAIT
    # Not pushed: an errored or idle thread will not push any more.
    if thread_status in (ThreadStatus.ERROR, ThreadStatus.IDLE):
        return Action.ESCALATE_PREPUSH
    return Action.WAIT
@pytest.mark.parametrize(
    "thread_status",
    [ThreadStatus.RUNNING, ThreadStatus.IDLE, ThreadStatus.ERROR, None],
)
@pytest.mark.parametrize("ci_status", [None, CIStatus.PENDING, CIStatus.GREEN, CIStatus.RED])
@pytest.mark.parametrize("pushed", [True, False])
def test_decision_table_is_total(
    make_config, make_run_state, thread_status, ci_status, pushed
):
    """Every (thread, CI, pushed) combination yields an Action matching the reference."""
    combo = make_run_state(
        pushed=pushed,
        ci_status=ci_status,
        thread_status=thread_status,
        elapsed_seconds=0.0,
        fix_forward_attempts=0,
    )
    decision = next_action(combo, make_config())
    assert isinstance(decision, Action)
    reference = _expected(thread_status, ci_status, pushed)
    assert decision is reference

248
tests/test_afk_t3_client.py Normal file
View file

@ -0,0 +1,248 @@
"""Tests for ``app.afk.t3_client`` — the in-cluster T3 dispatch/snapshot adapter.
Everything here runs against an in-memory FAKE HTTP transport (``FakeHttp``);
no test touches a real T3 server, GitHub/Forgejo, or the cluster. The fake
records every request and replays staged responses, so the assertions pin the
wire contract the control plane depends on:
* ``dispatch`` issues exactly TWO POSTs to ``/api/orchestration/dispatch`` —
``thread.create`` then ``thread.turn.start`` — carrying
``modelSelection.instanceId == "claudeAgent"`` and ``runtimeMode ==
"full-access"``, with ``ISSUE_IMPLEMENTER_PREAMBLE`` PREPENDED to
``message.text`` and the thread id from the first response threaded into the
second.
* each request carries the ``Authorization: Bearer <token>`` header from the
injected bearer provider (re-read per call, so token refresh is honoured).
* ``snapshot`` GETs ``/api/orchestration/snapshot`` and returns the parsed body.
"""
import pytest
from app.afk import t3_client
from app.afk.issue_implementer_prompt import ISSUE_IMPLEMENTER_PREAMBLE
# --------------------------------------------------------------------------- #
# Fake HTTP transport — httpx-shaped (``post``/``get`` → response with
# ``.json()`` + ``.raise_for_status()``), so the real client can hand the
# adapter a plain ``httpx.Client`` while tests hand it this recorder.
# --------------------------------------------------------------------------- #
class FakeResponse:
    """Minimal HTTP response double: a canned JSON payload plus a status code."""

    def __init__(self, payload: dict, status_code: int = 200) -> None:
        self.status_code = status_code
        self._payload = payload

    def json(self) -> dict:
        """Return the payload this response was constructed with."""
        return self._payload

    def raise_for_status(self) -> None:
        """Raise ``RuntimeError`` for a status of 400 or above; otherwise do nothing."""
        if self.status_code < 400:
            return
        raise RuntimeError(f"HTTP {self.status_code}")
class FakeHttp:
    """In-memory HTTP transport that logs requests and replays staged payloads.

    ``post`` serves payloads from ``post_responses`` and ``get`` from
    ``get_responses``, both first-in first-out. Every call is appended to
    ``posts`` / ``gets`` with its url and headers (plus the json body for
    POSTs) so tests can inspect the exact requests made.
    """

    def __init__(
        self,
        post_responses: list[dict] | None = None,
        get_responses: list[dict] | None = None,
    ) -> None:
        self.posts: list[dict] = []
        self.gets: list[dict] = []
        # Copy the staged payloads so popping never mutates the caller's lists.
        self.post_responses = list(post_responses) if post_responses else []
        self.get_responses = list(get_responses) if get_responses else []

    def post(self, url: str, json: dict, headers: dict) -> FakeResponse:
        """Record the POST, then answer with the next staged payload."""
        self.posts.append(dict(url=url, json=json, headers=headers))
        if self.post_responses:
            return FakeResponse(self.post_responses.pop(0))
        raise AssertionError("unexpected POST — no response staged")

    def get(self, url: str, headers: dict) -> FakeResponse:
        """Record the GET, then answer with the next staged payload."""
        self.gets.append(dict(url=url, headers=headers))
        if self.get_responses:
            return FakeResponse(self.get_responses.pop(0))
        raise AssertionError("unexpected GET — no response staged")
# Two thread.create / thread.turn.start replies the happy-path dispatch needs,
# staged in the order the two POSTs are made.
# Reply to thread.create: carries the thread id.
_CREATE_REPLY = {"threadId": "thread-abc"}
# Reply to thread.turn.start.
_TURN_REPLY = {"ok": True}
def _client(http: FakeHttp, *, base_url: str = "http://t3-afk:8080", token: str = "tok-1"):
    """Build a ``T3Client`` on the fake transport with a fixed-token bearer provider."""

    def provide_bearer():
        return token

    return t3_client.T3Client(
        http=http,
        base_url=base_url,
        bearer_provider=provide_bearer,
    )
def _dispatch(http: FakeHttp, **kw) -> str:
    """Run one dispatch through a fresh client and return its result.

    ``repo``, ``issue`` and ``prompt`` are taken from ``kw`` (with defaults);
    any remaining keywords are forwarded to ``_client``.
    """
    dispatch_args = {
        "repo": kw.pop("repo", "infra"),
        "issue": kw.pop("issue", 42),
        "prompt": kw.pop("prompt", "Do the thing."),
    }
    return _client(http, **kw).dispatch(**dispatch_args)
# --------------------------------------------------------------------------- #
# dispatch — the two-POST shape.
# --------------------------------------------------------------------------- #
def test_dispatch_issues_exactly_two_posts_to_dispatch_endpoint():
    """One dispatch makes two POSTs, both to the dispatch endpoint, and no GETs."""
    transport = FakeHttp(post_responses=[_CREATE_REPLY, _TURN_REPLY])
    _dispatch(transport)
    assert len(transport.posts) == 2
    assert transport.gets == []
    endpoint = "http://t3-afk:8080/api/orchestration/dispatch"
    for recorded in transport.posts:
        assert recorded["url"] == endpoint
def test_dispatch_first_command_is_thread_create():
    """The first POST body names the ``thread.create`` command."""
    transport = FakeHttp(post_responses=[_CREATE_REPLY, _TURN_REPLY])
    _dispatch(transport)
    first_body = transport.posts[0]["json"]
    assert first_body["command"] == "thread.create"
def test_dispatch_second_command_is_thread_turn_start():
    """The second POST body names the ``thread.turn.start`` command."""
    transport = FakeHttp(post_responses=[_CREATE_REPLY, _TURN_REPLY])
    _dispatch(transport)
    second_body = transport.posts[1]["json"]
    assert second_body["command"] == "thread.turn.start"
def test_dispatch_returns_thread_id_from_create_response():
    """``dispatch`` returns the ``threadId`` carried by the create reply."""
    transport = FakeHttp(post_responses=[{"threadId": "thread-xyz"}, _TURN_REPLY])
    returned = _dispatch(transport)
    assert returned == "thread-xyz"
def test_dispatch_threads_created_id_into_turn_start():
    """The turn-start command targets the thread id returned by thread.create."""
    transport = FakeHttp(post_responses=[{"threadId": "thread-xyz"}, _TURN_REPLY])
    _dispatch(transport)
    turn_body = transport.posts[1]["json"]
    # The id from the first reply must reappear on the second request.
    assert turn_body["threadId"] == "thread-xyz"
# --------------------------------------------------------------------------- #
# dispatch — model selection / runtime envelope (the pilot-baked constants).
# --------------------------------------------------------------------------- #
def test_dispatch_uses_claude_agent_instance_and_full_access_runtime():
    """The create body selects the ``claudeAgent`` instance in ``full-access`` mode."""
    transport = FakeHttp(post_responses=[_CREATE_REPLY, _TURN_REPLY])
    _dispatch(transport)
    body = transport.posts[0]["json"]
    assert body["modelSelection"]["instanceId"] == "claudeAgent"
    assert body["runtimeMode"] == "full-access"
def test_dispatch_create_carries_repo_and_issue():
    """The create body carries the repo and issue number passed to ``dispatch``."""
    transport = FakeHttp(post_responses=[_CREATE_REPLY, _TURN_REPLY])
    _dispatch(transport, repo="claude-agent-service", issue=7)
    body = transport.posts[0]["json"]
    assert body["repo"] == "claude-agent-service"
    assert body["issue"] == 7
# --------------------------------------------------------------------------- #
# dispatch — the preamble PREPEND (behaviour injection).
# --------------------------------------------------------------------------- #
def test_dispatch_prepends_issue_implementer_preamble_to_message_text():
    """The turn message text is exactly the preamble followed by the prompt."""
    transport = FakeHttp(post_responses=[_CREATE_REPLY, _TURN_REPLY])
    _dispatch(transport, prompt="Implement issue 42 body here.")
    sent_text = transport.posts[1]["json"]["message"]["text"]
    wanted = ISSUE_IMPLEMENTER_PREAMBLE + "Implement issue 42 body here."
    assert sent_text == wanted
def test_dispatch_preamble_comes_strictly_before_the_prompt():
    """The preamble opens the message and the untouched prompt closes it."""
    transport = FakeHttp(post_responses=[_CREATE_REPLY, _TURN_REPLY])
    _dispatch(transport, prompt="UNIQUE-PROMPT-MARKER")
    sent_text = transport.posts[1]["json"]["message"]["text"]
    assert sent_text.startswith(ISSUE_IMPLEMENTER_PREAMBLE)
    preamble_at = sent_text.index(ISSUE_IMPLEMENTER_PREAMBLE)
    marker_at = sent_text.index("UNIQUE-PROMPT-MARKER")
    assert preamble_at < marker_at
    # Nothing may follow the prompt: it is kept verbatim at the end.
    assert sent_text.endswith("UNIQUE-PROMPT-MARKER")
def test_dispatch_does_not_prepend_preamble_to_create_command():
    """The thread.create body has no ``message`` key; only the turn carries text."""
    transport = FakeHttp(post_responses=[_CREATE_REPLY, _TURN_REPLY])
    _dispatch(transport)
    create_body = transport.posts[0]["json"]
    assert "message" not in create_body
# --------------------------------------------------------------------------- #
# Auth — bearer header, read from the injected provider each call.
# --------------------------------------------------------------------------- #
def test_dispatch_sends_bearer_on_both_posts():
    """Every dispatch POST carries the bearer token in ``Authorization``."""
    transport = FakeHttp(post_responses=[_CREATE_REPLY, _TURN_REPLY])
    _dispatch(transport, token="secret-token")
    for recorded in transport.posts:
        auth = recorded["headers"]["Authorization"]
        assert auth == "Bearer secret-token"
def test_bearer_provider_is_called_per_request_so_refresh_is_honoured():
    """Each request asks the provider for a token instead of reusing a cached one."""
    # The provider hands out a different token per call; seeing tok-A then
    # tok-B on the wire shows the token was re-read for the second POST.
    rotating = iter(["tok-A", "tok-B", "tok-C"])
    transport = FakeHttp(post_responses=[_CREATE_REPLY, _TURN_REPLY])
    client = t3_client.T3Client(
        http=transport,
        base_url="http://t3-afk:8080",
        bearer_provider=lambda: next(rotating),
    )
    client.dispatch(repo="infra", issue=1, prompt="x")
    first_auth = transport.posts[0]["headers"]["Authorization"]
    assert first_auth == "Bearer tok-A"
    second_auth = transport.posts[1]["headers"]["Authorization"]
    assert second_auth == "Bearer tok-B"
# --------------------------------------------------------------------------- #
# snapshot — GET + parse.
# --------------------------------------------------------------------------- #
def test_snapshot_gets_snapshot_endpoint_and_returns_parsed_body():
    """``snapshot`` does a single GET on the snapshot endpoint and returns its body."""
    staged = {"threads": [{"id": "thread-abc", "status": "running"}]}
    transport = FakeHttp(get_responses=[staged])
    returned = _client(transport).snapshot()
    assert returned == staged
    assert len(transport.gets) == 1
    requested_url = transport.gets[0]["url"]
    assert requested_url == "http://t3-afk:8080/api/orchestration/snapshot"
    assert transport.posts == []
def test_snapshot_sends_bearer():
    """The snapshot GET carries the bearer token in ``Authorization``."""
    transport = FakeHttp(get_responses=[{"threads": []}])
    _client(transport, token="snap-token").snapshot()
    auth = transport.gets[0]["headers"]["Authorization"]
    assert auth == "Bearer snap-token"
# --------------------------------------------------------------------------- #
# base_url handling — a trailing slash must not produce a double slash.
# --------------------------------------------------------------------------- #
def test_trailing_slash_in_base_url_is_normalised():
    """A base URL ending in ``/`` still yields single-slash request URLs."""
    transport = FakeHttp(
        get_responses=[{"threads": []}],
        post_responses=[_CREATE_REPLY, _TURN_REPLY],
    )
    client = _client(transport, base_url="http://t3-afk:8080/")
    client.dispatch(repo="infra", issue=1, prompt="x")
    client.snapshot()
    post_url = transport.posts[0]["url"]
    get_url = transport.gets[0]["url"]
    assert post_url == "http://t3-afk:8080/api/orchestration/dispatch"
    assert get_url == "http://t3-afk:8080/api/orchestration/snapshot"
# --------------------------------------------------------------------------- #
# Error surfacing — a non-2xx response must raise, not be swallowed.
# --------------------------------------------------------------------------- #
def test_dispatch_raises_when_a_post_returns_an_error_status():
    """A 500 on the first POST propagates and stops dispatch before the second POST."""

    class ErroringHttp(FakeHttp):
        """Transport whose every POST is recorded and answered with HTTP 500."""

        def post(self, url: str, json: dict, headers: dict) -> FakeResponse:
            self.posts.append(dict(url=url, json=json, headers=headers))
            return FakeResponse({}, status_code=500)

    failing = ErroringHttp()
    with pytest.raises(RuntimeError):
        _dispatch(failing)
    # Only thread.create was attempted; thread.turn.start must not follow a
    # failed create.
    assert len(failing.posts) == 1

493
tests/test_afk_tracker.py Normal file
View file

@ -0,0 +1,493 @@
"""Tests for ``app.afk.tracker`` — the GitHub issues adapter.
The ``Tracker`` is the loop's read/write port onto the issue tracker. It wraps
an injected GitHub client (the real one shells out to ``gh``; here we inject a
FAKE that records calls and replays staged data) and holds all the *business*
logic the loop depends on: turning raw issues into ``Issue`` records with
``blocked_by`` parsed, ``labeled_by_trusted`` decided fail-closed from the label
event actor, and ``priority`` read off a priority label. No test here reaches a
real ``gh``, GitHub/Forgejo, or the network.
"""
import pytest
from app.afk.tracker import (
DEFAULT_TRUSTED_ASSOCIATIONS,
GitHubClient,
Tracker,
)
from app.afk.types import Issue
# --------------------------------------------------------------------------- #
# Fake GitHub client — the injected port. Records every mutating call and
# replays issues / label-events staged per repo. Implements the GitHubClient
# Protocol the Tracker depends on.
# --------------------------------------------------------------------------- #
class FakeGitHub:
    """In-memory GitHub client double: serves staged data and logs every mutation."""

    def __init__(self) -> None:
        # Staged read data: raw issue dicts per repo, label events per (repo, number).
        self._issues: dict[str, list[dict]] = {}
        self._events: dict[tuple[str, int], list[dict]] = {}
        # Mutation logs, one tuple per call, in call order.
        self.labels_added: list[tuple[str, int, str]] = []
        self.labels_removed: list[tuple[str, int, str]] = []
        self.comments: list[tuple[str, int, str]] = []
        self.closed: list[tuple[str, int]] = []

    # --- staging helpers (test-only) --- #
    def seed_issues(self, repo: str, issues: list[dict]) -> None:
        """Stage the raw issues ``list_issues`` will filter for ``repo``."""
        self._issues[repo] = issues

    def seed_label_events(self, repo: str, number: int, events: list[dict]) -> None:
        """Stage the label events ``label_events`` will return for one issue."""
        self._events[(repo, number)] = events

    # --- GitHubClient surface --- #
    def list_issues(self, repo: str, label: str) -> list[dict]:
        """Return the staged issues of ``repo`` whose labels include ``label``."""
        matching = []
        for raw in self._issues.get(repo, []):
            names = [entry["name"] for entry in raw.get("labels", [])]
            if label in names:
                matching.append(raw)
        return matching

    def label_events(self, repo: str, number: int) -> list[dict]:
        """Return a copy of the staged events for the issue (empty if none staged)."""
        staged = self._events.get((repo, number), [])
        return list(staged)

    def add_label(self, repo: str, number: int, label: str) -> None:
        """Log an add-label call."""
        self.labels_added.append((repo, number, label))

    def remove_label(self, repo: str, number: int, label: str) -> None:
        """Log a remove-label call."""
        self.labels_removed.append((repo, number, label))

    def comment(self, repo: str, number: int, body: str) -> None:
        """Log a comment call."""
        self.comments.append((repo, number, body))

    def close(self, repo: str, number: int) -> None:
        """Log a close call."""
        self.closed.append((repo, number))
# --------------------------------------------------------------------------- #
# Raw-issue / event builders matching the gh JSON shapes the real client emits.
# --------------------------------------------------------------------------- #
def _raw_issue(
number: int = 1,
labels: list[str] | None = None,
body: str = "",
) -> dict:
return {
"number": number,
"labels": [{"name": name} for name in (labels or ["ready-for-agent"])],
"body": body,
}
def _label_event(label: str, association: str = "OWNER", actor: str = "viktorbarzin") -> dict:
    """Build a ``labeled`` event dict for ``label``, applied by ``actor``.

    Only the fields these tests rely on are included: the event kind, the
    label name, the actor login and the author association.
    """
    event = {"event": "labeled"}
    event["label"] = {"name": label}
    event["actor"] = {"login": actor}
    event["author_association"] = association
    return event
@pytest.fixture
def gh() -> FakeGitHub:
    """Provide a fresh, empty ``FakeGitHub`` for each test."""
    fake = FakeGitHub()
    return fake
@pytest.fixture
def tracker(gh: FakeGitHub) -> Tracker:
    """Provide a default-configured ``Tracker`` wrapping the ``gh`` fake."""
    wrapped = Tracker(gh)
    return wrapped
# --------------------------------------------------------------------------- #
# Construction / contract.
# --------------------------------------------------------------------------- #
def test_tracker_wraps_injected_client(gh: FakeGitHub):
    """The Tracker exposes the very client object it was constructed with."""
    wrapped = Tracker(gh)
    assert wrapped.client is gh
def test_fake_satisfies_protocol(gh: FakeGitHub):
    """The fake passes an ``isinstance`` check against ``GitHubClient``."""
    # Structural typing: the fake never inherits from the protocol.
    conforms = isinstance(gh, GitHubClient)
    assert conforms
def test_default_trusted_associations_are_collaborator_or_above():
    """The default trust set is exactly OWNER, MEMBER and COLLABORATOR."""
    wanted = frozenset(("OWNER", "MEMBER", "COLLABORATOR"))
    assert DEFAULT_TRUSTED_ASSOCIATIONS == wanted
# --------------------------------------------------------------------------- #
# list_ready — the read path that builds Issue records.
# --------------------------------------------------------------------------- #
def test_list_ready_returns_issue_objects(gh: FakeGitHub, tracker: Tracker):
    """A staged ready issue comes back as one ``Issue`` with number, repo and labels."""
    gh.seed_issues("infra", [_raw_issue(number=7)])
    gh.seed_label_events("infra", 7, [_label_event("ready-for-agent")])
    ready = tracker.list_ready(["infra"])
    assert len(ready) == 1
    (only,) = ready
    assert isinstance(only, Issue)
    assert only.number == 7
    assert only.repo == "infra"
    assert only.labels == ["ready-for-agent"]
def test_list_ready_spans_multiple_repos(gh: FakeGitHub, tracker: Tracker):
    """Ready issues from every requested repo are returned together."""
    staged = (("infra", 1), ("crawler", 2))
    for repo, number in staged:
        gh.seed_issues(repo, [_raw_issue(number=number)])
    for repo, number in staged:
        gh.seed_label_events(repo, number, [_label_event("ready-for-agent")])
    ready = tracker.list_ready(["infra", "crawler"])
    found = {(issue.repo, issue.number) for issue in ready}
    assert found == {("infra", 1), ("crawler", 2)}
def test_list_ready_empty_when_no_ready_issues(gh: FakeGitHub, tracker: Tracker):
    """An issue without the ready label is not listed."""
    not_ready = _raw_issue(number=1, labels=["bug"])
    gh.seed_issues("infra", [not_ready])
    ready = tracker.list_ready(["infra"])
    assert ready == []
def test_list_ready_queries_with_configured_ready_label(gh: FakeGitHub):
    """A Tracker given a custom ready label queries the client with that label."""
    queried: list[str] = []

    class _RecordingGitHub(FakeGitHub):
        """FakeGitHub that also notes each label ``list_issues`` is asked for."""

        def list_issues(self, repo: str, label: str) -> list[dict]:
            queried.append(label)
            return super().list_issues(repo, label)

    recording = _RecordingGitHub()
    recording.seed_issues("infra", [_raw_issue(number=1, labels=["queue-me"])])
    recording.seed_label_events("infra", 1, [_label_event("queue-me")])
    custom = Tracker(recording, ready_label="queue-me")
    ready = custom.list_ready(["infra"])
    # The most recent query must have used the custom label, not the default.
    assert queried[-1] == "queue-me"
    assert len(ready) == 1
# --------------------------------------------------------------------------- #
# Trust gate — labeled_by_trusted is decided from the label-event actor,
# fail-closed.
# --------------------------------------------------------------------------- #
def test_owner_labeled_issue_is_trusted(gh: FakeGitHub, tracker: Tracker):
    """A ready label applied by an OWNER marks the issue as trusted."""
    gh.seed_issues("infra", [_raw_issue(number=1)])
    owner_event = _label_event("ready-for-agent", association="OWNER")
    gh.seed_label_events("infra", 1, [owner_event])
    issue = tracker.list_ready(["infra"])[0]
    assert issue.labeled_by_trusted is True
@pytest.mark.parametrize("association", ["MEMBER", "COLLABORATOR"])
def test_collaborator_and_member_are_trusted(gh: FakeGitHub, tracker: Tracker, association: str):
    """MEMBER and COLLABORATOR labelers are trusted by default."""
    gh.seed_issues("infra", [_raw_issue(number=1)])
    event = _label_event("ready-for-agent", association=association)
    gh.seed_label_events("infra", 1, [event])
    issue = tracker.list_ready(["infra"])[0]
    assert issue.labeled_by_trusted is True
@pytest.mark.parametrize("association", ["NONE", "CONTRIBUTOR", "FIRST_TIME_CONTRIBUTOR", ""])
def test_untrusted_association_is_not_trusted(gh: FakeGitHub, tracker: Tracker, association: str):
    """Associations outside the trust set, including an empty one, are not trusted."""
    gh.seed_issues("infra", [_raw_issue(number=1)])
    event = _label_event("ready-for-agent", association=association)
    gh.seed_label_events("infra", 1, [event])
    issue = tracker.list_ready(["infra"])[0]
    assert issue.labeled_by_trusted is False
def test_missing_label_event_is_not_trusted(gh: FakeGitHub, tracker: Tracker):
    """A ready label with no recorded labeling event is treated as untrusted."""
    gh.seed_issues("infra", [_raw_issue(number=1)])
    # No event says who applied the label, so trust must fail closed.
    gh.seed_label_events("infra", 1, [])
    issue = tracker.list_ready(["infra"])[0]
    assert issue.labeled_by_trusted is False
def test_trust_uses_latest_application_of_ready_label(gh: FakeGitHub, tracker: Tracker):
    """When the ready label was applied twice, the later application decides trust."""
    gh.seed_issues("infra", [_raw_issue(number=1)])
    # Untrusted first, trusted second: the most recent labeling wins.
    earlier = _label_event("ready-for-agent", association="NONE", actor="drive-by")
    later = _label_event("ready-for-agent", association="OWNER", actor="viktorbarzin")
    gh.seed_label_events("infra", 1, [earlier, later])
    issue = tracker.list_ready(["infra"])[0]
    assert issue.labeled_by_trusted is True
def test_trust_ignores_events_for_other_labels(gh: FakeGitHub, tracker: Tracker):
    """Trust comes only from the ready label's event, not from other labels' events."""
    gh.seed_issues("infra", [_raw_issue(number=1)])
    # An OWNER applied a different label; the ready label came from a stranger.
    other_label = _label_event("priority:high", association="OWNER")
    ready_label = _label_event("ready-for-agent", association="NONE", actor="drive-by")
    gh.seed_label_events("infra", 1, [other_label, ready_label])
    issue = tracker.list_ready(["infra"])[0]
    assert issue.labeled_by_trusted is False
def test_custom_trusted_associations_override_default(gh: FakeGitHub):
    """With trust narrowed to OWNER, a COLLABORATOR's ready label is not trusted."""
    owner_only = Tracker(gh, trusted_associations=frozenset({"OWNER"}))
    gh.seed_issues("infra", [_raw_issue(number=1)])
    event = _label_event("ready-for-agent", association="COLLABORATOR")
    gh.seed_label_events("infra", 1, [event])
    issue = owner_only.list_ready(["infra"])[0]
    assert issue.labeled_by_trusted is False
# --------------------------------------------------------------------------- #
# blocked_by — parsed from the issue body's "Blocked by" references.
# --------------------------------------------------------------------------- #
def test_blocked_by_empty_when_body_has_no_references(gh: FakeGitHub, tracker: Tracker):
    """A body without any blocker reference yields an empty ``blocked_by``."""
    raw = _raw_issue(number=1, body="just implement the thing")
    gh.seed_issues("infra", [raw])
    gh.seed_label_events("infra", 1, [_label_event("ready-for-agent")])
    issue = tracker.list_ready(["infra"])[0]
    assert issue.blocked_by == []
def test_blocked_by_parses_single_reference(gh: FakeGitHub, tracker: Tracker):
    """``Blocked by #3`` yields ``blocked_by == [3]``."""
    raw = _raw_issue(number=5, body="Blocked by #3")
    gh.seed_issues("infra", [raw])
    gh.seed_label_events("infra", 5, [_label_event("ready-for-agent")])
    issue = tracker.list_ready(["infra"])[0]
    assert issue.blocked_by == [3]
def test_blocked_by_parses_multiple_references(gh: FakeGitHub, tracker: Tracker):
    """Several references in one clause are all collected, in order."""
    raw = _raw_issue(number=9, body="Blocked by #3, #4 and #10")
    gh.seed_issues("infra", [raw])
    gh.seed_label_events("infra", 9, [_label_event("ready-for-agent")])
    issue = tracker.list_ready(["infra"])[0]
    assert issue.blocked_by == [3, 4, 10]
def test_blocked_by_is_case_insensitive_and_dedupes(gh: FakeGitHub, tracker: Tracker):
    """Clause matching ignores case and a repeated reference is listed once."""
    raw = _raw_issue(number=9, body="blocked BY #3 and Blocked by #3, #4")
    gh.seed_issues("infra", [raw])
    gh.seed_label_events("infra", 9, [_label_event("ready-for-agent")])
    issue = tracker.list_ready(["infra"])[0]
    assert issue.blocked_by == [3, 4]
def test_blocked_by_ignores_plain_issue_mentions(gh: FakeGitHub, tracker: Tracker):
    """An issue mention outside a ``Blocked by`` clause does not count as a blocker."""
    # "#7" appears only as context; "#3" is the sole real blocker.
    raw = _raw_issue(number=9, body="See #7 for context. Blocked by #3")
    gh.seed_issues("infra", [raw])
    gh.seed_label_events("infra", 9, [_label_event("ready-for-agent")])
    issue = tracker.list_ready(["infra"])[0]
    assert issue.blocked_by == [3]
def test_blocked_by_tolerates_missing_body(gh: FakeGitHub, tracker: Tracker):
    """A ``None`` body is handled and yields no blockers."""
    raw = _raw_issue(number=1)
    raw["body"] = None  # gh returns null for an empty body
    gh.seed_issues("infra", [raw])
    gh.seed_label_events("infra", 1, [_label_event("ready-for-agent")])
    issue = tracker.list_ready(["infra"])[0]
    assert issue.blocked_by == []
# --------------------------------------------------------------------------- #
# priority — read off a priority label (lower number runs first).
# --------------------------------------------------------------------------- #
def test_priority_defaults_to_zero_without_priority_label(gh: FakeGitHub, tracker: Tracker):
    """An issue with no priority label gets priority 0."""
    raw = _raw_issue(number=1, labels=["ready-for-agent"])
    gh.seed_issues("infra", [raw])
    gh.seed_label_events("infra", 1, [_label_event("ready-for-agent")])
    issue = tracker.list_ready(["infra"])[0]
    assert issue.priority == 0
def test_priority_read_from_priority_label(gh: FakeGitHub, tracker: Tracker):
    """A ``priority:2`` label sets the issue priority to 2."""
    raw = _raw_issue(number=1, labels=["ready-for-agent", "priority:2"])
    gh.seed_issues("infra", [raw])
    gh.seed_label_events("infra", 1, [_label_event("ready-for-agent")])
    issue = tracker.list_ready(["infra"])[0]
    assert issue.priority == 2
def test_priority_lowest_label_wins_when_several(gh: FakeGitHub, tracker: Tracker):
    """With several priority labels, the smallest number is used."""
    raw = _raw_issue(number=1, labels=["ready-for-agent", "priority:5", "priority:1"])
    gh.seed_issues("infra", [raw])
    gh.seed_label_events("infra", 1, [_label_event("ready-for-agent")])
    issue = tracker.list_ready(["infra"])[0]
    assert issue.priority == 1
def test_priority_ignores_non_numeric_priority_label(gh: FakeGitHub, tracker: Tracker):
    """A non-numeric priority label such as ``priority:high`` leaves priority at 0."""
    raw = _raw_issue(number=1, labels=["ready-for-agent", "priority:high"])
    gh.seed_issues("infra", [raw])
    gh.seed_label_events("infra", 1, [_label_event("ready-for-agent")])
    issue = tracker.list_ready(["infra"])[0]
    assert issue.priority == 0
# --------------------------------------------------------------------------- #
# Mutations delegate to the injected client.
# --------------------------------------------------------------------------- #
def test_add_label_delegates(gh: FakeGitHub, tracker: Tracker):
    """``Tracker.add_label`` forwards its arguments to the client unchanged."""
    tracker.add_label("infra", 7, "agent-in-progress")
    recorded = gh.labels_added
    assert recorded == [("infra", 7, "agent-in-progress")]
def test_remove_label_delegates(gh: FakeGitHub, tracker: Tracker):
    """``Tracker.remove_label`` forwards its arguments to the client unchanged."""
    tracker.remove_label("infra", 7, "agent-in-progress")
    recorded = gh.labels_removed
    assert recorded == [("infra", 7, "agent-in-progress")]
def test_comment_delegates(gh: FakeGitHub, tracker: Tracker):
    """``Tracker.comment`` forwards its arguments to the client unchanged."""
    tracker.comment("infra", 7, "phase: tests-red done")
    recorded = gh.comments
    assert recorded == [("infra", 7, "phase: tests-red done")]
def test_close_delegates(gh: FakeGitHub, tracker: Tracker):
    """``Tracker.close`` forwards its arguments to the client unchanged."""
    tracker.close("infra", 7)
    recorded = gh.closed
    assert recorded == [("infra", 7)]
# --------------------------------------------------------------------------- #
# The concrete gh-CLI-backed client builds no-shell argv and parses JSON; we
# inject a fake runner so no real `gh` is ever spawned.
# --------------------------------------------------------------------------- #
from app.afk.tracker import GhCliClient # noqa: E402
class _FakeRunner:
"""Stand-in for the subprocess runner GhCliClient shells out through.
Records every argv and returns staged stdout per command, so we can pin the
exact `gh` invocations without spawning a process.
"""
def __init__(self, responses: dict[tuple[str, ...], str] | None = None) -> None:
self.calls: list[tuple[str, ...]] = []
self._responses = responses or {}
def __call__(self, argv: list[str]) -> str:
key = tuple(argv)
self.calls.append(key)
return self._responses.get(key, "")
def test_gh_cli_list_issues_builds_no_shell_argv_and_parses_json():
    """``list_issues`` runs one exact ``gh issue list`` argv and decodes its JSON output."""
    expected_argv = (
        "gh", "issue", "list", "--repo", "owner/infra",
        "--label", "ready-for-agent", "--state", "open",
        "--json", "number,labels,body", "--limit", "100",
    )
    stdout = '[{"number": 4, "labels": [{"name": "ready-for-agent"}], "body": "x"}]'
    runner = _FakeRunner({expected_argv: stdout})
    client = GhCliClient(repo_owner="owner", run=runner)
    listed = client.list_issues("infra", "ready-for-agent")
    assert runner.calls == [expected_argv]
    decoded = {"number": 4, "labels": [{"name": "ready-for-agent"}], "body": "x"}
    assert listed == [decoded]
def test_gh_cli_list_issues_empty_output_is_empty_list():
    """Empty runner output is decoded as an empty issue list."""
    silent = _FakeRunner()  # nothing staged, so every command yields ""
    client = GhCliClient(repo_owner="owner", run=silent)
    listed = client.list_issues("infra", "ready-for-agent")
    assert listed == []
def test_gh_cli_label_events_filters_labeled_events():
    """``label_events`` fetches the issue timeline and keeps only ``labeled`` events."""
    expected_argv = (
        "gh", "api",
        "repos/owner/infra/issues/4/timeline",
        "--paginate",
        "-H", "Accept: application/vnd.github+json",
    )
    # One comment event and one labeled event; only the latter should survive.
    stdout = (
        '[{"event": "commented"},'
        ' {"event": "labeled", "label": {"name": "ready-for-agent"},'
        ' "actor": {"login": "viktorbarzin"}, "author_association": "OWNER"}]'
    )
    runner = _FakeRunner({expected_argv: stdout})
    client = GhCliClient(repo_owner="owner", run=runner)
    kept = client.label_events("infra", 4)
    assert runner.calls == [expected_argv]
    kinds = [event["event"] for event in kept]
    assert kinds == ["labeled"]
    assert kept[0]["label"]["name"] == "ready-for-agent"
def test_gh_cli_add_label_builds_argv():
    """``add_label`` runs ``gh issue edit … --add-label`` with the exact argv."""
    runner = _FakeRunner()
    GhCliClient(repo_owner="owner", run=runner).add_label("infra", 4, "agent-in-progress")
    expected_argv = (
        "gh", "issue", "edit", "4", "--repo", "owner/infra",
        "--add-label", "agent-in-progress",
    )
    assert runner.calls == [expected_argv]
def test_gh_cli_remove_label_builds_argv():
    """``remove_label`` issues a single ``gh issue edit ... --remove-label`` invocation."""
    fake_run = _FakeRunner()
    gh = GhCliClient(repo_owner="owner", run=fake_run)

    gh.remove_label("infra", 4, "agent-in-progress")

    expected_argv = (
        "gh", "issue", "edit", "4",
        "--repo", "owner/infra",
        "--remove-label", "agent-in-progress",
    )
    assert fake_run.calls == [expected_argv]
def test_gh_cli_comment_builds_argv():
    """``comment`` issues a single ``gh issue comment ... --body`` invocation."""
    fake_run = _FakeRunner()
    gh = GhCliClient(repo_owner="owner", run=fake_run)

    gh.comment("infra", 4, "phase update")

    expected_argv = (
        "gh", "issue", "comment", "4",
        "--repo", "owner/infra",
        "--body", "phase update",
    )
    assert fake_run.calls == [expected_argv]
def test_gh_cli_close_builds_argv():
    """``close`` issues a single ``gh issue close`` invocation for the issue."""
    fake_run = _FakeRunner()
    gh = GhCliClient(repo_owner="owner", run=fake_run)

    gh.close("infra", 4)

    expected_argv = ("gh", "issue", "close", "4", "--repo", "owner/infra")
    assert fake_run.calls == [expected_argv]
def test_gh_cli_end_to_end_through_tracker():
    """A real ``Tracker`` over the gh-CLI client yields a fully decoded ready issue."""
    # Canned `gh issue list` output: one ready issue with a priority label and a
    # "Blocked by" reference in its body.
    issue_list_json = (
        '[{"number": 12,'
        ' "labels": [{"name": "ready-for-agent"}, {"name": "priority:3"}],'
        ' "body": "Blocked by #11"}]'
    )
    # Canned timeline: the ready label was applied by an OWNER-associated actor.
    timeline_json = (
        '[{"event": "labeled", "label": {"name": "ready-for-agent"},'
        ' "actor": {"login": "viktorbarzin"}, "author_association": "OWNER"}]'
    )
    canned = {
        (
            "gh", "issue", "list", "--repo", "owner/infra",
            "--label", "ready-for-agent", "--state", "open",
            "--json", "number,labels,body", "--limit", "100",
        ): issue_list_json,
        (
            "gh", "api",
            "repos/owner/infra/issues/12/timeline",
            "--paginate",
            "-H", "Accept: application/vnd.github+json",
        ): timeline_json,
    }
    fake_run = _FakeRunner(canned)

    tracker = Tracker(GhCliClient(repo_owner="owner", run=fake_run))
    ready = tracker.list_ready(["infra"])
    decoded = ready[0]

    assert decoded.number == 12
    assert decoded.repo == "infra"
    assert decoded.blocked_by == [11]
    assert decoded.priority == 3
    assert decoded.labeled_by_trusted is True

349
tests/test_afk_watcher.py Normal file
View file

@ -0,0 +1,349 @@
"""Integration tests for ``app.afk.watcher`` — the in-flight run driver.
These wire the REAL pure cores (the actual ``run_state_machine.next_action`` and
``phase_checklist.render``) to the in-memory adapter FAKES from ``conftest``
(``FakeT3Client`` / ``FakeTracker`` / ``FakeCIWatcher`` / ``FakeNotifier``). No
test touches a real T3 server, GitHub/Forgejo, the cluster, or Slack — the
watcher is exercised end to end, with fakes only at the I/O edges.
What one watch tick must do (the watcher contract), given an in-flight run
``(issue, thread_id, commit, bookkeeping)``:
* assemble a ``RunState`` from ``t3_client.snapshot()`` (the thread's liveness)
+ ``ci_watcher.status(repo, commit)`` (the CI verdict, only when something is
pushed) + the run's own ``pushed`` / ``fix_forward_attempts`` /
``elapsed_seconds`` bookkeeping, and feed it to the pure state machine;
* **CLOSE_SUCCESS** -> ``tracker.close``, drop the in-progress label, post the
DONE checklist, and ring the ``done`` doorbell;
* **ESCALATE_PREPUSH / FREEZE_ESCALATE** -> drop the in-progress label, relabel
``ready-for-human``, ring the ``needs-human`` / ``frozen`` doorbell, post the
checklist — the run is handed back to a human;
* **FIX_FORWARD** -> dispatch a corrective turn (``t3_client.dispatch``), bump
the fix-forward attempt count, keep the run in flight, refresh the checklist;
NOT terminal, so no doorbell and no label churn;
* **WAIT** -> just refresh the progress checklist and keep waiting; no labels,
no close, no doorbell, no dispatch.
"""
import pytest
from app.afk import watcher
from app.afk.notifier import KIND_DONE, KIND_FROZEN, KIND_NEEDS_HUMAN
from app.afk.types import CIStatus, Issue
# --------------------------------------------------------------------------- #
# Helpers.
# --------------------------------------------------------------------------- #
# Label the escalation tests expect the watcher to add by default when a run is
# handed back to a human (no ``ready_for_human_label`` override supplied).
READY_FOR_HUMAN = "ready-for-human"
def _watcher(fake_t3, fake_tracker, fake_ci, fake_notifier) -> watcher.Watcher:
    """Build a ``Watcher`` wired to the four adapter fakes."""
    adapters = {
        "t3_client": fake_t3,
        "tracker": fake_tracker,
        "ci_watcher": fake_ci,
        "notifier": fake_notifier,
    }
    return watcher.Watcher(**adapters)
def _run(
    issue: Issue,
    thread_id: str = "thread-0",
    commit: str | None = None,
    fix_forward_attempts: int = 0,
    elapsed_seconds: float = 0.0,
) -> watcher.InFlightRun:
    """Build an ``InFlightRun`` for *issue*, passing every argument through by keyword."""
    bookkeeping = {
        "thread_id": thread_id,
        "commit": commit,
        "fix_forward_attempts": fix_forward_attempts,
        "elapsed_seconds": elapsed_seconds,
    }
    return watcher.InFlightRun(issue=issue, **bookkeeping)
def _snapshot(thread_id: str, status: str) -> dict:
    """Return a snapshot payload listing exactly one thread with the given status."""
    thread_entry = {"id": thread_id, "status": status}
    return {"threads": [thread_entry]}
def _labels(fake_tracker):
    """Return the tracker's recorded label operations as ``(op, repo, num, label)`` tuples."""
    recorded = []
    for op, repo, num, lbl in fake_tracker.label_ops:
        recorded.append((op, repo, num, lbl))
    return recorded
def _kinds(fake_notifier):
    """Return the ``kind`` of every notification the fake notifier recorded, in order."""
    kinds = []
    for notification in fake_notifier.sent:
        kinds.append(notification["kind"])
    return kinds
# --------------------------------------------------------------------------- #
# WAIT — agent still working, nothing pushed: refresh the checklist, no action.
# --------------------------------------------------------------------------- #
def test_wait_refreshes_checklist_and_does_nothing_else(
    fake_t3, fake_tracker, fake_ci, fake_notifier, make_issue, make_config
):
    """A running, unpushed thread yields WAIT: one checklist comment and nothing else."""
    issue = make_issue(number=7, repo="infra")
    fake_t3.set_snapshot(_snapshot("thread-0", "running"))

    under_test = _watcher(fake_t3, fake_tracker, fake_ci, fake_notifier)
    outcome = under_test.tick(_run(issue), make_config())

    assert outcome.action.value == "wait"
    assert outcome.terminal is False
    # Waiting must not close, relabel, notify, or dispatch anything.
    assert fake_tracker.closed == []
    assert _labels(fake_tracker) == []
    assert fake_notifier.sent == []
    assert fake_t3.dispatched == []
    # The only visible effect is a single progress-checklist comment on the issue.
    assert len(fake_tracker.comments) == 1
    comment_repo, comment_number, comment_body = fake_tracker.comments[0]
    assert (comment_repo, comment_number) == ("infra", 7)
    assert "AFK run progress" in comment_body
def test_wait_when_thread_missing_from_snapshot(
    fake_t3, fake_tracker, fake_ci, fake_notifier, make_issue, make_config
):
    """A snapshot that does not list the run's thread results in WAIT."""
    issue = make_issue(number=7, repo="infra")
    # The snapshot lists no threads at all, so this run's thread has no status yet.
    fake_t3.set_snapshot({"threads": []})

    under_test = _watcher(fake_t3, fake_tracker, fake_ci, fake_notifier)
    outcome = under_test.tick(_run(issue), make_config())

    assert outcome.action.value == "wait"
    assert outcome.terminal is False
def test_pushed_ci_pending_waits(
    fake_t3, fake_tracker, fake_ci, fake_notifier, make_issue, make_config
):
    """A pushed commit whose CI is still PENDING keeps the run waiting."""
    issue = make_issue(number=7, repo="infra")
    fake_t3.set_snapshot(_snapshot("thread-0", "running"))
    # The run carries a commit (i.e. it is pushed), but CI has no verdict yet.
    fake_ci.set_status("infra", "deadbeef", CIStatus.PENDING)

    under_test = _watcher(fake_t3, fake_tracker, fake_ci, fake_notifier)
    pushed_run = _run(issue, commit="deadbeef")
    outcome = under_test.tick(pushed_run, make_config())

    assert outcome.action.value == "wait"
    assert fake_tracker.closed == []
# --------------------------------------------------------------------------- #
# CLOSE_SUCCESS — pushed + CI green: close, unlabel, DONE checklist, doorbell.
# --------------------------------------------------------------------------- #
def test_close_success_closes_and_unlabels_and_notifies(
    fake_t3, fake_tracker, fake_ci, fake_notifier, make_issue, make_config
):
    """Pushed + green CI: the issue is closed, unlabelled, and the done doorbell rings."""
    issue = make_issue(number=7, repo="infra")
    fake_t3.set_snapshot(_snapshot("thread-0", "idle"))
    fake_ci.set_status("infra", "cafef00d", CIStatus.GREEN)

    under_test = _watcher(fake_t3, fake_tracker, fake_ci, fake_notifier)
    green_run = _run(issue, commit="cafef00d")
    outcome = under_test.tick(green_run, make_config())

    assert outcome.action.value == "close_success"
    assert outcome.terminal is True
    assert fake_tracker.closed == [("infra", 7)]
    # Happy path: the in-progress label goes away and ready-for-human is never added.
    label_ops = _labels(fake_tracker)
    assert ("remove", "infra", 7, "agent-in-progress") in label_ops
    assert ("add", "infra", 7, READY_FOR_HUMAN) not in label_ops
    # Exactly one doorbell, of kind "done", pointing at this thread and issue.
    assert _kinds(fake_notifier) == [KIND_DONE]
    doorbell = fake_notifier.sent[0]
    assert doorbell["thread_id"] == "thread-0"
    assert doorbell["issue"] is issue
def test_close_success_posts_done_checklist(
    fake_t3, fake_tracker, fake_ci, fake_notifier, make_issue, make_config
):
    """On CLOSE_SUCCESS the last comment is the DONE checklist with no open boxes."""
    issue = make_issue(number=7, repo="infra")
    fake_t3.set_snapshot(_snapshot("thread-0", "idle"))
    fake_ci.set_status("infra", "cafef00d", CIStatus.GREEN)

    under_test = _watcher(fake_t3, fake_tracker, fake_ci, fake_notifier)
    under_test.tick(_run(issue, commit="cafef00d"), make_config())

    # Comments are (repo, number, body) entries; inspect the most recent body.
    last_comment = fake_tracker.comments[-1]
    checklist = last_comment[2]
    assert "Done — issue closed" in checklist
    assert "- [ ]" not in checklist  # no unchecked item may remain at DONE
# --------------------------------------------------------------------------- #
# ESCALATE_PREPUSH — agent stalled/errored before any push: hand to a human.
# --------------------------------------------------------------------------- #
@pytest.mark.parametrize("thread_state", ["error", "idle"])
def test_escalate_prepush_relabels_and_notifies(
    fake_t3, fake_tracker, fake_ci, fake_notifier, make_issue, make_config, thread_state
):
    """An errored or idle thread with nothing pushed is escalated to a human."""
    issue = make_issue(number=7, repo="infra")
    fake_t3.set_snapshot(_snapshot("thread-0", thread_state))

    under_test = _watcher(fake_t3, fake_tracker, fake_ci, fake_notifier)
    unpushed_run = _run(issue, commit=None)
    outcome = under_test.tick(unpushed_run, make_config())

    assert outcome.action.value == "escalate_prepush"
    assert outcome.terminal is True
    # The issue stays open: a human has to pick it up.
    assert fake_tracker.closed == []
    label_ops = _labels(fake_tracker)
    assert ("remove", "infra", 7, "agent-in-progress") in label_ops
    assert ("add", "infra", 7, READY_FOR_HUMAN) in label_ops
    assert _kinds(fake_notifier) == [KIND_NEEDS_HUMAN]
# --------------------------------------------------------------------------- #
# FREEZE_ESCALATE — pushed, CI red, fix-forward budget exhausted: freeze + page.
# --------------------------------------------------------------------------- #
def test_freeze_escalate_relabels_and_notifies(
    fake_t3, fake_tracker, fake_ci, fake_notifier, make_issue, make_config
):
    """Red CI with the fix-forward budget used up freezes the run and pages a human."""
    issue = make_issue(number=7, repo="infra")
    fake_t3.set_snapshot(_snapshot("thread-0", "idle"))
    fake_ci.set_status("infra", "badc0de", CIStatus.RED)
    capped_config = make_config(fix_forward_max_attempts=3)

    under_test = _watcher(fake_t3, fake_tracker, fake_ci, fake_notifier)
    # Attempts equal the configured maximum, so no budget is left.
    exhausted_run = _run(issue, commit="badc0de", fix_forward_attempts=3)
    outcome = under_test.tick(exhausted_run, capped_config)

    assert outcome.action.value == "freeze_escalate"
    assert outcome.terminal is True
    assert fake_tracker.closed == []
    label_ops = _labels(fake_tracker)
    assert ("remove", "infra", 7, "agent-in-progress") in label_ops
    assert ("add", "infra", 7, READY_FOR_HUMAN) in label_ops
    assert _kinds(fake_notifier) == [KIND_FROZEN]
# --------------------------------------------------------------------------- #
# FIX_FORWARD — pushed, CI red, budget remaining: corrective turn, stay in flight.
# --------------------------------------------------------------------------- #
def test_fix_forward_dispatches_corrective_turn(
    fake_t3, fake_tracker, fake_ci, fake_notifier, make_issue, make_config
):
    """Red CI with budget remaining dispatches one corrective turn and stays in flight."""
    issue = make_issue(number=7, repo="infra")
    fake_t3.set_snapshot(_snapshot("thread-0", "idle"))
    fake_ci.set_status("infra", "badc0de", CIStatus.RED)
    roomy_config = make_config(fix_forward_max_attempts=5)

    under_test = _watcher(fake_t3, fake_tracker, fake_ci, fake_notifier)
    red_run = _run(issue, commit="badc0de", fix_forward_attempts=1)
    outcome = under_test.tick(red_run, roomy_config)

    assert outcome.action.value == "fix_forward"
    assert outcome.terminal is False
    # Exactly one corrective dispatch, aimed at the same repo and issue.
    assert len(fake_t3.dispatched) == 1
    corrective = fake_t3.dispatched[0]
    assert (corrective["repo"], corrective["issue"]) == ("infra", 7)
    # The bumped attempt count is reported back so the caller can carry it into
    # the next tick.
    assert outcome.fix_forward_attempts == 2
    # Still in flight: nothing closed, no hand-off label, no doorbell.
    assert fake_tracker.closed == []
    assert ("add", "infra", 7, READY_FOR_HUMAN) not in _labels(fake_tracker)
    assert fake_notifier.sent == []
def test_fix_forward_updates_thread_id_to_corrective_turn(
    fake_t3, fake_tracker, fake_ci, fake_notifier, make_issue, make_config
):
    """After a corrective dispatch the result carries the new thread id, not the old one."""
    issue = make_issue(number=7, repo="infra")
    fake_t3.set_snapshot(_snapshot("thread-0", "idle"))
    fake_ci.set_status("infra", "badc0de", CIStatus.RED)

    under_test = _watcher(fake_t3, fake_tracker, fake_ci, fake_notifier)
    stale_run = _run(issue, thread_id="thread-old", commit="badc0de")
    outcome = under_test.tick(stale_run, make_config())

    # The fake T3 client returns "thread-0" for the dispatch; the next tick must
    # poll that thread instead of the one the run started with.
    assert outcome.thread_id == "thread-0"
    assert outcome.thread_id != "thread-old"
def test_fix_forward_note_appears_in_checklist(
    fake_t3, fake_tracker, fake_ci, fake_notifier, make_issue, make_config
):
    """The checklist posted on a fix-forward tick mentions the fix-forward."""
    issue = make_issue(number=7, repo="infra")
    fake_t3.set_snapshot(_snapshot("thread-0", "idle"))
    fake_ci.set_status("infra", "badc0de", CIStatus.RED)

    under_test = _watcher(fake_t3, fake_tracker, fake_ci, fake_notifier)
    red_run = _run(issue, commit="badc0de", fix_forward_attempts=1)
    under_test.tick(red_run, make_config())

    last_comment = fake_tracker.comments[-1]
    checklist = last_comment[2]
    assert "Fix-forward" in checklist
# --------------------------------------------------------------------------- #
# Unknown / unrecognised thread status folds to "keep waiting" (fail-safe).
# --------------------------------------------------------------------------- #
def test_unknown_thread_status_waits(
    fake_t3, fake_tracker, fake_ci, fake_notifier, make_issue, make_config
):
    """An unrecognised thread status is handled like "no status yet": WAIT."""
    issue = make_issue(number=7, repo="infra")
    # "provisioning" is deliberately a status the watcher does not know about.
    fake_t3.set_snapshot(_snapshot("thread-0", "provisioning"))

    under_test = _watcher(fake_t3, fake_tracker, fake_ci, fake_notifier)
    outcome = under_test.tick(_run(issue, commit=None), make_config())

    # Fail-safe: neither escalate nor close on a status we cannot interpret.
    assert outcome.action.value == "wait"
    assert fake_tracker.closed == []
    assert fake_notifier.sent == []
# --------------------------------------------------------------------------- #
# Terminal cleanup only happens once / cleanly: a terminal tick posts exactly
# one checklist comment (no double-commenting on the way out).
# --------------------------------------------------------------------------- #
def test_terminal_tick_posts_exactly_one_checklist(
    fake_t3, fake_tracker, fake_ci, fake_notifier, make_issue, make_config
):
    """A terminal (green-CI) tick leaves exactly one checklist comment behind."""
    issue = make_issue(number=7, repo="infra")
    fake_t3.set_snapshot(_snapshot("thread-0", "idle"))
    fake_ci.set_status("infra", "cafef00d", CIStatus.GREEN)

    under_test = _watcher(fake_t3, fake_tracker, fake_ci, fake_notifier)
    under_test.tick(_run(issue, commit="cafef00d"), make_config())

    comment_count = len(fake_tracker.comments)
    assert comment_count == 1
# --------------------------------------------------------------------------- #
# CI status is only queried when something is pushed (don't hit CI for an
# unpushed run — there's no commit to check).
# --------------------------------------------------------------------------- #
def test_ci_not_queried_when_nothing_pushed(
    fake_t3, fake_tracker, fake_notifier, make_issue, make_config
):
    """With no pushed commit the watcher must not call the CI adapter at all."""

    class ExplodingCI:
        """CI stand-in whose ``status`` fails the test the moment it is called."""

        def status(self, repo, commit):
            raise AssertionError("CI must not be queried with no pushed commit")

    issue = make_issue(number=7, repo="infra")
    fake_t3.set_snapshot(_snapshot("thread-0", "running"))

    under_test = watcher.Watcher(
        t3_client=fake_t3,
        tracker=fake_tracker,
        ci_watcher=ExplodingCI(),
        notifier=fake_notifier,
    )
    outcome = under_test.tick(_run(issue, commit=None), make_config())

    assert outcome.action.value == "wait"
# --------------------------------------------------------------------------- #
# ready-for-human label is configurable.
# --------------------------------------------------------------------------- #
def test_ready_for_human_label_is_configurable(
    fake_t3, fake_tracker, fake_ci, fake_notifier, make_issue, make_config
):
    """A custom ``ready_for_human_label`` is the label added on escalation."""
    issue = make_issue(number=7, repo="infra")
    fake_t3.set_snapshot(_snapshot("thread-0", "error"))

    custom_watcher = watcher.Watcher(
        t3_client=fake_t3,
        tracker=fake_tracker,
        ci_watcher=fake_ci,
        notifier=fake_notifier,
        ready_for_human_label="needs-eyes",
    )
    custom_watcher.tick(_run(issue, commit=None), make_config())

    expected_op = ("add", "infra", 7, "needs-eyes")
    assert expected_op in _labels(fake_tracker)