From f0948493b365777836ad7a5f98b1c9490fca24d4 Mon Sep 17 00:00:00 2001
From: Viktor Barzin <vbarzin@gmail.com>
Date: Tue, 2 Jun 2026 21:02:36 +0000
Subject: [PATCH] claude-agent-service: wire parallel execution (git-crypt
 mount, memory, MAX_CONCURRENCY)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The service now runs agent calls concurrently (bounded semaphore, per-job
isolated clones) instead of single-flight. Infra side:
- mount git-crypt-key into the main container (each job re-unlocks its own clone)
- MAX_CONCURRENCY=10 env (excess calls queue FIFO)
- bump pod memory 2Gi req / 12Gi limit, cpu req 1 (Burstable, tier-aux) — sized
  for ~10 concurrent claude+terraform runs; fits node2/3/5 headroom
- docs: beads-auto-dispatch + automated-upgrades no longer describe single-slot

Service code: viktor/claude-agent-service @ 66104a3.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 docs/architecture/automated-upgrades.md |  2 +-
 docs/runbooks/beads-auto-dispatch.md    |  9 ++++---
 stacks/claude-agent-service/main.tf     | 33 ++++++++++++++++++++-----
 3 files changed, 34 insertions(+), 10 deletions(-)
diff --git a/docs/architecture/automated-upgrades.md b/docs/architecture/automated-upgrades.md
index 8ff35609..5d8b1c9e 100644
--- a/docs/architecture/automated-upgrades.md
+++ b/docs/architecture/automated-upgrades.md
@@ -141,7 +141,7 @@ curl -s -X POST "$WEBHOOK" \
   -d '{"diun_entry_status":"update","diun_entry_image":"<image>","diun_entry_imagetag":"<new_tag>","diun_entry_provider":"kubernetes"}'
 ```
 
-n8n processes all webhooks in parallel (one `claude -p` per webhook). Before bulk runs, increase the rate limit in the n8n Code node (`MAX_UPGRADES_PER_WINDOW`) and reset the counter:
+n8n processes all webhooks in parallel (one `claude -p` per webhook), and `claude-agent-service` now runs them concurrently in a bounded pool (`MAX_CONCURRENCY`, default 10; excess calls are queued) instead of serializing them behind a single-flight lock. Before bulk runs, increase the rate limit in the n8n Code node (`MAX_UPGRADES_PER_WINDOW`) and reset the counter:
 
 ```sql
 -- Reset rate limiter
diff --git a/docs/runbooks/beads-auto-dispatch.md b/docs/runbooks/beads-auto-dispatch.md
index f8bade68..dd3fb3c7 100644
--- a/docs/runbooks/beads-auto-dispatch.md
+++ b/docs/runbooks/beads-auto-dispatch.md
@@ -137,9 +137,12 @@ bd assign <id> agent     # re-arm for next dispatcher tick
 
 - **Sentinel assignee `agent`** — free-form, no Beads schema change. Any bd
   client can set it (`bd assign <id> agent`).
-- **Sequential dispatch** — matches `claude-agent-service`'s single-slot
-  `asyncio.Lock`. With a 2-min poll cadence and ~5-min average run,
-  throughput is ~12 beads/hour. Parallelism is a separate plan.
+- **One-bead-per-tick dispatch** — the dispatcher submits at most one bead
+  per 2-min tick, gated on `claude-agent-service`'s `/health` `busy` flag.
+  `busy` now means `active >= capacity` (bounded semaphore, default 10); the
+  service no longer serializes runs behind a single `asyncio.Lock`. Up to
+  ~`capacity` beads can therefore run concurrently, and the 2-min poll
+  cadence (not single-slot execution) is now what bounds ramp-up.
 - **Fixed agent (`beads-task-runner`)** — read-only rails, matches BeadBoard's
   manual Dispatch button. Broader-privilege agents stay manual.
 - **CronJob (not in-service polling, not n8n)** — matches existing infra
diff --git a/stacks/claude-agent-service/main.tf b/stacks/claude-agent-service/main.tf
index 2187f52f..d789c938 100644
--- a/stacks/claude-agent-service/main.tf
+++ b/stacks/claude-agent-service/main.tf
@@ -12,7 +12,7 @@ locals {
   namespace = "claude-agent"
   # Phase 3 cutover 2026-05-07 — see infra/docs/plans/2026-05-07-forgejo-registry-consolidation-plan.md.
   image     = "forgejo.viktorbarzin.me/viktor/claude-agent-service"
-  image_tag = "191ed5dd"
+  image_tag = "latest"
   labels = {
     app = "claude-agent-service"
   }
@@ -201,8 +201,11 @@ resource "kubernetes_cluster_role_binding" "claude_agent" {
 # For cases where the agent DOES need to persist state across pod restarts
 # (caches, ad-hoc outputs, anything that should survive a pod reschedule),
 # `module.persistent` below provides a 5Gi NFS-backed RWX volume mounted
-# at /persistent. RWX so all 3 replicas can read/write the same dir;
-# sequential job mutex in the service prevents concurrent writes.
+# at /persistent. The service now runs jobs concurrently (bounded semaphore,
+# no single-flight lock), so agents writing to /persistent must use per-job
+# paths to avoid races: per-job *workspaces* are isolated (each job has its
+# own clone under /workspace/jobs/<id>), but /persistent is shared by all
+# jobs.
 module "persistent" {
   source     = "../../modules/kubernetes/nfs_volume"
   name       = "claude-agent-persistent"
@@ -416,6 +419,14 @@ resource "kubernetes_deployment" "claude_agent" {
             value = "/workspace/infra"
           }
 
+          # Bounded concurrency: caps simultaneous agent runs; excess calls
+          # queue FIFO rather than 409/503. Each run peaks ~0.5-1.5Gi
+          # (claude + terraform), so this and the memory limit are sized together.
+          env {
+            name  = "MAX_CONCURRENCY"
+            value = "10"
+          }
+
           liveness_probe {
             http_get {
               path = "/health"
@@ -451,13 +462,23 @@ resource "kubernetes_deployment" "claude_agent" {
             mount_path = "/home/agent/.claude"
           }
 
+          # git-crypt key — each job re-unlocks its own clone, so the runtime
+          # container (not just the git-init init container) needs the key.
+          volume_mount {
+            name       = "git-crypt-key"
+            mount_path = "/secrets/git-crypt"
+          }
+
+          # Burstable (tier-aux). Sized for ~10 concurrent agent runs at ~0.5-1.5Gi
+          # each (see MAX_CONCURRENCY). NOTE(review): 10 x 1.5Gi > 12Gi limit — confirm.
+          # No CPU limit per cluster policy (CFS throttling); request only.
           resources {
             requests = {
-              cpu    = "500m"
-              memory = "1Gi"
+              cpu    = "1"
+              memory = "2Gi"
             }
             limits = {
-              memory = "2Gi"
+              memory = "12Gi"
             }
           }
         }