From e1d20457c4c75acc38a9a8dae4020af2a58d8e6e Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sat, 18 Apr 2026 14:23:19 +0000 Subject: [PATCH] [infra/claude-agent-service] Seed beads metadata + scratch dir at runtime MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Context Review of the BeadBoard Dispatch wiring found that the claude-agent-service Dockerfile's `COPY beads/metadata.json /workspace/.beads/metadata.json` and `COPY agents/beads-task-runner.md /home/agent/.claude/agents/...` both land on paths that are volume-mounted at runtime: - `/workspace` → `claude-agent-workspace-encrypted` PVC (main.tf:394-398) - `/home/agent/.claude` → `claude-home` emptyDir (main.tf:424-427) Kubernetes mounts hide image-layer content at those paths, so the COPYs are dead. The companion commit in `claude-agent-service` restages both files to `/usr/share/agent-seed/` (an image-layer path that is never mounted). Additionally, the beads-task-runner agent rails expect `/workspace/scratch/` to exist, but nothing was creating it. ## Layout before / after ``` Before (dead COPYs): image layer runtime (mounted volumes hide the files) ----------- ----------------------------------- /workspace/ <- hidden by PVC mount .beads/ metadata.json <- UNREACHABLE /home/agent/.claude/ <- hidden by emptyDir mount agents/ beads-task-runner.md <- UNREACHABLE After (init container seeds volumes at pod start): image layer runtime ----------- ------------------------------------ /usr/share/agent-seed/ beads-metadata.json --+ beads-task-runner.md --+-> copied by seed-beads-agent init container into the mounted volumes on every pod start: /workspace/.beads/metadata.json /workspace/scratch/ /home/agent/.claude/agents/beads-task-runner.md ``` ## What ### New init container: `seed-beads-agent` - Positioned AFTER `git-init`, BEFORE the main container. 
- Uses the same service image (`${local.image}:${local.image_tag}`) — the seed files are baked in at `/usr/share/agent-seed/`. - Runs as default uid 1000 (the PVCs are already chowned by `fix-perms`). - Shell body: mkdir -p /workspace/.beads /workspace/scratch /home/agent/.claude/agents cp /usr/share/agent-seed/beads-metadata.json /workspace/.beads/metadata.json cp /usr/share/agent-seed/beads-task-runner.md /home/agent/.claude/agents/beads-task-runner.md - Mounts: `workspace` at `/workspace`, `claude-home` at `/home/agent/.claude`. - Resources: 32Mi requests / 64Mi limits (matches `fix-perms`/`copy-claude-creds`). ### Formatting - `terraform fmt -recursive` also normalised whitespace in the token-expiry locals block and the CronJob container definition. No semantic change. ## What is NOT in this change - No image tag bump. The Dockerfile refactor that produces the `/usr/share/agent-seed/` path lands in the claude-agent-service repo and will roll in on the next CI build. Until that build ships and the tag is bumped in this file, the new init container will `cp` from a path that doesn't exist yet — so do NOT apply this commit until the corresponding image tag bump is ready. The commit is declarative prep. - No changes to storage class, RBAC, Service, or any other init. - The main container mounts remain unchanged — only the init containers prepare volume contents. ## Test Plan ### Automated ``` $ terraform fmt -check -recursive stacks/claude-agent-service/ (no output — clean) $ terraform -chdir=stacks/claude-agent-service/ init -backend=false Terraform has been successfully initialized! $ terraform -chdir=stacks/claude-agent-service/ validate Warning: Deprecated Resource (pre-existing; use kubernetes_namespace_v1) Success! The configuration is valid, but there were some validation warnings as shown above. ``` ### Manual Verification (after image bump + apply) 1. 
Bump `local.image_tag` in main.tf to the SHA of a build that has `/usr/share/agent-seed/*` (verify with `docker inspect $IMAGE | jq ...` or `kubectl run tmp --image ... -- ls /usr/share/agent-seed`). 2. `scripts/tg apply stacks/claude-agent-service` 3. `kubectl -n claude-agent get pods -w` — all init containers complete. 4. `kubectl -n claude-agent exec deploy/claude-agent-service -c claude-agent-service -- ls -la /workspace/.beads/metadata.json /home/agent/.claude/agents/beads-task-runner.md /workspace/scratch` Expected: all three paths exist; first two are regular files with the expected content, `scratch` is a directory. 5. `kubectl -n claude-agent exec deploy/claude-agent-service -c claude-agent-service -- jq -r .dolt_server_host /workspace/.beads/metadata.json` Expected: `dolt.beads-server.svc.cluster.local`. Co-Authored-By: Claude Opus 4.7 (1M context) --- stacks/claude-agent-service/main.tf | 46 +++++++++++++++++++++++++---- 1 file changed, 41 insertions(+), 5 deletions(-) diff --git a/stacks/claude-agent-service/main.tf b/stacks/claude-agent-service/main.tf index 53a3d695..e07e1d75 100644 --- a/stacks/claude-agent-service/main.tf +++ b/stacks/claude-agent-service/main.tf @@ -330,6 +330,42 @@ resource "kubernetes_deployment" "claude_agent" { } } + # Seed beads metadata + beads-task-runner agent into runtime volumes. + # The Dockerfile stages these files at /usr/share/agent-seed/ (image + # layer, never mounted). Both /workspace (PVC) and /home/agent/.claude + # (emptyDir) are volume mounts that hide any image-layer content, so + # the files have to be copied in at pod start. Also creates the + # scratch directory the beads-task-runner rails expect. 
+ init_container { + name = "seed-beads-agent" + image = "${local.image}:${local.image_tag}" + command = ["sh", "-c", <<-EOT + set -e + mkdir -p /workspace/.beads /workspace/scratch /home/agent/.claude/agents + cp /usr/share/agent-seed/beads-metadata.json /workspace/.beads/metadata.json + cp /usr/share/agent-seed/beads-task-runner.md /home/agent/.claude/agents/beads-task-runner.md + EOT + ] + + volume_mount { + name = "workspace" + mount_path = "/workspace" + } + volume_mount { + name = "claude-home" + mount_path = "/home/agent/.claude" + } + + resources { + requests = { + memory = "32Mi" + } + limits = { + memory = "64Mi" + } + } + } + container { name = "claude-agent-service" image = "${local.image}:${local.image_tag}" @@ -464,9 +500,9 @@ resource "kubernetes_service" "claude_agent" { locals { claude_oauth_token_mint_epochs = { # unix seconds (UTC) — when `claude setup-token` finished minting - "primary" = 1776528429 # 2026-04-18T12:07:09Z (TOKEN2) - "spare-1" = 1776528280 # 2026-04-18T12:04:40Z (TOKEN1) - "spare-2" = 1776528429 # 2026-04-18T12:07:09Z (TOKEN2 — redundant w/ primary) + "primary" = 1776528429 # 2026-04-18T12:07:09Z (TOKEN2) + "spare-1" = 1776528280 # 2026-04-18T12:04:40Z (TOKEN1) + "spare-2" = 1776528429 # 2026-04-18T12:07:09Z (TOKEN2 — redundant w/ primary) } claude_oauth_token_ttl_seconds = 365 * 24 * 60 * 60 } @@ -502,8 +538,8 @@ resource "kubernetes_cron_job_v1" "claude_oauth_expiry_monitor" { spec { restart_policy = "OnFailure" container { - name = "push-expiry" - image = "docker.io/curlimages/curl:8.11.0" + name = "push-expiry" + image = "docker.io/curlimages/curl:8.11.0" command = ["/bin/sh", "-c", <<-EOT set -e PG='http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/claude-oauth-expiry-monitor'