# =============================================================================
# t3-afk — dedicated, in-cluster T3 Code instance: the EXECUTOR + COCKPIT for the
# AFK implementation pipeline (slice #2 of claude-agent-service PRD #1).
#
# claude-agent-service (control plane) dispatches issues INTO this T3 instance
# over its orchestration HTTP API; T3 runs the issue-implementer agent in a git
# worktree and shows every worker in its cockpit. See:
#   claude-agent-service/docs/2026-06-14-afk-implementation-pipeline-design.md
#   claude-agent-service/docs/adr/0003-t3-thin-executor-and-cockpit.md
#
# PILOT SHORTCUT (chosen 2026-06-14): no custom-built image. We run stock
# `node:24` (the full image ships git + python3/make/g++ for node-pty) and an
# init container installs PINNED npm packages (t3@0.0.27 + the Claude CLI) onto
# the SSD PVC, cached across restarts. Formalize a digest-pinned built image
# post-GO. T3 is version-pinned (npm) and NOT Keel-enrolled.
# =============================================================================

# No plan-time Vault reads — every secret flows through the ExternalSecret below
# (CLAUDE_CODE_OAUTH_TOKEN / GITHUB_TOKEN / FORGEJO_TOKEN), injected as env at
# runtime. Nothing here needs a secret value at plan time.

# Wildcard TLS secret name — value comes from config.tfvars; consumed by the
# ingress factory (every stack that uses the factory declares this).
variable "tls_secret_name" {}

locals {
  # Single namespace that holds every object of this stack.
  namespace = "t3-afk"

  # Stock node base — the FULL node:24 (not -slim) is buildpack-deps-based, so it
  # ships git + build-essential (python3/make/g++) that node-pty + the agent need.
  # Fully-qualified (docker.io/library/...) to satisfy the Kyverno
  # require-trusted-registries allowlist via `docker.io/*` — bare `node*` is NOT
  # on the bare-DockerHub-library list (alpine*/busybox*/python* are).
  image = "docker.io/library/node:24"

  # npm versions installed at startup (the reproducibility anchor for the pilot
  # until a digest-pinned image exists).
  t3_version = "0.0.27"

  # Version of the @anthropic-ai/claude-code package.
  # NOTE(review): "latest" is a moving dist-tag, not a pin, which is at odds with
  # the "pinned" intent stated above — replace with an exact version once one
  # has been validated.
  claude_cli_version = "latest" # @anthropic-ai/claude-code

  # Shared label set: stamped on the workload objects and used as the selector.
  labels = {
    app = "t3-afk"
  }
}

# --- Namespace ---
resource "kubernetes_namespace" "t3_afk" {
  metadata {
    name = local.namespace
    labels = {
      # local.tiers is not declared in this file's locals — presumably supplied
      # by another file of the stack; verify before moving this resource.
      tier = local.tiers.aux
    }
  }
}

# --- Secrets ---
# The Claude provider authenticates with CLAUDE_CODE_OAUTH_TOKEN (T3 passes the
# environment straight through to the embedded claude-agent-sdk + claude CLI).
# GITHUB_TOKEN / FORGEJO_TOKEN authenticate the agent's `git push` from worktrees
# (wired into ~/.gitconfig insteadOf rewrites in the container command).
resource "kubernetes_manifest" "external_secret" {
  manifest = {
    apiVersion = "external-secrets.io/v1beta1"
    kind       = "ExternalSecret"

    metadata = {
      name      = "t3-afk-secrets"
      namespace = local.namespace
    }

    spec = {
      refreshInterval = "15m"

      secretStoreRef = {
        name = "vault-kv"
        kind = "ClusterSecretStore"
      }

      # Name of the Kubernetes Secret the operator materialises.
      target = {
        name = "t3-afk-secrets"
      }

      # One entry per key of the resulting Secret.
      data = [
        {
          secretKey = "CLAUDE_CODE_OAUTH_TOKEN"
          remoteRef = {
            key      = "claude-agent-service"
            property = "claude_oauth_token"
          }
        },
        {
          secretKey = "GITHUB_TOKEN"
          remoteRef = {
            key      = "viktor"
            property = "github_pat"
          }
        },
        {
          # Shared viktor-scoped admin PAT (also used by Woodpecker + the
          # claude-agent pod). Lets the agent git push / open PRs on Forgejo.
          secretKey = "FORGEJO_TOKEN"
          remoteRef = {
            key      = "ci/global"
            property = "forgejo_push_token"
          }
        },
      ]
    }
  }

  # The namespace is referenced by name only (local.namespace), so the ordering
  # has to be stated explicitly.
  depends_on = [kubernetes_namespace.t3_afk]
}

# issue-implementer behaviour. T3 hardcodes the claude_code system-prompt preset
# (no API override), but loads settingSources [user,project,local] — so the
# agent's standing instructions ride in the USER-level ~/.claude/CLAUDE.md, while
# each target repo's own CLAUDE.md provides project context. ADR 0003.
resource "kubernetes_config_map" "agent_claudemd" {
  metadata {
    name      = "issue-implementer-claudemd"
    namespace = kubernetes_namespace.t3_afk.metadata[0].name
  }

  data = {
    # Rendered from the checked-in file; the deployment mounts this key at
    # /home/node/.claude/CLAUDE.md via sub_path.
    "CLAUDE.md" = file("${path.module}/files/issue-implementer-CLAUDE.md")
  }
}

# --- Storage ---
# SSD-NFS (small-file friendly) for the T3 base dir: state.sqlite + the
# server-signing-key (losing it invalidates every issued bearer), per-thread git
# worktrees, the npm global install, and caches. ADR 0004.
module "data" {
  source     = "../../modules/kubernetes/nfs_volume"
  name       = "t3-afk-data"
  namespace  = kubernetes_namespace.t3_afk.metadata[0].name
  nfs_server = "192.168.1.127"
  nfs_path   = "/srv/nfs-ssd/t3-afk-data"
  storage    = "30Gi"
}

# --- Deployment ---
resource "kubernetes_deployment" "t3_afk" {
  # Slow first start (image pull + npm install init + ESO secret sync) can
  # exceed the default rollout-wait timeout; verify pod readiness out-of-band.
  wait_for_rollout = false

  metadata {
    name      = "t3-afk"
    namespace = kubernetes_namespace.t3_afk.metadata[0].name
    labels    = local.labels

    # keel.sh/policy=never must be a DEPLOYMENT-level annotation — that's where
    # Keel reads it. (A pod-template label is ignored by Keel, which is why the
    # earlier attempt failed.) The cluster's Kyverno inject-keel-annotations
    # policy is opt-OUT: it stamps policy=patch on any workload that doesn't
    # carry its own keel.sh/policy — and Keel then "patch"-downgraded
    # node:24 -> node:24.0.2 (below t3@0.0.27's required node >=24.10), which
    # crash-looped `t3 serve`. ADR 0003 (Keel-excluded).
    annotations = {
      "keel.sh/policy" = "never"
    }
  }

  spec {
    replicas = 1

    # Single-writer state.sqlite — never run two pods against the same base dir.
    strategy {
      type = "Recreate"
    }

    selector {
      match_labels = local.labels
    }

    template {
      metadata {
        labels = local.labels
      }

      spec {
        security_context {
          run_as_user  = 1000 # node
          run_as_group = 1000
          fs_group     = 1000
        }

        # NFS mounts land root-owned; make /data writable by uid 1000.
        init_container {
          name    = "fix-perms"
          image   = "busybox:1.37"
          command = ["sh", "-c", "mkdir -p /data && chown -R 1000:1000 /data && chmod 0775 /data"]

          # Needs root to chown; overrides the pod-level uid 1000 for this step.
          security_context {
            run_as_user = 0
          }

          volume_mount {
            name       = "data"
            mount_path = "/data"
          }

          resources {
            requests = { memory = "32Mi" }
            limits   = { memory = "64Mi" }
          }
        }

        # Install t3 + Claude CLI onto the PVC (cached across restarts). Runs as
        # uid 1000 so the install is owned by the runtime user.
        #
        # The requested versions are recorded in a stamp file next to the
        # install; the install is skipped only when the t3 binary exists AND the
        # stamp matches. Checking for the binary alone would silently ignore a
        # bump of local.t3_version / local.claude_cli_version on an existing PVC.
        init_container {
          name  = "install-t3"
          image = local.image
          command = ["bash", "-c", <<-EOF
            set -e
            export npm_config_cache=/data/npm-cache
            export npm_config_prefix=/data/npm-global
            mkdir -p /data/npm-global /data/npm-cache
            want="t3@${local.t3_version} @anthropic-ai/claude-code@${local.claude_cli_version}"
            stamp=/data/npm-global/.installed-versions
            if [ ! -x /data/npm-global/bin/t3 ] || [ "$(cat "$stamp" 2>/dev/null)" != "$want" ]; then
              echo "installing t3@${local.t3_version} + claude CLI ..."
              npm install -g "t3@${local.t3_version}" "@anthropic-ai/claude-code@${local.claude_cli_version}"
              printf '%s\n' "$want" > "$stamp"
            else
              echo "t3 already installed: $(/data/npm-global/bin/t3 --version 2>/dev/null || echo unknown)"
            fi
          EOF
          ]

          volume_mount {
            name       = "data"
            mount_path = "/data"
          }

          resources {
            requests = { cpu = "200m", memory = "512Mi" }
            limits   = { memory = "1Gi" }
          }
        }

        container {
          name  = "t3"
          image = local.image

          # Configure git auth for the agent's pushes, then run T3 headless.
          #
          # Escaping: Terraform's only dollar escape is `$${`, which emits a
          # literal `${` for the shell to expand. A bare `$$` is passed through
          # unchanged and bash reads it as the shell PID, so env vars must be
          # written in the braced `$${VAR}` form (never `$$VAR`).
          command = ["bash", "-c", <<-EOF
            set -e
            export PATH=/data/npm-global/bin:$${PATH}
            export npm_config_cache=/data/npm-cache
            # git identity + token rewrites so the agent can push from worktrees.
            git config --global user.name "issue-implementer (AFK)"
            git config --global user.email "afk-agent@viktorbarzin.me"
            if [ -n "$${GITHUB_TOKEN}" ]; then
              # Two insteadOf values under ONE url key: the second must be --add,
              # a plain set would replace the first rewrite.
              git config --global --replace-all url."https://$${GITHUB_TOKEN}@github.com/".insteadOf "https://github.com/"
              git config --global --add url."https://$${GITHUB_TOKEN}@github.com/".insteadOf "git@github.com:"
            fi
            if [ -n "$${FORGEJO_TOKEN}" ]; then
              git config --global url."https://$${FORGEJO_TOKEN}@forgejo.viktorbarzin.me/".insteadOf "https://forgejo.viktorbarzin.me/"
            fi
            exec t3 serve --mode web --host 0.0.0.0 --port 3773 --base-dir /data/t3
          EOF
          ]

          port {
            container_port = 3773
          }

          # CLAUDE_CODE_OAUTH_TOKEN / GITHUB_TOKEN / FORGEJO_TOKEN.
          env_from {
            secret_ref {
              name = "t3-afk-secrets"
            }
          }

          env {
            name  = "HOME"
            value = "/home/node"
          }

          env {
            name  = "T3CODE_HOME"
            value = "/data/t3"
          }

          # T3's API needs auth even for liveness; use a TCP probe on the port.
          liveness_probe {
            tcp_socket {
              port = 3773
            }
            initial_delay_seconds = 30
            period_seconds        = 30
          }

          readiness_probe {
            tcp_socket {
              port = 3773
            }
            initial_delay_seconds = 15
            period_seconds        = 10
          }

          volume_mount {
            name       = "data"
            mount_path = "/data"
          }

          # User-level agent instructions (settingSources: user).
          volume_mount {
            name       = "agent-claudemd"
            mount_path = "/home/node/.claude/CLAUDE.md"
            sub_path   = "CLAUDE.md"
          }

          # Burstable (tier-aux). A live agent thread (node + claude) is memory
          # heavy; size for a small number of concurrent threads on this pilot
          # instance. No CPU limit per cluster policy.
          resources {
            requests = {
              cpu    = "1"
              memory = "2Gi"
            }

            # Capped at the tier-aux LimitRange max (4Gi/container). If real
            # workloads OOM, opt the namespace out via the
            # resource-governance/custom-limitrange label (as claude-agent-service
            # does) and raise this.
            limits = {
              memory = "4Gi"
            }
          }
        }

        volume {
          name = "data"
          persistent_volume_claim {
            claim_name = module.data.claim_name
          }
        }

        volume {
          name = "agent-claudemd"
          config_map {
            name = kubernetes_config_map.agent_claudemd.metadata[0].name
          }
        }
      }
    }
  }

  lifecycle {
    ignore_changes = [
      spec[0].template[0].spec[0].dns_config, # KYVERNO_LIFECYCLE_V1
      # Kyverno's inject-keel-annotations stamps pollSchedule/trigger alongside
      # the policy; we own keel.sh/policy=never above, but ignore these two so
      # they don't perpetually drift the plan.
      metadata[0].annotations["keel.sh/pollSchedule"],
      metadata[0].annotations["keel.sh/trigger"],
    ]
  }
}

# --- Service ---
resource "kubernetes_service" "t3_afk" {
  metadata {
    name      = "t3-afk"
    namespace = kubernetes_namespace.t3_afk.metadata[0].name
    labels    = local.labels
  }

  spec {
    selector = local.labels

    port {
      port        = 3773
      target_port = 3773
    }

    type = "ClusterIP"
  }
}

# --- Ingress ---
# The cockpit has no built-in user auth, so Authentik forward-auth is the gate.
module "ingress" {
  source          = "../../modules/kubernetes/ingress_factory"
  auth            = "required"
  dns_type        = "proxied"
  namespace       = kubernetes_namespace.t3_afk.metadata[0].name
  name            = "t3-afk"
  service_name    = kubernetes_service.t3_afk.metadata[0].name
  port            = 3773
  tls_secret_name = var.tls_secret_name
}