infra/stacks/t3-afk/main.tf
Viktor Barzin d8c60d7ab8
All checks were successful
ci/woodpecker/push/default Pipeline was successful
t3-afk: dedicated in-cluster T3 Code instance (AFK executor + cockpit)
Slice #2 of claude-agent-service PRD #1 (AFK implementation pipeline). Dedicated
in-cluster T3 Code instance the control plane dispatches issues into; runs the
issue-implementer agent in a git worktree with a live cockpit. Applied + live
2026-06-14 (9 resources).

Pilot-fast: stock docker.io/library/node:24 + install pinned t3@0.0.27 + Claude
CLI at startup onto an SSD-NFS PVC. Authentik-gated ingress. issue-implementer
behaviour ships as a user-level ~/.claude/CLAUDE.md (T3 hardcodes the system
prompt; settingSources loads it) and forbids plan-mode/clarifying-questions so
unattended threads don't stall. Keel-excluded (ADR 0003). wait_for_rollout=false
(slow first start). Image fully-qualified for the Kyverno trusted-registries
allowlist; container mem limit 4Gi (tier-aux LimitRange cap).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-14 20:06:33 +00:00

348 lines
11 KiB
HCL

# =============================================================================
# t3-afk — dedicated, in-cluster T3 Code instance: the EXECUTOR + COCKPIT for the
# AFK implementation pipeline (slice #2 of claude-agent-service PRD #1).
#
# claude-agent-service (control plane) dispatches issues INTO this T3 instance
# over its orchestration HTTP API; T3 runs the issue-implementer agent in a git
# worktree and shows every worker in its cockpit. See:
# claude-agent-service/docs/2026-06-14-afk-implementation-pipeline-design.md
# claude-agent-service/docs/adr/0003-t3-thin-executor-and-cockpit.md
#
# PILOT SHORTCUT (chosen 2026-06-14): no custom-built image. We run stock
# `node:24` (the full image ships git + python3/make/g++ for node-pty) and an
# init container installs PINNED npm packages (t3@0.0.27 + the Claude CLI) onto
# the SSD PVC, cached across restarts. Formalize a digest-pinned built image
# post-GO. T3 is version-pinned (npm) and NOT Keel-enrolled.
# =============================================================================
# No plan-time Vault reads — every secret flows through the ExternalSecret below
# (CLAUDE_CODE_OAUTH_TOKEN / GITHUB_TOKEN / FORGEJO_TOKEN), injected as env at
# runtime. Nothing here needs a secret value at plan time.
# Wildcard TLS secret name — value comes from config.tfvars; consumed by the
# ingress factory (every stack that uses the factory declares this).
variable "tls_secret_name" {
  description = "Name of the wildcard TLS secret handed to the ingress factory module; supplied via config.tfvars."
  type        = string
}
locals {
  # Namespace that holds every resource of this stack.
  namespace = "t3-afk"

  # Label set shared by the Deployment, its pod template and the Service
  # selector.
  labels = {
    app = "t3-afk"
  }

  # Stock node base image, used by both the install init container and the main
  # container. The FULL node:24 (not -slim) is buildpack-deps-based, so it ships
  # git + build-essential (python3/make/g++) that node-pty + the agent need.
  # Kept fully-qualified (docker.io/library/...) so it matches the Kyverno
  # require-trusted-registries allowlist via `docker.io/*` — bare `node*` is NOT
  # on the bare-DockerHub-library list (alpine*/busybox*/python* are).
  image = "docker.io/library/node:24"

  # npm package versions installed at startup (the reproducibility anchor for
  # the pilot until a digest-pinned image exists).
  # NOTE(review): only t3 is pinned to an exact version. "latest" is a floating
  # dist-tag, so the Claude CLI version depends on when the install ran —
  # replace with an exact version to make the install reproducible.
  t3_version         = "0.0.27"
  claude_cli_version = "latest" # @anthropic-ai/claude-code
}
# --- Namespace ---
resource "kubernetes_namespace" "t3_afk" {
  metadata {
    name = local.namespace

    # Tier label; the value comes from local.tiers.aux, which is declared
    # outside this file's visible locals block.
    labels = {
      tier = local.tiers.aux
    }
  }
}
# --- Secrets ---
# The Claude provider authenticates with CLAUDE_CODE_OAUTH_TOKEN (T3 passes the
# environment straight through to the embedded claude-agent-sdk + claude CLI).
# GITHUB_TOKEN / FORGEJO_TOKEN authenticate the agent's `git push` from worktrees
# (wired into ~/.gitconfig insteadOf rewrites in the container command).
resource "kubernetes_manifest" "external_secret" {
  # The manifest references the namespace by name only (local.namespace), so the
  # ordering against the namespace resource has to be declared explicitly.
  depends_on = [kubernetes_namespace.t3_afk]

  manifest = {
    apiVersion = "external-secrets.io/v1beta1"
    kind       = "ExternalSecret"

    metadata = {
      name      = "t3-afk-secrets"
      namespace = local.namespace
    }

    spec = {
      refreshInterval = "15m"

      secretStoreRef = {
        kind = "ClusterSecretStore"
        name = "vault-kv"
      }

      # Name of the Secret the container consumes through env_from.
      target = {
        name = "t3-afk-secrets"
      }

      # One entry per environment variable exposed to the T3 container.
      data = [
        {
          secretKey = "CLAUDE_CODE_OAUTH_TOKEN"
          remoteRef = {
            key      = "claude-agent-service"
            property = "claude_oauth_token"
          }
        },
        {
          secretKey = "GITHUB_TOKEN"
          remoteRef = {
            key      = "viktor"
            property = "github_pat"
          }
        },
        {
          # Shared viktor-scoped admin PAT (also used by Woodpecker + the
          # claude-agent pod). Lets the agent git push / open PRs on Forgejo.
          secretKey = "FORGEJO_TOKEN"
          remoteRef = {
            key      = "ci/global"
            property = "forgejo_push_token"
          }
        },
      ]
    }
  }
}
# issue-implementer behaviour. T3 hardcodes the claude_code system-prompt preset
# (no API override), but loads settingSources [user,project,local] — so the
# agent's standing instructions ride in the USER-level ~/.claude/CLAUDE.md, while
# each target repo's own CLAUDE.md provides project context. ADR 0003.
resource "kubernetes_config_map" "agent_claudemd" {
  metadata {
    namespace = kubernetes_namespace.t3_afk.metadata[0].name
    name      = "issue-implementer-claudemd"
  }

  # Single key "CLAUDE.md", read at plan time from this module's files/
  # directory; the Deployment mounts it via sub_path at
  # /home/node/.claude/CLAUDE.md.
  data = {
    "CLAUDE.md" = file("${path.module}/files/issue-implementer-CLAUDE.md")
  }
}
# --- Storage ---
# SSD-NFS (small-file friendly) for the T3 base dir: state.sqlite + the
# server-signing-key (losing it invalidates every issued bearer), per-thread git
# worktrees, the npm global install, and caches. ADR 0004.
module "data" {
  source = "../../modules/kubernetes/nfs_volume"

  # Claim identity; the Deployment consumes module.data.claim_name.
  namespace = kubernetes_namespace.t3_afk.metadata[0].name
  name      = "t3-afk-data"
  storage   = "30Gi"

  # NFS export backing the volume.
  nfs_server = "192.168.1.127"
  nfs_path   = "/srv/nfs-ssd/t3-afk-data"
}
# --- Deployment ---
resource "kubernetes_deployment" "t3_afk" {
  # Slow first start (image pull + npm install init + ESO secret sync) can
  # exceed the default rollout-wait timeout; verify pod readiness out-of-band.
  wait_for_rollout = false

  metadata {
    name      = "t3-afk"
    namespace = kubernetes_namespace.t3_afk.metadata[0].name
    labels    = local.labels
  }

  spec {
    replicas = 1

    # Single-writer state.sqlite — never run two pods against the same base dir.
    strategy {
      type = "Recreate"
    }

    selector {
      match_labels = local.labels
    }

    template {
      metadata {
        labels = merge(local.labels, {
          # Belt-and-braces: this namespace isn't Keel-enrolled, but pin the
          # churny pre-1.0 T3 explicitly out of any auto-upgrade. ADR 0003.
          "keel.sh/policy" = "never"
        })
      }

      spec {
        security_context {
          run_as_user  = 1000 # node
          run_as_group = 1000
          fs_group     = 1000
        }

        # NFS mounts land root-owned; make /data writable by uid 1000.
        init_container {
          name    = "fix-perms"
          image   = "busybox:1.37"
          command = ["sh", "-c", "mkdir -p /data && chown -R 1000:1000 /data && chmod 0775 /data"]

          security_context {
            run_as_user = 0
          }

          volume_mount {
            name       = "data"
            mount_path = "/data"
          }

          resources {
            requests = { memory = "32Mi" }
            limits   = { memory = "64Mi" }
          }
        }

        # Install t3 + Claude CLI onto the PVC. The install is cached across
        # restarts, but a stamp file records which version spec was installed
        # so that changing local.t3_version / local.claude_cli_version triggers
        # a reinstall instead of being silently skipped because a t3 binary
        # already exists. (An existing install without a stamp is reinstalled
        # once.) Runs as uid 1000 so the install is owned by the runtime user.
        # Shell `$var` / `$(...)` need no Terraform escaping; only `${` does.
        init_container {
          name  = "install-t3"
          image = local.image
          command = ["bash", "-c", <<-EOF
            set -e
            export npm_config_cache=/data/npm-cache
            export npm_config_prefix=/data/npm-global
            mkdir -p /data/npm-global /data/npm-cache
            want="t3@${local.t3_version} @anthropic-ai/claude-code@${local.claude_cli_version}"
            stamp=/data/npm-global/.installed-versions
            if [ ! -x /data/npm-global/bin/t3 ] || [ "$(cat "$stamp" 2>/dev/null)" != "$want" ]; then
              echo "installing t3@${local.t3_version} + claude CLI ..."
              npm install -g "t3@${local.t3_version}" "@anthropic-ai/claude-code@${local.claude_cli_version}"
              printf '%s' "$want" > "$stamp"
            else
              echo "t3 already installed: $(/data/npm-global/bin/t3 --version 2>/dev/null || echo unknown)"
            fi
            EOF
          ]

          volume_mount {
            name       = "data"
            mount_path = "/data"
          }

          resources {
            requests = { cpu = "200m", memory = "512Mi" }
            limits   = { memory = "1Gi" }
          }
        }

        container {
          name  = "t3"
          image = local.image

          # Configure git auth for the agent's pushes, then run T3 headless.
          # Terraform escaping: only `$${` is an escape (it yields a literal
          # `${` for the shell). A bare `$VAR` is passed through untouched, so
          # PATH must be written `$PATH` — a doubled `$$` would reach bash
          # as-is and expand to the shell's PID, dropping the system PATH.
          command = ["bash", "-c", <<-EOF
            set -e
            export PATH=/data/npm-global/bin:$PATH
            export npm_config_cache=/data/npm-cache
            # git identity + token rewrites so the agent can push from worktrees.
            git config --global user.name "issue-implementer (AFK)"
            git config --global user.email "afk-agent@viktorbarzin.me"
            # Two rewrites share one config key; the second needs --add, otherwise
            # it replaces the first and https://github.com/ URLs lose their token.
            git config --global url."https://$${GITHUB_TOKEN}@github.com/".insteadOf "https://github.com/"
            git config --global --add url."https://$${GITHUB_TOKEN}@github.com/".insteadOf "git@github.com:"
            if [ -n "$${FORGEJO_TOKEN}" ]; then
              git config --global url."https://$${FORGEJO_TOKEN}@forgejo.viktorbarzin.me/".insteadOf "https://forgejo.viktorbarzin.me/"
            fi
            exec t3 serve --mode web --host 0.0.0.0 --port 3773 --base-dir /data/t3
            EOF
          ]

          port {
            container_port = 3773
          }

          # CLAUDE_CODE_OAUTH_TOKEN / GITHUB_TOKEN / FORGEJO_TOKEN from the
          # ExternalSecret-managed Secret.
          env_from {
            secret_ref {
              name = "t3-afk-secrets"
            }
          }

          env {
            name  = "HOME"
            value = "/home/node"
          }

          env {
            name  = "T3CODE_HOME"
            value = "/data/t3"
          }

          # T3's API needs auth even for liveness; use a TCP probe on the port.
          liveness_probe {
            tcp_socket {
              port = 3773
            }
            initial_delay_seconds = 30
            period_seconds        = 30
          }

          readiness_probe {
            tcp_socket {
              port = 3773
            }
            initial_delay_seconds = 15
            period_seconds        = 10
          }

          volume_mount {
            name       = "data"
            mount_path = "/data"
          }

          # User-level agent instructions (settingSources: user).
          volume_mount {
            name       = "agent-claudemd"
            mount_path = "/home/node/.claude/CLAUDE.md"
            sub_path   = "CLAUDE.md"
          }

          # Burstable (tier-aux). A live agent thread (node + claude) is memory
          # heavy; size for a small number of concurrent threads on this pilot
          # instance. No CPU limit per cluster policy.
          resources {
            requests = {
              cpu    = "1"
              memory = "2Gi"
            }

            # Capped at the tier-aux LimitRange max (4Gi/container). If real
            # workloads OOM, opt the namespace out via the
            # resource-governance/custom-limitrange label (as claude-agent-service
            # does) and raise this.
            limits = {
              memory = "4Gi"
            }
          }
        }

        volume {
          name = "data"
          persistent_volume_claim {
            claim_name = module.data.claim_name
          }
        }

        volume {
          name = "agent-claudemd"
          config_map {
            name = kubernetes_config_map.agent_claudemd.metadata[0].name
          }
        }
      }
    }
  }

  lifecycle {
    ignore_changes = [spec[0].template[0].spec[0].dns_config] # KYVERNO_LIFECYCLE_V1
  }
}
# --- Service ---
resource "kubernetes_service" "t3_afk" {
  metadata {
    namespace = kubernetes_namespace.t3_afk.metadata[0].name
    name      = "t3-afk"
    labels    = local.labels
  }

  spec {
    # Cluster-internal only; the ingress module is pointed at this Service.
    type = "ClusterIP"

    # Same label set the Deployment uses for its selector.
    selector = local.labels

    # Service port 3773 forwards to the container port of the same number.
    port {
      port        = 3773
      target_port = 3773
    }
  }
}
# --- Ingress ---
# The cockpit has no built-in user auth, so Authentik forward-auth is the gate.
module "ingress" {
  source = "../../modules/kubernetes/ingress_factory"

  # Identity and backend: routes to the t3-afk Service on its service port.
  name         = "t3-afk"
  namespace    = kubernetes_namespace.t3_afk.metadata[0].name
  service_name = kubernetes_service.t3_afk.metadata[0].name
  port         = 3773

  # Exposure settings passed through to the factory.
  auth            = "required"
  dns_type        = "proxied"
  tls_secret_name = var.tls_secret_name
}