All checks were successful
ci/woodpecker/push/default Pipeline was successful
Slice #2 of claude-agent-service PRD #1 (AFK implementation pipeline). Dedicated in-cluster T3 Code instance the control plane dispatches issues into; runs the issue-implementer agent in a git worktree with a live cockpit. Applied + live 2026-06-14 (9 resources). Pilot-fast: stock docker.io/library/node:24 + install pinned t3@0.0.27 + Claude CLI at startup onto an SSD-NFS PVC. Authentik-gated ingress. issue-implementer behaviour ships as a user-level ~/.claude/CLAUDE.md (T3 hardcodes the system prompt; settingSources loads it) and forbids plan-mode/clarifying-questions so unattended threads don't stall. Keel-excluded (ADR 0003). wait_for_rollout=false (slow first start). Image fully-qualified for the Kyverno trusted-registries allowlist; container mem limit 4Gi (tier-aux LimitRange cap). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
348 lines
11 KiB
HCL
348 lines
11 KiB
HCL
# =============================================================================
# t3-afk — dedicated, in-cluster T3 Code instance: the EXECUTOR + COCKPIT for the
# AFK implementation pipeline (slice #2 of claude-agent-service PRD #1).
#
# claude-agent-service (control plane) dispatches issues INTO this T3 instance
# over its orchestration HTTP API; T3 runs the issue-implementer agent in a git
# worktree and shows every worker in its cockpit. See:
#   claude-agent-service/docs/2026-06-14-afk-implementation-pipeline-design.md
#   claude-agent-service/docs/adr/0003-t3-thin-executor-and-cockpit.md
#
# PILOT SHORTCUT (chosen 2026-06-14): no custom-built image. We run stock
# `node:24` (the full image ships git + python3/make/g++ for node-pty) and an
# init container installs npm packages (t3 pinned at 0.0.27; the Claude CLI
# currently tracks the `latest` dist-tag) onto the SSD PVC, cached across
# restarts. Formalize a digest-pinned built image post-GO. T3 is version-pinned
# (npm) and NOT Keel-enrolled.
# =============================================================================
|
|
|
|
# No plan-time Vault reads — every secret flows through the ExternalSecret below
# (CLAUDE_CODE_OAUTH_TOKEN / GITHUB_TOKEN / FORGEJO_TOKEN), injected as env at
# runtime. Nothing here needs a secret value at plan time.
|
|
|
|
# Wildcard TLS secret name — value comes from config.tfvars; consumed by the
# ingress factory (every stack that uses the factory declares this).
|
|
variable "tls_secret_name" {
  # Passed straight through to the ingress factory module as its
  # `tls_secret_name` argument; the value is supplied by config.tfvars.
  description = "Name of the wildcard TLS secret handed to the ingress factory."
  type        = string
}
|
|
|
|
locals {
  namespace = "t3-afk"

  # Selector / metadata labels shared by the Deployment and the Service.
  labels = {
    app = "t3-afk"
  }

  # Base image for both the install init container and the runtime container.
  # The FULL node:24 (not -slim) is buildpack-deps-based, so it carries git and
  # build-essential (python3/make/g++), which node-pty and the agent need.
  # The reference is fully-qualified (docker.io/library/...) so it matches the
  # Kyverno require-trusted-registries allowlist through `docker.io/*`; a bare
  # `node*` is NOT on the bare-DockerHub-library list (alpine*/busybox*/python*
  # are).
  image = "docker.io/library/node:24"

  # npm package versions installed at startup — the reproducibility anchor for
  # the pilot until a digest-pinned image exists.
  t3_version = "0.0.27"

  # NOTE(review): `latest` is a floating npm dist-tag, so the Claude CLI is
  # resolved at install time rather than pinned — TODO pin an exact
  # @anthropic-ai/claude-code version to match the intent above.
  claude_cli_version = "latest" # @anthropic-ai/claude-code
}
|
|
|
|
# --- Namespace ---
|
|
|
|
resource "kubernetes_namespace" "t3_afk" {
  metadata {
    # `local.tiers` is not declared in this part of the file; presumably it is
    # provided elsewhere in the stack — verify before renaming.
    labels = { tier = local.tiers.aux }
    name   = local.namespace
  }
}
|
|
|
|
# --- Secrets ---
# The Claude provider authenticates with CLAUDE_CODE_OAUTH_TOKEN (T3 passes the
# environment straight through to the embedded claude-agent-sdk + claude CLI).
# GITHUB_TOKEN / FORGEJO_TOKEN authenticate the agent's `git push` from worktrees
# (wired into ~/.gitconfig insteadOf rewrites in the container command).
|
|
|
|
resource "kubernetes_manifest" "external_secret" {
  # The namespace must exist before the ExternalSecret can be created in it.
  depends_on = [kubernetes_namespace.t3_afk]

  manifest = {
    apiVersion = "external-secrets.io/v1beta1"
    kind       = "ExternalSecret"

    metadata = {
      namespace = local.namespace
      name      = "t3-afk-secrets"
    }

    spec = {
      refreshInterval = "15m"

      secretStoreRef = {
        kind = "ClusterSecretStore"
        name = "vault-kv"
      }

      # Name of the Secret that is produced; the Deployment's env_from refers
      # to this same name.
      target = {
        name = "t3-afk-secrets"
      }

      # One entry per env var: secretKey is the key in the produced Secret,
      # remoteRef says which remote key/property the value is read from.
      data = [
        {
          secretKey = "CLAUDE_CODE_OAUTH_TOKEN"
          remoteRef = {
            key      = "claude-agent-service"
            property = "claude_oauth_token"
          }
        },
        {
          secretKey = "GITHUB_TOKEN"
          remoteRef = {
            key      = "viktor"
            property = "github_pat"
          }
        },
        {
          # Shared viktor-scoped admin PAT (also used by Woodpecker + the
          # claude-agent pod). Lets the agent git push / open PRs on Forgejo.
          secretKey = "FORGEJO_TOKEN"
          remoteRef = {
            key      = "ci/global"
            property = "forgejo_push_token"
          }
        },
      ]
    }
  }
}
|
|
|
|
# issue-implementer behaviour. T3 hardcodes the claude_code system-prompt preset
# (no API override), but loads settingSources [user,project,local] — so the
# agent's standing instructions ride in the USER-level ~/.claude/CLAUDE.md, while
# each target repo's own CLAUDE.md provides project context. ADR 0003.
|
|
resource "kubernetes_config_map" "agent_claudemd" {
  metadata {
    namespace = kubernetes_namespace.t3_afk.metadata[0].name
    name      = "issue-implementer-claudemd"
  }

  # Single key, read from a file shipped next to this module. The Deployment
  # mounts exactly this key (sub_path "CLAUDE.md") at /home/node/.claude/CLAUDE.md.
  data = { "CLAUDE.md" = file("${path.module}/files/issue-implementer-CLAUDE.md") }
}
|
|
|
|
# --- Storage ---
# SSD-NFS (small-file friendly) for the T3 base dir: state.sqlite + the
# server-signing-key (losing it invalidates every issued bearer), per-thread git
# worktrees, the npm global install, and caches. ADR 0004.
|
|
module "data" {
  source = "../../modules/kubernetes/nfs_volume"

  # Claim identity — the Deployment's "data" volume uses module.data.claim_name.
  namespace = kubernetes_namespace.t3_afk.metadata[0].name
  name      = "t3-afk-data"
  storage   = "30Gi"

  # Backing NFS export.
  nfs_server = "192.168.1.127"
  nfs_path   = "/srv/nfs-ssd/t3-afk-data"
}
|
|
|
|
# --- Deployment ---
|
|
|
|
resource "kubernetes_deployment" "t3_afk" {
  # Slow first start (image pull + npm install init + ESO secret sync) can
  # exceed the default rollout-wait timeout; verify pod readiness out-of-band.
  wait_for_rollout = false

  metadata {
    name      = "t3-afk"
    namespace = kubernetes_namespace.t3_afk.metadata[0].name
    labels    = local.labels
  }

  spec {
    replicas = 1

    # Single-writer state.sqlite — never run two pods against the same base dir.
    strategy {
      type = "Recreate"
    }

    selector {
      match_labels = local.labels
    }

    template {
      metadata {
        labels = merge(local.labels, {
          # Belt-and-braces: this namespace isn't Keel-enrolled, but pin the
          # churny pre-1.0 T3 explicitly out of any auto-upgrade. ADR 0003.
          "keel.sh/policy" = "never"
        })
      }

      spec {
        security_context {
          run_as_user  = 1000 # node
          run_as_group = 1000
          fs_group     = 1000
        }

        # NFS mounts land root-owned; make /data writable by uid 1000.
        # Runs as root (overrides the pod-level run_as_user) so chown works.
        init_container {
          name    = "fix-perms"
          image   = "busybox:1.37"
          command = ["sh", "-c", "mkdir -p /data && chown -R 1000:1000 /data && chmod 0775 /data"]

          security_context {
            run_as_user = 0
          }

          volume_mount {
            name       = "data"
            mount_path = "/data"
          }

          resources {
            requests = { memory = "32Mi" }
            limits   = { memory = "64Mi" }
          }
        }

        # Install t3 + Claude CLI onto the PVC (cached across restarts). Runs as
        # uid 1000 so the install is owned by the runtime user.
        #
        # The install is skipped only when the t3 binary exists AND the version
        # stamp on the PVC matches the versions requested here. Checking the
        # binary alone would mean that bumping local.t3_version or
        # local.claude_cli_version never takes effect once the PVC is populated.
        # A PVC populated before the stamp existed is reinstalled once.
        init_container {
          name  = "install-t3"
          image = local.image

          command = ["bash", "-c", <<-EOF
            set -e
            export npm_config_cache=/data/npm-cache
            export npm_config_prefix=/data/npm-global
            mkdir -p /data/npm-global /data/npm-cache

            # Requested package set, compared against what was last installed.
            want="t3@${local.t3_version} @anthropic-ai/claude-code@${local.claude_cli_version}"
            stamp=/data/npm-global/.installed-versions
            have="$(cat "$stamp" 2>/dev/null || true)"

            if [ ! -x /data/npm-global/bin/t3 ] || [ "$have" != "$want" ]; then
              echo "installing t3@${local.t3_version} + claude CLI ..."
              npm install -g "t3@${local.t3_version}" "@anthropic-ai/claude-code@${local.claude_cli_version}"
              printf '%s\n' "$want" > "$stamp"
            else
              echo "t3 already installed: $(/data/npm-global/bin/t3 --version 2>/dev/null || echo unknown)"
            fi
          EOF
          ]

          volume_mount {
            name       = "data"
            mount_path = "/data"
          }

          resources {
            requests = { cpu = "200m", memory = "512Mi" }
            limits   = { memory = "1Gi" }
          }
        }

        container {
          name  = "t3"
          image = local.image

          # Configure git auth for the agent's pushes, then run T3 headless.
          #
          # Escaping rules inside the heredoc: `$${VAR}` is Terraform's escape
          # for a literal `${VAR}`, so the shell (not Terraform) expands those
          # env vars. A bare `$VAR` is not a Terraform interpolation and needs
          # no escape — writing `$$VAR` would pass through unchanged and bash
          # would expand `$$` to its own PID, which is why PATH uses a single `$`.
          command = ["bash", "-c", <<-EOF
            set -e
            export PATH=/data/npm-global/bin:$PATH
            export npm_config_cache=/data/npm-cache

            # git identity + token rewrites so the agent can push from worktrees.
            git config --global user.name "issue-implementer (AFK)"
            git config --global user.email "afk-agent@viktorbarzin.me"
            git config --global url."https://$${GITHUB_TOKEN}@github.com/".insteadOf "https://github.com/"
            git config --global url."https://$${GITHUB_TOKEN}@github.com/".insteadOf "git@github.com:"
            if [ -n "$${FORGEJO_TOKEN}" ]; then
              git config --global url."https://$${FORGEJO_TOKEN}@forgejo.viktorbarzin.me/".insteadOf "https://forgejo.viktorbarzin.me/"
            fi

            exec t3 serve --mode web --host 0.0.0.0 --port 3773 --base-dir /data/t3
          EOF
          ]

          port {
            container_port = 3773
          }

          # CLAUDE_CODE_OAUTH_TOKEN / GITHUB_TOKEN / FORGEJO_TOKEN from the
          # Secret named as the ExternalSecret's target.
          env_from {
            secret_ref {
              name = "t3-afk-secrets"
            }
          }

          env {
            name  = "HOME"
            value = "/home/node"
          }
          env {
            name  = "T3CODE_HOME"
            value = "/data/t3"
          }

          # T3's API needs auth even for liveness; use a TCP probe on the port.
          liveness_probe {
            tcp_socket {
              port = 3773
            }
            initial_delay_seconds = 30
            period_seconds        = 30
          }
          readiness_probe {
            tcp_socket {
              port = 3773
            }
            initial_delay_seconds = 15
            period_seconds        = 10
          }

          volume_mount {
            name       = "data"
            mount_path = "/data"
          }
          # User-level agent instructions (settingSources: user).
          volume_mount {
            name       = "agent-claudemd"
            mount_path = "/home/node/.claude/CLAUDE.md"
            sub_path   = "CLAUDE.md"
          }

          # Burstable (tier-aux). A live agent thread (node + claude) is memory
          # heavy; size for a small number of concurrent threads on this pilot
          # instance. No CPU limit per cluster policy.
          resources {
            requests = {
              cpu    = "1"
              memory = "2Gi"
            }
            # Capped at the tier-aux LimitRange max (4Gi/container). If real
            # workloads OOM, opt the namespace out via the
            # resource-governance/custom-limitrange label (as claude-agent-service
            # does) and raise this.
            limits = {
              memory = "4Gi"
            }
          }
        }

        volume {
          name = "data"
          persistent_volume_claim {
            claim_name = module.data.claim_name
          }
        }

        volume {
          name = "agent-claudemd"
          config_map {
            name = kubernetes_config_map.agent_claudemd.metadata[0].name
          }
        }
      }
    }
  }

  lifecycle {
    ignore_changes = [spec[0].template[0].spec[0].dns_config] # KYVERNO_LIFECYCLE_V1
  }
}
|
|
|
|
# --- Service ---
|
|
|
|
resource "kubernetes_service" "t3_afk" {
  metadata {
    namespace = kubernetes_namespace.t3_afk.metadata[0].name
    name      = "t3-afk"
    labels    = local.labels
  }

  spec {
    type = "ClusterIP"

    # Same label set the Deployment uses as its selector.
    selector = local.labels

    # Service port 3773 forwards to the container port of the same number.
    port {
      target_port = 3773
      port        = 3773
    }
  }
}
|
|
|
|
# --- Ingress ---
# The cockpit has no built-in user auth, so Authentik forward-auth is the gate.
|
|
module "ingress" {
  source = "../../modules/kubernetes/ingress_factory"

  # What is exposed: the t3-afk Service on its single port.
  name         = "t3-afk"
  namespace    = kubernetes_namespace.t3_afk.metadata[0].name
  service_name = kubernetes_service.t3_afk.metadata[0].name
  port         = 3773

  # How it is exposed: authentication required, proxied DNS, wildcard TLS secret.
  auth            = "required"
  dns_type        = "proxied"
  tls_secret_name = var.tls_secret_name
}
|