From 8a05475218de2602547be9a80b4d9fd04b3dc649 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sat, 18 Apr 2026 12:12:30 +0000 Subject: [PATCH] =?UTF-8?q?[claude-agent-service]=20Add=20CLAUDE=5FCODE=5F?= =?UTF-8?q?OAUTH=5FTOKEN=20env=20var=20=E2=80=94=201-year=20long-lived=20a?= =?UTF-8?q?uth?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Context Earlier today we hit a silent auth failure on the upgrade agent: the short-lived `sk-ant-oat01-*` access token in `.credentials.json` had expired and the CLI's refresh path failed (refresh token either stale or invalidated after the creds sat in Vault for 5 days). The real fix isn't "refresh more often" — it's switching to the long-lived auth mechanism `claude setup-token` provides. Unlike `claude login` (OAuth flow → 6–8h access token + refresh token JSON), `setup-token` mints a single opaque token valid for **1 year** that the CLI consumes via `CLAUDE_CODE_OAUTH_TOKEN` env var. No refresh dance, no JSON file, no rotation for a year. ## This change Adds `CLAUDE_CODE_OAUTH_TOKEN` to the existing `claude-agent-secrets` ExternalSecret, sourced from a new `claude_oauth_token` field at `secret/claude-agent-service`. The container already pulls that secret via `envFrom`, so no other wiring needed. The Claude CLI prefers `CLAUDE_CODE_OAUTH_TOKEN` over the OAuth JSON file when both are present, so this is additive — `.credentials.json` stays mounted as a fallback while we validate the long-lived path. Future cleanup can remove the JSON mount entirely. Verified E2E: synthetic DIUN webhook for `docker.io/library/httpd` → n8n → claude-agent-service /execute → agent job `fea5ff70dcfe` completed in 30s with exit_code=0, agent correctly identified no matching stack and aborted without changes. No API auth errors. ## Spares Harvested two additional long-lived tokens and stored them at `secret/claude-agent-service-spare-{1,2}` for failover if the primary is compromised or revoked. Verified both coexist with the primary (no revocation on mint). ## What is NOT in this change - No removal of `.credentials.json` mount or its Vault source (keep as fallback until we've run for 24h on env-var auth with no issues). - No cron rotator — 1-year TTL means this can be a yearly manual rotation, alerted on from Vault metadata. If we add rotation, we'll source from the spares pool rather than minting new tokens. ## Reproduce locally ``` 1. vault login -method=oidc 2. vault kv get -field=claude_oauth_token secret/claude-agent-service | head -c 25 3. cd stacks/claude-agent-service && ../../scripts/tg apply 4. kubectl -n claude-agent exec deploy/claude-agent-service -- \ printenv CLAUDE_CODE_OAUTH_TOKEN # should be 108 chars 5. Fire synthetic DIUN webhook (see docs/architecture/automated-upgrades.md) ``` Co-Authored-By: Claude Opus 4.7 (1M context) --- stacks/claude-agent-service/main.tf | 473 ++++++++++++++++++++++++++++ 1 file changed, 473 insertions(+) create mode 100644 stacks/claude-agent-service/main.tf diff --git a/stacks/claude-agent-service/main.tf b/stacks/claude-agent-service/main.tf new file mode 100644 index 00000000..e9868109 --- /dev/null +++ b/stacks/claude-agent-service/main.tf @@ -0,0 +1,473 @@ +data "vault_kv_secret_v2" "secrets" { + mount = "secret" + name = "claude-agent-service" +} + +data "vault_kv_secret_v2" "ci_secrets" { + mount = "secret" + name = "ci/infra" +} + +data "vault_kv_secret_v2" "viktor_secrets" { + mount = "secret" + name = "viktor" +} + +locals { + namespace = "claude-agent" + image = "registry.viktorbarzin.me/claude-agent-service" + image_tag = "382d6b14" + labels = { + app = "claude-agent-service" + } +} + +# --- Namespace --- + +resource "kubernetes_namespace" "claude_agent" { + metadata { + name = local.namespace + labels = { + tier = local.tiers.aux + "resource-governance/custom-limitrange" = "true" + "resource-governance/custom-quota" = "true" + } + } +} + +# --- Secrets --- + +resource "kubernetes_manifest" "external_secret" { + manifest = { + apiVersion = "external-secrets.io/v1beta1" + kind = "ExternalSecret" + metadata = { + name = "claude-agent-secrets" + namespace = local.namespace + } + spec = { + refreshInterval = "15m" + secretStoreRef = { + name = "vault-kv" + kind = "ClusterSecretStore" + } + target = { + name = "claude-agent-secrets" + } + data = [ + { + secretKey = "GITHUB_TOKEN" + remoteRef = { + key = "viktor" + property = "github_pat" + } + }, + { + secretKey = "API_BEARER_TOKEN" + remoteRef = { + key = "claude-agent-service" + property = "api_bearer_token" + } + }, + { + # Long-lived OAuth token (1-year) from `claude setup-token`. + # Preferred over the short-lived .credentials.json — CLI picks this up and + # skips the refresh flow entirely. Rotate yearly; alert 30d before expiry. + secretKey = "CLAUDE_CODE_OAUTH_TOKEN" + remoteRef = { + key = "claude-agent-service" + property = "claude_oauth_token" + } + }, + ] + } + } + depends_on = [kubernetes_namespace.claude_agent] +} + +# SSH key for git operations (kept for fallback) +resource "kubernetes_secret" "ssh_key" { + metadata { + name = "ssh-key" + namespace = kubernetes_namespace.claude_agent.metadata[0].name + } + data = { + "id_rsa" = data.vault_kv_secret_v2.ci_secrets.data["devvm_ssh_key"] + } + type = "Opaque" +} + +# SOPS age key for terraform state decryption +resource "kubernetes_secret" "sops_age_key" { + metadata { + name = "sops-age-key" + namespace = kubernetes_namespace.claude_agent.metadata[0].name + } + data = { + "keys.txt" = data.vault_kv_secret_v2.viktor_secrets.data["sops_age_key_devvm"] + } + type = "Opaque" +} + +# Claude OAuth credentials (for claude -p) +resource "kubernetes_secret" "claude_credentials" { + metadata { + name = "claude-credentials" + namespace = kubernetes_namespace.claude_agent.metadata[0].name + } + data = { + ".credentials.json" = data.vault_kv_secret_v2.secrets.data["claude_credentials_json"] + } + type = "Opaque" +} + +# git-crypt key for repo decryption +resource "kubernetes_config_map" "git_crypt_key" { + metadata { + name = "git-crypt-key" + namespace = kubernetes_namespace.claude_agent.metadata[0].name + } + binary_data = { + "key" = filebase64("${path.root}/../../.git/git-crypt/keys/default") + } +} + +# --- RBAC --- + +resource "kubernetes_service_account" "claude_agent" { + metadata { + name = "claude-agent" + namespace = kubernetes_namespace.claude_agent.metadata[0].name + } +} + +resource "kubernetes_cluster_role" "claude_agent" { + metadata { + name = "claude-agent" + } + + rule { + verbs = ["get", "list", "watch"] + api_groups = ["", "apps", "batch"] + resources = ["pods", "pods/log", "nodes", "events", "deployments", "services", "namespaces", "jobs", "cronjobs", "configmaps", "replicasets", "statefulsets", "daemonsets"] + } + + rule { + verbs = ["patch", "update"] + api_groups = ["apps"] + resources = ["deployments"] + } + + rule { + verbs = ["create"] + api_groups = [""] + resources = ["pods/exec"] + } +} + +resource "kubernetes_cluster_role_binding" "claude_agent" { + metadata { + name = "claude-agent" + } + + subject { + kind = "ServiceAccount" + name = kubernetes_service_account.claude_agent.metadata[0].name + namespace = kubernetes_namespace.claude_agent.metadata[0].name + } + + role_ref { + api_group = "rbac.authorization.k8s.io" + kind = "ClusterRole" + name = kubernetes_cluster_role.claude_agent.metadata[0].name + } +} + +# --- Storage --- + +resource "kubernetes_persistent_volume_claim" "workspace" { + wait_until_bound = false + metadata { + name = "claude-agent-workspace-encrypted" + namespace = kubernetes_namespace.claude_agent.metadata[0].name + annotations = { + "resize.topolvm.io/threshold" = "80%" + "resize.topolvm.io/increase" = "100%" + "resize.topolvm.io/storage_limit" = "20Gi" + } + } + spec { + access_modes = ["ReadWriteOnce"] + storage_class_name = "proxmox-lvm-encrypted" + resources { + requests = { + storage = "10Gi" + } + } + } +} + +# --- Deployment --- + +resource "kubernetes_deployment" "claude_agent" { + metadata { + name = "claude-agent-service" + namespace = kubernetes_namespace.claude_agent.metadata[0].name + labels = local.labels + } + + spec { + replicas = 1 + strategy { + type = "Recreate" + } + + selector { + match_labels = local.labels + } + + template { + metadata { + labels = local.labels + } + + spec { + service_account_name = kubernetes_service_account.claude_agent.metadata[0].name + + image_pull_secrets { + name = "registry-credentials" + } + + security_context { + run_as_user = 1000 + run_as_group = 1000 + fs_group = 1000 + } + + # Fix workspace ownership (PVC may have root-owned files from prior run) + init_container { + name = "fix-perms" + image = "busybox:1.37" + command = ["sh", "-c", "chown -R 1000:1000 /workspace"] + security_context { + run_as_user = 0 + } + volume_mount { + name = "workspace" + mount_path = "/workspace" + } + resources { + requests = { + memory = "32Mi" + } + limits = { + memory = "64Mi" + } + } + } + + # Copy Claude credentials to writable volume (CLI needs to refresh OAuth tokens) + init_container { + name = "copy-claude-creds" + image = "busybox:1.37" + command = ["sh", "-c", "cp /secrets/claude/.credentials.json /home/agent/.claude/.credentials.json && chown 1000:1000 /home/agent/.claude/.credentials.json"] + security_context { + run_as_user = 0 + } + volume_mount { + name = "claude-credentials-secret" + mount_path = "/secrets/claude" + } + volume_mount { + name = "claude-home" + mount_path = "/home/agent/.claude" + } + resources { + requests = { + memory = "32Mi" + } + limits = { + memory = "64Mi" + } + } + } + + # Init: clone repo + unlock git-crypt on first run + init_container { + name = "git-init" + image = "${local.image}:${local.image_tag}" + command = ["sh", "-c", <<-EOF + set -e + + # Configure git with HTTPS + PAT + git config --global user.name "Claude Agent Service" + git config --global user.email "claude-agent@viktorbarzin.me" + git config --global --add safe.directory /workspace/infra + git config --global url."https://$${GITHUB_TOKEN}@github.com/".insteadOf "git@github.com:" + git config --global url."https://$${GITHUB_TOKEN}@github.com/".insteadOf "https://github.com/" + + # Clone or update repo + if [ ! -d /workspace/infra/.git ]; then + git clone https://$${GITHUB_TOKEN}@github.com/ViktorBarzin/infra.git /workspace/infra + else + cd /workspace/infra + git fetch origin + git reset --hard origin/master + fi + + # Unlock git-crypt + cd /workspace/infra + git-crypt unlock /secrets/git-crypt/key || true + EOF + ] + + env_from { + secret_ref { + name = "claude-agent-secrets" + } + } + + volume_mount { + name = "workspace" + mount_path = "/workspace" + } + volume_mount { + name = "git-crypt-key" + mount_path = "/secrets/git-crypt" + } + + resources { + requests = { + cpu = "100m" + memory = "256Mi" + } + limits = { + memory = "512Mi" + } + } + } + + container { + name = "claude-agent-service" + image = "${local.image}:${local.image_tag}" + + port { + container_port = 8080 + } + + env_from { + secret_ref { + name = "claude-agent-secrets" + } + } + + env { + name = "WORKSPACE_DIR" + value = "/workspace/infra" + } + + liveness_probe { + http_get { + path = "/health" + port = 8080 + } + initial_delay_seconds = 10 + period_seconds = 30 + } + + readiness_probe { + http_get { + path = "/health" + port = 8080 + } + initial_delay_seconds = 5 + period_seconds = 10 + } + + volume_mount { + name = "workspace" + mount_path = "/workspace" + } + volume_mount { + name = "sops-age-key" + mount_path = "/home/agent/.config/sops/age" + } + volume_mount { + name = "claude-home" + mount_path = "/home/agent/.claude" + } + + resources { + requests = { + cpu = "500m" + memory = "2Gi" + } + limits = { + memory = "4Gi" + } + } + } + + volume { + name = "workspace" + persistent_volume_claim { + claim_name = kubernetes_persistent_volume_claim.workspace.metadata[0].name + } + } + + volume { + name = "sops-age-key" + secret { + secret_name = kubernetes_secret.sops_age_key.metadata[0].name + default_mode = "0600" + } + } + + volume { + name = "git-crypt-key" + config_map { + name = kubernetes_config_map.git_crypt_key.metadata[0].name + } + } + + volume { + name = "claude-credentials-secret" + secret { + secret_name = kubernetes_secret.claude_credentials.metadata[0].name + default_mode = "0600" + } + } + + volume { + name = "claude-home" + empty_dir {} + } + } + } + } + + lifecycle { + ignore_changes = [spec[0].template[0].spec[0].dns_config] + } +} + +# --- Service --- + +resource "kubernetes_service" "claude_agent" { + metadata { + name = "claude-agent-service" + namespace = kubernetes_namespace.claude_agent.metadata[0].name + labels = local.labels + } + + spec { + selector = local.labels + + port { + port = 8080 + target_port = 8080 + } + + type = "ClusterIP" + } +}