# k8s-version-upgrade — Automated K8s component (kubeadm/kubelet/kubectl) upgrade
#
# Detects new patch/minor versions via a weekly CronJob, then dispatches the
# `k8s-version-upgrade` agent (infra/.claude/agents/k8s-version-upgrade.md)
# through claude-agent-service for the actual rolling upgrade.
#
# Reuse points:
#   - claude-agent-service.claude-agent.svc:8080 — agent job runner
#   - Vault secret/k8s-upgrade/* — operator populates ssh_key + slack_webhook
#   - Prometheus + Pushgateway + Upgrade Gates alert group (in monitoring stack)
#   - update_k8s.sh — library script the agent shells into nodes with
#
# Notes:
#   - Schedule is Sun 12:00 UTC — well outside the kured Mon-Fri 02:00-06:00
#     London window so OS reboots and K8s version rollouts can't overlap.
#   - Patch detection uses `apt-cache madison kubeadm` on master via SSH.
#     Minor detection probes the next-minor apt repo URL with HEAD.

variable "schedule" {
  description = "Cron schedule of the version-detection CronJob (default: Sunday 12:00 UTC)."
  type        = string
  default     = "0 12 * * 0" # Sunday 12:00 UTC
}

# Toggle to suspend the detection CronJob without dropping the stack.
variable "enabled" {
  description = "When false, the detection CronJob is created with suspend=true instead of being removed."
  type        = bool
  default     = true
}

# Mirrors `local.image_tag` in stacks/claude-agent-service/main.tf — keep in
# sync when the claude-agent-service image is rebuilt. Reused here because the
# detection CronJob only needs kubectl, ssh-client, curl, jq — all of which
# the claude-agent-service image already ships.
variable "claude_agent_service_image_tag" {
  description = "Tag of the claude-agent-service image that the detection CronJob runs."
  type        = string
  default     = "2fd7670d"
}

# If true, the CronJob runs the detection sequence but does NOT POST to
# claude-agent-service. Used for Test 1 to confirm detection works without
# firing a real upgrade.
variable "detection_dry_run" {
  description = "When true, the CronJob detects and reports upgrades but never dispatches the upgrade agent."
  type        = bool
  default     = false
}

locals {
  # Namespace that holds the detection CronJob, its credentials and RBAC.
  namespace = "k8s-upgrade"

  # Detection runs on the claude-agent-service image (see the tag variable).
  ca_image = "forgejo.viktorbarzin.me/viktor/claude-agent-service:${var.claude_agent_service_image_tag}"

  # Labels applied to the CronJob, its Jobs and their Pods.
  labels = {
    app = "k8s-version-check"
  }
}

# --- Namespace ---

resource "kubernetes_namespace" "k8s_upgrade" {
  metadata {
    name = local.namespace
    labels = {
      tier = local.tiers.cluster
    }
  }

  lifecycle {
    # KYVERNO_LIFECYCLE_V1: goldilocks-vpa-auto-mode ClusterPolicy stamps this label on every namespace
    ignore_changes = [metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"]]
  }
}

# --- ExternalSecret: ssh_key + slack_webhook + agent-service bearer ---
#
# Operator populates Vault `secret/k8s-upgrade/` with:
#   - ssh_key       (PEM-encoded ed25519 private key)
#   - ssh_key_pub   (the matching public key — distributed to nodes' authorized_keys)
#   - slack_webhook (Slack incoming-webhook URL, separate channel from kured for clean alerting)
#
# The claude-agent-service bearer token comes from secret/claude-agent-service
# (reused — no parallel token needed).
# Materialises the `k8s-upgrade-creds` Secret from the `vault-kv`
# ClusterSecretStore: two properties from the `k8s-upgrade` entry plus the
# bearer token from the `claude-agent-service` entry. Refreshed every 15m.
resource "kubernetes_manifest" "external_secret" {
  manifest = {
    apiVersion = "external-secrets.io/v1beta1"
    kind       = "ExternalSecret"

    metadata = {
      name      = "k8s-upgrade-creds"
      namespace = kubernetes_namespace.k8s_upgrade.metadata[0].name
    }

    spec = {
      refreshInterval = "15m"

      secretStoreRef = {
        name = "vault-kv"
        kind = "ClusterSecretStore"
      }

      target = {
        name = "k8s-upgrade-creds"
      }

      data = [
        {
          secretKey = "ssh_key"
          remoteRef = {
            key      = "k8s-upgrade"
            property = "ssh_key"
          }
        },
        {
          secretKey = "slack_webhook"
          remoteRef = {
            key      = "k8s-upgrade"
            property = "slack_webhook"
          }
        },
        {
          secretKey = "api_bearer_token"
          remoteRef = {
            key      = "claude-agent-service"
            property = "api_bearer_token"
          }
        },
      ]
    }
  }
}

# --- ServiceAccount + RBAC for the detection CronJob ---

# Identity the detection CronJob's pods run as.
resource "kubernetes_service_account" "k8s_version_check" {
  metadata {
    name      = "k8s-version-check"
    namespace = kubernetes_namespace.k8s_upgrade.metadata[0].name
  }
}

# Cluster-wide read on nodes (for kubeletVersion comparison)
resource "kubernetes_cluster_role" "k8s_version_check" {
  metadata {
    name = "k8s-version-check"
  }

  rule {
    api_groups = [""]
    resources  = ["nodes"]
    verbs      = ["get", "list"]
  }
}

# Grants the node-read ClusterRole to the detection ServiceAccount.
resource "kubernetes_cluster_role_binding" "k8s_version_check" {
  metadata {
    name = "k8s-version-check"
  }

  role_ref {
    api_group = "rbac.authorization.k8s.io"
    kind      = "ClusterRole"
    name      = kubernetes_cluster_role.k8s_version_check.metadata[0].name
  }

  subject {
    kind      = "ServiceAccount"
    name      = kubernetes_service_account.k8s_version_check.metadata[0].name
    namespace = kubernetes_namespace.k8s_upgrade.metadata[0].name
  }
}

# Namespace-scoped: detection CronJob reads its own creds Secret.
resource "kubernetes_role" "k8s_version_check_secrets" {
  metadata {
    name      = "k8s-version-check-secrets"
    namespace = kubernetes_namespace.k8s_upgrade.metadata[0].name
  }

  # `get` only, pinned to the single Secret by name.
  rule {
    api_groups     = [""]
    resources      = ["secrets"]
    resource_names = ["k8s-upgrade-creds"]
    verbs          = ["get"]
  }
}

# Binds the Secret-read Role to the detection ServiceAccount.
resource "kubernetes_role_binding" "k8s_version_check_secrets" {
  metadata {
    name      = "k8s-version-check-secrets"
    namespace = kubernetes_namespace.k8s_upgrade.metadata[0].name
  }

  role_ref {
    api_group = "rbac.authorization.k8s.io"
    kind      = "Role"
    name      = kubernetes_role.k8s_version_check_secrets.metadata[0].name
  }

  subject {
    kind      = "ServiceAccount"
    name      = kubernetes_service_account.k8s_version_check.metadata[0].name
    namespace = kubernetes_namespace.k8s_upgrade.metadata[0].name
  }
}

# --- Cross-namespace RBAC: claude-agent SA reads k8s-upgrade-creds + annotates ns ---
#
# The k8s-version-upgrade agent runs inside the claude-agent-service pod (SA
# `claude-agent` in `claude-agent` ns). It needs:
#   - GET on this namespace's k8s-upgrade-creds Secret (to fetch ssh_key + slack)
#   - PATCH on the k8s-upgrade Namespace annotations (in-flight marker)

resource "kubernetes_role" "claude_agent_reads_creds" {
  metadata {
    name      = "claude-agent-reads-creds"
    namespace = kubernetes_namespace.k8s_upgrade.metadata[0].name
  }

  # Same single-Secret `get` grant as the detection job's Role.
  rule {
    api_groups     = [""]
    resources      = ["secrets"]
    resource_names = ["k8s-upgrade-creds"]
    verbs          = ["get"]
  }
}

# Subject is the ServiceAccount `claude-agent` in namespace `claude-agent`,
# referenced by literal name here.
resource "kubernetes_role_binding" "claude_agent_reads_creds" {
  metadata {
    name      = "claude-agent-reads-creds"
    namespace = kubernetes_namespace.k8s_upgrade.metadata[0].name
  }

  role_ref {
    api_group = "rbac.authorization.k8s.io"
    kind      = "Role"
    name      = kubernetes_role.claude_agent_reads_creds.metadata[0].name
  }

  subject {
    kind      = "ServiceAccount"
    name      = "claude-agent"
    namespace = "claude-agent"
  }
}

# The base claude-agent ClusterRole grants get/list/watch on most resources
# but not the mutating verbs the upgrade agent needs.
# Rather than fork the upstream stack, we add a sibling ClusterRole here scoped
# to exactly the verbs+resources required:
#   - patch/update on namespace k8s-upgrade (in-flight annotation)
#   - create/delete on batch/jobs (trigger etcd snapshot Job from cronjob/backup-etcd)
#   - patch/update on nodes (cordon/uncordon — drain needs this)
#   - create on pods/eviction (drain evicts pods)
#   - delete on pods (pods that stay stuck during a drain)
resource "kubernetes_cluster_role" "claude_agent_upgrade_ops" {
  metadata {
    name = "claude-agent-upgrade-ops"
  }

  # Annotate the k8s-upgrade namespace
  rule {
    api_groups     = [""]
    resources      = ["namespaces"]
    resource_names = ["k8s-upgrade"]
    verbs          = ["patch", "update"]
  }

  # Trigger etcd snapshot Jobs (from cronjob/backup-etcd in default ns).
  # Cluster-scoped because we may also create test Jobs in k8s-upgrade ns.
  rule {
    api_groups = ["batch"]
    resources  = ["jobs"]
    verbs      = ["create", "delete"]
  }

  # Cordon / uncordon nodes
  rule {
    api_groups = [""]
    resources  = ["nodes"]
    verbs      = ["patch", "update"]
  }

  # Drain (evict pods)
  rule {
    api_groups = [""]
    resources  = ["pods/eviction"]
    verbs      = ["create"]
  }

  # Delete pods stuck during drain (sometimes evict isn't enough)
  rule {
    api_groups = [""]
    resources  = ["pods"]
    verbs      = ["delete"]
  }
}

# Grants the upgrade-ops ClusterRole to ServiceAccount `claude-agent` in
# namespace `claude-agent`.
resource "kubernetes_cluster_role_binding" "claude_agent_upgrade_ops" {
  metadata {
    name = "claude-agent-upgrade-ops"
  }

  role_ref {
    api_group = "rbac.authorization.k8s.io"
    kind      = "ClusterRole"
    name      = kubernetes_cluster_role.claude_agent_upgrade_ops.metadata[0].name
  }

  subject {
    kind      = "ServiceAccount"
    name      = "claude-agent"
    namespace = "claude-agent"
  }
}

# --- Detection CronJob ---
#
# Weekly: compares running cluster version against latest available patch
# (apt-cache madison kubeadm on master) and latest available minor (HEAD on
# next-minor pkgs.k8s.io repo). When a target is detected, POSTs to
# claude-agent-service to kick the upgrade agent.
# Detection CronJob. One run does, in order:
#   1. load ssh_key / slack_webhook / api_bearer_token from k8s-upgrade-creds
#   2. determine the running version (lowest kubeletVersion across nodes)
#   3. find the newest patch of the running minor via apt on k8s-master
#   4. probe whether the next minor's apt repo exists
#   5. pick a target (a strictly newer patch wins over a new minor)
#   6. push k8s_upgrade_available + last-run timestamp to Pushgateway
#   7. stop if there is no target or DRY_RUN is set
#   8. POST the upgrade request to claude-agent-service
#
# Terraform escapes inside the heredoc: `$${` renders a literal `${` and
# `%%{` renders a literal `%{`.
resource "kubernetes_cron_job_v1" "k8s_version_check" {
  metadata {
    name      = "k8s-version-check"
    namespace = kubernetes_namespace.k8s_upgrade.metadata[0].name
    labels    = local.labels
  }

  spec {
    schedule                      = var.schedule
    concurrency_policy            = "Forbid"
    successful_jobs_history_limit = 3
    failed_jobs_history_limit     = 3
    starting_deadline_seconds     = 600
    suspend                       = !var.enabled

    job_template {
      metadata {
        labels = local.labels
      }

      spec {
        # No retries: a failed detection run waits for the next schedule.
        backoff_limit              = 0
        ttl_seconds_after_finished = 86400

        template {
          metadata {
            labels = local.labels
          }

          spec {
            service_account_name = kubernetes_service_account.k8s_version_check.metadata[0].name
            restart_policy       = "Never"

            image_pull_secrets {
              name = "registry-credentials"
            }

            container {
              name  = "version-check"
              image = local.ca_image
              command = ["/bin/bash", "-c", <<-EOT
                set -euo pipefail

                echo "==> k8s-version-check ($(date -u +%FT%TZ))"

                # 1. Load SSH key from K8s Secret
                mkdir -p /tmp
                /usr/local/bin/kubectl get secret k8s-upgrade-creds \
                  -o jsonpath='{.data.ssh_key}' | base64 -d > /tmp/k8s-upgrade-ssh-key
                chmod 400 /tmp/k8s-upgrade-ssh-key

                SLACK=$(/usr/local/bin/kubectl get secret k8s-upgrade-creds \
                  -o jsonpath='{.data.slack_webhook}' | base64 -d)
                AGENT_TOKEN=$(/usr/local/bin/kubectl get secret k8s-upgrade-creds \
                  -o jsonpath='{.data.api_bearer_token}' | base64 -d)

                # BatchMode + ConnectTimeout make an unreachable or misconfigured
                # master fail fast instead of hanging the Job.
                # NOTE(review): accept-new against a fresh known_hosts file in
                # every pod means the host key is trusted on first use each run.
                SSH="ssh -i /tmp/k8s-upgrade-ssh-key \
                  -o BatchMode=yes \
                  -o ConnectTimeout=15 \
                  -o StrictHostKeyChecking=accept-new \
                  -o UserKnownHostsFile=/tmp/known_hosts"

                # Best-effort Slack notification; never fails the run.
                slack() {
                  curl -sS -X POST -H 'Content-Type: application/json' \
                    --data "$(jq -nc --arg t "[k8s-version-check] $1" '{text: $t}')" \
                    "$SLACK" || true
                }

                # version_gt A B — true only when A is strictly newer than B
                # according to `sort -V`.
                version_gt() {
                  [ "$1" != "$2" ] && \
                    [ "$(printf '%s\n%s\n' "$1" "$2" | sort -V | tail -n1)" = "$1" ]
                }

                # 2. Detect running version.
                # Take the LOWEST kubeletVersion across all nodes, so a cluster
                # left half-upgraded by an interrupted rollout is still seen as
                # needing an upgrade regardless of node listing order.
                RUNNING=$(/usr/local/bin/kubectl get nodes \
                  -o jsonpath='{range .items[*]}{.status.nodeInfo.kubeletVersion}{"\n"}{end}' \
                  | tr -d v | sort -V | sed -n '1p')
                if [ -z "$RUNNING" ]; then
                  echo "could not determine running kubelet version" >&2
                  exit 1
                fi
                RUNNING_MINOR=$(echo "$RUNNING" | awk -F. '{print $1"."$2}')
                echo "Running version: v$RUNNING (minor $RUNNING_MINOR)"

                # 3. Detect highest available patch within the running minor track.
                # Refresh the local apt cache first — without this, a newly-published
                # patch won't show up via `apt-cache madison` until something else
                # triggers an `apt-get update`.
                LATEST_PATCH=$($SSH wizard@k8s-master \
                  "sudo apt-get update -qq -o Dir::Etc::sourcelist='sources.list.d/kubernetes.list' -o Dir::Etc::sourceparts='-' -o APT::Get::List-Cleanup='0' >/dev/null 2>&1 ; \
                   apt-cache madison kubeadm 2>/dev/null \
                     | awk '{print \$3}' \
                     | sed 's/-.*//' \
                     | grep '^$RUNNING_MINOR\\.' \
                     | sort -V | tail -1" || echo "")
                echo "Latest patch (apt): v$LATEST_PATCH"
                if [ -z "$LATEST_PATCH" ]; then
                  # Surface the failure instead of silently reporting "no upgrade".
                  echo "warn: patch detection returned nothing (SSH or apt failure on k8s-master?)" >&2
                  slack "WARN: patch detection returned nothing (SSH or apt failure on k8s-master?)"
                fi

                # 4. Detect next available minor by probing the apt repo URL.
                # -L follows redirects so the reported status is that of the
                # final response, not of an intermediate redirect.
                NEXT_MINOR_NUM=$(( $(echo "$RUNNING_MINOR" | cut -d. -f2) + 1 ))
                NEXT_MINOR="1.$NEXT_MINOR_NUM"
                NEXT_MINOR_AVAILABLE="no"
                if curl -sILo /dev/null -w '%%{http_code}' \
                    "https://pkgs.k8s.io/core:/stable:/v$NEXT_MINOR/deb/Release" \
                    | grep -q '^200$'; then
                  NEXT_MINOR_AVAILABLE="yes"
                fi
                echo "Next minor v$NEXT_MINOR available: $NEXT_MINOR_AVAILABLE"

                # 5. Decide what to do
                TARGET=""
                KIND=""
                # Only a strictly newer patch counts; a stale apt cache that lists
                # an older version must never trigger a downgrade.
                if [ -n "$LATEST_PATCH" ] && version_gt "$LATEST_PATCH" "$RUNNING"; then
                  TARGET="$LATEST_PATCH"
                  KIND="patch"
                elif [ "$NEXT_MINOR_AVAILABLE" = "yes" ]; then
                  # Probe the minor track to get its latest patch. The index is
                  # fetched on the master and parsed here; only kubeadm stanzas
                  # are considered so other packages in the same repo cannot
                  # supply the version number.
                  NEXT_MINOR_PATCH=$($SSH wizard@k8s-master \
                    "curl -sfL 'https://pkgs.k8s.io/core:/stable:/v$NEXT_MINOR/deb/Packages'" \
                    | awk '$1 == "Package:" { pkg = $2 } $1 == "Version:" && pkg == "kubeadm" { print $2 }' \
                    | sed 's/-.*//' \
                    | sort -V | tail -1 || echo "")
                  if [ -n "$NEXT_MINOR_PATCH" ]; then
                    TARGET="$NEXT_MINOR_PATCH"
                    KIND="minor"
                  fi
                fi

                # 6. Push the discovery metric to Pushgateway
                PG='http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/k8s-version-check'
                {
                  echo "# TYPE k8s_upgrade_available gauge"
                  if [ -n "$TARGET" ]; then
                    echo "k8s_upgrade_available{kind=\"$KIND\",running=\"$RUNNING\",target=\"$TARGET\"} 1"
                  else
                    echo "k8s_upgrade_available{kind=\"none\",running=\"$RUNNING\",target=\"$RUNNING\"} 0"
                  fi
                  echo "# TYPE k8s_version_check_last_run_timestamp gauge"
                  echo "k8s_version_check_last_run_timestamp $(date +%s)"
                } | curl -sS --data-binary @- "$PG" || echo "warn: pushgateway push failed"

                # 7. Decide whether to dispatch
                if [ -z "$TARGET" ]; then
                  echo "No upgrade needed (running=$RUNNING, latest_patch=$LATEST_PATCH, next_minor_available=$NEXT_MINOR_AVAILABLE)"
                  exit 0
                fi

                slack "K8s upgrade available: v$RUNNING → v$TARGET ($KIND)"

                # DRY_RUN_OVERRIDE wins over DRY_RUN — but a Job copied from
                # this CronJob can't add new env vars (spec is immutable). The
                # operator path for "trigger detection without dispatch" is
                # toggling the CronJob's `var.detection_dry_run` then applying.
                # Documented in the runbook.
                EFFECTIVE_DRY_RUN="$${DRY_RUN_OVERRIDE:-$DRY_RUN}"
                if [ "$EFFECTIVE_DRY_RUN" = "true" ]; then
                  echo "dry_run=true — not POSTing to claude-agent-service"
                  slack "DRY_RUN — skipping agent dispatch"
                  exit 0
                fi

                # 8. POST to claude-agent-service
                PAYLOAD=$(jq -nc \
                  --arg target "$TARGET" \
                  --arg kind "$KIND" \
                  '{
                    prompt: ("Run the k8s-version-upgrade agent. Inputs: " + ({target_version: $target, kind: $kind, dry_run: false, stages: "all"} | tostring)),
                    agent: ".claude/agents/k8s-version-upgrade",
                    max_budget_usd: 30
                  }')

                echo "Dispatching agent: $PAYLOAD"
                # A transport failure (DNS, connection refused) would abort the
                # script under `set -e` with no notification; report it first.
                if ! RESP=$(curl -sS -w '\n%%{http_code}' -X POST \
                    -H "Authorization: Bearer $AGENT_TOKEN" \
                    -H 'Content-Type: application/json' \
                    -d "$PAYLOAD" \
                    http://claude-agent-service.claude-agent.svc.cluster.local:8080/execute); then
                  slack "ERROR dispatching agent: could not reach claude-agent-service"
                  echo "dispatch failed: could not reach claude-agent-service" >&2
                  exit 1
                fi
                # The last line of RESP is the HTTP status appended by -w; the
                # rest is the response body.
                CODE=$(printf '%s' "$RESP" | tail -n1)
                BODY=$(printf '%s' "$RESP" | sed '$d')

                if [ "$CODE" = "200" ] || [ "$CODE" = "202" ]; then
                  JOB_ID=$(printf '%s' "$BODY" | jq -r '.job_id // .id // "unknown"')
                  slack "Agent dispatched: job=$JOB_ID (target=v$TARGET kind=$KIND)"
                  echo "OK — job=$JOB_ID"
                else
                  slack "ERROR dispatching agent: HTTP $CODE — $BODY"
                  echo "dispatch failed: HTTP $CODE — $BODY" >&2
                  exit 1
                fi
              EOT
              ]

              env {
                name  = "DRY_RUN"
                value = tostring(var.detection_dry_run)
              }

              # HOME is pointed at /tmp for tools that write under $HOME.
              env {
                name  = "HOME"
                value = "/tmp"
              }

              resources {
                requests = {
                  cpu    = "50m"
                  memory = "128Mi"
                }
                limits = {
                  memory = "256Mi"
                }
              }
            }
          }
        }
      }
    }
  }

  lifecycle {
    # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
    ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config]
  }
}