fix: restore tree dropped by 6d224861; land stem95su gdrive-sync (10m) [ci skip]

6d224861 came from a --no-checkout worktree whose empty index made the commit drop every file except two. This restores 05b50d2b's full tree and correctly adds stacks/stem95su/gdrive-sync.tf + the service-catalog stem95su entry. Forward-only (parent=6d224861, no force-push); [ci skip] since the live infra was never applied from the broken commit. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-09 08:45:33 +00:00 · 2026-06-09 08:45:33 +00:00 · fd0f4a0365
commit fd0f4a0365
parent 6d224861c4
1166 changed files with 358546 additions and 0 deletions
--- a/stacks/k8s-version-upgrade/job-template.yaml
+++ b/stacks/k8s-version-upgrade/job-template.yaml
@ -0,0 +1,88 @@
+# k8s-upgrade-chain Job template.
+#
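+# Rendered before `kubectl apply` by upgrade-step.sh (via `envsubst`) and by
+# the detection CronJob (via Python's `os.path.expandvars`). All ${VAR}
+# placeholders are resolved at render time from the environment; Terraform
+# only loads this file verbatim with file() and does NOT interpolate it.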
+#
+# Required environment for envsubst:
+#   JOB_NAME            unique-per-(phase, target_version[, target_node])
+#   PHASE_NEXT          phase the Job runs (preflight|master|worker|postflight)
+#   TARGET_NODE_NEXT    node the Job operates on (empty for preflight/master/postflight)
+#   TARGET_VERSION      X.Y.Z
+#   TARGET_VERSION_LABEL  X-Y-Z (label-safe)
+#   KIND                patch | minor
+#   IMAGE               container image to run upgrade-step.sh
+#   SCHEDULING_BLOCK    YAML fragment with nodeSelector/tolerations (may be empty)
+#
+# Idempotency: name is deterministic per (phase, target_version[, target_node])
+# so `kubectl apply` reconciles to a single Job per run.
+# Single-pod Job that runs one phase of the upgrade chain.
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: ${JOB_NAME}
+  namespace: k8s-upgrade
+  labels:
+    app: k8s-upgrade-chain
+    phase: ${PHASE_NEXT}
+    target-version: "${TARGET_VERSION_LABEL}"
+spec:
+  ttlSecondsAfterFinished: 604800   # 7 days for postmortem review
+  backoffLimit: 1
+  template:
+    metadata:
+      labels:
+        app: k8s-upgrade-chain
+        phase: ${PHASE_NEXT}
+    spec:
+      serviceAccountName: k8s-upgrade-job
+      restartPolicy: Never
+      # The scheduling placeholder below is substituted at column 0, so the
+      # fragment must carry its own six-space indentation to land inside this
+      # pod spec. Keep it alone on its line, and never spell it in dollar-brace
+      # form inside a comment: the renderer substitutes comments too and the
+      # value is multi-line.
+${SCHEDULING_BLOCK}
+      imagePullSecrets:
+        - name: registry-credentials
+      containers:
+        - name: upgrade-step
+          image: ${IMAGE}
+          # Chain parameters read by upgrade-step.sh (see its header).
+          env:
+            - name: PHASE
+              value: "${PHASE_NEXT}"
+            - name: TARGET_NODE
+              value: "${TARGET_NODE_NEXT}"
+            - name: TARGET_VERSION
+              value: "${TARGET_VERSION}"
+            - name: KIND
+              value: "${KIND}"
+            # Forwarded so the step can spawn the next Job with the same image.
+            - name: IMAGE
+              value: "${IMAGE}"
+            # NOTE(review): presumably gives the non-root user a writable
+            # home directory — confirm.
+            - name: HOME
+              value: "/tmp"
+          command: ["/bin/bash", "/scripts/upgrade-step.sh"]
+          volumeMounts:
+            - name: creds
+              mountPath: /secrets/k8s-upgrade
+              readOnly: true
+            - name: scripts
+              mountPath: /scripts
+              readOnly: true
+            # This same template, mounted so the step can render the next Job.
+            - name: template
+              mountPath: /template
+              readOnly: true
+          resources:
+            requests:
+              cpu: "100m"
+              memory: "256Mi"
+            limits:
+              memory: "512Mi"
+      volumes:
+        - name: creds
+          secret:
+            secretName: k8s-upgrade-creds
+            # 0444 so the non-root container can read; upgrade-step.sh copies
+            # the SSH key to /tmp/ssh_key with mode 0400 for openssh.
+            defaultMode: 0444
+        - name: scripts
+          configMap:
+            name: k8s-upgrade-scripts
+            defaultMode: 0755
+        - name: template
+          configMap:
+            name: k8s-upgrade-job-template
--- a/stacks/k8s-version-upgrade/main.tf
+++ b/stacks/k8s-version-upgrade/main.tf
@ -0,0 +1,523 @@
+# k8s-version-upgrade — Automated K8s component (kubeadm/kubelet/kubectl) upgrade
+#
+# Architecture: detection CronJob → chain of small Jobs, one per phase. Each
+# Job's pod runs on a node that is NOT its drain target — eliminates the
+# self-preemption bug that killed the agent-based v1 (2026-05-11 incident).
+#
+# Chain (Job 0 → Job 6):
+#   preflight  (pinned: k8s-node1)
+#   master     (pinned: k8s-node1; drains k8s-master)
+#   worker     (pinned: k8s-node1; drains k8s-node4 → 3 → 2)
+#   worker     (pinned: k8s-master + control-plane toleration; drains k8s-node1 last)
+#   postflight (no pinning)
+#
+# Each phase Job's container runs scripts/upgrade-step.sh which:
+#   - dispatches on $PHASE
+#   - spawns the next Job via envsubst on job-template.yaml
+#   - uses deterministic naming (k8s-upgrade-${phase}-${target_version}[-${node}])
+#     so re-running on failure reconciles to a single Job per run.
+#
+# Reuse points:
+#   - claude-agent-service image (kubectl + ssh + jq + curl + envsubst)
+#   - Vault secret/k8s-upgrade/* (ssh_key, slack_webhook)
+#   - Prometheus + Pushgateway + Upgrade Gates alerts
+#   - default/backup-etcd CronJob (snapshot trigger)
+#   - infra/scripts/update_k8s.sh (per-node upgrade body)
+
+# Cron expression for the detection CronJob (its `schedule` field).
+variable "schedule" {
+  type    = string
+  # Daily 12:00 UTC — outside kured window (kured runs 02:00-06:00
+  # London). Was weekly Sunday until 2026-05-18; daily picks up upstream
+  # patch releases the same day they land. Concurrency is bounded by the
+  # CronJob's Forbid policy + Job-name idempotency (the detection job
+  # skips spawning a preflight Job if one already exists).
+  default = "0 12 * * *"
+}
+
+# Master switch: when false the detection CronJob is kept but suspended
+# (`suspend = !var.enabled`), so scheduled detection never runs.
+variable "enabled" {
+  type    = bool
+  default = true
+}
+
+# Mirrors `local.image_tag` in stacks/claude-agent-service/main.tf — bump
+# in lockstep with claude-agent-service rebuilds. The image ships kubectl,
+# ssh-client, curl, jq, envsubst — everything the upgrade Jobs need.
+variable "image_tag" {
+  type    = string
+  default = "2fd7670d"
+}
+
+# When true, detection runs but does NOT spawn the preflight Job.
+# Passed to the detection container as the DRY_RUN environment variable.
+variable "detection_dry_run" {
+  type    = bool
+  default = false
+}
+
+locals {
+  # Must stay in sync with the namespace hard-coded in job-template.yaml and
+  # in the shell scripts (`k8s-upgrade`).
+  namespace = "k8s-upgrade"
+  # Image used by the detection CronJob; also exported to the chain as IMAGE.
+  image     = "forgejo.viktorbarzin.me/viktor/claude-agent-service:${var.image_tag}"
+  # Applied to the ConfigMaps and the detection CronJob in this stack.
+  labels = {
+    app = "k8s-version-upgrade"
+  }
+}
+
+# --- Namespace ---
+
+# Namespace holding the detection CronJob, the chain Jobs and their
+# supporting ServiceAccount/ConfigMaps/Secret.
+resource "kubernetes_namespace" "k8s_upgrade" {
+  metadata {
+    name = local.namespace
+    labels = {
+      # NOTE(review): local.tiers is not declared in the locals block visible
+      # in this file — presumably defined elsewhere in the stack; confirm.
+      tier = local.tiers.cluster
+      "keel.sh/enrolled" = "true"
+    }
+  }
+  lifecycle {
+    # KYVERNO_LIFECYCLE_V1: goldilocks-vpa-auto-mode ClusterPolicy stamps this label
+    ignore_changes = [metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"]]
+  }
+}
+
+# --- ExternalSecret: SSH key + Slack webhook ---
+#
+# Operator populates Vault `secret/k8s-upgrade/` with:
+#   - ssh_key       (ed25519 PRIVATE key, used to SSH wizard@<node> from Jobs)
+#   - ssh_key_pub   (matching public key, deployed to nodes' authorized_keys)
+#   - slack_webhook (incoming-webhook URL)
+#
+# No claude-agent bearer needed — the chain no longer POSTs to that service.
+# Syncs `ssh_key` and `slack_webhook` from the Vault entry `k8s-upgrade` into
+# the Secret `k8s-upgrade-creds`. That Secret name is referenced by the chain
+# Job template, the detection CronJob volume and the namespaced Role, so keep
+# all of them in step. `ssh_key_pub` is intentionally not synced here.
+resource "kubernetes_manifest" "external_secret" {
+  manifest = {
+    apiVersion = "external-secrets.io/v1beta1"
+    kind       = "ExternalSecret"
+    metadata = {
+      name      = "k8s-upgrade-creds"
+      namespace = kubernetes_namespace.k8s_upgrade.metadata[0].name
+    }
+    spec = {
+      refreshInterval = "15m"
+      secretStoreRef = {
+        name = "vault-kv"
+        kind = "ClusterSecretStore"
+      }
+      target = {
+        name = "k8s-upgrade-creds"
+      }
+      # Each entry maps one Vault property to the same-named Secret key; the
+      # keys become file names under /secrets/k8s-upgrade in the pods.
+      data = [
+        {
+          secretKey = "ssh_key"
+          remoteRef = {
+            key      = "k8s-upgrade"
+            property = "ssh_key"
+          }
+        },
+        {
+          secretKey = "slack_webhook"
+          remoteRef = {
+            key      = "k8s-upgrade"
+            property = "slack_webhook"
+          }
+        },
+      ]
+    }
+  }
+}
+
+# --- Unified ServiceAccount + RBAC ---
+#
+# One SA serves BOTH the detection CronJob and every phase Job:
+#   - detection CronJob: needs nodes:get/list + secrets:get + jobs:create
+#     (to spawn Job 0 = preflight)
+#   - phase Jobs: same + pods/eviction:create + pods:delete + namespaces:patch
+#
+# Cluster-scoped because the chain spans the whole cluster (drain works on
+# any node, and the preflight Job creates a Job in `default` ns from
+# `cronjob/backup-etcd`).
+
+# The name must match `serviceAccountName: k8s-upgrade-job` in
+# job-template.yaml, which is plain text and cannot reference this resource.
+resource "kubernetes_service_account" "k8s_upgrade_job" {
+  metadata {
+    name      = "k8s-upgrade-job"
+    namespace = kubernetes_namespace.k8s_upgrade.metadata[0].name
+  }
+}
+
+# Cluster-wide permissions shared by the detection CronJob and every phase Job.
+resource "kubernetes_cluster_role" "k8s_upgrade_job" {
+  metadata {
+    name = "k8s-upgrade-job"
+  }
+  # Nodes: get/list for version comparison + readiness checks; patch/update
+  # to cordon/uncordon (`kubectl drain` cordons the node before evicting).
+  rule {
+    api_groups = [""]
+    resources  = ["nodes"]
+    verbs      = ["get", "list", "patch", "update"]
+  }
+  # Drain — evict pods
+  rule {
+    api_groups = [""]
+    resources  = ["pods/eviction"]
+    verbs      = ["create"]
+  }
+  # Drain fallback — direct delete (predrain_unstick bypasses PDBs)
+  rule {
+    api_groups = [""]
+    resources  = ["pods"]
+    verbs      = ["get", "list", "delete"]
+  }
+  # Read PDBs to find drain-blocking pods
+  rule {
+    api_groups = ["policy"]
+    resources  = ["poddisruptionbudgets"]
+    verbs      = ["get", "list"]
+  }
+  # Read DaemonSets/StatefulSets/ReplicaSets/Deployments so `kubectl drain
+  # --ignore-daemonsets` can classify each pod's owner. Without daemonsets
+  # GET permission, drain bails with "cannot delete daemonsets ... is
+  # forbidden" for every daemonset-managed pod on the node. (2026-05-20)
+  rule {
+    api_groups = ["apps"]
+    resources  = ["daemonsets", "statefulsets", "replicasets", "deployments"]
+    verbs      = ["get", "list"]
+  }
+  # `patch` on deployments added 2026-05-23: phase_master scales tigera-operator
+  # to 0 before drain (operator crashloops during apiserver static-pod swaps,
+  # generates I/O storm that breaks kubeadm's 5-min watch) and back to 1
+  # after master is upgraded. Until HA control plane lands (beads code-n0ow),
+  # this is how we keep autonomous upgrades unblocked.
+  rule {
+    api_groups = ["apps"]
+    resources  = ["deployments", "deployments/scale"]
+    verbs      = ["patch", "update"]
+  }
+  # Chain dispatch — create the next Job; reconcile via apply on retry.
+  # Granted cluster-wide (not only in k8s-upgrade) because the etcd-snapshot
+  # Job is created in the `default` namespace from cronjob/backup-etcd.
+  rule {
+    api_groups = ["batch"]
+    resources  = ["jobs"]
+    verbs      = ["create", "get", "list", "delete", "patch", "watch"]
+  }
+  # Pull CronJob spec for `kubectl create job --from=cronjob/backup-etcd`
+  rule {
+    api_groups = ["batch"]
+    resources  = ["cronjobs"]
+    verbs      = ["get", "list"]
+  }
+  # Annotate the k8s-upgrade namespace (in-flight marker + snapshot path).
+  # Restricted by name, so no other namespace can be modified.
+  rule {
+    api_groups     = [""]
+    resources      = ["namespaces"]
+    resource_names = [local.namespace]
+    verbs          = ["get", "patch", "update"]
+  }
+}
+
+# Grants the cluster-wide role above to the k8s-upgrade-job ServiceAccount.
+resource "kubernetes_cluster_role_binding" "k8s_upgrade_job" {
+  metadata {
+    name = "k8s-upgrade-job"
+  }
+  role_ref {
+    api_group = "rbac.authorization.k8s.io"
+    kind      = "ClusterRole"
+    name      = kubernetes_cluster_role.k8s_upgrade_job.metadata[0].name
+  }
+  subject {
+    kind      = "ServiceAccount"
+    name      = kubernetes_service_account.k8s_upgrade_job.metadata[0].name
+    namespace = kubernetes_namespace.k8s_upgrade.metadata[0].name
+  }
+}
+
+# Namespaced: read the credentials Secret in k8s-upgrade (SSH key + Slack URL)
+# + read the kill-switch ConfigMap (one-touch emergency-stop for the chain).
+resource "kubernetes_role" "k8s_upgrade_job_ns" {
+  metadata {
+    name      = "k8s-upgrade-job-ns"
+    namespace = kubernetes_namespace.k8s_upgrade.metadata[0].name
+  }
+  # Only the credentials Secret, and only `get` — no listing of other Secrets.
+  rule {
+    api_groups     = [""]
+    resources      = ["secrets"]
+    resource_names = ["k8s-upgrade-creds"]
+    verbs          = ["get"]
+  }
+  # Kill-switch ConfigMap. Existence halts the chain (any phase) — see the
+  # "Kill-switch" block at the top of scripts/upgrade-step.sh.
+  # NOTE(review): the scripts only issue a by-name `get`; with resource_names
+  # set, `list`/`watch` are presumably only honoured for requests that select
+  # exactly this name — confirm against the RBAC docs before relying on them.
+  rule {
+    api_groups     = [""]
+    resources      = ["configmaps"]
+    resource_names = ["k8s-upgrade-killswitch"]
+    verbs          = ["get", "list", "watch"]
+  }
+}
+
+# Grants the namespaced role (credentials Secret + kill-switch ConfigMap) to
+# the k8s-upgrade-job ServiceAccount.
+resource "kubernetes_role_binding" "k8s_upgrade_job_ns" {
+  metadata {
+    name      = "k8s-upgrade-job-ns"
+    namespace = kubernetes_namespace.k8s_upgrade.metadata[0].name
+  }
+  role_ref {
+    api_group = "rbac.authorization.k8s.io"
+    kind      = "Role"
+    name      = kubernetes_role.k8s_upgrade_job_ns.metadata[0].name
+  }
+  subject {
+    kind      = "ServiceAccount"
+    name      = kubernetes_service_account.k8s_upgrade_job.metadata[0].name
+    namespace = kubernetes_namespace.k8s_upgrade.metadata[0].name
+  }
+}
+
+# --- ConfigMaps: scripts + Job template ---
+
+# Shell scripts for the chain Jobs. job-template.yaml mounts this ConfigMap
+# (by its literal name `k8s-upgrade-scripts`) at /scripts, so each key below
+# becomes a file there.
+resource "kubernetes_config_map" "k8s_upgrade_scripts" {
+  metadata {
+    name      = "k8s-upgrade-scripts"
+    namespace = kubernetes_namespace.k8s_upgrade.metadata[0].name
+    labels    = local.labels
+  }
+  data = {
+    # file() loads the content verbatim — no Terraform interpolation.
+    "upgrade-step.sh" = file("${path.module}/scripts/upgrade-step.sh")
+    "update_k8s.sh"   = file("${path.module}/../../scripts/update_k8s.sh")
+  }
+}
+
+# The Job template, mounted at /template in both the detection CronJob and
+# every chain Job so each can render and apply the next Job.
+resource "kubernetes_config_map" "k8s_upgrade_job_template" {
+  metadata {
+    name      = "k8s-upgrade-job-template"
+    namespace = kubernetes_namespace.k8s_upgrade.metadata[0].name
+    labels    = local.labels
+  }
+  data = {
+    # file() loads the content verbatim, so the template's placeholders
+    # survive until render time inside the cluster.
+    "job-template.yaml" = file("${path.module}/job-template.yaml")
+  }
+}
+
+# --- Detection CronJob ---
+#
+# Probes for available patch/minor targets on `var.schedule` (daily by
+# default). When one is found, renders Job 0 (preflight) from the same
+# job-template the chain uses. The CronJob no longer POSTs to
+# claude-agent-service; the whole pipeline now runs inside the cluster via
+# Job-chaining.
+
+# Detection CronJob: works out the oldest running kubelet version, probes for
+# a newer patch (or the next minor), publishes a Pushgateway metric, notifies
+# Slack and — unless DRY_RUN is true or a preflight Job for that target
+# already exists — applies Job 0 (preflight) rendered from the job-template.
+#
+# Inside the heredoc, `$$` followed by `{` and `%%` followed by `{` are
+# Terraform escapes for the literal shell/curl sequences; keep them when editing.
+resource "kubernetes_cron_job_v1" "k8s_version_check" {
+  metadata {
+    name      = "k8s-version-check"
+    namespace = kubernetes_namespace.k8s_upgrade.metadata[0].name
+    labels    = local.labels
+  }
+  spec {
+    schedule                      = var.schedule
+    concurrency_policy            = "Forbid"
+    successful_jobs_history_limit = 3
+    failed_jobs_history_limit     = 3
+    starting_deadline_seconds     = 600
+    # var.enabled = false keeps the CronJob but suspends it.
+    suspend                       = !var.enabled
+    job_template {
+      metadata {
+        labels = local.labels
+      }
+      spec {
+        backoff_limit              = 0
+        ttl_seconds_after_finished = 86400
+        template {
+          metadata {
+            labels = local.labels
+          }
+          spec {
+            service_account_name = kubernetes_service_account.k8s_upgrade_job.metadata[0].name
+            restart_policy       = "Never"
+            image_pull_secrets {
+              name = "registry-credentials"
+            }
+            volume {
+              name = "creds"
+              secret {
+                secret_name = "k8s-upgrade-creds"
+                # 0444 — non-root container needs read; SSH key gets re-installed
+                # with mode 0400 in the inline command before any ssh call.
+                default_mode = "0444"
+              }
+            }
+            volume {
+              name = "template"
+              config_map {
+                name = kubernetes_config_map.k8s_upgrade_job_template.metadata[0].name
+              }
+            }
+            container {
+              name  = "version-check"
+              image = local.image
+              command = ["/bin/bash", "-c", <<-EOT
+                set -euo pipefail
+                echo "==> k8s-version-check ($(date -u +%FT%TZ))"
+
+                SLACK=$(cat /secrets/k8s-upgrade/slack_webhook)
+                install -m 0400 /secrets/k8s-upgrade/ssh_key /tmp/ssh_key
+                SSH="ssh -i /tmp/ssh_key -o StrictHostKeyChecking=accept-new -o UserKnownHostsFile=/tmp/known_hosts -o ConnectTimeout=10"
+
+                slack() {
+                  curl -sS -X POST -H 'Content-Type: application/json' \
+                    --data "$(jq -nc --arg t "[k8s-version-check] $1" '{text: $t}')" \
+                    "$SLACK" || true
+                }
+
+                # Kill-switch — see scripts/upgrade-step.sh for full docs.
+                # ConfigMap existence halts the chain (any phase).
+                if /usr/local/bin/kubectl -n k8s-upgrade get configmap k8s-upgrade-killswitch >/dev/null 2>&1; then
+                  # jsonpath prints an empty string (exit 0) when the `reason`
+                  # key is absent, so apply the default afterwards instead of
+                  # relying on `||`, which only fires when kubectl itself fails.
+                  reason=$(/usr/local/bin/kubectl -n k8s-upgrade get configmap k8s-upgrade-killswitch \
+                    -o jsonpath='{.data.reason}' 2>/dev/null || true)
+                  reason="$${reason:-(no reason set)}"
+                  slack "version-check HALTED by kill-switch: $reason"
+                  echo "HALTED. Resume: kubectl -n k8s-upgrade delete cm k8s-upgrade-killswitch"
+                  exit 0
+                fi
+
+                # 1. Detect running version — use the OLDEST kubelet across
+                # all nodes so partial chains (e.g. master upgraded but
+                # workers still pending) don't trick the chain into
+                # thinking the upgrade is complete. Was `.items[0]` (master
+                # only) which made the chain skip when workers were behind.
+                # Fixed 2026-05-23 after node4-only chain failure.
+                RUNNING=$(/usr/local/bin/kubectl get nodes \
+                  -o jsonpath='{range .items[*]}{.status.nodeInfo.kubeletVersion}{"\n"}{end}' \
+                  | tr -d v | sort -V | head -1)
+                RUNNING_MINOR=$(echo "$RUNNING" | awk -F. '{print $1"."$2}')
+                echo "Running version (oldest kubelet): v$RUNNING (minor $RUNNING_MINOR)"
+
+                # 2. Latest patch within current minor (refresh master's apt cache)
+                LATEST_PATCH=$($SSH wizard@k8s-master.viktorbarzin.lan \
+                  "sudo apt-get update -qq -o Dir::Etc::sourcelist='sources.list.d/kubernetes.list' -o Dir::Etc::sourceparts='-' -o APT::Get::List-Cleanup='0' >/dev/null 2>&1 ; \
+                   apt-cache madison kubeadm 2>/dev/null \
+                    | awk '{print \$3}' \
+                    | sed 's/-.*//' \
+                    | grep '^$RUNNING_MINOR\\.' \
+                    | sort -V | tail -1" || echo "")
+                echo "Latest patch: v$LATEST_PATCH"
+
+                # 3. Next-minor probe
+                NEXT_MINOR_NUM=$(( $(echo "$RUNNING_MINOR" | cut -d. -f2) + 1 ))
+                NEXT_MINOR="1.$NEXT_MINOR_NUM"
+                NEXT_MINOR_AVAILABLE="no"
+                if curl -sIo /dev/null -w '%%{http_code}' \
+                    "https://pkgs.k8s.io/core:/stable:/v$NEXT_MINOR/deb/Release" \
+                    | grep -q '^200$'; then
+                  NEXT_MINOR_AVAILABLE="yes"
+                fi
+                echo "Next minor v$NEXT_MINOR available: $NEXT_MINOR_AVAILABLE"
+
+                # 4. Choose target
+                TARGET=""
+                KIND=""
+                if [ -n "$LATEST_PATCH" ] && [ "$LATEST_PATCH" != "$RUNNING" ]; then
+                  TARGET="$LATEST_PATCH"
+                  KIND="patch"
+                elif [ "$NEXT_MINOR_AVAILABLE" = "yes" ]; then
+                  NEXT_MINOR_PATCH=$($SSH wizard@k8s-master.viktorbarzin.lan \
+                    "curl -sf 'https://pkgs.k8s.io/core:/stable:/v$NEXT_MINOR/deb/Packages' \
+                      | grep -oE 'Version: [0-9.-]+' \
+                      | awk '{print \$2}' | sed 's/-.*//' \
+                      | sort -V | tail -1" || echo "")
+                  if [ -n "$NEXT_MINOR_PATCH" ]; then
+                    TARGET="$NEXT_MINOR_PATCH"
+                    KIND="minor"
+                  fi
+                fi
+
+                # 5. Pushgateway discovery metric
+                PG='http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/k8s-version-check'
+                {
+                  echo "# TYPE k8s_upgrade_available gauge"
+                  if [ -n "$TARGET" ]; then
+                    echo "k8s_upgrade_available{kind=\"$KIND\",running=\"$RUNNING\",target=\"$TARGET\"} 1"
+                  else
+                    echo "k8s_upgrade_available{kind=\"none\",running=\"$RUNNING\",target=\"$RUNNING\"} 0"
+                  fi
+                  echo "# TYPE k8s_version_check_last_run_timestamp gauge"
+                  echo "k8s_version_check_last_run_timestamp $(date +%s)"
+                } | curl -sS --data-binary @- "$PG" || echo "warn: pushgateway push failed"
+
+                # 6. Decide whether to spawn Job 0
+                if [ -z "$TARGET" ]; then
+                  echo "No upgrade needed"
+                  exit 0
+                fi
+
+                slack "K8s upgrade available: v$RUNNING → v$TARGET ($KIND)"
+
+                if [ "$DRY_RUN" = "true" ]; then
+                  slack "DRY_RUN — not spawning preflight Job"
+                  exit 0
+                fi
+
+                # 7. Spawn Job 0 (preflight) by rendering the job-template
+                #    (python expandvars; same placeholder syntax as envsubst).
+                #    Idempotency: deterministic name reconciles via `apply`.
+                JOB_NAME="k8s-upgrade-preflight-$${TARGET//./-}"
+
+                if /usr/local/bin/kubectl -n k8s-upgrade get job "$JOB_NAME" >/dev/null 2>&1; then
+                  slack "Preflight Job $JOB_NAME already exists (rerunning detection mid-flight?)"
+                  exit 0
+                fi
+
+                export JOB_NAME PHASE_NEXT=preflight TARGET_NODE_NEXT="" \
+                       TARGET_VERSION="$TARGET" TARGET_VERSION_LABEL="$${TARGET//./-}" \
+                       KIND="$KIND" IMAGE="$${IMAGE}" \
+                       SCHEDULING_BLOCK=$'      nodeSelector:\n        kubernetes.io/hostname: k8s-node1'
+
+                python3 -c 'import os,sys;sys.stdout.write(os.path.expandvars(sys.stdin.read()))' \
+                  < /template/job-template.yaml \
+                  | /usr/local/bin/kubectl apply -f -
+
+                slack "Spawned $JOB_NAME (target=v$TARGET kind=$KIND)"
+              EOT
+              ]
+              env {
+                name  = "DRY_RUN"
+                value = tostring(var.detection_dry_run)
+              }
+              # Forwarded into the rendered preflight Job as its image.
+              env {
+                name  = "IMAGE"
+                value = local.image
+              }
+              env {
+                name  = "HOME"
+                value = "/tmp"
+              }
+              volume_mount {
+                name       = "creds"
+                mount_path = "/secrets/k8s-upgrade"
+                read_only  = true
+              }
+              volume_mount {
+                name       = "template"
+                mount_path = "/template"
+                read_only  = true
+              }
+              resources {
+                requests = {
+                  cpu    = "50m"
+                  memory = "128Mi"
+                }
+                limits = {
+                  memory = "256Mi"
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  lifecycle {
+    # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
+    ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config]
+  }
+}
+
+# CI retrigger 2026-05-16T13:42:57+00:00 — bulk enrollment apply (pipeline #689 killed)
+# CI retrigger v2 2026-05-16T13:46:35+00:00
+
+# CI retrigger v3 2026-05-16T14:06:39Z
+
+# CI retrigger v4 2026-05-16T14:13:59Z
+
+# CI retrigger v5 2026-05-16T23:10:38Z
+
+# CI retrigger v6 2026-05-16T23:18:58Z
--- a/stacks/k8s-version-upgrade/scripts/upgrade-step.sh
+++ b/stacks/k8s-version-upgrade/scripts/upgrade-step.sh
@ -0,0 +1,564 @@
+#!/usr/bin/env bash
+#
+# Universal upgrade-step body. Each Job in the k8s-version-upgrade chain runs
+# this once, dispatching on $PHASE. On success it computes the next phase and
+# spawns the next Job. The chain is:
+#
+#   preflight  (run on k8s-node1)
+#     ↓
+#   master     (drains k8s-master; run on k8s-node1)
+#     ↓
+#   worker k8s-node4   (run on k8s-node1)
+#     ↓
+#   worker k8s-node3   (run on k8s-node1)
+#     ↓
+#   worker k8s-node2   (run on k8s-node1)
+#     ↓
+#   worker k8s-node1   (drains k8s-node1; run on k8s-master with control-plane toleration)
+#     ↓
+#   postflight (no node pinning)
+#
+# k8s-node1 hosts every Job except the one that drains k8s-node1 itself.
+# k8s-node1 is therefore upgraded LAST.
+#
+# Required env vars (set on the Job pod by job-template.yaml):
+#   PHASE              preflight | master | worker | postflight
+#   TARGET_NODE        k8s-nodeN for worker phases (empty for preflight/master/postflight)
+#   TARGET_VERSION     X.Y.Z
+#   KIND               patch | minor
+#   IMAGE              container image to use for next Job in the chain
+
+# Abort on any failing command, unset variable, or failing pipeline stage.
+set -euo pipefail
+
+# Fixed locations and endpoints used by every phase.
+NS=k8s-upgrade
+SSH_KEY=/secrets/k8s-upgrade/ssh_key
+SLACK_FILE=/secrets/k8s-upgrade/slack_webhook
+PG='http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/k8s-version-upgrade'
+PROM='http://prometheus-server.monitoring.svc.cluster.local:80'
+KUBECTL=kubectl
+JOB_TEMPLATE=/template/job-template.yaml
+UPDATE_K8S_SH=/scripts/update_k8s.sh
+
+# Pod-side DNS: the cluster's CoreDNS has search domains
+# `<ns>.svc.cluster.local svc.cluster.local cluster.local` (plus ndots=2 via
+# Kyverno mutation). Unqualified `k8s-master` falls through all of these and
+# then queries the upstream DNS (Technitium) for bare `k8s-master`, which
+# returns NXDOMAIN. The FQDN `k8s-master.viktorbarzin.lan` is what Technitium
+# actually serves. Suffix every node SSH target with this domain.
+NODE_DOMAIN=".viktorbarzin.lan"
+
+# The Secret is mounted 0444 (see job-template.yaml) so the non-root container
+# can read it, but openssh needs the private key at 0400. Copy it to /tmp with
+# that mode and use the copy from here on.
+install -m 0400 "$SSH_KEY" /tmp/ssh_key
+SSH_KEY=/tmp/ssh_key
+
+# Common ssh options: host keys are accepted on first contact and recorded in
+# a throwaway known_hosts file under /tmp.
+SSH_OPTS=(-i "$SSH_KEY"
+          -o StrictHostKeyChecking=accept-new
+          -o UserKnownHostsFile=/tmp/known_hosts
+          -o ConnectTimeout=10)
+
+SLACK_URL="$(cat "$SLACK_FILE")"
+
+slack() {
+  local msg="$1"
+  curl -sS -X POST -H 'Content-Type: application/json' \
+    --data "$(jq -nc --arg t "[k8s-upgrade-${PHASE}${TARGET_NODE:+:$TARGET_NODE}] $msg" \
+              '{text: $t}')" \
+    "$SLACK_URL" >/dev/null || echo "warn: slack post failed"
+}
+
+# Kill-switch — checked before every phase. If the ConfigMap
+# `k8s-upgrade-killswitch` exists in the `k8s-upgrade` namespace, the chain
+# halts immediately (exit 0, not 1 — this is an intentional pause, not a
+# failure). Resume via `kubectl -n k8s-upgrade delete cm k8s-upgrade-killswitch`.
+# Designed for "stop the storm" scenarios: emergency-press the brake from
+# any kubectl session in <5 seconds, no script changes needed.
+#
+# Create:  kubectl -n k8s-upgrade create configmap k8s-upgrade-killswitch \
+#               --from-literal=reason="why you stopped it"
+# Inspect: kubectl -n k8s-upgrade get cm k8s-upgrade-killswitch -o yaml
+# Resume:  kubectl -n k8s-upgrade delete cm k8s-upgrade-killswitch
+if $KUBECTL -n "$NS" get configmap k8s-upgrade-killswitch >/dev/null 2>&1; then
+  # jsonpath prints an empty string (exit 0) when the `reason` key is absent
+  # — e.g. the ConfigMap was created in a hurry without --from-literal — so
+  # apply the default afterwards; `||` alone only covers kubectl failing.
+  reason=$($KUBECTL -n "$NS" get configmap k8s-upgrade-killswitch \
+    -o jsonpath='{.data.reason}' 2>/dev/null || true)
+  reason="${reason:-(no reason set)}"
+  slack "HALTED by kill-switch (phase=$PHASE target_node=${TARGET_NODE:-none}): $reason"
+  echo "HALTED by k8s-upgrade-killswitch ConfigMap. Reason: $reason"
+  echo "Resume: kubectl -n $NS delete cm k8s-upgrade-killswitch"
+  exit 0
+fi
+
+push() {
+  printf '# TYPE %s gauge\n%s %s\n' "$1" "$1" "$2" \
+    | curl -sS --data-binary @- "$PG" || echo "warn: pushgateway push failed"
+}
+
+halt_on_alert_query() {
+  local extra_ignore="${1:-}"
+  # ALLOWLIST design (refactored 2026-05-23 from a denylist): halt only on
+  # alerts with severity=critical. Any warning/info-level alert is treated
+  # as informational and doesn't block the chain.
+  #
+  # Why this is the right model:
+  #   - The cluster has long-running warning-level alerts that are NOT
+  #     blockers for a k8s patch (e.g. GPU operator crashloop on the GPU
+  #     node, ingress latency spikes, IO-wait warnings).
+  #   - Maintaining a denylist of every "noisy" alert is a losing battle.
+  #   - Critical alerts are the only ones that should actually stop us
+  #     mid-chain (apiserver down, etcd down, node not ready, etc.).
+  #
+  # `extra_ignore` is now mostly historical — kept for backwards compat with
+  # `halt_on_alert_query "RecentNodeReboot|IngressTTFBCritical"`-style calls. With severity-based
+  # filtering, RecentNodeReboot (severity=info) is filtered automatically.
+  # We still build the regex for any critical alert the caller wants to
+  # explicitly ignore (e.g. a known-broken thing we're aware of).
+  local ignore_regex=""
+  [ -n "$extra_ignore" ] && ignore_regex="^($extra_ignore)\$"
+
+  # `grep` returns 1 when nothing matches → under `set -o pipefail` that
+  # bubbles up and aborts the script via the caller's `alerts=$(...)`.
+  # Trailing `|| true` on each grep handles the no-matches case.
+  local critical_firing
+  critical_firing=$(curl -sf "$PROM/api/v1/alerts" \
+    | jq -r '.data.alerts[]
+              | select(.state == "firing" and .labels.severity == "critical")
+              | .labels.alertname' 2>/dev/null \
+    | sort -u || true)
+
+  if [ -n "$ignore_regex" ]; then
+    echo "$critical_firing" | { grep -vE "$ignore_regex" || true; }
+  else
+    echo "$critical_firing"
+  fi
+}
+
+wait_for_node_ready() {
+  local node="$1" want_version="$2" deadline=$(( $(date +%s) + 900 ))  # 15 min
+  while [ "$(date +%s)" -lt "$deadline" ]; do
+    local status kubelet
+    status=$($KUBECTL get node "$node" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)
+    kubelet=$($KUBECTL get node "$node" -o jsonpath='{.status.nodeInfo.kubeletVersion}' 2>/dev/null | tr -d v || true)
+    if [ "$status" = "True" ] && [ "$kubelet" = "$want_version" ]; then
+      return 0
+    fi
+    sleep 15
+  done
+  return 1
+}
+
+# Pre-drain: find pods on $node whose PDB has zero disruptionsAllowed and
+# delete them directly. Drain's eviction API respects PDBs and will loop
+# forever on single-replica deployments with `minAvailable: 1` — common
+# pattern on this cluster (e.g. Anubis instances default to replicas=1). A
+# direct delete bypasses eviction; the parent Deployment recreates the pod
+# elsewhere (the node is already cordoned by drain).
+#
+#   $1  node name whose PDB-blocked Running pods should be deleted
+#
+# Only PDBs that select by `matchLabels` are handled. PDBs with no
+# matchLabels (matchExpressions-only or an empty selector) are skipped:
+# previously they made jq fail on `null | to_entries`, which under
+# `set -euo pipefail` aborted the whole drain. An empty label selector must
+# never reach `kubectl get pods -l`, as it would match every pod in the
+# namespace.
+predrain_unstick() {
+  local node="$1"
+  $KUBECTL get pdb -A -o json | jq -r '
+    .items[]
+    | select(.status.disruptionsAllowed == 0)
+    | .metadata.namespace as $ns
+    | (.spec.selector.matchLabels // {})
+    | to_entries
+    | select(length > 0)
+    | "\($ns) \(map("\(.key)=\(.value)") | join(","))"
+  ' | while read -r ns selector; do
+    # Defensive: never query with an empty selector (see above).
+    [ -z "$selector" ] && continue
+    $KUBECTL -n "$ns" get pods --field-selector "spec.nodeName=$node,status.phase=Running" \
+      -l "$selector" -o name 2>/dev/null \
+      | while read -r pod; do
+          echo "predrain_unstick: deleting PDB-blocked $ns/$pod (drain would loop on it)"
+          # --wait=false: do not block on termination; a failed delete is
+          # tolerated because the next sweep retries.
+          $KUBECTL -n "$ns" delete "$pod" --wait=false || true
+        done
+  done
+}
+
+# Drain wrapper: kick predrain_unstick before drain, then again every 60s in
+# the background while drain runs (in case new pods land mid-drain). Drain
+# exits when the node has no non-daemonset workload.
+#
+#   $1  node to drain
+#
+# If `kubectl drain` fails, `set -e` exits the script and the EXIT trap stops
+# the background watcher.
+drain_node() {
+  local node="$1"
+  predrain_unstick "$node"
+  # `$$` is still the main script's PID inside the subshell, so the watcher
+  # loop also ends by itself once the script is gone.
+  ( while kill -0 $$ 2>/dev/null; do sleep 60; predrain_unstick "$node"; done ) &
+  local watcher=$!
+  # Double quotes on purpose: $watcher is expanded now, while it is in scope.
+  trap "kill $watcher 2>/dev/null || true" EXIT
+  $KUBECTL drain "$node" --ignore-daemonsets --delete-emptydir-data --force --grace-period=300
+  kill $watcher 2>/dev/null || true
+  # NOTE(review): this resets the EXIT trap entirely, which would also drop
+  # any EXIT trap installed elsewhere in the script — confirm none exists.
+  trap - EXIT
+}
+
+# ---------------------------------------------------------------------------
+# Chain definition — what comes after the current phase
+# ---------------------------------------------------------------------------
+
+NEXT_PHASE=""
+NEXT_TARGET_NODE=""
+NEXT_RUN_ON=""
+
+case "${PHASE}:${TARGET_NODE:-}" in
+  preflight:)
+    NEXT_PHASE=master
+    NEXT_RUN_ON=k8s-node1 ;;
+  master:)
+    NEXT_PHASE=worker; NEXT_TARGET_NODE=k8s-node4
+    NEXT_RUN_ON=k8s-node1 ;;
+  worker:k8s-node4)
+    NEXT_PHASE=worker; NEXT_TARGET_NODE=k8s-node3
+    NEXT_RUN_ON=k8s-node1 ;;
+  worker:k8s-node3)
+    NEXT_PHASE=worker; NEXT_TARGET_NODE=k8s-node2
+    NEXT_RUN_ON=k8s-node1 ;;
+  worker:k8s-node2)
+    NEXT_PHASE=worker; NEXT_TARGET_NODE=k8s-node1
+    NEXT_RUN_ON=k8s-master ;;  # control-plane toleration required
+  worker:k8s-node1)
+    NEXT_PHASE=postflight
+    NEXT_RUN_ON="" ;;          # no node pinning for postflight
+  postflight:)
+    NEXT_PHASE="" ;;           # end of chain
+  *)
+    echo "ERROR: unknown phase/target combo: ${PHASE}/${TARGET_NODE:-}" >&2
+    exit 2 ;;
+esac
+
+# Render $JOB_TEMPLATE for the next link of the chain and `kubectl apply` it.
+# Reads NEXT_PHASE / NEXT_TARGET_NODE / NEXT_RUN_ON set by the chain
+# definition above. No-op at the end of the chain, or when a Job with the
+# deterministic name already exists.
+spawn_next() {
+  [ -z "$NEXT_PHASE" ] && { echo "End of chain."; return 0; }
+
+  # Deterministic name per (phase, target_version[, target_node]).
+  local job_name="k8s-upgrade-${NEXT_PHASE}-${TARGET_VERSION//./-}"
+  [ -n "${NEXT_TARGET_NODE:-}" ] && job_name="${job_name}-${NEXT_TARGET_NODE}"
+
+  if $KUBECTL -n "$NS" get job "$job_name" >/dev/null 2>&1; then
+    echo "Next Job $job_name already exists; idempotent skip."
+    return 0
+  fi
+
+  # YAML fragment substituted for ${SCHEDULING_BLOCK} in the template.
+  local scheduling_block=""
+  case "${NEXT_RUN_ON:-}" in
+    k8s-master)
+      scheduling_block=$'      nodeSelector:\n        kubernetes.io/hostname: k8s-master\n      tolerations:\n        - key: node-role.kubernetes.io/control-plane\n          operator: Exists\n          effect: NoSchedule' ;;
+    "")
+      scheduling_block="" ;;
+    *)
+      scheduling_block=$'      nodeSelector:\n        kubernetes.io/hostname: '"$NEXT_RUN_ON" ;;
+  esac
+
+  export JOB_NAME="$job_name"
+  export PHASE_NEXT="$NEXT_PHASE"
+  export TARGET_NODE_NEXT="${NEXT_TARGET_NODE:-}"
+  export TARGET_VERSION_LABEL="${TARGET_VERSION//./-}"
+  export SCHEDULING_BLOCK="$scheduling_block"
+  # TARGET_VERSION, KIND, IMAGE normally arrive via the environment. Export
+  # them explicitly anyway: unlike envsubst, os.path.expandvars leaves
+  # references to variables missing from the environment untouched, so a
+  # shell-only value would put a literal ${VAR} into the manifest.
+  export TARGET_VERSION KIND IMAGE
+
+  echo "Spawning next Job: $job_name (phase=$NEXT_PHASE target=${NEXT_TARGET_NODE:-} run_on=${NEXT_RUN_ON:-anywhere})"
+  # python3 expandvars replaces $VAR / ${VAR} from the environment. It stands
+  # in for envsubst, which the claude-agent-service image lacks (no
+  # gettext-base). Multi-line $SCHEDULING_BLOCK is preserved correctly.
+  python3 -c 'import os,sys;sys.stdout.write(os.path.expandvars(sys.stdin.read()))' \
+    <"$JOB_TEMPLATE" | $KUBECTL apply -f -
+}
+
+# ---------------------------------------------------------------------------
+# Phase bodies
+# ---------------------------------------------------------------------------
+
+phase_preflight() {
+  slack "Starting preflight (target v$TARGET_VERSION, kind=$KIND)"
+
+  # 1. All nodes Ready + no pressure
+  local bad_nodes
+  bad_nodes=$($KUBECTL get nodes -o json | jq -r '
+    .items[]
+    | select(
+        (.status.conditions[] | select(.type=="Ready").status) != "True"
+        or (.status.conditions[] | select(.type=="MemoryPressure").status) == "True"
+        or (.status.conditions[] | select(.type=="DiskPressure").status) == "True")
+    | .metadata.name')
+  if [ -n "$bad_nodes" ]; then
+    slack "ABORT preflight — nodes unhealthy: $bad_nodes"
+    exit 1
+  fi
+
+  # 2. Halt-on-alert. RecentNodeReboot is fully redundant with check 3
+  # (inline quiet-baseline) below — both surface "a node rebooted recently".
+  # Including it here meant the chain refused to start for 1h after EVERY
+  # kured reboot of any node (kured fires whenever /var/run/reboot-required
+  # is set, often daily). Now skipped — check 3 is the single source of truth
+  # for "is the cluster quiet enough to upgrade".
+  local alerts
+  alerts=$(halt_on_alert_query "RecentNodeReboot|IngressTTFBCritical")
+  if [ -n "$alerts" ]; then
+    slack "ABORT preflight — firing alerts:\n$alerts"
+    exit 1
+  fi
+
+  # 3. Quiet-baseline check — fail if any node had a Ready transition in the
+  # last 10 min. Tightened from 3600s → 600s on 2026-05-21 after diagnosing
+  # that the previous 1h window meant the chain couldn't run after any
+  # reboot for an hour. 10min is sufficient for kubelet/control-plane to
+  # stabilise; the kured-sentinel-gate DaemonSet enforces the broader
+  # 24h-between-cluster-reboots invariant.
+  local recent=0
+  while IFS= read -r ts; do
+    [ -z "$ts" ] && continue
+    local diff=$(( $(date +%s) - $(date -d "$ts" +%s) ))
+    if [ "$diff" -lt 600 ]; then recent=1; break; fi
+  done < <($KUBECTL get nodes -o jsonpath='{range .items[*]}{range .status.conditions[?(@.type=="Ready")]}{.lastTransitionTime}{"\n"}{end}{end}')
+  if [ "$recent" -eq 1 ]; then
+    slack "ABORT preflight — node transitioned Ready <10min ago (settle window)"
+    exit 1
+  fi
+
+  # 4. kubeadm upgrade plan matches target
+  local plan_target
+  plan_target=$(ssh "${SSH_OPTS[@]}" "wizard@k8s-master$NODE_DOMAIN" 'sudo kubeadm upgrade plan' \
+    | grep -oE 'kubeadm upgrade apply v[0-9]+\.[0-9]+\.[0-9]+' \
+    | grep -oE 'v[0-9]+\.[0-9]+\.[0-9]+' | head -1 | tr -d v)
+  if [ "$plan_target" != "$TARGET_VERSION" ]; then
+    slack "ABORT preflight — kubeadm plan target $plan_target ≠ requested $TARGET_VERSION"
+    exit 1
+  fi
+
+  # 5. Push in-flight + started_timestamp metrics + ns annotations
+  $KUBECTL annotate ns "$NS" \
+    "viktorbarzin.me/k8s-upgrade-in-flight=$(date -u +%FT%TZ)" \
+    "viktorbarzin.me/k8s-upgrade-target=$TARGET_VERSION" \
+    --overwrite
+  push k8s_upgrade_in_flight 1
+  push k8s_upgrade_started_timestamp "$(date +%s)"
+  push k8s_upgrade_snapshot_taken 0
+
+  # 6. Trigger backup-etcd Job, wait, verify size
+  local snap_job="pre-upgrade-etcd-${TARGET_VERSION//./-}-$(date +%s)"
+  $KUBECTL -n default create job --from=cronjob/backup-etcd "$snap_job"
+  if ! $KUBECTL -n default wait --for=condition=complete --timeout=600s "job/$snap_job"; then
+    $KUBECTL -n default describe "job/$snap_job" | tail -30
+    slack "ABORT preflight — etcd snapshot Job did not complete in 10 min"
+    exit 1
+  fi
+  local snap_log size snap_file
+  snap_log=$($KUBECTL -n default logs "job/$snap_job" -c backup-manage --tail=20 || \
+             $KUBECTL -n default logs "job/$snap_job" --tail=20)
+  size=$(echo "$snap_log" | grep -E '^Backup done:' | grep -oE '\([0-9]+ bytes\)' | grep -oE '[0-9]+' || true)
+  snap_file=$(echo "$snap_log" | grep -E '^Backup done:' | awk '{print $3}' || true)
+  if [ -z "$size" ] || [ "$size" -lt 1024 ]; then
+    slack "ABORT preflight — etcd snapshot empty (size='${size:-unknown}')"
+    exit 1
+  fi
+  $KUBECTL annotate ns "$NS" \
+    "viktorbarzin.me/k8s-upgrade-snapshot-path=nfs://192.168.1.127:/srv/nfs/etcd-backup/$snap_file" \
+    --overwrite
+  push k8s_upgrade_snapshot_taken 1
+
+  # 7. Containerd skew fix on master (if master < workers)
+  local master_ctr worker_max=0.0.0
+  master_ctr=$(ssh "${SSH_OPTS[@]}" "wizard@k8s-master$NODE_DOMAIN" "containerd --version | awk '{print \$3}' | tr -d v")
+  for n in k8s-node1 k8s-node2 k8s-node3 k8s-node4; do
+    local v
+    v=$(ssh "${SSH_OPTS[@]}" "wizard@$n$NODE_DOMAIN" "containerd --version | awk '{print \$3}' | tr -d v")
+    [ "$(printf '%s\n%s' "$v" "$worker_max" | sort -V | tail -1)" = "$v" ] && worker_max="$v"
+  done
+  if [ "$(printf '%s\n%s' "$master_ctr" "$worker_max" | sort -V | head -1)" = "$master_ctr" ] \
+     && [ "$master_ctr" != "$worker_max" ]; then
+    slack "Master containerd $master_ctr < workers $worker_max — bumping"
+    ssh "${SSH_OPTS[@]}" "wizard@k8s-master$NODE_DOMAIN" \
+      "sudo apt-mark unhold containerd.io && sudo apt-get install -y containerd.io='$worker_max-1' \
+       && sudo apt-mark hold containerd.io && sudo systemctl restart containerd"
+    wait_for_node_ready k8s-master "$($KUBECTL get node k8s-master -o jsonpath='{.status.nodeInfo.kubeletVersion}' | tr -d v)" \
+      || { slack "ABORT — k8s-master not Ready after containerd bump"; exit 1; }
+    slack "Master containerd: $master_ctr → $worker_max. Master Ready."
+  fi
+
+  # 8. Apt repo URL rewrite (minor only)
+  if [ "$KIND" = "minor" ]; then
+    local target_minor="${TARGET_VERSION%.*}"
+    for n in k8s-master k8s-node1 k8s-node2 k8s-node3 k8s-node4; do
+      ssh "${SSH_OPTS[@]}" "wizard@$n$NODE_DOMAIN" \
+        "echo 'deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v$target_minor/deb/ /' | sudo tee /etc/apt/sources.list.d/kubernetes.list \
+         && curl -fsSL 'https://pkgs.k8s.io/core:/stable:/v$target_minor/deb/Release.key' \
+              | sudo gpg --dearmor -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg --batch --yes \
+         && sudo apt-get update"
+    done
+    slack "Apt repo rewritten to v$target_minor/deb on all 5 nodes"
+  fi
+
+  slack "Preflight clean. Snapshot at nfs://...$snap_file ($size bytes). Dispatching master Job."
+}
+
+# Scale tigera-operator back to 1 replica. Best-effort and idempotent:
+# failures are ignored so the restore never masks the caller's own outcome.
+_restore_tigera_operator() {
+  echo "Restoring tigera-operator"
+  $KUBECTL -n tigera-operator scale deploy tigera-operator --replicas=1 2>&1 || true
+}
+
+# Master phase: drain k8s-master, run update_k8s.sh on it over SSH, uncordon,
+# then verify node version, control-plane pods and alerts. tigera-operator is
+# scaled to 0 for the duration and restored on completion, on the
+# post-upgrade abort paths, and on the "already upgraded" skip path.
+phase_master() {
+  # Idempotency: skip the whole phase if k8s-master is already on target.
+  # The chain can re-run after a partial failure (e.g. workers got cut
+  # short); without this short-circuit we re-drain and re-kubeadm an
+  # already-upgraded master for no reason. Added 2026-05-23.
+  local current_v
+  current_v=$($KUBECTL get node k8s-master -o jsonpath='{.status.nodeInfo.kubeletVersion}' 2>/dev/null | tr -d v)
+  if [ "$current_v" = "$TARGET_VERSION" ]; then
+    slack "k8s-master already on v$TARGET_VERSION (kubelet=$current_v) — skipping master phase"
+    echo "k8s-master already on v$TARGET_VERSION — skipping"
+    # A previous run may have died after quiescing the operator but before
+    # restoring it; this skip would otherwise leave it at 0 replicas forever.
+    _restore_tigera_operator
+    return 0
+  fi
+
+  slack "Draining k8s-master"
+
+  # Re-check halt-on-alert before drain. Always ignore RecentNodeReboot —
+  # the chain itself causes node reboots, so this alert firing is expected
+  # mid-chain (e.g. master was already upgraded+rebooted before this phase).
+  local alerts
+  alerts=$(halt_on_alert_query "RecentNodeReboot|IngressTTFBCritical")
+  [ -n "$alerts" ] && { slack "ABORT master — alerts firing pre-drain: $alerts"; exit 1; }
+
+  # Quiesce noisy operators that crashloop when apiserver briefly disappears
+  # during the static-pod manifest swaps. The crashloop generates a disk-I/O
+  # storm (~500 MB/s observed from tigera-operator alone) that slows the
+  # apiserver↔kubelet status sync past kubeadm's hardcoded 5-min watch on
+  # `kubernetes.io/config.hash`, causing kubeadm to roll back the upgrade.
+  #
+  # The data plane (calico-node DaemonSet, calico-typha, calico-kube-controllers)
+  # keeps running unchanged — only the OPERATOR (a config reconciler) goes away
+  # briefly. Restored at the end of the phase and on the abort paths below.
+  #
+  # If the chain dies between quiesce and the first check (drain or
+  # update_k8s.sh failing exits the script immediately), restore manually:
+  #   kubectl -n tigera-operator scale deploy tigera-operator --replicas=1
+  #
+  # Long-term fix: HA control plane (3 masters) so apiserver never goes down
+  # — see docs/plans/2026-05-21-ha-control-plane-{design,plan}.md (beads code-n0ow).
+  echo "Quiescing tigera-operator before master upgrade (it crashes on apiserver outage)"
+  $KUBECTL -n tigera-operator scale deploy tigera-operator --replicas=0 2>&1 || true
+
+  drain_node k8s-master
+
+  slack "Running update_k8s.sh on k8s-master (--role master --release $TARGET_VERSION)"
+  # The local script is streamed to the remote `bash -s` on stdin; arguments
+  # after `--` are passed to it.
+  ssh "${SSH_OPTS[@]}" "wizard@k8s-master$NODE_DOMAIN" 'bash -s' \
+    < "$UPDATE_K8S_SH" -- --role master --release "$TARGET_VERSION"
+
+  $KUBECTL uncordon k8s-master
+
+  wait_for_node_ready k8s-master "$TARGET_VERSION" \
+    || { _restore_tigera_operator; slack "ABORT — k8s-master not Ready or wrong version after upgrade"; exit 1; }
+
+  local not_ready
+  # `grep -v Running` returns 1 when all pods are Running (happy path);
+  # under `set -o pipefail` that aborts the script. Wrap in `|| true`.
+  not_ready=$($KUBECTL -n kube-system get pods -l 'tier=control-plane' --no-headers 2>/dev/null \
+    | { grep -v Running || true; } | wc -l)
+  if [ "$not_ready" -gt 0 ]; then
+    _restore_tigera_operator
+    slack "ABORT — $not_ready control-plane pods not Running after master upgrade"
+    exit 1
+  fi
+
+  alerts=$(halt_on_alert_query "RecentNodeReboot|IngressTTFBCritical")
+  [ -n "$alerts" ] && { _restore_tigera_operator; slack "ABORT master — alerts firing post-upgrade: $alerts"; exit 1; }
+
+  # Restore tigera-operator (quiesced before drain). It reconciles in seconds.
+  _restore_tigera_operator
+
+  slack "Master on v$TARGET_VERSION, control-plane Running. Dispatching worker chain."
+}
+
+phase_worker() {
+  [ -z "$TARGET_NODE" ] && { echo "ERROR: worker phase requires TARGET_NODE"; exit 2; }
+
+  # Idempotency: skip if target node is already on target version. Same
+  # rationale as phase_master — chains re-running after partial completion
+  # shouldn't re-drain an already-upgraded worker. Added 2026-05-23.
+  local current_v
+  current_v=$($KUBECTL get node "$TARGET_NODE" -o jsonpath='{.status.nodeInfo.kubeletVersion}' 2>/dev/null | tr -d v)
+  if [ "$current_v" = "$TARGET_VERSION" ]; then
+    slack "$TARGET_NODE already on v$TARGET_VERSION (kubelet=$current_v) — skipping worker phase"
+    echo "$TARGET_NODE already on v$TARGET_VERSION — skipping"
+    return 0
+  fi
+
+  slack "Draining $TARGET_NODE"
+
+  # Halt-on-alert wait (up to 30 min). Ignore RecentNodeReboot — the chain
+  # just rebooted a node, that's the cause and is expected.
+  local attempt alerts
+  for attempt in $(seq 1 30); do
+    alerts=$(halt_on_alert_query "RecentNodeReboot|IngressTTFBCritical")
+    [ -z "$alerts" ] && break
+    echo "Waiting for alerts to clear (attempt $attempt/30): $alerts"
+    sleep 60
+  done
+  [ -n "$alerts" ] && { slack "ABORT $TARGET_NODE — alerts firing after 30min: $alerts"; exit 1; }
+
+  drain_node "$TARGET_NODE"
+
+  slack "Running update_k8s.sh on $TARGET_NODE (--role worker --release $TARGET_VERSION)"
+  ssh "${SSH_OPTS[@]}" "wizard@$TARGET_NODE$NODE_DOMAIN" 'bash -s' \
+    < "$UPDATE_K8S_SH" -- --role worker --release "$TARGET_VERSION"
+
+  $KUBECTL uncordon "$TARGET_NODE"
+
+  wait_for_node_ready "$TARGET_NODE" "$TARGET_VERSION" \
+    || { slack "ABORT — $TARGET_NODE not Ready or wrong version"; exit 1; }
+
+  # Daemonsets back on the node
+  local missing=0
+  for ds in calico-node kube-proxy; do
+    local count
+    count=$($KUBECTL get pods -A -o wide --field-selector "spec.nodeName=$TARGET_NODE,status.phase=Running" --no-headers \
+      | awk -v d="$ds" '$2 ~ d {n++} END{print n+0}')
+    [ "$count" -lt 1 ] && missing=$((missing+1))
+  done
+  [ "$missing" -gt 0 ] && { slack "WARN $TARGET_NODE — $missing daemonset(s) missing"; }
+
+  # 10-min soak with halt-on-alert (RecentNodeReboot ignored — we know we restarted it)
+  echo "Soaking $TARGET_NODE for 10 min..."
+  for i in $(seq 1 10); do
+    alerts=$(halt_on_alert_query "RecentNodeReboot|IngressTTFBCritical")
+    [ -n "$alerts" ] && { slack "ABORT $TARGET_NODE mid-soak — alerts: $alerts"; exit 1; }
+    sleep 60
+  done
+
+  slack "$TARGET_NODE on v$TARGET_VERSION. Soaked clean (10 min)."
+}
+
+phase_postflight() {
+  slack "Running postflight"
+
+  # All 5 nodes at target
+  local versions wrong
+  versions=$($KUBECTL get nodes -o jsonpath='{range .items[*]}{.metadata.name}:{.status.nodeInfo.kubeletVersion}{"\n"}{end}')
+  # `grep -v` returns 1 when all nodes are on target (the happy path —
+  # exactly when postflight SHOULD succeed); under `set -o pipefail` that
+  # would abort the script right at the moment of victory.
+  wrong=$(echo "$versions" | { grep -v ":v${TARGET_VERSION}\$" || true; } | wc -l)
+  if [ "$wrong" -ne 0 ]; then
+    slack "ABORT postflight — $wrong node(s) off target:\n$versions"
+    exit 1
+  fi
+
+  # No alerts firing. Ignore RecentNodeReboot — by definition we just
+  # rebooted every node; this alert clears naturally in <1h.
+  local alerts
+  alerts=$(halt_on_alert_query "RecentNodeReboot|IngressTTFBCritical")
+  [ -n "$alerts" ] && slack "Postflight WARN — alerts still firing (cluster on target, please check):\n$alerts"
+
+  # Pod-ready ratio
+  local ratio
+  ratio=$(curl -sf "$PROM/api/v1/query" \
+            --data-urlencode 'query=sum(kube_pod_status_ready{condition="true"}) / sum(kube_pod_status_phase{phase="Running"})' \
+          | jq -r '.data.result[0].value[1] // "0"')
+
+  # Clear annotations + gauges
+  $KUBECTL annotate ns "$NS" \
+    'viktorbarzin.me/k8s-upgrade-in-flight-' \
+    'viktorbarzin.me/k8s-upgrade-target-' \
+    'viktorbarzin.me/k8s-upgrade-snapshot-path-' || true
+  push k8s_upgrade_in_flight 0
+  push k8s_upgrade_snapshot_taken 0
+  push k8s_upgrade_started_timestamp 0
+
+  slack ":white_check_mark: K8s upgrade complete: cluster on v$TARGET_VERSION (pod-ready ratio $ratio)"
+}
+
+# ---------------------------------------------------------------------------
+# Dispatch
+# ---------------------------------------------------------------------------
+
+case "$PHASE" in
+  preflight)  phase_preflight ;;
+  master)     phase_master ;;
+  worker)     phase_worker ;;
+  postflight) phase_postflight ;;
+  *) echo "ERROR: unknown PHASE: $PHASE" >&2; exit 2 ;;
+esac
+
+spawn_next
--- a/stacks/k8s-version-upgrade/terragrunt.hcl
+++ b/stacks/k8s-version-upgrade/terragrunt.hcl
@ -0,0 +1,23 @@
+# Pull in the shared configuration from the Terragrunt config located by
+# find_in_parent_folders().
+include "root" {
+  path = find_in_parent_folders()
+}
+
+# ExternalSecret hits ESO which needs to be alive when the manifest applies.
+# skip_outputs = true: no outputs of that stack are read here; the block
+# only declares the dependency.
+dependency "external_secrets" {
+  config_path  = "../external-secrets"
+  skip_outputs = true
+}
+
+# Upgrade Gates rules (incl. K8sVersionSkew + EtcdPreUpgradeSnapshotMissing)
+# live in the monitoring stack — make the relationship visible so reapplies
+# don't race the alerts being available.
+# As above, no outputs are consumed (skip_outputs = true).
+dependency "monitoring" {
+  config_path  = "../monitoring"
+  skip_outputs = true
+}
+
+# Note: stacks/claude-agent-service has no terragrunt.hcl yet (manual apply
+# pattern) — its ServiceAccount + Namespace are referenced by name from this
+# stack's RoleBindings, which is fine because RoleBindings allow forward
+# references. Apply order: claude-agent-service first (or already deployed),
+# then this stack.