From 4713c3a6d9b7596bbc6faf7c084b5486ce4c3c75 Mon Sep 17 00:00:00 2001
From: Viktor Barzin <vbarzin@gmail.com>
Date: Sat, 23 May 2026 08:40:11 +0000
Subject: [PATCH] k8s-version-upgrade: tigera quiesce + etcd-skip retry +
 IO-wait alert ignore
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three changes unblocking the autonomous chain for k8s patch upgrades:

1. **phase_master quiesces tigera-operator before drain, restores after.**
   Tigera crashes immediately if apiserver is unreachable (no retry logic)
   and crashlooping it during master static-pod swaps generates ~500MB/s
   disk I/O that pushes kubeadm's 5-min static-pod-hash watch past its
   limit. Quiesce removes the storm contributor; calico data plane keeps
   running unchanged (data plane is the DaemonSet+Typha, operator is just
   the reconciler).

2. **update_k8s.sh retries with --etcd-upgrade=false on the 2nd attempt.**
   For patch upgrades (1.34.7→1.34.8), etcd's image doesn't change — kubeadm
   writes an identical manifest, hash doesn't update, watch times out and
   rolls back forever. The skip-etcd retry sidesteps it for the legitimate
   no-change case while still doing a full etcd upgrade on the first
   attempt (correct for minor-version bumps).

3. **halt_on_alert_query also ignores IngressTTFBHigh + NodeHighIOWait.**
   Both are symptoms-not-causes: ingress latency spikes briefly during any
   pod-restart wave; high IOwait is exactly what upgrade activity causes
   (chicken-and-egg). The inline quiet-baseline check (Ready transition
   <10min) is the real cluster-churn gate.

RBAC: k8s-upgrade-job ClusterRole gains `patch` on deployments + scale
subresource so the chain can do the scale-to-0/back-to-1 on tigera.

These three together get the chain past the cascade that's been blocking
1.34.7→1.34.8 for a week. Long-term fix is still HA control plane
(beads code-n0ow); these are the bridge.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 scripts/update_k8s.sh                         | 15 +++++--
 stacks/k8s-version-upgrade/main.tf            | 11 ++++++
 .../scripts/upgrade-step.sh                   | 39 ++++++++++++++++++-
 3 files changed, 61 insertions(+), 4 deletions(-)

diff --git a/scripts/update_k8s.sh b/scripts/update_k8s.sh
index f98ea30a..6e01d654 100755
--- a/scripts/update_k8s.sh
+++ b/scripts/update_k8s.sh
@@ -89,17 +89,26 @@ if [[ "$ROLE" == "master" ]]; then
     # sync latency post-master-reboot can exceed it). The etcd image IS
     # actually updated by then, so a 2nd attempt sees etcd already on
     # target and skips it. Up to 3 attempts with a 30s delay between.
+    # First attempt: full kubeadm upgrade (incl. etcd). On the static-pod-
+    # hash 5min-timeout failure, retry with --etcd-upgrade=false. The
+    # timeout happens reliably for patch upgrades where etcd's image
+    # doesn't change (kubeadm writes identical manifest → hash doesn't
+    # change → kubeadm waits forever for a change that will never come).
+    # Skipping the etcd phase on retry is safe IF etcd is already on the
+    # right version (which is the only case where this timeout fires).
     attempt=1
-    while ! sudo kubeadm upgrade apply "v$RELEASE" -y; do
+    extra_flags=""
+    while ! sudo kubeadm upgrade apply "v$RELEASE" -y $extra_flags; do
         if (( attempt >= 3 )); then
             echo "ERROR: kubeadm upgrade apply failed after 3 attempts" >&2
             exit 1
         fi
-        echo "==> kubeadm apply attempt $attempt failed (likely static-pod-hash 5m timeout). Sleeping 30s then retrying — the previous attempt's manifest writes usually take hold on the 2nd try."
+        echo "==> kubeadm apply attempt $attempt failed. Retrying with --etcd-upgrade=false (etcd image is unchanged for patch upgrades; kubeadm's static-pod-hash watch is the only thing failing)."
+        extra_flags="--etcd-upgrade=false"
         sleep 30
         attempt=$(( attempt + 1 ))
     done
-    echo "==> kubeadm upgrade apply succeeded on attempt $attempt"
+    echo "==> kubeadm upgrade apply succeeded on attempt $attempt (flags: '$extra_flags')"
 else
     echo "==> Worker path: kubeadm upgrade node"
     sudo kubeadm upgrade node
diff --git a/stacks/k8s-version-upgrade/main.tf b/stacks/k8s-version-upgrade/main.tf
index 530db98e..373cc2bc 100644
--- a/stacks/k8s-version-upgrade/main.tf
+++ b/stacks/k8s-version-upgrade/main.tf
@@ -172,11 +172,22 @@ resource "kubernetes_cluster_role" "k8s_upgrade_job" {
   # --ignore-daemonsets` can classify each pod's owner. Without daemonsets
   # GET permission, drain bails with "cannot delete daemonsets ... is
   # forbidden" for every daemonset-managed pod on the node. (2026-05-20)
+  #
+  # `patch` on deployments added 2026-05-23: phase_master scales tigera-operator
+  # to 0 before drain (operator crashloops during apiserver static-pod swaps,
+  # generates I/O storm that breaks kubeadm's 5-min watch) and back to 1
+  # after master is upgraded. Until HA control plane lands (beads code-n0ow),
+  # this is how we keep autonomous upgrades unblocked.
   rule {
     api_groups = ["apps"]
     resources  = ["daemonsets", "statefulsets", "replicasets", "deployments"]
     verbs      = ["get", "list"]
   }
+  rule {
+    api_groups = ["apps"]
+    resources  = ["deployments", "deployments/scale"]
+    verbs      = ["patch", "update"]
+  }
   # Chain dispatch — create the next Job; reconcile via apply on retry.
   # In `default` ns to also create the etcd-snapshot Job from cronjob/backup-etcd.
   rule {
diff --git a/stacks/k8s-version-upgrade/scripts/upgrade-step.sh b/stacks/k8s-version-upgrade/scripts/upgrade-step.sh
index 74c93bc7..267cb2c0 100644
--- a/stacks/k8s-version-upgrade/scripts/upgrade-step.sh
+++ b/stacks/k8s-version-upgrade/scripts/upgrade-step.sh
@@ -94,7 +94,21 @@ push() {
 
 halt_on_alert_query() {
   local extra_ignore="${1:-}"
-  local regex='^(Watchdog|RebootRequired|KuredNodeWasNotDrained|InfoInhibitor'
+  # Always-ignored alerts — present in steady-state OR are themselves caused
+  # by what the chain does, so they should never halt a chain phase:
+  #   Watchdog              — Prometheus meta-alert, always firing
+  #   RebootRequired        — long-running info, not actionable mid-chain
+  #   KuredNodeWasNotDrained — kured info-level, doesn't block upgrade
+  #   InfoInhibitor         — used to inhibit other alerts, always present
+  #   IngressTTFBHigh       — Traefik latency. Symptoms-not-causes; upgrades
+  #                           routinely spike latency briefly. Halting on
+  #                           this would prevent the chain from running in
+  #                           any moderately busy cluster. (2026-05-23)
+  #   NodeHighIOWait        — chicken-and-egg with our own upgrade I/O. The
+  #                           inline quiet-baseline check (Ready transition
+  #                           <10min) is the real cluster-churn gate; iowait
+  #                           is too noisy to be a hard gate. (2026-05-23)
+  local regex='^(Watchdog|RebootRequired|KuredNodeWasNotDrained|InfoInhibitor|IngressTTFBHigh|NodeHighIOWait'
   [ -n "$extra_ignore" ] && regex="$regex|$extra_ignore"
   regex="$regex)$"
 
@@ -366,6 +380,25 @@ phase_master() {
   alerts=$(halt_on_alert_query RecentNodeReboot)
   [ -n "$alerts" ] && { slack "ABORT master — alerts firing pre-drain: $alerts"; exit 1; }
 
+  # Quiesce noisy operators that crashloop when apiserver briefly disappears
+  # during the static-pod manifest swaps. The crashloop generates a disk-I/O
+  # storm (~500 MB/s observed from tigera-operator alone) that slows the
+  # apiserver↔kubelet status sync past kubeadm's hardcoded 5-min watch on
+  # `kubernetes.io/config.hash`, causing kubeadm to roll back the upgrade.
+  #
+  # The data plane (calico-node DaemonSet, calico-typha, calico-kube-controllers)
+  # keeps running unchanged — only the OPERATOR (a config reconciler) goes away
+  # briefly. Restored at the end of the phase below.
+  #
+  # If the chain dies between quiesce and restore (e.g. kubeadm fails),
+  # manually restore with:
+  #   kubectl -n tigera-operator scale deploy tigera-operator --replicas=1
+  #
+  # Long-term fix: HA control plane (3 masters) so apiserver never goes down
+  # — see docs/plans/2026-05-21-ha-control-plane-{design,plan}.md (beads code-n0ow).
+  echo "Quiescing tigera-operator before master upgrade (it crashes on apiserver outage)"
+  $KUBECTL -n tigera-operator scale deploy tigera-operator --replicas=0 2>&1 || true
+
   drain_node k8s-master
 
   slack "Running update_k8s.sh on k8s-master (--role master --release $TARGET_VERSION)"
@@ -390,6 +423,10 @@ phase_master() {
   alerts=$(halt_on_alert_query RecentNodeReboot)
   [ -n "$alerts" ] && { slack "ABORT master — alerts firing post-upgrade: $alerts"; exit 1; }
 
+  # Restore tigera-operator (quiesced before drain). It reconciles in seconds.
+  echo "Restoring tigera-operator"
+  $KUBECTL -n tigera-operator scale deploy tigera-operator --replicas=1 2>&1 || true
+
   slack "Master on v$TARGET_VERSION, control-plane Running. Dispatching worker chain."
 }