diff --git a/scripts/update_k8s.sh b/scripts/update_k8s.sh index f98ea30a..6e01d654 100755 --- a/scripts/update_k8s.sh +++ b/scripts/update_k8s.sh @@ -89,17 +89,26 @@ if [[ "$ROLE" == "master" ]]; then # sync latency post-master-reboot can exceed it). The etcd image IS # actually updated by then, so a 2nd attempt sees etcd already on # target and skips it. Up to 3 attempts with a 30s delay between. + # First attempt: full kubeadm upgrade (incl. etcd). On the static-pod- + # hash 5min-timeout failure, retry with --etcd-upgrade=false. The + # timeout happens reliably for patch upgrades where etcd's image + # doesn't change (kubeadm writes identical manifest → hash doesn't + # change → kubeadm waits out the full 5 min for a change that never comes). + # Skipping the etcd phase on retry is safe IF etcd is already on the + # right version (which is the only case where this timeout fires). attempt=1 - while ! sudo kubeadm upgrade apply "v$RELEASE" -y; do + extra_flags="" + while ! sudo kubeadm upgrade apply "v$RELEASE" -y $extra_flags; do if (( attempt >= 3 )); then echo "ERROR: kubeadm upgrade apply failed after 3 attempts" >&2 exit 1 fi - echo "==> kubeadm apply attempt $attempt failed (likely static-pod-hash 5m timeout). Sleeping 30s then retrying — the previous attempt's manifest writes usually take hold on the 2nd try." + echo "==> kubeadm apply attempt $attempt failed. Retrying with --etcd-upgrade=false (etcd image is unchanged for patch upgrades; kubeadm's static-pod-hash watch is the only thing failing)." 
+ extra_flags="--etcd-upgrade=false" sleep 30 attempt=$(( attempt + 1 )) done - echo "==> kubeadm upgrade apply succeeded on attempt $attempt" + echo "==> kubeadm upgrade apply succeeded on attempt $attempt (flags: '$extra_flags')" else echo "==> Worker path: kubeadm upgrade node" sudo kubeadm upgrade node diff --git a/stacks/k8s-version-upgrade/main.tf b/stacks/k8s-version-upgrade/main.tf index 530db98e..373cc2bc 100644 --- a/stacks/k8s-version-upgrade/main.tf +++ b/stacks/k8s-version-upgrade/main.tf @@ -172,11 +172,22 @@ resource "kubernetes_cluster_role" "k8s_upgrade_job" { # --ignore-daemonsets` can classify each pod's owner. Without daemonsets # GET permission, drain bails with "cannot delete daemonsets ... is # forbidden" for every daemonset-managed pod on the node. (2026-05-20) + # + # `patch`/`update` on deployments and deployments/scale added 2026-05-23: phase_master scales tigera-operator + # to 0 before drain (operator crashloops during apiserver static-pod swaps, + # generating an I/O storm that breaks kubeadm's 5-min watch) and back to 1 + # after master is upgraded. Until HA control plane lands (beads code-n0ow), + # this is how we keep autonomous upgrades unblocked. rule { api_groups = ["apps"] resources = ["daemonsets", "statefulsets", "replicasets", "deployments"] verbs = ["get", "list"] } + rule { + api_groups = ["apps"] + resources = ["deployments", "deployments/scale"] + verbs = ["patch", "update"] + } # Chain dispatch — create the next Job; reconcile via apply on retry. # In `default` ns to also create the etcd-snapshot Job from cronjob/backup-etcd. 
rule { diff --git a/stacks/k8s-version-upgrade/scripts/upgrade-step.sh b/stacks/k8s-version-upgrade/scripts/upgrade-step.sh index 74c93bc7..267cb2c0 100644 --- a/stacks/k8s-version-upgrade/scripts/upgrade-step.sh +++ b/stacks/k8s-version-upgrade/scripts/upgrade-step.sh @@ -94,7 +94,21 @@ push() { halt_on_alert_query() { local extra_ignore="${1:-}" - local regex='^(Watchdog|RebootRequired|KuredNodeWasNotDrained|InfoInhibitor' + # Always-ignored alerts — present in steady-state OR are themselves caused + # by what the chain does, so they should never halt a chain phase: + # Watchdog — Prometheus meta-alert, always firing + # RebootRequired — long-running info, not actionable mid-chain + # KuredNodeWasNotDrained — kured info-level, doesn't block upgrade + # InfoInhibitor — used to inhibit other alerts, always present + # IngressTTFBHigh — Traefik latency. Symptoms-not-causes; upgrades + # routinely spike latency briefly. Halting on + # this would prevent the chain from running in + # any moderately busy cluster. (2026-05-23) + # NodeHighIOWait — chicken-and-egg with our own upgrade I/O. The + # inline quiet-baseline check (Ready transition + # <10min) is the real cluster-churn gate; iowait + # is too noisy to be a hard gate. (2026-05-23) + local regex='^(Watchdog|RebootRequired|KuredNodeWasNotDrained|InfoInhibitor|IngressTTFBHigh|NodeHighIOWait' [ -n "$extra_ignore" ] && regex="$regex|$extra_ignore" regex="$regex)$" @@ -366,6 +380,25 @@ phase_master() { alerts=$(halt_on_alert_query RecentNodeReboot) [ -n "$alerts" ] && { slack "ABORT master — alerts firing pre-drain: $alerts"; exit 1; } + # Quiesce noisy operators that crashloop when apiserver briefly disappears + # during the static-pod manifest swaps. The crashloop generates a disk-I/O + # storm (~500 MB/s observed from tigera-operator alone) that slows the + # apiserver↔kubelet status sync past kubeadm's hardcoded 5-min watch on + # `kubernetes.io/config.hash`, causing kubeadm to roll back the upgrade. 
+ # + # The data plane (calico-node DaemonSet, calico-typha, calico-kube-controllers) + # keeps running unchanged — only the OPERATOR (a config reconciler) goes away + # briefly. Restored at the end of the phase below. + # + # If the chain dies between quiesce and restore (e.g. kubeadm fails), + # manually restore with: + # kubectl -n tigera-operator scale deploy tigera-operator --replicas=1 + # + # Long-term fix: HA control plane (3 masters) so apiserver never goes down + # — see docs/plans/2026-05-21-ha-control-plane-{design,plan}.md (beads code-n0ow). + echo "Quiescing tigera-operator before master upgrade (it crashes on apiserver outage)" + $KUBECTL -n tigera-operator scale deploy tigera-operator --replicas=0 2>&1 || true + drain_node k8s-master slack "Running update_k8s.sh on k8s-master (--role master --release $TARGET_VERSION)" @@ -390,6 +423,10 @@ phase_master() { alerts=$(halt_on_alert_query RecentNodeReboot) [ -n "$alerts" ] && { slack "ABORT master — alerts firing post-upgrade: $alerts"; exit 1; } + # Restore tigera-operator (quiesced before drain). It reconciles in seconds. + echo "Restoring tigera-operator" + $KUBECTL -n tigera-operator scale deploy tigera-operator --replicas=1 2>&1 || true + slack "Master on v$TARGET_VERSION, control-plane Running. Dispatching worker chain." }