From 02ea5da8dc408938b20746048b01fd36bc524a15 Mon Sep 17 00:00:00 2001
From: Viktor Barzin <vbarzin@gmail.com>
Date: Sat, 23 May 2026 09:53:57 +0000
Subject: [PATCH] k8s-version-upgrade: skip phase_master/phase_worker if node
 already on target
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The chain wasn't idempotent — re-running on a partially-upgraded cluster
would drain an already-upgraded node again and re-run kubeadm and apt on
it, causing needless disruption (5-10 min per no-op node) and risking
alert re-fires during the redundant drain.

Today's chain hit this twice: after fixing the version-detection bug
(commit a0f3e155), the chain correctly resumed but re-did master AND
node4 even though both were already on v1.34.8. node4 got cordoned,
drained, and is now soaking for 10 min for no reason.

Fix: at the top of phase_master and phase_worker, read the node's
current kubelet version (with the "v" stripped, since TARGET_VERSION
carries no "v" prefix). If it equals TARGET_VERSION, skip the whole
phase (return 0 — spawn_next will fire downstream). The chain advances
without disturbing the already-upgraded node.

In-flight effect: the current node4 worker pod has the old script
mounted from configmap snapshot, so it'll continue. If it fails and
retries, the new pod will see node4 on v1.34.8 and short-circuit.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../scripts/upgrade-step.sh                   | 24 +++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/stacks/k8s-version-upgrade/scripts/upgrade-step.sh b/stacks/k8s-version-upgrade/scripts/upgrade-step.sh
index 13959bc7..95783d26 100644
--- a/stacks/k8s-version-upgrade/scripts/upgrade-step.sh
+++ b/stacks/k8s-version-upgrade/scripts/upgrade-step.sh
@@ -380,6 +380,18 @@ phase_preflight() {
 }
 
 phase_master() {
+  # Idempotency: skip the whole phase if k8s-master is already on target.
+  # The chain can re-run after a partial failure (e.g. workers got cut
+  # short); without this short-circuit we re-drain and re-kubeadm an
+  # already-upgraded master for no reason. Added 2026-05-23.
+  local current_v
+  current_v=$($KUBECTL get node k8s-master -o jsonpath='{.status.nodeInfo.kubeletVersion}' 2>/dev/null | tr -d v)
+  if [ "$current_v" = "$TARGET_VERSION" ]; then
+    slack "k8s-master already on v$TARGET_VERSION (kubelet=$current_v) — skipping master phase"
+    echo "k8s-master already on v$TARGET_VERSION — skipping"
+    return 0
+  fi
+
   slack "Draining k8s-master"
 
   # Re-check halt-on-alert before drain. Always ignore RecentNodeReboot —
@@ -441,6 +453,18 @@ phase_master() {
 
 phase_worker() {
   [ -z "$TARGET_NODE" ] && { echo "ERROR: worker phase requires TARGET_NODE"; exit 2; }
+
+  # Idempotency: skip if target node is already on target version. Same
+  # rationale as phase_master — chains re-running after partial completion
+  # shouldn't re-drain an already-upgraded worker. Added 2026-05-23.
+  local current_v
+  current_v=$($KUBECTL get node "$TARGET_NODE" -o jsonpath='{.status.nodeInfo.kubeletVersion}' 2>/dev/null | tr -d v)
+  if [ "$current_v" = "$TARGET_VERSION" ]; then
+    slack "$TARGET_NODE already on v$TARGET_VERSION (kubelet=$current_v) — skipping worker phase"
+    echo "$TARGET_NODE already on v$TARGET_VERSION — skipping"
+    return 0
+  fi
+
   slack "Draining $TARGET_NODE"
 
   # Halt-on-alert wait (up to 30 min). Ignore RecentNodeReboot — the chain