From 0c8b46df55e06343c597c8a23866ad708c4df0de Mon Sep 17 00:00:00 2001
From: Viktor Barzin
Date: Wed, 20 May 2026 20:59:10 +0000
Subject: [PATCH] k8s-version-upgrade: fix two more grep-pipefail bugs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Same `grep -v` / `set -o pipefail` interaction as commit 10b261d2, in
two more callsites the previous fix didn't cover:

  Line 354 (phase_master): control-plane Running check —
  `grep -v Running | wc -l` returns 1 when all pods are Running (the
  happy path), aborting the chain right after master upgrades.

  Line 419 (phase_postflight): on-target node check —
  `grep -v ":v$TARGET_VERSION$" | wc -l` returns 1 when all nodes are
  on the target version (the happy path, exactly when postflight should
  succeed). Aborts at the moment of victory.

Forensics on yesterday's master Job failure (see commit message of
10b261d2 for context): the master Job spawned 16s after the previous
fix's TF apply, before configmap propagation completed on the kubelet.
With those two latent bugs also looming, the chain would have died
post-master-upgrade and again at postflight even if propagation had
been timely.

Wrapping each grep in `{ ... || true; }` so a no-matches result returns
success.

Co-Authored-By: Claude Opus 4.7
---
 stacks/k8s-version-upgrade/scripts/upgrade-step.sh | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/stacks/k8s-version-upgrade/scripts/upgrade-step.sh b/stacks/k8s-version-upgrade/scripts/upgrade-step.sh
index 62c9bb75..29031b5d 100644
--- a/stacks/k8s-version-upgrade/scripts/upgrade-step.sh
+++ b/stacks/k8s-version-upgrade/scripts/upgrade-step.sh
@@ -351,8 +351,10 @@ phase_master() {
     || { slack "ABORT — k8s-master not Ready or wrong version after upgrade"; exit 1; }
 
   local not_ready
+  # `grep -v Running` returns 1 when all pods are Running (happy path);
+  # under `set -o pipefail` that aborts the script. Wrap in `|| true`.
   not_ready=$($KUBECTL -n kube-system get pods -l 'tier=control-plane' --no-headers 2>/dev/null \
-    | grep -v Running | wc -l)
+    | { grep -v Running || true; } | wc -l)
   if [ "$not_ready" -gt 0 ]; then
     slack "ABORT — $not_ready control-plane pods not Running after master upgrade"
     exit 1
@@ -416,7 +418,10 @@ phase_postflight() {
   # All 5 nodes at target
   local versions wrong
   versions=$($KUBECTL get nodes -o jsonpath='{range .items[*]}{.metadata.name}:{.status.nodeInfo.kubeletVersion}{"\n"}{end}')
-  wrong=$(echo "$versions" | grep -v ":v${TARGET_VERSION}\$" | wc -l)
+  # `grep -v` returns 1 when all nodes are on target (the happy path —
+  # exactly when postflight SHOULD succeed); under `set -o pipefail` that
+  # would abort the script right at the moment of victory.
+  wrong=$(echo "$versions" | { grep -v ":v${TARGET_VERSION}\$" || true; } | wc -l)
   if [ "$wrong" -ne 0 ]; then
     slack "ABORT postflight — $wrong node(s) off target:\n$versions"
     exit 1