From 83fc15c22b2f8ef850f6d0181a5963a3aa01cf16 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Tue, 19 May 2026 22:19:06 +0000 Subject: [PATCH] k8s-version-upgrade: fix pipefail abort when no alerts are firing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit halt_on_alert_query() ends with `grep -vE "$regex" | sort -u`. When no non-ignored alerts are firing (the desired healthy state: either nothing is firing at all, or every firing alert matches the ignore regex), `grep -v` selects no lines and exits 1. Under `set -o pipefail`, the whole pipeline returns 1; under `set -e`, the caller's `alerts=$(...)` assignment fails and aborts the script in ~1s with no diagnostic output. The chain effectively required at least one non-meta alert to be firing to make any forward progress. Today (2026-05-19) the cluster is fully clean post-MySQL recovery, the daily 12:00 UTC detection spawned the preflight Job, and it died instantly — blocking the 1.34.7 → 1.34.8 patch chain. Fix: wrap the grep in `{ ... || true; }` so that an empty result (grep exit status 1) is treated as success. Preflight verified end-to-end after the fix — the chain is now in flight (preflight ✓, master phase running). Co-Authored-By: Claude Opus 4.7 --- stacks/k8s-version-upgrade/scripts/upgrade-step.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/stacks/k8s-version-upgrade/scripts/upgrade-step.sh b/stacks/k8s-version-upgrade/scripts/upgrade-step.sh index e0b71bba..62c9bb75 100644 --- a/stacks/k8s-version-upgrade/scripts/upgrade-step.sh +++ b/stacks/k8s-version-upgrade/scripts/upgrade-step.sh @@ -78,9 +78,14 @@ halt_on_alert_query() { [ -n "$extra_ignore" ] && regex="$regex|$extra_ignore" regex="$regex)$" + # `grep -vE` returns 1 when nothing matches, which under `set -o pipefail` + # bubbles up and (via the caller's `alerts=$(...)`) aborts the whole script. + # Trailing `|| true` keeps a no-alerts-firing cluster from looking like a + # script error. 
Discovered 2026-05-19 when the chain wouldn't fire on a + # genuinely-clean cluster (every alert was Watchdog/RebootRequired/etc.). curl -sf "$PROM/api/v1/alerts" \ | jq -r '.data.alerts[] | select(.state == "firing") | .labels.alertname' \ - | grep -vE "$regex" | sort -u + | { grep -vE "$regex" || true; } | sort -u } wait_for_node_ready() {