From 044444d328afdc0989a525e75da7d57ffcb8dcc4 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Tue, 16 Jun 2026 15:39:06 +0000 Subject: [PATCH] cluster-health: helm check #18 catches pending/failed releases (helm list -a) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit check_helm_releases used `helm list` without -a, which HIDES pending-upgrade and failed releases — so on 2026-06-16 check #18 reported "All deployed" while the prometheus release sat in pending-upgrade for ~4 days, silently blocking every monitoring terragrunt apply (frozen alert/rule config). Add -a to surface them and flag pending-* (FAIL, blocks applies) + failed (WARN); deployed/uninstalled/ superseded stay green. Co-Authored-By: Claude Opus 4.8 --- scripts/cluster_healthcheck.sh | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/scripts/cluster_healthcheck.sh b/scripts/cluster_healthcheck.sh index 4b686abc..51a13b5d 100755 --- a/scripts/cluster_healthcheck.sh +++ b/scripts/cluster_healthcheck.sh @@ -1093,7 +1093,7 @@ check_helm_releases() { section 18 "Helm Release Health" local releases detail="" had_issue=false status="PASS" - releases=$(helm list -A --kubeconfig "$KUBECONFIG_PATH" -o json 2>/dev/null) || { + releases=$(helm list -A -a --kubeconfig "$KUBECONFIG_PATH" -o json 2>/dev/null) || { [[ "$QUIET" == true ]] && section_always 18 "Helm Release Health" warn "Cannot list Helm releases" json_add "helm_releases" "WARN" "Cannot list" @@ -1108,9 +1108,14 @@ for r in data: name = r.get("name", "?") ns = r.get("namespace", "?") st = r.get("status", "unknown") - if st != "deployed": - level = "FAIL" if st.startswith("pending") else "WARN" - print(f"{level}:{ns}/{name}:{st}") + # helm list -a (above) surfaces pending-*/failed releases that plain + # `helm list` HIDES; a stuck pending-upgrade silently blocks every + # terragrunt apply of the stack (2026-06-16 prometheus incident, ~4 days + # of frozen monitoring config). Ignore deployed/uninstalled/superseded. + if st.startswith("pending"): + print(f"FAIL:{ns}/{name}:{st}") + elif st == "failed": + print(f"WARN:{ns}/{name}:{st}") ' 2>/dev/null) || true if [[ -z "$bad_releases" ]]; then