cluster-health: helm check #18 catches pending/failed releases (helm list -a)
All checks were successful
ci/woodpecker/push/default Pipeline was successful
All checks were successful
ci/woodpecker/push/default Pipeline was successful
check_helm_releases used `helm list` without -a, which HIDES pending-upgrade and failed releases — so on 2026-06-16 check #18 reported "All deployed" while the prometheus release sat in pending-upgrade for ~4 days, silently blocking every monitoring terragrunt apply (frozen alert/rule config). Add -a to surface them and flag pending-* (FAIL, blocks applies) + failed (WARN); deployed/uninstalled/superseded stay green.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
e74f4208f5
commit
044444d328
1 changed files with 9 additions and 4 deletions
|
|
@@ -1093,7 +1093,7 @@ check_helm_releases() {
|
||||||
section 18 "Helm Release Health"
|
section 18 "Helm Release Health"
|
||||||
local releases detail="" had_issue=false status="PASS"
|
local releases detail="" had_issue=false status="PASS"
|
||||||
|
|
||||||
releases=$(helm list -A --kubeconfig "$KUBECONFIG_PATH" -o json 2>/dev/null) || {
|
releases=$(helm list -A -a --kubeconfig "$KUBECONFIG_PATH" -o json 2>/dev/null) || {
|
||||||
[[ "$QUIET" == true ]] && section_always 18 "Helm Release Health"
|
[[ "$QUIET" == true ]] && section_always 18 "Helm Release Health"
|
||||||
warn "Cannot list Helm releases"
|
warn "Cannot list Helm releases"
|
||||||
json_add "helm_releases" "WARN" "Cannot list"
|
json_add "helm_releases" "WARN" "Cannot list"
|
||||||
|
|
@@ -1108,9 +1108,14 @@ for r in data:
|
||||||
name = r.get("name", "?")
|
name = r.get("name", "?")
|
||||||
ns = r.get("namespace", "?")
|
ns = r.get("namespace", "?")
|
||||||
st = r.get("status", "unknown")
|
st = r.get("status", "unknown")
|
||||||
if st != "deployed":
|
# helm list -a (above) surfaces pending-*/failed releases that plain
|
||||||
level = "FAIL" if st.startswith("pending") else "WARN"
|
# `helm list` HIDES; a stuck pending-upgrade silently blocks every
|
||||||
print(f"{level}:{ns}/{name}:{st}")
|
# terragrunt apply of the stack (2026-06-16 prometheus incident, ~4 days
|
||||||
|
# of frozen monitoring config). Ignore deployed/uninstalled/superseded.
|
||||||
|
if st.startswith("pending"):
|
||||||
|
print(f"FAIL:{ns}/{name}:{st}")
|
||||||
|
elif st == "failed":
|
||||||
|
print(f"WARN:{ns}/{name}:{st}")
|
||||||
' 2>/dev/null) || true
|
' 2>/dev/null) || true
|
||||||
|
|
||||||
if [[ -z "$bad_releases" ]]; then
|
if [[ -z "$bad_releases" ]]; then
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue