#!/usr/bin/env bash
# Cluster health check script (pod-compatible version).
# Runs 24 diagnostic checks against the Kubernetes cluster and prints
# a colour-coded report with PASS / WARN / FAIL for each section.
# Optionally posts results to Slack.
#
# Usage: ./cluster-health.sh [--fix] [--quiet|-q] [--json] [--kubeconfig <path>] [--no-slack]
#
# Environment:
#   KUBECONFIG           — path to kubeconfig (used in pod environment)
#   SLACK_WEBHOOK_URL    — Slack incoming webhook URL (required unless --no-slack)
#   UPTIME_KUMA_PASSWORD — Uptime Kuma admin password
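#
# Examples (illustrative):
#   ./cluster-health.sh --quiet --no-slack   # console run, show only problem sections
#   ./cluster-health.sh --json               # machine-readable summary on stdout
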
set -euo pipefail

# --- Colors ---
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
BLUE='\033[0;34m'
BOLD='\033[1m'
NC='\033[0m'

# --- Globals ---
PASS_COUNT=0
WARN_COUNT=0
FAIL_COUNT=0
FIX=false
QUIET=false
JSON=false
SEND_SLACK=true
KUBECONFIG_PATH="${KUBECONFIG:-$(pwd)/config}"
KUBECTL=""
JSON_RESULTS=()
TOTAL_CHECKS=24

# --- Helpers ---
info() { [[ "$JSON" == true ]] && return 0; echo -e "${BLUE}[INFO]${NC} $*"; }
pass() { PASS_COUNT=$((PASS_COUNT + 1)); [[ "$JSON" == true ]] && return 0; [[ "$QUIET" == true ]] && return 0; echo -e "${GREEN}[PASS]${NC} $*"; }
warn() { WARN_COUNT=$((WARN_COUNT + 1)); [[ "$JSON" == true ]] && return 0; echo -e "${YELLOW}[WARN]${NC} $*"; }
fail() { FAIL_COUNT=$((FAIL_COUNT + 1)); [[ "$JSON" == true ]] && return 0; echo -e "${RED}[FAIL]${NC} $*"; }

section() {
  local num="$1" title="$2"
  [[ "$JSON" == true ]] && return 0
  [[ "$QUIET" == true ]] && return 0
  echo ""
  echo -e "${BOLD}[$num/$TOTAL_CHECKS] $title${NC}"
}

# Shown even under --quiet, for checks that found issues
section_always() {
  local num="$1" title="$2"
  [[ "$JSON" == true ]] && return 0
  echo ""
  echo -e "${BOLD}[$num/$TOTAL_CHECKS] $title${NC}"
}

json_add() {
  local name="$1" status="$2" detail="$3"
  local escaped
  escaped=$(echo "$detail" | python3 -c 'import json,sys; print(json.dumps(sys.stdin.read().strip()))')
  JSON_RESULTS+=("{\"check\":\"$name\",\"status\":\"$status\",\"detail\":$escaped}")
}
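
# Example element appended to JSON_RESULTS (illustrative):
#   {"check":"node_status","status":"PASS","detail":"All nodes Ready"}
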
# count lines in a variable, returning 0 for empty strings
count_lines() {
  local input="$1"
  if [[ -z "$input" ]]; then
    echo 0
  else
    echo "$input" | wc -l | tr -d ' '
  fi
}
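
# Note: the empty-string guard above matters because `echo "" | wc -l` prints 1, not 0.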

# --- Argument parsing ---
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --fix) FIX=true; shift ;;
      --quiet|-q) QUIET=true; shift ;;
      --json) JSON=true; shift ;;
      --no-slack) SEND_SLACK=false; shift ;;
      --kubeconfig) KUBECONFIG_PATH="$2"; shift 2 ;;
      -h|--help)
        echo "Usage: $0 [--fix] [--quiet|-q] [--json] [--kubeconfig <path>] [--no-slack]"
        echo ""
        echo "Flags:"
        echo "  --fix               Auto-remediate safe issues (delete evicted/CrashLoopBackOff pods)"
        echo "  --quiet, -q         Only show WARN and FAIL sections"
        echo "  --json              Machine-readable JSON output"
        echo "  --kubeconfig PATH   Override kubeconfig (default: \$KUBECONFIG or \$(pwd)/config)"
        echo "  --no-slack          Skip Slack notification"
        exit 0
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done
  KUBECTL="kubectl --kubeconfig $KUBECONFIG_PATH"
}

# --- 1. Node Status ---
check_nodes() {
  section 1 "Node Status"
  local nodes not_ready versions unique_versions detail=""
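  # "kubectl get nodes --no-headers" columns: NAME STATUS ROLES AGE VERSION ($2=status, $5=version)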
  nodes=$($KUBECTL get nodes --no-headers 2>&1) || { fail "Cannot reach cluster"; json_add "node_status" "FAIL" "Cannot reach cluster"; return 0; }
  not_ready=$(echo "$nodes" | awk '$2 != "Ready" {print $1}' || true)
  versions=$(echo "$nodes" | awk '{print $5}' | sort -u)
  unique_versions=$(echo "$versions" | wc -l | tr -d ' ')
  if [[ -n "$not_ready" ]]; then
    [[ "$QUIET" == true ]] && section_always 1 "Node Status"
    fail "NotReady nodes: $not_ready"
    detail="NotReady: $not_ready"
    json_add "node_status" "FAIL" "$detail"
  elif [[ "$unique_versions" -gt 1 ]]; then
    [[ "$QUIET" == true ]] && section_always 1 "Node Status"
    warn "Version mismatch across nodes: $(echo "$versions" | tr '\n' ' ')"
    detail="Version mismatch: $(echo "$versions" | tr '\n' ' ')"
    json_add "node_status" "WARN" "$detail"
  else
    pass "All nodes Ready, version $(echo "$versions" | head -1)"
    detail="All nodes Ready"
    json_add "node_status" "PASS" "$detail"
  fi
}

# --- 2. Node Resources ---
check_resources() {
  section 2 "Node Resources"
  local top detail="" had_issue=false status="PASS"
  top=$($KUBECTL top nodes --no-headers 2>&1) || { fail "metrics-server unavailable"; json_add "node_resources" "FAIL" "metrics-server unavailable"; return 0; }
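  # "kubectl top nodes --no-headers" columns: NAME CPU(cores) CPU% MEMORY(bytes) MEMORY% ($3=CPU%, $5=MEM%)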
  while IFS= read -r line; do
    local node cpu_pct mem_pct
    node=$(echo "$line" | awk '{print $1}')
    cpu_pct=$(echo "$line" | awk '{print $3}' | tr -d '%')
    mem_pct=$(echo "$line" | awk '{print $5}' | tr -d '%')
    # Skip nodes where metrics are not yet available
    if [[ "$cpu_pct" == *"unknown"* ]] || [[ "$mem_pct" == *"unknown"* ]]; then
      detail+="$node metrics unavailable; "
      continue
    fi
    if [[ "$cpu_pct" -gt 90 ]] || [[ "$mem_pct" -gt 90 ]]; then
      [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 2 "Node Resources"
      fail "$node: CPU ${cpu_pct}%, Mem ${mem_pct}%"
      detail+="$node CPU=${cpu_pct}% Mem=${mem_pct}% [FAIL]; "
      had_issue=true
      status="FAIL"
    elif [[ "$cpu_pct" -gt 80 ]] || [[ "$mem_pct" -gt 80 ]]; then
      [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 2 "Node Resources"
      warn "$node: CPU ${cpu_pct}%, Mem ${mem_pct}%"
      detail+="$node CPU=${cpu_pct}% Mem=${mem_pct}% [WARN]; "
      had_issue=true
      [[ "$status" != "FAIL" ]] && status="WARN"
    else
      detail+="$node CPU=${cpu_pct}% Mem=${mem_pct}% [OK]; "
    fi
  done <<< "$top"
  [[ "$had_issue" == false ]] && pass "All nodes below 80% CPU and memory"
  json_add "node_resources" "$status" "$detail"
}

# --- 3. Node Conditions ---
check_conditions() {
  section 3 "Node Conditions"
  local conditions detail=""
  conditions=$($KUBECTL get nodes -o json | python3 -c '
import json, sys

data = json.load(sys.stdin)
for node in data["items"]:
    name = node["metadata"]["name"]
    for c in node["status"]["conditions"]:
        if c["type"] in ("MemoryPressure", "DiskPressure", "PIDPressure") and c["status"] == "True":
            print(name + ": " + c["type"])
' 2>&1) || true
  if [[ -n "$conditions" ]]; then
    [[ "$QUIET" == true ]] && section_always 3 "Node Conditions"
    while IFS= read -r line; do
      fail "$line"
    done <<< "$conditions"
    detail="$conditions"
    json_add "node_conditions" "FAIL" "$detail"
  else
    pass "No pressure conditions on any node"
    json_add "node_conditions" "PASS" "No pressure conditions"
  fi
}

# --- 4. Problematic Pods ---
check_pods() {
  section 4 "Problematic Pods"
  local bad count detail="" status="PASS"
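  # Two passes: the field selector catches pods stuck outside Running/Succeeded, while the
  # unfiltered grep catches CrashLoopBackOff/ImagePullBackOff pods whose phase still reads
  # Running; awk then dedups on (namespace, name).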
  bad=$({
    $KUBECTL get pods -A --no-headers --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null \
      | grep -E 'CrashLoopBackOff|Error|Pending|Init:|ImagePullBackOff|ErrImagePull' || true
    $KUBECTL get pods -A --no-headers 2>/dev/null \
      | grep -E 'CrashLoopBackOff|ImagePullBackOff|ErrImagePull' || true
  } | awk '!seen[$1,$2]++' | sed '/^$/d') || true
  count=$(count_lines "$bad")

  # Auto-fix CrashLoopBackOff pods with >10 restarts when --fix is enabled
  if [[ "$FIX" == true && "$count" -gt 0 ]]; then
    local fixed_count=0
    while IFS= read -r line; do
      [[ -z "$line" ]] && continue
      local ns pod pod_status restarts restarts_clean
      ns=$(echo "$line" | awk '{print $1}')
      pod=$(echo "$line" | awk '{print $2}')
      pod_status=$(echo "$line" | awk '{print $4}')
      restarts=$(echo "$line" | awk '{print $5}')
      restarts_clean=$(echo "$restarts" | grep -oE '^[0-9]+' || echo "0")
      if [[ "$pod_status" == "CrashLoopBackOff" ]] && [[ "$restarts_clean" -gt 10 ]]; then
        info "Deleting CrashLoopBackOff pod $ns/$pod (restarts: $restarts_clean)"
        $KUBECTL delete pod -n "$ns" "$pod" --grace-period=0 2>/dev/null || true
        fixed_count=$((fixed_count + 1))
      fi
    done <<< "$bad"
    if [[ "$fixed_count" -gt 0 ]]; then
      info "Deleted $fixed_count CrashLoopBackOff pod(s) with >10 restarts"
    fi
  fi

  if [[ "$count" -eq 0 ]]; then
    pass "No problematic pods"
    detail="None"
  elif [[ "$count" -le 10 ]]; then
    [[ "$QUIET" == true ]] && section_always 4 "Problematic Pods"
    warn "$count problematic pod(s):"
    [[ "$JSON" != true ]] && echo "$bad" | while IFS= read -r line; do echo "  $line"; done
    detail="$count pods"
    status="WARN"
  else
    [[ "$QUIET" == true ]] && section_always 4 "Problematic Pods"
    fail "$count problematic pods (showing first 10):"
    [[ "$JSON" != true ]] && echo "$bad" | head -10 | while IFS= read -r line; do echo "  $line"; done
    detail="$count pods"
    status="FAIL"
  fi
  json_add "problematic_pods" "$status" "$detail"
}

# --- 5. Evicted/Failed Pods ---
check_evicted() {
  section 5 "Evicted/Failed Pods"
  local evicted count detail="" status="PASS"
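  # Evicted pods surface with status.phase=Failed, so one field selector covers both cases.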
  evicted=$($KUBECTL get pods -A --no-headers --field-selector=status.phase=Failed 2>/dev/null || true)
  count=$(count_lines "$evicted")
  if [[ "$count" -eq 0 ]]; then
    pass "No evicted or failed pods"
    detail="0"
  elif [[ "$count" -le 50 ]]; then
    [[ "$QUIET" == true ]] && section_always 5 "Evicted/Failed Pods"
    warn "$count evicted/failed pod(s)"
    detail="$count pods"
    status="WARN"
  else
    [[ "$QUIET" == true ]] && section_always 5 "Evicted/Failed Pods"
    fail "$count evicted/failed pods"
    detail="$count pods"
    status="FAIL"
  fi

  if [[ "$FIX" == true && "$count" -gt 0 ]]; then
    info "Deleting $count evicted/failed pods..."
    $KUBECTL delete pods -A --field-selector=status.phase=Failed 2>/dev/null || true
    info "Deleted evicted/failed pods"
  fi
  json_add "evicted_pods" "$status" "$detail"
}

# --- 6. DaemonSets ---
check_daemonsets() {
  section 6 "DaemonSets"
  local ds detail="" had_issue=false
  ds=$($KUBECTL get daemonsets -A --no-headers 2>&1) || { fail "Cannot list DaemonSets"; json_add "daemonsets" "FAIL" "Cannot list"; return 0; }
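  # "kubectl get daemonsets -A" columns: NAMESPACE NAME DESIRED CURRENT READY ... ($3=desired, $5=ready)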
  while IFS= read -r line; do
    local ns name desired ready
    ns=$(echo "$line" | awk '{print $1}')
    name=$(echo "$line" | awk '{print $2}')
    desired=$(echo "$line" | awk '{print $3}')
    ready=$(echo "$line" | awk '{print $5}')
    if [[ "$desired" != "$ready" ]]; then
      [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 6 "DaemonSets"
      fail "$ns/$name: desired=$desired ready=$ready"
      detail+="$ns/$name desired=$desired ready=$ready; "
      had_issue=true
    fi
  done <<< "$ds"
  if [[ "$had_issue" == false ]]; then
    pass "All DaemonSets healthy (desired == ready)"
    json_add "daemonsets" "PASS" "All healthy"
  else
    json_add "daemonsets" "FAIL" "$detail"
  fi
}

# --- 7. Deployments ---
check_deployments() {
  section 7 "Deployments"
  local deps detail="" had_issue=false
  deps=$($KUBECTL get deployments -A --no-headers 2>&1) || { fail "Cannot list Deployments"; json_add "deployments" "FAIL" "Cannot list"; return 0; }
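  # READY column ($3) is "current/desired", e.g. "2/3"; split on "/" to compare the two.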
  while IFS= read -r line; do
    local ns name ready current desired
    ns=$(echo "$line" | awk '{print $1}')
    name=$(echo "$line" | awk '{print $2}')
    ready=$(echo "$line" | awk '{print $3}')
    current=$(echo "$ready" | cut -d/ -f1)
    desired=$(echo "$ready" | cut -d/ -f2)
    if [[ "$current" != "$desired" ]]; then
      [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 7 "Deployments"
      fail "$ns/$name: $current/$desired ready"
      detail+="$ns/$name $current/$desired; "
      had_issue=true
    fi
  done <<< "$deps"

  if [[ "$had_issue" == false ]]; then
    pass "All deployments fully available"
    json_add "deployments" "PASS" "All available"
  else
    json_add "deployments" "FAIL" "$detail"
  fi
}

# --- 8. PVC Status ---
check_pvcs() {
  section 8 "PVC Status"
  local pvcs detail="" had_issue=false
  pvcs=$($KUBECTL get pvc -A --no-headers 2>&1) || true
  if [[ -z "$pvcs" || "$pvcs" == *"No resources found"* ]]; then
    pass "No PVCs in cluster"
    json_add "pvcs" "PASS" "No PVCs"
    return 0
  fi
  while IFS= read -r line; do
    local ns name status
    ns=$(echo "$line" | awk '{print $1}')
    name=$(echo "$line" | awk '{print $2}')
    status=$(echo "$line" | awk '{print $3}')
    if [[ "$status" != "Bound" ]]; then
      [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 8 "PVC Status"
      fail "$ns/$name: $status"
      detail+="$ns/$name=$status; "
      had_issue=true
    fi
  done <<< "$pvcs"
  if [[ "$had_issue" == false ]]; then
    pass "All PVCs Bound"
    json_add "pvcs" "PASS" "All Bound"
  else
    json_add "pvcs" "FAIL" "$detail"
  fi
}

# --- 9. HPA Health ---
check_hpa() {
  section 9 "HPA Health"
  local hpas detail="" had_issue=false status="PASS"
  hpas=$($KUBECTL get hpa -A --no-headers 2>&1) || true
  if [[ -z "$hpas" || "$hpas" == *"No resources found"* ]]; then
    pass "No HPAs configured"
    json_add "hpa" "PASS" "No HPAs"
    return 0
  fi
  while IFS= read -r line; do
    local ns name targets
    ns=$(echo "$line" | awk '{print $1}')
    name=$(echo "$line" | awk '{print $2}')
    # With -A the columns are NAMESPACE NAME REFERENCE TARGETS ..., so TARGETS is $4
    # ($3 would grab REFERENCE and silently skip every utilization check)
    targets=$(echo "$line" | awk '{print $4}')
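    # TARGETS examples: "45%/80%", "<unknown>/80%", or "45%/80%, 30%/50%" for multiple metrics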
    if echo "$targets" | grep -q '<unknown>'; then
      [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 9 "HPA Health"
      fail "$ns/$name: targets=$targets (unknown metrics)"
      detail+="$ns/$name=unknown; "
      had_issue=true
      status="FAIL"
    else
      # Parse percentage values from targets like "45%/80%, 30%/50%"
      local pcts
      pcts=$(echo "$targets" | grep -oE '[0-9]+%/' | tr -d '%/' || true)
      if [[ -n "$pcts" ]]; then
        while IFS= read -r pct; do
          [[ -z "$pct" ]] && continue
          if [[ "$pct" -gt 150 ]]; then
            [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 9 "HPA Health"
            fail "$ns/$name: utilization at ${pct}%"
            detail+="$ns/$name=${pct}%; "
            had_issue=true
            status="FAIL"
            break
          elif [[ "$pct" -gt 100 ]]; then
            [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 9 "HPA Health"
            warn "$ns/$name: utilization at ${pct}%"
            detail+="$ns/$name=${pct}%; "
            had_issue=true
            [[ "$status" != "FAIL" ]] && status="WARN"
            break
          fi
        done <<< "$pcts"
      fi
    fi
  done <<< "$hpas"

  [[ "$had_issue" == false ]] && pass "All HPAs healthy"
  json_add "hpa" "$status" "${detail:-All healthy}"
}

# --- 10. CronJob Failures ---
check_cronjobs() {
  section 10 "CronJob Failures"
  local failures detail=""
  failures=$($KUBECTL get jobs -A -o json 2>/dev/null | python3 -c '
import json, sys
from datetime import datetime, timezone, timedelta

data = json.load(sys.stdin)
cutoff = datetime.now(timezone.utc) - timedelta(hours=24)
for job in data.get("items", []):
    meta = job.get("metadata", {})
    ns = meta.get("namespace", "")
    name = meta.get("name", "")
    owners = meta.get("ownerReferences", [])
    is_cronjob = any(o.get("kind") == "CronJob" for o in owners)
    if not is_cronjob:
        continue
    conditions = job.get("status", {}).get("conditions", [])
    for c in conditions:
        if c.get("type") == "Failed" and c.get("status") == "True":
            ts = c.get("lastTransitionTime", "")
            reason = c.get("reason", "Unknown")
            if ts:
                try:
                    t = datetime.fromisoformat(ts.replace("Z", "+00:00"))
                    if t > cutoff:
                        print(f"{ns}/{name}: {reason}")
                except ValueError:
                    # Unparseable timestamp: report the failure anyway
                    print(f"{ns}/{name}: {reason}")
' 2>/dev/null) || true
  if [[ -z "$failures" ]]; then
    pass "No CronJob failures in last 24h"
    json_add "cronjob_failures" "PASS" "None"
  else
    [[ "$QUIET" == true ]] && section_always 10 "CronJob Failures"
    local count
    count=$(count_lines "$failures")
    fail "$count CronJob failure(s) in last 24h:"
    [[ "$JSON" != true ]] && echo "$failures" | while IFS= read -r line; do echo "  $line"; done
    json_add "cronjob_failures" "FAIL" "$count failures"
  fi
}

# --- 11. CrowdSec ---
check_crowdsec() {
  section 11 "CrowdSec Agents"
  local cs_pods not_running
  cs_pods=$($KUBECTL get pods -n crowdsec --no-headers 2>/dev/null || true)
  if [[ -z "$cs_pods" ]]; then
    [[ "$QUIET" == true ]] && section_always 11 "CrowdSec Agents"
    warn "CrowdSec namespace not found or empty"
    json_add "crowdsec" "WARN" "No CrowdSec pods found"
    return 0
  fi
  not_running=$(echo "$cs_pods" | awk '$3 != "Running" && $3 != "Completed" {print $1 ": " $3}' || true)
  if [[ -n "$not_running" ]]; then
    [[ "$QUIET" == true ]] && section_always 11 "CrowdSec Agents"
    while IFS= read -r line; do
      fail "CrowdSec pod not running: $line"
    done <<< "$not_running"
    json_add "crowdsec" "FAIL" "$not_running"
  else
    local total
    total=$(count_lines "$cs_pods")
    pass "All $total CrowdSec pods running"
    json_add "crowdsec" "PASS" "$total pods running"
  fi
}

# --- 12. Ingress ---
check_ingresses() {
  section 12 "Ingress Routes"
  local ingresses no_lb detail="" had_issue=false
  ingresses=$($KUBECTL get ingress -A --no-headers 2>/dev/null || true)
  if [[ -n "$ingresses" ]]; then
    # ADDRESS is the 5th column: NAMESPACE NAME CLASS HOSTS ADDRESS PORTS AGE
    no_lb=$(echo "$ingresses" | awk '{if ($5 == "" || $5 == "<none>") print $1"/"$2}' || true)
    if [[ -n "$no_lb" ]]; then
      [[ "$QUIET" == true ]] && section_always 12 "Ingress Routes"
      while IFS= read -r line; do
        fail "Ingress missing LB IP: $line"
      done <<< "$no_lb"
      detail="Missing LB: $no_lb"
      had_issue=true
    fi
  fi
  # Check Traefik LB service
  local traefik_svc_ip
  traefik_svc_ip=$($KUBECTL get svc -n traefik traefik -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || true)
  if [[ -z "$traefik_svc_ip" ]]; then
    [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 12 "Ingress Routes"
    fail "Traefik LoadBalancer has no external IP"
    detail+="Traefik LB missing IP; "
    had_issue=true
  else
    detail+="Traefik LB=$traefik_svc_ip; "
  fi
  if [[ "$had_issue" == false ]]; then
    pass "All ingresses have LB assignment (Traefik LB=$traefik_svc_ip)"
    json_add "ingresses" "PASS" "$detail"
  else
    json_add "ingresses" "FAIL" "$detail"
  fi
}

# --- 13. Prometheus Alerts ---
check_alerts() {
  section 13 "Prometheus Alerts"
  local alerts firing_count
  # Try Alertmanager first, then the Prometheus server
  alerts=$($KUBECTL exec -n monitoring deploy/prometheus-alertmanager -- \
    wget -q -O- http://localhost:9093/api/v2/alerts 2>/dev/null || true)
  if [[ -z "$alerts" ]]; then
    alerts=$($KUBECTL exec -n monitoring deploy/prometheus-server -- \
      wget -q -O- http://localhost:9090/api/v1/alerts 2>/dev/null || true)
  fi
  if [[ -z "$alerts" ]]; then
    [[ "$QUIET" == true ]] && section_always 13 "Prometheus Alerts"
    warn "Could not query Prometheus/Alertmanager"
    json_add "prometheus_alerts" "WARN" "Cannot query"
    return 0
  fi
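  # Alertmanager /api/v2/alerts returns a bare JSON list (status.state == "active"), while
  # Prometheus /api/v1/alerts nests alerts under .data.alerts (state == "firing").
  # The parser below emits "COUNT:name1,name2" for either shape.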
  firing_count=$(echo "$alerts" | python3 -c '
import json, sys

try:
    data = json.load(sys.stdin)
    if isinstance(data, list):
        active = [a for a in data if a.get("status", {}).get("state") == "active"]
        count = len(active)
        names = [a.get("labels", {}).get("alertname", "?") for a in active]
        print(f"{count}:" + ",".join(names) if count > 0 else "0:")
    elif isinstance(data, dict) and "data" in data:
        alerts_list = data["data"].get("alerts", [])
        firing = [a for a in alerts_list if a.get("state") == "firing"]
        count = len(firing)
        names = [a.get("labels", {}).get("alertname", "?") for a in firing]
        print(f"{count}:" + ",".join(names) if count > 0 else "0:")
    else:
        print("0:")
except Exception:
    print("-1:")
' 2>/dev/null || echo "-1:")
  local count names
  count=$(echo "$firing_count" | cut -d: -f1)
  names=$(echo "$firing_count" | cut -d: -f2-)
  if [[ "$count" == "-1" ]]; then
    [[ "$QUIET" == true ]] && section_always 13 "Prometheus Alerts"
    warn "Failed to parse alert data"
    json_add "prometheus_alerts" "WARN" "Parse error"
  elif [[ "$count" -eq 0 ]]; then
    pass "No firing alerts"
    json_add "prometheus_alerts" "PASS" "0 firing"
  elif [[ "$count" -le 3 ]]; then
    [[ "$QUIET" == true ]] && section_always 13 "Prometheus Alerts"
    warn "$count firing alert(s): $names"
    json_add "prometheus_alerts" "WARN" "$count firing: $names"
  else
    [[ "$QUIET" == true ]] && section_always 13 "Prometheus Alerts"
    fail "$count firing alerts: $names"
    json_add "prometheus_alerts" "FAIL" "$count firing: $names"
  fi
}

# --- 14. Uptime Kuma ---
check_uptime_kuma() {
  section 14 "Uptime Kuma Monitors"
  local result
  result=$(python3 -c '
import sys, os

try:
    from uptime_kuma_api import UptimeKumaApi
except ImportError:
    print("ERROR:uptime-kuma-api not installed")
    sys.exit(0)

try:
    api = UptimeKumaApi("https://uptime.viktorbarzin.me", timeout=30)
    password = os.environ.get("UPTIME_KUMA_PASSWORD", "")
    if not password:
        print("ERROR:UPTIME_KUMA_PASSWORD not set")
        sys.exit(0)
    api.login("admin", password)

    monitors = api.get_monitors()
    # Build id->name map and track active/paused
    id_to_name = {}
    paused_count = 0
    for m in monitors:
        mid = m.get("id")
        name = m.get("name", "unknown")
        active = m.get("active", True)
        if not active:
            paused_count += 1
        else:
            id_to_name[mid] = name

    # Use bulk heartbeat fetch (single API call) instead of per-monitor calls
    heartbeats = api.get_heartbeats()
    down = []
    up_count = 0
    for mid, name in id_to_name.items():
        beats = heartbeats.get(mid, [])
        if beats:
            last_beat = beats[-1]
            # Handle nested lists (some monitors return list of lists)
            if isinstance(last_beat, list):
                last_beat = last_beat[-1] if last_beat else {}
            status = last_beat.get("status", 0) if isinstance(last_beat, dict) else 0
            # Handle both enum and int (MonitorStatus.UP == 1)
            if status == 1:
                up_count += 1
            elif status == 3:
                paused_count += 1
            else:
                down.append(name)
        else:
            # No heartbeats = unknown, treat as down
            down.append(name)
    api.disconnect()
    down_count = len(down)
    total_active = up_count + down_count
    down_names = ", ".join(down) if down else ""
    print(f"{down_count}:{up_count}:{paused_count}:{total_active}:{down_names}")
except Exception as e:
    print(f"CONN_ERROR:{e}")
' 2>/dev/null) || result="CONN_ERROR:python execution failed"
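  # Success output format: "DOWN:UP:PAUSED:TOTAL_ACTIVE:name1, name2" (names listed only when monitors are down)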
  if [[ "$result" == "ERROR:"* ]]; then
    [[ "$QUIET" == true ]] && section_always 14 "Uptime Kuma Monitors"
    warn "Uptime Kuma: ${result#ERROR:}"
    json_add "uptime_kuma" "WARN" "${result#ERROR:}"
  elif [[ "$result" == "CONN_ERROR:"* ]]; then
    [[ "$QUIET" == true ]] && section_always 14 "Uptime Kuma Monitors"
    warn "Cannot connect to Uptime Kuma: ${result#CONN_ERROR:}"
    json_add "uptime_kuma" "WARN" "Connection failed"
  else
    local down_count up_count paused_count total_active down_names
    down_count=$(echo "$result" | cut -d: -f1)
    up_count=$(echo "$result" | cut -d: -f2)
    paused_count=$(echo "$result" | cut -d: -f3)
    total_active=$(echo "$result" | cut -d: -f4)
    down_names=$(echo "$result" | cut -d: -f5-)
    if [[ "$down_count" -eq 0 ]]; then
      pass "All $total_active active monitors up ($paused_count paused)"
      json_add "uptime_kuma" "PASS" "$total_active up, $paused_count paused"
    elif [[ "$down_count" -le 3 ]]; then
      [[ "$QUIET" == true ]] && section_always 14 "Uptime Kuma Monitors"
      warn "$down_count/$total_active monitor(s) down: $down_names"
      json_add "uptime_kuma" "WARN" "$down_count down: $down_names"
    else
      [[ "$QUIET" == true ]] && section_always 14 "Uptime Kuma Monitors"
      fail "$down_count/$total_active monitors down: $down_names"
      json_add "uptime_kuma" "FAIL" "$down_count down: $down_names"
    fi
  fi
}

# --- 15. ResourceQuota Pressure ---
check_resourcequota() {
  section 15 "ResourceQuota Pressure"
  local quotas detail="" had_issue=false status="PASS"
  quotas=$($KUBECTL get resourcequota -A -o json 2>/dev/null) || { pass "No ResourceQuotas configured"; json_add "resourcequota" "PASS" "No quotas"; return 0; }
  local pressure
  pressure=$(echo "$quotas" | python3 -c '
import json, sys

def parse_cpu(val):
    """Convert CPU value to millicores."""
    val = str(val)
    if val.endswith("m"):
        return float(val[:-1])
    return float(val) * 1000

def parse_mem(val):
    """Convert memory value to bytes."""
    val = str(val)
    units = {"Ki": 1024, "Mi": 1024**2, "Gi": 1024**3, "Ti": 1024**4}
    for suffix, mult in units.items():
        if val.endswith(suffix):
            return float(val[:-len(suffix)]) * mult
    # Plain bytes or numeric
    return float(val)
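
# Examples: parse_cpu("250m") -> 250.0 millicores; parse_mem("2Gi") -> 2147483648.0 bytes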

data = json.load(sys.stdin)
for item in data.get("items", []):
    ns = item["metadata"]["namespace"]
    name = item["metadata"]["name"]
    status = item.get("status", {})
    hard = status.get("hard", {})
    used = status.get("used", {})
    for resource, hard_val in hard.items():
        used_val = used.get(resource, "0")
        try:
            if "cpu" in resource:
                h = parse_cpu(hard_val)
                u = parse_cpu(used_val)
            elif "memory" in resource or "storage" in resource:
                h = parse_mem(hard_val)
                u = parse_mem(used_val)
            elif resource == "pods":
                h = float(hard_val)
                u = float(used_val)
            else:
                continue
            if h <= 0:
                continue
            pct = (u / h) * 100
            if pct > 80:
                level = "FAIL" if pct > 95 else "WARN"
                print(f"{level}:{ns}/{name}:{resource}:{pct:.0f}%")
        except (ValueError, ZeroDivisionError):
            pass
' 2>/dev/null) || true
  if [[ -z "$pressure" ]]; then
    pass "All ResourceQuotas below 80% usage"
    json_add "resourcequota" "PASS" "All below 80%"
  else
    [[ "$QUIET" == true ]] && section_always 15 "ResourceQuota Pressure"
    while IFS= read -r line; do
      local level ns_res resource pct
      level=$(echo "$line" | cut -d: -f1)
      ns_res=$(echo "$line" | cut -d: -f2)
      resource=$(echo "$line" | cut -d: -f3)
      pct=$(echo "$line" | cut -d: -f4)
      if [[ "$level" == "FAIL" ]]; then
        fail "$ns_res: $resource at $pct"
        status="FAIL"
      else
        warn "$ns_res: $resource at $pct"
        [[ "$status" != "FAIL" ]] && status="WARN"
      fi
      detail+="$ns_res $resource=$pct; "
      had_issue=true
    done <<< "$pressure"
    json_add "resourcequota" "$status" "$detail"
  fi
}

# --- 16. StatefulSets ---
check_statefulsets() {
  section 16 "StatefulSets"
  local sts detail="" had_issue=false
  sts=$($KUBECTL get statefulsets -A --no-headers 2>&1) || true
  if [[ -z "$sts" || "$sts" == *"No resources found"* ]]; then
    pass "No StatefulSets in cluster"
    json_add "statefulsets" "PASS" "No StatefulSets"
    return 0
  fi
  while IFS= read -r line; do
    local ns name ready current desired
    ns=$(echo "$line" | awk '{print $1}')
    name=$(echo "$line" | awk '{print $2}')
    ready=$(echo "$line" | awk '{print $3}')
    current=$(echo "$ready" | cut -d/ -f1)
    desired=$(echo "$ready" | cut -d/ -f2)
    if [[ "$current" != "$desired" ]]; then
      [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 16 "StatefulSets"
      fail "$ns/$name: $current/$desired ready"
      detail+="$ns/$name $current/$desired; "
      had_issue=true
    fi
  done <<< "$sts"

  if [[ "$had_issue" == false ]]; then
    pass "All StatefulSets fully available"
    json_add "statefulsets" "PASS" "All available"
  else
    json_add "statefulsets" "FAIL" "$detail"
  fi
}

# --- 17. Node Disk Usage ---
check_node_disk() {
  section 17 "Node Disk Usage"
  local node_json detail="" had_issue=false status="PASS"
  node_json=$($KUBECTL get nodes -o json 2>/dev/null) || { fail "Cannot get node info"; json_add "node_disk" "FAIL" "Cannot get nodes"; return 0; }

  local disk_info
  disk_info=$(echo "$node_json" | python3 -c '
import json, sys

def parse_storage(val):
    """Convert storage value to bytes."""
    val = str(val)
    units = {"Ki": 1024, "Mi": 1024**2, "Gi": 1024**3, "Ti": 1024**4}
    for suffix, mult in units.items():
        if val.endswith(suffix):
            return float(val[:-len(suffix)]) * mult
    return float(val)

data = json.load(sys.stdin)
for node in data["items"]:
    name = node["metadata"]["name"]
    cap = node["status"].get("capacity", {})
    alloc = node["status"].get("allocatable", {})
    es_cap = cap.get("ephemeral-storage", "0")
    es_alloc = alloc.get("ephemeral-storage", "0")
    try:
        c = parse_storage(es_cap)
        a = parse_storage(es_alloc)
        if c > 0:
            used_pct = ((c - a) / c) * 100
            if used_pct > 80:
                level = "FAIL" if used_pct > 90 else "WARN"
                print(f"{level}:{name}:{used_pct:.0f}")
    except (ValueError, ZeroDivisionError):
        pass
' 2>/dev/null) || true
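  # Note: capacity minus allocatable measures kubelet reservations and eviction thresholds,
  # not live disk usage, so treat this as a coarse signal rather than a df(1) reading.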
  if [[ -z "$disk_info" ]]; then
    pass "All nodes below 80% ephemeral-storage usage"
    json_add "node_disk" "PASS" "All below 80%"
  else
    [[ "$QUIET" == true ]] && section_always 17 "Node Disk Usage"
    while IFS= read -r line; do
      local level node pct
      level=$(echo "$line" | cut -d: -f1)
      node=$(echo "$line" | cut -d: -f2)
      pct=$(echo "$line" | cut -d: -f3)
      if [[ "$level" == "FAIL" ]]; then
        fail "$node: ephemeral-storage at ${pct}%"
        status="FAIL"
      else
        warn "$node: ephemeral-storage at ${pct}%"
        [[ "$status" != "FAIL" ]] && status="WARN"
      fi
      detail+="$node=${pct}%; "
      had_issue=true
    done <<< "$disk_info"
    json_add "node_disk" "$status" "$detail"
  fi
}

# --- 18. Helm Release Health ---
check_helm_releases() {
  section 18 "Helm Release Health"

  # Helm may not be available in the pod environment
  if ! command -v helm &>/dev/null; then
    pass "Helm not available (skipped)"
    json_add "helm_releases" "PASS" "Helm not available"
    return 0
  fi
  local releases detail="" had_issue=false status="PASS"
  releases=$(helm list -A --kubeconfig "$KUBECONFIG_PATH" --all -o json 2>/dev/null) || {
    [[ "$QUIET" == true ]] && section_always 18 "Helm Release Health"
    warn "Cannot list Helm releases"
    json_add "helm_releases" "WARN" "Cannot list"
    return 0
  }
  local bad_releases
  bad_releases=$(echo "$releases" | python3 -c '
import json, sys

data = json.load(sys.stdin)
for r in data:
    name = r.get("name", "?")
    ns = r.get("namespace", "?")
    st = r.get("status", "unknown")
    if st != "deployed":
        level = "FAIL" if st.startswith("pending") else "WARN"
        print(f"{level}:{ns}/{name}:{st}")
' 2>/dev/null) || true
  if [[ -z "$bad_releases" ]]; then
    pass "All Helm releases in deployed state"
    json_add "helm_releases" "PASS" "All deployed"
  else
    [[ "$QUIET" == true ]] && section_always 18 "Helm Release Health"
    while IFS= read -r line; do
      local level release_name release_status
      level=$(echo "$line" | cut -d: -f1)
      release_name=$(echo "$line" | cut -d: -f2)
      release_status=$(echo "$line" | cut -d: -f3)
      if [[ "$level" == "FAIL" ]]; then
        fail "Helm release $release_name: $release_status (blocks terraform)"
        status="FAIL"
      else
        warn "Helm release $release_name: $release_status"
        [[ "$status" != "FAIL" ]] && status="WARN"
      fi
      detail+="$release_name=$release_status; "
      had_issue=true
    done <<< "$bad_releases"
    json_add "helm_releases" "$status" "$detail"
  fi
}

# --- 19. Kyverno Policy Engine ---
check_kyverno() {
  section 19 "Kyverno Policy Engine"
  local kv_pods not_running
  kv_pods=$($KUBECTL get pods -n kyverno --no-headers 2>/dev/null || true)
  if [[ -z "$kv_pods" ]]; then
    [[ "$QUIET" == true ]] && section_always 19 "Kyverno Policy Engine"
    fail "Kyverno namespace not found or empty — policy engine down, cascading cluster impact"
    json_add "kyverno" "FAIL" "No Kyverno pods found"
    return 0
  fi
  not_running=$(echo "$kv_pods" | awk '$3 != "Running" && $3 != "Completed" {print $1 ": " $3}' || true)
  if [[ -n "$not_running" ]]; then
    [[ "$QUIET" == true ]] && section_always 19 "Kyverno Policy Engine"
    while IFS= read -r line; do
      fail "Kyverno pod not running: $line"
    done <<< "$not_running"
    json_add "kyverno" "FAIL" "$not_running"
  else
    local total
    total=$(count_lines "$kv_pods")
    pass "All $total Kyverno pods running"
    json_add "kyverno" "PASS" "$total pods running"
  fi
}

# --- 20. NFS Connectivity ---
check_nfs() {
  section 20 "NFS Connectivity"
  # Try native tools first (available locally), fall back to kubectl-based check (pod environment)
  if command -v showmount &>/dev/null; then
    if showmount -e 10.0.10.15 &>/dev/null; then
      pass "NFS server 10.0.10.15 reachable (exports listed)"
      json_add "nfs" "PASS" "NFS reachable"
      return 0
    fi
  fi
  if command -v nc &>/dev/null; then
    if nc -z -G 3 10.0.10.15 2049 &>/dev/null; then
      pass "NFS server 10.0.10.15 port 2049 open"
      json_add "nfs" "PASS" "NFS port open"
      return 0
    fi
  fi
  # Fallback: check if NFS-backed pods are running (works in pod environment)
  local nfs_pods
  nfs_pods=$($KUBECTL get pods -A -o json 2>/dev/null | python3 -c '
import json, sys

data = json.load(sys.stdin)
count = 0
for pod in data.get("items", []):
    for vol in pod.get("spec", {}).get("volumes", []):
        if "nfs" in vol:
            if pod.get("status", {}).get("phase") == "Running":
                count += 1
            break
print(count)
' 2>/dev/null) || nfs_pods="0"
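  # This is a proxy signal: Running pods with NFS volumes imply working mounts, though it
  # cannot prove the server is reachable at this exact moment.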
  if [[ "$nfs_pods" -gt 0 ]]; then
    pass "NFS healthy ($nfs_pods pods using NFS volumes are running)"
    json_add "nfs" "PASS" "$nfs_pods NFS pods running"
  else
    [[ "$QUIET" == true ]] && section_always 20 "NFS Connectivity"
    warn "Cannot verify NFS (showmount not available, no NFS pods found)"
    json_add "nfs" "WARN" "Cannot verify"
  fi
}

# --- 21. DNS Resolution ---
check_dns() {
  section 21 "DNS Resolution"
  local internal_ok=false external_ok=false detail=""
  # Try dig first (available locally), fall back to python3 (pod environment).
  # Use the system resolver (no @server) so it works from any host or pod.
  if command -v dig &>/dev/null; then
    if dig viktorbarzin.me +short +time=3 +tries=1 2>/dev/null | grep -q .; then
      internal_ok=true
    fi
    if dig google.com +short +time=3 +tries=1 2>/dev/null | grep -q .; then
      external_ok=true
    fi
  else
    # Fallback: use python3 for DNS resolution (works in pod environment)
    local result
    result=$(python3 -c "
import socket
try:
    socket.getaddrinfo('viktorbarzin.me', 443)
    print('INTERNAL_OK')
except Exception:
    print('INTERNAL_FAIL')
try:
    socket.getaddrinfo('google.com', 443)
    print('EXTERNAL_OK')
except Exception:
    print('EXTERNAL_FAIL')
" 2>/dev/null) || result=""
    if echo "$result" | grep -q "INTERNAL_OK"; then
      internal_ok=true
    fi
    if echo "$result" | grep -q "EXTERNAL_OK"; then
      external_ok=true
    fi
  fi
  if [[ "$internal_ok" == true && "$external_ok" == true ]]; then
    pass "DNS resolves both internal (viktorbarzin.me) and external (google.com)"
    json_add "dns" "PASS" "Both resolve"
  elif [[ "$internal_ok" == true || "$external_ok" == true ]]; then
    [[ "$QUIET" == true ]] && section_always 21 "DNS Resolution"
    if [[ "$internal_ok" == false ]]; then
      warn "DNS: internal (viktorbarzin.me) failed, external (google.com) OK"
      detail="Internal failed"
    else
      warn "DNS: internal (viktorbarzin.me) OK, external (google.com) failed"
      detail="External failed"
    fi
    json_add "dns" "WARN" "$detail"
  else
    [[ "$QUIET" == true ]] && section_always 21 "DNS Resolution"
    fail "DNS not resolving — both internal and external failed"
    json_add "dns" "FAIL" "Both failed"
  fi
}

# --- 22. TLS Certificate Expiry ---
check_tls_certs() {
  section 22 "TLS Certificate Expiry"
  local secrets detail="" had_issue=false status="PASS"
  secrets=$($KUBECTL get secrets -A -o json 2>/dev/null) || {
    [[ "$QUIET" == true ]] && section_always 22 "TLS Certificate Expiry"
    warn "Cannot list secrets"
    json_add "tls_certs" "WARN" "Cannot list secrets"
    return 0
  }
  local cert_issues
  cert_issues=$(echo "$secrets" | python3 -c '
import json, sys, base64, subprocess, hashlib
from datetime import datetime, timezone

data = json.load(sys.stdin)
seen_fingerprints = set()

for item in data.get("items", []):
    if item.get("type") != "kubernetes.io/tls":
        continue
    ns = item["metadata"]["namespace"]
    name = item["metadata"]["name"]
    cert_data = item.get("data", {}).get("tls.crt", "")
    if not cert_data:
        continue
    # Deduplicate by cert fingerprint
    raw = base64.b64decode(cert_data)
    fp = hashlib.sha256(raw).hexdigest()[:16]
    if fp in seen_fingerprints:
        continue
    seen_fingerprints.add(fp)
    # Parse certificate expiry with openssl
    try:
        result = subprocess.run(
            ["openssl", "x509", "-noout", "-enddate", "-subject"],
            input=raw, capture_output=True, timeout=5
        )
        output = result.stdout.decode()
        for line in output.splitlines():
            if line.startswith("notAfter="):
                date_str = line.split("=", 1)[1]
                # Parse openssl date format: "Mon DD HH:MM:SS YYYY GMT"
                try:
                    expiry = datetime.strptime(date_str.strip(), "%b %d %H:%M:%S %Y %Z")
                    expiry = expiry.replace(tzinfo=timezone.utc)
                    days_left = (expiry - datetime.now(timezone.utc)).days
                    if days_left <= 7:
                        print(f"FAIL:{ns}/{name}:{days_left}d")
                    elif days_left <= 30:
                        print(f"WARN:{ns}/{name}:{days_left}d")
                except ValueError:
                    pass
    except Exception:
        pass
' 2>/dev/null) || true
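  # Emitted lines: "LEVEL:namespace/name:<days>d", e.g. "WARN:default/my-tls:21d" (illustrative names)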
  if [[ -z "$cert_issues" ]]; then
    pass "All TLS certificates valid for >30 days"
    json_add "tls_certs" "PASS" "All valid >30d"
  else
    [[ "$QUIET" == true ]] && section_always 22 "TLS Certificate Expiry"
    while IFS= read -r line; do
      local level cert_name days
      level=$(echo "$line" | cut -d: -f1)
      cert_name=$(echo "$line" | cut -d: -f2)
      days=$(echo "$line" | cut -d: -f3)
      if [[ "$level" == "FAIL" ]]; then
        fail "TLS cert $cert_name expires in $days"
        status="FAIL"
      else
        warn "TLS cert $cert_name expires in $days"
        [[ "$status" != "FAIL" ]] && status="WARN"
      fi
      detail+="$cert_name=$days; "
      had_issue=true
    done <<< "$cert_issues"
    json_add "tls_certs" "$status" "$detail"
  fi
}

# --- 23. GPU Health ---
check_gpu() {
  section 23 "GPU Health"
  local gpu_pods not_running
  gpu_pods=$($KUBECTL get pods -n nvidia --no-headers 2>/dev/null || true)
  if [[ -z "$gpu_pods" ]]; then
    [[ "$QUIET" == true ]] && section_always 23 "GPU Health"
    warn "NVIDIA namespace not found or empty"
    json_add "gpu" "WARN" "No GPU pods found"
    return 0
  fi
  # Check specifically for device-plugin (critical for GPU scheduling)
  local device_plugin_down=false
  local other_down=false
  local detail=""

  while IFS= read -r line; do
    local pod_name pod_status
    pod_name=$(echo "$line" | awk '{print $1}')
    pod_status=$(echo "$line" | awk '{print $3}')
    if [[ "$pod_status" != "Running" && "$pod_status" != "Completed" ]]; then
      if echo "$pod_name" | grep -q "device-plugin"; then
        device_plugin_down=true
        detail+="device-plugin $pod_name: $pod_status; "
      else
        other_down=true
        detail+="$pod_name: $pod_status; "
      fi
    fi
  done <<< "$gpu_pods"

  if [[ "$device_plugin_down" == true ]]; then
    [[ "$QUIET" == true ]] && section_always 23 "GPU Health"
    fail "GPU device-plugin is down — GPU workloads cannot schedule"
    json_add "gpu" "FAIL" "$detail"
  elif [[ "$other_down" == true ]]; then
    [[ "$QUIET" == true ]] && section_always 23 "GPU Health"
    warn "Some GPU pods not running: $detail"
    json_add "gpu" "WARN" "$detail"
  else
    local total
    total=$(count_lines "$gpu_pods")
    pass "All $total GPU pods running"
    json_add "gpu" "PASS" "$total pods running"
  fi
}

# --- 24. Cloudflare Tunnel ---
check_cloudflare_tunnel() {
  section 24 "Cloudflare Tunnel"
  local cf_pods running_count total_count
  cf_pods=$($KUBECTL get pods -n cloudflared --no-headers 2>/dev/null || true)
  if [[ -z "$cf_pods" ]]; then
    [[ "$QUIET" == true ]] && section_always 24 "Cloudflare Tunnel"
    fail "Cloudflare tunnel namespace not found or empty — external access broken"
    json_add "cloudflare_tunnel" "FAIL" "No pods found"
    return 0
  fi
  total_count=$(count_lines "$cf_pods")
  running_count=$(echo "$cf_pods" | awk '$3 == "Running"' | wc -l | tr -d ' ')
  if [[ "$running_count" -eq 0 ]]; then
    [[ "$QUIET" == true ]] && section_always 24 "Cloudflare Tunnel"
    fail "Cloudflare tunnel: 0/$total_count pods running — external access broken"
    json_add "cloudflare_tunnel" "FAIL" "0/$total_count running"
  elif [[ "$running_count" -lt "$total_count" ]]; then
    [[ "$QUIET" == true ]] && section_always 24 "Cloudflare Tunnel"
    warn "Cloudflare tunnel: $running_count/$total_count pods running (degraded)"
    json_add "cloudflare_tunnel" "WARN" "$running_count/$total_count running"
  else
    pass "Cloudflare tunnel: all $total_count pods running"
    json_add "cloudflare_tunnel" "PASS" "$total_count pods running"
  fi
}

# --- Summary ---
print_summary() {
  if [[ "$JSON" == true ]]; then
    echo "{"
    echo "  \"timestamp\": \"$(date -u +%Y-%m-%dT%H:%M:%SZ)\","
    echo "  \"pass\": $PASS_COUNT,"
    echo "  \"warn\": $WARN_COUNT,"
    echo "  \"fail\": $FAIL_COUNT,"
    echo "  \"checks\": ["
    local first=true
    for r in "${JSON_RESULTS[@]}"; do
      if [[ "$first" == true ]]; then
        echo "    $r"
        first=false
      else
        echo "    ,$r"
      fi
    done
    echo "  ]"
    echo "}"
    return 0
  fi
  echo ""
  echo -e "${BOLD}═══════════════════════════════════════${NC}"
  echo -e "${BOLD}        Cluster Health Summary${NC}"
  echo -e "${BOLD}═══════════════════════════════════════${NC}"
  echo -e "  ${GREEN}PASS${NC}: $PASS_COUNT   ${YELLOW}WARN${NC}: $WARN_COUNT   ${RED}FAIL${NC}: $FAIL_COUNT"
  echo ""
  if [[ "$FAIL_COUNT" -gt 0 ]]; then
    echo -e "  Overall: ${RED}UNHEALTHY${NC}"
  elif [[ "$WARN_COUNT" -gt 0 ]]; then
    echo -e "  Overall: ${YELLOW}DEGRADED${NC}"
  else
    echo -e "  Overall: ${GREEN}HEALTHY${NC}"
  fi
  echo ""
}
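
# Example --json output shape (illustrative values):
#   {
#     "timestamp": "2026-02-22T01:09:02Z",
#     "pass": 22,
#     "warn": 1,
#     "fail": 1,
#     "checks": [ {"check":"node_status","status":"PASS","detail":"All nodes Ready"} ]
#   }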

# --- Slack Notification ---

# Human-readable check name mapping
friendly_check_name() {
  case "$1" in
    node_status) echo "Node Status" ;;
    node_resources) echo "Node Resources" ;;
    node_conditions) echo "Node Conditions" ;;
    problematic_pods) echo "Problematic Pods" ;;
    evicted_pods) echo "Evicted Pods" ;;
    daemonsets) echo "DaemonSets" ;;
    deployments) echo "Deployments" ;;
    pvcs) echo "PVCs" ;;
    hpa) echo "HPAs" ;;
    cronjob_failures) echo "CronJob Failures" ;;
    crowdsec) echo "CrowdSec" ;;
    ingresses) echo "Ingresses" ;;
    prometheus_alerts) echo "Prometheus Alerts" ;;
    uptime_kuma) echo "Uptime Kuma" ;;
    resourcequota) echo "Resource Quotas" ;;
    statefulsets) echo "StatefulSets" ;;
    node_disk) echo "Node Disk" ;;
    helm_releases) echo "Helm Releases" ;;
    kyverno) echo "Kyverno" ;;
    nfs) echo "NFS Storage" ;;
    dns) echo "DNS Resolution" ;;
    tls_certs) echo "TLS Certificates" ;;
    gpu) echo "GPU" ;;
    cloudflare_tunnel) echo "Cloudflare Tunnel" ;;
    *) echo "$1" ;;
  esac
}

send_slack() {
  if [[ "$SEND_SLACK" != true ]]; then
    return 0
  fi
  if [[ -z "${SLACK_WEBHOOK_URL:-}" ]]; then
    [[ "$JSON" != true ]] && echo "WARNING: SLACK_WEBHOOK_URL not set, skipping Slack notification"
    return 0
  fi
  # Gather stats for summary line
  local node_count pod_count
  node_count=$($KUBECTL get nodes --no-headers 2>/dev/null | wc -l | tr -d ' ')
  pod_count=$($KUBECTL get pods -A --no-headers --field-selector=status.phase=Running 2>/dev/null | wc -l | tr -d ' ')
  local total_checks=$((PASS_COUNT + WARN_COUNT + FAIL_COUNT))

  # Use python3 to build the entire Slack payload from JSON_RESULTS
  local json_results_str
  json_results_str=$(printf '%s\n' "${JSON_RESULTS[@]}")
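  # Each line of json_results_str is one JSON object: {"check":...,"status":...,"detail":...}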

  local json_payload
  json_payload=$(echo "$json_results_str" | python3 -c "
import json, sys
from datetime import datetime, timezone

CHECK_NAMES = {
    'node_status': 'Node Status',
    'node_resources': 'Node Resources',
    'node_conditions': 'Node Conditions',
    'problematic_pods': 'Problematic Pods',
    'evicted_pods': 'Evicted Pods',
    'daemonsets': 'DaemonSets',
    'deployments': 'Deployments',
    'pvcs': 'PVCs',
    'hpa': 'HPAs',
    'cronjob_failures': 'CronJob Failures',
    'crowdsec': 'CrowdSec',
    'ingresses': 'Ingresses',
    'prometheus_alerts': 'Prometheus Alerts',
    'uptime_kuma': 'Uptime Kuma',
    'resourcequota': 'Resource Quotas',
    'statefulsets': 'StatefulSets',
    'node_disk': 'Node Disk',
    'helm_releases': 'Helm Releases',
    'kyverno': 'Kyverno',
    'nfs': 'NFS Storage',
    'dns': 'DNS Resolution',
    'tls_certs': 'TLS Certificates',
    'gpu': 'GPU',
    'cloudflare_tunnel': 'Cloudflare Tunnel',
}

def format_detail(check, detail):
    \"\"\"Format detail text for readability: truncate long lists, split semicolons.\"\"\"
    detail = detail.rstrip('; ').strip()
    # For checks with long comma-separated lists (e.g. Uptime Kuma down monitors),
    # truncate to the first 5 items with a count
    if check == 'uptime_kuma' and ': ' in detail:
        prefix, names_str = detail.split(': ', 1)
        names = [n.strip() for n in names_str.split(',') if n.strip()]
        if len(names) > 5:
            shown = ', '.join(names[:5])
            detail = f'{prefix}: {shown} (+{len(names) - 5} more)'
        elif names:
            detail = prefix + ': ' + ', '.join(names)
    # For resource quotas and similar semicolon-separated items,
    # split into separate lines
    if '; ' in detail:
        parts = [p.strip() for p in detail.split(';') if p.strip()]
        if len(parts) > 1:
            return '\\n'.join(f' \u2022 {p}' for p in parts)
    return detail

# Parse results
fails = []
warns = []
for line in sys.stdin:
    line = line.strip()
    if not line:
        continue
    try:
        d = json.loads(line)
    except json.JSONDecodeError:
        continue
    status = d.get('status', '')
    check = d.get('check', '')
    detail = d.get('detail', '')
    name = CHECK_NAMES.get(check, check)
    formatted = format_detail(check, detail)
    if status == 'FAIL':
        fails.append((name, formatted))
    elif status == 'WARN':
        warns.append((name, formatted))

pass_count = ${PASS_COUNT}
warn_count = ${WARN_COUNT}
fail_count = ${FAIL_COUNT}
total = ${total_checks}
nodes = '${node_count}'
pods = '${pod_count}'

blocks = []
# Header block
if fail_count == 0 and warn_count == 0:
    header = f':white_check_mark: *Cluster Health Check \u2014 All Clear*'
    summary = f'{total}/{total} checks passed \u2022 {nodes} nodes \u2022 {pods} pods'
    blocks.append({'type': 'section', 'text': {'type': 'mrkdwn', 'text': f'{header}\n{summary}'}})
else:
    issue_count = fail_count + warn_count
    emoji = ':rotating_light:' if fail_count > 0 else ':warning:'
    header = f'{emoji} *Cluster Health Check \u2014 {issue_count} Issue(s)*'
    summary = f':white_check_mark: {pass_count} passed \u2022 :warning: {warn_count} warnings \u2022 :x: {fail_count} failed'
    blocks.append({'type': 'section', 'text': {'type': 'mrkdwn', 'text': f'{header}\n{summary}'}})

# Failed section
if fails:
    blocks.append({'type': 'divider'})
    lines = [':x: *Failed*']
    for name, detail in fails:
        if '\\n' in detail:
            lines.append(f'\u2022 *{name}*:')
            lines.append(detail)
        else:
            lines.append(f'\u2022 *{name}*: {detail}')
    blocks.append({'type': 'section', 'text': {'type': 'mrkdwn', 'text': '\\n'.join(lines)}})

# Warnings section
if warns:
    blocks.append({'type': 'divider'})
    lines = [':warning: *Warnings*']
    for name, detail in warns:
        if '\\n' in detail:
            lines.append(f'\u2022 *{name}*:')
            lines.append(detail)
        else:
            lines.append(f'\u2022 *{name}*: {detail}')
    blocks.append({'type': 'section', 'text': {'type': 'mrkdwn', 'text': '\\n'.join(lines)}})

# Footer with timestamp
ts = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')
blocks.append({'type': 'context', 'elements': [{'type': 'mrkdwn', 'text': f'{nodes} nodes \u2022 {pods} pods \u2022 {ts}'}]})

payload = {'blocks': blocks}
print(json.dumps(payload))
")

  curl -s -X POST "$SLACK_WEBHOOK_URL" \
    -H 'Content-Type: application/json' \
    -d "$json_payload" >/dev/null 2>&1 || {
    [[ "$JSON" != true ]] && echo "WARNING: Failed to send Slack notification"
    true  # never abort the script over a failed notification
  }
  [[ "$JSON" != true ]] && echo "Slack notification sent."
  return 0
}

# --- Main ---
main() {
  parse_args "$@"
  if [[ "$JSON" != true ]]; then
    echo -e "${BOLD}Cluster Health Check${NC} — $(date '+%Y-%m-%d %H:%M:%S')"
    echo -e "Kubeconfig: $KUBECONFIG_PATH"
    if [[ "$FIX" == true ]]; then
      echo -e "${YELLOW}Auto-fix mode enabled${NC}"
    fi
  fi

  check_nodes
  check_resources
  check_conditions
  check_pods
  check_evicted
  check_daemonsets
  check_deployments
  check_pvcs
  check_hpa
  check_cronjobs
  check_crowdsec
  check_ingresses
  check_alerts
  check_uptime_kuma
  check_resourcequota
  check_statefulsets
  check_node_disk
  check_helm_releases
  check_kyverno
  check_nfs
  check_dns
  check_tls_certs
  check_gpu
  check_cloudflare_tunnel
  print_summary
  send_slack

  # Always exit 0 — reporting is done via Slack notification.
  # Non-zero exits mark the CronJob as Failed, which triggers Prometheus
  # JobFailed alerts, creating a circular alert loop.
  exit 0
}

main "$@"