infra/.claude/cluster-health.sh
Viktor Barzin 24a23709a5 fix: update healthcheck to report internal and external monitors separately
- Increase Uptime Kuma API timeout to 120s with wait_events=0.2
- Remove hardcoded password, use Vault or UPTIME_KUMA_PASSWORD env var
- Report internal and external monitor status separately
- Install uptime-kuma-api in local venv

[ci skip]

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-14 19:44:20 +00:00

#!/usr/bin/env bash
# Cluster health check script (pod-compatible version).
# Runs 26 diagnostic checks against the Kubernetes cluster and prints
# a colour-coded report with PASS / WARN / FAIL for each section.
# Optionally posts results to Slack.
#
# Usage: ./cluster-health.sh [--fix] [--quiet|-q] [--json] [--kubeconfig <path>] [--no-slack]
#
# Environment:
# KUBECONFIG — path to kubeconfig (used in pod environment)
# SLACK_WEBHOOK_URL — Slack incoming webhook URL (required unless --no-slack)
# UPTIME_KUMA_PASSWORD — Uptime Kuma admin password
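#
# Examples (illustrative):
# ./cluster-health.sh --quiet --no-slack # local run, show issues only
# ./cluster-health.sh --json | jq .fail # machine-readable fail count (assumes jq)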
set -euo pipefail
# --- Colors ---
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
BLUE='\033[0;34m'
BOLD='\033[1m'
NC='\033[0m'
# --- Globals ---
PASS_COUNT=0
WARN_COUNT=0
FAIL_COUNT=0
FIX=false
QUIET=false
JSON=false
SEND_SLACK=true
KUBECONFIG_PATH="${KUBECONFIG:-$(pwd)/config}"
KUBECTL=""
JSON_RESULTS=()
TOTAL_CHECKS=26
# --- Helpers ---
info() { [[ "$JSON" == true ]] && return 0; echo -e "${BLUE}[INFO]${NC} $*"; }
pass() { PASS_COUNT=$((PASS_COUNT + 1)); [[ "$JSON" == true ]] && return 0; [[ "$QUIET" == true ]] && return 0; echo -e " ${GREEN}[PASS]${NC} $*"; }
warn() { WARN_COUNT=$((WARN_COUNT + 1)); [[ "$JSON" == true ]] && return 0; echo -e " ${YELLOW}[WARN]${NC} $*"; }
fail() { FAIL_COUNT=$((FAIL_COUNT + 1)); [[ "$JSON" == true ]] && return 0; echo -e " ${RED}[FAIL]${NC} $*"; }
section() {
local num="$1" title="$2"
[[ "$JSON" == true ]] && return 0
[[ "$QUIET" == true ]] && return 0
echo ""
echo -e "${BOLD}[$num/$TOTAL_CHECKS] $title${NC}"
}
section_always() {
local num="$1" title="$2"
[[ "$JSON" == true ]] && return 0
echo ""
echo -e "${BOLD}[$num/$TOTAL_CHECKS] $title${NC}"
}
json_add() {
local name="$1" status="$2" detail="$3"
local escaped
escaped=$(echo "$detail" | python3 -c 'import json,sys; print(json.dumps(sys.stdin.read().strip()))')
JSON_RESULTS+=("{\"check\":\"$name\",\"status\":\"$status\",\"detail\":$escaped}")
}
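# e.g. json_add "dns" "PASS" "Both resolve" appends:
# {"check":"dns","status":"PASS","detail":"Both resolve"}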
# count lines in a variable, returning 0 for empty strings
count_lines() {
local input="$1"
if [[ -z "$input" ]]; then
echo 0
else
echo "$input" | wc -l | tr -d ' '
fi
}
# --- Argument parsing ---
parse_args() {
while [[ $# -gt 0 ]]; do
case "$1" in
--fix) FIX=true; shift ;;
--quiet|-q) QUIET=true; shift ;;
--json) JSON=true; shift ;;
--no-slack) SEND_SLACK=false; shift ;;
--kubeconfig) KUBECONFIG_PATH="$2"; shift 2 ;;
-h|--help)
echo "Usage: $0 [--fix] [--quiet|-q] [--json] [--kubeconfig <path>] [--no-slack]"
echo ""
echo "Flags:"
echo " --fix Auto-remediate safe issues (delete evicted/CrashLoopBackOff pods)"
echo " --quiet, -q Only show WARN and FAIL sections"
echo " --json Machine-readable JSON output"
echo " --kubeconfig PATH Override kubeconfig (default: \$KUBECONFIG or \$(pwd)/config)"
echo " --no-slack Skip Slack notification"
exit 0
;;
*)
echo "Unknown option: $1" >&2
exit 1
;;
esac
done
KUBECTL="kubectl --kubeconfig $KUBECONFIG_PATH"
# Auto-source UPTIME_KUMA_PASSWORD from terraform.tfvars if not set
if [[ -z "${UPTIME_KUMA_PASSWORD:-}" ]]; then
local script_dir tfvars_file
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
tfvars_file="${script_dir}/../terraform.tfvars"
if [[ -f "$tfvars_file" ]]; then
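# Expects a tfvars line of the form: uptime_kuma_password = "..."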
UPTIME_KUMA_PASSWORD=$(grep 'uptime_kuma_password' "$tfvars_file" | head -1 | sed 's/.*= *"\(.*\)"/\1/')
export UPTIME_KUMA_PASSWORD
fi
fi
}
# --- 1. Node Status ---
check_nodes() {
section 1 "Node Status"
local nodes not_ready versions unique_versions detail=""
nodes=$($KUBECTL get nodes --no-headers 2>&1) || { fail "Cannot reach cluster"; json_add "node_status" "FAIL" "Cannot reach cluster"; return 0; }
not_ready=$(echo "$nodes" | awk '$2 != "Ready" {print $1}' || true)
versions=$(echo "$nodes" | awk '{print $5}' | sort -u)
unique_versions=$(echo "$versions" | wc -l | tr -d ' ')
if [[ -n "$not_ready" ]]; then
[[ "$QUIET" == true ]] && section_always 1 "Node Status"
fail "NotReady nodes: $not_ready"
detail="NotReady: $not_ready"
json_add "node_status" "FAIL" "$detail"
elif [[ "$unique_versions" -gt 1 ]]; then
[[ "$QUIET" == true ]] && section_always 1 "Node Status"
warn "Version mismatch across nodes: $(echo "$versions" | tr '\n' ' ')"
detail="Version mismatch: $(echo "$versions" | tr '\n' ' ')"
json_add "node_status" "WARN" "$detail"
else
pass "All nodes Ready, version $(echo "$versions" | head -1)"
detail="All nodes Ready"
json_add "node_status" "PASS" "$detail"
fi
}
# --- 2. Node Resources ---
check_resources() {
section 2 "Node Resources"
local top detail="" had_issue=false status="PASS"
top=$($KUBECTL top nodes --no-headers 2>&1) || { fail "metrics-server unavailable"; json_add "node_resources" "FAIL" "metrics-server unavailable"; return 0; }
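# `kubectl top nodes` columns: NAME CPU(cores) CPU% MEMORY(bytes) MEMORY%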
while IFS= read -r line; do
local node cpu_pct mem_pct
node=$(echo "$line" | awk '{print $1}')
cpu_pct=$(echo "$line" | awk '{print $3}' | tr -d '%')
mem_pct=$(echo "$line" | awk '{print $5}' | tr -d '%')
# Skip nodes where metrics are not yet available
if [[ "$cpu_pct" == *"unknown"* ]] || [[ "$mem_pct" == *"unknown"* ]]; then
detail+="$node metrics unavailable; "
continue
fi
if [[ "$cpu_pct" -gt 90 ]] || [[ "$mem_pct" -gt 90 ]]; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 2 "Node Resources"
fail "$node: CPU ${cpu_pct}%, Mem ${mem_pct}%"
detail+="$node CPU=${cpu_pct}% Mem=${mem_pct}% [FAIL]; "
had_issue=true
status="FAIL"
elif [[ "$cpu_pct" -gt 80 ]] || [[ "$mem_pct" -gt 80 ]]; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 2 "Node Resources"
warn "$node: CPU ${cpu_pct}%, Mem ${mem_pct}%"
detail+="$node CPU=${cpu_pct}% Mem=${mem_pct}% [WARN]; "
had_issue=true
[[ "$status" != "FAIL" ]] && status="WARN"
else
detail+="$node CPU=${cpu_pct}% Mem=${mem_pct}% [OK]; "
fi
done <<< "$top"
[[ "$had_issue" == false ]] && pass "All nodes below 80% CPU and memory"
json_add "node_resources" "$status" "$detail"
}
# --- 3. Node Conditions ---
check_conditions() {
section 3 "Node Conditions"
local conditions detail=""
conditions=$($KUBECTL get nodes -o json | python3 -c '
import json, sys
data = json.load(sys.stdin)
for node in data["items"]:
name = node["metadata"]["name"]
for c in node["status"]["conditions"]:
if c["type"] in ("MemoryPressure","DiskPressure","PIDPressure") and c["status"] == "True":
print(name + ": " + c["type"])
' 2>&1) || true
if [[ -n "$conditions" ]]; then
[[ "$QUIET" == true ]] && section_always 3 "Node Conditions"
while IFS= read -r line; do
fail "$line"
done <<< "$conditions"
detail="$conditions"
json_add "node_conditions" "FAIL" "$detail"
else
pass "No pressure conditions on any node"
json_add "node_conditions" "PASS" "No pressure conditions"
fi
}
# --- 4. Problematic Pods ---
check_pods() {
section 4 "Problematic Pods"
local bad count detail="" status="PASS"
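# Merge two listings: the field selector misses CrashLoopBackOff pods (their
# phase is still Running), so a plain listing is grepped too, then duplicate
# namespace/pod pairs are dropped.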
bad=$( {
$KUBECTL get pods -A --no-headers --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null \
| grep -E 'CrashLoopBackOff|Error|Pending|Init:|ImagePullBackOff|ErrImagePull' || true
$KUBECTL get pods -A --no-headers 2>/dev/null \
| grep -E 'CrashLoopBackOff|ImagePullBackOff|ErrImagePull' || true
} | awk '!seen[$1,$2]++' | sed '/^$/d') || true
count=$(count_lines "$bad")
# Auto-fix CrashLoopBackOff pods with >10 restarts when --fix is enabled
if [[ "$FIX" == true && "$count" -gt 0 ]]; then
local fixed_count=0
while IFS= read -r line; do
[[ -z "$line" ]] && continue
local ns pod pod_status restarts restarts_clean
ns=$(echo "$line" | awk '{print $1}')
pod=$(echo "$line" | awk '{print $2}')
pod_status=$(echo "$line" | awk '{print $4}')
restarts=$(echo "$line" | awk '{print $5}')
restarts_clean=$(echo "$restarts" | grep -oE '^[0-9]+' || echo "0")
if [[ "$pod_status" == "CrashLoopBackOff" ]] && [[ "$restarts_clean" -gt 10 ]]; then
info "Deleting CrashLoopBackOff pod $ns/$pod (restarts: $restarts_clean)"
$KUBECTL delete pod -n "$ns" "$pod" --grace-period=0 2>/dev/null || true
fixed_count=$((fixed_count + 1))
fi
done <<< "$bad"
if [[ "$fixed_count" -gt 0 ]]; then
info "Deleted $fixed_count CrashLoopBackOff pod(s) with >10 restarts"
fi
fi
if [[ "$count" -eq 0 ]]; then
pass "No problematic pods"
detail="None"
elif [[ "$count" -le 10 ]]; then
[[ "$QUIET" == true ]] && section_always 4 "Problematic Pods"
warn "$count problematic pod(s):"
[[ "$JSON" != true ]] && echo "$bad" | while IFS= read -r line; do echo " $line"; done
detail="$count pods"
status="WARN"
else
[[ "$QUIET" == true ]] && section_always 4 "Problematic Pods"
fail "$count problematic pods (showing first 10):"
[[ "$JSON" != true ]] && echo "$bad" | head -10 | while IFS= read -r line; do echo " $line"; done
detail="$count pods"
status="FAIL"
fi
json_add "problematic_pods" "$status" "$detail"
}
# --- 5. Evicted/Failed Pods ---
check_evicted() {
section 5 "Evicted/Failed Pods"
local evicted count detail="" status="PASS"
evicted=$($KUBECTL get pods -A --no-headers --field-selector=status.phase=Failed 2>/dev/null || true)
count=$(count_lines "$evicted")
if [[ "$count" -eq 0 ]]; then
pass "No evicted or failed pods"
detail="0"
elif [[ "$count" -le 50 ]]; then
[[ "$QUIET" == true ]] && section_always 5 "Evicted/Failed Pods"
warn "$count evicted/failed pod(s)"
detail="$count pods"
status="WARN"
else
[[ "$QUIET" == true ]] && section_always 5 "Evicted/Failed Pods"
fail "$count evicted/failed pods"
detail="$count pods"
status="FAIL"
fi
if [[ "$FIX" == true && "$count" -gt 0 ]]; then
info "Deleting $count evicted/failed pods..."
$KUBECTL delete pods -A --field-selector=status.phase=Failed 2>/dev/null || true
info "Deleted evicted/failed pods"
fi
json_add "evicted_pods" "$status" "$detail"
}
# --- 6. DaemonSets ---
check_daemonsets() {
section 6 "DaemonSets"
local ds detail="" had_issue=false
ds=$($KUBECTL get daemonsets -A --no-headers 2>&1) || { fail "Cannot list DaemonSets"; json_add "daemonsets" "FAIL" "Cannot list"; return 0; }
while IFS= read -r line; do
local ns name desired ready
ns=$(echo "$line" | awk '{print $1}')
name=$(echo "$line" | awk '{print $2}')
desired=$(echo "$line" | awk '{print $3}')
ready=$(echo "$line" | awk '{print $5}')
if [[ "$desired" != "$ready" ]]; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 6 "DaemonSets"
fail "$ns/$name: desired=$desired ready=$ready"
detail+="$ns/$name desired=$desired ready=$ready; "
had_issue=true
fi
done <<< "$ds"
if [[ "$had_issue" == false ]]; then
pass "All DaemonSets healthy (desired == ready)"
json_add "daemonsets" "PASS" "All healthy"
else
json_add "daemonsets" "FAIL" "$detail"
fi
}
# --- 7. Deployments ---
check_deployments() {
section 7 "Deployments"
local deps detail="" had_issue=false
deps=$($KUBECTL get deployments -A --no-headers 2>&1) || { fail "Cannot list Deployments"; json_add "deployments" "FAIL" "Cannot list"; return 0; }
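# READY column is "current/desired", e.g. "2/3"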
while IFS= read -r line; do
local ns name ready current desired
ns=$(echo "$line" | awk '{print $1}')
name=$(echo "$line" | awk '{print $2}')
ready=$(echo "$line" | awk '{print $3}')
current=$(echo "$ready" | cut -d/ -f1)
desired=$(echo "$ready" | cut -d/ -f2)
if [[ "$current" != "$desired" ]]; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 7 "Deployments"
fail "$ns/$name: $current/$desired ready"
detail+="$ns/$name $current/$desired; "
had_issue=true
fi
done <<< "$deps"
if [[ "$had_issue" == false ]]; then
pass "All deployments fully available"
json_add "deployments" "PASS" "All available"
else
json_add "deployments" "FAIL" "$detail"
fi
}
# --- 8. PVC Status ---
check_pvcs() {
section 8 "PVC Status"
local pvcs detail="" had_issue=false
pvcs=$($KUBECTL get pvc -A --no-headers 2>&1) || true
if [[ -z "$pvcs" || "$pvcs" == *"No resources found"* ]]; then
pass "No PVCs in cluster"
json_add "pvcs" "PASS" "No PVCs"
return 0
fi
while IFS= read -r line; do
local ns name status
ns=$(echo "$line" | awk '{print $1}')
name=$(echo "$line" | awk '{print $2}')
status=$(echo "$line" | awk '{print $3}')
if [[ "$status" != "Bound" ]]; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 8 "PVC Status"
fail "$ns/$name: $status"
detail+="$ns/$name=$status; "
had_issue=true
fi
done <<< "$pvcs"
if [[ "$had_issue" == false ]]; then
pass "All PVCs Bound"
json_add "pvcs" "PASS" "All Bound"
else
json_add "pvcs" "FAIL" "$detail"
fi
}
# --- 9. HPA Health ---
check_hpa() {
section 9 "HPA Health"
local hpas detail="" had_issue=false status="PASS"
hpas=$($KUBECTL get hpa -A --no-headers 2>&1) || true
if [[ -z "$hpas" || "$hpas" == *"No resources found"* ]]; then
pass "No HPAs configured"
json_add "hpa" "PASS" "No HPAs"
return 0
fi
while IFS= read -r line; do
local ns name targets
ns=$(echo "$line" | awk '{print $1}')
name=$(echo "$line" | awk '{print $2}')
targets=$(echo "$line" | awk '{print $3}')
if echo "$targets" | grep -q '<unknown>'; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 9 "HPA Health"
fail "$ns/$name: targets=$targets (unknown metrics)"
detail+="$ns/$name=unknown; "
had_issue=true
status="FAIL"
else
# Parse percentage values from targets like "45%/80%, 30%/50%"
local pcts
pcts=$(echo "$targets" | grep -oE '[0-9]+%/' | tr -d '%/' || true)
if [[ -n "$pcts" ]]; then
while IFS= read -r pct; do
[[ -z "$pct" ]] && continue
if [[ "$pct" -gt 150 ]]; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 9 "HPA Health"
fail "$ns/$name: utilization at ${pct}%"
detail+="$ns/$name=${pct}%; "
had_issue=true
status="FAIL"
break
elif [[ "$pct" -gt 100 ]]; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 9 "HPA Health"
warn "$ns/$name: utilization at ${pct}%"
detail+="$ns/$name=${pct}%; "
had_issue=true
[[ "$status" != "FAIL" ]] && status="WARN"
break
fi
done <<< "$pcts"
fi
fi
done <<< "$hpas"
[[ "$had_issue" == false ]] && pass "All HPAs healthy"
json_add "hpa" "$status" "${detail:-All healthy}"
}
# --- 10. CronJob Failures ---
check_cronjobs() {
section 10 "CronJob Failures"
local failures detail=""
failures=$($KUBECTL get jobs -A -o json 2>/dev/null | python3 -c '
import json, sys
from datetime import datetime, timezone, timedelta
data = json.load(sys.stdin)
cutoff = datetime.now(timezone.utc) - timedelta(hours=24)
for job in data.get("items", []):
meta = job.get("metadata", {})
ns = meta.get("namespace", "")
name = meta.get("name", "")
owners = meta.get("ownerReferences", [])
is_cronjob = any(o.get("kind") == "CronJob" for o in owners)
if not is_cronjob:
continue
conditions = job.get("status", {}).get("conditions", [])
for c in conditions:
if c.get("type") == "Failed" and c.get("status") == "True":
ts = c.get("lastTransitionTime", "")
if ts:
try:
t = datetime.fromisoformat(ts.replace("Z", "+00:00"))
if t > cutoff:
print(f"{ns}/{name}: {c.get(\"reason\", \"Unknown\")}")
except:
print(f"{ns}/{name}: {c.get(\"reason\", \"Unknown\")}")
' 2>/dev/null) || true
if [[ -z "$failures" ]]; then
pass "No CronJob failures in last 24h"
json_add "cronjob_failures" "PASS" "None"
else
[[ "$QUIET" == true ]] && section_always 10 "CronJob Failures"
local count
count=$(count_lines "$failures")
fail "$count CronJob failure(s) in last 24h:"
[[ "$JSON" != true ]] && echo "$failures" | while IFS= read -r line; do echo " $line"; done
json_add "cronjob_failures" "FAIL" "$count failures"
fi
}
# --- 11. CrowdSec ---
check_crowdsec() {
section 11 "CrowdSec Agents"
local cs_pods not_running
cs_pods=$($KUBECTL get pods -n crowdsec --no-headers 2>/dev/null || true)
if [[ -z "$cs_pods" ]]; then
[[ "$QUIET" == true ]] && section_always 11 "CrowdSec Agents"
warn "CrowdSec namespace not found or empty"
json_add "crowdsec" "WARN" "No CrowdSec pods found"
return 0
fi
not_running=$(echo "$cs_pods" | awk '$3 != "Running" && $3 != "Completed" {print $1 ": " $3}' || true)
if [[ -n "$not_running" ]]; then
[[ "$QUIET" == true ]] && section_always 11 "CrowdSec Agents"
while IFS= read -r line; do
fail "CrowdSec pod not running: $line"
done <<< "$not_running"
json_add "crowdsec" "FAIL" "$not_running"
else
local total
total=$(count_lines "$cs_pods")
pass "All $total CrowdSec pods running"
json_add "crowdsec" "PASS" "$total pods running"
fi
}
# --- 12. Ingress ---
check_ingresses() {
section 12 "Ingress Routes"
local ingresses no_lb detail="" had_issue=false
ingresses=$($KUBECTL get ingress -A --no-headers 2>/dev/null || true)
if [[ -n "$ingresses" ]]; then
no_lb=$(echo "$ingresses" | awk '{if ($5 == "" || $5 == "<none>") print $1"/"$2}' || true)
if [[ -n "$no_lb" ]]; then
[[ "$QUIET" == true ]] && section_always 12 "Ingress Routes"
while IFS= read -r line; do
fail "Ingress missing LB IP: $line"
done <<< "$no_lb"
detail="Missing LB: $no_lb"
had_issue=true
fi
fi
# Check Traefik LB service
local traefik_svc_ip
traefik_svc_ip=$($KUBECTL get svc -n traefik traefik -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || true)
if [[ -z "$traefik_svc_ip" ]]; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 12 "Ingress Routes"
fail "Traefik LoadBalancer has no external IP"
detail+="Traefik LB missing IP; "
had_issue=true
else
detail+="Traefik LB=$traefik_svc_ip; "
fi
if [[ "$had_issue" == false ]]; then
pass "All ingresses have LB assignment (Traefik LB=$traefik_svc_ip)"
json_add "ingresses" "PASS" "$detail"
else
json_add "ingresses" "FAIL" "$detail"
fi
}
# --- 13. Prometheus Alerts ---
check_alerts() {
section 13 "Prometheus Alerts"
local alerts firing_count
# Try alertmanager first, then prometheus server
alerts=$($KUBECTL exec -n monitoring deploy/prometheus-alertmanager -- \
wget -q -O- http://localhost:9093/api/v2/alerts 2>/dev/null || true)
if [[ -z "$alerts" ]]; then
alerts=$($KUBECTL exec -n monitoring deploy/prometheus-server -- \
wget -q -O- http://localhost:9090/api/v1/alerts 2>/dev/null || true)
fi
if [[ -z "$alerts" ]]; then
[[ "$QUIET" == true ]] && section_always 13 "Prometheus Alerts"
warn "Could not query Prometheus/Alertmanager"
json_add "prometheus_alerts" "WARN" "Cannot query"
return 0
fi
firing_count=$(echo "$alerts" | python3 -c '
import json, sys
try:
data = json.load(sys.stdin)
if isinstance(data, list):
active = [a for a in data if a.get("status", {}).get("state") == "active"]
count = len(active)
names = [a.get("labels", {}).get("alertname", "?") for a in active]
print(f"{count}:" + ",".join(names) if count > 0 else "0:")
elif isinstance(data, dict) and "data" in data:
alerts_list = data["data"].get("alerts", [])
firing = [a for a in alerts_list if a.get("state") == "firing"]
count = len(firing)
names = [a.get("labels", {}).get("alertname", "?") for a in firing]
print(f"{count}:" + ",".join(names) if count > 0 else "0:")
else:
print("0:")
except:
print("-1:")
' 2>/dev/null || echo "-1:")
local count names
count=$(echo "$firing_count" | cut -d: -f1)
names=$(echo "$firing_count" | cut -d: -f2-)
if [[ "$count" == "-1" ]]; then
[[ "$QUIET" == true ]] && section_always 13 "Prometheus Alerts"
warn "Failed to parse alert data"
json_add "prometheus_alerts" "WARN" "Parse error"
elif [[ "$count" -eq 0 ]]; then
pass "No firing alerts"
json_add "prometheus_alerts" "PASS" "0 firing"
elif [[ "$count" -le 3 ]]; then
[[ "$QUIET" == true ]] && section_always 13 "Prometheus Alerts"
warn "$count firing alert(s): $names"
json_add "prometheus_alerts" "WARN" "$count firing: $names"
else
[[ "$QUIET" == true ]] && section_always 13 "Prometheus Alerts"
fail "$count firing alerts: $names"
json_add "prometheus_alerts" "FAIL" "$count firing: $names"
fi
}
# --- 14. Uptime Kuma ---
check_uptime_kuma() {
section 14 "Uptime Kuma Monitors"
local result
result=$(python3 -c '
import sys, os
try:
from uptime_kuma_api import UptimeKumaApi
except ImportError:
print("ERROR:uptime-kuma-api not installed")
sys.exit(0)
try:
password = os.environ.get("UPTIME_KUMA_PASSWORD", "")
if not password:
print("ERROR:UPTIME_KUMA_PASSWORD not set")
sys.exit(0)
api = UptimeKumaApi("https://uptime.viktorbarzin.me", timeout=120, wait_events=0.2)
api.login("admin", password)
monitors = api.get_monitors()
heartbeats = api.get_heartbeats()
internal_up = 0
internal_down = []
external_up = 0
external_down = []
paused_count = 0
for m in monitors:
mid = m.get("id")
name = m.get("name", "unknown")
active = m.get("active", True)
is_external = name.startswith("[External] ")
if not active:
paused_count += 1
continue
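# get_heartbeats() maps monitor id -> recent beats; status 1 means up
# (newer library versions return an enum, unwrapped via .value below)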
beats = heartbeats.get(mid, [])
if beats:
last_beat = beats[-1]
if isinstance(last_beat, list):
last_beat = last_beat[-1] if last_beat else {}
status = last_beat.get("status", 0) if isinstance(last_beat, dict) else 0
if hasattr(status, "value"):
status = status.value
is_up = (status == 1)
else:
is_up = False
if is_external:
if is_up:
external_up += 1
else:
external_down.append(name.replace("[External] ", ""))
else:
if is_up:
internal_up += 1
else:
internal_down.append(name)
api.disconnect()
int_down_names = ", ".join(internal_down) if internal_down else ""
ext_down_names = ", ".join(external_down) if external_down else ""
print(f"{len(internal_down)}:{internal_up}:{len(external_down)}:{external_up}:{paused_count}:{int_down_names}|{ext_down_names}")
except Exception as e:
print(f"CONN_ERROR:{e}")
' 2>/dev/null) || result="CONN_ERROR:python execution failed"
if [[ "$result" == "ERROR:"* ]]; then
[[ "$QUIET" == true ]] && section_always 14 "Uptime Kuma Monitors"
warn "Uptime Kuma: ${result#ERROR:}"
json_add "uptime_kuma" "WARN" "${result#ERROR:}"
elif [[ "$result" == "CONN_ERROR:"* ]]; then
[[ "$QUIET" == true ]] && section_always 14 "Uptime Kuma Monitors"
warn "Cannot connect to Uptime Kuma: ${result#CONN_ERROR:}"
json_add "uptime_kuma" "WARN" "Connection failed"
else
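# Success line format (printed by the python above):
# int_down:int_up:ext_down:ext_up:paused:int_names|ext_names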
local int_down int_up ext_down ext_up paused_count down_details
int_down=$(echo "$result" | cut -d: -f1)
int_up=$(echo "$result" | cut -d: -f2)
ext_down=$(echo "$result" | cut -d: -f3)
ext_up=$(echo "$result" | cut -d: -f4)
paused_count=$(echo "$result" | cut -d: -f5)
down_details=$(echo "$result" | cut -d: -f6-)
local int_down_names="${down_details%%|*}"
local ext_down_names="${down_details#*|}"
local total_down=$((int_down + ext_down))
local total_up=$((int_up + ext_up))
local total_active=$((total_up + total_down))
if [[ "$total_down" -eq 0 ]]; then
pass "All monitors up — internal: ${int_up}, external: ${ext_up} ($paused_count paused)"
json_add "uptime_kuma" "PASS" "internal: $int_up up, external: $ext_up up, $paused_count paused"
else
[[ "$QUIET" == true ]] && section_always 14 "Uptime Kuma Monitors"
local details=""
[[ "$int_down" -gt 0 ]] && details="internal down($int_down): $int_down_names"
[[ "$ext_down" -gt 0 ]] && { [[ -n "$details" ]] && details="$details; "; details="${details}external down($ext_down): $ext_down_names"; }
if [[ "$total_down" -le 3 ]]; then
warn "$total_down/$total_active down: $details"
json_add "uptime_kuma" "WARN" "$details"
else
fail "$total_down/$total_active down: $details"
json_add "uptime_kuma" "FAIL" "$details"
fi
fi
fi
}
# --- 15. ResourceQuota Pressure ---
check_resourcequota() {
section 15 "ResourceQuota Pressure"
local quotas detail="" had_issue=false status="PASS"
quotas=$($KUBECTL get resourcequota -A -o json 2>/dev/null) || { pass "No ResourceQuotas configured"; json_add "resourcequota" "PASS" "No quotas"; return 0; }
local pressure
pressure=$(echo "$quotas" | python3 -c '
import json, sys, re
def parse_cpu(val):
"""Convert CPU value to millicores."""
val = str(val)
if val.endswith("m"):
return float(val[:-1])
return float(val) * 1000
def parse_mem(val):
"""Convert memory value to bytes."""
val = str(val)
units = {"Ki": 1024, "Mi": 1024**2, "Gi": 1024**3, "Ti": 1024**4}
for suffix, mult in units.items():
if val.endswith(suffix):
return float(val[:-len(suffix)]) * mult
# Plain bytes or numeric
return float(val)
data = json.load(sys.stdin)
for item in data.get("items", []):
ns = item["metadata"]["namespace"]
name = item["metadata"]["name"]
status = item.get("status", {})
hard = status.get("hard", {})
used = status.get("used", {})
for resource, hard_val in hard.items():
used_val = used.get(resource, "0")
try:
if "cpu" in resource:
h = parse_cpu(hard_val)
u = parse_cpu(used_val)
elif "memory" in resource or "storage" in resource:
h = parse_mem(hard_val)
u = parse_mem(used_val)
elif resource == "pods":
h = float(hard_val)
u = float(used_val)
else:
continue
if h <= 0:
continue
pct = (u / h) * 100
if pct > 80:
level = "FAIL" if pct > 95 else "WARN"
print(f"{level}:{ns}/{name}:{resource}:{pct:.0f}%")
except (ValueError, ZeroDivisionError):
pass
' 2>/dev/null) || true
if [[ -z "$pressure" ]]; then
pass "All ResourceQuotas below 80% usage"
json_add "resourcequota" "PASS" "All below 80%"
else
[[ "$QUIET" == true ]] && section_always 15 "ResourceQuota Pressure"
while IFS= read -r line; do
local level ns_res resource pct
level=$(echo "$line" | cut -d: -f1)
ns_res=$(echo "$line" | cut -d: -f2)
resource=$(echo "$line" | cut -d: -f3)
pct=$(echo "$line" | cut -d: -f4)
if [[ "$level" == "FAIL" ]]; then
fail "$ns_res: $resource at $pct"
status="FAIL"
else
warn "$ns_res: $resource at $pct"
[[ "$status" != "FAIL" ]] && status="WARN"
fi
detail+="$ns_res $resource=$pct; "
had_issue=true
done <<< "$pressure"
json_add "resourcequota" "$status" "$detail"
fi
}
# --- 16. StatefulSets ---
check_statefulsets() {
section 16 "StatefulSets"
local sts detail="" had_issue=false
sts=$($KUBECTL get statefulsets -A --no-headers 2>&1) || true
if [[ -z "$sts" || "$sts" == *"No resources found"* ]]; then
pass "No StatefulSets in cluster"
json_add "statefulsets" "PASS" "No StatefulSets"
return 0
fi
while IFS= read -r line; do
local ns name ready current desired
ns=$(echo "$line" | awk '{print $1}')
name=$(echo "$line" | awk '{print $2}')
ready=$(echo "$line" | awk '{print $3}')
current=$(echo "$ready" | cut -d/ -f1)
desired=$(echo "$ready" | cut -d/ -f2)
if [[ "$current" != "$desired" ]]; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 16 "StatefulSets"
fail "$ns/$name: $current/$desired ready"
detail+="$ns/$name $current/$desired; "
had_issue=true
fi
done <<< "$sts"
if [[ "$had_issue" == false ]]; then
pass "All StatefulSets fully available"
json_add "statefulsets" "PASS" "All available"
else
json_add "statefulsets" "FAIL" "$detail"
fi
}
# --- 17. Node Disk Usage ---
check_node_disk() {
section 17 "Node Disk Usage"
local node_json detail="" had_issue=false status="PASS"
node_json=$($KUBECTL get nodes -o json 2>/dev/null) || { fail "Cannot get node info"; json_add "node_disk" "FAIL" "Cannot get nodes"; return 0; }
local disk_info
disk_info=$(echo "$node_json" | python3 -c '
import json, sys
def parse_storage(val):
"""Convert storage value to bytes."""
val = str(val)
units = {"Ki": 1024, "Mi": 1024**2, "Gi": 1024**3, "Ti": 1024**4}
for suffix, mult in units.items():
if val.endswith(suffix):
return float(val[:-len(suffix)]) * mult
return float(val)
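# NOTE: capacity - allocatable is the kubelet-reserved share of
# ephemeral-storage, a rough proxy rather than live disk usage.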
data = json.load(sys.stdin)
for node in data["items"]:
name = node["metadata"]["name"]
cap = node["status"].get("capacity", {})
alloc = node["status"].get("allocatable", {})
es_cap = cap.get("ephemeral-storage", "0")
es_alloc = alloc.get("ephemeral-storage", "0")
try:
c = parse_storage(es_cap)
a = parse_storage(es_alloc)
if c > 0:
used_pct = ((c - a) / c) * 100
if used_pct > 70: # Lower threshold after node2 containerd corruption incident
if used_pct > 85:
level = "FAIL" # Critical: Risk of containerd corruption
elif used_pct > 75:
level = "WARN" # Warning: Monitor closely
else:
level = "WARN" # Early warning
print(f"{level}:{name}:{used_pct:.0f}")
except (ValueError, ZeroDivisionError):
pass
' 2>/dev/null) || true
if [[ -z "$disk_info" ]]; then
pass "All nodes below 70% ephemeral-storage usage"
json_add "node_disk" "PASS" "All below 70%"
else
[[ "$QUIET" == true ]] && section_always 17 "Node Disk Usage"
while IFS= read -r line; do
local level node pct
level=$(echo "$line" | cut -d: -f1)
node=$(echo "$line" | cut -d: -f2)
pct=$(echo "$line" | cut -d: -f3)
if [[ "$level" == "FAIL" ]]; then
fail "$node: ephemeral-storage at ${pct}%"
status="FAIL"
else
warn "$node: ephemeral-storage at ${pct}%"
[[ "$status" != "FAIL" ]] && status="WARN"
fi
detail+="$node=${pct}%; "
had_issue=true
done <<< "$disk_info"
json_add "node_disk" "$status" "$detail"
fi
}
# --- 18. Helm Release Health ---
check_helm_releases() {
section 18 "Helm Release Health"
# Helm may not be available in the pod environment
if ! command -v helm &>/dev/null; then
pass "Helm not available (skipped)"
json_add "helm_releases" "PASS" "Helm not available"
return 0
fi
local releases detail="" had_issue=false status="PASS"
releases=$(helm list --all-namespaces --kubeconfig "$KUBECONFIG_PATH" -o json 2>/dev/null) || {
[[ "$QUIET" == true ]] && section_always 18 "Helm Release Health"
warn "Cannot list Helm releases"
json_add "helm_releases" "WARN" "Cannot list"
return 0
}
local bad_releases
bad_releases=$(echo "$releases" | python3 -c '
import json, sys
data = json.load(sys.stdin)
for r in data:
name = r.get("name", "?")
ns = r.get("namespace", "?")
st = r.get("status", "unknown")
if st != "deployed":
level = "FAIL" if st.startswith("pending") else "WARN"
print(f"{level}:{ns}/{name}:{st}")
' 2>/dev/null) || true
if [[ -z "$bad_releases" ]]; then
pass "All Helm releases in deployed state"
json_add "helm_releases" "PASS" "All deployed"
else
[[ "$QUIET" == true ]] && section_always 18 "Helm Release Health"
while IFS= read -r line; do
local level release_name release_status
level=$(echo "$line" | cut -d: -f1)
release_name=$(echo "$line" | cut -d: -f2)
release_status=$(echo "$line" | cut -d: -f3)
if [[ "$level" == "FAIL" ]]; then
fail "Helm release $release_name: $release_status (blocks terraform)"
status="FAIL"
else
warn "Helm release $release_name: $release_status"
[[ "$status" != "FAIL" ]] && status="WARN"
fi
detail+="$release_name=$release_status; "
had_issue=true
done <<< "$bad_releases"
json_add "helm_releases" "$status" "$detail"
fi
}
# --- 19. Kyverno Policy Engine ---
check_kyverno() {
section 19 "Kyverno Policy Engine"
local kv_pods not_running
kv_pods=$($KUBECTL get pods -n kyverno --no-headers 2>/dev/null || true)
if [[ -z "$kv_pods" ]]; then
[[ "$QUIET" == true ]] && section_always 19 "Kyverno Policy Engine"
fail "Kyverno namespace not found or empty — policy engine down, cascading cluster impact"
json_add "kyverno" "FAIL" "No Kyverno pods found"
return 0
fi
not_running=$(echo "$kv_pods" | awk '$3 != "Running" && $3 != "Completed" {print $1 ": " $3}' || true)
if [[ -n "$not_running" ]]; then
[[ "$QUIET" == true ]] && section_always 19 "Kyverno Policy Engine"
while IFS= read -r line; do
fail "Kyverno pod not running: $line"
done <<< "$not_running"
json_add "kyverno" "FAIL" "$not_running"
else
local total
total=$(count_lines "$kv_pods")
pass "All $total Kyverno pods running"
json_add "kyverno" "PASS" "$total pods running"
fi
}
# --- 20. NFS Connectivity ---
check_nfs() {
section 20 "NFS Connectivity"
# Try native tools first (available locally), fall back to kubectl-based check (pod environment)
if command -v showmount &>/dev/null; then
if showmount -e 192.168.1.127 &>/dev/null; then
pass "NFS server 192.168.1.127 reachable (exports listed)"
json_add "nfs" "PASS" "NFS reachable"
return 0
fi
fi
if command -v nc &>/dev/null; then
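# -z: probe only; -G 3: 3s connect timeout (BSD/macOS netcat flag; GNU/OpenBSD
# nc uses -w, in which case this probe fails and the pod fallback below runs)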
if nc -z -G 3 192.168.1.127 2049 &>/dev/null; then
pass "NFS server 192.168.1.127 port 2049 open"
json_add "nfs" "PASS" "NFS port open"
return 0
fi
fi
# Fallback: check if NFS-backed pods are running (works in pod environment)
local nfs_pods
nfs_pods=$($KUBECTL get pods -A -o json 2>/dev/null | python3 -c '
import json, sys
data = json.load(sys.stdin)
count = 0
for pod in data.get("items", []):
for vol in pod.get("spec", {}).get("volumes", []):
if "nfs" in vol:
if pod.get("status", {}).get("phase") == "Running":
count += 1
break
print(count)
' 2>/dev/null) || nfs_pods="0"
if [[ "$nfs_pods" -gt 0 ]]; then
pass "NFS healthy ($nfs_pods pods using NFS volumes are running)"
json_add "nfs" "PASS" "$nfs_pods NFS pods running"
else
[[ "$QUIET" == true ]] && section_always 20 "NFS Connectivity"
warn "Cannot verify NFS (showmount not available, no NFS pods found)"
json_add "nfs" "WARN" "Cannot verify"
fi
}
# --- 21. DNS Resolution ---
check_dns() {
section 21 "DNS Resolution"
local internal_ok=false external_ok=false detail=""
# Try dig first (available locally), fall back to python3 (pod environment)
# Use system resolver (no @server) so it works from any host or pod
if command -v dig &>/dev/null; then
if dig viktorbarzin.me +short +time=3 +tries=1 2>/dev/null | grep -q .; then
internal_ok=true
fi
if dig google.com +short +time=3 +tries=1 2>/dev/null | grep -q .; then
external_ok=true
fi
else
# Fallback: use python3 for DNS resolution (works in pod environment)
local result
result=$(python3 -c "
import socket
try:
socket.getaddrinfo('viktorbarzin.me', 443)
print('INTERNAL_OK')
except Exception:
print('INTERNAL_FAIL')
try:
socket.getaddrinfo('google.com', 443)
print('EXTERNAL_OK')
except Exception:
print('EXTERNAL_FAIL')
" 2>/dev/null) || result=""
if echo "$result" | grep -q "INTERNAL_OK"; then
internal_ok=true
fi
if echo "$result" | grep -q "EXTERNAL_OK"; then
external_ok=true
fi
fi
if [[ "$internal_ok" == true && "$external_ok" == true ]]; then
pass "DNS resolves both internal (viktorbarzin.me) and external (google.com)"
json_add "dns" "PASS" "Both resolve"
elif [[ "$internal_ok" == true || "$external_ok" == true ]]; then
[[ "$QUIET" == true ]] && section_always 21 "DNS Resolution"
if [[ "$internal_ok" == false ]]; then
warn "DNS: internal (viktorbarzin.me) failed, external (google.com) OK"
detail="Internal failed"
else
warn "DNS: internal (viktorbarzin.me) OK, external (google.com) failed"
detail="External failed"
fi
json_add "dns" "WARN" "$detail"
else
[[ "$QUIET" == true ]] && section_always 21 "DNS Resolution"
fail "DNS not resolving — both internal and external failed"
json_add "dns" "FAIL" "Both failed"
fi
}
# --- 22. TLS Certificate Expiry ---
check_tls_certs() {
section 22 "TLS Certificate Expiry"
local secrets detail="" had_issue=false status="PASS"
secrets=$($KUBECTL get secrets -A -o json 2>/dev/null) || {
[[ "$QUIET" == true ]] && section_always 22 "TLS Certificate Expiry"
warn "Cannot list secrets"
json_add "tls_certs" "WARN" "Cannot list secrets"
return 0
}
local cert_issues
cert_issues=$(echo "$secrets" | python3 -c '
import json, sys, base64, subprocess, hashlib
from datetime import datetime, timezone
data = json.load(sys.stdin)
seen_fingerprints = set()
results = []
for item in data.get("items", []):
if item.get("type") != "kubernetes.io/tls":
continue
ns = item["metadata"]["namespace"]
name = item["metadata"]["name"]
cert_data = item.get("data", {}).get("tls.crt", "")
if not cert_data:
continue
# Deduplicate by cert fingerprint
raw = base64.b64decode(cert_data)
fp = hashlib.sha256(raw).hexdigest()[:16]
if fp in seen_fingerprints:
continue
seen_fingerprints.add(fp)
# Parse certificate expiry with openssl
try:
result = subprocess.run(
["openssl", "x509", "-noout", "-enddate", "-subject"],
input=raw, capture_output=True, timeout=5
)
output = result.stdout.decode()
for line in output.splitlines():
if line.startswith("notAfter="):
date_str = line.split("=", 1)[1]
# Parse openssl date format: "Mon DD HH:MM:SS YYYY GMT"
try:
expiry = datetime.strptime(date_str.strip(), "%b %d %H:%M:%S %Y %Z")
expiry = expiry.replace(tzinfo=timezone.utc)
days_left = (expiry - datetime.now(timezone.utc)).days
if days_left <= 7:
print(f"FAIL:{ns}/{name}:{days_left}d")
elif days_left <= 30:
print(f"WARN:{ns}/{name}:{days_left}d")
except ValueError:
pass
except (subprocess.TimeoutExpired, Exception):
pass
' 2>/dev/null) || true
if [[ -z "$cert_issues" ]]; then
pass "All TLS certificates valid for >30 days"
json_add "tls_certs" "PASS" "All valid >30d"
else
[[ "$QUIET" == true ]] && section_always 22 "TLS Certificate Expiry"
while IFS= read -r line; do
local level cert_name days
level=$(echo "$line" | cut -d: -f1)
cert_name=$(echo "$line" | cut -d: -f2)
days=$(echo "$line" | cut -d: -f3)
if [[ "$level" == "FAIL" ]]; then
fail "TLS cert $cert_name expires in $days"
status="FAIL"
else
warn "TLS cert $cert_name expires in $days"
[[ "$status" != "FAIL" ]] && status="WARN"
fi
detail+="$cert_name=$days; "
had_issue=true
done <<< "$cert_issues"
json_add "tls_certs" "$status" "$detail"
fi
}
# --- 23. GPU Health ---
check_gpu() {
section 23 "GPU Health"
local gpu_pods not_running
gpu_pods=$($KUBECTL get pods -n nvidia --no-headers 2>/dev/null || true)
if [[ -z "$gpu_pods" ]]; then
[[ "$QUIET" == true ]] && section_always 23 "GPU Health"
warn "NVIDIA namespace not found or empty"
json_add "gpu" "WARN" "No GPU pods found"
return 0
fi
# Check specifically for device-plugin (critical for GPU scheduling)
local device_plugin_down=false
local other_down=false
local detail=""
while IFS= read -r line; do
local pod_name pod_status
pod_name=$(echo "$line" | awk '{print $1}')
pod_status=$(echo "$line" | awk '{print $3}')
if [[ "$pod_status" != "Running" && "$pod_status" != "Completed" ]]; then
if echo "$pod_name" | grep -q "device-plugin"; then
device_plugin_down=true
detail+="device-plugin $pod_name: $pod_status; "
else
other_down=true
detail+="$pod_name: $pod_status; "
fi
fi
done <<< "$gpu_pods"
if [[ "$device_plugin_down" == true ]]; then
[[ "$QUIET" == true ]] && section_always 23 "GPU Health"
fail "GPU device-plugin is down — GPU workloads cannot schedule"
json_add "gpu" "FAIL" "$detail"
elif [[ "$other_down" == true ]]; then
[[ "$QUIET" == true ]] && section_always 23 "GPU Health"
warn "Some GPU pods not running: $detail"
json_add "gpu" "WARN" "$detail"
else
local total
total=$(count_lines "$gpu_pods")
pass "All $total GPU pods running"
json_add "gpu" "PASS" "$total pods running"
fi
}
# --- 24. Cloudflare Tunnel ---
check_cloudflare_tunnel() {
section 24 "Cloudflare Tunnel"
local cf_pods running_count total_count
cf_pods=$($KUBECTL get pods -n cloudflared --no-headers 2>/dev/null || true)
if [[ -z "$cf_pods" ]]; then
[[ "$QUIET" == true ]] && section_always 24 "Cloudflare Tunnel"
fail "Cloudflare tunnel namespace not found or empty — external access broken"
json_add "cloudflare_tunnel" "FAIL" "No pods found"
return 0
fi
total_count=$(count_lines "$cf_pods")
running_count=$(echo "$cf_pods" | awk '$3 == "Running"' | wc -l | tr -d ' ')
if [[ "$running_count" -eq 0 ]]; then
[[ "$QUIET" == true ]] && section_always 24 "Cloudflare Tunnel"
fail "Cloudflare tunnel: 0/$total_count pods running — external access broken"
json_add "cloudflare_tunnel" "FAIL" "0/$total_count running"
elif [[ "$running_count" -lt "$total_count" ]]; then
[[ "$QUIET" == true ]] && section_always 24 "Cloudflare Tunnel"
warn "Cloudflare tunnel: $running_count/$total_count pods running (degraded)"
json_add "cloudflare_tunnel" "WARN" "$running_count/$total_count running"
else
pass "Cloudflare tunnel: all $total_count pods running"
json_add "cloudflare_tunnel" "PASS" "$total_count pods running"
fi
}
# --- 25. Advanced CPU Monitoring (Prometheus) ---
check_prometheus_cpu() {
section 25 "Advanced CPU Monitoring"
local cpu_query="100%20-%20(avg%20by%20(instance)%20(irate(node_cpu_seconds_total%7Bmode%3D%22idle%22%7D%5B5m%5D))%20*%20100)"
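# URL-decoded: 100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)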
local detail="" had_issue=false status="PASS"
# Start port-forward to Prometheus if not using in-cluster DNS
local prom_url pf_pid=""
if curl -s --connect-timeout 2 "http://prometheus-server.monitoring.svc.cluster.local/api/v1/query?query=up" &>/dev/null; then
prom_url="http://prometheus-server.monitoring.svc.cluster.local/api/v1/query"
else
local pf_port
pf_port=$(python3 -c 'import socket; s=socket.socket(); s.bind(("",0)); print(s.getsockname()[1]); s.close()')
$KUBECTL port-forward -n monitoring svc/prometheus-server "$pf_port:80" &>/dev/null &
pf_pid=$!
sleep 2
prom_url="http://127.0.0.1:${pf_port}/api/v1/query"
fi
# Cleanup port-forward on exit from this function
trap '[[ -n "$pf_pid" ]] && kill $pf_pid 2>/dev/null || true' RETURN
# Try to query Prometheus for CPU metrics
local cpu_data
cpu_data=$(curl -s --connect-timeout 10 "${prom_url}?query=${cpu_query}" 2>/dev/null) || {
warn "Prometheus not accessible for CPU monitoring"
json_add "prometheus_cpu" "WARN" "Prometheus unreachable"
return 0
}
# Parse JSON and check CPU usage
local cpu_results
cpu_results=$(echo "$cpu_data" | python3 -c "
import json, sys
try:
data = json.load(sys.stdin)
if data.get('status') == 'success':
for result in data['data']['result']:
instance = result['metric']['instance']
usage = float(result['value'][1])
# Map IP to node name
if '10.0.20.100' in instance:
node = 'k8s-master'
elif '10.0.20.101' in instance:
node = 'k8s-node1'
elif '10.0.20.102' in instance:
node = 'k8s-node2'
elif '10.0.20.103' in instance:
node = 'k8s-node3'
elif '10.0.20.104' in instance:
node = 'k8s-node4'
elif 'pve-node' in instance:
node = 'proxmox-host'
else:
node = instance
print(f'{node}:{usage:.1f}')
except Exception as e:
print(f'ERROR:{e}')
" 2>/dev/null) || true
if [[ "$cpu_results" == *"ERROR"* || -z "$cpu_results" ]]; then
warn "Failed to parse Prometheus CPU data"
json_add "prometheus_cpu" "WARN" "Parse failed"
return 0
fi
# Check CPU thresholds
while IFS=':' read -r node usage; do
[[ -z "$node" || -z "$usage" ]] && continue
usage_int=${usage%.*} # Remove decimal
if [[ "$usage_int" -gt 85 ]]; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 25 "Advanced CPU Monitoring"
fail "$node: ${usage}% CPU (critical)"
detail+="$node=${usage}% [CRIT]; "
had_issue=true
status="FAIL"
elif [[ "$usage_int" -gt 70 ]]; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 25 "Advanced CPU Monitoring"
warn "$node: ${usage}% CPU (high)"
detail+="$node=${usage}% [HIGH]; "
had_issue=true
[[ "$status" != "FAIL" ]] && status="WARN"
else
detail+="$node=${usage}% [OK]; "
fi
done <<< "$cpu_results"
[[ "$had_issue" == false ]] && pass "All nodes below 70% CPU usage (5m avg)"
json_add "prometheus_cpu" "$status" "$detail"
}
# --- 26. Power Monitoring ---
check_power_monitoring() {
section 26 "Power Monitoring"
local detail="" had_issue=false status="PASS"
# Start port-forward to Prometheus if not using in-cluster DNS
local prom_url pf_pid=""
if curl -s --connect-timeout 2 "http://prometheus-server.monitoring.svc.cluster.local/api/v1/query?query=up" &>/dev/null; then
prom_url="http://prometheus-server.monitoring.svc.cluster.local/api/v1/query"
else
local pf_port
pf_port=$(python3 -c 'import socket; s=socket.socket(); s.bind(("",0)); print(s.getsockname()[1]); s.close()')
$KUBECTL port-forward -n monitoring svc/prometheus-server "$pf_port:80" &>/dev/null &
pf_pid=$!
sleep 2
prom_url="http://127.0.0.1:${pf_port}/api/v1/query"
fi
trap '[[ -n "$pf_pid" ]] && kill $pf_pid 2>/dev/null || true' RETURN
# GPU Power monitoring
local gpu_query="DCGM_FI_DEV_POWER_USAGE"
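# DCGM exporter gauge: per-GPU power draw in watts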
local gpu_data
gpu_data=$(curl -s --connect-timeout 10 "${prom_url}?query=${gpu_query}" 2>/dev/null) || {
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 26 "Power Monitoring"
warn "GPU power metrics unavailable"
detail+="GPU metrics unavailable; "
had_issue=true
status="WARN"
}
if [[ -n "$gpu_data" && "$gpu_data" != *"error"* ]]; then
local gpu_results
gpu_results=$(echo "$gpu_data" | python3 -c "
import json, sys
try:
data = json.load(sys.stdin)
if data.get('status') == 'success':
for result in data['data']['result']:
hostname = result['metric'].get('Hostname', 'unknown')
power = float(result['value'][1])
print(f'{hostname}:{power:.1f}')
except Exception:
pass
" 2>/dev/null) || true
# Check GPU power thresholds (Tesla T4 TDP is ~70W)
while IFS=':' read -r node power; do
[[ -z "$node" || -z "$power" ]] && continue
power_int=${power%.*}
if [[ "$power_int" -gt 65 ]]; then # > 90% of T4 TDP
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 26 "Power Monitoring"
warn "GPU $node: ${power}W (high power draw)"
detail+="GPU-$node=${power}W [HIGH]; "
had_issue=true
[[ "$status" != "FAIL" ]] && status="WARN"
elif [[ "$power_int" -gt 50 ]]; then # > 70% of T4 TDP
detail+="GPU-$node=${power}W [ACTIVE]; "
else
detail+="GPU-$node=${power}W [IDLE]; "
fi
done <<< "$gpu_results"
fi
[[ "$had_issue" == false ]] && pass "Power consumption within normal ranges"
json_add "power_monitoring" "$status" "$detail"
}
# --- Summary ---
print_summary() {
if [[ "$JSON" == true ]]; then
echo "{"
echo " \"timestamp\": \"$(date -u +%Y-%m-%dT%H:%M:%SZ)\","
echo " \"pass\": $PASS_COUNT,"
echo " \"warn\": $WARN_COUNT,"
echo " \"fail\": $FAIL_COUNT,"
echo " \"checks\": ["
local first=true
for r in "${JSON_RESULTS[@]}"; do
if [[ "$first" == true ]]; then
echo " $r"
first=false
else
echo " ,$r"
fi
done
echo " ]"
echo "}"
return 0
fi
echo ""
echo -e "${BOLD}═══════════════════════════════════════${NC}"
echo -e "${BOLD} Cluster Health Summary${NC}"
echo -e "${BOLD}═══════════════════════════════════════${NC}"
echo -e " ${GREEN}PASS${NC}: $PASS_COUNT ${YELLOW}WARN${NC}: $WARN_COUNT ${RED}FAIL${NC}: $FAIL_COUNT"
echo ""
if [[ "$FAIL_COUNT" -gt 0 ]]; then
echo -e " Overall: ${RED}UNHEALTHY${NC}"
elif [[ "$WARN_COUNT" -gt 0 ]]; then
echo -e " Overall: ${YELLOW}DEGRADED${NC}"
else
echo -e " Overall: ${GREEN}HEALTHY${NC}"
fi
echo ""
}
# --- Slack Notification ---
# Human-readable check name mapping
friendly_check_name() {
case "$1" in
node_status) echo "Node Status" ;;
node_resources) echo "Node Resources" ;;
node_conditions) echo "Node Conditions" ;;
problematic_pods) echo "Problematic Pods" ;;
evicted_pods) echo "Evicted Pods" ;;
daemonsets) echo "DaemonSets" ;;
deployments) echo "Deployments" ;;
pvcs) echo "PVCs" ;;
hpa) echo "HPAs" ;;
cronjob_failures) echo "CronJob Failures" ;;
crowdsec) echo "CrowdSec" ;;
ingresses) echo "Ingresses" ;;
prometheus_alerts) echo "Prometheus Alerts" ;;
uptime_kuma) echo "Uptime Kuma" ;;
resourcequota) echo "Resource Quotas" ;;
statefulsets) echo "StatefulSets" ;;
node_disk) echo "Node Disk" ;;
helm_releases) echo "Helm Releases" ;;
kyverno) echo "Kyverno" ;;
nfs) echo "NFS Storage" ;;
dns) echo "DNS Resolution" ;;
tls_certs) echo "TLS Certificates" ;;
gpu) echo "GPU" ;;
cloudflare_tunnel) echo "Cloudflare Tunnel" ;;
prometheus_cpu) echo "Advanced CPU Monitoring" ;;
power_monitoring) echo "Power Monitoring" ;;
*) echo "$1" ;;
esac
}
send_slack() {
if [[ "$SEND_SLACK" != true ]]; then
return 0
fi
if [[ -z "${SLACK_WEBHOOK_URL:-}" ]]; then
[[ "$JSON" != true ]] && echo "WARNING: SLACK_WEBHOOK_URL not set, skipping Slack notification"
return 0
fi
# Gather stats for summary line
local node_count pod_count
node_count=$($KUBECTL get nodes --no-headers 2>/dev/null | wc -l | tr -d ' ')
pod_count=$($KUBECTL get pods -A --no-headers --field-selector=status.phase=Running 2>/dev/null | wc -l | tr -d ' ')
local total_checks=$((PASS_COUNT + WARN_COUNT + FAIL_COUNT))
# Use python3 to build the entire Slack payload from JSON_RESULTS
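# Block Kit layout: header section, optional divider + Failed/Warnings
# sections, and a context footer with node/pod counts and a timestamp.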
local json_results_str
json_results_str=$(printf '%s\n' "${JSON_RESULTS[@]}")
local json_payload
json_payload=$(echo "$json_results_str" | python3 -c "
import json, sys
CHECK_NAMES = {
'node_status': 'Node Status',
'node_resources': 'Node Resources',
'node_conditions': 'Node Conditions',
'problematic_pods': 'Problematic Pods',
'evicted_pods': 'Evicted Pods',
'daemonsets': 'DaemonSets',
'deployments': 'Deployments',
'pvcs': 'PVCs',
'hpa': 'HPAs',
'cronjob_failures': 'CronJob Failures',
'crowdsec': 'CrowdSec',
'ingresses': 'Ingresses',
'prometheus_alerts': 'Prometheus Alerts',
'uptime_kuma': 'Uptime Kuma',
'resourcequota': 'Resource Quotas',
'statefulsets': 'StatefulSets',
'node_disk': 'Node Disk',
'helm_releases': 'Helm Releases',
'kyverno': 'Kyverno',
'nfs': 'NFS Storage',
'dns': 'DNS Resolution',
'tls_certs': 'TLS Certificates',
'gpu': 'GPU',
'cloudflare_tunnel': 'Cloudflare Tunnel',
'prometheus_cpu': 'Advanced CPU Monitoring',
'power_monitoring': 'Power Monitoring',
}
def format_detail(check, detail):
\"\"\"Format detail text for readability. Truncate long lists, split semicolons.\"\"\"
detail = detail.rstrip('; ').strip()
# For checks with long comma-separated lists (e.g. Uptime Kuma down monitors),
# truncate to first 5 items with a count
if check == 'uptime_kuma' and ': ' in detail:
prefix, names_str = detail.split(': ', 1)
names = [n.strip() for n in names_str.split(',') if n.strip()]
if len(names) > 5:
shown = ', '.join(names[:5])
detail = f'{prefix}: {shown} (+{len(names) - 5} more)'
elif names:
detail = prefix + ': ' + ', '.join(names)
# For resource quotas and similar semicolon-separated items,
# split into separate lines
if '; ' in detail:
parts = [p.strip() for p in detail.split(';') if p.strip()]
if len(parts) > 1:
lines = '\\n'.join(f' \u2022 {p}' for p in parts)
return lines
return detail
# Parse results
fails = []
warns = []
for line in sys.stdin:
line = line.strip()
if not line:
continue
try:
d = json.loads(line)
except json.JSONDecodeError:
continue
status = d.get('status', '')
check = d.get('check', '')
detail = d.get('detail', '')
name = CHECK_NAMES.get(check, check)
formatted = format_detail(check, detail)
if status == 'FAIL':
fails.append((name, formatted))
elif status == 'WARN':
warns.append((name, formatted))
pass_count = ${PASS_COUNT}
warn_count = ${WARN_COUNT}
fail_count = ${FAIL_COUNT}
total = ${total_checks}
nodes = '${node_count}'
pods = '${pod_count}'
blocks = []
# Header block
if fail_count == 0 and warn_count == 0:
header = f':white_check_mark: *Cluster Health Check \u2014 All Clear*'
summary = f'{total}/{total} checks passed \u2022 {nodes} nodes \u2022 {pods} pods'
blocks.append({'type': 'section', 'text': {'type': 'mrkdwn', 'text': f'{header}\n{summary}'}})
else:
issue_count = fail_count + warn_count
emoji = ':rotating_light:' if fail_count > 0 else ':warning:'
header = f'{emoji} *Cluster Health Check \u2014 {issue_count} Issue(s)*'
summary = f':white_check_mark: {pass_count} passed \u2022 :warning: {warn_count} warnings \u2022 :x: {fail_count} failed'
blocks.append({'type': 'section', 'text': {'type': 'mrkdwn', 'text': f'{header}\n{summary}'}})
# Failed section
if fails:
blocks.append({'type': 'divider'})
lines = [':x: *Failed*']
for name, detail in fails:
if '\\n' in detail:
lines.append(f'\u2022 *{name}*:')
lines.append(detail)
else:
lines.append(f'\u2022 *{name}*: {detail}')
blocks.append({'type': 'section', 'text': {'type': 'mrkdwn', 'text': '\\n'.join(lines)}})
# Warnings section
if warns:
blocks.append({'type': 'divider'})
lines = [':warning: *Warnings*']
for name, detail in warns:
if '\\n' in detail:
lines.append(f'\u2022 *{name}*:')
lines.append(detail)
else:
lines.append(f'\u2022 *{name}*: {detail}')
blocks.append({'type': 'section', 'text': {'type': 'mrkdwn', 'text': '\\n'.join(lines)}})
# Footer with timestamp
from datetime import datetime, timezone
ts = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')
blocks.append({'type': 'context', 'elements': [{'type': 'mrkdwn', 'text': f'{nodes} nodes \u2022 {pods} pods \u2022 {ts}'}]})
payload = {'blocks': blocks}
print(json.dumps(payload))
")
if curl -s -X POST "$SLACK_WEBHOOK_URL" \
-H 'Content-Type: application/json' \
-d "$json_payload" >/dev/null 2>&1; then
[[ "$JSON" != true ]] && echo "Slack notification sent."
else
[[ "$JSON" != true ]] && echo "WARNING: Failed to send Slack notification"
fi
# Return success explicitly so `set -e` does not abort main() when the
# status echo is suppressed (e.g. under --json).
return 0
}
# --- Main ---
main() {
parse_args "$@"
if [[ "$JSON" != true ]]; then
echo -e "${BOLD}Cluster Health Check${NC}$(date '+%Y-%m-%d %H:%M:%S')"
echo -e "Kubeconfig: $KUBECONFIG_PATH"
if [[ "$FIX" == true ]]; then
echo -e "${YELLOW}Auto-fix mode enabled${NC}"
fi
fi
check_nodes
check_resources
check_conditions
check_pods
check_evicted
check_daemonsets
check_deployments
check_pvcs
check_hpa
check_cronjobs
check_crowdsec
check_ingresses
check_alerts
check_uptime_kuma
check_resourcequota
check_statefulsets
check_node_disk
check_helm_releases
check_kyverno
check_nfs
check_dns
check_tls_certs
check_gpu
check_cloudflare_tunnel
check_prometheus_cpu
check_power_monitoring
print_summary
send_slack
# Always exit 0 — reporting is done via Slack notification.
# Non-zero exits mark the CronJob as Failed, which triggers Prometheus
# JobFailed alerts, creating a circular alert loop.
exit 0
}
main "$@"