#!/usr/bin/env bash
# Cluster health check script (pod-compatible version).
# Runs 26 diagnostic checks against the Kubernetes cluster and prints
# a colour-coded report with PASS / WARN / FAIL for each section.
# Optionally posts results to Slack.
#
# Usage: ./cluster-health.sh [--fix] [--quiet|-q] [--json] [--kubeconfig PATH] [--no-slack]
#
# Environment:
#   KUBECONFIG           — path to kubeconfig (used in pod environment)
#   SLACK_WEBHOOK_URL    — Slack incoming webhook URL (required unless --no-slack)
#   UPTIME_KUMA_PASSWORD — Uptime Kuma admin password

set -euo pipefail

# --- Colors ---
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
BLUE='\033[0;34m'
BOLD='\033[1m'
NC='\033[0m'

# --- Globals ---
PASS_COUNT=0
WARN_COUNT=0
FAIL_COUNT=0
FIX=false
QUIET=false
JSON=false
SEND_SLACK=true
KUBECONFIG_PATH="${KUBECONFIG:-$(pwd)/config}"
KUBECTL=""
JSON_RESULTS=()
TOTAL_CHECKS=26

# --- Helpers ---
info() { [[ "$JSON" == true ]] && return 0; echo -e "${BLUE}[INFO]${NC} $*"; }
pass() { PASS_COUNT=$((PASS_COUNT + 1)); [[ "$JSON" == true ]] && return 0; [[ "$QUIET" == true ]] && return 0; echo -e "  ${GREEN}[PASS]${NC} $*"; }
warn() { WARN_COUNT=$((WARN_COUNT + 1)); [[ "$JSON" == true ]] && return 0; echo -e "  ${YELLOW}[WARN]${NC} $*"; }
fail() { FAIL_COUNT=$((FAIL_COUNT + 1)); [[ "$JSON" == true ]] && return 0; echo -e "  ${RED}[FAIL]${NC} $*"; }

section() {
  local num="$1" title="$2"
  [[ "$JSON" == true ]] && return 0
  [[ "$QUIET" == true ]] && return 0
  echo ""
  echo -e "${BOLD}[$num/$TOTAL_CHECKS] $title${NC}"
}

section_always() {
  local num="$1" title="$2"
  [[ "$JSON" == true ]] && return 0
  echo ""
  echo -e "${BOLD}[$num/$TOTAL_CHECKS] $title${NC}"
}

json_add() {
  local name="$1" status="$2" detail="$3"
  local escaped
  escaped=$(echo "$detail" | python3 -c 'import json,sys; print(json.dumps(sys.stdin.read().strip()))')
  JSON_RESULTS+=("{\"check\":\"$name\",\"status\":\"$status\",\"detail\":$escaped}")
}

# count lines in a variable, returning 0 for empty strings
count_lines() {
  local input="$1"
  if [[ -z "$input" ]]; then
    echo 0
  else
    echo "$input" | wc -l | tr -d ' '
  fi
}

# --- Argument parsing ---
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --fix) FIX=true; shift ;;
      --quiet|-q) QUIET=true; shift ;;
      --json) JSON=true; shift ;;
      --no-slack) SEND_SLACK=false; shift ;;
      --kubeconfig) KUBECONFIG_PATH="$2"; shift 2 ;;
      -h|--help)
        echo "Usage: $0 [--fix] [--quiet|-q] [--json] [--kubeconfig PATH] [--no-slack]"
        echo ""
        echo "Flags:"
        echo "  --fix               Auto-remediate safe issues (delete evicted/CrashLoopBackOff pods)"
        echo "  --quiet, -q         Only show WARN and FAIL sections"
        echo "  --json              Machine-readable JSON output"
        echo "  --kubeconfig PATH   Override kubeconfig (default: \$KUBECONFIG or \$(pwd)/config)"
        echo "  --no-slack          Skip Slack notification"
        exit 0
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done

  KUBECTL="kubectl --kubeconfig $KUBECONFIG_PATH"

  # Auto-source UPTIME_KUMA_PASSWORD from terraform.tfvars if not set
  if [[ -z "${UPTIME_KUMA_PASSWORD:-}" ]]; then
    local script_dir tfvars_file
    script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
    tfvars_file="${script_dir}/../terraform.tfvars"
    if [[ -f "$tfvars_file" ]]; then
      UPTIME_KUMA_PASSWORD=$(grep 'uptime_kuma_password' "$tfvars_file" | head -1 | sed 's/.*= *"\(.*\)"/\1/')
      export UPTIME_KUMA_PASSWORD
    fi
  fi
}

# --- 1.
Node Status --- check_nodes() { section 1 "Node Status" local nodes not_ready versions unique_versions detail="" nodes=$($KUBECTL get nodes --no-headers 2>&1) || { fail "Cannot reach cluster"; json_add "node_status" "FAIL" "Cannot reach cluster"; return 0; } not_ready=$(echo "$nodes" | awk '$2 != "Ready" {print $1}' || true) versions=$(echo "$nodes" | awk '{print $5}' | sort -u) unique_versions=$(echo "$versions" | wc -l | tr -d ' ') if [[ -n "$not_ready" ]]; then [[ "$QUIET" == true ]] && section_always 1 "Node Status" fail "NotReady nodes: $not_ready" detail="NotReady: $not_ready" json_add "node_status" "FAIL" "$detail" elif [[ "$unique_versions" -gt 1 ]]; then [[ "$QUIET" == true ]] && section_always 1 "Node Status" warn "Version mismatch across nodes: $(echo "$versions" | tr '\n' ' ')" detail="Version mismatch: $(echo "$versions" | tr '\n' ' ')" json_add "node_status" "WARN" "$detail" else pass "All nodes Ready, version $(echo "$versions" | head -1)" detail="All nodes Ready" json_add "node_status" "PASS" "$detail" fi } # --- 2. Node Resources --- check_resources() { section 2 "Node Resources" local top detail="" had_issue=false status="PASS" top=$($KUBECTL top nodes --no-headers 2>&1) || { fail "metrics-server unavailable"; json_add "node_resources" "FAIL" "metrics-server unavailable"; return 0; } while IFS= read -r line; do local node cpu_pct mem_pct node=$(echo "$line" | awk '{print $1}') cpu_pct=$(echo "$line" | awk '{print $3}' | tr -d '%') mem_pct=$(echo "$line" | awk '{print $5}' | tr -d '%') # Skip nodes where metrics are not yet available if [[ "$cpu_pct" == *"unknown"* ]] || [[ "$mem_pct" == *"unknown"* ]]; then detail+="$node metrics unavailable; " continue fi if [[ "$cpu_pct" -gt 90 ]] || [[ "$mem_pct" -gt 90 ]]; then [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 2 "Node Resources" fail "$node: CPU ${cpu_pct}%, Mem ${mem_pct}%" detail+="$node CPU=${cpu_pct}% Mem=${mem_pct}% [FAIL]; " had_issue=true status="FAIL" elif [[ "$cpu_pct" -gt 80 ]] || [[ "$mem_pct" -gt 80 ]]; then [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 2 "Node Resources" warn "$node: CPU ${cpu_pct}%, Mem ${mem_pct}%" detail+="$node CPU=${cpu_pct}% Mem=${mem_pct}% [WARN]; " had_issue=true [[ "$status" != "FAIL" ]] && status="WARN" else detail+="$node CPU=${cpu_pct}% Mem=${mem_pct}% [OK]; " fi done <<< "$top" [[ "$had_issue" == false ]] && pass "All nodes below 80% CPU and memory" json_add "node_resources" "$status" "$detail" } # --- 3. Node Conditions --- check_conditions() { section 3 "Node Conditions" local conditions detail="" conditions=$($KUBECTL get nodes -o json | python3 -c ' import json, sys data = json.load(sys.stdin) for node in data["items"]: name = node["metadata"]["name"] for c in node["status"]["conditions"]: if c["type"] in ("MemoryPressure","DiskPressure","PIDPressure") and c["status"] == "True": print(name + ": " + c["type"]) ' 2>&1) || true if [[ -n "$conditions" ]]; then [[ "$QUIET" == true ]] && section_always 3 "Node Conditions" while IFS= read -r line; do fail "$line" done <<< "$conditions" detail="$conditions" json_add "node_conditions" "FAIL" "$detail" else pass "No pressure conditions on any node" json_add "node_conditions" "PASS" "No pressure conditions" fi } # --- 4. 
Problematic Pods --- check_pods() { section 4 "Problematic Pods" local bad count detail="" status="PASS" bad=$( { $KUBECTL get pods -A --no-headers --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null \ | grep -E 'CrashLoopBackOff|Error|Pending|Init:|ImagePullBackOff|ErrImagePull' || true $KUBECTL get pods -A --no-headers 2>/dev/null \ | grep -E 'CrashLoopBackOff|ImagePullBackOff|ErrImagePull' || true } | awk '!seen[$1,$2]++' | sed '/^$/d') || true count=$(count_lines "$bad") # Auto-fix CrashLoopBackOff pods with >10 restarts when --fix is enabled if [[ "$FIX" == true && "$count" -gt 0 ]]; then local fixed_count=0 while IFS= read -r line; do [[ -z "$line" ]] && continue local ns pod pod_status restarts restarts_clean ns=$(echo "$line" | awk '{print $1}') pod=$(echo "$line" | awk '{print $2}') pod_status=$(echo "$line" | awk '{print $4}') restarts=$(echo "$line" | awk '{print $5}') restarts_clean=$(echo "$restarts" | grep -oE '^[0-9]+' || echo "0") if [[ "$pod_status" == "CrashLoopBackOff" ]] && [[ "$restarts_clean" -gt 10 ]]; then info "Deleting CrashLoopBackOff pod $ns/$pod (restarts: $restarts_clean)" $KUBECTL delete pod -n "$ns" "$pod" --grace-period=0 2>/dev/null || true fixed_count=$((fixed_count + 1)) fi done <<< "$bad" if [[ "$fixed_count" -gt 0 ]]; then info "Deleted $fixed_count CrashLoopBackOff pod(s) with >10 restarts" fi fi if [[ "$count" -eq 0 ]]; then pass "No problematic pods" detail="None" elif [[ "$count" -le 10 ]]; then [[ "$QUIET" == true ]] && section_always 4 "Problematic Pods" warn "$count problematic pod(s):" [[ "$JSON" != true ]] && echo "$bad" | while IFS= read -r line; do echo " $line"; done detail="$count pods" status="WARN" else [[ "$QUIET" == true ]] && section_always 4 "Problematic Pods" fail "$count problematic pods (showing first 10):" [[ "$JSON" != true ]] && echo "$bad" | head -10 | while IFS= read -r line; do echo " $line"; done detail="$count pods" status="FAIL" fi json_add "problematic_pods" "$status" "$detail" } # --- 5. Evicted/Failed Pods --- check_evicted() { section 5 "Evicted/Failed Pods" local evicted count detail="" status="PASS" evicted=$($KUBECTL get pods -A --no-headers --field-selector=status.phase=Failed 2>/dev/null || true) count=$(count_lines "$evicted") if [[ "$count" -eq 0 ]]; then pass "No evicted or failed pods" detail="0" elif [[ "$count" -le 50 ]]; then [[ "$QUIET" == true ]] && section_always 5 "Evicted/Failed Pods" warn "$count evicted/failed pod(s)" detail="$count pods" status="WARN" else [[ "$QUIET" == true ]] && section_always 5 "Evicted/Failed Pods" fail "$count evicted/failed pods" detail="$count pods" status="FAIL" fi if [[ "$FIX" == true && "$count" -gt 0 ]]; then info "Deleting $count evicted/failed pods..." $KUBECTL delete pods -A --field-selector=status.phase=Failed 2>/dev/null || true info "Deleted evicted/failed pods" fi json_add "evicted_pods" "$status" "$detail" } # --- 6. 
DaemonSets --- check_daemonsets() { section 6 "DaemonSets" local ds detail="" had_issue=false ds=$($KUBECTL get daemonsets -A --no-headers 2>&1) || { fail "Cannot list DaemonSets"; json_add "daemonsets" "FAIL" "Cannot list"; return 0; } while IFS= read -r line; do local ns name desired ready ns=$(echo "$line" | awk '{print $1}') name=$(echo "$line" | awk '{print $2}') desired=$(echo "$line" | awk '{print $3}') ready=$(echo "$line" | awk '{print $5}') if [[ "$desired" != "$ready" ]]; then [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 6 "DaemonSets" fail "$ns/$name: desired=$desired ready=$ready" detail+="$ns/$name desired=$desired ready=$ready; " had_issue=true fi done <<< "$ds" if [[ "$had_issue" == false ]]; then pass "All DaemonSets healthy (desired == ready)" json_add "daemonsets" "PASS" "All healthy" else json_add "daemonsets" "FAIL" "$detail" fi } # --- 7. Deployments --- check_deployments() { section 7 "Deployments" local deps detail="" had_issue=false deps=$($KUBECTL get deployments -A --no-headers 2>&1) || { fail "Cannot list Deployments"; json_add "deployments" "FAIL" "Cannot list"; return 0; } while IFS= read -r line; do local ns name ready current desired ns=$(echo "$line" | awk '{print $1}') name=$(echo "$line" | awk '{print $2}') ready=$(echo "$line" | awk '{print $3}') current=$(echo "$ready" | cut -d/ -f1) desired=$(echo "$ready" | cut -d/ -f2) if [[ "$current" != "$desired" ]]; then [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 7 "Deployments" fail "$ns/$name: $current/$desired ready" detail+="$ns/$name $current/$desired; " had_issue=true fi done <<< "$deps" if [[ "$had_issue" == false ]]; then pass "All deployments fully available" json_add "deployments" "PASS" "All available" else json_add "deployments" "FAIL" "$detail" fi } # --- 8. PVC Status --- check_pvcs() { section 8 "PVC Status" local pvcs detail="" had_issue=false pvcs=$($KUBECTL get pvc -A --no-headers 2>&1) || true if [[ -z "$pvcs" || "$pvcs" == *"No resources found"* ]]; then pass "No PVCs in cluster" json_add "pvcs" "PASS" "No PVCs" return 0 fi while IFS= read -r line; do local ns name status ns=$(echo "$line" | awk '{print $1}') name=$(echo "$line" | awk '{print $2}') status=$(echo "$line" | awk '{print $3}') if [[ "$status" != "Bound" ]]; then [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 8 "PVC Status" fail "$ns/$name: $status" detail+="$ns/$name=$status; " had_issue=true fi done <<< "$pvcs" if [[ "$had_issue" == false ]]; then pass "All PVCs Bound" json_add "pvcs" "PASS" "All Bound" else json_add "pvcs" "FAIL" "$detail" fi } # --- 9. 
HPA Health ---
check_hpa() {
  section 9 "HPA Health"
  local hpas detail="" had_issue=false status="PASS"
  hpas=$($KUBECTL get hpa -A --no-headers 2>&1) || true
  if [[ -z "$hpas" || "$hpas" == *"No resources found"* ]]; then
    pass "No HPAs configured"
    json_add "hpa" "PASS" "No HPAs"
    return 0
  fi
  while IFS= read -r line; do
    local ns name targets
    ns=$(echo "$line" | awk '{print $1}')
    name=$(echo "$line" | awk '{print $2}')
    targets=$(echo "$line" | awk '{print $3}')
    # kubectl prints "<unknown>/80%" in TARGETS when metrics are unavailable
    if echo "$targets" | grep -q '<unknown>'; then
      [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 9 "HPA Health"
      fail "$ns/$name: targets=$targets (unknown metrics)"
      detail+="$ns/$name=unknown; "
      had_issue=true
      status="FAIL"
    else
      # Parse percentage values from targets like "45%/80%, 30%/50%"
      local pcts
      pcts=$(echo "$targets" | grep -oE '[0-9]+%/' | tr -d '%/' || true)
      if [[ -n "$pcts" ]]; then
        while IFS= read -r pct; do
          [[ -z "$pct" ]] && continue
          if [[ "$pct" -gt 150 ]]; then
            [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 9 "HPA Health"
            fail "$ns/$name: utilization at ${pct}%"
            detail+="$ns/$name=${pct}%; "
            had_issue=true
            status="FAIL"
            break
          elif [[ "$pct" -gt 100 ]]; then
            [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 9 "HPA Health"
            warn "$ns/$name: utilization at ${pct}%"
            detail+="$ns/$name=${pct}%; "
            had_issue=true
            [[ "$status" != "FAIL" ]] && status="WARN"
            break
          fi
        done <<< "$pcts"
      fi
    fi
  done <<< "$hpas"
  [[ "$had_issue" == false ]] && pass "All HPAs healthy"
  json_add "hpa" "$status" "${detail:-All healthy}"
}

# --- 10. CronJob Failures ---
check_cronjobs() {
  section 10 "CronJob Failures"
  local failures detail=""
  failures=$($KUBECTL get jobs -A -o json 2>/dev/null | python3 -c '
import json, sys
from datetime import datetime, timezone, timedelta
data = json.load(sys.stdin)
cutoff = datetime.now(timezone.utc) - timedelta(hours=24)
for job in data.get("items", []):
    meta = job.get("metadata", {})
    ns = meta.get("namespace", "")
    name = meta.get("name", "")
    owners = meta.get("ownerReferences", [])
    is_cronjob = any(o.get("kind") == "CronJob" for o in owners)
    if not is_cronjob:
        continue
    conditions = job.get("status", {}).get("conditions", [])
    for c in conditions:
        if c.get("type") == "Failed" and c.get("status") == "True":
            reason = c.get("reason", "Unknown")
            ts = c.get("lastTransitionTime", "")
            if ts:
                try:
                    t = datetime.fromisoformat(ts.replace("Z", "+00:00"))
                    if t > cutoff:
                        print(f"{ns}/{name}: {reason}")
                except Exception:
                    print(f"{ns}/{name}: {reason}")
' 2>/dev/null) || true
  if [[ -z "$failures" ]]; then
    pass "No CronJob failures in last 24h"
    json_add "cronjob_failures" "PASS" "None"
  else
    [[ "$QUIET" == true ]] && section_always 10 "CronJob Failures"
    local count
    count=$(count_lines "$failures")
    fail "$count CronJob failure(s) in last 24h:"
    [[ "$JSON" != true ]] && echo "$failures" | while IFS= read -r line; do echo "    $line"; done
    json_add "cronjob_failures" "FAIL" "$count failures"
  fi
}
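# Scope of the CronJob check above: only Jobs whose ownerReferences include kind "CronJob"
# are inspected, and a failure is reported when a Failed=True condition has a
# lastTransitionTime within the past 24 hours (or a timestamp that cannot be parsed).
# Illustrative example (hypothetical names): a Job "backup-29012345" owned by CronJob
# "backup" that hit BackoffLimitExceeded an hour ago would be listed as
# "backup-ns/backup-29012345: BackoffLimitExceeded".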
# --- 11. CrowdSec ---
check_crowdsec() {
  section 11 "CrowdSec Agents"
  local cs_pods not_running
  cs_pods=$($KUBECTL get pods -n crowdsec --no-headers 2>/dev/null || true)
  if [[ -z "$cs_pods" ]]; then
    [[ "$QUIET" == true ]] && section_always 11 "CrowdSec Agents"
    warn "CrowdSec namespace not found or empty"
    json_add "crowdsec" "WARN" "No CrowdSec pods found"
    return 0
  fi
  not_running=$(echo "$cs_pods" | awk '$3 != "Running" && $3 != "Completed" {print $1 ": " $3}' || true)
  if [[ -n "$not_running" ]]; then
    [[ "$QUIET" == true ]] && section_always 11 "CrowdSec Agents"
    while IFS= read -r line; do
      fail "CrowdSec pod not running: $line"
    done <<< "$not_running"
    json_add "crowdsec" "FAIL" "$not_running"
  else
    local total
    total=$(count_lines "$cs_pods")
    pass "All $total CrowdSec pods running"
    json_add "crowdsec" "PASS" "$total pods running"
  fi
}

# --- 12. Ingress ---
check_ingresses() {
  section 12 "Ingress Routes"
  local ingresses no_lb detail="" had_issue=false
  ingresses=$($KUBECTL get ingress -A --no-headers 2>/dev/null || true)
  if [[ -n "$ingresses" ]]; then
    # Ingresses without an assigned load balancer show an empty ADDRESS column
    no_lb=$(echo "$ingresses" | awk '{if ($5 == "") print $1"/"$2}' || true)
    if [[ -n "$no_lb" ]]; then
      [[ "$QUIET" == true ]] && section_always 12 "Ingress Routes"
      while IFS= read -r line; do
        fail "Ingress missing LB IP: $line"
      done <<< "$no_lb"
      detail="Missing LB: $no_lb"
      had_issue=true
    fi
  fi
  # Check Traefik LB service
  local traefik_svc_ip
  traefik_svc_ip=$($KUBECTL get svc -n traefik traefik -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || true)
  if [[ -z "$traefik_svc_ip" ]]; then
    [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 12 "Ingress Routes"
    fail "Traefik LoadBalancer has no external IP"
    detail+="Traefik LB missing IP; "
    had_issue=true
  else
    detail+="Traefik LB=$traefik_svc_ip; "
  fi
  if [[ "$had_issue" == false ]]; then
    pass "All ingresses have LB assignment (Traefik LB=$traefik_svc_ip)"
    json_add "ingresses" "PASS" "$detail"
  else
    json_add "ingresses" "FAIL" "$detail"
  fi
}

# --- 13.
Prometheus Alerts --- check_alerts() { section 13 "Prometheus Alerts" local alerts firing_count # Try alertmanager first, then prometheus server alerts=$($KUBECTL exec -n monitoring deploy/prometheus-alertmanager -- \ wget -q -O- http://localhost:9093/api/v2/alerts 2>/dev/null || true) if [[ -z "$alerts" ]]; then alerts=$($KUBECTL exec -n monitoring deploy/prometheus-server -- \ wget -q -O- http://localhost:9090/api/v1/alerts 2>/dev/null || true) fi if [[ -z "$alerts" ]]; then [[ "$QUIET" == true ]] && section_always 13 "Prometheus Alerts" warn "Could not query Prometheus/Alertmanager" json_add "prometheus_alerts" "WARN" "Cannot query" return 0 fi firing_count=$(echo "$alerts" | python3 -c ' import json, sys try: data = json.load(sys.stdin) if isinstance(data, list): active = [a for a in data if a.get("status", {}).get("state") == "active"] count = len(active) names = [a.get("labels", {}).get("alertname", "?") for a in active] print(f"{count}:" + ",".join(names) if count > 0 else "0:") elif isinstance(data, dict) and "data" in data: alerts_list = data["data"].get("alerts", []) firing = [a for a in alerts_list if a.get("state") == "firing"] count = len(firing) names = [a.get("labels", {}).get("alertname", "?") for a in firing] print(f"{count}:" + ",".join(names) if count > 0 else "0:") else: print("0:") except: print("-1:") ' 2>/dev/null || echo "-1:") local count names count=$(echo "$firing_count" | cut -d: -f1) names=$(echo "$firing_count" | cut -d: -f2-) if [[ "$count" == "-1" ]]; then [[ "$QUIET" == true ]] && section_always 13 "Prometheus Alerts" warn "Failed to parse alert data" json_add "prometheus_alerts" "WARN" "Parse error" elif [[ "$count" -eq 0 ]]; then pass "No firing alerts" json_add "prometheus_alerts" "PASS" "0 firing" elif [[ "$count" -le 3 ]]; then [[ "$QUIET" == true ]] && section_always 13 "Prometheus Alerts" warn "$count firing alert(s): $names" json_add "prometheus_alerts" "WARN" "$count firing: $names" else [[ "$QUIET" == true ]] && section_always 13 "Prometheus Alerts" fail "$count firing alerts: $names" json_add "prometheus_alerts" "FAIL" "$count firing: $names" fi } # --- 14. 
Uptime Kuma --- check_uptime_kuma() { section 14 "Uptime Kuma Monitors" local result result=$(python3 -c ' import sys, os try: from uptime_kuma_api import UptimeKumaApi except ImportError: print("ERROR:uptime-kuma-api not installed") sys.exit(0) try: password = os.environ.get("UPTIME_KUMA_PASSWORD", "") if not password: print("ERROR:UPTIME_KUMA_PASSWORD not set") sys.exit(0) api = UptimeKumaApi("https://uptime.viktorbarzin.me", timeout=120, wait_events=0.2) api.login("admin", password) monitors = api.get_monitors() heartbeats = api.get_heartbeats() internal_up = 0 internal_down = [] external_up = 0 external_down = [] paused_count = 0 for m in monitors: mid = m.get("id") name = m.get("name", "unknown") active = m.get("active", True) is_external = name.startswith("[External] ") if not active: paused_count += 1 continue beats = heartbeats.get(mid, []) if beats: last_beat = beats[-1] if isinstance(last_beat, list): last_beat = last_beat[-1] if last_beat else {} status = last_beat.get("status", 0) if isinstance(last_beat, dict) else 0 if hasattr(status, "value"): status = status.value is_up = (status == 1) else: is_up = False if is_external: if is_up: external_up += 1 else: external_down.append(name.replace("[External] ", "")) else: if is_up: internal_up += 1 else: internal_down.append(name) api.disconnect() int_down_names = ", ".join(internal_down) if internal_down else "" ext_down_names = ", ".join(external_down) if external_down else "" print(f"{len(internal_down)}:{internal_up}:{len(external_down)}:{external_up}:{paused_count}:{int_down_names}|{ext_down_names}") except Exception as e: print(f"CONN_ERROR:{e}") ' 2>/dev/null) || result="CONN_ERROR:python execution failed" if [[ "$result" == "ERROR:"* ]]; then [[ "$QUIET" == true ]] && section_always 14 "Uptime Kuma Monitors" warn "Uptime Kuma: ${result#ERROR:}" json_add "uptime_kuma" "WARN" "${result#ERROR:}" elif [[ "$result" == "CONN_ERROR:"* ]]; then [[ "$QUIET" == true ]] && section_always 14 "Uptime Kuma Monitors" warn "Cannot connect to Uptime Kuma: ${result#CONN_ERROR:}" json_add "uptime_kuma" "WARN" "Connection failed" else local int_down int_up ext_down ext_up paused_count down_details int_down=$(echo "$result" | cut -d: -f1) int_up=$(echo "$result" | cut -d: -f2) ext_down=$(echo "$result" | cut -d: -f3) ext_up=$(echo "$result" | cut -d: -f4) paused_count=$(echo "$result" | cut -d: -f5) down_details=$(echo "$result" | cut -d: -f6-) local int_down_names="${down_details%%|*}" local ext_down_names="${down_details#*|}" local total_down=$((int_down + ext_down)) local total_up=$((int_up + ext_up)) local total_active=$((total_up + total_down)) if [[ "$total_down" -eq 0 ]]; then pass "All monitors up — internal: ${int_up}, external: ${ext_up} ($paused_count paused)" json_add "uptime_kuma" "PASS" "internal: $int_up up, external: $ext_up up, $paused_count paused" else [[ "$QUIET" == true ]] && section_always 14 "Uptime Kuma Monitors" local details="" [[ "$int_down" -gt 0 ]] && details="internal down($int_down): $int_down_names" [[ "$ext_down" -gt 0 ]] && { [[ -n "$details" ]] && details="$details; "; details="${details}external down($ext_down): $ext_down_names"; } if [[ "$total_down" -le 3 ]]; then warn "$total_down/$total_active down: $details" json_add "uptime_kuma" "WARN" "$details" else fail "$total_down/$total_active down: $details" json_add "uptime_kuma" "FAIL" "$details" fi fi fi } # --- 15. 
ResourceQuota Pressure --- check_resourcequota() { section 15 "ResourceQuota Pressure" local quotas detail="" had_issue=false status="PASS" quotas=$($KUBECTL get resourcequota -A -o json 2>/dev/null) || { pass "No ResourceQuotas configured"; json_add "resourcequota" "PASS" "No quotas"; return 0; } local pressure pressure=$(echo "$quotas" | python3 -c ' import json, sys, re def parse_cpu(val): """Convert CPU value to millicores.""" val = str(val) if val.endswith("m"): return float(val[:-1]) return float(val) * 1000 def parse_mem(val): """Convert memory value to bytes.""" val = str(val) units = {"Ki": 1024, "Mi": 1024**2, "Gi": 1024**3, "Ti": 1024**4} for suffix, mult in units.items(): if val.endswith(suffix): return float(val[:-len(suffix)]) * mult # Plain bytes or numeric return float(val) data = json.load(sys.stdin) for item in data.get("items", []): ns = item["metadata"]["namespace"] name = item["metadata"]["name"] status = item.get("status", {}) hard = status.get("hard", {}) used = status.get("used", {}) for resource, hard_val in hard.items(): used_val = used.get(resource, "0") try: if "cpu" in resource: h = parse_cpu(hard_val) u = parse_cpu(used_val) elif "memory" in resource or "storage" in resource: h = parse_mem(hard_val) u = parse_mem(used_val) elif resource == "pods": h = float(hard_val) u = float(used_val) else: continue if h <= 0: continue pct = (u / h) * 100 if pct > 80: level = "FAIL" if pct > 95 else "WARN" print(f"{level}:{ns}/{name}:{resource}:{pct:.0f}%") except (ValueError, ZeroDivisionError): pass ' 2>/dev/null) || true if [[ -z "$pressure" ]]; then pass "All ResourceQuotas below 80% usage" json_add "resourcequota" "PASS" "All below 80%" else [[ "$QUIET" == true ]] && section_always 15 "ResourceQuota Pressure" while IFS= read -r line; do local level ns_res resource pct level=$(echo "$line" | cut -d: -f1) ns_res=$(echo "$line" | cut -d: -f2) resource=$(echo "$line" | cut -d: -f3) pct=$(echo "$line" | cut -d: -f4) if [[ "$level" == "FAIL" ]]; then fail "$ns_res: $resource at $pct" status="FAIL" else warn "$ns_res: $resource at $pct" [[ "$status" != "FAIL" ]] && status="WARN" fi detail+="$ns_res $resource=$pct; " had_issue=true done <<< "$pressure" json_add "resourcequota" "$status" "$detail" fi } # --- 16. StatefulSets --- check_statefulsets() { section 16 "StatefulSets" local sts detail="" had_issue=false sts=$($KUBECTL get statefulsets -A --no-headers 2>&1) || true if [[ -z "$sts" || "$sts" == *"No resources found"* ]]; then pass "No StatefulSets in cluster" json_add "statefulsets" "PASS" "No StatefulSets" return 0 fi while IFS= read -r line; do local ns name ready current desired ns=$(echo "$line" | awk '{print $1}') name=$(echo "$line" | awk '{print $2}') ready=$(echo "$line" | awk '{print $3}') current=$(echo "$ready" | cut -d/ -f1) desired=$(echo "$ready" | cut -d/ -f2) if [[ "$current" != "$desired" ]]; then [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 16 "StatefulSets" fail "$ns/$name: $current/$desired ready" detail+="$ns/$name $current/$desired; " had_issue=true fi done <<< "$sts" if [[ "$had_issue" == false ]]; then pass "All StatefulSets fully available" json_add "statefulsets" "PASS" "All available" else json_add "statefulsets" "FAIL" "$detail" fi } # --- 17. 
Node Disk Usage --- check_node_disk() { section 17 "Node Disk Usage" local node_json detail="" had_issue=false status="PASS" node_json=$($KUBECTL get nodes -o json 2>/dev/null) || { fail "Cannot get node info"; json_add "node_disk" "FAIL" "Cannot get nodes"; return 0; } local disk_info disk_info=$(echo "$node_json" | python3 -c ' import json, sys def parse_storage(val): """Convert storage value to bytes.""" val = str(val) units = {"Ki": 1024, "Mi": 1024**2, "Gi": 1024**3, "Ti": 1024**4} for suffix, mult in units.items(): if val.endswith(suffix): return float(val[:-len(suffix)]) * mult return float(val) data = json.load(sys.stdin) for node in data["items"]: name = node["metadata"]["name"] cap = node["status"].get("capacity", {}) alloc = node["status"].get("allocatable", {}) es_cap = cap.get("ephemeral-storage", "0") es_alloc = alloc.get("ephemeral-storage", "0") try: c = parse_storage(es_cap) a = parse_storage(es_alloc) if c > 0: used_pct = ((c - a) / c) * 100 if used_pct > 70: # Lower threshold after node2 containerd corruption incident if used_pct > 85: level = "FAIL" # Critical: Risk of containerd corruption elif used_pct > 75: level = "WARN" # Warning: Monitor closely else: level = "WARN" # Early warning print(f"{level}:{name}:{used_pct:.0f}") except (ValueError, ZeroDivisionError): pass ' 2>/dev/null) || true if [[ -z "$disk_info" ]]; then pass "All nodes below 70% ephemeral-storage usage" json_add "node_disk" "PASS" "All below 70%" else [[ "$QUIET" == true ]] && section_always 17 "Node Disk Usage" while IFS= read -r line; do local level node pct level=$(echo "$line" | cut -d: -f1) node=$(echo "$line" | cut -d: -f2) pct=$(echo "$line" | cut -d: -f3) if [[ "$level" == "FAIL" ]]; then fail "$node: ephemeral-storage at ${pct}%" status="FAIL" else warn "$node: ephemeral-storage at ${pct}%" [[ "$status" != "FAIL" ]] && status="WARN" fi detail+="$node=${pct}%; " had_issue=true done <<< "$disk_info" json_add "node_disk" "$status" "$detail" fi } # --- 18. Helm Release Health --- check_helm_releases() { section 18 "Helm Release Health" # Helm may not be available in the pod environment if ! 
command -v helm &>/dev/null; then pass "Helm not available (skipped)" json_add "helm_releases" "PASS" "Helm not available" return 0 fi local releases detail="" had_issue=false status="PASS" releases=$(helm list --all-namespaces --kubeconfig "$KUBECONFIG_PATH" -o json 2>/dev/null) || { [[ "$QUIET" == true ]] && section_always 18 "Helm Release Health" warn "Cannot list Helm releases" json_add "helm_releases" "WARN" "Cannot list" return 0 } local bad_releases bad_releases=$(echo "$releases" | python3 -c ' import json, sys data = json.load(sys.stdin) for r in data: name = r.get("name", "?") ns = r.get("namespace", "?") st = r.get("status", "unknown") if st != "deployed": level = "FAIL" if st.startswith("pending") else "WARN" print(f"{level}:{ns}/{name}:{st}") ' 2>/dev/null) || true if [[ -z "$bad_releases" ]]; then pass "All Helm releases in deployed state" json_add "helm_releases" "PASS" "All deployed" else [[ "$QUIET" == true ]] && section_always 18 "Helm Release Health" while IFS= read -r line; do local level release_name release_status level=$(echo "$line" | cut -d: -f1) release_name=$(echo "$line" | cut -d: -f2) release_status=$(echo "$line" | cut -d: -f3) if [[ "$level" == "FAIL" ]]; then fail "Helm release $release_name: $release_status (blocks terraform)" status="FAIL" else warn "Helm release $release_name: $release_status" [[ "$status" != "FAIL" ]] && status="WARN" fi detail+="$release_name=$release_status; " had_issue=true done <<< "$bad_releases" json_add "helm_releases" "$status" "$detail" fi } # --- 19. Kyverno Policy Engine --- check_kyverno() { section 19 "Kyverno Policy Engine" local kv_pods not_running kv_pods=$($KUBECTL get pods -n kyverno --no-headers 2>/dev/null || true) if [[ -z "$kv_pods" ]]; then [[ "$QUIET" == true ]] && section_always 19 "Kyverno Policy Engine" fail "Kyverno namespace not found or empty — policy engine down, cascading cluster impact" json_add "kyverno" "FAIL" "No Kyverno pods found" return 0 fi not_running=$(echo "$kv_pods" | awk '$3 != "Running" && $3 != "Completed" {print $1 ": " $3}' || true) if [[ -n "$not_running" ]]; then [[ "$QUIET" == true ]] && section_always 19 "Kyverno Policy Engine" while IFS= read -r line; do fail "Kyverno pod not running: $line" done <<< "$not_running" json_add "kyverno" "FAIL" "$not_running" else local total total=$(count_lines "$kv_pods") pass "All $total Kyverno pods running" json_add "kyverno" "PASS" "$total pods running" fi } # --- 20. 
NFS Connectivity --- check_nfs() { section 20 "NFS Connectivity" # Try native tools first (available locally), fall back to kubectl-based check (pod environment) if command -v showmount &>/dev/null; then if showmount -e 192.168.1.127 &>/dev/null; then pass "NFS server 192.168.1.127 reachable (exports listed)" json_add "nfs" "PASS" "NFS reachable" return 0 fi fi if command -v nc &>/dev/null; then if nc -z -G 3 192.168.1.127 2049 &>/dev/null; then pass "NFS server 192.168.1.127 port 2049 open" json_add "nfs" "PASS" "NFS port open" return 0 fi fi # Fallback: check if NFS-backed pods are running (works in pod environment) local nfs_pods nfs_pods=$($KUBECTL get pods -A -o json 2>/dev/null | python3 -c ' import json, sys data = json.load(sys.stdin) count = 0 for pod in data.get("items", []): for vol in pod.get("spec", {}).get("volumes", []): if "nfs" in vol: if pod.get("status", {}).get("phase") == "Running": count += 1 break print(count) ' 2>/dev/null) || nfs_pods="0" if [[ "$nfs_pods" -gt 0 ]]; then pass "NFS healthy ($nfs_pods pods using NFS volumes are running)" json_add "nfs" "PASS" "$nfs_pods NFS pods running" else [[ "$QUIET" == true ]] && section_always 20 "NFS Connectivity" warn "Cannot verify NFS (showmount not available, no NFS pods found)" json_add "nfs" "WARN" "Cannot verify" fi } # --- 21. DNS Resolution --- check_dns() { section 21 "DNS Resolution" local internal_ok=false external_ok=false detail="" # Try dig first (available locally), fall back to python3 (pod environment) # Use system resolver (no @server) so it works from any host or pod if command -v dig &>/dev/null; then if dig viktorbarzin.me +short +time=3 +tries=1 2>/dev/null | grep -q .; then internal_ok=true fi if dig google.com +short +time=3 +tries=1 2>/dev/null | grep -q .; then external_ok=true fi else # Fallback: use python3 for DNS resolution (works in pod environment) local result result=$(python3 -c " import socket try: socket.getaddrinfo('viktorbarzin.me', 443) print('INTERNAL_OK') except Exception: print('INTERNAL_FAIL') try: socket.getaddrinfo('google.com', 443) print('EXTERNAL_OK') except Exception: print('EXTERNAL_FAIL') " 2>/dev/null) || result="" if echo "$result" | grep -q "INTERNAL_OK"; then internal_ok=true fi if echo "$result" | grep -q "EXTERNAL_OK"; then external_ok=true fi fi if [[ "$internal_ok" == true && "$external_ok" == true ]]; then pass "DNS resolves both internal (viktorbarzin.me) and external (google.com)" json_add "dns" "PASS" "Both resolve" elif [[ "$internal_ok" == true || "$external_ok" == true ]]; then [[ "$QUIET" == true ]] && section_always 21 "DNS Resolution" if [[ "$internal_ok" == false ]]; then warn "DNS: internal (viktorbarzin.me) failed, external (google.com) OK" detail="Internal failed" else warn "DNS: internal (viktorbarzin.me) OK, external (google.com) failed" detail="External failed" fi json_add "dns" "WARN" "$detail" else [[ "$QUIET" == true ]] && section_always 21 "DNS Resolution" fail "DNS not resolving — both internal and external failed" json_add "dns" "FAIL" "Both failed" fi } # --- 22. 
TLS Certificate Expiry --- check_tls_certs() { section 22 "TLS Certificate Expiry" local secrets detail="" had_issue=false status="PASS" secrets=$($KUBECTL get secrets -A -o json 2>/dev/null) || { [[ "$QUIET" == true ]] && section_always 22 "TLS Certificate Expiry" warn "Cannot list secrets" json_add "tls_certs" "WARN" "Cannot list secrets" return 0 } local cert_issues cert_issues=$(echo "$secrets" | python3 -c ' import json, sys, base64, subprocess, hashlib from datetime import datetime, timezone data = json.load(sys.stdin) seen_fingerprints = set() results = [] for item in data.get("items", []): if item.get("type") != "kubernetes.io/tls": continue ns = item["metadata"]["namespace"] name = item["metadata"]["name"] cert_data = item.get("data", {}).get("tls.crt", "") if not cert_data: continue # Deduplicate by cert fingerprint raw = base64.b64decode(cert_data) fp = hashlib.sha256(raw).hexdigest()[:16] if fp in seen_fingerprints: continue seen_fingerprints.add(fp) # Parse certificate expiry with openssl try: result = subprocess.run( ["openssl", "x509", "-noout", "-enddate", "-subject"], input=raw, capture_output=True, timeout=5 ) output = result.stdout.decode() for line in output.splitlines(): if line.startswith("notAfter="): date_str = line.split("=", 1)[1] # Parse openssl date format: "Mon DD HH:MM:SS YYYY GMT" try: expiry = datetime.strptime(date_str.strip(), "%b %d %H:%M:%S %Y %Z") expiry = expiry.replace(tzinfo=timezone.utc) days_left = (expiry - datetime.now(timezone.utc)).days if days_left <= 7: print(f"FAIL:{ns}/{name}:{days_left}d") elif days_left <= 30: print(f"WARN:{ns}/{name}:{days_left}d") except ValueError: pass except (subprocess.TimeoutExpired, Exception): pass ' 2>/dev/null) || true if [[ -z "$cert_issues" ]]; then pass "All TLS certificates valid for >30 days" json_add "tls_certs" "PASS" "All valid >30d" else [[ "$QUIET" == true ]] && section_always 22 "TLS Certificate Expiry" while IFS= read -r line; do local level cert_name days level=$(echo "$line" | cut -d: -f1) cert_name=$(echo "$line" | cut -d: -f2) days=$(echo "$line" | cut -d: -f3) if [[ "$level" == "FAIL" ]]; then fail "TLS cert $cert_name expires in $days" status="FAIL" else warn "TLS cert $cert_name expires in $days" [[ "$status" != "FAIL" ]] && status="WARN" fi detail+="$cert_name=$days; " had_issue=true done <<< "$cert_issues" json_add "tls_certs" "$status" "$detail" fi } # --- 23. 
GPU Health --- check_gpu() { section 23 "GPU Health" local gpu_pods not_running gpu_pods=$($KUBECTL get pods -n nvidia --no-headers 2>/dev/null || true) if [[ -z "$gpu_pods" ]]; then [[ "$QUIET" == true ]] && section_always 23 "GPU Health" warn "NVIDIA namespace not found or empty" json_add "gpu" "WARN" "No GPU pods found" return 0 fi # Check specifically for device-plugin (critical for GPU scheduling) local device_plugin_down=false local other_down=false local detail="" while IFS= read -r line; do local pod_name pod_status pod_name=$(echo "$line" | awk '{print $1}') pod_status=$(echo "$line" | awk '{print $3}') if [[ "$pod_status" != "Running" && "$pod_status" != "Completed" ]]; then if echo "$pod_name" | grep -q "device-plugin"; then device_plugin_down=true detail+="device-plugin $pod_name: $pod_status; " else other_down=true detail+="$pod_name: $pod_status; " fi fi done <<< "$gpu_pods" if [[ "$device_plugin_down" == true ]]; then [[ "$QUIET" == true ]] && section_always 23 "GPU Health" fail "GPU device-plugin is down — GPU workloads cannot schedule" json_add "gpu" "FAIL" "$detail" elif [[ "$other_down" == true ]]; then [[ "$QUIET" == true ]] && section_always 23 "GPU Health" warn "Some GPU pods not running: $detail" json_add "gpu" "WARN" "$detail" else local total total=$(count_lines "$gpu_pods") pass "All $total GPU pods running" json_add "gpu" "PASS" "$total pods running" fi } # --- 24. Cloudflare Tunnel --- check_cloudflare_tunnel() { section 24 "Cloudflare Tunnel" local cf_pods running_count total_count cf_pods=$($KUBECTL get pods -n cloudflared --no-headers 2>/dev/null || true) if [[ -z "$cf_pods" ]]; then [[ "$QUIET" == true ]] && section_always 24 "Cloudflare Tunnel" fail "Cloudflare tunnel namespace not found or empty — external access broken" json_add "cloudflare_tunnel" "FAIL" "No pods found" return 0 fi total_count=$(count_lines "$cf_pods") running_count=$(echo "$cf_pods" | awk '$3 == "Running"' | wc -l | tr -d ' ') if [[ "$running_count" -eq 0 ]]; then [[ "$QUIET" == true ]] && section_always 24 "Cloudflare Tunnel" fail "Cloudflare tunnel: 0/$total_count pods running — external access broken" json_add "cloudflare_tunnel" "FAIL" "0/$total_count running" elif [[ "$running_count" -lt "$total_count" ]]; then [[ "$QUIET" == true ]] && section_always 24 "Cloudflare Tunnel" warn "Cloudflare tunnel: $running_count/$total_count pods running (degraded)" json_add "cloudflare_tunnel" "WARN" "$running_count/$total_count running" else pass "Cloudflare tunnel: all $total_count pods running" json_add "cloudflare_tunnel" "PASS" "$total_count pods running" fi } # --- 25. Advanced CPU Monitoring (Prometheus) --- check_prometheus_cpu() { section 25 "Advanced CPU Monitoring" local cpu_query="100%20-%20(avg%20by%20(instance)%20(irate(node_cpu_seconds_total%7Bmode%3D%22idle%22%7D%5B5m%5D))%20*%20100)" local detail="" had_issue=false status="PASS" # Start port-forward to Prometheus if not using in-cluster DNS local prom_url pf_pid="" if curl -s --connect-timeout 2 "http://prometheus-server.monitoring.svc.cluster.local/api/v1/query?query=up" &>/dev/null; then prom_url="http://prometheus-server.monitoring.svc.cluster.local/api/v1/query" else local pf_port pf_port=$(python3 -c 'import socket; s=socket.socket(); s.bind(("",0)); print(s.getsockname()[1]); s.close()') $KUBECTL port-forward -n monitoring svc/prometheus-server "$pf_port:80" &>/dev/null & pf_pid=$! 
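    # The port-forward above was just forked into the background; give it a moment
    # to start listening locally before the Prometheus query below runs (the curl
    # call also carries its own --connect-timeout as a backstop).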
sleep 2 prom_url="http://127.0.0.1:${pf_port}/api/v1/query" fi # Cleanup port-forward on exit from this function trap '[[ -n "$pf_pid" ]] && kill $pf_pid 2>/dev/null || true' RETURN # Try to query Prometheus for CPU metrics local cpu_data cpu_data=$(curl -s --connect-timeout 10 "${prom_url}?query=${cpu_query}" 2>/dev/null) || { warn "Prometheus not accessible for CPU monitoring" json_add "prometheus_cpu" "WARN" "Prometheus unreachable" return 0 } # Parse JSON and check CPU usage local cpu_results cpu_results=$(echo "$cpu_data" | python3 -c " import json, sys try: data = json.load(sys.stdin) if data.get('status') == 'success': for result in data['data']['result']: instance = result['metric']['instance'] usage = float(result['value'][1]) # Map IP to node name if '10.0.20.100' in instance: node = 'k8s-master' elif '10.0.20.101' in instance: node = 'k8s-node1' elif '10.0.20.102' in instance: node = 'k8s-node2' elif '10.0.20.103' in instance: node = 'k8s-node3' elif '10.0.20.104' in instance: node = 'k8s-node4' elif 'pve-node' in instance: node = 'proxmox-host' else: node = instance print(f'{node}:{usage:.1f}') except Exception as e: print(f'ERROR:{e}') " 2>/dev/null) || true if [[ "$cpu_results" == *"ERROR"* || -z "$cpu_results" ]]; then warn "Failed to parse Prometheus CPU data" json_add "prometheus_cpu" "WARN" "Parse failed" return 0 fi # Check CPU thresholds while IFS=':' read -r node usage; do [[ -z "$node" || -z "$usage" ]] && continue usage_int=${usage%.*} # Remove decimal if [[ "$usage_int" -gt 85 ]]; then [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 25 "Advanced CPU Monitoring" fail "$node: ${usage}% CPU (critical)" detail+="$node=${usage}% [CRIT]; " had_issue=true status="FAIL" elif [[ "$usage_int" -gt 70 ]]; then [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 25 "Advanced CPU Monitoring" warn "$node: ${usage}% CPU (high)" detail+="$node=${usage}% [HIGH]; " had_issue=true [[ "$status" != "FAIL" ]] && status="WARN" else detail+="$node=${usage}% [OK]; " fi done <<< "$cpu_results" [[ "$had_issue" == false ]] && pass "All nodes below 70% CPU usage (5m avg)" json_add "prometheus_cpu" "$status" "$detail" } # --- 26. Power Monitoring --- check_power_monitoring() { section 26 "Power Monitoring" local detail="" had_issue=false status="PASS" # Start port-forward to Prometheus if not using in-cluster DNS local prom_url pf_pid="" if curl -s --connect-timeout 2 "http://prometheus-server.monitoring.svc.cluster.local/api/v1/query?query=up" &>/dev/null; then prom_url="http://prometheus-server.monitoring.svc.cluster.local/api/v1/query" else local pf_port pf_port=$(python3 -c 'import socket; s=socket.socket(); s.bind(("",0)); print(s.getsockname()[1]); s.close()') $KUBECTL port-forward -n monitoring svc/prometheus-server "$pf_port:80" &>/dev/null & pf_pid=$! 
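    # Same pattern as check 25: pause briefly so the background port-forward is
    # accepting connections before the power queries are issued.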
sleep 2 prom_url="http://127.0.0.1:${pf_port}/api/v1/query" fi trap '[[ -n "$pf_pid" ]] && kill $pf_pid 2>/dev/null || true' RETURN # GPU Power monitoring local gpu_query="DCGM_FI_DEV_POWER_USAGE" local gpu_data gpu_data=$(curl -s --connect-timeout 10 "${prom_url}?query=${gpu_query}" 2>/dev/null) || { [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 26 "Power Monitoring" warn "GPU power metrics unavailable" detail+="GPU metrics unavailable; " had_issue=true status="WARN" } if [[ -n "$gpu_data" && "$gpu_data" != *"error"* ]]; then local gpu_results gpu_results=$(echo "$gpu_data" | python3 -c " import json, sys try: data = json.load(sys.stdin) if data.get('status') == 'success': for result in data['data']['result']: hostname = result['metric'].get('Hostname', 'unknown') power = float(result['value'][1]) print(f'{hostname}:{power:.1f}') except Exception: pass " 2>/dev/null) || true # Check GPU power thresholds (Tesla T4 TDP is ~70W) while IFS=':' read -r node power; do [[ -z "$node" || -z "$power" ]] && continue power_int=${power%.*} if [[ "$power_int" -gt 65 ]]; then # > 90% of T4 TDP [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 26 "Power Monitoring" warn "GPU $node: ${power}W (high power draw)" detail+="GPU-$node=${power}W [HIGH]; " had_issue=true [[ "$status" != "FAIL" ]] && status="WARN" elif [[ "$power_int" -gt 50 ]]; then # > 70% of T4 TDP detail+="GPU-$node=${power}W [ACTIVE]; " else detail+="GPU-$node=${power}W [IDLE]; " fi done <<< "$gpu_results" fi [[ "$had_issue" == false ]] && pass "Power consumption within normal ranges" json_add "power_monitoring" "$status" "$detail" } # --- Summary --- print_summary() { if [[ "$JSON" == true ]]; then echo "{" echo " \"timestamp\": \"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"," echo " \"pass\": $PASS_COUNT," echo " \"warn\": $WARN_COUNT," echo " \"fail\": $FAIL_COUNT," echo " \"checks\": [" local first=true for r in "${JSON_RESULTS[@]}"; do if [[ "$first" == true ]]; then echo " $r" first=false else echo " ,$r" fi done echo " ]" echo "}" return 0 fi echo "" echo -e "${BOLD}═══════════════════════════════════════${NC}" echo -e "${BOLD} Cluster Health Summary${NC}" echo -e "${BOLD}═══════════════════════════════════════${NC}" echo -e " ${GREEN}PASS${NC}: $PASS_COUNT ${YELLOW}WARN${NC}: $WARN_COUNT ${RED}FAIL${NC}: $FAIL_COUNT" echo "" if [[ "$FAIL_COUNT" -gt 0 ]]; then echo -e " Overall: ${RED}UNHEALTHY${NC}" elif [[ "$WARN_COUNT" -gt 0 ]]; then echo -e " Overall: ${YELLOW}DEGRADED${NC}" else echo -e " Overall: ${GREEN}HEALTHY${NC}" fi echo "" } # --- Slack Notification --- # Human-readable check name mapping friendly_check_name() { case "$1" in node_status) echo "Node Status" ;; node_resources) echo "Node Resources" ;; node_conditions) echo "Node Conditions" ;; problematic_pods) echo "Problematic Pods" ;; evicted_pods) echo "Evicted Pods" ;; daemonsets) echo "DaemonSets" ;; deployments) echo "Deployments" ;; pvcs) echo "PVCs" ;; hpa) echo "HPAs" ;; cronjob_failures) echo "CronJob Failures" ;; crowdsec) echo "CrowdSec" ;; ingresses) echo "Ingresses" ;; prometheus_alerts) echo "Prometheus Alerts" ;; uptime_kuma) echo "Uptime Kuma" ;; resourcequota) echo "Resource Quotas" ;; statefulsets) echo "StatefulSets" ;; node_disk) echo "Node Disk" ;; helm_releases) echo "Helm Releases" ;; kyverno) echo "Kyverno" ;; nfs) echo "NFS Storage" ;; dns) echo "DNS Resolution" ;; tls_certs) echo "TLS Certificates" ;; gpu) echo "GPU" ;; cloudflare_tunnel) echo "Cloudflare Tunnel" ;; prometheus_cpu) echo "Advanced CPU Monitoring" ;; 
power_monitoring) echo "Power Monitoring" ;; *) echo "$1" ;; esac } send_slack() { if [[ "$SEND_SLACK" != true ]]; then return 0 fi if [[ -z "${SLACK_WEBHOOK_URL:-}" ]]; then [[ "$JSON" != true ]] && echo "WARNING: SLACK_WEBHOOK_URL not set, skipping Slack notification" return 0 fi # Gather stats for summary line local node_count pod_count node_count=$($KUBECTL get nodes --no-headers 2>/dev/null | wc -l | tr -d ' ') pod_count=$($KUBECTL get pods -A --no-headers --field-selector=status.phase=Running 2>/dev/null | wc -l | tr -d ' ') local total_checks=$((PASS_COUNT + WARN_COUNT + FAIL_COUNT)) # Use python3 to build the entire Slack payload from JSON_RESULTS local json_results_str json_results_str=$(printf '%s\n' "${JSON_RESULTS[@]}") local json_payload json_payload=$(echo "$json_results_str" | python3 -c " import json, sys CHECK_NAMES = { 'node_status': 'Node Status', 'node_resources': 'Node Resources', 'node_conditions': 'Node Conditions', 'problematic_pods': 'Problematic Pods', 'evicted_pods': 'Evicted Pods', 'daemonsets': 'DaemonSets', 'deployments': 'Deployments', 'pvcs': 'PVCs', 'hpa': 'HPAs', 'cronjob_failures': 'CronJob Failures', 'crowdsec': 'CrowdSec', 'ingresses': 'Ingresses', 'prometheus_alerts': 'Prometheus Alerts', 'uptime_kuma': 'Uptime Kuma', 'resourcequota': 'Resource Quotas', 'statefulsets': 'StatefulSets', 'node_disk': 'Node Disk', 'helm_releases': 'Helm Releases', 'kyverno': 'Kyverno', 'nfs': 'NFS Storage', 'dns': 'DNS Resolution', 'tls_certs': 'TLS Certificates', 'gpu': 'GPU', 'cloudflare_tunnel': 'Cloudflare Tunnel', 'prometheus_cpu': 'Advanced CPU Monitoring', 'power_monitoring': 'Power Monitoring', } def format_detail(check, detail): \"\"\"Format detail text for readability. Truncate long lists, split semicolons.\"\"\" detail = detail.rstrip('; ').strip() # For checks with long comma-separated lists (e.g. 
Uptime Kuma down monitors), # truncate to first 5 items with a count if check == 'uptime_kuma' and ': ' in detail: prefix, names_str = detail.split(': ', 1) names = [n.strip() for n in names_str.split(',') if n.strip()] if len(names) > 5: shown = ', '.join(names[:5]) detail = f'{prefix}: {shown} (+{len(names) - 5} more)' elif names: detail = prefix + ': ' + ', '.join(names) # For resource quotas and similar semicolon-separated items, # split into separate lines if '; ' in detail: parts = [p.strip() for p in detail.split(';') if p.strip()] if len(parts) > 1: lines = '\\n'.join(f' \u2022 {p}' for p in parts) return lines return detail # Parse results fails = [] warns = [] for line in sys.stdin: line = line.strip() if not line: continue try: d = json.loads(line) except json.JSONDecodeError: continue status = d.get('status', '') check = d.get('check', '') detail = d.get('detail', '') name = CHECK_NAMES.get(check, check) formatted = format_detail(check, detail) if status == 'FAIL': fails.append((name, formatted)) elif status == 'WARN': warns.append((name, formatted)) pass_count = ${PASS_COUNT} warn_count = ${WARN_COUNT} fail_count = ${FAIL_COUNT} total = ${total_checks} nodes = '${node_count}' pods = '${pod_count}' blocks = [] # Header block if fail_count == 0 and warn_count == 0: header = f':white_check_mark: *Cluster Health Check \u2014 All Clear*' summary = f'{total}/{total} checks passed \u2022 {nodes} nodes \u2022 {pods} pods' blocks.append({'type': 'section', 'text': {'type': 'mrkdwn', 'text': f'{header}\n{summary}'}}) else: issue_count = fail_count + warn_count emoji = ':rotating_light:' if fail_count > 0 else ':warning:' header = f'{emoji} *Cluster Health Check \u2014 {issue_count} Issue(s)*' summary = f':white_check_mark: {pass_count} passed \u2022 :warning: {warn_count} warnings \u2022 :x: {fail_count} failed' blocks.append({'type': 'section', 'text': {'type': 'mrkdwn', 'text': f'{header}\n{summary}'}}) # Failed section if fails: blocks.append({'type': 'divider'}) lines = [':x: *Failed*'] for name, detail in fails: if '\\n' in detail: lines.append(f'\u2022 *{name}*:') lines.append(detail) else: lines.append(f'\u2022 *{name}*: {detail}') blocks.append({'type': 'section', 'text': {'type': 'mrkdwn', 'text': '\\n'.join(lines)}}) # Warnings section if warns: blocks.append({'type': 'divider'}) lines = [':warning: *Warnings*'] for name, detail in warns: if '\\n' in detail: lines.append(f'\u2022 *{name}*:') lines.append(detail) else: lines.append(f'\u2022 *{name}*: {detail}') blocks.append({'type': 'section', 'text': {'type': 'mrkdwn', 'text': '\\n'.join(lines)}}) # Footer with timestamp from datetime import datetime, timezone ts = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC') blocks.append({'type': 'context', 'elements': [{'type': 'mrkdwn', 'text': f'{nodes} nodes \u2022 {pods} pods \u2022 {ts}'}]}) payload = {'blocks': blocks} print(json.dumps(payload)) ") curl -s -X POST "$SLACK_WEBHOOK_URL" \ -H 'Content-Type: application/json' \ -d "$json_payload" >/dev/null 2>&1 || { [[ "$JSON" != true ]] && echo "WARNING: Failed to send Slack notification" } [[ "$JSON" != true ]] && echo "Slack notification sent." 
} # --- Main --- main() { parse_args "$@" if [[ "$JSON" != true ]]; then echo -e "${BOLD}Cluster Health Check${NC} — $(date '+%Y-%m-%d %H:%M:%S')" echo -e "Kubeconfig: $KUBECONFIG_PATH" if [[ "$FIX" == true ]]; then echo -e "${YELLOW}Auto-fix mode enabled${NC}" fi fi check_nodes check_resources check_conditions check_pods check_evicted check_daemonsets check_deployments check_pvcs check_hpa check_cronjobs check_crowdsec check_ingresses check_alerts check_uptime_kuma check_resourcequota check_statefulsets check_node_disk check_helm_releases check_kyverno check_nfs check_dns check_tls_certs check_gpu check_cloudflare_tunnel check_prometheus_cpu check_power_monitoring print_summary send_slack # Always exit 0 — reporting is done via Slack notification. # Non-zero exits mark the CronJob as Failed, which triggers Prometheus # JobFailed alerts, creating a circular alert loop. exit 0 } main "$@"
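
# --- Example usage (illustrative; values shown are placeholders, not real output) ---
# Interactive run against the default kubeconfig, posting the report to Slack:
#   ./cluster-health.sh
# Quiet run that also auto-remediates evicted/CrashLoopBackOff pods, without Slack:
#   ./cluster-health.sh --quiet --fix --no-slack
# Machine-readable run; print_summary emits JSON of this shape:
#   ./cluster-health.sh --json --no-slack
#   {
#     "timestamp": "2025-01-01T00:00:00Z",
#     "pass": 24, "warn": 1, "fail": 1,
#     "checks": [
#       {"check": "node_status", "status": "PASS", "detail": "All nodes Ready"},
#       ...
#     ]
#   }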