- Loki: reduce memory limit from 6Gi to 4Gi (within LimitRange max)
- Grampsweb: increase memory to 2Gi (was OOMKilled at 512Mi)
- Fix PostgreSQLDown alert: check pod readiness instead of deployment
- Fix MySQLDown alert: check StatefulSet replicas instead of deployment
- Fix RedisDown alert: check StatefulSet replicas instead of deployment
- Fix NFSServerUnresponsive: aggregate all NFS versions cluster-wide
- Fix Uptime Kuma healthcheck: handle nested list heartbeat format
- Update etcd backup image to registry.k8s.io/etcd:3.6.5-0
#!/usr/bin/env bash

# Cluster health check script (pod-compatible version).
# Runs 24 diagnostic checks against the Kubernetes cluster and prints
# a colour-coded report with PASS / WARN / FAIL for each section.
# Optionally posts results to Slack.
#
# Usage: ./cluster-health.sh [--fix] [--quiet|-q] [--json] [--kubeconfig <path>] [--no-slack]
#
# Environment:
#   KUBECONFIG           — path to kubeconfig (used in pod environment)
#   SLACK_WEBHOOK_URL    — Slack incoming webhook URL (required unless --no-slack)
#   UPTIME_KUMA_PASSWORD — Uptime Kuma admin password

set -euo pipefail
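
# Example invocations (illustrative):
#   ./cluster-health.sh --quiet --no-slack     # local run, console output only
#   ./cluster-health.sh --json > health.json   # machine-readable report
#   ./cluster-health.sh --fix                  # also delete evicted/crashlooping pods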

# --- Colors ---
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
BLUE='\033[0;34m'
BOLD='\033[1m'
NC='\033[0m'

# --- Globals ---
PASS_COUNT=0
WARN_COUNT=0
FAIL_COUNT=0
FIX=false
QUIET=false
JSON=false
SEND_SLACK=true
KUBECONFIG_PATH="${KUBECONFIG:-$(pwd)/config}"
KUBECTL=""
JSON_RESULTS=()
TOTAL_CHECKS=24

# --- Helpers ---
info() { [[ "$JSON" == true ]] && return 0; echo -e "${BLUE}[INFO]${NC} $*"; }
pass() { PASS_COUNT=$((PASS_COUNT + 1)); [[ "$JSON" == true ]] && return 0; [[ "$QUIET" == true ]] && return 0; echo -e "  ${GREEN}[PASS]${NC} $*"; }
warn() { WARN_COUNT=$((WARN_COUNT + 1)); [[ "$JSON" == true ]] && return 0; echo -e "  ${YELLOW}[WARN]${NC} $*"; }
fail() { FAIL_COUNT=$((FAIL_COUNT + 1)); [[ "$JSON" == true ]] && return 0; echo -e "  ${RED}[FAIL]${NC} $*"; }

section() {
  local num="$1" title="$2"
  [[ "$JSON" == true ]] && return 0
  [[ "$QUIET" == true ]] && return 0
  echo ""
  echo -e "${BOLD}[$num/$TOTAL_CHECKS] $title${NC}"
}

# Same as section() but prints even under --quiet (used so a quiet run still
# shows the header for a section that has a WARN or FAIL to report).
section_always() {
  local num="$1" title="$2"
  [[ "$JSON" == true ]] && return 0
  echo ""
  echo -e "${BOLD}[$num/$TOTAL_CHECKS] $title${NC}"
}

json_add() {
  local name="$1" status="$2" detail="$3"
  local escaped
  escaped=$(echo "$detail" | python3 -c 'import json,sys; print(json.dumps(sys.stdin.read().strip()))')
  JSON_RESULTS+=("{\"check\":\"$name\",\"status\":\"$status\",\"detail\":$escaped}")
}
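
# Each json_add call appends one JSON object per check, e.g. (illustrative):
#   {"check":"node_status","status":"PASS","detail":"All nodes Ready"}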

# count lines in a variable, returning 0 for empty strings
count_lines() {
  local input="$1"
  if [[ -z "$input" ]]; then
    echo 0
  else
    echo "$input" | wc -l | tr -d ' '
  fi
}
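
# Illustrative: count_lines "$(printf 'a\nb')" prints 2; count_lines "" prints 0.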

# --- Argument parsing ---
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --fix) FIX=true; shift ;;
      --quiet|-q) QUIET=true; shift ;;
      --json) JSON=true; shift ;;
      --no-slack) SEND_SLACK=false; shift ;;
      --kubeconfig) KUBECONFIG_PATH="$2"; shift 2 ;;
      -h|--help)
        echo "Usage: $0 [--fix] [--quiet|-q] [--json] [--kubeconfig <path>] [--no-slack]"
        echo ""
        echo "Flags:"
        echo "  --fix               Auto-remediate safe issues (delete evicted/CrashLoopBackOff pods)"
        echo "  --quiet, -q         Only show WARN and FAIL sections"
        echo "  --json              Machine-readable JSON output"
        echo "  --kubeconfig PATH   Override kubeconfig (default: \$KUBECONFIG or \$(pwd)/config)"
        echo "  --no-slack          Skip Slack notification"
        exit 0
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done
  KUBECTL="kubectl --kubeconfig $KUBECONFIG_PATH"
}
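
# Note: $KUBECTL is deliberately expanded unquoted throughout so the command
# and its flag split into separate words; kubeconfig paths containing spaces
# are therefore not supported.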

# --- 1. Node Status ---
check_nodes() {
  section 1 "Node Status"
  local nodes not_ready versions unique_versions detail=""

  nodes=$($KUBECTL get nodes --no-headers 2>&1) || { fail "Cannot reach cluster"; json_add "node_status" "FAIL" "Cannot reach cluster"; return 0; }
  not_ready=$(echo "$nodes" | awk '$2 != "Ready" {print $1}' || true)
  versions=$(echo "$nodes" | awk '{print $5}' | sort -u)
  unique_versions=$(echo "$versions" | wc -l | tr -d ' ')

  if [[ -n "$not_ready" ]]; then
    [[ "$QUIET" == true ]] && section_always 1 "Node Status"
    fail "NotReady nodes: $not_ready"
    detail="NotReady: $not_ready"
    json_add "node_status" "FAIL" "$detail"
  elif [[ "$unique_versions" -gt 1 ]]; then
    [[ "$QUIET" == true ]] && section_always 1 "Node Status"
    warn "Version mismatch across nodes: $(echo "$versions" | tr '\n' ' ')"
    detail="Version mismatch: $(echo "$versions" | tr '\n' ' ')"
    json_add "node_status" "WARN" "$detail"
  else
    pass "All nodes Ready, version $(echo "$versions" | head -1)"
    detail="All nodes Ready"
    json_add "node_status" "PASS" "$detail"
  fi
}
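
# Assumed `kubectl get nodes --no-headers` column layout (illustrative row):
#   NAME     STATUS   ROLES           AGE   VERSION
#   node-1   Ready    control-plane   90d   v1.30.2
# so awk picks $2 = STATUS and $5 = VERSION above.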

# --- 2. Node Resources ---
check_resources() {
  section 2 "Node Resources"
  local top detail="" had_issue=false status="PASS"

  top=$($KUBECTL top nodes --no-headers 2>&1) || { fail "metrics-server unavailable"; json_add "node_resources" "FAIL" "metrics-server unavailable"; return 0; }

  while IFS= read -r line; do
    local node cpu_pct mem_pct
    node=$(echo "$line" | awk '{print $1}')
    cpu_pct=$(echo "$line" | awk '{print $3}' | tr -d '%')
    mem_pct=$(echo "$line" | awk '{print $5}' | tr -d '%')

    # Skip nodes where metrics are not yet available
    if [[ "$cpu_pct" == *"unknown"* ]] || [[ "$mem_pct" == *"unknown"* ]]; then
      detail+="$node metrics unavailable; "
      continue
    fi

    if [[ "$cpu_pct" -gt 90 ]] || [[ "$mem_pct" -gt 90 ]]; then
      [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 2 "Node Resources"
      fail "$node: CPU ${cpu_pct}%, Mem ${mem_pct}%"
      detail+="$node CPU=${cpu_pct}% Mem=${mem_pct}% [FAIL]; "
      had_issue=true
      status="FAIL"
    elif [[ "$cpu_pct" -gt 80 ]] || [[ "$mem_pct" -gt 80 ]]; then
      [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 2 "Node Resources"
      warn "$node: CPU ${cpu_pct}%, Mem ${mem_pct}%"
      detail+="$node CPU=${cpu_pct}% Mem=${mem_pct}% [WARN]; "
      had_issue=true
      [[ "$status" != "FAIL" ]] && status="WARN"
    else
      detail+="$node CPU=${cpu_pct}% Mem=${mem_pct}% [OK]; "
    fi
  done <<< "$top"

  [[ "$had_issue" == false ]] && pass "All nodes below 80% CPU and memory"
  json_add "node_resources" "$status" "$detail"
}
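
# Assumed `kubectl top nodes --no-headers` layout (illustrative row):
#   NAME     CPU(cores)   CPU%   MEMORY(bytes)   MEMORY%
#   node-1   512m         12%    8456Mi          54%
# so awk picks $3 = CPU% and $5 = MEMORY% above.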

# --- 3. Node Conditions ---
check_conditions() {
  section 3 "Node Conditions"
  local conditions detail=""

  conditions=$($KUBECTL get nodes -o json | python3 -c '
import json, sys
data = json.load(sys.stdin)
for node in data["items"]:
    name = node["metadata"]["name"]
    for c in node["status"]["conditions"]:
        if c["type"] in ("MemoryPressure", "DiskPressure", "PIDPressure") and c["status"] == "True":
            print(name + ": " + c["type"])
' 2>&1) || true

  if [[ -n "$conditions" ]]; then
    [[ "$QUIET" == true ]] && section_always 3 "Node Conditions"
    while IFS= read -r line; do
      fail "$line"
    done <<< "$conditions"
    detail="$conditions"
    json_add "node_conditions" "FAIL" "$detail"
  else
    pass "No pressure conditions on any node"
    json_add "node_conditions" "PASS" "No pressure conditions"
  fi
}

# --- 4. Problematic Pods ---
check_pods() {
  section 4 "Problematic Pods"
  local bad count detail="" status="PASS"

  bad=$( {
    $KUBECTL get pods -A --no-headers --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null \
      | grep -E 'CrashLoopBackOff|Error|Pending|Init:|ImagePullBackOff|ErrImagePull' || true
    $KUBECTL get pods -A --no-headers 2>/dev/null \
      | grep -E 'CrashLoopBackOff|ImagePullBackOff|ErrImagePull' || true
  } | awk '!seen[$1,$2]++' | sed '/^$/d') || true

  count=$(count_lines "$bad")

  # Auto-fix CrashLoopBackOff pods with >10 restarts when --fix is enabled
  if [[ "$FIX" == true && "$count" -gt 0 ]]; then
    local fixed_count=0
    while IFS= read -r line; do
      [[ -z "$line" ]] && continue
      local ns pod pod_status restarts restarts_clean
      ns=$(echo "$line" | awk '{print $1}')
      pod=$(echo "$line" | awk '{print $2}')
      pod_status=$(echo "$line" | awk '{print $4}')
      restarts=$(echo "$line" | awk '{print $5}')
      restarts_clean=$(echo "$restarts" | grep -oE '^[0-9]+' || echo "0")

      if [[ "$pod_status" == "CrashLoopBackOff" ]] && [[ "$restarts_clean" -gt 10 ]]; then
        info "Deleting CrashLoopBackOff pod $ns/$pod (restarts: $restarts_clean)"
        $KUBECTL delete pod -n "$ns" "$pod" --grace-period=0 2>/dev/null || true
        fixed_count=$((fixed_count + 1))
      fi
    done <<< "$bad"
    if [[ "$fixed_count" -gt 0 ]]; then
      info "Deleted $fixed_count CrashLoopBackOff pod(s) with >10 restarts"
    fi
  fi

  if [[ "$count" -eq 0 ]]; then
    pass "No problematic pods"
    detail="None"
  elif [[ "$count" -le 10 ]]; then
    [[ "$QUIET" == true ]] && section_always 4 "Problematic Pods"
    warn "$count problematic pod(s):"
    [[ "$JSON" != true ]] && echo "$bad" | while IFS= read -r line; do echo "    $line"; done
    detail="$count pods"
    status="WARN"
  else
    [[ "$QUIET" == true ]] && section_always 4 "Problematic Pods"
    fail "$count problematic pods (showing first 10):"
    [[ "$JSON" != true ]] && echo "$bad" | head -10 | while IFS= read -r line; do echo "    $line"; done
    detail="$count pods"
    status="FAIL"
  fi
  json_add "problematic_pods" "$status" "$detail"
}

# --- 5. Evicted/Failed Pods ---
check_evicted() {
  section 5 "Evicted/Failed Pods"
  local evicted count detail="" status="PASS"

  evicted=$($KUBECTL get pods -A --no-headers --field-selector=status.phase=Failed 2>/dev/null || true)
  count=$(count_lines "$evicted")

  if [[ "$count" -eq 0 ]]; then
    pass "No evicted or failed pods"
    detail="0"
  elif [[ "$count" -le 50 ]]; then
    [[ "$QUIET" == true ]] && section_always 5 "Evicted/Failed Pods"
    warn "$count evicted/failed pod(s)"
    detail="$count pods"
    status="WARN"
  else
    [[ "$QUIET" == true ]] && section_always 5 "Evicted/Failed Pods"
    fail "$count evicted/failed pods"
    detail="$count pods"
    status="FAIL"
  fi

  if [[ "$FIX" == true && "$count" -gt 0 ]]; then
    info "Deleting $count evicted/failed pods..."
    $KUBECTL delete pods -A --field-selector=status.phase=Failed 2>/dev/null || true
    info "Deleted evicted/failed pods"
  fi
  json_add "evicted_pods" "$status" "$detail"
}

# --- 6. DaemonSets ---
check_daemonsets() {
  section 6 "DaemonSets"
  local ds detail="" had_issue=false

  ds=$($KUBECTL get daemonsets -A --no-headers 2>&1) || { fail "Cannot list DaemonSets"; json_add "daemonsets" "FAIL" "Cannot list"; return 0; }

  while IFS= read -r line; do
    local ns name desired ready
    ns=$(echo "$line" | awk '{print $1}')
    name=$(echo "$line" | awk '{print $2}')
    desired=$(echo "$line" | awk '{print $3}')
    ready=$(echo "$line" | awk '{print $5}')

    if [[ "$desired" != "$ready" ]]; then
      [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 6 "DaemonSets"
      fail "$ns/$name: desired=$desired ready=$ready"
      detail+="$ns/$name desired=$desired ready=$ready; "
      had_issue=true
    fi
  done <<< "$ds"

  if [[ "$had_issue" == false ]]; then
    pass "All DaemonSets healthy (desired == ready)"
    json_add "daemonsets" "PASS" "All healthy"
  else
    json_add "daemonsets" "FAIL" "$detail"
  fi
}
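
# Assumed `kubectl get daemonsets -A --no-headers` layout (illustrative row):
#   NAMESPACE     NAME        DESIRED  CURRENT  READY  UP-TO-DATE  AVAILABLE  ...
#   kube-system   kube-proxy  5        5        5      5           5          ...
# so awk picks $3 = DESIRED and $5 = READY above.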

# --- 7. Deployments ---
check_deployments() {
  section 7 "Deployments"
  local deps detail="" had_issue=false

  deps=$($KUBECTL get deployments -A --no-headers 2>&1) || { fail "Cannot list Deployments"; json_add "deployments" "FAIL" "Cannot list"; return 0; }

  while IFS= read -r line; do
    local ns name ready current desired
    ns=$(echo "$line" | awk '{print $1}')
    name=$(echo "$line" | awk '{print $2}')
    ready=$(echo "$line" | awk '{print $3}')   # READY column, formatted "current/desired"
    current=$(echo "$ready" | cut -d/ -f1)
    desired=$(echo "$ready" | cut -d/ -f2)

    if [[ "$current" != "$desired" ]]; then
      [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 7 "Deployments"
      fail "$ns/$name: $current/$desired ready"
      detail+="$ns/$name $current/$desired; "
      had_issue=true
    fi
  done <<< "$deps"

  if [[ "$had_issue" == false ]]; then
    pass "All deployments fully available"
    json_add "deployments" "PASS" "All available"
  else
    json_add "deployments" "FAIL" "$detail"
  fi
}

# --- 8. PVC Status ---
check_pvcs() {
  section 8 "PVC Status"
  local pvcs detail="" had_issue=false

  pvcs=$($KUBECTL get pvc -A --no-headers 2>&1) || true
  if [[ -z "$pvcs" || "$pvcs" == *"No resources found"* ]]; then
    pass "No PVCs in cluster"
    json_add "pvcs" "PASS" "No PVCs"
    return 0
  fi

  while IFS= read -r line; do
    local ns name status
    ns=$(echo "$line" | awk '{print $1}')
    name=$(echo "$line" | awk '{print $2}')
    status=$(echo "$line" | awk '{print $3}')

    if [[ "$status" != "Bound" ]]; then
      [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 8 "PVC Status"
      fail "$ns/$name: $status"
      detail+="$ns/$name=$status; "
      had_issue=true
    fi
  done <<< "$pvcs"

  if [[ "$had_issue" == false ]]; then
    pass "All PVCs Bound"
    json_add "pvcs" "PASS" "All Bound"
  else
    json_add "pvcs" "FAIL" "$detail"
  fi
}

# --- 9. HPA Health ---
check_hpa() {
  section 9 "HPA Health"
  local hpas detail="" had_issue=false status="PASS"

  hpas=$($KUBECTL get hpa -A --no-headers 2>&1) || true
  if [[ -z "$hpas" || "$hpas" == *"No resources found"* ]]; then
    pass "No HPAs configured"
    json_add "hpa" "PASS" "No HPAs"
    return 0
  fi

  while IFS= read -r line; do
    local ns name targets
    ns=$(echo "$line" | awk '{print $1}')
    name=$(echo "$line" | awk '{print $2}')
    # TARGETS starts at column 4 (column 3 is REFERENCE) and may contain
    # spaces, so $4 is only used for display; pattern checks run on the
    # whole line to stay robust across kubectl output versions.
    targets=$(echo "$line" | awk '{print $4}')

    if echo "$line" | grep -q '<unknown>'; then
      [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 9 "HPA Health"
      fail "$ns/$name: targets=$targets (unknown metrics)"
      detail+="$ns/$name=unknown; "
      had_issue=true
      status="FAIL"
    else
      # Parse percentage values from targets like "45%/80%, 30%/50%"
      local pcts
      pcts=$(echo "$line" | grep -oE '[0-9]+%/' | tr -d '%/' || true)
      if [[ -n "$pcts" ]]; then
        while IFS= read -r pct; do
          [[ -z "$pct" ]] && continue
          if [[ "$pct" -gt 150 ]]; then
            [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 9 "HPA Health"
            fail "$ns/$name: utilization at ${pct}%"
            detail+="$ns/$name=${pct}%; "
            had_issue=true
            status="FAIL"
            break
          elif [[ "$pct" -gt 100 ]]; then
            [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 9 "HPA Health"
            warn "$ns/$name: utilization at ${pct}%"
            detail+="$ns/$name=${pct}%; "
            had_issue=true
            [[ "$status" != "FAIL" ]] && status="WARN"
            break
          fi
        done <<< "$pcts"
      fi
    fi
  done <<< "$hpas"

  [[ "$had_issue" == false ]] && pass "All HPAs healthy"
  json_add "hpa" "$status" "${detail:-All healthy}"
}
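
# TARGETS column formats seen across kubectl versions (illustrative):
#   "45%/80%"         single metric, older display
#   "cpu: 45%/80%"    newer display prefixes the metric name (note the space)
# Matching patterns against the whole line keeps the parsing version-agnostic.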

# --- 10. CronJob Failures ---
check_cronjobs() {
  section 10 "CronJob Failures"
  local failures detail=""

  failures=$($KUBECTL get jobs -A -o json 2>/dev/null | python3 -c '
import json, sys
from datetime import datetime, timezone, timedelta

data = json.load(sys.stdin)
cutoff = datetime.now(timezone.utc) - timedelta(hours=24)

for job in data.get("items", []):
    meta = job.get("metadata", {})
    ns = meta.get("namespace", "")
    name = meta.get("name", "")

    owners = meta.get("ownerReferences", [])
    is_cronjob = any(o.get("kind") == "CronJob" for o in owners)
    if not is_cronjob:
        continue

    conditions = job.get("status", {}).get("conditions", [])
    for c in conditions:
        if c.get("type") == "Failed" and c.get("status") == "True":
            ts = c.get("lastTransitionTime", "")
            if ts:
                # Bind reason outside the f-string: backslash escapes inside
                # f-string expressions are a SyntaxError before Python 3.12.
                reason = c.get("reason", "Unknown")
                try:
                    t = datetime.fromisoformat(ts.replace("Z", "+00:00"))
                    if t > cutoff:
                        print(f"{ns}/{name}: {reason}")
                except ValueError:
                    print(f"{ns}/{name}: {reason}")
' 2>/dev/null) || true

  if [[ -z "$failures" ]]; then
    pass "No CronJob failures in last 24h"
    json_add "cronjob_failures" "PASS" "None"
  else
    [[ "$QUIET" == true ]] && section_always 10 "CronJob Failures"
    local count
    count=$(count_lines "$failures")
    fail "$count CronJob failure(s) in last 24h:"
    [[ "$JSON" != true ]] && echo "$failures" | while IFS= read -r line; do echo "    $line"; done
    json_add "cronjob_failures" "FAIL" "$count failures"
  fi
}

# --- 11. CrowdSec ---
check_crowdsec() {
  section 11 "CrowdSec Agents"
  local cs_pods not_running

  cs_pods=$($KUBECTL get pods -n crowdsec --no-headers 2>/dev/null || true)
  if [[ -z "$cs_pods" ]]; then
    [[ "$QUIET" == true ]] && section_always 11 "CrowdSec Agents"
    warn "CrowdSec namespace not found or empty"
    json_add "crowdsec" "WARN" "No CrowdSec pods found"
    return 0
  fi

  not_running=$(echo "$cs_pods" | awk '$3 != "Running" && $3 != "Completed" {print $1 ": " $3}' || true)
  if [[ -n "$not_running" ]]; then
    [[ "$QUIET" == true ]] && section_always 11 "CrowdSec Agents"
    while IFS= read -r line; do
      fail "CrowdSec pod not running: $line"
    done <<< "$not_running"
    json_add "crowdsec" "FAIL" "$not_running"
  else
    local total
    total=$(count_lines "$cs_pods")
    pass "All $total CrowdSec pods running"
    json_add "crowdsec" "PASS" "$total pods running"
  fi
}

# --- 12. Ingress ---
check_ingresses() {
  section 12 "Ingress Routes"
  local no_lb detail="" had_issue=false

  # Use JSON output here: in plain-text output an empty ADDRESS column shifts
  # the remaining fields left, so an awk field test cannot detect it reliably.
  no_lb=$($KUBECTL get ingress -A -o json 2>/dev/null | python3 -c '
import json, sys
data = json.load(sys.stdin)
for ing in data.get("items", []):
    lb = ing.get("status", {}).get("loadBalancer", {}).get("ingress", [])
    if not lb:
        print(ing["metadata"]["namespace"] + "/" + ing["metadata"]["name"])
' 2>/dev/null) || true

  if [[ -n "$no_lb" ]]; then
    [[ "$QUIET" == true ]] && section_always 12 "Ingress Routes"
    while IFS= read -r line; do
      fail "Ingress missing LB IP: $line"
    done <<< "$no_lb"
    detail="Missing LB: $no_lb"
    had_issue=true
  fi

  # Check Traefik LB service
  local traefik_svc_ip
  traefik_svc_ip=$($KUBECTL get svc -n traefik traefik -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || true)
  if [[ -z "$traefik_svc_ip" ]]; then
    [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 12 "Ingress Routes"
    fail "Traefik LoadBalancer has no external IP"
    detail+="Traefik LB missing IP; "
    had_issue=true
  else
    detail+="Traefik LB=$traefik_svc_ip; "
  fi

  if [[ "$had_issue" == false ]]; then
    pass "All ingresses have LB assignment (Traefik LB=$traefik_svc_ip)"
    json_add "ingresses" "PASS" "$detail"
  else
    json_add "ingresses" "FAIL" "$detail"
  fi
}

# --- 13. Prometheus Alerts ---
check_alerts() {
  section 13 "Prometheus Alerts"
  local alerts firing_count

  # Try alertmanager first, then prometheus server
  alerts=$($KUBECTL exec -n monitoring deploy/prometheus-alertmanager -- \
    wget -q -O- http://localhost:9093/api/v2/alerts 2>/dev/null || true)

  if [[ -z "$alerts" ]]; then
    alerts=$($KUBECTL exec -n monitoring deploy/prometheus-server -- \
      wget -q -O- http://localhost:9090/api/v1/alerts 2>/dev/null || true)
  fi

  if [[ -z "$alerts" ]]; then
    [[ "$QUIET" == true ]] && section_always 13 "Prometheus Alerts"
    warn "Could not query Prometheus/Alertmanager"
    json_add "prometheus_alerts" "WARN" "Cannot query"
    return 0
  fi

  firing_count=$(echo "$alerts" | python3 -c '
import json, sys
try:
    data = json.load(sys.stdin)
    if isinstance(data, list):
        # Alertmanager v2 API returns a bare list of alert objects
        active = [a for a in data if a.get("status", {}).get("state") == "active"]
        names = [a.get("labels", {}).get("alertname", "?") for a in active]
        print(f"{len(active)}:" + ",".join(names))
    elif isinstance(data, dict) and "data" in data:
        # Prometheus v1 API wraps alerts in {"data": {"alerts": [...]}}
        alerts_list = data["data"].get("alerts", [])
        firing = [a for a in alerts_list if a.get("state") == "firing"]
        names = [a.get("labels", {}).get("alertname", "?") for a in firing]
        print(f"{len(firing)}:" + ",".join(names))
    else:
        print("0:")
except Exception:
    print("-1:")
' 2>/dev/null || echo "-1:")

  local count names
  count=$(echo "$firing_count" | cut -d: -f1)
  names=$(echo "$firing_count" | cut -d: -f2-)

  if [[ "$count" == "-1" ]]; then
    [[ "$QUIET" == true ]] && section_always 13 "Prometheus Alerts"
    warn "Failed to parse alert data"
    json_add "prometheus_alerts" "WARN" "Parse error"
  elif [[ "$count" -eq 0 ]]; then
    pass "No firing alerts"
    json_add "prometheus_alerts" "PASS" "0 firing"
  elif [[ "$count" -le 3 ]]; then
    [[ "$QUIET" == true ]] && section_always 13 "Prometheus Alerts"
    warn "$count firing alert(s): $names"
    json_add "prometheus_alerts" "WARN" "$count firing: $names"
  else
    [[ "$QUIET" == true ]] && section_always 13 "Prometheus Alerts"
    fail "$count firing alerts: $names"
    json_add "prometheus_alerts" "FAIL" "$count firing: $names"
  fi
}
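
# Response shapes handled above (illustrative, abbreviated):
#   Alertmanager v2: [{"labels":{"alertname":"..."},"status":{"state":"active"}}]
#   Prometheus v1:   {"status":"success","data":{"alerts":[{"state":"firing"}]}}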

# --- 14. Uptime Kuma ---
check_uptime_kuma() {
  section 14 "Uptime Kuma Monitors"
  local result

  result=$(python3 -c '
import sys, os
try:
    from uptime_kuma_api import UptimeKumaApi
except ImportError:
    print("ERROR:uptime-kuma-api not installed")
    sys.exit(0)

try:
    api = UptimeKumaApi("https://uptime.viktorbarzin.me", timeout=30)
    password = os.environ.get("UPTIME_KUMA_PASSWORD", "")
    if not password:
        print("ERROR:UPTIME_KUMA_PASSWORD not set")
        sys.exit(0)
    api.login("admin", password)

    monitors = api.get_monitors()
    # Build id->name map and track active/paused
    id_to_name = {}
    paused_count = 0
    for m in monitors:
        mid = m.get("id")
        name = m.get("name", "unknown")
        active = m.get("active", True)
        if not active:
            paused_count += 1
        else:
            id_to_name[mid] = name

    # Use bulk heartbeat fetch (single API call) instead of per-monitor calls
    heartbeats = api.get_heartbeats()

    down = []
    up_count = 0
    for mid, name in id_to_name.items():
        beats = heartbeats.get(mid, [])
        if beats:
            last_beat = beats[-1]
            # Handle nested lists (some monitors return list of lists)
            if isinstance(last_beat, list):
                last_beat = last_beat[-1] if last_beat else {}
            status = last_beat.get("status", 0) if isinstance(last_beat, dict) else 0
            # Handle both enum and int (MonitorStatus.UP == 1)
            if status == 1:
                up_count += 1
            elif status == 3:
                paused_count += 1
            else:
                down.append(name)
        else:
            # No heartbeats = unknown, treat as down
            down.append(name)

    api.disconnect()

    down_count = len(down)
    total_active = up_count + down_count
    down_names = ", ".join(down) if down else ""
    print(f"{down_count}:{up_count}:{paused_count}:{total_active}:{down_names}")
except Exception as e:
    print(f"CONN_ERROR:{e}")
' 2>/dev/null) || result="CONN_ERROR:python execution failed"

  if [[ "$result" == "ERROR:"* ]]; then
    [[ "$QUIET" == true ]] && section_always 14 "Uptime Kuma Monitors"
    warn "Uptime Kuma: ${result#ERROR:}"
    json_add "uptime_kuma" "WARN" "${result#ERROR:}"
  elif [[ "$result" == "CONN_ERROR:"* ]]; then
    [[ "$QUIET" == true ]] && section_always 14 "Uptime Kuma Monitors"
    warn "Cannot connect to Uptime Kuma: ${result#CONN_ERROR:}"
    json_add "uptime_kuma" "WARN" "Connection failed"
  else
    local down_count up_count paused_count total_active down_names
    down_count=$(echo "$result" | cut -d: -f1)
    up_count=$(echo "$result" | cut -d: -f2)
    paused_count=$(echo "$result" | cut -d: -f3)
    total_active=$(echo "$result" | cut -d: -f4)
    down_names=$(echo "$result" | cut -d: -f5-)

    if [[ "$down_count" -eq 0 ]]; then
      pass "All $total_active active monitors up ($paused_count paused)"
      json_add "uptime_kuma" "PASS" "$total_active up, $paused_count paused"
    elif [[ "$down_count" -le 3 ]]; then
      [[ "$QUIET" == true ]] && section_always 14 "Uptime Kuma Monitors"
      warn "$down_count/$total_active monitor(s) down: $down_names"
      json_add "uptime_kuma" "WARN" "$down_count down: $down_names"
    else
      [[ "$QUIET" == true ]] && section_always 14 "Uptime Kuma Monitors"
      fail "$down_count/$total_active monitors down: $down_names"
      json_add "uptime_kuma" "FAIL" "$down_count down: $down_names"
    fi
  fi
}
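
# The python helper prints "<down>:<up>:<paused>:<total_active>:<down names>",
# e.g. "1:42:3:43:grafana" (illustrative), which the shell splits with cut.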

# --- 15. ResourceQuota Pressure ---
check_resourcequota() {
  section 15 "ResourceQuota Pressure"
  local quotas detail="" had_issue=false status="PASS"

  quotas=$($KUBECTL get resourcequota -A -o json 2>/dev/null) || { pass "No ResourceQuotas configured"; json_add "resourcequota" "PASS" "No quotas"; return 0; }

  local pressure
  pressure=$(echo "$quotas" | python3 -c '
import json, sys

def parse_cpu(val):
    """Convert CPU value to millicores."""
    val = str(val)
    if val.endswith("m"):
        return float(val[:-1])
    return float(val) * 1000

def parse_mem(val):
    """Convert memory value to bytes."""
    val = str(val)
    units = {"Ki": 1024, "Mi": 1024**2, "Gi": 1024**3, "Ti": 1024**4}
    for suffix, mult in units.items():
        if val.endswith(suffix):
            return float(val[:-len(suffix)]) * mult
    # Plain bytes or numeric
    return float(val)

data = json.load(sys.stdin)
for item in data.get("items", []):
    ns = item["metadata"]["namespace"]
    name = item["metadata"]["name"]
    status = item.get("status", {})
    hard = status.get("hard", {})
    used = status.get("used", {})

    for resource, hard_val in hard.items():
        used_val = used.get(resource, "0")
        try:
            if "cpu" in resource:
                h = parse_cpu(hard_val)
                u = parse_cpu(used_val)
            elif "memory" in resource or "storage" in resource:
                h = parse_mem(hard_val)
                u = parse_mem(used_val)
            elif resource == "pods":
                h = float(hard_val)
                u = float(used_val)
            else:
                continue
            if h <= 0:
                continue
            pct = (u / h) * 100
            if pct > 80:
                level = "FAIL" if pct > 95 else "WARN"
                print(f"{level}:{ns}/{name}:{resource}:{pct:.0f}%")
        except (ValueError, ZeroDivisionError):
            pass
' 2>/dev/null) || true

  if [[ -z "$pressure" ]]; then
    pass "All ResourceQuotas below 80% usage"
    json_add "resourcequota" "PASS" "All below 80%"
  else
    [[ "$QUIET" == true ]] && section_always 15 "ResourceQuota Pressure"
    while IFS= read -r line; do
      local level ns_res resource pct
      level=$(echo "$line" | cut -d: -f1)
      ns_res=$(echo "$line" | cut -d: -f2)
      resource=$(echo "$line" | cut -d: -f3)
      pct=$(echo "$line" | cut -d: -f4)
      if [[ "$level" == "FAIL" ]]; then
        fail "$ns_res: $resource at $pct"
        status="FAIL"
      else
        warn "$ns_res: $resource at $pct"
        [[ "$status" != "FAIL" ]] && status="WARN"
      fi
      detail+="$ns_res $resource=$pct; "
      had_issue=true
    done <<< "$pressure"
    json_add "resourcequota" "$status" "$detail"
  fi
}
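
# Worked example (illustrative): hard memory=4Gi, used=3.5Gi -> 87% -> WARN;
# above 95% (e.g. 3.9Gi/4Gi = 97%) the same quota would FAIL.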

# --- 16. StatefulSets ---
check_statefulsets() {
  section 16 "StatefulSets"
  local sts detail="" had_issue=false

  sts=$($KUBECTL get statefulsets -A --no-headers 2>&1) || true
  if [[ -z "$sts" || "$sts" == *"No resources found"* ]]; then
    pass "No StatefulSets in cluster"
    json_add "statefulsets" "PASS" "No StatefulSets"
    return 0
  fi

  while IFS= read -r line; do
    local ns name ready current desired
    ns=$(echo "$line" | awk '{print $1}')
    name=$(echo "$line" | awk '{print $2}')
    ready=$(echo "$line" | awk '{print $3}')
    current=$(echo "$ready" | cut -d/ -f1)
    desired=$(echo "$ready" | cut -d/ -f2)

    if [[ "$current" != "$desired" ]]; then
      [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 16 "StatefulSets"
      fail "$ns/$name: $current/$desired ready"
      detail+="$ns/$name $current/$desired; "
      had_issue=true
    fi
  done <<< "$sts"

  if [[ "$had_issue" == false ]]; then
    pass "All StatefulSets fully available"
    json_add "statefulsets" "PASS" "All available"
  else
    json_add "statefulsets" "FAIL" "$detail"
  fi
}

# --- 17. Node Disk Usage ---
check_node_disk() {
  section 17 "Node Disk Usage"
  local node_json detail="" had_issue=false status="PASS"

  node_json=$($KUBECTL get nodes -o json 2>/dev/null) || { fail "Cannot get node info"; json_add "node_disk" "FAIL" "Cannot get nodes"; return 0; }

  # Note: capacity - allocatable reflects kubelet/system reservations rather
  # than live usage, so this is a coarse proxy available without node access.
  local disk_info
  disk_info=$(echo "$node_json" | python3 -c '
import json, sys

def parse_storage(val):
    """Convert storage value to bytes."""
    val = str(val)
    units = {"Ki": 1024, "Mi": 1024**2, "Gi": 1024**3, "Ti": 1024**4}
    for suffix, mult in units.items():
        if val.endswith(suffix):
            return float(val[:-len(suffix)]) * mult
    return float(val)

data = json.load(sys.stdin)
for node in data["items"]:
    name = node["metadata"]["name"]
    cap = node["status"].get("capacity", {})
    alloc = node["status"].get("allocatable", {})
    es_cap = cap.get("ephemeral-storage", "0")
    es_alloc = alloc.get("ephemeral-storage", "0")
    try:
        c = parse_storage(es_cap)
        a = parse_storage(es_alloc)
        if c > 0:
            used_pct = ((c - a) / c) * 100
            if used_pct > 80:
                level = "FAIL" if used_pct > 90 else "WARN"
                print(f"{level}:{name}:{used_pct:.0f}")
    except (ValueError, ZeroDivisionError):
        pass
' 2>/dev/null) || true

  if [[ -z "$disk_info" ]]; then
    pass "All nodes below 80% ephemeral-storage usage"
    json_add "node_disk" "PASS" "All below 80%"
  else
    [[ "$QUIET" == true ]] && section_always 17 "Node Disk Usage"
    while IFS= read -r line; do
      local level node pct
      level=$(echo "$line" | cut -d: -f1)
      node=$(echo "$line" | cut -d: -f2)
      pct=$(echo "$line" | cut -d: -f3)
      if [[ "$level" == "FAIL" ]]; then
        fail "$node: ephemeral-storage at ${pct}%"
        status="FAIL"
      else
        warn "$node: ephemeral-storage at ${pct}%"
        [[ "$status" != "FAIL" ]] && status="WARN"
      fi
      detail+="$node=${pct}%; "
      had_issue=true
    done <<< "$disk_info"
    json_add "node_disk" "$status" "$detail"
  fi
}

# --- 18. Helm Release Health ---
check_helm_releases() {
  section 18 "Helm Release Health"

  # Helm may not be available in the pod environment
  if ! command -v helm &>/dev/null; then
    pass "Helm not available (skipped)"
    json_add "helm_releases" "PASS" "Helm not available"
    return 0
  fi

  local releases detail="" had_issue=false status="PASS"

  releases=$(helm list -A --kubeconfig "$KUBECONFIG_PATH" --all -o json 2>/dev/null) || {
    [[ "$QUIET" == true ]] && section_always 18 "Helm Release Health"
    warn "Cannot list Helm releases"
    json_add "helm_releases" "WARN" "Cannot list"
    return 0
  }

  local bad_releases
  bad_releases=$(echo "$releases" | python3 -c '
import json, sys
data = json.load(sys.stdin)
for r in data:
    name = r.get("name", "?")
    ns = r.get("namespace", "?")
    st = r.get("status", "unknown")
    if st != "deployed":
        level = "FAIL" if st.startswith("pending") else "WARN"
        print(f"{level}:{ns}/{name}:{st}")
' 2>/dev/null) || true

  if [[ -z "$bad_releases" ]]; then
    pass "All Helm releases in deployed state"
    json_add "helm_releases" "PASS" "All deployed"
  else
    [[ "$QUIET" == true ]] && section_always 18 "Helm Release Health"
    while IFS= read -r line; do
      local level release_name release_status
      level=$(echo "$line" | cut -d: -f1)
      release_name=$(echo "$line" | cut -d: -f2)
      release_status=$(echo "$line" | cut -d: -f3)
      if [[ "$level" == "FAIL" ]]; then
        fail "Helm release $release_name: $release_status (blocks terraform)"
        status="FAIL"
      else
        warn "Helm release $release_name: $release_status"
        [[ "$status" != "FAIL" ]] && status="WARN"
      fi
      detail+="$release_name=$release_status; "
      had_issue=true
    done <<< "$bad_releases"
    json_add "helm_releases" "$status" "$detail"
  fi
}

# --- 19. Kyverno Policy Engine ---
check_kyverno() {
  section 19 "Kyverno Policy Engine"
  local kv_pods not_running

  kv_pods=$($KUBECTL get pods -n kyverno --no-headers 2>/dev/null || true)
  if [[ -z "$kv_pods" ]]; then
    [[ "$QUIET" == true ]] && section_always 19 "Kyverno Policy Engine"
    fail "Kyverno namespace not found or empty — policy engine down, cascading cluster impact"
    json_add "kyverno" "FAIL" "No Kyverno pods found"
    return 0
  fi

  not_running=$(echo "$kv_pods" | awk '$3 != "Running" && $3 != "Completed" {print $1 ": " $3}' || true)
  if [[ -n "$not_running" ]]; then
    [[ "$QUIET" == true ]] && section_always 19 "Kyverno Policy Engine"
    while IFS= read -r line; do
      fail "Kyverno pod not running: $line"
    done <<< "$not_running"
    json_add "kyverno" "FAIL" "$not_running"
  else
    local total
    total=$(count_lines "$kv_pods")
    pass "All $total Kyverno pods running"
    json_add "kyverno" "PASS" "$total pods running"
  fi
}

# --- 20. NFS Connectivity ---
check_nfs() {
  section 20 "NFS Connectivity"

  # Try native tools first (available locally), fall back to kubectl-based check (pod environment)
  if command -v showmount &>/dev/null; then
    if showmount -e 10.0.10.15 &>/dev/null; then
      pass "NFS server 10.0.10.15 reachable (exports listed)"
      json_add "nfs" "PASS" "NFS reachable"
      return 0
    fi
  fi

  # -G (connection timeout) is the macOS/BSD nc flag; most Linux netcats use -w
  if command -v nc &>/dev/null; then
    if nc -z -G 3 10.0.10.15 2049 &>/dev/null; then
      pass "NFS server 10.0.10.15 port 2049 open"
      json_add "nfs" "PASS" "NFS port open"
      return 0
    fi
  fi

  # Fallback: check if NFS-backed pods are running (works in pod environment)
  local nfs_pods
  nfs_pods=$($KUBECTL get pods -A -o json 2>/dev/null | python3 -c '
import json, sys
data = json.load(sys.stdin)
count = 0
for pod in data.get("items", []):
    for vol in pod.get("spec", {}).get("volumes", []):
        if "nfs" in vol:
            if pod.get("status", {}).get("phase") == "Running":
                count += 1
            break
print(count)
' 2>/dev/null) || nfs_pods="0"

  if [[ "$nfs_pods" -gt 0 ]]; then
    pass "NFS healthy ($nfs_pods pods using NFS volumes are running)"
    json_add "nfs" "PASS" "$nfs_pods NFS pods running"
  else
    [[ "$QUIET" == true ]] && section_always 20 "NFS Connectivity"
    warn "Cannot verify NFS (showmount not available, no NFS pods found)"
    json_add "nfs" "WARN" "Cannot verify"
  fi
}

# --- 21. DNS Resolution ---
check_dns() {
  section 21 "DNS Resolution"
  local internal_ok=false external_ok=false detail=""

  # Try dig first (available locally), fall back to python3 (pod environment)
  if command -v dig &>/dev/null; then
    if dig @10.0.20.101 viktorbarzin.me +short +time=3 +tries=1 &>/dev/null; then
      internal_ok=true
    fi
    if dig @10.0.20.101 google.com +short +time=3 +tries=1 &>/dev/null; then
      external_ok=true
    fi
  else
    # Fallback: use python3 for DNS resolution (works in pod environment, but
    # resolves via the pod resolver rather than querying 10.0.20.101 directly)
    local result
    result=$(python3 -c "
import socket
try:
    socket.getaddrinfo('viktorbarzin.me', 443)
    print('INTERNAL_OK')
except Exception:
    print('INTERNAL_FAIL')
try:
    socket.getaddrinfo('google.com', 443)
    print('EXTERNAL_OK')
except Exception:
    print('EXTERNAL_FAIL')
" 2>/dev/null) || result=""

    if echo "$result" | grep -q "INTERNAL_OK"; then
      internal_ok=true
    fi
    if echo "$result" | grep -q "EXTERNAL_OK"; then
      external_ok=true
    fi
  fi

  if [[ "$internal_ok" == true && "$external_ok" == true ]]; then
    pass "DNS resolves both internal (viktorbarzin.me) and external (google.com)"
    json_add "dns" "PASS" "Both resolve"
  elif [[ "$internal_ok" == true || "$external_ok" == true ]]; then
    [[ "$QUIET" == true ]] && section_always 21 "DNS Resolution"
    if [[ "$internal_ok" == false ]]; then
      warn "DNS: internal (viktorbarzin.me) failed, external (google.com) OK"
      detail="Internal failed"
    else
      warn "DNS: internal (viktorbarzin.me) OK, external (google.com) failed"
      detail="External failed"
    fi
    json_add "dns" "WARN" "$detail"
  else
    [[ "$QUIET" == true ]] && section_always 21 "DNS Resolution"
    fail "DNS not resolving — both internal and external failed"
    json_add "dns" "FAIL" "Both failed"
  fi
}

# --- 22. TLS Certificate Expiry ---
check_tls_certs() {
  section 22 "TLS Certificate Expiry"
  local secrets detail="" had_issue=false status="PASS"

  secrets=$($KUBECTL get secrets -A -o json 2>/dev/null) || {
    [[ "$QUIET" == true ]] && section_always 22 "TLS Certificate Expiry"
    warn "Cannot list secrets"
    json_add "tls_certs" "WARN" "Cannot list secrets"
    return 0
  }

  local cert_issues
  cert_issues=$(echo "$secrets" | python3 -c '
import json, sys, base64, subprocess, hashlib
from datetime import datetime, timezone

data = json.load(sys.stdin)
seen_fingerprints = set()

for item in data.get("items", []):
    if item.get("type") != "kubernetes.io/tls":
        continue
    ns = item["metadata"]["namespace"]
    name = item["metadata"]["name"]
    cert_data = item.get("data", {}).get("tls.crt", "")
    if not cert_data:
        continue

    # Deduplicate by cert fingerprint
    raw = base64.b64decode(cert_data)
    fp = hashlib.sha256(raw).hexdigest()[:16]
    if fp in seen_fingerprints:
        continue
    seen_fingerprints.add(fp)

    # Parse certificate expiry with openssl
    try:
        result = subprocess.run(
            ["openssl", "x509", "-noout", "-enddate", "-subject"],
            input=raw, capture_output=True, timeout=5
        )
        output = result.stdout.decode()
        for line in output.splitlines():
            if line.startswith("notAfter="):
                date_str = line.split("=", 1)[1]
                # Parse openssl date format: "Mon DD HH:MM:SS YYYY GMT"
                try:
                    expiry = datetime.strptime(date_str.strip(), "%b %d %H:%M:%S %Y %Z")
                    expiry = expiry.replace(tzinfo=timezone.utc)
                    days_left = (expiry - datetime.now(timezone.utc)).days
                    if days_left <= 7:
                        print(f"FAIL:{ns}/{name}:{days_left}d")
                    elif days_left <= 30:
                        print(f"WARN:{ns}/{name}:{days_left}d")
                except ValueError:
                    pass
    except Exception:
        pass
' 2>/dev/null) || true

  if [[ -z "$cert_issues" ]]; then
    pass "All TLS certificates valid for >30 days"
    json_add "tls_certs" "PASS" "All valid >30d"
  else
    [[ "$QUIET" == true ]] && section_always 22 "TLS Certificate Expiry"
    while IFS= read -r line; do
      local level cert_name days
      level=$(echo "$line" | cut -d: -f1)
      cert_name=$(echo "$line" | cut -d: -f2)
      days=$(echo "$line" | cut -d: -f3)
      if [[ "$level" == "FAIL" ]]; then
        fail "TLS cert $cert_name expires in $days"
        status="FAIL"
      else
        warn "TLS cert $cert_name expires in $days"
        [[ "$status" != "FAIL" ]] && status="WARN"
      fi
      detail+="$cert_name=$days; "
      had_issue=true
    done <<< "$cert_issues"
    json_add "tls_certs" "$status" "$detail"
  fi
}
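
# openssl line parsed above (illustrative):
#   notAfter=Mar  1 12:00:00 2026 GMT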

# --- 23. GPU Health ---
check_gpu() {
  section 23 "GPU Health"
  local gpu_pods

  gpu_pods=$($KUBECTL get pods -n nvidia --no-headers 2>/dev/null || true)
  if [[ -z "$gpu_pods" ]]; then
    [[ "$QUIET" == true ]] && section_always 23 "GPU Health"
    warn "NVIDIA namespace not found or empty"
    json_add "gpu" "WARN" "No GPU pods found"
    return 0
  fi

  # Check specifically for device-plugin (critical for GPU scheduling)
  local device_plugin_down=false
  local other_down=false
  local detail=""

  while IFS= read -r line; do
    local pod_name pod_status
    pod_name=$(echo "$line" | awk '{print $1}')
    pod_status=$(echo "$line" | awk '{print $3}')
    if [[ "$pod_status" != "Running" && "$pod_status" != "Completed" ]]; then
      if echo "$pod_name" | grep -q "device-plugin"; then
        device_plugin_down=true
        detail+="device-plugin $pod_name: $pod_status; "
      else
        other_down=true
        detail+="$pod_name: $pod_status; "
      fi
    fi
  done <<< "$gpu_pods"

  if [[ "$device_plugin_down" == true ]]; then
    [[ "$QUIET" == true ]] && section_always 23 "GPU Health"
    fail "GPU device-plugin is down — GPU workloads cannot schedule"
    json_add "gpu" "FAIL" "$detail"
  elif [[ "$other_down" == true ]]; then
    [[ "$QUIET" == true ]] && section_always 23 "GPU Health"
    warn "Some GPU pods not running: $detail"
    json_add "gpu" "WARN" "$detail"
  else
    local total
    total=$(count_lines "$gpu_pods")
    pass "All $total GPU pods running"
    json_add "gpu" "PASS" "$total pods running"
  fi
}

# --- 24. Cloudflare Tunnel ---
check_cloudflare_tunnel() {
  section 24 "Cloudflare Tunnel"
  local cf_pods running_count total_count

  cf_pods=$($KUBECTL get pods -n cloudflared --no-headers 2>/dev/null || true)
  if [[ -z "$cf_pods" ]]; then
    [[ "$QUIET" == true ]] && section_always 24 "Cloudflare Tunnel"
    fail "Cloudflare tunnel namespace not found or empty — external access broken"
    json_add "cloudflare_tunnel" "FAIL" "No pods found"
    return 0
  fi

  total_count=$(count_lines "$cf_pods")
  running_count=$(echo "$cf_pods" | awk '$3 == "Running"' | wc -l | tr -d ' ')

  if [[ "$running_count" -eq 0 ]]; then
    [[ "$QUIET" == true ]] && section_always 24 "Cloudflare Tunnel"
    fail "Cloudflare tunnel: 0/$total_count pods running — external access broken"
    json_add "cloudflare_tunnel" "FAIL" "0/$total_count running"
  elif [[ "$running_count" -lt "$total_count" ]]; then
    [[ "$QUIET" == true ]] && section_always 24 "Cloudflare Tunnel"
    warn "Cloudflare tunnel: $running_count/$total_count pods running (degraded)"
    json_add "cloudflare_tunnel" "WARN" "$running_count/$total_count running"
  else
    pass "Cloudflare tunnel: all $total_count pods running"
    json_add "cloudflare_tunnel" "PASS" "$total_count pods running"
  fi
}

# --- Summary ---
print_summary() {
  if [[ "$JSON" == true ]]; then
    echo "{"
    echo "  \"timestamp\": \"$(date -u +%Y-%m-%dT%H:%M:%SZ)\","
    echo "  \"pass\": $PASS_COUNT,"
    echo "  \"warn\": $WARN_COUNT,"
    echo "  \"fail\": $FAIL_COUNT,"
    echo "  \"checks\": ["
    local first=true
    for r in "${JSON_RESULTS[@]}"; do
      if [[ "$first" == true ]]; then
        echo "    $r"
        first=false
      else
        echo "    ,$r"
      fi
    done
    echo "  ]"
    echo "}"
    return 0
  fi

  echo ""
  echo -e "${BOLD}═══════════════════════════════════════${NC}"
  echo -e "${BOLD}       Cluster Health Summary${NC}"
  echo -e "${BOLD}═══════════════════════════════════════${NC}"
  echo -e "  ${GREEN}PASS${NC}: $PASS_COUNT   ${YELLOW}WARN${NC}: $WARN_COUNT   ${RED}FAIL${NC}: $FAIL_COUNT"
  echo ""

  if [[ "$FAIL_COUNT" -gt 0 ]]; then
    echo -e "  Overall: ${RED}UNHEALTHY${NC}"
  elif [[ "$WARN_COUNT" -gt 0 ]]; then
    echo -e "  Overall: ${YELLOW}DEGRADED${NC}"
  else
    echo -e "  Overall: ${GREEN}HEALTHY${NC}"
  fi
  echo ""
}
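
# --json output shape (illustrative, abbreviated):
#   {
#     "timestamp": "2025-01-01T00:00:00Z",
#     "pass": 22, "warn": 1, "fail": 1,
#     "checks": [{"check":"node_status","status":"PASS","detail":"All nodes Ready"}]
#   }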

# --- Slack Notification ---

# Human-readable check name mapping
friendly_check_name() {
  case "$1" in
    node_status) echo "Node Status" ;;
    node_resources) echo "Node Resources" ;;
    node_conditions) echo "Node Conditions" ;;
    problematic_pods) echo "Problematic Pods" ;;
    evicted_pods) echo "Evicted Pods" ;;
    daemonsets) echo "DaemonSets" ;;
    deployments) echo "Deployments" ;;
    pvcs) echo "PVCs" ;;
    hpa) echo "HPAs" ;;
    cronjob_failures) echo "CronJob Failures" ;;
    crowdsec) echo "CrowdSec" ;;
    ingresses) echo "Ingresses" ;;
    prometheus_alerts) echo "Prometheus Alerts" ;;
    uptime_kuma) echo "Uptime Kuma" ;;
    resourcequota) echo "Resource Quotas" ;;
    statefulsets) echo "StatefulSets" ;;
    node_disk) echo "Node Disk" ;;
    helm_releases) echo "Helm Releases" ;;
    kyverno) echo "Kyverno" ;;
    nfs) echo "NFS Storage" ;;
    dns) echo "DNS Resolution" ;;
    tls_certs) echo "TLS Certificates" ;;
    gpu) echo "GPU" ;;
    cloudflare_tunnel) echo "Cloudflare Tunnel" ;;
    *) echo "$1" ;;
  esac
}

send_slack() {
  if [[ "$SEND_SLACK" != true ]]; then
    return 0
  fi
  if [[ -z "${SLACK_WEBHOOK_URL:-}" ]]; then
    [[ "$JSON" != true ]] && echo "WARNING: SLACK_WEBHOOK_URL not set, skipping Slack notification"
    return 0
  fi

  # Gather stats for summary line
  local node_count pod_count
  node_count=$($KUBECTL get nodes --no-headers 2>/dev/null | wc -l | tr -d ' ')
  pod_count=$($KUBECTL get pods -A --no-headers --field-selector=status.phase=Running 2>/dev/null | wc -l | tr -d ' ')

  local total_checks=$((PASS_COUNT + WARN_COUNT + FAIL_COUNT))

  # Use python3 to build the entire Slack payload from JSON_RESULTS
  local json_results_str
  json_results_str=$(printf '%s\n' "${JSON_RESULTS[@]}")

  local json_payload
  json_payload=$(echo "$json_results_str" | python3 -c "
import json, sys

CHECK_NAMES = {
    'node_status': 'Node Status',
    'node_resources': 'Node Resources',
    'node_conditions': 'Node Conditions',
    'problematic_pods': 'Problematic Pods',
    'evicted_pods': 'Evicted Pods',
    'daemonsets': 'DaemonSets',
    'deployments': 'Deployments',
    'pvcs': 'PVCs',
    'hpa': 'HPAs',
    'cronjob_failures': 'CronJob Failures',
    'crowdsec': 'CrowdSec',
    'ingresses': 'Ingresses',
    'prometheus_alerts': 'Prometheus Alerts',
    'uptime_kuma': 'Uptime Kuma',
    'resourcequota': 'Resource Quotas',
    'statefulsets': 'StatefulSets',
    'node_disk': 'Node Disk',
    'helm_releases': 'Helm Releases',
    'kyverno': 'Kyverno',
    'nfs': 'NFS Storage',
    'dns': 'DNS Resolution',
    'tls_certs': 'TLS Certificates',
    'gpu': 'GPU',
    'cloudflare_tunnel': 'Cloudflare Tunnel',
}

def format_detail(check, detail):
    \"\"\"Format detail text for readability. Truncate long lists, split semicolons.\"\"\"
    detail = detail.rstrip('; ').strip()

    # For checks with long comma-separated lists (e.g. Uptime Kuma down monitors),
    # truncate to first 5 items with a count
    if check == 'uptime_kuma' and ': ' in detail:
        prefix, names_str = detail.split(': ', 1)
        names = [n.strip() for n in names_str.split(',') if n.strip()]
        if len(names) > 5:
            shown = ', '.join(names[:5])
            detail = f'{prefix}: {shown} (+{len(names) - 5} more)'
        elif names:
            detail = prefix + ': ' + ', '.join(names)

    # For resource quotas and similar semicolon-separated items,
    # split into separate lines
    if '; ' in detail:
        parts = [p.strip() for p in detail.split(';') if p.strip()]
        if len(parts) > 1:
            lines = '\\n'.join(f'  \u2022 {p}' for p in parts)
            return lines

    return detail

# Parse results
fails = []
warns = []
for line in sys.stdin:
    line = line.strip()
    if not line:
        continue
    try:
        d = json.loads(line)
    except json.JSONDecodeError:
        continue
    status = d.get('status', '')
    check = d.get('check', '')
    detail = d.get('detail', '')
    name = CHECK_NAMES.get(check, check)
    formatted = format_detail(check, detail)

    if status == 'FAIL':
        fails.append((name, formatted))
    elif status == 'WARN':
        warns.append((name, formatted))

pass_count = ${PASS_COUNT}
warn_count = ${WARN_COUNT}
fail_count = ${FAIL_COUNT}
total = ${total_checks}
nodes = '${node_count}'
pods = '${pod_count}'

blocks = []

# Header block
if fail_count == 0 and warn_count == 0:
    header = f':white_check_mark: *Cluster Health Check \u2014 All Clear*'
    summary = f'{total}/{total} checks passed \u2022 {nodes} nodes \u2022 {pods} pods'
    blocks.append({'type': 'section', 'text': {'type': 'mrkdwn', 'text': f'{header}\n{summary}'}})
else:
    issue_count = fail_count + warn_count
    emoji = ':rotating_light:' if fail_count > 0 else ':warning:'
    header = f'{emoji} *Cluster Health Check \u2014 {issue_count} Issue(s)*'
    summary = f':white_check_mark: {pass_count} passed \u2022 :warning: {warn_count} warnings \u2022 :x: {fail_count} failed'
    blocks.append({'type': 'section', 'text': {'type': 'mrkdwn', 'text': f'{header}\n{summary}'}})

# Failed section
if fails:
    blocks.append({'type': 'divider'})
    lines = [':x: *Failed*']
    for name, detail in fails:
        if '\\n' in detail:
            lines.append(f'\u2022 *{name}*:')
            lines.append(detail)
        else:
            lines.append(f'\u2022 *{name}*: {detail}')
    blocks.append({'type': 'section', 'text': {'type': 'mrkdwn', 'text': '\\n'.join(lines)}})

# Warnings section
if warns:
    blocks.append({'type': 'divider'})
    lines = [':warning: *Warnings*']
    for name, detail in warns:
        if '\\n' in detail:
            lines.append(f'\u2022 *{name}*:')
            lines.append(detail)
        else:
            lines.append(f'\u2022 *{name}*: {detail}')
    blocks.append({'type': 'section', 'text': {'type': 'mrkdwn', 'text': '\\n'.join(lines)}})

# Footer with timestamp
from datetime import datetime, timezone
ts = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')
blocks.append({'type': 'context', 'elements': [{'type': 'mrkdwn', 'text': f'{nodes} nodes \u2022 {pods} pods \u2022 {ts}'}]})

payload = {'blocks': blocks}
print(json.dumps(payload))
")

  curl -s -X POST "$SLACK_WEBHOOK_URL" \
    -H 'Content-Type: application/json' \
    -d "$json_payload" >/dev/null 2>&1 || {
    [[ "$JSON" != true ]] && echo "WARNING: Failed to send Slack notification"
    return 0
  }

  # Explicit return so a false [[ ]] && as the last command cannot make the
  # function return non-zero (which would abort main under `set -e`).
  [[ "$JSON" != true ]] && echo "Slack notification sent."
  return 0
}

# --- Main ---
main() {
  parse_args "$@"

  if [[ "$JSON" != true ]]; then
    echo -e "${BOLD}Cluster Health Check${NC} — $(date '+%Y-%m-%d %H:%M:%S')"
    echo -e "Kubeconfig: $KUBECONFIG_PATH"
    if [[ "$FIX" == true ]]; then
      echo -e "${YELLOW}Auto-fix mode enabled${NC}"
    fi
  fi

  check_nodes
  check_resources
  check_conditions
  check_pods
  check_evicted
  check_daemonsets
  check_deployments
  check_pvcs
  check_hpa
  check_cronjobs
  check_crowdsec
  check_ingresses
  check_alerts
  check_uptime_kuma
  check_resourcequota
  check_statefulsets
  check_node_disk
  check_helm_releases
  check_kyverno
  check_nfs
  check_dns
  check_tls_certs
  check_gpu
  check_cloudflare_tunnel
  print_summary
  send_slack

  # Exit code: 2 for failures, 1 for warnings, 0 for clean
  if [[ "$FAIL_COUNT" -gt 0 ]]; then
    exit 2
  elif [[ "$WARN_COUNT" -gt 0 ]]; then
    exit 1
  fi
  exit 0
}
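
# Illustrative consumer of the exit codes, e.g. from a cron wrapper:
#   ./cluster-health.sh -q --no-slack; rc=$?
#   case "$rc" in 0) : healthy ;; 1) : degraded ;; 2) : unhealthy ;; esac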

main "$@"