[ci skip] Extend cluster healthcheck from 14 to 24 checks

Add 10 new checks covering gaps discovered during incident response:
ResourceQuota pressure, StatefulSets, node disk usage, Helm release
health, Kyverno policy engine, NFS connectivity, DNS resolution,
TLS certificate expiry, GPU health, and Cloudflare tunnel status.
Viktor Barzin 2026-02-21 23:57:04 +00:00
parent 4700743560
commit 86d1d50ad0
GPG key ID: 0EB088298288D958
2 changed files with 490 additions and 4 deletions


@@ -179,7 +179,7 @@ kubectl get pods -A
**Cluster Health Check** (`scripts/cluster_healthcheck.sh`):
- **ALWAYS use this script** to check cluster health — whether the user asks explicitly, after deploying/updating services, or whenever you need to verify cluster state. Never use ad-hoc kubectl commands to assess overall cluster health; use the script instead.
- Runs 14 checks: nodes, resources, conditions, pods, evicted, DaemonSets, deployments, PVCs, HPAs, CronJobs, CrowdSec, ingress, Prometheus alerts, Uptime Kuma
- Runs 24 checks: nodes, resources, conditions, pods, evicted, DaemonSets, deployments, PVCs, HPAs, CronJobs, CrowdSec, ingress, Prometheus alerts, Uptime Kuma, ResourceQuota pressure, StatefulSets, node disk, Helm releases, Kyverno, NFS, DNS, TLS certs, GPU, Cloudflare tunnel
- **When adding new healthchecks or monitoring**: Always update this script to validate the new component (see the example invocations below)
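Example invocations (a minimal sketch; the flags come from the script's usage header, and the default kubeconfig path assumes you run from the repo root):

```bash
# Full colour-coded report using the repo-local kubeconfig
./scripts/cluster_healthcheck.sh

# Machine-readable output for automation, with an explicit kubeconfig
./scripts/cluster_healthcheck.sh --json --kubeconfig ./config
```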
**Terraform target examples:**


@@ -1,7 +1,7 @@
#!/usr/bin/env bash
# Cluster health check script.
# Runs 14 diagnostic checks against the Kubernetes cluster and prints
# Runs 24 diagnostic checks against the Kubernetes cluster and prints
# a colour-coded report with PASS / WARN / FAIL for each section.
#
# Usage: ./scripts/cluster_healthcheck.sh [--fix] [--quiet|-q] [--json] [--kubeconfig <path>]
@@ -26,6 +26,7 @@ JSON=false
KUBECONFIG_PATH="$(pwd)/config"
KUBECTL=""
JSON_RESULTS=()
TOTAL_CHECKS=24
# --- Helpers ---
info() { [[ "$JSON" == true ]] && return 0; echo -e "${BLUE}[INFO]${NC} $*"; }
@@ -38,14 +39,14 @@ section() {
[[ "$JSON" == true ]] && return 0
[[ "$QUIET" == true ]] && return 0
echo ""
echo -e "${BOLD}[$num/14] $title${NC}"
echo -e "${BOLD}[$num/$TOTAL_CHECKS] $title${NC}"
}
section_always() {
local num="$1" title="$2"
[[ "$JSON" == true ]] && return 0
echo ""
echo -e "${BOLD}[$num/14] $title${NC}"
echo -e "${BOLD}[$num/$TOTAL_CHECKS] $title${NC}"
}
json_add() {
@@ -665,6 +666,481 @@ except Exception as e:
fi
}
# --- 15. ResourceQuota Pressure ---
check_resourcequota() {
section 15 "ResourceQuota Pressure"
local quotas detail="" had_issue=false status="PASS"
# kubectl exits non-zero only on API errors; zero quotas still returns {"items": []}
quotas=$($KUBECTL get resourcequota -A -o json 2>/dev/null) || {
[[ "$QUIET" == true ]] && section_always 15 "ResourceQuota Pressure"
warn "Cannot list ResourceQuotas"
json_add "resourcequota" "WARN" "Cannot list"
return 0
}
local pressure
pressure=$(echo "$quotas" | python3 -c '
import json, sys
def parse_cpu(val):
"""Convert CPU value to millicores."""
val = str(val)
if val.endswith("m"):
return float(val[:-1])
return float(val) * 1000
def parse_mem(val):
"""Convert memory value to bytes."""
val = str(val)
units = {"Ki": 1024, "Mi": 1024**2, "Gi": 1024**3, "Ti": 1024**4}
for suffix, mult in units.items():
if val.endswith(suffix):
return float(val[:-len(suffix)]) * mult
# Plain bytes or numeric
return float(val)
data = json.load(sys.stdin)
for item in data.get("items", []):
ns = item["metadata"]["namespace"]
name = item["metadata"]["name"]
status = item.get("status", {})
hard = status.get("hard", {})
used = status.get("used", {})
for resource, hard_val in hard.items():
used_val = used.get(resource, "0")
try:
if "cpu" in resource:
h = parse_cpu(hard_val)
u = parse_cpu(used_val)
elif "memory" in resource or "storage" in resource:
h = parse_mem(hard_val)
u = parse_mem(used_val)
elif resource == "pods":
h = float(hard_val)
u = float(used_val)
else:
continue
if h <= 0:
continue
pct = (u / h) * 100
if pct > 80:
level = "FAIL" if pct > 95 else "WARN"
print(f"{level}:{ns}/{name}:{resource}:{pct:.0f}%")
except (ValueError, ZeroDivisionError):
pass
' 2>/dev/null) || true
if [[ -z "$pressure" ]]; then
pass "All ResourceQuotas below 80% usage"
json_add "resourcequota" "PASS" "All below 80%"
else
[[ "$QUIET" == true ]] && section_always 15 "ResourceQuota Pressure"
while IFS= read -r line; do
local level ns_res resource pct
level=$(echo "$line" | cut -d: -f1)
ns_res=$(echo "$line" | cut -d: -f2)
resource=$(echo "$line" | cut -d: -f3)
pct=$(echo "$line" | cut -d: -f4)
if [[ "$level" == "FAIL" ]]; then
fail "$ns_res: $resource at $pct"
status="FAIL"
else
warn "$ns_res: $resource at $pct"
[[ "$status" != "FAIL" ]] && status="WARN"
fi
detail+="$ns_res $resource=$pct; "
had_issue=true
done <<< "$pressure"
json_add "resourcequota" "$status" "$detail"
fi
}
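# Worked example of the quota math above (illustrative, not executed):
# parse_cpu("500m") -> 500 millicores, parse_cpu("2") -> 2000 millicores,
# parse_mem("1536Mi") -> 1610612736 bytes. With hard=2000m and used=1700m,
# usage is 85% -> WARN; above 95% it would be a FAIL.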
# --- 16. StatefulSets ---
check_statefulsets() {
section 16 "StatefulSets"
local sts detail="" had_issue=false
sts=$($KUBECTL get statefulsets -A --no-headers 2>&1) || true
if [[ -z "$sts" || "$sts" == *"No resources found"* ]]; then
pass "No StatefulSets in cluster"
json_add "statefulsets" "PASS" "No StatefulSets"
return 0
fi
while IFS= read -r line; do
local ns name ready current desired
ns=$(echo "$line" | awk '{print $1}')
name=$(echo "$line" | awk '{print $2}')
ready=$(echo "$line" | awk '{print $3}')
current=$(echo "$ready" | cut -d/ -f1)
desired=$(echo "$ready" | cut -d/ -f2)
if [[ "$current" != "$desired" ]]; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 16 "StatefulSets"
fail "$ns/$name: $current/$desired ready"
detail+="$ns/$name $current/$desired; "
had_issue=true
fi
done <<< "$sts"
if [[ "$had_issue" == false ]]; then
pass "All StatefulSets fully available"
json_add "statefulsets" "PASS" "All available"
else
json_add "statefulsets" "FAIL" "$detail"
fi
}
# --- 17. Node Disk Usage ---
check_node_disk() {
section 17 "Node Disk Usage"
local node_json detail="" had_issue=false status="PASS"
node_json=$($KUBECTL get nodes -o json 2>/dev/null) || { fail "Cannot get node info"; json_add "node_disk" "FAIL" "Cannot get nodes"; return 0; }
local disk_info
disk_info=$(echo "$node_json" | python3 -c '
import json, sys
def parse_storage(val):
"""Convert storage value to bytes."""
val = str(val)
units = {"Ki": 1024, "Mi": 1024**2, "Gi": 1024**3, "Ti": 1024**4}
for suffix, mult in units.items():
if val.endswith(suffix):
return float(val[:-len(suffix)]) * mult
return float(val)
data = json.load(sys.stdin)
for node in data["items"]:
name = node["metadata"]["name"]
cap = node["status"].get("capacity", {})
alloc = node["status"].get("allocatable", {})
es_cap = cap.get("ephemeral-storage", "0")
es_alloc = alloc.get("ephemeral-storage", "0")
try:
c = parse_storage(es_cap)
a = parse_storage(es_alloc)
# NB: allocatable = capacity minus kubelet reservations and eviction
# thresholds, so this is a coarse proxy for disk headroom, not live usage.
if c > 0:
used_pct = ((c - a) / c) * 100
if used_pct > 80:
level = "FAIL" if used_pct > 90 else "WARN"
print(f"{level}:{name}:{used_pct:.0f}")
except (ValueError, ZeroDivisionError):
pass
' 2>/dev/null) || true
if [[ -z "$disk_info" ]]; then
pass "All nodes below 80% ephemeral-storage usage"
json_add "node_disk" "PASS" "All below 80%"
else
[[ "$QUIET" == true ]] && section_always 17 "Node Disk Usage"
while IFS= read -r line; do
local level node pct
level=$(echo "$line" | cut -d: -f1)
node=$(echo "$line" | cut -d: -f2)
pct=$(echo "$line" | cut -d: -f3)
if [[ "$level" == "FAIL" ]]; then
fail "$node: ephemeral-storage at ${pct}%"
status="FAIL"
else
warn "$node: ephemeral-storage at ${pct}%"
[[ "$status" != "FAIL" ]] && status="WARN"
fi
detail+="$node=${pct}%; "
had_issue=true
done <<< "$disk_info"
json_add "node_disk" "$status" "$detail"
fi
}
# --- 18. Helm Release Health ---
check_helm_releases() {
section 18 "Helm Release Health"
local releases detail="" had_issue=false status="PASS"
releases=$(helm list -A --kubeconfig "$KUBECONFIG_PATH" --all -o json 2>/dev/null) || {
[[ "$QUIET" == true ]] && section_always 18 "Helm Release Health"
warn "Cannot list Helm releases"
json_add "helm_releases" "WARN" "Cannot list"
return 0
}
local bad_releases
bad_releases=$(echo "$releases" | python3 -c '
import json, sys
data = json.load(sys.stdin)
for r in data:
name = r.get("name", "?")
ns = r.get("namespace", "?")
st = r.get("status", "unknown")
if st != "deployed":
level = "FAIL" if st.startswith("pending") else "WARN"
print(f"{level}:{ns}/{name}:{st}")
' 2>/dev/null) || true
if [[ -z "$bad_releases" ]]; then
pass "All Helm releases in deployed state"
json_add "helm_releases" "PASS" "All deployed"
else
[[ "$QUIET" == true ]] && section_always 18 "Helm Release Health"
while IFS= read -r line; do
local level release_name release_status
level=$(echo "$line" | cut -d: -f1)
release_name=$(echo "$line" | cut -d: -f2)
release_status=$(echo "$line" | cut -d: -f3)
if [[ "$level" == "FAIL" ]]; then
fail "Helm release $release_name: $release_status (blocks terraform)"
status="FAIL"
else
warn "Helm release $release_name: $release_status"
[[ "$status" != "FAIL" ]] && status="WARN"
fi
detail+="$release_name=$release_status; "
had_issue=true
done <<< "$bad_releases"
json_add "helm_releases" "$status" "$detail"
fi
}
# --- 19. Kyverno Policy Engine ---
check_kyverno() {
section 19 "Kyverno Policy Engine"
local kv_pods not_running
kv_pods=$($KUBECTL get pods -n kyverno --no-headers 2>/dev/null || true)
if [[ -z "$kv_pods" ]]; then
[[ "$QUIET" == true ]] && section_always 19 "Kyverno Policy Engine"
fail "Kyverno namespace not found or empty — policy engine down, cascading cluster impact"
json_add "kyverno" "FAIL" "No Kyverno pods found"
return 0
fi
not_running=$(echo "$kv_pods" | awk '$3 != "Running" && $3 != "Completed" {print $1 ": " $3}' || true)
if [[ -n "$not_running" ]]; then
[[ "$QUIET" == true ]] && section_always 19 "Kyverno Policy Engine"
while IFS= read -r line; do
fail "Kyverno pod not running: $line"
done <<< "$not_running"
json_add "kyverno" "FAIL" "$not_running"
else
local total
total=$(count_lines "$kv_pods")
pass "All $total Kyverno pods running"
json_add "kyverno" "PASS" "$total pods running"
fi
}
# --- 20. NFS Connectivity ---
check_nfs() {
section 20 "NFS Connectivity"
if showmount -e 10.0.10.15 &>/dev/null; then
pass "NFS server 10.0.10.15 reachable (exports listed)"
json_add "nfs" "PASS" "NFS reachable"
# -w: connect timeout in seconds (portable across BSD and GNU netcat)
elif nc -z -w 3 10.0.10.15 2049 &>/dev/null; then
pass "NFS server 10.0.10.15 port 2049 open"
json_add "nfs" "PASS" "NFS port open"
else
[[ "$QUIET" == true ]] && section_always 20 "NFS Connectivity"
fail "NFS server 10.0.10.15 unreachable — 30+ services depend on NFS"
json_add "nfs" "FAIL" "NFS unreachable"
fi
}
# --- 21. DNS Resolution ---
check_dns() {
section 21 "DNS Resolution"
local internal_ok=false external_ok=false detail=""
# dig exits 0 even for NXDOMAIN/empty answers, so require actual output
if [[ -n "$(dig @10.0.20.101 viktorbarzin.me +short +time=3 +tries=1 2>/dev/null)" ]]; then
internal_ok=true
fi
if [[ -n "$(dig @10.0.20.101 google.com +short +time=3 +tries=1 2>/dev/null)" ]]; then
external_ok=true
fi
if [[ "$internal_ok" == true && "$external_ok" == true ]]; then
pass "DNS resolves both internal (viktorbarzin.me) and external (google.com)"
json_add "dns" "PASS" "Both resolve"
elif [[ "$internal_ok" == true || "$external_ok" == true ]]; then
[[ "$QUIET" == true ]] && section_always 21 "DNS Resolution"
if [[ "$internal_ok" == false ]]; then
warn "DNS: internal (viktorbarzin.me) failed, external (google.com) OK"
detail="Internal failed"
else
warn "DNS: internal (viktorbarzin.me) OK, external (google.com) failed"
detail="External failed"
fi
json_add "dns" "WARN" "$detail"
else
[[ "$QUIET" == true ]] && section_always 21 "DNS Resolution"
fail "DNS server 10.0.20.101 not resolving — both internal and external failed"
json_add "dns" "FAIL" "Both failed"
fi
}
# --- 22. TLS Certificate Expiry ---
check_tls_certs() {
section 22 "TLS Certificate Expiry"
local secrets detail="" had_issue=false status="PASS"
secrets=$($KUBECTL get secrets -A -o json 2>/dev/null) || {
[[ "$QUIET" == true ]] && section_always 22 "TLS Certificate Expiry"
warn "Cannot list secrets"
json_add "tls_certs" "WARN" "Cannot list secrets"
return 0
}
local cert_issues
cert_issues=$(echo "$secrets" | python3 -c '
import json, sys, base64, subprocess, hashlib
from datetime import datetime, timezone
data = json.load(sys.stdin)
seen_fingerprints = set()
results = []
for item in data.get("items", []):
if item.get("type") != "kubernetes.io/tls":
continue
ns = item["metadata"]["namespace"]
name = item["metadata"]["name"]
cert_data = item.get("data", {}).get("tls.crt", "")
if not cert_data:
continue
# Deduplicate by cert fingerprint
raw = base64.b64decode(cert_data)
fp = hashlib.sha256(raw).hexdigest()[:16]
if fp in seen_fingerprints:
continue
seen_fingerprints.add(fp)
# Parse certificate expiry with openssl
try:
result = subprocess.run(
["openssl", "x509", "-noout", "-enddate", "-subject"],
input=raw, capture_output=True, timeout=5
)
output = result.stdout.decode()
for line in output.splitlines():
if line.startswith("notAfter="):
date_str = line.split("=", 1)[1]
# Parse openssl date format: "Mon DD HH:MM:SS YYYY GMT"
try:
expiry = datetime.strptime(date_str.strip(), "%b %d %H:%M:%S %Y %Z")
expiry = expiry.replace(tzinfo=timezone.utc)
days_left = (expiry - datetime.now(timezone.utc)).days
if days_left <= 7:
print(f"FAIL:{ns}/{name}:{days_left}d")
elif days_left <= 30:
print(f"WARN:{ns}/{name}:{days_left}d")
except ValueError:
pass
except Exception:  # missing openssl binary, timeout, or decode error
pass
' 2>/dev/null) || true
if [[ -z "$cert_issues" ]]; then
pass "All TLS certificates valid for >30 days"
json_add "tls_certs" "PASS" "All valid >30d"
else
[[ "$QUIET" == true ]] && section_always 22 "TLS Certificate Expiry"
while IFS= read -r line; do
local level cert_name days
level=$(echo "$line" | cut -d: -f1)
cert_name=$(echo "$line" | cut -d: -f2)
days=$(echo "$line" | cut -d: -f3)
if [[ "$level" == "FAIL" ]]; then
fail "TLS cert $cert_name expires in $days"
status="FAIL"
else
warn "TLS cert $cert_name expires in $days"
[[ "$status" != "FAIL" ]] && status="WARN"
fi
detail+="$cert_name=$days; "
had_issue=true
done <<< "$cert_issues"
json_add "tls_certs" "$status" "$detail"
fi
}
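# Manual spot-check for one secret (illustrative; note the escaped dot in the key):
#   kubectl get secret -n <ns> <name> -o jsonpath='{.data.tls\.crt}' \
#     | base64 -d | openssl x509 -noout -enddate -subject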
# --- 23. GPU Health ---
check_gpu() {
section 23 "GPU Health"
local gpu_pods not_running
gpu_pods=$($KUBECTL get pods -n nvidia --no-headers 2>/dev/null || true)
if [[ -z "$gpu_pods" ]]; then
[[ "$QUIET" == true ]] && section_always 23 "GPU Health"
warn "NVIDIA namespace not found or empty"
json_add "gpu" "WARN" "No GPU pods found"
return 0
fi
# Check specifically for device-plugin (critical for GPU scheduling)
local device_plugin_down=false
local other_down=false
local detail=""
while IFS= read -r line; do
local pod_name pod_status
pod_name=$(echo "$line" | awk '{print $1}')
pod_status=$(echo "$line" | awk '{print $3}')
if [[ "$pod_status" != "Running" && "$pod_status" != "Completed" ]]; then
if echo "$pod_name" | grep -q "device-plugin"; then
device_plugin_down=true
detail+="device-plugin $pod_name: $pod_status; "
else
other_down=true
detail+="$pod_name: $pod_status; "
fi
fi
done <<< "$gpu_pods"
if [[ "$device_plugin_down" == true ]]; then
[[ "$QUIET" == true ]] && section_always 23 "GPU Health"
fail "GPU device-plugin is down — GPU workloads cannot schedule"
json_add "gpu" "FAIL" "$detail"
elif [[ "$other_down" == true ]]; then
[[ "$QUIET" == true ]] && section_always 23 "GPU Health"
warn "Some GPU pods not running: $detail"
json_add "gpu" "WARN" "$detail"
else
local total
total=$(count_lines "$gpu_pods")
pass "All $total GPU pods running"
json_add "gpu" "PASS" "$total pods running"
fi
}
# --- 24. Cloudflare Tunnel ---
check_cloudflare_tunnel() {
section 24 "Cloudflare Tunnel"
local cf_pods running_count total_count
cf_pods=$($KUBECTL get pods -n cloudflared --no-headers 2>/dev/null || true)
if [[ -z "$cf_pods" ]]; then
[[ "$QUIET" == true ]] && section_always 24 "Cloudflare Tunnel"
fail "Cloudflare tunnel namespace not found or empty — external access broken"
json_add "cloudflare_tunnel" "FAIL" "No pods found"
return 0
fi
total_count=$(count_lines "$cf_pods")
running_count=$(echo "$cf_pods" | awk '$3 == "Running"' | wc -l | tr -d ' ')
if [[ "$running_count" -eq 0 ]]; then
[[ "$QUIET" == true ]] && section_always 24 "Cloudflare Tunnel"
fail "Cloudflare tunnel: 0/$total_count pods running — external access broken"
json_add "cloudflare_tunnel" "FAIL" "0/$total_count running"
elif [[ "$running_count" -lt "$total_count" ]]; then
[[ "$QUIET" == true ]] && section_always 24 "Cloudflare Tunnel"
warn "Cloudflare tunnel: $running_count/$total_count pods running (degraded)"
json_add "cloudflare_tunnel" "WARN" "$running_count/$total_count running"
else
pass "Cloudflare tunnel: all $total_count pods running"
json_add "cloudflare_tunnel" "PASS" "$total_count pods running"
fi
}
# --- Summary ---
print_summary() {
if [[ "$JSON" == true ]]; then
@@ -731,6 +1207,16 @@ main() {
check_ingresses
check_alerts
check_uptime_kuma
check_resourcequota
check_statefulsets
check_node_disk
check_helm_releases
check_kyverno
check_nfs
check_dns
check_tls_certs
check_gpu
check_cloudflare_tunnel
print_summary
# Exit code: 2 for failures, 1 for warnings, 0 for clean
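# A hypothetical consumer of these exit codes (sketch, not part of this commit):
#   ./scripts/cluster_healthcheck.sh --quiet
#   rc=$?
#   [[ $rc -eq 2 ]] && { echo "healthcheck FAIL" >&2; exit 1; }
#   [[ $rc -eq 1 ]] && echo "healthcheck WARN (continuing)"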