From 86d1d50ad0bb331bf69eb220d0e044fec0a83e51 Mon Sep 17 00:00:00 2001
From: Viktor Barzin
Date: Sat, 21 Feb 2026 23:57:04 +0000
Subject: [PATCH] [ci skip] Extend cluster healthcheck from 14 to 24 checks

Add 10 new checks covering gaps discovered during incident response:
ResourceQuota pressure, StatefulSets, node disk usage, Helm release health,
Kyverno policy engine, NFS connectivity, DNS resolution, TLS certificate
expiry, GPU health, and Cloudflare tunnel status.
---
 .claude/CLAUDE.md              |   2 +-
 scripts/cluster_healthcheck.sh | 492 ++++++++++++++++++++++++++++++++-
 2 files changed, 490 insertions(+), 4 deletions(-)

diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md
index c1e78bfe..75230420 100755
--- a/.claude/CLAUDE.md
+++ b/.claude/CLAUDE.md
@@ -179,7 +179,7 @@ kubectl get pods -A
 
 **Cluster Health Check** (`scripts/cluster_healthcheck.sh`):
 - **ALWAYS use this script** to check cluster health — whether the user asks explicitly, after deploying/updating services, or whenever you need to verify cluster state. Never use ad-hoc kubectl commands to assess overall cluster health; use the script instead.
-- Runs 14 checks: nodes, resources, conditions, pods, evicted, DaemonSets, deployments, PVCs, HPAs, CronJobs, CrowdSec, ingress, Prometheus alerts, Uptime Kuma
+- Runs 24 checks: nodes, resources, conditions, pods, evicted, DaemonSets, deployments, PVCs, HPAs, CronJobs, CrowdSec, ingress, Prometheus alerts, Uptime Kuma, ResourceQuota pressure, StatefulSets, node disk, Helm releases, Kyverno, NFS, DNS, TLS certs, GPU, Cloudflare tunnel
 - **When adding new healthchecks or monitoring**: Always update this script to validate the new component
 
 **Terraform target examples:**

diff --git a/scripts/cluster_healthcheck.sh b/scripts/cluster_healthcheck.sh
index 75095161..27211f7e 100755
--- a/scripts/cluster_healthcheck.sh
+++ b/scripts/cluster_healthcheck.sh
@@ -1,7 +1,7 @@
 #!/usr/bin/env bash
 # Cluster health check script.
-# Runs 14 diagnostic checks against the Kubernetes cluster and prints
+# Runs 24 diagnostic checks against the Kubernetes cluster and prints
 # a colour-coded report with PASS / WARN / FAIL for each section.
 #
 # Usage: ./scripts/cluster_healthcheck.sh [--fix] [--quiet|-q] [--json] [--kubeconfig <path>]
 #
@@ -26,6 +26,7 @@ JSON=false
 KUBECONFIG_PATH="$(pwd)/config"
 KUBECTL=""
 JSON_RESULTS=()
+TOTAL_CHECKS=24
 
 # --- Helpers ---
 info() { [[ "$JSON" == true ]] && return 0; echo -e "${BLUE}[INFO]${NC} $*"; }
@@ -38,14 +39,14 @@ section() {
   [[ "$JSON" == true ]] && return 0
   [[ "$QUIET" == true ]] && return 0
   echo ""
-  echo -e "${BOLD}[$num/14] $title${NC}"
+  echo -e "${BOLD}[$num/$TOTAL_CHECKS] $title${NC}"
 }
 
 section_always() {
   local num="$1" title="$2"
   [[ "$JSON" == true ]] && return 0
   echo ""
-  echo -e "${BOLD}[$num/14] $title${NC}"
+  echo -e "${BOLD}[$num/$TOTAL_CHECKS] $title${NC}"
 }
 
 json_add() {
@@ -665,6 +666,481 @@ except Exception as e:
   fi
 }
 
+# --- 15. ResourceQuota Pressure ---
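+# Worked example of the thresholds below (illustrative numbers, not from a
+# real cluster): a quota with hard pods=15 and used=13 sits at 13/15 = 87%,
+# which lands in the WARN band (>80%); used=15 would be 100% and FAIL (>95%).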
+check_resourcequota() {
+  section 15 "ResourceQuota Pressure"
+  local quotas detail="" status="PASS"
+
+  # An error from kubectl means quotas could not be inspected at all, which
+  # is not the same as having none configured, so surface it as a warning.
+  quotas=$($KUBECTL get resourcequota -A -o json 2>/dev/null) || { warn "Cannot list ResourceQuotas"; json_add "resourcequota" "WARN" "Cannot list quotas"; return 0; }
+
+  local pressure
+  pressure=$(echo "$quotas" | python3 -c '
+import json, sys
+
+def parse_cpu(val):
+    """Convert CPU value to millicores."""
+    val = str(val)
+    if val.endswith("m"):
+        return float(val[:-1])
+    return float(val) * 1000
+
+def parse_mem(val):
+    """Convert memory value to bytes."""
+    val = str(val)
+    units = {"Ki": 1024, "Mi": 1024**2, "Gi": 1024**3, "Ti": 1024**4}
+    for suffix, mult in units.items():
+        if val.endswith(suffix):
+            return float(val[:-len(suffix)]) * mult
+    # Plain bytes or numeric
+    return float(val)
+
+data = json.load(sys.stdin)
+for item in data.get("items", []):
+    ns = item["metadata"]["namespace"]
+    name = item["metadata"]["name"]
+    status = item.get("status", {})
+    hard = status.get("hard", {})
+    used = status.get("used", {})
+
+    for resource, hard_val in hard.items():
+        used_val = used.get(resource, "0")
+        try:
+            if "cpu" in resource:
+                h = parse_cpu(hard_val)
+                u = parse_cpu(used_val)
+            elif "memory" in resource or "storage" in resource:
+                h = parse_mem(hard_val)
+                u = parse_mem(used_val)
+            elif resource == "pods":
+                h = float(hard_val)
+                u = float(used_val)
+            else:
+                continue
+            if h <= 0:
+                continue
+            pct = (u / h) * 100
+            if pct > 80:
+                level = "FAIL" if pct > 95 else "WARN"
+                print(f"{level}:{ns}/{name}:{resource}:{pct:.0f}%")
+        except (ValueError, ZeroDivisionError):
+            pass
+' 2>/dev/null) || true
+
+  if [[ -z "$pressure" ]]; then
+    pass "All ResourceQuotas below 80% usage"
+    json_add "resourcequota" "PASS" "All below 80%"
+  else
+    [[ "$QUIET" == true ]] && section_always 15 "ResourceQuota Pressure"
+    while IFS= read -r line; do
+      local level ns_res resource pct
+      level=$(echo "$line" | cut -d: -f1)
+      ns_res=$(echo "$line" | cut -d: -f2)
+      resource=$(echo "$line" | cut -d: -f3)
+      pct=$(echo "$line" | cut -d: -f4)
+      if [[ "$level" == "FAIL" ]]; then
+        fail "$ns_res: $resource at $pct"
+        status="FAIL"
+      else
+        warn "$ns_res: $resource at $pct"
+        [[ "$status" != "FAIL" ]] && status="WARN"
+      fi
+      detail+="$ns_res $resource=$pct; "
+    done <<< "$pressure"
+    json_add "resourcequota" "$status" "$detail"
+  fi
+}
+
+# --- 16. StatefulSets ---
+check_statefulsets() {
+  section 16 "StatefulSets"
+  local sts detail="" had_issue=false
+
+  # kubectl exits non-zero on API errors but zero (with a stderr note) when
+  # there are simply no StatefulSets, so handle the two cases separately.
+  sts=$($KUBECTL get statefulsets -A --no-headers 2>&1) || { fail "Cannot list StatefulSets"; json_add "statefulsets" "FAIL" "Cannot list"; return 0; }
+  if [[ -z "$sts" || "$sts" == *"No resources found"* ]]; then
+    pass "No StatefulSets in cluster"
+    json_add "statefulsets" "PASS" "No StatefulSets"
+    return 0
+  fi
+
+  while IFS= read -r line; do
+    local ns name ready current desired
+    ns=$(echo "$line" | awk '{print $1}')
+    name=$(echo "$line" | awk '{print $2}')
+    ready=$(echo "$line" | awk '{print $3}')
+    current=$(echo "$ready" | cut -d/ -f1)
+    desired=$(echo "$ready" | cut -d/ -f2)
+
+    if [[ "$current" != "$desired" ]]; then
+      [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 16 "StatefulSets"
+      fail "$ns/$name: $current/$desired ready"
+      detail+="$ns/$name $current/$desired; "
+      had_issue=true
+    fi
+  done <<< "$sts"
+
+  if [[ "$had_issue" == false ]]; then
+    pass "All StatefulSets fully available"
+    json_add "statefulsets" "PASS" "All available"
+  else
+    json_add "statefulsets" "FAIL" "$detail"
+  fi
+}
+
+# --- 17. Node Disk Usage ---
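+# The kubelet stats summary consumed below can also be inspected by hand when
+# debugging (sketch; needs get permission on nodes/proxy, and <node> is a
+# placeholder for a real node name):
+#   kubectl get --raw /api/v1/nodes/<node>/proxy/stats/summary | python3 -m json.tool
+# The node-level filesystem figures live under .node.fs.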
+check_node_disk() {
+  section 17 "Node Disk Usage"
+  local nodes detail="" status="PASS"
+
+  nodes=$($KUBECTL get nodes -o name 2>/dev/null) || { fail "Cannot get node info"; json_add "node_disk" "FAIL" "Cannot get nodes"; return 0; }
+
+  # Capacity minus allocatable only reflects the kubelet's static system
+  # reservation, not actual consumption, so read real filesystem usage from
+  # each kubelet's stats summary via the API server proxy.
+  local disk_info="" node line
+  for node in $nodes; do
+    node="${node#node/}"
+    line=$($KUBECTL get --raw "/api/v1/nodes/$node/proxy/stats/summary" 2>/dev/null | python3 -c '
+import json, sys
+
+data = json.load(sys.stdin)
+fs = data.get("node", {}).get("fs", {})
+cap = float(fs.get("capacityBytes", 0))
+used = float(fs.get("usedBytes", 0))
+if cap > 0:
+    pct = (used / cap) * 100
+    if pct > 80:
+        level = "FAIL" if pct > 90 else "WARN"
+        print(f"{level}:{sys.argv[1]}:{pct:.0f}")
+' "$node" 2>/dev/null) || true
+    [[ -n "$line" ]] && disk_info+="$line"$'\n'
+  done
+  disk_info=${disk_info%$'\n'}
+
+  if [[ -z "$disk_info" ]]; then
+    pass "All nodes below 80% node filesystem usage"
+    json_add "node_disk" "PASS" "All below 80%"
+  else
+    [[ "$QUIET" == true ]] && section_always 17 "Node Disk Usage"
+    while IFS= read -r line; do
+      local level node_name pct
+      level=$(echo "$line" | cut -d: -f1)
+      node_name=$(echo "$line" | cut -d: -f2)
+      pct=$(echo "$line" | cut -d: -f3)
+      if [[ "$level" == "FAIL" ]]; then
+        fail "$node_name: node filesystem at ${pct}%"
+        status="FAIL"
+      else
+        warn "$node_name: node filesystem at ${pct}%"
+        [[ "$status" != "FAIL" ]] && status="WARN"
+      fi
+      detail+="$node_name=${pct}%; "
+    done <<< "$disk_info"
+    json_add "node_disk" "$status" "$detail"
+  fi
+}
+
+# --- 18. Helm Release Health ---
+check_helm_releases() {
+  section 18 "Helm Release Health"
+  local releases detail="" status="PASS"
+
+  releases=$(helm list -A --kubeconfig "$KUBECONFIG_PATH" --all -o json 2>/dev/null) || {
+    [[ "$QUIET" == true ]] && section_always 18 "Helm Release Health"
+    warn "Cannot list Helm releases"
+    json_add "helm_releases" "WARN" "Cannot list"
+    return 0
+  }
+
+  local bad_releases
+  bad_releases=$(echo "$releases" | python3 -c '
+import json, sys
+
+data = json.load(sys.stdin)
+for r in data:
+    name = r.get("name", "?")
+    ns = r.get("namespace", "?")
+    st = r.get("status", "unknown")
+    if st != "deployed":
+        level = "FAIL" if st.startswith("pending") else "WARN"
+        print(f"{level}:{ns}/{name}:{st}")
+' 2>/dev/null) || true
+
+  if [[ -z "$bad_releases" ]]; then
+    pass "All Helm releases in deployed state"
+    json_add "helm_releases" "PASS" "All deployed"
+  else
+    [[ "$QUIET" == true ]] && section_always 18 "Helm Release Health"
+    while IFS= read -r line; do
+      local level release_name release_status
+      level=$(echo "$line" | cut -d: -f1)
+      release_name=$(echo "$line" | cut -d: -f2)
+      release_status=$(echo "$line" | cut -d: -f3)
+      if [[ "$level" == "FAIL" ]]; then
+        fail "Helm release $release_name: $release_status (blocks terraform)"
+        status="FAIL"
+      else
+        warn "Helm release $release_name: $release_status"
+        [[ "$status" != "FAIL" ]] && status="WARN"
+      fi
+      detail+="$release_name=$release_status; "
+    done <<< "$bad_releases"
+    json_add "helm_releases" "$status" "$detail"
+  fi
+}
+
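+# A release stuck in a pending-* state holds the release lock and usually
+# needs a manual rollback to the last good revision before terraform can
+# reconcile again (sketch; release and namespace come from the message above):
+#   helm rollback <release> --namespace <ns> --kubeconfig ./config
+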
+# --- 19. Kyverno Policy Engine ---
+check_kyverno() {
+  section 19 "Kyverno Policy Engine"
+  local kv_pods not_running
+
+  kv_pods=$($KUBECTL get pods -n kyverno --no-headers 2>/dev/null || true)
+  if [[ -z "$kv_pods" ]]; then
+    [[ "$QUIET" == true ]] && section_always 19 "Kyverno Policy Engine"
+    fail "Kyverno namespace not found or empty — policy engine down, cascading cluster impact"
+    json_add "kyverno" "FAIL" "No Kyverno pods found"
+    return 0
+  fi
+
+  not_running=$(echo "$kv_pods" | awk '$3 != "Running" && $3 != "Completed" {print $1 ": " $3}' || true)
+  if [[ -n "$not_running" ]]; then
+    [[ "$QUIET" == true ]] && section_always 19 "Kyverno Policy Engine"
+    while IFS= read -r line; do
+      fail "Kyverno pod not running: $line"
+    done <<< "$not_running"
+    json_add "kyverno" "FAIL" "$not_running"
+  else
+    local total
+    total=$(count_lines "$kv_pods")
+    pass "All $total Kyverno pods running"
+    json_add "kyverno" "PASS" "$total pods running"
+  fi
+}
+
+# --- 20. NFS Connectivity ---
+check_nfs() {
+  section 20 "NFS Connectivity"
+
+  if showmount -e 10.0.10.15 &>/dev/null; then
+    pass "NFS server 10.0.10.15 reachable (exports listed)"
+    json_add "nfs" "PASS" "NFS reachable"
+  # -w is the portable connect timeout; -G is macOS/BSD-only
+  elif nc -z -w 3 10.0.10.15 2049 &>/dev/null; then
+    pass "NFS server 10.0.10.15 port 2049 open"
+    json_add "nfs" "PASS" "NFS port open"
+  else
+    [[ "$QUIET" == true ]] && section_always 20 "NFS Connectivity"
+    fail "NFS server 10.0.10.15 unreachable — 30+ services depend on NFS"
+    json_add "nfs" "FAIL" "NFS unreachable"
+  fi
+}
+
+# --- 21. DNS Resolution ---
+check_dns() {
+  section 21 "DNS Resolution"
+  local internal_ok=false external_ok=false detail=""
+
+  # dig exits 0 whenever the server answers, even for NXDOMAIN, so test for
+  # a non-empty answer rather than the exit code.
+  if [[ -n "$(dig @10.0.20.101 viktorbarzin.me +short +time=3 +tries=1 2>/dev/null)" ]]; then
+    internal_ok=true
+  fi
+  if [[ -n "$(dig @10.0.20.101 google.com +short +time=3 +tries=1 2>/dev/null)" ]]; then
+    external_ok=true
+  fi
+
+  if [[ "$internal_ok" == true && "$external_ok" == true ]]; then
+    pass "DNS resolves both internal (viktorbarzin.me) and external (google.com)"
+    json_add "dns" "PASS" "Both resolve"
+  elif [[ "$internal_ok" == true || "$external_ok" == true ]]; then
+    [[ "$QUIET" == true ]] && section_always 21 "DNS Resolution"
+    if [[ "$internal_ok" == false ]]; then
+      warn "DNS: internal (viktorbarzin.me) failed, external (google.com) OK"
+      detail="Internal failed"
+    else
+      warn "DNS: internal (viktorbarzin.me) OK, external (google.com) failed"
+      detail="External failed"
+    fi
+    json_add "dns" "WARN" "$detail"
+  else
+    [[ "$QUIET" == true ]] && section_always 21 "DNS Resolution"
+    fail "DNS server 10.0.20.101 not resolving — both internal and external lookups failed"
+    json_add "dns" "FAIL" "Both failed"
+  fi
+}
+
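+# Manual spot-check for the resolver exercised above (illustrative):
+#   dig @10.0.20.101 viktorbarzin.me +short   # healthy: prints >=1 record
+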
+# --- 22. TLS Certificate Expiry ---
+check_tls_certs() {
+  section 22 "TLS Certificate Expiry"
+  local secrets detail="" status="PASS"
+
+  secrets=$($KUBECTL get secrets -A -o json 2>/dev/null) || {
+    [[ "$QUIET" == true ]] && section_always 22 "TLS Certificate Expiry"
+    warn "Cannot list secrets"
+    json_add "tls_certs" "WARN" "Cannot list secrets"
+    return 0
+  }
+
+  local cert_issues
+  cert_issues=$(echo "$secrets" | python3 -c '
+import json, sys, base64, subprocess, hashlib
+from datetime import datetime, timezone
+
+data = json.load(sys.stdin)
+seen_fingerprints = set()
+
+for item in data.get("items", []):
+    if item.get("type") != "kubernetes.io/tls":
+        continue
+    ns = item["metadata"]["namespace"]
+    name = item["metadata"]["name"]
+    cert_data = item.get("data", {}).get("tls.crt", "")
+    if not cert_data:
+        continue
+
+    # Deduplicate by cert fingerprint so the same cert copied into several
+    # namespaces is only reported once
+    raw = base64.b64decode(cert_data)
+    fp = hashlib.sha256(raw).hexdigest()[:16]
+    if fp in seen_fingerprints:
+        continue
+    seen_fingerprints.add(fp)
+
+    # Parse certificate expiry with openssl
+    try:
+        result = subprocess.run(
+            ["openssl", "x509", "-noout", "-enddate"],
+            input=raw, capture_output=True, timeout=5
+        )
+        output = result.stdout.decode()
+        for line in output.splitlines():
+            if line.startswith("notAfter="):
+                date_str = line.split("=", 1)[1]
+                # Parse openssl date format: "Mon DD HH:MM:SS YYYY GMT"
+                try:
+                    expiry = datetime.strptime(date_str.strip(), "%b %d %H:%M:%S %Y %Z")
+                    expiry = expiry.replace(tzinfo=timezone.utc)
+                    days_left = (expiry - datetime.now(timezone.utc)).days
+                    if days_left <= 7:
+                        print(f"FAIL:{ns}/{name}:{days_left}d")
+                    elif days_left <= 30:
+                        print(f"WARN:{ns}/{name}:{days_left}d")
+                except ValueError:
+                    pass
+    except Exception:
+        pass
+' 2>/dev/null) || true
+
+  if [[ -z "$cert_issues" ]]; then
+    pass "All TLS certificates valid for >30 days"
+    json_add "tls_certs" "PASS" "All valid >30d"
+  else
+    [[ "$QUIET" == true ]] && section_always 22 "TLS Certificate Expiry"
+    while IFS= read -r line; do
+      local level cert_name days
+      level=$(echo "$line" | cut -d: -f1)
+      cert_name=$(echo "$line" | cut -d: -f2)
+      days=$(echo "$line" | cut -d: -f3)
+      if [[ "$level" == "FAIL" ]]; then
+        fail "TLS cert $cert_name expires in $days"
+        status="FAIL"
+      else
+        warn "TLS cert $cert_name expires in $days"
+        [[ "$status" != "FAIL" ]] && status="WARN"
+      fi
+      detail+="$cert_name=$days; "
+    done <<< "$cert_issues"
+    json_add "tls_certs" "$status" "$detail"
+  fi
+}
+
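+# To inspect a flagged certificate by hand (sketch; substitute the namespace
+# and secret name from the message above; decode flag is -D on older macOS):
+#   kubectl get secret -n <ns> <name> -o jsonpath='{.data.tls\.crt}' \
+#     | base64 -d | openssl x509 -noout -enddate -issuer -subject
+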
+# --- 23. GPU Health ---
+check_gpu() {
+  section 23 "GPU Health"
+  local gpu_pods
+
+  gpu_pods=$($KUBECTL get pods -n nvidia --no-headers 2>/dev/null || true)
+  if [[ -z "$gpu_pods" ]]; then
+    [[ "$QUIET" == true ]] && section_always 23 "GPU Health"
+    warn "NVIDIA namespace not found or empty"
+    json_add "gpu" "WARN" "No GPU pods found"
+    return 0
+  fi
+
+  # Check specifically for the device-plugin (critical for GPU scheduling)
+  local device_plugin_down=false
+  local other_down=false
+  local detail=""
+
+  while IFS= read -r line; do
+    local pod_name pod_status
+    pod_name=$(echo "$line" | awk '{print $1}')
+    pod_status=$(echo "$line" | awk '{print $3}')
+    if [[ "$pod_status" != "Running" && "$pod_status" != "Completed" ]]; then
+      if echo "$pod_name" | grep -q "device-plugin"; then
+        device_plugin_down=true
+        detail+="device-plugin $pod_name: $pod_status; "
+      else
+        other_down=true
+        detail+="$pod_name: $pod_status; "
+      fi
+    fi
+  done <<< "$gpu_pods"
+
+  if [[ "$device_plugin_down" == true ]]; then
+    [[ "$QUIET" == true ]] && section_always 23 "GPU Health"
+    fail "GPU device-plugin is down — GPU workloads cannot schedule"
+    json_add "gpu" "FAIL" "$detail"
+  elif [[ "$other_down" == true ]]; then
+    [[ "$QUIET" == true ]] && section_always 23 "GPU Health"
+    warn "Some GPU pods not running: $detail"
+    json_add "gpu" "WARN" "$detail"
+  else
+    local total
+    total=$(count_lines "$gpu_pods")
+    pass "All $total GPU pods running"
+    json_add "gpu" "PASS" "$total pods running"
+  fi
+}
+
+# --- 24. Cloudflare Tunnel ---
+check_cloudflare_tunnel() {
+  section 24 "Cloudflare Tunnel"
+  local cf_pods running_count total_count
+
+  cf_pods=$($KUBECTL get pods -n cloudflared --no-headers 2>/dev/null || true)
+  if [[ -z "$cf_pods" ]]; then
+    [[ "$QUIET" == true ]] && section_always 24 "Cloudflare Tunnel"
+    fail "Cloudflare tunnel namespace not found or empty — external access broken"
+    json_add "cloudflare_tunnel" "FAIL" "No pods found"
+    return 0
+  fi
+
+  total_count=$(count_lines "$cf_pods")
+  running_count=$(echo "$cf_pods" | awk '$3 == "Running"' | wc -l | tr -d ' ')
+
+  if [[ "$running_count" -eq 0 ]]; then
+    [[ "$QUIET" == true ]] && section_always 24 "Cloudflare Tunnel"
+    fail "Cloudflare tunnel: 0/$total_count pods running — external access broken"
+    json_add "cloudflare_tunnel" "FAIL" "0/$total_count running"
+  elif [[ "$running_count" -lt "$total_count" ]]; then
+    [[ "$QUIET" == true ]] && section_always 24 "Cloudflare Tunnel"
+    warn "Cloudflare tunnel: $running_count/$total_count pods running (degraded)"
+    json_add "cloudflare_tunnel" "WARN" "$running_count/$total_count running"
+  else
+    pass "Cloudflare tunnel: all $total_count pods running"
+    json_add "cloudflare_tunnel" "PASS" "$total_count pods running"
+  fi
+}
+
 # --- Summary ---
 print_summary() {
   if [[ "$JSON" == true ]]; then
@@ -731,6 +1207,16 @@ main() {
   check_ingresses
   check_alerts
   check_uptime_kuma
+  check_resourcequota
+  check_statefulsets
+  check_node_disk
+  check_helm_releases
+  check_kyverno
+  check_nfs
+  check_dns
+  check_tls_certs
+  check_gpu
+  check_cloudflare_tunnel
   print_summary
 
   # Exit code: 2 for failures, 1 for warnings, 0 for clean