From 86d1d50ad0bb331bf69eb220d0e044fec0a83e51 Mon Sep 17 00:00:00 2001
From: Viktor Barzin
Date: Sat, 21 Feb 2026 23:57:04 +0000
Subject: [PATCH] [ci skip] Extend cluster healthcheck from 14 to 24 checks

Add 10 new checks covering gaps discovered during incident response:
ResourceQuota pressure, StatefulSets, node disk usage, Helm release health,
Kyverno policy engine, NFS connectivity, DNS resolution, TLS certificate
expiry, GPU health, and Cloudflare tunnel status.
---
 .claude/CLAUDE.md              |   2 +-
 scripts/cluster_healthcheck.sh | 492 ++++++++++++++++++++++++++++++++-
 2 files changed, 490 insertions(+), 4 deletions(-)

diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md
index c1e78bfe..75230420 100755
--- a/.claude/CLAUDE.md
+++ b/.claude/CLAUDE.md
@@ -179,7 +179,7 @@ kubectl get pods -A
 
 **Cluster Health Check** (`scripts/cluster_healthcheck.sh`):
 - **ALWAYS use this script** to check cluster health — whether the user asks explicitly, after deploying/updating services, or whenever you need to verify cluster state. Never use ad-hoc kubectl commands to assess overall cluster health; use the script instead.
-- Runs 14 checks: nodes, resources, conditions, pods, evicted, DaemonSets, deployments, PVCs, HPAs, CronJobs, CrowdSec, ingress, Prometheus alerts, Uptime Kuma
+- Runs 24 checks: nodes, resources, conditions, pods, evicted, DaemonSets, deployments, PVCs, HPAs, CronJobs, CrowdSec, ingress, Prometheus alerts, Uptime Kuma, ResourceQuota pressure, StatefulSets, node disk, Helm releases, Kyverno, NFS, DNS, TLS certs, GPU, Cloudflare tunnel
 - **When adding new healthchecks or monitoring**: Always update this script to validate the new component
 
 **Terraform target examples:**

diff --git a/scripts/cluster_healthcheck.sh b/scripts/cluster_healthcheck.sh
index 75095161..27211f7e 100755
--- a/scripts/cluster_healthcheck.sh
+++ b/scripts/cluster_healthcheck.sh
@@ -1,7 +1,7 @@
 #!/usr/bin/env bash
 # Cluster health check script.
-# Runs 14 diagnostic checks against the Kubernetes cluster and prints
+# Runs 24 diagnostic checks against the Kubernetes cluster and prints
 # a colour-coded report with PASS / WARN / FAIL for each section.
 #
 # Usage: ./scripts/cluster_healthcheck.sh [--fix] [--quiet|-q] [--json] [--kubeconfig <path>]
 #
@@ -26,6 +26,7 @@ JSON=false
 KUBECONFIG_PATH="$(pwd)/config"
 KUBECTL=""
 JSON_RESULTS=()
+TOTAL_CHECKS=24
 
 # --- Helpers ---
 info() { [[ "$JSON" == true ]] && return 0; echo -e "${BLUE}[INFO]${NC} $*"; }
@@ -38,14 +39,14 @@ section() {
   [[ "$JSON" == true ]] && return 0
   [[ "$QUIET" == true ]] && return 0
   echo ""
-  echo -e "${BOLD}[$num/14] $title${NC}"
+  echo -e "${BOLD}[$num/$TOTAL_CHECKS] $title${NC}"
 }
 
 section_always() {
   local num="$1" title="$2"
   [[ "$JSON" == true ]] && return 0
   echo ""
-  echo -e "${BOLD}[$num/14] $title${NC}"
+  echo -e "${BOLD}[$num/$TOTAL_CHECKS] $title${NC}"
 }
 
 json_add() {
@@ -665,6 +666,481 @@ except Exception as e:
   fi
 }
 
+# --- 15. ResourceQuota Pressure ---
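+# Worked example of the thresholds below (illustrative numbers, not from a
+# real cluster): a quota with hard pods=15 and used=13 sits at 13/15 = 87%,
+# which lands in the WARN band (>80%); used=15 would be 100% and FAIL (>95%).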
+check_resourcequota() {
+  section 15 "ResourceQuota Pressure"
+  local quotas detail="" status="PASS"
+
+  # An error from kubectl means quotas could not be inspected at all, which
+  # is not the same as having none configured, so surface it as a warning.
+  quotas=$($KUBECTL get resourcequota -A -o json 2>/dev/null) || { warn "Cannot list ResourceQuotas"; json_add "resourcequota" "WARN" "Cannot list quotas"; return 0; }
+
+  local pressure
+  pressure=$(echo "$quotas" | python3 -c '
+import json, sys
+
+def parse_cpu(val):
+    """Convert CPU value to millicores."""
+    val = str(val)
+    if val.endswith("m"):
+        return float(val[:-1])
+    return float(val) * 1000
+
+def parse_mem(val):
+    """Convert memory value to bytes."""
+    val = str(val)
+    units = {"Ki": 1024, "Mi": 1024**2, "Gi": 1024**3, "Ti": 1024**4}
+    for suffix, mult in units.items():
+        if val.endswith(suffix):
+            return float(val[:-len(suffix)]) * mult
+    # Plain bytes or numeric
+    return float(val)
+
+data = json.load(sys.stdin)
+for item in data.get("items", []):
+    ns = item["metadata"]["namespace"]
+    name = item["metadata"]["name"]
+    status = item.get("status", {})
+    hard = status.get("hard", {})
+    used = status.get("used", {})
+
+    for resource, hard_val in hard.items():
+        used_val = used.get(resource, "0")
+        try:
+            if "cpu" in resource:
+                h = parse_cpu(hard_val)
+                u = parse_cpu(used_val)
+            elif "memory" in resource or "storage" in resource:
+                h = parse_mem(hard_val)
+                u = parse_mem(used_val)
+            elif resource == "pods":
+                h = float(hard_val)
+                u = float(used_val)
+            else:
+                continue
+            if h <= 0:
+                continue
+            pct = (u / h) * 100
+            if pct > 80:
+                level = "FAIL" if pct > 95 else "WARN"
+                print(f"{level}:{ns}/{name}:{resource}:{pct:.0f}%")
+        except (ValueError, ZeroDivisionError):
+            pass
+' 2>/dev/null) || true
+
+  if [[ -z "$pressure" ]]; then
+    pass "All ResourceQuotas below 80% usage"
+    json_add "resourcequota" "PASS" "All below 80%"
+  else
+    [[ "$QUIET" == true ]] && section_always 15 "ResourceQuota Pressure"
+    while IFS= read -r line; do
+      local level ns_res resource pct
+      level=$(echo "$line" | cut -d: -f1)
+      ns_res=$(echo "$line" | cut -d: -f2)
+      resource=$(echo "$line" | cut -d: -f3)
+      pct=$(echo "$line" | cut -d: -f4)
+      if [[ "$level" == "FAIL" ]]; then
+        fail "$ns_res: $resource at $pct"
+        status="FAIL"
+      else
+        warn "$ns_res: $resource at $pct"
+        [[ "$status" != "FAIL" ]] && status="WARN"
+      fi
+      detail+="$ns_res $resource=$pct; "
+    done <<< "$pressure"
+    json_add "resourcequota" "$status" "$detail"
+  fi
+}
+
+# --- 16. StatefulSets ---
+check_statefulsets() {
+  section 16 "StatefulSets"
+  local sts detail="" had_issue=false
+
+  # kubectl exits non-zero on API errors but zero (with a stderr note) when
+  # there are simply no StatefulSets, so handle the two cases separately.
+  sts=$($KUBECTL get statefulsets -A --no-headers 2>&1) || { fail "Cannot list StatefulSets"; json_add "statefulsets" "FAIL" "Cannot list"; return 0; }
+  if [[ -z "$sts" || "$sts" == *"No resources found"* ]]; then
+    pass "No StatefulSets in cluster"
+    json_add "statefulsets" "PASS" "No StatefulSets"
+    return 0
+  fi
+
+  while IFS= read -r line; do
+    local ns name ready current desired
+    ns=$(echo "$line" | awk '{print $1}')
+    name=$(echo "$line" | awk '{print $2}')
+    ready=$(echo "$line" | awk '{print $3}')
+    current=$(echo "$ready" | cut -d/ -f1)
+    desired=$(echo "$ready" | cut -d/ -f2)
+
+    if [[ "$current" != "$desired" ]]; then
+      [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 16 "StatefulSets"
+      fail "$ns/$name: $current/$desired ready"
+      detail+="$ns/$name $current/$desired; "
+      had_issue=true
+    fi
+  done <<< "$sts"
+
+  if [[ "$had_issue" == false ]]; then
+    pass "All StatefulSets fully available"
+    json_add "statefulsets" "PASS" "All available"
+  else
+    json_add "statefulsets" "FAIL" "$detail"
+  fi
+}
+
+# --- 17. Node Disk Usage ---
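+# The kubelet stats summary consumed below can also be inspected by hand when
+# debugging (sketch; needs get permission on nodes/proxy, and <node> is a
+# placeholder for a real node name):
+#   kubectl get --raw /api/v1/nodes/<node>/proxy/stats/summary | python3 -m json.tool
+# The node-level filesystem figures live under .node.fs.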
+check_node_disk() {
+  section 17 "Node Disk Usage"
+  local nodes detail="" status="PASS"
+
+  nodes=$($KUBECTL get nodes -o name 2>/dev/null) || { fail "Cannot get node info"; json_add "node_disk" "FAIL" "Cannot get nodes"; return 0; }
+
+  # Capacity minus allocatable only reflects the kubelet's static system
+  # reservation, not actual consumption, so read real filesystem usage from
+  # each kubelet's stats summary via the API server proxy.
+  local disk_info="" node line
+  for node in $nodes; do
+    node="${node#node/}"
+    line=$($KUBECTL get --raw "/api/v1/nodes/$node/proxy/stats/summary" 2>/dev/null | python3 -c '
+import json, sys
+
+data = json.load(sys.stdin)
+fs = data.get("node", {}).get("fs", {})
+cap = float(fs.get("capacityBytes", 0))
+used = float(fs.get("usedBytes", 0))
+if cap > 0:
+    pct = (used / cap) * 100
+    if pct > 80:
+        level = "FAIL" if pct > 90 else "WARN"
+        print(f"{level}:{sys.argv[1]}:{pct:.0f}")
+' "$node" 2>/dev/null) || true
+    [[ -n "$line" ]] && disk_info+="$line"$'\n'
+  done
+  disk_info=${disk_info%$'\n'}
+
+  if [[ -z "$disk_info" ]]; then
+    pass "All nodes below 80% node filesystem usage"
+    json_add "node_disk" "PASS" "All below 80%"
+  else
+    [[ "$QUIET" == true ]] && section_always 17 "Node Disk Usage"
+    while IFS= read -r line; do
+      local level node_name pct
+      level=$(echo "$line" | cut -d: -f1)
+      node_name=$(echo "$line" | cut -d: -f2)
+      pct=$(echo "$line" | cut -d: -f3)
+      if [[ "$level" == "FAIL" ]]; then
+        fail "$node_name: node filesystem at ${pct}%"
+        status="FAIL"
+      else
+        warn "$node_name: node filesystem at ${pct}%"
+        [[ "$status" != "FAIL" ]] && status="WARN"
+      fi
+      detail+="$node_name=${pct}%; "
+    done <<< "$disk_info"
+    json_add "node_disk" "$status" "$detail"
+  fi
+}
+
+# --- 18. Helm Release Health ---
+check_helm_releases() {
+  section 18 "Helm Release Health"
+  local releases detail="" status="PASS"
+
+  releases=$(helm list -A --kubeconfig "$KUBECONFIG_PATH" --all -o json 2>/dev/null) || {
+    [[ "$QUIET" == true ]] && section_always 18 "Helm Release Health"
+    warn "Cannot list Helm releases"
+    json_add "helm_releases" "WARN" "Cannot list"
+    return 0
+  }
+
+  local bad_releases
+  bad_releases=$(echo "$releases" | python3 -c '
+import json, sys
+
+data = json.load(sys.stdin)
+for r in data:
+    name = r.get("name", "?")
+    ns = r.get("namespace", "?")
+    st = r.get("status", "unknown")
+    if st != "deployed":
+        level = "FAIL" if st.startswith("pending") else "WARN"
+        print(f"{level}:{ns}/{name}:{st}")
+' 2>/dev/null) || true
+
+  if [[ -z "$bad_releases" ]]; then
+    pass "All Helm releases in deployed state"
+    json_add "helm_releases" "PASS" "All deployed"
+  else
+    [[ "$QUIET" == true ]] && section_always 18 "Helm Release Health"
+    while IFS= read -r line; do
+      local level release_name release_status
+      level=$(echo "$line" | cut -d: -f1)
+      release_name=$(echo "$line" | cut -d: -f2)
+      release_status=$(echo "$line" | cut -d: -f3)
+      if [[ "$level" == "FAIL" ]]; then
+        fail "Helm release $release_name: $release_status (blocks terraform)"
+        status="FAIL"
+      else
+        warn "Helm release $release_name: $release_status"
+        [[ "$status" != "FAIL" ]] && status="WARN"
+      fi
+      detail+="$release_name=$release_status; "
+    done <<< "$bad_releases"
+    json_add "helm_releases" "$status" "$detail"
+  fi
+}
+
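+# A release stuck in a pending-* state holds the release lock and usually
+# needs a manual rollback to the last good revision before terraform can
+# reconcile again (sketch; release and namespace come from the message above):
+#   helm rollback <release> --namespace <ns> --kubeconfig ./config
+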
+# --- 19. Kyverno Policy Engine ---
+check_kyverno() {
+  section 19 "Kyverno Policy Engine"
+  local kv_pods not_running
+
+  kv_pods=$($KUBECTL get pods -n kyverno --no-headers 2>/dev/null || true)
+  if [[ -z "$kv_pods" ]]; then
+    [[ "$QUIET" == true ]] && section_always 19 "Kyverno Policy Engine"
+    fail "Kyverno namespace not found or empty — policy engine down, cascading cluster impact"
+    json_add "kyverno" "FAIL" "No Kyverno pods found"
+    return 0
+  fi
+
+  not_running=$(echo "$kv_pods" | awk '$3 != "Running" && $3 != "Completed" {print $1 ": " $3}' || true)
+  if [[ -n "$not_running" ]]; then
+    [[ "$QUIET" == true ]] && section_always 19 "Kyverno Policy Engine"
+    while IFS= read -r line; do
+      fail "Kyverno pod not running: $line"
+    done <<< "$not_running"
+    json_add "kyverno" "FAIL" "$not_running"
+  else
+    local total
+    total=$(count_lines "$kv_pods")
+    pass "All $total Kyverno pods running"
+    json_add "kyverno" "PASS" "$total pods running"
+  fi
+}
+
+# --- 20. NFS Connectivity ---
+check_nfs() {
+  section 20 "NFS Connectivity"
+
+  if showmount -e 10.0.10.15 &>/dev/null; then
+    pass "NFS server 10.0.10.15 reachable (exports listed)"
+    json_add "nfs" "PASS" "NFS reachable"
+  # -w is the portable connect timeout; -G is macOS/BSD-only
+  elif nc -z -w 3 10.0.10.15 2049 &>/dev/null; then
+    pass "NFS server 10.0.10.15 port 2049 open"
+    json_add "nfs" "PASS" "NFS port open"
+  else
+    [[ "$QUIET" == true ]] && section_always 20 "NFS Connectivity"
+    fail "NFS server 10.0.10.15 unreachable — 30+ services depend on NFS"
+    json_add "nfs" "FAIL" "NFS unreachable"
+  fi
+}
+
+# --- 21. DNS Resolution ---
+check_dns() {
+  section 21 "DNS Resolution"
+  local internal_ok=false external_ok=false detail=""
+
+  # dig exits 0 whenever the server answers, even for NXDOMAIN, so test for
+  # a non-empty answer rather than the exit code.
+  if [[ -n "$(dig @10.0.20.101 viktorbarzin.me +short +time=3 +tries=1 2>/dev/null)" ]]; then
+    internal_ok=true
+  fi
+  if [[ -n "$(dig @10.0.20.101 google.com +short +time=3 +tries=1 2>/dev/null)" ]]; then
+    external_ok=true
+  fi
+
+  if [[ "$internal_ok" == true && "$external_ok" == true ]]; then
+    pass "DNS resolves both internal (viktorbarzin.me) and external (google.com)"
+    json_add "dns" "PASS" "Both resolve"
+  elif [[ "$internal_ok" == true || "$external_ok" == true ]]; then
+    [[ "$QUIET" == true ]] && section_always 21 "DNS Resolution"
+    if [[ "$internal_ok" == false ]]; then
+      warn "DNS: internal (viktorbarzin.me) failed, external (google.com) OK"
+      detail="Internal failed"
+    else
+      warn "DNS: internal (viktorbarzin.me) OK, external (google.com) failed"
+      detail="External failed"
+    fi
+    json_add "dns" "WARN" "$detail"
+  else
+    [[ "$QUIET" == true ]] && section_always 21 "DNS Resolution"
+    fail "DNS server 10.0.20.101 not resolving — both internal and external lookups failed"
+    json_add "dns" "FAIL" "Both failed"
+  fi
+}
+
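+# Manual spot-check for the resolver exercised above (illustrative):
+#   dig @10.0.20.101 viktorbarzin.me +short   # healthy: prints >=1 record
+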
+# --- 22. TLS Certificate Expiry ---
+check_tls_certs() {
+  section 22 "TLS Certificate Expiry"
+  local secrets detail="" status="PASS"
+
+  secrets=$($KUBECTL get secrets -A -o json 2>/dev/null) || {
+    [[ "$QUIET" == true ]] && section_always 22 "TLS Certificate Expiry"
+    warn "Cannot list secrets"
+    json_add "tls_certs" "WARN" "Cannot list secrets"
+    return 0
+  }
+
+  local cert_issues
+  cert_issues=$(echo "$secrets" | python3 -c '
+import json, sys, base64, subprocess, hashlib
+from datetime import datetime, timezone
+
+data = json.load(sys.stdin)
+seen_fingerprints = set()
+
+for item in data.get("items", []):
+    if item.get("type") != "kubernetes.io/tls":
+        continue
+    ns = item["metadata"]["namespace"]
+    name = item["metadata"]["name"]
+    cert_data = item.get("data", {}).get("tls.crt", "")
+    if not cert_data:
+        continue
+
+    # Deduplicate by cert fingerprint so the same cert copied into several
+    # namespaces is only reported once
+    raw = base64.b64decode(cert_data)
+    fp = hashlib.sha256(raw).hexdigest()[:16]
+    if fp in seen_fingerprints:
+        continue
+    seen_fingerprints.add(fp)
+
+    # Parse certificate expiry with openssl
+    try:
+        result = subprocess.run(
+            ["openssl", "x509", "-noout", "-enddate"],
+            input=raw, capture_output=True, timeout=5
+        )
+        output = result.stdout.decode()
+        for line in output.splitlines():
+            if line.startswith("notAfter="):
+                date_str = line.split("=", 1)[1]
+                # Parse openssl date format: "Mon DD HH:MM:SS YYYY GMT"
+                try:
+                    expiry = datetime.strptime(date_str.strip(), "%b %d %H:%M:%S %Y %Z")
+                    expiry = expiry.replace(tzinfo=timezone.utc)
+                    days_left = (expiry - datetime.now(timezone.utc)).days
+                    if days_left <= 7:
+                        print(f"FAIL:{ns}/{name}:{days_left}d")
+                    elif days_left <= 30:
+                        print(f"WARN:{ns}/{name}:{days_left}d")
+                except ValueError:
+                    pass
+    except Exception:
+        pass
+' 2>/dev/null) || true
+
+  if [[ -z "$cert_issues" ]]; then
+    pass "All TLS certificates valid for >30 days"
+    json_add "tls_certs" "PASS" "All valid >30d"
+  else
+    [[ "$QUIET" == true ]] && section_always 22 "TLS Certificate Expiry"
+    while IFS= read -r line; do
+      local level cert_name days
+      level=$(echo "$line" | cut -d: -f1)
+      cert_name=$(echo "$line" | cut -d: -f2)
+      days=$(echo "$line" | cut -d: -f3)
+      if [[ "$level" == "FAIL" ]]; then
+        fail "TLS cert $cert_name expires in $days"
+        status="FAIL"
+      else
+        warn "TLS cert $cert_name expires in $days"
+        [[ "$status" != "FAIL" ]] && status="WARN"
+      fi
+      detail+="$cert_name=$days; "
+    done <<< "$cert_issues"
+    json_add "tls_certs" "$status" "$detail"
+  fi
+}
+
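+# To inspect a flagged certificate by hand (sketch; substitute the namespace
+# and secret name from the message above; decode flag is -D on older macOS):
+#   kubectl get secret -n <ns> <name> -o jsonpath='{.data.tls\.crt}' \
+#     | base64 -d | openssl x509 -noout -enddate -issuer -subject
+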
+# --- 23. GPU Health ---
+check_gpu() {
+  section 23 "GPU Health"
+  local gpu_pods
+
+  gpu_pods=$($KUBECTL get pods -n nvidia --no-headers 2>/dev/null || true)
+  if [[ -z "$gpu_pods" ]]; then
+    [[ "$QUIET" == true ]] && section_always 23 "GPU Health"
+    warn "NVIDIA namespace not found or empty"
+    json_add "gpu" "WARN" "No GPU pods found"
+    return 0
+  fi
+
+  # Check specifically for the device-plugin (critical for GPU scheduling)
+  local device_plugin_down=false
+  local other_down=false
+  local detail=""
+
+  while IFS= read -r line; do
+    local pod_name pod_status
+    pod_name=$(echo "$line" | awk '{print $1}')
+    pod_status=$(echo "$line" | awk '{print $3}')
+    if [[ "$pod_status" != "Running" && "$pod_status" != "Completed" ]]; then
+      if echo "$pod_name" | grep -q "device-plugin"; then
+        device_plugin_down=true
+        detail+="device-plugin $pod_name: $pod_status; "
+      else
+        other_down=true
+        detail+="$pod_name: $pod_status; "
+      fi
+    fi
+  done <<< "$gpu_pods"
+
+  if [[ "$device_plugin_down" == true ]]; then
+    [[ "$QUIET" == true ]] && section_always 23 "GPU Health"
+    fail "GPU device-plugin is down — GPU workloads cannot schedule"
+    json_add "gpu" "FAIL" "$detail"
+  elif [[ "$other_down" == true ]]; then
+    [[ "$QUIET" == true ]] && section_always 23 "GPU Health"
+    warn "Some GPU pods not running: $detail"
+    json_add "gpu" "WARN" "$detail"
+  else
+    local total
+    total=$(count_lines "$gpu_pods")
+    pass "All $total GPU pods running"
+    json_add "gpu" "PASS" "$total pods running"
+  fi
+}
+
+# --- 24. Cloudflare Tunnel ---
+check_cloudflare_tunnel() {
+  section 24 "Cloudflare Tunnel"
+  local cf_pods running_count total_count
+
+  cf_pods=$($KUBECTL get pods -n cloudflared --no-headers 2>/dev/null || true)
+  if [[ -z "$cf_pods" ]]; then
+    [[ "$QUIET" == true ]] && section_always 24 "Cloudflare Tunnel"
+    fail "Cloudflare tunnel namespace not found or empty — external access broken"
+    json_add "cloudflare_tunnel" "FAIL" "No pods found"
+    return 0
+  fi
+
+  total_count=$(count_lines "$cf_pods")
+  running_count=$(echo "$cf_pods" | awk '$3 == "Running"' | wc -l | tr -d ' ')
+
+  if [[ "$running_count" -eq 0 ]]; then
+    [[ "$QUIET" == true ]] && section_always 24 "Cloudflare Tunnel"
+    fail "Cloudflare tunnel: 0/$total_count pods running — external access broken"
+    json_add "cloudflare_tunnel" "FAIL" "0/$total_count running"
+  elif [[ "$running_count" -lt "$total_count" ]]; then
+    [[ "$QUIET" == true ]] && section_always 24 "Cloudflare Tunnel"
+    warn "Cloudflare tunnel: $running_count/$total_count pods running (degraded)"
+    json_add "cloudflare_tunnel" "WARN" "$running_count/$total_count running"
+  else
+    pass "Cloudflare tunnel: all $total_count pods running"
+    json_add "cloudflare_tunnel" "PASS" "$total_count pods running"
+  fi
+}
+
 # --- Summary ---
 print_summary() {
   if [[ "$JSON" == true ]]; then
@@ -731,6 +1207,16 @@ main() {
   check_ingresses
   check_alerts
   check_uptime_kuma
+  check_resourcequota
+  check_statefulsets
+  check_node_disk
+  check_helm_releases
+  check_kyverno
+  check_nfs
+  check_dns
+  check_tls_certs
+  check_gpu
+  check_cloudflare_tunnel
   print_summary
 
   # Exit code: 2 for failures, 1 for warnings, 0 for clean