[ci skip] Extend cluster healthcheck from 14 to 24 checks
Add 10 new checks covering gaps discovered during incident response: ResourceQuota pressure, StatefulSets, node disk usage, Helm release health, Kyverno policy engine, NFS connectivity, DNS resolution, TLS certificate expiry, GPU health, and Cloudflare tunnel status.
This commit is contained in:
parent
4700743560
commit
86d1d50ad0
2 changed files with 490 additions and 4 deletions
|
|
@ -179,7 +179,7 @@ kubectl get pods -A
|
|||
|
||||
**Cluster Health Check** (`scripts/cluster_healthcheck.sh`):
|
||||
- **ALWAYS use this script** to check cluster health — whether the user asks explicitly, after deploying/updating services, or whenever you need to verify cluster state. Never use ad-hoc kubectl commands to assess overall cluster health; use the script instead.
|
||||
- Runs 14 checks: nodes, resources, conditions, pods, evicted, DaemonSets, deployments, PVCs, HPAs, CronJobs, CrowdSec, ingress, Prometheus alerts, Uptime Kuma
|
||||
- Runs 24 checks: nodes, resources, conditions, pods, evicted, DaemonSets, deployments, PVCs, HPAs, CronJobs, CrowdSec, ingress, Prometheus alerts, Uptime Kuma, ResourceQuota pressure, StatefulSets, node disk, Helm releases, Kyverno, NFS, DNS, TLS certs, GPU, Cloudflare tunnel
|
||||
- **When adding new healthchecks or monitoring**: Always update this script to validate the new component
|
||||
|
||||
**Terraform target examples:**
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
# Cluster health check script.
|
||||
# Runs 14 diagnostic checks against the Kubernetes cluster and prints
|
||||
# Runs 24 diagnostic checks against the Kubernetes cluster and prints
|
||||
# a colour-coded report with PASS / WARN / FAIL for each section.
|
||||
#
|
||||
# Usage: ./scripts/cluster_healthcheck.sh [--fix] [--quiet|-q] [--json] [--kubeconfig <path>]
|
||||
|
|
@ -26,6 +26,7 @@ JSON=false
|
|||
KUBECONFIG_PATH="$(pwd)/config"
|
||||
KUBECTL=""
|
||||
JSON_RESULTS=()
|
||||
TOTAL_CHECKS=24
|
||||
|
||||
# --- Helpers ---
|
||||
info() { [[ "$JSON" == true ]] && return 0; echo -e "${BLUE}[INFO]${NC} $*"; }
|
||||
|
|
@ -38,14 +39,14 @@ section() {
|
|||
[[ "$JSON" == true ]] && return 0
|
||||
[[ "$QUIET" == true ]] && return 0
|
||||
echo ""
|
||||
echo -e "${BOLD}[$num/14] $title${NC}"
|
||||
echo -e "${BOLD}[$num/$TOTAL_CHECKS] $title${NC}"
|
||||
}
|
||||
|
||||
section_always() {
|
||||
local num="$1" title="$2"
|
||||
[[ "$JSON" == true ]] && return 0
|
||||
echo ""
|
||||
echo -e "${BOLD}[$num/14] $title${NC}"
|
||||
echo -e "${BOLD}[$num/$TOTAL_CHECKS] $title${NC}"
|
||||
}
|
||||
|
||||
json_add() {
|
||||
|
|
@ -665,6 +666,481 @@ except Exception as e:
|
|||
fi
|
||||
}
|
||||
|
||||
# --- 15. ResourceQuota Pressure ---
# Reports any ResourceQuota whose cpu/memory/storage/pods usage exceeds 80%
# of the hard limit (WARN) or 95% (FAIL). Quotas that cannot be listed are
# treated as "none configured".
check_resourcequota() {
  section 15 "ResourceQuota Pressure"
  local quotas detail="" status="PASS"

  quotas=$($KUBECTL get resourcequota -A -o json 2>/dev/null) || { pass "No ResourceQuotas configured"; json_add "resourcequota" "PASS" "No quotas"; return 0; }

  local pressure
  pressure=$(echo "$quotas" | python3 -c '
import json, sys

def parse_cpu(val):
    """Convert CPU value to millicores."""
    val = str(val)
    if val.endswith("m"):
        return float(val[:-1])
    return float(val) * 1000

def parse_mem(val):
    """Convert a memory/storage quantity to bytes.

    Handles both binary (Ki/Mi/Gi/...) and decimal (k/M/G/...) suffixes —
    all legal Kubernetes quantity forms. Previously decimal suffixes raised
    ValueError and the quota was silently skipped.
    """
    val = str(val)
    units = {
        "Ki": 1024, "Mi": 1024**2, "Gi": 1024**3, "Ti": 1024**4,
        "Pi": 1024**5, "Ei": 1024**6,
        "k": 1000, "M": 1000**2, "G": 1000**3, "T": 1000**4,
        "P": 1000**5, "E": 1000**6,
    }
    # Two-letter binary suffixes are checked before one-letter decimal ones
    # (dict preserves insertion order), so "Mi" never matches bare "M".
    for suffix, mult in units.items():
        if val.endswith(suffix):
            return float(val[:-len(suffix)]) * mult
    # Plain bytes or numeric (including exponent notation like "1e9")
    return float(val)

data = json.load(sys.stdin)
for item in data.get("items", []):
    ns = item["metadata"]["namespace"]
    name = item["metadata"]["name"]
    status = item.get("status", {})
    hard = status.get("hard", {})
    used = status.get("used", {})

    for resource, hard_val in hard.items():
        used_val = used.get(resource, "0")
        try:
            if "cpu" in resource:
                h = parse_cpu(hard_val)
                u = parse_cpu(used_val)
            elif "memory" in resource or "storage" in resource:
                h = parse_mem(hard_val)
                u = parse_mem(used_val)
            elif resource == "pods":
                h = float(hard_val)
                u = float(used_val)
            else:
                continue
            if h <= 0:
                continue
            pct = (u / h) * 100
            if pct > 80:
                level = "FAIL" if pct > 95 else "WARN"
                print(f"{level}:{ns}/{name}:{resource}:{pct:.0f}%")
        except (ValueError, ZeroDivisionError):
            pass
' 2>/dev/null) || true

  if [[ -z "$pressure" ]]; then
    pass "All ResourceQuotas below 80% usage"
    json_add "resourcequota" "PASS" "All below 80%"
  else
    # In quiet mode the section header was suppressed; print it now that
    # there is something to report.
    [[ "$QUIET" == true ]] && section_always 15 "ResourceQuota Pressure"
    while IFS= read -r line; do
      local level ns_res resource pct
      IFS=: read -r level ns_res resource pct <<< "$line"
      if [[ "$level" == "FAIL" ]]; then
        fail "$ns_res: $resource at $pct"
        status="FAIL"
      else
        warn "$ns_res: $resource at $pct"
        [[ "$status" != "FAIL" ]] && status="WARN"
      fi
      detail+="$ns_res $resource=$pct; "
    done <<< "$pressure"
    json_add "resourcequota" "$status" "$detail"
  fi
}
|
||||
|
||||
# --- 16. StatefulSets ---
# Verifies every StatefulSet has all desired replicas ready, based on the
# READY column ("current/desired") of kubectl's tabular output.
check_statefulsets() {
  section 16 "StatefulSets"
  local sts detail="" had_issue=false

  # stderr is captured on purpose: kubectl prints "No resources found" to
  # stderr while exiting 0. A non-zero exit, however, is a real listing
  # failure and must not be parsed as StatefulSet rows.
  if ! sts=$($KUBECTL get statefulsets -A --no-headers 2>&1); then
    [[ "$QUIET" == true ]] && section_always 16 "StatefulSets"
    fail "Cannot list StatefulSets"
    json_add "statefulsets" "FAIL" "kubectl error"
    return 0
  fi
  if [[ -z "$sts" || "$sts" == *"No resources found"* ]]; then
    pass "No StatefulSets in cluster"
    json_add "statefulsets" "PASS" "No StatefulSets"
    return 0
  fi

  while IFS= read -r line; do
    local ns name ready current desired
    ns=$(echo "$line" | awk '{print $1}')
    name=$(echo "$line" | awk '{print $2}')
    ready=$(echo "$line" | awk '{print $3}')
    current=${ready%/*}   # left half of "x/y"
    desired=${ready#*/}   # right half of "x/y"

    if [[ "$current" != "$desired" ]]; then
      # Emit the section header once, even in quiet mode, before the first
      # failure line.
      [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 16 "StatefulSets"
      fail "$ns/$name: $current/$desired ready"
      detail+="$ns/$name $current/$desired; "
      had_issue=true
    fi
  done <<< "$sts"

  if [[ "$had_issue" == false ]]; then
    pass "All StatefulSets fully available"
    json_add "statefulsets" "PASS" "All available"
  else
    json_add "statefulsets" "FAIL" "$detail"
  fi
}
|
||||
|
||||
# --- 17. Node Disk Usage ---
# Flags nodes whose ephemeral-storage "usage" exceeds 80% (WARN) / 90% (FAIL).
#
# NOTE(review): the percentage is computed as (capacity - allocatable) /
# capacity, which reflects the kubelet's *static* system reservation, not
# live disk consumption. For real usage the kubelet stats summary API would
# be needed — confirm whether this metric is intentional.
check_node_disk() {
  section 17 "Node Disk Usage"
  local node_json detail="" status="PASS"

  node_json=$($KUBECTL get nodes -o json 2>/dev/null) || { fail "Cannot get node info"; json_add "node_disk" "FAIL" "Cannot get nodes"; return 0; }

  local disk_info
  disk_info=$(echo "$node_json" | python3 -c '
import json, sys

def parse_storage(val):
    """Convert a Kubernetes storage quantity to bytes.

    Accepts binary (Ki/Mi/...) and decimal (k/M/...) suffixes; previously
    decimal suffixes raised ValueError and the node was silently skipped.
    """
    val = str(val)
    units = {
        "Ki": 1024, "Mi": 1024**2, "Gi": 1024**3, "Ti": 1024**4,
        "Pi": 1024**5, "Ei": 1024**6,
        "k": 1000, "M": 1000**2, "G": 1000**3, "T": 1000**4,
        "P": 1000**5, "E": 1000**6,
    }
    for suffix, mult in units.items():
        if val.endswith(suffix):
            return float(val[:-len(suffix)]) * mult
    return float(val)

data = json.load(sys.stdin)
for node in data["items"]:
    name = node["metadata"]["name"]
    cap = node["status"].get("capacity", {})
    alloc = node["status"].get("allocatable", {})
    es_cap = cap.get("ephemeral-storage", "0")
    es_alloc = alloc.get("ephemeral-storage", "0")
    try:
        c = parse_storage(es_cap)
        a = parse_storage(es_alloc)
        if c > 0:
            used_pct = ((c - a) / c) * 100
            if used_pct > 80:
                level = "FAIL" if used_pct > 90 else "WARN"
                print(f"{level}:{name}:{used_pct:.0f}")
    except (ValueError, ZeroDivisionError):
        pass
' 2>/dev/null) || true

  if [[ -z "$disk_info" ]]; then
    pass "All nodes below 80% ephemeral-storage usage"
    json_add "node_disk" "PASS" "All below 80%"
  else
    [[ "$QUIET" == true ]] && section_always 17 "Node Disk Usage"
    while IFS= read -r line; do
      local level node pct
      IFS=: read -r level node pct <<< "$line"
      if [[ "$level" == "FAIL" ]]; then
        fail "$node: ephemeral-storage at ${pct}%"
        status="FAIL"
      else
        warn "$node: ephemeral-storage at ${pct}%"
        [[ "$status" != "FAIL" ]] && status="WARN"
      fi
      detail+="$node=${pct}%; "
    done <<< "$disk_info"
    json_add "node_disk" "$status" "$detail"
  fi
}
|
||||
|
||||
# --- 18. Helm Release Health ---
# Flags Helm releases that are not in the "deployed" state. pending-* states
# are fatal because they block subsequent helm/terraform operations on the
# release; anything else non-deployed is a warning.
check_helm_releases() {
  section 18 "Helm Release Health"
  local release_json summary="" overall="PASS"

  if ! release_json=$(helm list -A --kubeconfig "$KUBECONFIG_PATH" --all -o json 2>/dev/null); then
    [[ "$QUIET" == true ]] && section_always 18 "Helm Release Health"
    warn "Cannot list Helm releases"
    json_add "helm_releases" "WARN" "Cannot list"
    return 0
  fi

  local offenders
  offenders=$(echo "$release_json" | python3 -c '
import json, sys
data = json.load(sys.stdin)
for r in data:
    name = r.get("name", "?")
    ns = r.get("namespace", "?")
    st = r.get("status", "unknown")
    if st != "deployed":
        level = "FAIL" if st.startswith("pending") else "WARN"
        print(f"{level}:{ns}/{name}:{st}")
' 2>/dev/null) || true

  if [[ -z "$offenders" ]]; then
    pass "All Helm releases in deployed state"
    json_add "helm_releases" "PASS" "All deployed"
    return 0
  fi

  [[ "$QUIET" == true ]] && section_always 18 "Helm Release Health"
  local row
  while IFS= read -r row; do
    local severity rel_name rel_state
    IFS=: read -r severity rel_name rel_state <<< "$row"
    case "$severity" in
      FAIL)
        fail "Helm release $rel_name: $rel_state (blocks terraform)"
        overall="FAIL"
        ;;
      *)
        warn "Helm release $rel_name: $rel_state"
        [[ "$overall" != "FAIL" ]] && overall="WARN"
        ;;
    esac
    summary+="$rel_name=$rel_state; "
  done <<< "$offenders"
  json_add "helm_releases" "$overall" "$summary"
}
|
||||
|
||||
# --- 19. Kyverno Policy Engine ---
# Kyverno's admission webhooks gate resource creation cluster-wide, so a
# missing or unhealthy policy engine is always a hard failure.
check_kyverno() {
  section 19 "Kyverno Policy Engine"
  local pods bad

  pods=$($KUBECTL get pods -n kyverno --no-headers 2>/dev/null || true)
  if [[ -z "$pods" ]]; then
    [[ "$QUIET" == true ]] && section_always 19 "Kyverno Policy Engine"
    fail "Kyverno namespace not found or empty — policy engine down, cascading cluster impact"
    json_add "kyverno" "FAIL" "No Kyverno pods found"
    return 0
  fi

  bad=$(echo "$pods" | awk '$3 != "Running" && $3 != "Completed" {print $1 ": " $3}' || true)
  if [[ -z "$bad" ]]; then
    local n
    n=$(count_lines "$pods")
    pass "All $n Kyverno pods running"
    json_add "kyverno" "PASS" "$n pods running"
  else
    [[ "$QUIET" == true ]] && section_always 19 "Kyverno Policy Engine"
    local entry
    while IFS= read -r entry; do
      fail "Kyverno pod not running: $entry"
    done <<< "$bad"
    json_add "kyverno" "FAIL" "$bad"
  fi
}
|
||||
|
||||
# --- 20. NFS Connectivity ---
# Primary probe: list exports via showmount (requires RPC portmapper access).
# Fallback: raw TCP probe of the NFS port 2049.
check_nfs() {
  section 20 "NFS Connectivity"
  # Default NFS server; override with the NFS_SERVER env var if it moves.
  local nfs_host="${NFS_SERVER:-10.0.10.15}"

  if showmount -e "$nfs_host" &>/dev/null; then
    pass "NFS server $nfs_host reachable (exports listed)"
    json_add "nfs" "PASS" "NFS reachable"
  elif nc -z -w 3 "$nfs_host" 2049 &>/dev/null; then
    # -w (timeout) is supported by GNU, OpenBSD and nmap netcat alike; the
    # previous -G flag exists only on macOS/BSD nc, so on other platforms
    # the probe itself errored and a healthy server was reported as FAIL.
    pass "NFS server $nfs_host port 2049 open"
    json_add "nfs" "PASS" "NFS port open"
  else
    [[ "$QUIET" == true ]] && section_always 20 "NFS Connectivity"
    fail "NFS server $nfs_host unreachable — 30+ services depend on NFS"
    json_add "nfs" "FAIL" "NFS unreachable"
  fi
}
|
||||
|
||||
# --- 21. DNS Resolution ---
# Queries the cluster DNS server for one internal and one external name and
# requires a non-empty answer. dig exits 0 even for NXDOMAIN or empty answer
# sections, so checking exit status alone (the previous behaviour) would
# report a resolver that answers-but-cannot-resolve as healthy.
check_dns() {
  section 21 "DNS Resolution"
  # Default resolver; override with the DNS_SERVER env var if it moves.
  local dns_server="${DNS_SERVER:-10.0.20.101}"
  local internal_ok=false external_ok=false detail="" answer

  if answer=$(dig @"$dns_server" viktorbarzin.me +short +time=3 +tries=1 2>/dev/null) \
      && [[ -n "$answer" ]]; then
    internal_ok=true
  fi
  if answer=$(dig @"$dns_server" google.com +short +time=3 +tries=1 2>/dev/null) \
      && [[ -n "$answer" ]]; then
    external_ok=true
  fi

  if [[ "$internal_ok" == true && "$external_ok" == true ]]; then
    pass "DNS resolves both internal (viktorbarzin.me) and external (google.com)"
    json_add "dns" "PASS" "Both resolve"
  elif [[ "$internal_ok" == true || "$external_ok" == true ]]; then
    [[ "$QUIET" == true ]] && section_always 21 "DNS Resolution"
    if [[ "$internal_ok" == false ]]; then
      warn "DNS: internal (viktorbarzin.me) failed, external (google.com) OK"
      detail="Internal failed"
    else
      warn "DNS: internal (viktorbarzin.me) OK, external (google.com) failed"
      detail="External failed"
    fi
    json_add "dns" "WARN" "$detail"
  else
    [[ "$QUIET" == true ]] && section_always 21 "DNS Resolution"
    fail "DNS server $dns_server not resolving — both internal and external failed"
    json_add "dns" "FAIL" "Both failed"
  fi
}
|
||||
|
||||
# --- 22. TLS Certificate Expiry ---
# Scans all kubernetes.io/tls secrets, deduplicates identical certificates
# by SHA-256 fingerprint (a wildcard cert copied into many namespaces is
# reported once), and flags expiry within 30 days (WARN) or 7 days (FAIL).
check_tls_certs() {
  section 22 "TLS Certificate Expiry"
  local secrets detail="" status="PASS"

  secrets=$($KUBECTL get secrets -A -o json 2>/dev/null) || {
    [[ "$QUIET" == true ]] && section_always 22 "TLS Certificate Expiry"
    warn "Cannot list secrets"
    json_add "tls_certs" "WARN" "Cannot list secrets"
    return 0
  }

  local cert_issues
  cert_issues=$(echo "$secrets" | python3 -c '
import json, sys, base64, subprocess, hashlib
from datetime import datetime, timezone

data = json.load(sys.stdin)
seen_fingerprints = set()

for item in data.get("items", []):
    if item.get("type") != "kubernetes.io/tls":
        continue
    ns = item["metadata"]["namespace"]
    name = item["metadata"]["name"]
    cert_data = item.get("data", {}).get("tls.crt", "")
    if not cert_data:
        continue

    # Deduplicate by cert fingerprint
    raw = base64.b64decode(cert_data)
    fp = hashlib.sha256(raw).hexdigest()[:16]
    if fp in seen_fingerprints:
        continue
    seen_fingerprints.add(fp)

    # Parse certificate expiry with openssl
    try:
        result = subprocess.run(
            ["openssl", "x509", "-noout", "-enddate", "-subject"],
            input=raw, capture_output=True, timeout=5
        )
        if result.returncode != 0:
            # Not a parseable certificate; skip rather than mis-report.
            # (Previously the return code was ignored and garbage certs
            # were silently treated as healthy.)
            continue
        for line in result.stdout.decode().splitlines():
            if not line.startswith("notAfter="):
                continue
            date_str = line.split("=", 1)[1].strip()
            # openssl prints "Mon DD HH:MM:SS YYYY GMT". Strip the zone
            # and pin UTC explicitly instead of relying on strptime %Z,
            # whose accepted zone names vary between platforms.
            if date_str.endswith(" GMT"):
                date_str = date_str[:-4]
            try:
                expiry = datetime.strptime(date_str, "%b %d %H:%M:%S %Y")
            except ValueError:
                continue
            expiry = expiry.replace(tzinfo=timezone.utc)
            days_left = (expiry - datetime.now(timezone.utc)).days
            if days_left <= 7:
                print(f"FAIL:{ns}/{name}:{days_left}d")
            elif days_left <= 30:
                print(f"WARN:{ns}/{name}:{days_left}d")
    except Exception:
        # Best-effort: one unreadable cert must not abort the whole scan.
        pass
' 2>/dev/null) || true

  if [[ -z "$cert_issues" ]]; then
    pass "All TLS certificates valid for >30 days"
    json_add "tls_certs" "PASS" "All valid >30d"
  else
    [[ "$QUIET" == true ]] && section_always 22 "TLS Certificate Expiry"
    while IFS= read -r line; do
      local level cert_name days
      IFS=: read -r level cert_name days <<< "$line"
      if [[ "$level" == "FAIL" ]]; then
        fail "TLS cert $cert_name expires in $days"
        status="FAIL"
      else
        warn "TLS cert $cert_name expires in $days"
        [[ "$status" != "FAIL" ]] && status="WARN"
      fi
      detail+="$cert_name=$days; "
    done <<< "$cert_issues"
    json_add "tls_certs" "$status" "$detail"
  fi
}
|
||||
|
||||
# --- 23. GPU Health ---
# The NVIDIA device plugin advertises GPUs to the scheduler; if it is down,
# GPU workloads cannot be placed, so it is treated as a hard failure while
# any other unhealthy pod in the namespace is only a warning.
check_gpu() {
  section 23 "GPU Health"
  local pods
  pods=$($KUBECTL get pods -n nvidia --no-headers 2>/dev/null || true)

  if [[ -z "$pods" ]]; then
    [[ "$QUIET" == true ]] && section_always 23 "GPU Health"
    warn "NVIDIA namespace not found or empty"
    json_add "gpu" "WARN" "No GPU pods found"
    return 0
  fi

  local plugin_bad=false others_bad=false issues=""
  local row name state
  while IFS= read -r row; do
    read -r name _ state _ <<< "$row"
    # Healthy states need no reporting.
    if [[ "$state" == "Running" || "$state" == "Completed" ]]; then
      continue
    fi
    if [[ "$name" == *device-plugin* ]]; then
      plugin_bad=true
      issues+="device-plugin $name: $state; "
    else
      others_bad=true
      issues+="$name: $state; "
    fi
  done <<< "$pods"

  if [[ "$plugin_bad" == true ]]; then
    [[ "$QUIET" == true ]] && section_always 23 "GPU Health"
    fail "GPU device-plugin is down — GPU workloads cannot schedule"
    json_add "gpu" "FAIL" "$issues"
  elif [[ "$others_bad" == true ]]; then
    [[ "$QUIET" == true ]] && section_always 23 "GPU Health"
    warn "Some GPU pods not running: $issues"
    json_add "gpu" "WARN" "$issues"
  else
    local n
    n=$(count_lines "$pods")
    pass "All $n GPU pods running"
    json_add "gpu" "PASS" "$n pods running"
  fi
}
|
||||
|
||||
# --- 24. Cloudflare Tunnel ---
# cloudflared pods provide the external ingress path: zero running replicas
# is a hard failure, a partial set is a degraded warning.
check_cloudflare_tunnel() {
  section 24 "Cloudflare Tunnel"
  local pods total running

  pods=$($KUBECTL get pods -n cloudflared --no-headers 2>/dev/null || true)
  if [[ -z "$pods" ]]; then
    [[ "$QUIET" == true ]] && section_always 24 "Cloudflare Tunnel"
    fail "Cloudflare tunnel namespace not found or empty — external access broken"
    json_add "cloudflare_tunnel" "FAIL" "No pods found"
    return 0
  fi

  total=$(count_lines "$pods")
  # Count Running pods in a single awk pass; "n+0" prints 0 when none match.
  running=$(echo "$pods" | awk '$3 == "Running" {n++} END {print n+0}')

  if (( running == 0 )); then
    [[ "$QUIET" == true ]] && section_always 24 "Cloudflare Tunnel"
    fail "Cloudflare tunnel: 0/$total pods running — external access broken"
    json_add "cloudflare_tunnel" "FAIL" "0/$total running"
  elif (( running < total )); then
    [[ "$QUIET" == true ]] && section_always 24 "Cloudflare Tunnel"
    warn "Cloudflare tunnel: $running/$total pods running (degraded)"
    json_add "cloudflare_tunnel" "WARN" "$running/$total running"
  else
    pass "Cloudflare tunnel: all $total pods running"
    json_add "cloudflare_tunnel" "PASS" "$total pods running"
  fi
}
|
||||
|
||||
# --- Summary ---
|
||||
print_summary() {
|
||||
if [[ "$JSON" == true ]]; then
|
||||
|
|
@ -731,6 +1207,16 @@ main() {
|
|||
check_ingresses
|
||||
check_alerts
|
||||
check_uptime_kuma
|
||||
check_resourcequota
|
||||
check_statefulsets
|
||||
check_node_disk
|
||||
check_helm_releases
|
||||
check_kyverno
|
||||
check_nfs
|
||||
check_dns
|
||||
check_tls_certs
|
||||
check_gpu
|
||||
check_cloudflare_tunnel
|
||||
print_summary
|
||||
|
||||
# Exit code: 2 for failures, 1 for warnings, 0 for clean
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue