infra/scripts/cluster_healthcheck.sh
Viktor Barzin 4bedabb9e8 healthcheck: fix three false-positive WARNs (HA token, cert-manager, LVM snap grep)
- HA Sofia token: auto-bootstrap from Vault secret/viktor/haos_api_token when
  HOME_ASSISTANT_SOFIA_{URL,TOKEN} env vars are unset. Default URL =
  https://ha-sofia.viktorbarzin.me.
- cert-manager: add cert_manager_installed() probe (kubectl get crd
  certificates.cert-manager.io). When not installed — which is our current
  state — report PASS "N/A" instead of noisy WARN "CRDs unavailable".
- LVM snapshot freshness: grep pattern was `-- -snap` but actual LV names use
  underscore (`foo_snap_YYYY...`), so the grep matched nothing and the check
  always WARN'd. Fixed to `grep _snap`.

After fix: PASS 36→40, WARN 9→6, FAIL 1→1 (new ha_entities FAIL is a real
HA issue, not a script bug — 400/1401 sensors stale on ha-sofia).
2026-04-19 22:13:32 +00:00

#!/usr/bin/env bash
# Cluster health check script.
# Runs 42 diagnostic checks against the Kubernetes cluster and prints
# a colour-coded report with PASS / WARN / FAIL for each section.
#
# Usage: ./scripts/cluster_healthcheck.sh [--fix] [--quiet|-q] [--json] [--kubeconfig <path>]
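#
# Examples:
#   ./scripts/cluster_healthcheck.sh                  # full colour report
#   ./scripts/cluster_healthcheck.sh -q               # only sections with WARN/FAIL
#   ./scripts/cluster_healthcheck.sh --json | jq .    # machine-readable output
#   ./scripts/cluster_healthcheck.sh --fix            # also delete evicted/failed pods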
set -euo pipefail
# --- Colors ---
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
BLUE='\033[0;34m'
BOLD='\033[1m'
NC='\033[0m'
# --- Globals ---
PASS_COUNT=0
WARN_COUNT=0
FAIL_COUNT=0
FIX=false
QUIET=false
JSON=false
KUBECONFIG_PATH="$(pwd)/config"
KUBECTL=""
JSON_RESULTS=()
TOTAL_CHECKS=42
# --- Helpers ---
info() { [[ "$JSON" == true ]] && return 0; echo -e "${BLUE}[INFO]${NC} $*"; }
pass() { PASS_COUNT=$((PASS_COUNT + 1)); [[ "$JSON" == true ]] && return 0; [[ "$QUIET" == true ]] && return 0; echo -e " ${GREEN}[PASS]${NC} $*"; }
warn() { WARN_COUNT=$((WARN_COUNT + 1)); [[ "$JSON" == true ]] && return 0; echo -e " ${YELLOW}[WARN]${NC} $*"; }
fail() { FAIL_COUNT=$((FAIL_COUNT + 1)); [[ "$JSON" == true ]] && return 0; echo -e " ${RED}[FAIL]${NC} $*"; }
section() {
local num="$1" title="$2"
[[ "$JSON" == true ]] && return 0
[[ "$QUIET" == true ]] && return 0
echo ""
echo -e "${BOLD}[$num/$TOTAL_CHECKS] $title${NC}"
}
section_always() {
local num="$1" title="$2"
[[ "$JSON" == true ]] && return 0
echo ""
echo -e "${BOLD}[$num/$TOTAL_CHECKS] $title${NC}"
}
json_add() {
local name="$1" status="$2" detail="$3"
local escaped
escaped=$(echo "$detail" | python3 -c 'import json,sys; print(json.dumps(sys.stdin.read().strip()))')
JSON_RESULTS+=("{\"check\":\"$name\",\"status\":\"$status\",\"detail\":$escaped}")
}
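# json_add appends one object per check to JSON_RESULTS, e.g.:
#   {"check":"node_status","status":"PASS","detail":"All nodes Ready"}
# The report printer (later in the script, outside this excerpt) presumably
# joins these into a single JSON array.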
# Count lines in a variable, returning 0 for empty strings
# (a bare `echo "" | wc -l` would otherwise report 1)
count_lines() {
local input="$1"
if [[ -z "$input" ]]; then
echo 0
else
echo "$input" | wc -l | tr -d ' '
fi
}
# --- Argument parsing ---
parse_args() {
while [[ $# -gt 0 ]]; do
case "$1" in
--fix) FIX=true; shift ;;
--no-fix) FIX=false; shift ;;
--quiet|-q) QUIET=true; shift ;;
--json) JSON=true; shift ;;
--kubeconfig) KUBECONFIG_PATH="$2"; shift 2 ;;
-h|--help)
echo "Usage: $0 [--fix|--no-fix] [--quiet|-q] [--json] [--kubeconfig <path>]"
echo ""
echo "Flags:"
echo " --fix Auto-remediate safe issues (delete evicted pods)"
echo " --no-fix Disable auto-remediation (default)"
echo " --quiet, -q Only show WARN and FAIL sections"
echo " --json Machine-readable JSON output"
echo " --kubeconfig PATH Override kubeconfig (default: \$(pwd)/config)"
exit 0
;;
*)
echo "Unknown option: $1" >&2
exit 1
;;
esac
done
KUBECTL="kubectl --kubeconfig $KUBECONFIG_PATH"
}
# --- 1. Node Status ---
check_nodes() {
section 1 "Node Status"
local nodes not_ready versions unique_versions detail=""
nodes=$($KUBECTL get nodes --no-headers 2>&1) || { fail "Cannot reach cluster"; json_add "node_status" "FAIL" "Cannot reach cluster"; return 0; }
not_ready=$(echo "$nodes" | awk '$2 != "Ready" {print $1}' || true)
versions=$(echo "$nodes" | awk '{print $5}' | sort -u)
unique_versions=$(echo "$versions" | wc -l | tr -d ' ')
if [[ -n "$not_ready" ]]; then
[[ "$QUIET" == true ]] && section_always 1 "Node Status"
fail "NotReady nodes: $not_ready"
detail="NotReady: $not_ready"
json_add "node_status" "FAIL" "$detail"
elif [[ "$unique_versions" -gt 1 ]]; then
[[ "$QUIET" == true ]] && section_always 1 "Node Status"
warn "Version mismatch across nodes: $(echo "$versions" | tr '\n' ' ')"
detail="Version mismatch: $(echo "$versions" | tr '\n' ' ')"
json_add "node_status" "WARN" "$detail"
else
pass "All nodes Ready, version $(echo "$versions" | head -1)"
detail="All nodes Ready"
json_add "node_status" "PASS" "$detail"
fi
}
# --- 2. Node Resources ---
check_resources() {
section 2 "Node Resources"
local top detail="" had_issue=false status="PASS"
top=$($KUBECTL top nodes --no-headers 2>&1) || { fail "metrics-server unavailable"; json_add "node_resources" "FAIL" "metrics-server unavailable"; return 0; }
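# Expected `kubectl top nodes` row shape (the awk fields used below):
#   NAME      CPU(cores)   CPU%   MEMORY(bytes)   MEMORY%
#   node-1    1250m        31%    12033Mi         75%
#   ($1)      ($2)         ($3)   ($4)            ($5)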
while IFS= read -r line; do
local node cpu_pct mem_pct
node=$(echo "$line" | awk '{print $1}')
cpu_pct=$(echo "$line" | awk '{print $3}' | tr -d '%')
mem_pct=$(echo "$line" | awk '{print $5}' | tr -d '%')
# Skip nodes where metrics are not yet available
if [[ "$cpu_pct" == *"unknown"* ]] || [[ "$mem_pct" == *"unknown"* ]]; then
detail+="$node metrics unavailable; "
continue
fi
if [[ "$cpu_pct" -gt 90 ]] || [[ "$mem_pct" -gt 90 ]]; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 2 "Node Resources"
fail "$node: CPU ${cpu_pct}%, Mem ${mem_pct}%"
detail+="$node CPU=${cpu_pct}% Mem=${mem_pct}% [FAIL]; "
had_issue=true
status="FAIL"
elif [[ "$cpu_pct" -gt 80 ]] || [[ "$mem_pct" -gt 80 ]]; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 2 "Node Resources"
warn "$node: CPU ${cpu_pct}%, Mem ${mem_pct}%"
detail+="$node CPU=${cpu_pct}% Mem=${mem_pct}% [WARN]; "
had_issue=true
[[ "$status" != "FAIL" ]] && status="WARN"
else
detail+="$node CPU=${cpu_pct}% Mem=${mem_pct}% [OK]; "
fi
done <<< "$top"
[[ "$had_issue" == false ]] && pass "All nodes below 80% CPU and memory"
json_add "node_resources" "$status" "$detail"
}
# --- 3. Node Conditions ---
check_conditions() {
section 3 "Node Conditions"
local conditions detail=""
conditions=$($KUBECTL get nodes -o json | python3 -c '
import json, sys
data = json.load(sys.stdin)
for node in data["items"]:
    name = node["metadata"]["name"]
    for c in node["status"]["conditions"]:
        if c["type"] in ("MemoryPressure", "DiskPressure", "PIDPressure") and c["status"] == "True":
            print(name + ": " + c["type"])
' 2>&1) || true
if [[ -n "$conditions" ]]; then
[[ "$QUIET" == true ]] && section_always 3 "Node Conditions"
while IFS= read -r line; do
fail "$line"
done <<< "$conditions"
detail="$conditions"
json_add "node_conditions" "FAIL" "$detail"
else
pass "No pressure conditions on any node"
json_add "node_conditions" "PASS" "No pressure conditions"
fi
}
# --- 4. Problematic Pods ---
check_pods() {
section 4 "Problematic Pods"
local bad count detail="" status="PASS"
# Two passes: the field selector catches pods stuck in a non-Running phase,
# while the plain listing catches CrashLoopBackOff/ImagePull pods whose phase
# is still Running; awk then dedupes on namespace+name.
bad=$( {
$KUBECTL get pods -A --no-headers --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null \
| grep -E 'CrashLoopBackOff|Error|Pending|Init:|ImagePullBackOff|ErrImagePull' || true
$KUBECTL get pods -A --no-headers 2>/dev/null \
| grep -E 'CrashLoopBackOff|ImagePullBackOff|ErrImagePull' || true
} | awk '!seen[$1,$2]++' | sed '/^$/d') || true
count=$(count_lines "$bad")
if [[ "$count" -eq 0 ]]; then
pass "No problematic pods"
detail="None"
elif [[ "$count" -le 10 ]]; then
[[ "$QUIET" == true ]] && section_always 4 "Problematic Pods"
warn "$count problematic pod(s):"
[[ "$JSON" != true ]] && echo "$bad" | while IFS= read -r line; do echo " $line"; done
detail="$count pods"
status="WARN"
else
[[ "$QUIET" == true ]] && section_always 4 "Problematic Pods"
fail "$count problematic pods (showing first 10):"
[[ "$JSON" != true ]] && echo "$bad" | head -10 | while IFS= read -r line; do echo " $line"; done
detail="$count pods"
status="FAIL"
fi
json_add "problematic_pods" "$status" "$detail"
}
# --- 5. Evicted/Failed Pods ---
check_evicted() {
section 5 "Evicted/Failed Pods"
local evicted count detail="" status="PASS"
evicted=$($KUBECTL get pods -A --no-headers --field-selector=status.phase=Failed 2>/dev/null || true)
count=$(count_lines "$evicted")
if [[ "$count" -eq 0 ]]; then
pass "No evicted or failed pods"
detail="0"
elif [[ "$count" -le 50 ]]; then
[[ "$QUIET" == true ]] && section_always 5 "Evicted/Failed Pods"
warn "$count evicted/failed pod(s)"
detail="$count pods"
status="WARN"
else
[[ "$QUIET" == true ]] && section_always 5 "Evicted/Failed Pods"
fail "$count evicted/failed pods"
detail="$count pods"
status="FAIL"
fi
if [[ "$FIX" == true && "$count" -gt 0 ]]; then
info "Deleting $count evicted/failed pods..."
$KUBECTL delete pods -A --field-selector=status.phase=Failed 2>/dev/null || true
info "Deleted evicted/failed pods"
fi
json_add "evicted_pods" "$status" "$detail"
}
# --- 6. DaemonSets ---
check_daemonsets() {
section 6 "DaemonSets"
local ds detail="" had_issue=false
ds=$($KUBECTL get daemonsets -A --no-headers 2>&1) || { fail "Cannot list DaemonSets"; json_add "daemonsets" "FAIL" "Cannot list"; return 0; }
while IFS= read -r line; do
local ns name desired ready
ns=$(echo "$line" | awk '{print $1}')
name=$(echo "$line" | awk '{print $2}')
desired=$(echo "$line" | awk '{print $3}')
ready=$(echo "$line" | awk '{print $5}')
if [[ "$desired" != "$ready" ]]; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 6 "DaemonSets"
fail "$ns/$name: desired=$desired ready=$ready"
detail+="$ns/$name desired=$desired ready=$ready; "
had_issue=true
fi
done <<< "$ds"
if [[ "$had_issue" == false ]]; then
pass "All DaemonSets healthy (desired == ready)"
json_add "daemonsets" "PASS" "All healthy"
else
json_add "daemonsets" "FAIL" "$detail"
fi
}
# --- 7. Deployments ---
check_deployments() {
section 7 "Deployments"
local deps detail="" had_issue=false
deps=$($KUBECTL get deployments -A --no-headers 2>&1) || { fail "Cannot list Deployments"; json_add "deployments" "FAIL" "Cannot list"; return 0; }
while IFS= read -r line; do
local ns name ready current desired
ns=$(echo "$line" | awk '{print $1}')
name=$(echo "$line" | awk '{print $2}')
ready=$(echo "$line" | awk '{print $3}')
current=$(echo "$ready" | cut -d/ -f1)
desired=$(echo "$ready" | cut -d/ -f2)
if [[ "$current" != "$desired" ]]; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 7 "Deployments"
fail "$ns/$name: $current/$desired ready"
detail+="$ns/$name $current/$desired; "
had_issue=true
fi
done <<< "$deps"
if [[ "$had_issue" == false ]]; then
pass "All deployments fully available"
json_add "deployments" "PASS" "All available"
else
json_add "deployments" "FAIL" "$detail"
fi
}
# --- 8. PVC Status ---
check_pvcs() {
section 8 "PVC Status"
local pvcs detail="" had_issue=false
pvcs=$($KUBECTL get pvc -A --no-headers 2>&1) || true
if [[ -z "$pvcs" || "$pvcs" == *"No resources found"* ]]; then
pass "No PVCs in cluster"
json_add "pvcs" "PASS" "No PVCs"
return 0
fi
while IFS= read -r line; do
local ns name status
ns=$(echo "$line" | awk '{print $1}')
name=$(echo "$line" | awk '{print $2}')
status=$(echo "$line" | awk '{print $3}')
if [[ "$status" != "Bound" ]]; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 8 "PVC Status"
fail "$ns/$name: $status"
detail+="$ns/$name=$status; "
had_issue=true
fi
done <<< "$pvcs"
if [[ "$had_issue" == false ]]; then
pass "All PVCs Bound"
json_add "pvcs" "PASS" "All Bound"
else
json_add "pvcs" "FAIL" "$detail"
fi
}
# --- 9. HPA Health ---
check_hpa() {
section 9 "HPA Health"
local hpas detail="" had_issue=false status="PASS"
hpas=$($KUBECTL get hpa -A --no-headers 2>&1) || true
if [[ -z "$hpas" || "$hpas" == *"No resources found"* ]]; then
pass "No HPAs configured"
json_add "hpa" "PASS" "No HPAs"
return 0
fi
while IFS= read -r line; do
local ns name targets
ns=$(echo "$line" | awk '{print $1}')
name=$(echo "$line" | awk '{print $2}')
# TARGETS is column 4 onward with -A (newer kubectl prints "cpu: 45%/80%",
# which splits across fields), so collect the remainder of the row.
targets=$(echo "$line" | awk '{for (i = 4; i <= NF; i++) printf "%s ", $i; print ""}')
if echo "$targets" | grep -q '<unknown>'; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 9 "HPA Health"
fail "$ns/$name: targets=$targets (unknown metrics)"
detail+="$ns/$name=unknown; "
had_issue=true
status="FAIL"
else
# Parse percentage values from targets like "45%/80%, 30%/50%"
local pcts
pcts=$(echo "$targets" | grep -oE '[0-9]+%/' | tr -d '%/' || true)
if [[ -n "$pcts" ]]; then
while IFS= read -r pct; do
[[ -z "$pct" ]] && continue
if [[ "$pct" -gt 150 ]]; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 9 "HPA Health"
fail "$ns/$name: utilization at ${pct}%"
detail+="$ns/$name=${pct}%; "
had_issue=true
status="FAIL"
break
elif [[ "$pct" -gt 100 ]]; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 9 "HPA Health"
warn "$ns/$name: utilization at ${pct}%"
detail+="$ns/$name=${pct}%; "
had_issue=true
[[ "$status" != "FAIL" ]] && status="WARN"
break
fi
done <<< "$pcts"
fi
fi
done <<< "$hpas"
[[ "$had_issue" == false ]] && pass "All HPAs healthy"
json_add "hpa" "$status" "${detail:-All healthy}"
}
# --- 10. CronJob Failures ---
check_cronjobs() {
section 10 "CronJob Failures"
local failures detail=""
failures=$($KUBECTL get jobs -A -o json 2>/dev/null | python3 -c '
import json, sys
from datetime import datetime, timezone, timedelta
data = json.load(sys.stdin)
cutoff = datetime.now(timezone.utc) - timedelta(hours=24)
for job in data.get("items", []):
    meta = job.get("metadata", {})
    ns = meta.get("namespace", "")
    name = meta.get("name", "")
    owners = meta.get("ownerReferences", [])
    is_cronjob = any(o.get("kind") == "CronJob" for o in owners)
    if not is_cronjob:
        continue
    conditions = job.get("status", {}).get("conditions", [])
    for c in conditions:
        if c.get("type") == "Failed" and c.get("status") == "True":
            # Bind reason first: escaped quotes inside an f-string expression
            # are a SyntaxError before Python 3.12.
            reason = c.get("reason", "Unknown")
            ts = c.get("lastTransitionTime", "")
            if ts:
                try:
                    t = datetime.fromisoformat(ts.replace("Z", "+00:00"))
                    if t > cutoff:
                        print(f"{ns}/{name}: {reason}")
                except ValueError:
                    print(f"{ns}/{name}: {reason}")
' 2>/dev/null) || true
if [[ -z "$failures" ]]; then
pass "No CronJob failures in last 24h"
json_add "cronjob_failures" "PASS" "None"
else
[[ "$QUIET" == true ]] && section_always 10 "CronJob Failures"
local count
count=$(count_lines "$failures")
fail "$count CronJob failure(s) in last 24h:"
[[ "$JSON" != true ]] && echo "$failures" | while IFS= read -r line; do echo " $line"; done
json_add "cronjob_failures" "FAIL" "$count failures"
fi
}
# --- 11. CrowdSec ---
check_crowdsec() {
section 11 "CrowdSec Agents"
local cs_pods not_running
cs_pods=$($KUBECTL get pods -n crowdsec --no-headers 2>/dev/null || true)
if [[ -z "$cs_pods" ]]; then
[[ "$QUIET" == true ]] && section_always 11 "CrowdSec Agents"
warn "CrowdSec namespace not found or empty"
json_add "crowdsec" "WARN" "No CrowdSec pods found"
return 0
fi
not_running=$(echo "$cs_pods" | awk '$3 != "Running" && $3 != "Completed" {print $1 ": " $3}' || true)
if [[ -n "$not_running" ]]; then
[[ "$QUIET" == true ]] && section_always 11 "CrowdSec Agents"
while IFS= read -r line; do
fail "CrowdSec pod not running: $line"
done <<< "$not_running"
json_add "crowdsec" "FAIL" "$not_running"
else
local total
total=$(count_lines "$cs_pods")
pass "All $total CrowdSec pods running"
json_add "crowdsec" "PASS" "$total pods running"
fi
}
# --- 12. Ingress ---
check_ingresses() {
section 12 "Ingress Routes"
local ingresses no_lb detail="" had_issue=false
ingresses=$($KUBECTL get ingress -A --no-headers 2>/dev/null || true)
if [[ -n "$ingresses" ]]; then
# kubectl leaves ADDRESS blank when no LB IP is assigned, and awk collapses
# the empty column; so test that $5 looks like an IPv4 address (MetalLB
# assigns plain IPs here) instead of comparing against "".
no_lb=$(echo "$ingresses" | awk '$5 !~ /^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+/ {print $1"/"$2}' || true)
if [[ -n "$no_lb" ]]; then
[[ "$QUIET" == true ]] && section_always 12 "Ingress Routes"
while IFS= read -r line; do
fail "Ingress missing LB IP: $line"
done <<< "$no_lb"
detail="Missing LB: $no_lb"
had_issue=true
fi
fi
# Check Traefik LB service
local traefik_svc_ip
traefik_svc_ip=$($KUBECTL get svc -n traefik traefik -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || true)
if [[ -z "$traefik_svc_ip" ]]; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 12 "Ingress Routes"
fail "Traefik LoadBalancer has no external IP"
detail+="Traefik LB missing IP; "
had_issue=true
else
detail+="Traefik LB=$traefik_svc_ip; "
fi
if [[ "$had_issue" == false ]]; then
pass "All ingresses have LB assignment (Traefik LB=$traefik_svc_ip)"
json_add "ingresses" "PASS" "$detail"
else
json_add "ingresses" "FAIL" "$detail"
fi
}
# --- 13. Prometheus Alerts ---
check_alerts() {
section 13 "Prometheus Alerts"
local alerts firing_count
# Try alertmanager first, then prometheus server
alerts=$($KUBECTL exec -n monitoring deploy/prometheus-alertmanager -- \
wget -q -O- http://localhost:9093/api/v2/alerts 2>/dev/null || true)
if [[ -z "$alerts" ]]; then
alerts=$($KUBECTL exec -n monitoring deploy/prometheus-server -- \
wget -q -O- http://localhost:9090/api/v1/alerts 2>/dev/null || true)
fi
if [[ -z "$alerts" ]]; then
[[ "$QUIET" == true ]] && section_always 13 "Prometheus Alerts"
warn "Could not query Prometheus/Alertmanager"
json_add "prometheus_alerts" "WARN" "Cannot query"
return 0
fi
firing_count=$(echo "$alerts" | python3 -c '
import json, sys
try:
    data = json.load(sys.stdin)
    if isinstance(data, list):
        active = [a for a in data if a.get("status", {}).get("state") == "active"]
        count = len(active)
        names = [a.get("labels", {}).get("alertname", "?") for a in active]
        print((f"{count}:" + ",".join(names)) if count > 0 else "0:")
    elif isinstance(data, dict) and "data" in data:
        alerts_list = data["data"].get("alerts", [])
        firing = [a for a in alerts_list if a.get("state") == "firing"]
        count = len(firing)
        names = [a.get("labels", {}).get("alertname", "?") for a in firing]
        print((f"{count}:" + ",".join(names)) if count > 0 else "0:")
    else:
        print("0:")
except Exception:
    print("-1:")
' 2>/dev/null || echo "-1:")
local count names
count=$(echo "$firing_count" | cut -d: -f1)
names=$(echo "$firing_count" | cut -d: -f2-)
if [[ "$count" == "-1" ]]; then
[[ "$QUIET" == true ]] && section_always 13 "Prometheus Alerts"
warn "Failed to parse alert data"
json_add "prometheus_alerts" "WARN" "Parse error"
elif [[ "$count" -eq 0 ]]; then
pass "No firing alerts"
json_add "prometheus_alerts" "PASS" "0 firing"
elif [[ "$count" -le 3 ]]; then
[[ "$QUIET" == true ]] && section_always 13 "Prometheus Alerts"
warn "$count firing alert(s): $names"
json_add "prometheus_alerts" "WARN" "$count firing: $names"
else
[[ "$QUIET" == true ]] && section_always 13 "Prometheus Alerts"
fail "$count firing alerts: $names"
json_add "prometheus_alerts" "FAIL" "$count firing: $names"
fi
}
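# Note: the two endpoints above return different shapes, hence the branching
# parser: Alertmanager /api/v2/alerts yields a bare JSON list of alerts, while
# Prometheus /api/v1/alerts wraps them as {"data":{"alerts":[...]}}.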
# --- 14. Uptime Kuma ---
check_uptime_kuma() {
section 14 "Uptime Kuma Monitors"
local result
# Get password from Vault (or env var fallback)
local uk_pass="${UPTIME_KUMA_PASSWORD:-}"
if [[ -z "$uk_pass" ]]; then
uk_pass=$(vault kv get -field=uptime_kuma_admin_password secret/viktor 2>/dev/null) || true
fi
if [[ -z "$uk_pass" ]]; then
warn "Uptime Kuma: password not available (set UPTIME_KUMA_PASSWORD or vault login)"
json_add "uptime_kuma" "WARN" "password not available"
return 0
fi
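# Assumes the uptime-kuma-api package is available in the venv used below,
# e.g. installed with: ~/.venvs/claude/bin/pip install uptime-kuma-api
# (the ImportError branch degrades this check to a WARN otherwise).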
result=$(UPTIME_KUMA_PASSWORD="$uk_pass" ~/.venvs/claude/bin/python3 -c '
import sys, os
try:
    from uptime_kuma_api import UptimeKumaApi
except ImportError:
    print("ERROR:uptime-kuma-api not installed")
    sys.exit(0)
try:
    api = UptimeKumaApi("https://uptime.viktorbarzin.me", timeout=120, wait_events=0.2)
    api.login("admin", os.environ["UPTIME_KUMA_PASSWORD"])
    monitors = api.get_monitors()
    heartbeats = api.get_heartbeats()
    # Separate internal and external monitors
    internal_up = 0
    internal_down = []
    external_up = 0
    external_down = []
    paused_count = 0
    for m in monitors:
        mid = m.get("id")
        name = m.get("name", "unknown")
        active = m.get("active", True)
        is_external = name.startswith("[External] ")
        if not active:
            paused_count += 1
            continue
        beats = heartbeats.get(mid, [])
        if beats:
            last_beat = beats[-1]
            if isinstance(last_beat, list):
                last_beat = last_beat[-1] if last_beat else {}
            status = last_beat.get("status", 0) if isinstance(last_beat, dict) else 0
            if hasattr(status, "value"):
                status = status.value
            is_up = (status == 1)
        else:
            is_up = False
        if is_external:
            if is_up:
                external_up += 1
            else:
                external_down.append(name.replace("[External] ", ""))
        else:
            if is_up:
                internal_up += 1
            else:
                internal_down.append(name)
    api.disconnect()
    int_down_names = ", ".join(internal_down) if internal_down else ""
    ext_down_names = ", ".join(external_down) if external_down else ""
    # Format: int_down:int_up:ext_down:ext_up:paused:int_down_names|ext_down_names
    print(f"{len(internal_down)}:{internal_up}:{len(external_down)}:{external_up}:{paused_count}:{int_down_names}|{ext_down_names}")
except Exception as e:
    print(f"CONN_ERROR:{e}")
' 2>/dev/null) || result="CONN_ERROR:python execution failed"
if [[ "$result" == "ERROR:"* ]]; then
[[ "$QUIET" == true ]] && section_always 14 "Uptime Kuma Monitors"
warn "Uptime Kuma: ${result#ERROR:}"
json_add "uptime_kuma" "WARN" "${result#ERROR:}"
elif [[ "$result" == "CONN_ERROR:"* ]]; then
[[ "$QUIET" == true ]] && section_always 14 "Uptime Kuma Monitors"
warn "Cannot connect to Uptime Kuma: ${result#CONN_ERROR:}"
json_add "uptime_kuma" "WARN" "Connection failed"
else
local int_down int_up ext_down ext_up paused_count down_details
int_down=$(echo "$result" | cut -d: -f1)
int_up=$(echo "$result" | cut -d: -f2)
ext_down=$(echo "$result" | cut -d: -f3)
ext_up=$(echo "$result" | cut -d: -f4)
paused_count=$(echo "$result" | cut -d: -f5)
down_details=$(echo "$result" | cut -d: -f6-)
local int_down_names="${down_details%%|*}"
local ext_down_names="${down_details#*|}"
local total_down=$((int_down + ext_down))
local total_up=$((int_up + ext_up))
local total_active=$((total_up + total_down))
if [[ "$total_down" -eq 0 ]]; then
pass "All monitors up — internal: ${int_up}, external: ${ext_up} ($paused_count paused)"
json_add "uptime_kuma" "PASS" "internal: $int_up up, external: $ext_up up, $paused_count paused"
else
[[ "$QUIET" == true ]] && section_always 14 "Uptime Kuma Monitors"
local details=""
[[ "$int_down" -gt 0 ]] && details="internal down($int_down): $int_down_names"
[[ "$ext_down" -gt 0 ]] && { [[ -n "$details" ]] && details="$details; "; details="${details}external down($ext_down): $ext_down_names"; }
if [[ "$total_down" -le 3 ]]; then
warn "$total_down/$total_active down: $details"
json_add "uptime_kuma" "WARN" "$details"
else
fail "$total_down/$total_active down: $details"
json_add "uptime_kuma" "FAIL" "$details"
fi
fi
fi
}
# --- 15. ResourceQuota Pressure ---
check_resourcequota() {
section 15 "ResourceQuota Pressure"
local quotas detail="" had_issue=false status="PASS"
quotas=$($KUBECTL get resourcequota -A -o json 2>/dev/null) || { pass "No ResourceQuotas configured"; json_add "resourcequota" "PASS" "No quotas"; return 0; }
local pressure
pressure=$(echo "$quotas" | python3 -c '
import json, sys
def parse_cpu(val):
    """Convert a CPU quantity to millicores."""
    val = str(val)
    if val.endswith("m"):
        return float(val[:-1])
    return float(val) * 1000
def parse_mem(val):
    """Convert a memory quantity to bytes."""
    val = str(val)
    units = {"Ki": 1024, "Mi": 1024**2, "Gi": 1024**3, "Ti": 1024**4}
    for suffix, mult in units.items():
        if val.endswith(suffix):
            return float(val[:-len(suffix)]) * mult
    # Plain bytes or bare numeric string
    return float(val)
data = json.load(sys.stdin)
for item in data.get("items", []):
    ns = item["metadata"]["namespace"]
    name = item["metadata"]["name"]
    status = item.get("status", {})
    hard = status.get("hard", {})
    used = status.get("used", {})
    for resource, hard_val in hard.items():
        used_val = used.get(resource, "0")
        try:
            if "cpu" in resource:
                h = parse_cpu(hard_val)
                u = parse_cpu(used_val)
            elif "memory" in resource or "storage" in resource:
                h = parse_mem(hard_val)
                u = parse_mem(used_val)
            elif resource == "pods":
                h = float(hard_val)
                u = float(used_val)
            else:
                continue
            if h <= 0:
                continue
            pct = (u / h) * 100
            if pct > 80:
                level = "FAIL" if pct > 95 else "WARN"
                print(f"{level}:{ns}/{name}:{resource}:{pct:.0f}%")
        except (ValueError, ZeroDivisionError):
            pass
' 2>/dev/null) || true
if [[ -z "$pressure" ]]; then
pass "All ResourceQuotas below 80% usage"
json_add "resourcequota" "PASS" "All below 80%"
else
[[ "$QUIET" == true ]] && section_always 15 "ResourceQuota Pressure"
while IFS= read -r line; do
local level ns_res resource pct
level=$(echo "$line" | cut -d: -f1)
ns_res=$(echo "$line" | cut -d: -f2)
resource=$(echo "$line" | cut -d: -f3)
pct=$(echo "$line" | cut -d: -f4)
if [[ "$level" == "FAIL" ]]; then
fail "$ns_res: $resource at $pct"
status="FAIL"
else
warn "$ns_res: $resource at $pct"
[[ "$status" != "FAIL" ]] && status="WARN"
fi
detail+="$ns_res $resource=$pct; "
had_issue=true
done <<< "$pressure"
json_add "resourcequota" "$status" "$detail"
fi
}
# --- 16. StatefulSets ---
check_statefulsets() {
section 16 "StatefulSets"
local sts detail="" had_issue=false
sts=$($KUBECTL get statefulsets -A --no-headers 2>&1) || true
if [[ -z "$sts" || "$sts" == *"No resources found"* ]]; then
pass "No StatefulSets in cluster"
json_add "statefulsets" "PASS" "No StatefulSets"
return 0
fi
while IFS= read -r line; do
local ns name ready current desired
ns=$(echo "$line" | awk '{print $1}')
name=$(echo "$line" | awk '{print $2}')
ready=$(echo "$line" | awk '{print $3}')
current=$(echo "$ready" | cut -d/ -f1)
desired=$(echo "$ready" | cut -d/ -f2)
if [[ "$current" != "$desired" ]]; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 16 "StatefulSets"
fail "$ns/$name: $current/$desired ready"
detail+="$ns/$name $current/$desired; "
had_issue=true
fi
done <<< "$sts"
if [[ "$had_issue" == false ]]; then
pass "All StatefulSets fully available"
json_add "statefulsets" "PASS" "All available"
else
json_add "statefulsets" "FAIL" "$detail"
fi
}
# --- 17. Node Disk Usage ---
check_node_disk() {
section 17 "Node Disk Usage"
local node_json detail="" had_issue=false status="PASS"
node_json=$($KUBECTL get nodes -o json 2>/dev/null) || { fail "Cannot get node info"; json_add "node_disk" "FAIL" "Cannot get nodes"; return 0; }
local disk_info
disk_info=$(echo "$node_json" | python3 -c '
import json, sys
def parse_storage(val):
    """Convert a storage quantity to bytes."""
    val = str(val)
    units = {"Ki": 1024, "Mi": 1024**2, "Gi": 1024**3, "Ti": 1024**4}
    for suffix, mult in units.items():
        if val.endswith(suffix):
            return float(val[:-len(suffix)]) * mult
    return float(val)
data = json.load(sys.stdin)
for node in data["items"]:
    name = node["metadata"]["name"]
    cap = node["status"].get("capacity", {})
    alloc = node["status"].get("allocatable", {})
    es_cap = cap.get("ephemeral-storage", "0")
    es_alloc = alloc.get("ephemeral-storage", "0")
    try:
        c = parse_storage(es_cap)
        a = parse_storage(es_alloc)
        if c > 0:
            used_pct = ((c - a) / c) * 100
            if used_pct > 80:
                level = "FAIL" if used_pct > 90 else "WARN"
                print(f"{level}:{name}:{used_pct:.0f}")
    except (ValueError, ZeroDivisionError):
        pass
' 2>/dev/null) || true
if [[ -z "$disk_info" ]]; then
pass "All nodes below 80% ephemeral-storage usage"
json_add "node_disk" "PASS" "All below 80%"
else
[[ "$QUIET" == true ]] && section_always 17 "Node Disk Usage"
while IFS= read -r line; do
local level node pct
level=$(echo "$line" | cut -d: -f1)
node=$(echo "$line" | cut -d: -f2)
pct=$(echo "$line" | cut -d: -f3)
if [[ "$level" == "FAIL" ]]; then
fail "$node: ephemeral-storage at ${pct}%"
status="FAIL"
else
warn "$node: ephemeral-storage at ${pct}%"
[[ "$status" != "FAIL" ]] && status="WARN"
fi
detail+="$node=${pct}%; "
had_issue=true
done <<< "$disk_info"
json_add "node_disk" "$status" "$detail"
fi
}
# --- 18. Helm Release Health ---
check_helm_releases() {
section 18 "Helm Release Health"
local releases detail="" had_issue=false status="PASS"
releases=$(helm list -A --kubeconfig "$KUBECONFIG_PATH" -o json 2>/dev/null) || {
[[ "$QUIET" == true ]] && section_always 18 "Helm Release Health"
warn "Cannot list Helm releases"
json_add "helm_releases" "WARN" "Cannot list"
return 0
}
local bad_releases
bad_releases=$(echo "$releases" | python3 -c '
import json, sys
data = json.load(sys.stdin)
for r in data:
    name = r.get("name", "?")
    ns = r.get("namespace", "?")
    st = r.get("status", "unknown")
    if st != "deployed":
        level = "FAIL" if st.startswith("pending") else "WARN"
        print(f"{level}:{ns}/{name}:{st}")
' 2>/dev/null) || true
if [[ -z "$bad_releases" ]]; then
pass "All Helm releases in deployed state"
json_add "helm_releases" "PASS" "All deployed"
else
[[ "$QUIET" == true ]] && section_always 18 "Helm Release Health"
while IFS= read -r line; do
local level release_name release_status
level=$(echo "$line" | cut -d: -f1)
release_name=$(echo "$line" | cut -d: -f2)
release_status=$(echo "$line" | cut -d: -f3)
if [[ "$level" == "FAIL" ]]; then
fail "Helm release $release_name: $release_status (blocks terraform)"
status="FAIL"
else
warn "Helm release $release_name: $release_status"
[[ "$status" != "FAIL" ]] && status="WARN"
fi
detail+="$release_name=$release_status; "
had_issue=true
done <<< "$bad_releases"
json_add "helm_releases" "$status" "$detail"
fi
}
# --- 19. Kyverno Policy Engine ---
check_kyverno() {
section 19 "Kyverno Policy Engine"
local kv_pods not_running
kv_pods=$($KUBECTL get pods -n kyverno --no-headers 2>/dev/null || true)
if [[ -z "$kv_pods" ]]; then
[[ "$QUIET" == true ]] && section_always 19 "Kyverno Policy Engine"
fail "Kyverno namespace not found or empty — policy engine down, cascading cluster impact"
json_add "kyverno" "FAIL" "No Kyverno pods found"
return 0
fi
not_running=$(echo "$kv_pods" | awk '$3 != "Running" && $3 != "Completed" {print $1 ": " $3}' || true)
if [[ -n "$not_running" ]]; then
[[ "$QUIET" == true ]] && section_always 19 "Kyverno Policy Engine"
while IFS= read -r line; do
fail "Kyverno pod not running: $line"
done <<< "$not_running"
json_add "kyverno" "FAIL" "$not_running"
else
local total
total=$(count_lines "$kv_pods")
pass "All $total Kyverno pods running"
json_add "kyverno" "PASS" "$total pods running"
fi
}
# --- 20. NFS Connectivity ---
check_nfs() {
section 20 "NFS Connectivity"
if showmount -e 192.168.1.127 &>/dev/null; then
pass "NFS server 192.168.1.127 (Proxmox) reachable (exports listed)"
json_add "nfs" "PASS" "NFS reachable"
# -w 3: connect timeout in seconds (portable; the -G flag is BSD-only)
elif nc -z -w 3 192.168.1.127 2049 &>/dev/null; then
pass "NFS server 192.168.1.127 port 2049 open"
json_add "nfs" "PASS" "NFS port open"
else
[[ "$QUIET" == true ]] && section_always 20 "NFS Connectivity"
fail "NFS server 192.168.1.127 (Proxmox) unreachable — 30+ services depend on NFS"
json_add "nfs" "FAIL" "NFS unreachable"
fi
}
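# Manual equivalents: `showmount -e 192.168.1.127` lists the exports, while
# `nc -z -w 3 192.168.1.127 2049` merely confirms the nfsd port answers.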
# --- 21. DNS Resolution ---
check_dns() {
section 21 "DNS Resolution"
local internal_ok=false external_ok=false detail=""
# Test DNS from inside the cluster via kubectl exec (MetalLB IPs may not be
# reachable from outside the L2 network)
local dns_pod
dns_pod=$($KUBECTL get pods -n technitium -l app=technitium -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)
if [[ -n "$dns_pod" ]]; then
if $KUBECTL exec -n technitium "$dns_pod" -- nslookup viktorbarzin.me 127.0.0.1 &>/dev/null; then
internal_ok=true
fi
if $KUBECTL exec -n technitium "$dns_pod" -- nslookup google.com 127.0.0.1 &>/dev/null; then
external_ok=true
fi
fi
if [[ "$internal_ok" == true && "$external_ok" == true ]]; then
pass "DNS resolves both internal (viktorbarzin.me) and external (google.com)"
json_add "dns" "PASS" "Both resolve"
elif [[ "$internal_ok" == true || "$external_ok" == true ]]; then
[[ "$QUIET" == true ]] && section_always 21 "DNS Resolution"
if [[ "$internal_ok" == false ]]; then
warn "DNS: internal (viktorbarzin.me) failed, external (google.com) OK"
detail="Internal failed"
else
warn "DNS: internal (viktorbarzin.me) OK, external (google.com) failed"
detail="External failed"
fi
json_add "dns" "WARN" "$detail"
else
[[ "$QUIET" == true ]] && section_always 21 "DNS Resolution"
fail "DNS server (Technitium) not resolving — both internal and external failed"
json_add "dns" "FAIL" "Both failed"
fi
}
# --- 22. TLS Certificate Expiry ---
check_tls_certs() {
section 22 "TLS Certificate Expiry"
local secrets detail="" had_issue=false status="PASS"
secrets=$($KUBECTL get secrets -A -o json 2>/dev/null) || {
[[ "$QUIET" == true ]] && section_always 22 "TLS Certificate Expiry"
warn "Cannot list secrets"
json_add "tls_certs" "WARN" "Cannot list secrets"
return 0
}
local cert_issues
cert_issues=$(echo "$secrets" | python3 -c '
import json, sys, base64, subprocess, hashlib
from datetime import datetime, timezone
data = json.load(sys.stdin)
seen_fingerprints = set()
for item in data.get("items", []):
    if item.get("type") != "kubernetes.io/tls":
        continue
    ns = item["metadata"]["namespace"]
    name = item["metadata"]["name"]
    cert_data = item.get("data", {}).get("tls.crt", "")
    if not cert_data:
        continue
    # Deduplicate by cert fingerprint
    raw = base64.b64decode(cert_data)
    fp = hashlib.sha256(raw).hexdigest()[:16]
    if fp in seen_fingerprints:
        continue
    seen_fingerprints.add(fp)
    # Parse certificate expiry with openssl
    try:
        result = subprocess.run(
            ["openssl", "x509", "-noout", "-enddate", "-subject"],
            input=raw, capture_output=True, timeout=5
        )
        output = result.stdout.decode()
        for line in output.splitlines():
            if line.startswith("notAfter="):
                date_str = line.split("=", 1)[1]
                # Parse openssl date format: "Mon DD HH:MM:SS YYYY GMT"
                try:
                    expiry = datetime.strptime(date_str.strip(), "%b %d %H:%M:%S %Y %Z")
                    expiry = expiry.replace(tzinfo=timezone.utc)
                    days_left = (expiry - datetime.now(timezone.utc)).days
                    if days_left <= 7:
                        print(f"FAIL:{ns}/{name}:{days_left}d")
                    elif days_left <= 30:
                        print(f"WARN:{ns}/{name}:{days_left}d")
                except ValueError:
                    pass
    except Exception:
        pass
' 2>/dev/null) || true
if [[ -z "$cert_issues" ]]; then
pass "All TLS certificates valid for >30 days"
json_add "tls_certs" "PASS" "All valid >30d"
else
[[ "$QUIET" == true ]] && section_always 22 "TLS Certificate Expiry"
while IFS= read -r line; do
local level cert_name days
level=$(echo "$line" | cut -d: -f1)
cert_name=$(echo "$line" | cut -d: -f2)
days=$(echo "$line" | cut -d: -f3)
if [[ "$level" == "FAIL" ]]; then
fail "TLS cert $cert_name expires in $days"
status="FAIL"
else
warn "TLS cert $cert_name expires in $days"
[[ "$status" != "FAIL" ]] && status="WARN"
fi
detail+="$cert_name=$days; "
had_issue=true
done <<< "$cert_issues"
json_add "tls_certs" "$status" "$detail"
fi
}
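# To inspect a single cert by hand, the rough one-liner equivalent is:
#   kubectl get secret -n <ns> <name> -o jsonpath='{.data.tls\.crt}' \
#     | base64 -d | openssl x509 -noout -enddate -subject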
# --- 23. GPU Health ---
check_gpu() {
section 23 "GPU Health"
local gpu_pods not_running
gpu_pods=$($KUBECTL get pods -n nvidia --no-headers 2>/dev/null || true)
if [[ -z "$gpu_pods" ]]; then
[[ "$QUIET" == true ]] && section_always 23 "GPU Health"
warn "NVIDIA namespace not found or empty"
json_add "gpu" "WARN" "No GPU pods found"
return 0
fi
# Check specifically for device-plugin (critical for GPU scheduling)
local device_plugin_down=false
local other_down=false
local detail=""
while IFS= read -r line; do
local pod_name pod_status
pod_name=$(echo "$line" | awk '{print $1}')
pod_status=$(echo "$line" | awk '{print $3}')
if [[ "$pod_status" != "Running" && "$pod_status" != "Completed" ]]; then
if echo "$pod_name" | grep -q "device-plugin"; then
device_plugin_down=true
detail+="device-plugin $pod_name: $pod_status; "
else
other_down=true
detail+="$pod_name: $pod_status; "
fi
fi
done <<< "$gpu_pods"
if [[ "$device_plugin_down" == true ]]; then
[[ "$QUIET" == true ]] && section_always 23 "GPU Health"
fail "GPU device-plugin is down — GPU workloads cannot schedule"
json_add "gpu" "FAIL" "$detail"
elif [[ "$other_down" == true ]]; then
[[ "$QUIET" == true ]] && section_always 23 "GPU Health"
warn "Some GPU pods not running: $detail"
json_add "gpu" "WARN" "$detail"
else
local total
total=$(count_lines "$gpu_pods")
pass "All $total GPU pods running"
json_add "gpu" "PASS" "$total pods running"
fi
}
# --- 24. Cloudflare Tunnel ---
check_cloudflare_tunnel() {
section 24 "Cloudflare Tunnel"
local cf_pods running_count total_count
cf_pods=$($KUBECTL get pods -n cloudflared --no-headers 2>/dev/null || true)
if [[ -z "$cf_pods" ]]; then
[[ "$QUIET" == true ]] && section_always 24 "Cloudflare Tunnel"
fail "Cloudflare tunnel namespace not found or empty — external access broken"
json_add "cloudflare_tunnel" "FAIL" "No pods found"
return 0
fi
total_count=$(count_lines "$cf_pods")
running_count=$(echo "$cf_pods" | awk '$3 == "Running"' | wc -l | tr -d ' ')
if [[ "$running_count" -eq 0 ]]; then
[[ "$QUIET" == true ]] && section_always 24 "Cloudflare Tunnel"
fail "Cloudflare tunnel: 0/$total_count pods running — external access broken"
json_add "cloudflare_tunnel" "FAIL" "0/$total_count running"
elif [[ "$running_count" -lt "$total_count" ]]; then
[[ "$QUIET" == true ]] && section_always 24 "Cloudflare Tunnel"
warn "Cloudflare tunnel: $running_count/$total_count pods running (degraded)"
json_add "cloudflare_tunnel" "WARN" "$running_count/$total_count running"
else
pass "Cloudflare tunnel: all $total_count pods running"
json_add "cloudflare_tunnel" "PASS" "$total_count pods running"
fi
}
# --- 25. Resource Usage ---
check_overcommit() {
section 25 "Resource Usage"
local detail="" had_issue=false status="PASS"
local usage
usage=$($KUBECTL top nodes --no-headers 2>/dev/null) || { fail "Cannot get node metrics"; json_add "overcommit" "FAIL" "No metrics"; return 0; }
if [[ -z "$usage" ]]; then
fail "metrics-server returned no data"
json_add "overcommit" "FAIL" "No data"
return 0
fi
while IFS= read -r line; do
local name cpu_pct mem_pct cpu_cores mem_bytes level node_detail
name=$(echo "$line" | awk '{print $1}')
cpu_pct=$(echo "$line" | awk '{print $3}' | tr -d '%')
mem_pct=$(echo "$line" | awk '{print $5}' | tr -d '%')
cpu_cores=$(echo "$line" | awk '{print $2}')
mem_bytes=$(echo "$line" | awk '{print $4}')
if [[ "$cpu_pct" -gt 90 || "$mem_pct" -gt 90 ]]; then
level="FAIL"
elif [[ "$cpu_pct" -gt 80 || "$mem_pct" -gt 80 ]]; then
level="WARN"
else
level="OK"
fi
node_detail="${name}: cpu ${cpu_cores} (${cpu_pct}%), mem ${mem_bytes} (${mem_pct}%)"
if [[ "$level" == "FAIL" ]]; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 25 "Resource Usage"
fail "$node_detail"
had_issue=true
status="FAIL"
elif [[ "$level" == "WARN" ]]; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 25 "Resource Usage"
warn "$node_detail"
had_issue=true
[[ "$status" != "FAIL" ]] && status="WARN"
else
pass "$node_detail"
fi
detail+="$node_detail; "
done <<< "$usage"
json_add "overcommit" "$status" "$detail"
}
# --- HA helpers ---
HA_CACHE_DIR=""
ha_sofia_available() {
if [[ -z "${HOME_ASSISTANT_SOFIA_URL:-}" ]]; then
export HOME_ASSISTANT_SOFIA_URL="https://ha-sofia.viktorbarzin.me"
fi
if [[ -z "${HOME_ASSISTANT_SOFIA_TOKEN:-}" ]]; then
# vault can authenticate via $VAULT_TOKEN or a cached ~/.vault-token
if command -v vault >/dev/null 2>&1; then
local t
t=$(vault kv get -field=haos_api_token secret/viktor 2>/dev/null || true)
[[ -n "$t" ]] && export HOME_ASSISTANT_SOFIA_TOKEN="$t"
fi
fi
[[ -n "${HOME_ASSISTANT_SOFIA_TOKEN:-}" ]] || return 1
return 0
}
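# To skip the Vault lookup entirely (e.g. in CI), pre-seed the environment:
#   export HOME_ASSISTANT_SOFIA_URL="https://ha-sofia.viktorbarzin.me"
#   export HOME_ASSISTANT_SOFIA_TOKEN="<long-lived HA access token>"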
# Fetch all HA data once and cache in temp files
ha_sofia_fetch_cache() {
if [[ -n "$HA_CACHE_DIR" ]]; then
return 0
fi
HA_CACHE_DIR=$(mktemp -d)
export HA_CACHE_DIR
trap 'rm -rf "$HA_CACHE_DIR"' EXIT
python3 << 'HA_FETCH_EOF'
import os, json, requests
url = os.environ["HOME_ASSISTANT_SOFIA_URL"]
token = os.environ["HOME_ASSISTANT_SOFIA_TOKEN"]
cache = os.environ["HA_CACHE_DIR"]
headers = {"Authorization": f"Bearer {token}"}
errors = []
# Fetch states (used by checks 26, 28, 29)
try:
    resp = requests.get(f"{url}/api/states", headers=headers, timeout=30)
    resp.raise_for_status()
    with open(f"{cache}/states.json", "w") as f:
        json.dump(resp.json(), f)
except Exception as e:
    errors.append(f"states:{e}")
# Fetch config entries (used by check 27)
try:
    resp = requests.get(f"{url}/api/config/config_entries/entry", headers=headers, timeout=30)
    resp.raise_for_status()
    with open(f"{cache}/entries.json", "w") as f:
        json.dump(resp.json(), f)
except Exception as e:
    errors.append(f"entries:{e}")
# Fetch config (used by check 29)
try:
    resp = requests.get(f"{url}/api/config", headers=headers, timeout=10)
    resp.raise_for_status()
    with open(f"{cache}/config.json", "w") as f:
        json.dump(resp.json(), f)
except Exception as e:
    errors.append(f"config:{e}")
if errors:
    with open(f"{cache}/errors.txt", "w") as f:
        f.write("\n".join(errors))
HA_FETCH_EOF
}
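# Cache layout under $HA_CACHE_DIR after a successful fetch:
#   states.json    /api/states                         (checks 26, 28, 29)
#   entries.json   /api/config/config_entries/entry    (check 27)
#   config.json    /api/config                         (check 29)
#   errors.txt     one "<endpoint>:<error>" line per failed fetch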
# --- 26. HA Entity Availability ---
check_ha_entities() {
section 26 "HA Sofia — Entity Availability"
if ! ha_sofia_available; then
warn "HA Sofia token not configured — skipping"
json_add "ha_entities" "WARN" "Token not configured"
return 0
fi
ha_sofia_fetch_cache
if [[ ! -f "$HA_CACHE_DIR/states.json" ]]; then
local err=""
[[ -f "$HA_CACHE_DIR/errors.txt" ]] && err=$(grep "^states:" "$HA_CACHE_DIR/errors.txt" | head -1)
[[ "$QUIET" == true ]] && section_always 26 "HA Sofia — Entity Availability"
warn "HA Sofia API unreachable: ${err:-unknown error}"
json_add "ha_entities" "WARN" "API unreachable"
return 0
fi
local result
result=$(export HA_CACHE_DIR; python3 << 'PYEOF'
import os, json
cache = os.environ["HA_CACHE_DIR"]
with open(f"{cache}/states.json") as f:
    states = json.load(f)
unavail = [s for s in states if s.get("state") in ("unavailable", "unknown")]
domains = {}
for s in unavail:
    d = s["entity_id"].split(".")[0]
    domains[d] = domains.get(d, 0) + 1
total = len(states)
count = len(unavail)
summary = ", ".join(f"{d}:{n}" for d, n in sorted(domains.items(), key=lambda x: -x[1]))
entity_list = "\n".join("ENTITY:" + s["entity_id"] for s in unavail)
print(f"{count}:{total}:{summary}")
if entity_list:
    print(entity_list)
PYEOF
) || result="ERROR:python execution failed"
if [[ "$result" == "ERROR:"* ]]; then
[[ "$QUIET" == true ]] && section_always 26 "HA Sofia — Entity Availability"
warn "HA Sofia: ${result#ERROR:}"
json_add "ha_entities" "WARN" "${result#ERROR:}"
return 0
fi
local first_line count total summary
first_line=$(echo "$result" | head -1)
count=$(echo "$first_line" | cut -d: -f1)
total=$(echo "$first_line" | cut -d: -f2)
summary=$(echo "$first_line" | cut -d: -f3-)
if [[ "$count" -eq 0 ]]; then
pass "All $total HA entities available"
json_add "ha_entities" "PASS" "0/$total unavailable"
elif [[ "$count" -le 10 ]]; then
[[ "$QUIET" == true ]] && section_always 26 "HA Sofia — Entity Availability"
warn "$count/$total entities unavailable ($summary)"
if [[ "$JSON" != true && "$QUIET" != true ]]; then
echo "$result" | grep "^ENTITY:" | sed 's/^ENTITY:/ /'
fi
json_add "ha_entities" "WARN" "$count/$total: $summary"
else
[[ "$QUIET" == true ]] && section_always 26 "HA Sofia — Entity Availability"
fail "$count/$total entities unavailable ($summary)"
if [[ "$JSON" != true && "$QUIET" != true ]]; then
echo "$result" | grep "^ENTITY:" | head -20 | sed 's/^ENTITY:/ /'
local entity_count
entity_count=$(echo "$result" | grep -c "^ENTITY:" || true)
if [[ "$entity_count" -gt 20 ]]; then
echo " ... and $((entity_count - 20)) more"
fi
fi
json_add "ha_entities" "FAIL" "$count/$total: $summary"
fi
}
# --- 27. HA Integration Health ---
check_ha_integrations() {
section 27 "HA Sofia — Integration Health"
if ! ha_sofia_available; then
warn "HA Sofia token not configured — skipping"
json_add "ha_integrations" "WARN" "Token not configured"
return 0
fi
ha_sofia_fetch_cache
if [[ ! -f "$HA_CACHE_DIR/entries.json" ]]; then
[[ "$QUIET" == true ]] && section_always 27 "HA Sofia — Integration Health"
warn "HA Sofia config entries API unavailable"
json_add "ha_integrations" "WARN" "API unavailable"
return 0
fi
local result
result=$(export HA_CACHE_DIR; python3 << 'PYEOF'
import os, json
cache = os.environ["HA_CACHE_DIR"]
with open(f"{cache}/entries.json") as f:
    entries = json.load(f)
total = len(entries)
not_loaded = []
setup_error = []
for e in entries:
    state = e.get("state", "loaded")
    domain = e.get("domain", "?")
    title = e.get("title", "?")
    if state in ("setup_error", "setup_retry"):
        setup_error.append(f"{domain} ({title})")
    elif state == "not_loaded":
        not_loaded.append(f"{domain} ({title})")
error_count = len(setup_error)
unloaded_count = len(not_loaded)
error_names = "; ".join(setup_error) if setup_error else ""
unloaded_names = "; ".join(not_loaded) if not_loaded else ""
print(f"{total}:{error_count}:{unloaded_count}:{error_names}:{unloaded_names}")
PYEOF
) || result="ERROR:python execution failed"
if [[ "$result" == "ERROR:"* ]]; then
[[ "$QUIET" == true ]] && section_always 27 "HA Sofia — Integration Health"
warn "HA Sofia: ${result#ERROR:}"
json_add "ha_integrations" "WARN" "${result#ERROR:}"
return 0
fi
local total error_count unloaded_count error_names unloaded_names
total=$(echo "$result" | cut -d: -f1)
error_count=$(echo "$result" | cut -d: -f2)
unloaded_count=$(echo "$result" | cut -d: -f3)
error_names=$(echo "$result" | cut -d: -f4)
unloaded_names=$(echo "$result" | cut -d: -f5-)
if [[ "$error_count" -gt 0 ]]; then
[[ "$QUIET" == true ]] && section_always 27 "HA Sofia — Integration Health"
fail "$error_count integration(s) in error state: $error_names"
json_add "ha_integrations" "FAIL" "$error_count errors: $error_names"
elif [[ "$unloaded_count" -gt 0 ]]; then
[[ "$QUIET" == true ]] && section_always 27 "HA Sofia — Integration Health"
warn "$unloaded_count integration(s) not loaded: $unloaded_names"
json_add "ha_integrations" "WARN" "$unloaded_count not loaded: $unloaded_names"
else
pass "All $total integrations loaded"
json_add "ha_integrations" "PASS" "All $total loaded"
fi
}
# --- 28. HA Automation Status ---
check_ha_automations() {
section 28 "HA Sofia — Automation Status"
if ! ha_sofia_available; then
warn "HA Sofia token not configured — skipping"
json_add "ha_automations" "WARN" "Token not configured"
return 0
fi
ha_sofia_fetch_cache
if [[ ! -f "$HA_CACHE_DIR/states.json" ]]; then
[[ "$QUIET" == true ]] && section_always 28 "HA Sofia — Automation Status"
warn "HA Sofia states API unavailable"
json_add "ha_automations" "WARN" "API unavailable"
return 0
fi
local result
result=$(export HA_CACHE_DIR; python3 << 'PYEOF'
import os, json
from datetime import datetime, timezone
cache = os.environ["HA_CACHE_DIR"]
with open(f"{cache}/states.json") as f:
    states = json.load(f)
autos = [s for s in states if s["entity_id"].startswith("automation.")]
total = len(autos)
disabled = [a["entity_id"] for a in autos if a["state"] == "off"]
disabled_count = len(disabled)
now = datetime.now(timezone.utc)
stale = []
for a in autos:
    if a["state"] == "off":
        continue
    lt = a.get("attributes", {}).get("last_triggered")
    if lt:
        try:
            t = datetime.fromisoformat(lt.replace("Z", "+00:00"))
            days = (now - t).days
            if days > 30:
                stale.append(a["entity_id"] + "=" + str(days) + "d")
        except (ValueError, TypeError):
            pass
stale_count = len(stale)
disabled_names = "; ".join(disabled)
stale_names = "; ".join(stale[:10])
print(f"{total}:{disabled_count}:{stale_count}:{disabled_names}:{stale_names}")
PYEOF
) || result="ERROR:python execution failed"
if [[ "$result" == "ERROR:"* ]]; then
[[ "$QUIET" == true ]] && section_always 28 "HA Sofia — Automation Status"
warn "HA Sofia: ${result#ERROR:}"
json_add "ha_automations" "WARN" "${result#ERROR:}"
return 0
fi
local total disabled_count stale_count disabled_names stale_names
total=$(echo "$result" | cut -d: -f1)
disabled_count=$(echo "$result" | cut -d: -f2)
stale_count=$(echo "$result" | cut -d: -f3)
disabled_names=$(echo "$result" | cut -d: -f4)
stale_names=$(echo "$result" | cut -d: -f5-)
local status="PASS" detail=""
if [[ "$disabled_count" -gt 0 ]]; then
[[ "$QUIET" == true ]] && section_always 28 "HA Sofia — Automation Status"
warn "$disabled_count/$total automation(s) disabled"
if [[ "$JSON" != true && "$QUIET" != true && -n "$disabled_names" ]]; then
echo "$disabled_names" | tr ';' '\n' | sed 's/^ */ /'
fi
status="WARN"
detail+="$disabled_count disabled; "
fi
if [[ "$stale_count" -gt 0 ]]; then
[[ "$status" == "PASS" && "$QUIET" == true ]] && section_always 28 "HA Sofia — Automation Status"
warn "$stale_count automation(s) not triggered in 30+ days"
if [[ "$JSON" != true && "$QUIET" != true && -n "$stale_names" ]]; then
echo "$stale_names" | tr ';' '\n' | sed 's/^ */ /'
fi
[[ "$status" == "PASS" ]] && status="WARN"
detail+="$stale_count stale; "
fi
if [[ "$status" == "PASS" ]]; then
pass "All $total automations enabled and recently active"
json_add "ha_automations" "PASS" "All $total active"
else
json_add "ha_automations" "$status" "$detail"
fi
}
# --- 29. HA System Resources ---
check_ha_system() {
section 29 "HA Sofia — System Resources"
if ! ha_sofia_available; then
warn "HA Sofia token not configured — skipping"
json_add "ha_system" "WARN" "Token not configured"
return 0
fi
ha_sofia_fetch_cache
if [[ ! -f "$HA_CACHE_DIR/states.json" ]] || [[ ! -f "$HA_CACHE_DIR/config.json" ]]; then
[[ "$QUIET" == true ]] && section_always 29 "HA Sofia — System Resources"
warn "HA Sofia API unavailable for system check"
json_add "ha_system" "WARN" "API unavailable"
return 0
fi
local result
result=$(export HA_CACHE_DIR; python3 << 'PYEOF'
import os, json
cache = os.environ["HA_CACHE_DIR"]
with open(f"{cache}/states.json") as f:
    states = json.load(f)
with open(f"{cache}/config.json") as f:
    config = json.load(f)
version = config.get("version", "unknown")
entity_map = {s["entity_id"]: s for s in states}
cpu_patterns = ["sensor.processor_use", "sensor.system_monitor_processor_use"]
mem_patterns = ["sensor.memory_use_percent", "sensor.system_monitor_memory_use_percent"]
disk_patterns = ["sensor.disk_use_percent", "sensor.disk_use_percent_", "sensor.system_monitor_disk_use_percent"]
def find_entity(patterns):
    # Exact entity-id match first, then a fuzzy substring match on percent sensors
    for p in patterns:
        if p in entity_map:
            try:
                return float(entity_map[p]["state"])
            except (ValueError, TypeError):
                pass
    for eid, s in entity_map.items():
        for p in patterns:
            if p.rstrip("_") in eid and "percent" in eid:
                try:
                    return float(s["state"])
                except (ValueError, TypeError):
                    pass
    return None
cpu = find_entity(cpu_patterns)
mem = find_entity(mem_patterns)
disk = find_entity(disk_patterns)
parts = ["version=" + version]
if cpu is not None:
    parts.append("cpu=" + str(int(cpu)))
if mem is not None:
    parts.append("mem=" + str(int(mem)))
if disk is not None:
    parts.append("disk=" + str(int(disk)))
level = "PASS"
for val in [cpu, mem, disk]:
    if val is not None:
        if val > 90:
            level = "FAIL"
            break
        elif val > 80:
            level = "WARN"
print(level + ":" + ":".join(parts))
PYEOF
) || result="ERROR:python execution failed"
if [[ "$result" == "ERROR:"* ]]; then
[[ "$QUIET" == true ]] && section_always 29 "HA Sofia — System Resources"
warn "HA Sofia: ${result#ERROR:}"
json_add "ha_system" "WARN" "${result#ERROR:}"
return 0
fi
local level detail
level=$(echo "$result" | cut -d: -f1)
detail=$(echo "$result" | cut -d: -f2-)
if [[ "$level" == "FAIL" ]]; then
[[ "$QUIET" == true ]] && section_always 29 "HA Sofia — System Resources"
fail "HA Sofia resources critical: $detail"
json_add "ha_system" "FAIL" "$detail"
elif [[ "$level" == "WARN" ]]; then
[[ "$QUIET" == true ]] && section_always 29 "HA Sofia — System Resources"
warn "HA Sofia resources elevated: $detail"
json_add "ha_system" "WARN" "$detail"
else
pass "HA Sofia healthy ($detail)"
json_add "ha_system" "PASS" "$detail"
fi
}
# --- 30. Hardware Exporters ---
check_hardware_exporters() {
section 30 "Hardware Exporters"
local detail="" had_issue=false status="PASS"
# Check exporter pods are Running
local exporters=(
"monitoring:snmp-exporter"
"monitoring:idrac-redfish-exporter"
"monitoring:proxmox-exporter"
"tuya-bridge:tuya-bridge"
)
for entry in "${exporters[@]}"; do
local ns="${entry%%:*}"
local name="${entry##*:}"
local pods
pods=$($KUBECTL get pods -n "$ns" -l "app=$name" --no-headers 2>/dev/null || true)
# If label selector returns nothing, try matching by deployment name prefix
if [[ -z "$pods" ]]; then
pods=$($KUBECTL get pods -n "$ns" --no-headers 2>/dev/null | grep "^${name}-" || true)
fi
if [[ -z "$pods" ]]; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 30 "Hardware Exporters"
fail "$ns/$name: no pods found"
detail+="$ns/$name=missing; "
had_issue=true
status="FAIL"
continue
fi
local not_running
not_running=$(echo "$pods" | awk '$3 != "Running" && $3 != "Completed" {print $1 ": " $3}' || true)
if [[ -n "$not_running" ]]; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 30 "Hardware Exporters"
fail "$ns/$name pod not running: $not_running"
detail+="$ns/$name=not-running; "
had_issue=true
status="FAIL"
fi
done
# Check Prometheus scrape targets for hardware exporters
local prom_jobs=("snmp-idrac" "snmp-ups" "redfish-idrac" "proxmox-host")
local up_result
up_result=$($KUBECTL exec -n monitoring deploy/prometheus-server -- \
wget -q -O- 'http://localhost:9090/api/v1/query?query=up' 2>/dev/null || true)
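# Each result element of the `up` query looks roughly like:
#   {"metric":{"job":"snmp-ups","instance":"..."},"value":[<ts>,"1"]}
# where a value of "1" means the target's last scrape succeeded.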
if [[ -n "$up_result" ]]; then
for job in "${prom_jobs[@]}"; do
local job_up
job_up=$(echo "$up_result" | python3 -c "
import json, sys
data = json.load(sys.stdin)
for r in data.get('data', {}).get('result', []):
    if r.get('metric', {}).get('job') == '$job':
        print(r.get('value', [0, '0'])[1])
        break
else:
    print('missing')
" 2>/dev/null) || job_up="error"
if [[ "$job_up" == "1" ]]; then
detail+="$job=up; "
elif [[ "$job_up" == "missing" ]]; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 30 "Hardware Exporters"
warn "Prometheus target '$job' not found"
detail+="$job=missing; "
had_issue=true
[[ "$status" != "FAIL" ]] && status="WARN"
else
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 30 "Hardware Exporters"
fail "Prometheus target '$job' is down (up=$job_up)"
detail+="$job=down; "
had_issue=true
status="FAIL"
fi
done
else
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 30 "Hardware Exporters"
warn "Cannot query Prometheus for exporter targets"
detail+="prometheus-query-failed; "
had_issue=true
[[ "$status" != "FAIL" ]] && status="WARN"
fi
if [[ "$had_issue" == false ]]; then
pass "All hardware exporters running and scraped by Prometheus"
fi
json_add "hardware_exporters" "$status" "${detail:-All healthy}"
}
# Returns 0 if cert-manager CRDs are installed, 1 otherwise.
cert_manager_installed() {
$KUBECTL get crd certificates.cert-manager.io -o name >/dev/null 2>&1
}
# --- 31. cert-manager: Certificate Readiness ---
check_cert_manager_certificates() {
section 31 "cert-manager — Certificate Readiness"
local certs not_ready detail="" status="PASS"
if ! cert_manager_installed; then
pass "cert-manager not installed — N/A"
json_add "certmanager_certificates" "PASS" "N/A (cert-manager not installed)"
return 0
fi
certs=$($KUBECTL get certificates.cert-manager.io -A -o json 2>/dev/null) || {
warn "cert-manager CRDs installed but API query failed"
json_add "certmanager_certificates" "WARN" "API query failed"
return 0
}
not_ready=$(echo "$certs" | python3 -c '
import json, sys
data = json.load(sys.stdin)
for item in data.get("items", []):
    ns = item["metadata"]["namespace"]
    name = item["metadata"]["name"]
    conds = item.get("status", {}).get("conditions", [])
    ready = next((c for c in conds if c.get("type") == "Ready"), None)
    if not ready or ready.get("status") != "True":
        reason = ready.get("reason", "NoCondition") if ready else "NoCondition"
        print(f"{ns}/{name}:{reason}")
' 2>/dev/null) || true
if [[ -z "$not_ready" ]]; then
pass "All Certificate CRs Ready"
json_add "certmanager_certificates" "PASS" "All Ready"
else
[[ "$QUIET" == true ]] && section_always 31 "cert-manager — Certificate Readiness"
local count
count=$(count_lines "$not_ready")
while IFS= read -r line; do
fail "Certificate not Ready: $line"
detail+="$line; "
done <<< "$not_ready"
status="FAIL"
json_add "certmanager_certificates" "$status" "$count not Ready: $detail"
fi
}
# --- 32. cert-manager: Certificate Expiry (<14d) ---
check_cert_manager_expiry() {
section 32 "cert-manager — Certificate Expiry (<14d)"
local certs expiring detail="" status="PASS"
if ! cert_manager_installed; then
pass "cert-manager not installed — N/A"
json_add "certmanager_expiry" "PASS" "N/A (cert-manager not installed)"
return 0
fi
certs=$($KUBECTL get certificates.cert-manager.io -A -o json 2>/dev/null) || {
warn "cert-manager CRDs installed but API query failed"
json_add "certmanager_expiry" "WARN" "API query failed"
return 0
}
expiring=$(echo "$certs" | python3 -c '
import json, sys
from datetime import datetime, timezone, timedelta
data = json.load(sys.stdin)
cutoff = datetime.now(timezone.utc) + timedelta(days=14)
for item in data.get("items", []):
    ns = item["metadata"]["namespace"]
    name = item["metadata"]["name"]
    not_after = item.get("status", {}).get("notAfter")
    if not not_after:
        continue
    try:
        expiry = datetime.fromisoformat(not_after.replace("Z", "+00:00"))
        if expiry < cutoff:
            days = (expiry - datetime.now(timezone.utc)).days
            level = "FAIL" if days <= 3 else "WARN"
            print(f"{level}:{ns}/{name}:{days}")
    except ValueError:
        pass
' 2>/dev/null) || true
if [[ -z "$expiring" ]]; then
pass "No Certificate CRs expiring within 14 days"
json_add "certmanager_expiry" "PASS" "None expiring <14d"
else
[[ "$QUIET" == true ]] && section_always 32 "cert-manager — Certificate Expiry (<14d)"
while IFS= read -r line; do
local level cert_name days
level=$(echo "$line" | cut -d: -f1)
cert_name=$(echo "$line" | cut -d: -f2)
days=$(echo "$line" | cut -d: -f3)
if [[ "$level" == "FAIL" ]]; then
fail "Certificate $cert_name expires in ${days}d"
status="FAIL"
else
warn "Certificate $cert_name expires in ${days}d"
[[ "$status" != "FAIL" ]] && status="WARN"
fi
detail+="$cert_name=${days}d; "
done <<< "$expiring"
json_add "certmanager_expiry" "$status" "$detail"
fi
}
# --- 33. cert-manager: Failed CertificateRequests ---
check_cert_manager_requests() {
section 33 "cert-manager — Failed CertificateRequests"
local requests failed detail="" status="PASS"
if ! cert_manager_installed; then
pass "cert-manager not installed — N/A"
json_add "certmanager_requests" "PASS" "N/A (cert-manager not installed)"
return 0
fi
requests=$($KUBECTL get certificaterequests.cert-manager.io -A -o json 2>/dev/null) || {
warn "cert-manager CRDs installed but API query failed"
json_add "certmanager_requests" "WARN" "API query failed"
return 0
}
failed=$(echo "$requests" | python3 -c '
import json, sys
data = json.load(sys.stdin)
for item in data.get("items", []):
ns = item["metadata"]["namespace"]
name = item["metadata"]["name"]
conds = item.get("status", {}).get("conditions", [])
for c in conds:
if c.get("type") == "Ready" and c.get("status") == "False" and c.get("reason") == "Failed":
            # extract the message first: escaped quotes inside an f-string
            # expression are a SyntaxError on Python < 3.12
            msg = c.get("message", "")[:80]
            print(f"{ns}/{name}:{msg}")
break
' 2>/dev/null) || true
if [[ -z "$failed" ]]; then
pass "No failed CertificateRequests"
json_add "certmanager_requests" "PASS" "None failed"
else
[[ "$QUIET" == true ]] && section_always 33 "cert-manager — Failed CertificateRequests"
local count
count=$(count_lines "$failed")
while IFS= read -r line; do
fail "CertificateRequest failed: $line"
detail+="$line; "
done <<< "$failed"
status="FAIL"
json_add "certmanager_requests" "$status" "$count failed: $detail"
fi
}
# --- 34. Backup Freshness: Per-DB Dumps ---
check_backup_per_db() {
section 34 "Backup Freshness — Per-DB Dumps"
local detail="" had_issue=false status="PASS"
# Freshness threshold: 25 hours
local now_epoch max_age_sec
now_epoch=$(date -u +%s)
max_age_sec=$((25 * 3600))
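  # Helper shared by both CronJobs below. Bash locals use dynamic scoping, so the
  # helper can read and mutate the caller's had_issue/detail/status directly.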
_check_cronjob_fresh() {
local ns="$1" cj="$2" label="$3"
local ts age_sec
ts=$($KUBECTL get cronjob -n "$ns" "$cj" -o jsonpath='{.status.lastSuccessfulTime}' 2>/dev/null || true)
if [[ -z "$ts" ]]; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 34 "Backup Freshness — Per-DB Dumps"
fail "$label: CronJob $ns/$cj has no lastSuccessfulTime"
detail+="${label}=no-success; "
had_issue=true
status="FAIL"
return 0
fi
local ts_epoch
    # GNU date first, then BSD date (-j -f) so macOS runs do not yield a bogus epoch-0 FAIL
    ts_epoch=$(date -u -d "$ts" +%s 2>/dev/null \
      || date -u -j -f "%Y-%m-%dT%H:%M:%SZ" "$ts" +%s 2>/dev/null \
      || echo 0)
age_sec=$((now_epoch - ts_epoch))
if [[ "$age_sec" -gt "$max_age_sec" ]]; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 34 "Backup Freshness — Per-DB Dumps"
local age_h=$((age_sec / 3600))
fail "$label: last success ${age_h}h ago (>25h)"
detail+="${label}=${age_h}h; "
had_issue=true
status="FAIL"
else
local age_h=$((age_sec / 3600))
detail+="${label}=${age_h}h; "
fi
}
_check_cronjob_fresh dbaas mysql-backup-per-db mysql
_check_cronjob_fresh dbaas postgresql-backup-per-db pg
[[ "$had_issue" == false ]] && pass "Per-DB dumps fresh — $detail"
json_add "backup_per_db" "$status" "$detail"
}
# --- 35. Backup Freshness: Offsite Sync ---
check_backup_offsite_sync() {
section 35 "Backup Freshness — Offsite Sync"
local metrics detail="" status="PASS"
metrics=$($KUBECTL exec -n monitoring deploy/prometheus-server -- \
wget -qO- "http://prometheus-prometheus-pushgateway:9091/metrics" 2>/dev/null || true)
if [[ -z "$metrics" ]]; then
[[ "$QUIET" == true ]] && section_always 35 "Backup Freshness — Offsite Sync"
warn "Cannot query Pushgateway"
json_add "backup_offsite_sync" "WARN" "Pushgateway unreachable"
return 0
fi
local age_hours
age_hours=$(echo "$metrics" | python3 -c '
import sys, re, time
ts = None
for line in sys.stdin:
if line.startswith("#"):
continue
if "backup_last_success_timestamp" in line and "offsite-backup-sync" in line:
m = re.search(r"\s([0-9.eE+]+)\s*$", line.strip())
if m:
try:
ts = float(m.group(1))
break
except ValueError:
pass
if ts is None:
print("missing")
else:
age = (time.time() - ts) / 3600
print(f"{age:.1f}")
' 2>/dev/null) || age_hours="error"
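  # A matching Pushgateway sample typically looks like (exact label set is an assumption):
  #   backup_last_success_timestamp{job="offsite-backup-sync"} 1.745e+09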
if [[ "$age_hours" == "missing" ]]; then
[[ "$QUIET" == true ]] && section_always 35 "Backup Freshness — Offsite Sync"
fail "backup_last_success_timestamp metric missing for offsite-backup-sync"
json_add "backup_offsite_sync" "FAIL" "Metric missing"
elif [[ "$age_hours" == "error" ]]; then
[[ "$QUIET" == true ]] && section_always 35 "Backup Freshness — Offsite Sync"
warn "Failed to parse Pushgateway metric"
json_add "backup_offsite_sync" "WARN" "Parse error"
else
local age_int
age_int=$(printf '%.0f' "$age_hours")
if [[ "$age_int" -gt 27 ]]; then
[[ "$QUIET" == true ]] && section_always 35 "Backup Freshness — Offsite Sync"
fail "Offsite sync last success ${age_hours}h ago (>27h)"
status="FAIL"
else
pass "Offsite sync last success ${age_hours}h ago"
fi
detail="age=${age_hours}h"
json_add "backup_offsite_sync" "$status" "$detail"
fi
}
# --- 36. Backup Freshness: LVM PVC Snapshots ---
check_backup_lvm_snapshots() {
section 36 "Backup Freshness — LVM PVC Snapshots"
local snap_output detail="" status="PASS"
snap_output=$(ssh -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no \
root@192.168.1.127 "lvs -o lv_name,lv_time --noheadings 2>/dev/null | grep _snap" 2>/dev/null || true)
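  # Expected lvs lines use underscore naming, e.g. (name/timestamp illustrative):
  #   pvc-1234_snap_20260419  2026-04-19 03:00:01 +0000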
if [[ -z "$snap_output" ]]; then
[[ "$QUIET" == true ]] && section_always 36 "Backup Freshness — LVM PVC Snapshots"
warn "No LVM PVC snapshots found or SSH to 192.168.1.127 failed (BatchMode)"
json_add "backup_lvm_snapshots" "WARN" "SSH failed or no snapshots"
return 0
fi
local newest_age_hours
newest_age_hours=$(echo "$snap_output" | python3 -c '
import sys, time
from datetime import datetime
newest = None
for line in sys.stdin:
line = line.strip()
if not line:
continue
parts = line.split(None, 1)
if len(parts) < 2:
continue
date_str = parts[1].strip()
# lv_time format: "2026-04-19 03:00:01 +0000" or similar
for fmt in ("%Y-%m-%d %H:%M:%S %z", "%Y-%m-%d %H:%M:%S"):
try:
dt = datetime.strptime(date_str, fmt)
ts = dt.timestamp()
if newest is None or ts > newest:
newest = ts
break
except ValueError:
continue
if newest is None:
print("parse_error")
else:
age = (time.time() - newest) / 3600
print(f"{age:.1f}")
' 2>/dev/null) || newest_age_hours="error"
if [[ "$newest_age_hours" == "parse_error" || "$newest_age_hours" == "error" ]]; then
[[ "$QUIET" == true ]] && section_always 36 "Backup Freshness — LVM PVC Snapshots"
warn "Could not parse LVM snapshot timestamps"
json_add "backup_lvm_snapshots" "WARN" "Parse error"
else
local count age_int
count=$(count_lines "$snap_output")
age_int=$(printf '%.0f' "$newest_age_hours")
if [[ "$age_int" -gt 25 ]]; then
[[ "$QUIET" == true ]] && section_always 36 "Backup Freshness — LVM PVC Snapshots"
fail "Newest LVM snapshot ${newest_age_hours}h old (>25h); $count total"
status="FAIL"
else
pass "LVM snapshots fresh — $count total, newest ${newest_age_hours}h old"
fi
detail="count=$count newest=${newest_age_hours}h"
json_add "backup_lvm_snapshots" "$status" "$detail"
fi
}
# --- 37. Monitoring: Prometheus + Alertmanager ---
check_monitoring_prom_am() {
section 37 "Monitoring — Prometheus + Alertmanager"
local detail="" had_issue=false status="PASS"
# Prometheus /-/ready
local prom_ready
prom_ready=$($KUBECTL exec -n monitoring deploy/prometheus-server -- \
wget -qO- "http://localhost:9090/-/ready" 2>/dev/null || true)
if echo "$prom_ready" | grep -qi "ready"; then
detail+="prometheus=ready; "
else
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 37 "Monitoring — Prometheus + Alertmanager"
fail "Prometheus /-/ready returned no Ready response"
detail+="prometheus=not-ready; "
had_issue=true
status="FAIL"
fi
# Alertmanager running pod count
local am_running
am_running=$($KUBECTL get pods -n monitoring --no-headers 2>/dev/null | \
grep alertmanager | awk '$3 == "Running"' | wc -l | tr -d ' ')
if [[ "$am_running" -gt 0 ]]; then
detail+="alertmanager=${am_running} running; "
else
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 37 "Monitoring — Prometheus + Alertmanager"
fail "Alertmanager: 0 Running pods"
detail+="alertmanager=none-running; "
had_issue=true
status="FAIL"
fi
[[ "$had_issue" == false ]] && pass "Prometheus Ready, $am_running Alertmanager pod(s) Running"
json_add "monitoring_prom_am" "$status" "$detail"
}
# --- 38. Monitoring: Vault Sealed Status ---
check_monitoring_vault() {
section 38 "Monitoring — Vault Sealed Status"
local output detail="" status="PASS"
output=$($KUBECTL exec -n vault vault-0 -- \
sh -c 'VAULT_ADDR=http://127.0.0.1:8200 vault status' 2>&1 || true)
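  # "vault status" exits 0 when unsealed and 2 when sealed (hence the || true);
  # its table contains a row literally matching "Sealed    false" (or "true").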
if [[ -z "$output" ]]; then
[[ "$QUIET" == true ]] && section_always 38 "Monitoring — Vault Sealed Status"
fail "Cannot exec vault status on vault-0"
json_add "monitoring_vault" "FAIL" "Exec failed"
return 0
fi
if echo "$output" | grep -qi "^Sealed[[:space:]]*false"; then
pass "Vault unsealed"
detail="sealed=false"
json_add "monitoring_vault" "PASS" "$detail"
elif echo "$output" | grep -qi "^Sealed[[:space:]]*true"; then
[[ "$QUIET" == true ]] && section_always 38 "Monitoring — Vault Sealed Status"
fail "Vault is SEALED — secrets unavailable"
detail="sealed=true"
status="FAIL"
json_add "monitoring_vault" "$status" "$detail"
else
[[ "$QUIET" == true ]] && section_always 38 "Monitoring — Vault Sealed Status"
warn "Cannot parse vault status output"
json_add "monitoring_vault" "WARN" "Parse error"
fi
}
# --- 39. Monitoring: ClusterSecretStore Ready ---
check_monitoring_css() {
section 39 "Monitoring — ClusterSecretStore Ready"
local css not_ready detail="" status="PASS"
css=$($KUBECTL get clustersecretstore -o json 2>/dev/null) || {
[[ "$QUIET" == true ]] && section_always 39 "Monitoring — ClusterSecretStore Ready"
warn "ClusterSecretStore CRD not installed"
json_add "monitoring_css" "WARN" "CRD missing"
return 0
}
not_ready=$(echo "$css" | python3 -c '
import json, sys
data = json.load(sys.stdin)
for item in data.get("items", []):
name = item["metadata"]["name"]
conds = item.get("status", {}).get("conditions", [])
ready = next((c for c in conds if c.get("type") == "Ready"), None)
if not ready or ready.get("status") != "True":
        # avoid escaped quotes in the f-string (SyntaxError on Python < 3.12)
        reason = ready.get("reason", "NoCondition") if ready else "NoCondition"
        print(f"{name}:{reason}")
' 2>/dev/null) || true
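  # not_ready lines are "name:reason", e.g. "vault-backend:InvalidProviderConfig"
  # (store name and reason illustrative).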
if [[ -z "$not_ready" ]]; then
local total
total=$(echo "$css" | python3 -c 'import json,sys; print(len(json.load(sys.stdin).get("items",[])))' 2>/dev/null || echo "?")
pass "All $total ClusterSecretStores Ready"
json_add "monitoring_css" "PASS" "$total Ready"
else
[[ "$QUIET" == true ]] && section_always 39 "Monitoring — ClusterSecretStore Ready"
while IFS= read -r line; do
fail "ClusterSecretStore not Ready: $line"
detail+="$line; "
done <<< "$not_ready"
status="FAIL"
json_add "monitoring_css" "$status" "$detail"
fi
}
# --- 40. External Reachability: Cloudflared + Authentik Replicas ---
check_external_replicas() {
section 40 "External — Cloudflared + Authentik Replicas"
local detail="" had_issue=false status="PASS"
# Cloudflared
local cf_json cf_ready cf_desired
cf_json=$($KUBECTL get deployment cloudflared -n cloudflared -o json 2>/dev/null || true)
if [[ -z "$cf_json" ]]; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 40 "External — Cloudflared + Authentik Replicas"
fail "Cloudflared deployment not found"
detail+="cloudflared=missing; "
had_issue=true
status="FAIL"
else
cf_ready=$(echo "$cf_json" | python3 -c 'import json,sys; print(json.load(sys.stdin).get("status",{}).get("readyReplicas",0) or 0)' 2>/dev/null || echo "0")
cf_desired=$(echo "$cf_json" | python3 -c 'import json,sys; print(json.load(sys.stdin).get("spec",{}).get("replicas",0) or 0)' 2>/dev/null || echo "0")
if [[ "$cf_ready" != "$cf_desired" ]]; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 40 "External — Cloudflared + Authentik Replicas"
fail "Cloudflared: $cf_ready/$cf_desired ready (external access degraded)"
detail+="cloudflared=${cf_ready}/${cf_desired}; "
had_issue=true
status="FAIL"
else
detail+="cloudflared=${cf_ready}/${cf_desired}; "
fi
fi
# Authentik server (Helm chart names the deployment goauthentik-server)
local auth_json auth_ready auth_desired
auth_json=$($KUBECTL get deployment goauthentik-server -n authentik -o json 2>/dev/null || true)
if [[ -z "$auth_json" ]]; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 40 "External — Cloudflared + Authentik Replicas"
warn "goauthentik-server deployment not found in authentik namespace"
detail+="authentik=missing; "
had_issue=true
[[ "$status" != "FAIL" ]] && status="WARN"
else
auth_ready=$(echo "$auth_json" | python3 -c 'import json,sys; print(json.load(sys.stdin).get("status",{}).get("readyReplicas",0) or 0)' 2>/dev/null || echo "0")
auth_desired=$(echo "$auth_json" | python3 -c 'import json,sys; print(json.load(sys.stdin).get("spec",{}).get("replicas",0) or 0)' 2>/dev/null || echo "0")
if [[ "$auth_ready" != "$auth_desired" ]]; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 40 "External — Cloudflared + Authentik Replicas"
fail "goauthentik-server: $auth_ready/$auth_desired ready (auth degraded)"
detail+="authentik=${auth_ready}/${auth_desired}; "
had_issue=true
status="FAIL"
else
detail+="authentik=${auth_ready}/${auth_desired}; "
fi
fi
[[ "$had_issue" == false ]] && pass "Cloudflared + authentik-server at full replicas ($detail)"
json_add "external_replicas" "$status" "$detail"
}
# --- 41. External Reachability: ExternalAccessDivergence Alert ---
check_external_divergence() {
section 41 "External — ExternalAccessDivergence Alert"
local alerts result detail="" status="PASS"
alerts=$($KUBECTL exec -n monitoring deploy/prometheus-server -- \
wget -qO- "http://localhost:9090/api/v1/alerts" 2>/dev/null || true)
if [[ -z "$alerts" ]]; then
[[ "$QUIET" == true ]] && section_always 41 "External — ExternalAccessDivergence Alert"
warn "Cannot query Prometheus alerts"
json_add "external_divergence" "WARN" "Cannot query"
return 0
fi
result=$(echo "$alerts" | python3 -c '
import json, sys
try:
data = json.load(sys.stdin)
alerts = data.get("data", {}).get("alerts", []) if isinstance(data, dict) else data
firing = [a for a in alerts
if a.get("labels", {}).get("alertname") == "ExternalAccessDivergence"
and a.get("state") == "firing"]
if firing:
hosts = [a.get("labels", {}).get("host") or a.get("labels", {}).get("service") or "?" for a in firing]
print(f"{len(firing)}:" + ",".join(hosts))
else:
print("0:")
except Exception as e:
print(f"error:{e}")
' 2>/dev/null) || result="error:parse"
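  # result is "N:host1,host2,..." while firing, or "0:" when clean, so the
  # cut calls below can split on the first colon.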
if [[ "$result" == error:* ]]; then
[[ "$QUIET" == true ]] && section_always 41 "External — ExternalAccessDivergence Alert"
warn "Failed to parse alerts JSON: ${result#error:}"
json_add "external_divergence" "WARN" "Parse error"
return 0
fi
local count names
count=$(echo "$result" | cut -d: -f1)
names=$(echo "$result" | cut -d: -f2-)
if [[ "$count" -eq 0 ]]; then
pass "ExternalAccessDivergence not firing"
json_add "external_divergence" "PASS" "Not firing"
else
[[ "$QUIET" == true ]] && section_always 41 "External — ExternalAccessDivergence Alert"
fail "ExternalAccessDivergence firing for $count target(s): $names"
status="FAIL"
detail="$count firing: $names"
json_add "external_divergence" "$status" "$detail"
fi
}
# --- 42. External Reachability: Traefik 5xx Rate ---
check_external_traefik_5xx() {
section 42 "External — Traefik 5xx Rate (15m)"
local query_result detail="" status="PASS"
query_result=$($KUBECTL exec -n monitoring deploy/prometheus-server -- \
wget -qO- 'http://localhost:9090/api/v1/query?query=topk(10,rate(traefik_service_requests_total{code=~%225..%22}%5B15m%5D))' 2>/dev/null || true)
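  # URL-decoded, the query above reads:
  #   topk(10, rate(traefik_service_requests_total{code=~"5.."}[15m]))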
if [[ -z "$query_result" ]]; then
[[ "$QUIET" == true ]] && section_always 42 "External — Traefik 5xx Rate (15m)"
warn "Cannot query Prometheus for traefik 5xx rate"
json_add "external_traefik_5xx" "WARN" "Query failed"
return 0
fi
local parsed
parsed=$(echo "$query_result" | python3 -c '
import json, sys
try:
data = json.load(sys.stdin)
results = data.get("data", {}).get("result", [])
hot = [(r.get("metric", {}).get("service", "?"), float(r.get("value", [0, "0"])[1])) for r in results]
hot = [(s, v) for s, v in hot if v > 0.01] # 1% req/s threshold
hot.sort(key=lambda x: -x[1])
if not hot:
print("0:")
else:
top = [f"{s}={v:.2f}/s" for s, v in hot[:5]]
print(f"{len(hot)}:" + "; ".join(top))
except Exception as e:
print(f"error:{e}")
' 2>/dev/null) || parsed="error:parse"
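  # parsed is "N:svc1=0.42/s; svc2=0.17/s" (top 5) or "0:" when nothing exceeds
  # the 0.01 req/s floor; service names and rates here are illustrative.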
if [[ "$parsed" == error:* ]]; then
[[ "$QUIET" == true ]] && section_always 42 "External — Traefik 5xx Rate (15m)"
warn "Parse failed: ${parsed#error:}"
json_add "external_traefik_5xx" "WARN" "Parse error"
return 0
fi
local count top
count=$(echo "$parsed" | cut -d: -f1)
top=$(echo "$parsed" | cut -d: -f2-)
if [[ "$count" -eq 0 ]]; then
pass "No Traefik services with 5xx rate >0.01 req/s (last 15m)"
json_add "external_traefik_5xx" "PASS" "None above threshold"
else
[[ "$QUIET" == true ]] && section_always 42 "External — Traefik 5xx Rate (15m)"
# WARN at any 5xx; FAIL if top service >1 req/s
local top_rate
    top_rate=$(echo "$top" | grep -oE '[0-9.]+/s' | head -1 | tr -d '/s')
    top_rate=${top_rate:-0}  # default to 0 so the awk comparison below never sees an empty operand
if awk "BEGIN{exit !($top_rate > 1.0)}" 2>/dev/null; then
fail "$count Traefik service(s) with elevated 5xx: $top"
status="FAIL"
else
warn "$count Traefik service(s) emitting 5xx: $top"
status="WARN"
fi
detail="$count services: $top"
json_add "external_traefik_5xx" "$status" "$detail"
fi
}
# --- Summary ---
print_summary() {
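  # Two output shapes: with --json, one object per check is emitted, e.g.
  # (check name and detail illustrative):
  #   {"check":"node_status","status":"PASS","detail":"3 nodes Ready"}
  # Otherwise the colour-coded human-readable summary below is printed.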
if [[ "$JSON" == true ]]; then
echo "{"
echo " \"timestamp\": \"$(date -u +%Y-%m-%dT%H:%M:%SZ)\","
echo " \"pass\": $PASS_COUNT,"
echo " \"warn\": $WARN_COUNT,"
echo " \"fail\": $FAIL_COUNT,"
echo " \"checks\": ["
local first=true
for r in "${JSON_RESULTS[@]}"; do
if [[ "$first" == true ]]; then
echo " $r"
first=false
else
echo " ,$r"
fi
done
echo " ]"
echo "}"
return 0
fi
echo ""
echo -e "${BOLD}═══════════════════════════════════════${NC}"
echo -e "${BOLD} Cluster Health Summary${NC}"
echo -e "${BOLD}═══════════════════════════════════════${NC}"
echo -e " ${GREEN}PASS${NC}: $PASS_COUNT ${YELLOW}WARN${NC}: $WARN_COUNT ${RED}FAIL${NC}: $FAIL_COUNT"
echo ""
if [[ "$FAIL_COUNT" -gt 0 ]]; then
echo -e " Overall: ${RED}UNHEALTHY${NC}"
elif [[ "$WARN_COUNT" -gt 0 ]]; then
echo -e " Overall: ${YELLOW}DEGRADED${NC}"
else
echo -e " Overall: ${GREEN}HEALTHY${NC}"
fi
echo ""
}
# --- Main ---
main() {
parse_args "$@"
if [[ "$JSON" != true ]]; then
    echo -e "${BOLD}Cluster Health Check${NC} — $(date '+%Y-%m-%d %H:%M:%S')"
echo -e "Kubeconfig: $KUBECONFIG_PATH"
if [[ "$FIX" == true ]]; then
echo -e "${YELLOW}Auto-fix mode enabled${NC}"
fi
fi
check_nodes
check_resources
check_conditions
check_pods
check_evicted
check_daemonsets
check_deployments
check_pvcs
check_hpa
check_cronjobs
check_crowdsec
check_ingresses
check_alerts
check_uptime_kuma
check_resourcequota
check_statefulsets
check_node_disk
check_helm_releases
check_kyverno
check_nfs
check_dns
check_tls_certs
check_gpu
check_cloudflare_tunnel
check_overcommit
check_ha_entities
check_ha_integrations
check_ha_automations
check_ha_system
check_hardware_exporters
check_cert_manager_certificates
check_cert_manager_expiry
check_cert_manager_requests
check_backup_per_db
check_backup_offsite_sync
check_backup_lvm_snapshots
check_monitoring_prom_am
check_monitoring_vault
check_monitoring_css
check_external_replicas
check_external_divergence
check_external_traefik_5xx
print_summary
# Exit code: 2 for failures, 1 for warnings, 0 for clean
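  # e.g. a cron/CI wrapper can branch on severity (illustrative):
  #   ./scripts/cluster_healthcheck.sh -q; rc=$?  # 0=healthy, 1=degraded, 2=unhealthy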
if [[ "$FAIL_COUNT" -gt 0 ]]; then
exit 2
elif [[ "$WARN_COUNT" -gt 0 ]]; then
exit 1
fi
exit 0
}
main "$@"