infra/scripts/cluster_healthcheck.sh

#!/usr/bin/env bash

# Cluster health check script.
# Runs 44 diagnostic checks against the Kubernetes cluster and prints
# a colour-coded report with PASS / WARN / FAIL for each section.
#
# Usage: ./scripts/cluster_healthcheck.sh [--fix|--no-fix] [--quiet|-q] [--json] [--kubeconfig <path>]

set -euo pipefail

# --- Colors ---
# ANSI escape sequences kept as literal backslash text; they only turn into
# real escapes when printed through `echo -e` by the helpers below.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
BLUE='\033[0;34m'
BOLD='\033[1m'
# Reset sequence appended after every coloured fragment.
NC='\033[0m'

# --- Globals ---
# Running tallies, incremented by pass() / warn() / fail().
PASS_COUNT=0
WARN_COUNT=0
FAIL_COUNT=0
# Option flags; parse_args() flips them from the command line.
FIX=false
QUIET=false
JSON=false
# Kubeconfig resolution: $KUBECONFIG when set and non-empty, otherwise
# ~/.kube/config; if that path is not a regular file, fall back to ./config
# in the current working directory.
KUBECONFIG_PATH="${KUBECONFIG:-${HOME}/.kube/config}"
[[ -f "$KUBECONFIG_PATH" ]] || KUBECONFIG_PATH="$(pwd)/config"
# kubectl command line (including --kubeconfig); filled in by parse_args().
KUBECTL=""
# One JSON object (as a string) per check result, appended by json_add().
JSON_RESULTS=()
# Denominator shown in the "[n/TOTAL]" section headers.
TOTAL_CHECKS=44

# --- Helpers ---
# Print an informational "[INFO]" line; produces no output in JSON mode.
info() {
    if [[ "$JSON" != true ]]; then
        echo -e "${BLUE}[INFO]${NC} $*"
    fi
    return 0
}
# Record a passing check. The counter is always bumped; the "[PASS]" line is
# only printed when neither JSON nor quiet mode is active.
pass() {
    : "$((PASS_COUNT += 1))"
    if [[ "$JSON" == true || "$QUIET" == true ]]; then
        return 0
    fi
    echo -e "  ${GREEN}[PASS]${NC} $*"
}
# Record a warning. The counter is always bumped; the "[WARN]" line is
# printed in every mode except JSON (quiet mode still shows warnings).
warn() {
    : "$((WARN_COUNT += 1))"
    if [[ "$JSON" == true ]]; then
        return 0
    fi
    echo -e "  ${YELLOW}[WARN]${NC} $*"
}
# Record a failure. The counter is always bumped; the "[FAIL]" line is
# printed in every mode except JSON (quiet mode still shows failures).
fail() {
    : "$((FAIL_COUNT += 1))"
    if [[ "$JSON" == true ]]; then
        return 0
    fi
    echo -e "  ${RED}[FAIL]${NC} $*"
}

# Print the "[n/TOTAL] title" header for a check, preceded by a blank line.
# Suppressed in JSON mode and in quiet mode.
#   $1 - check number
#   $2 - section title
section() {
    local idx="$1" heading="$2"
    if [[ "$JSON" == true || "$QUIET" == true ]]; then
        return 0
    fi
    echo ""
    echo -e "${BOLD}[$idx/$TOTAL_CHECKS] $heading${NC}"
}

# Same header as section(), but printed even in quiet mode; only JSON mode
# suppresses it. Checks call this before reporting a WARN/FAIL when quiet.
#   $1 - check number
#   $2 - section title
section_always() {
    local idx="$1" heading="$2"
    if [[ "$JSON" != true ]]; then
        echo ""
        echo -e "${BOLD}[$idx/$TOTAL_CHECKS] $heading${NC}"
    fi
    return 0
}

# Append one check result to JSON_RESULTS as a JSON object string.
#   $1 - check identifier (inserted verbatim; callers pass fixed literals)
#   $2 - status: PASS / WARN / FAIL (inserted verbatim)
#   $3 - free-form detail text; whitespace-trimmed and JSON-escaped
# The detail is fed with printf rather than echo so that values such as
# "-n" or "-e" are not swallowed as echo options. If python3 is missing or
# fails, a pure-bash escaper is used instead so the stored entry stays valid
# JSON (the fallback covers backslash, double quote, newline, tab and CR).
json_add() {
    local name="$1" status="$2" detail="$3"
    local escaped=""

    if command -v python3 >/dev/null 2>&1; then
        escaped=$(printf '%s' "$detail" \
            | python3 -c 'import json,sys; print(json.dumps(sys.stdin.read().strip()))' 2>/dev/null) \
            || escaped=""
    fi

    if [[ -z "$escaped" ]]; then
        local raw="$detail"
        # Trim leading, then trailing, whitespace (mirrors Python's strip()).
        raw="${raw#"${raw%%[![:space:]]*}"}"
        raw="${raw%"${raw##*[![:space:]]}"}"
        # Backslash must be escaped first so later escapes are not doubled.
        raw=${raw//\\/\\\\}
        raw=${raw//\"/\\\"}
        raw=${raw//$'\n'/\\n}
        raw=${raw//$'\t'/\\t}
        raw=${raw//$'\r'/\\r}
        escaped="\"$raw\""
    fi

    JSON_RESULTS+=("{\"check\":\"$name\",\"status\":\"$status\",\"detail\":$escaped}")
}

# Count the lines held in a string and print the number.
#   $1 - text to count; an empty string counts as 0 lines
# printf is used instead of echo so a value like "-n" or "-e" is counted as
# data rather than being interpreted as an echo option.
count_lines() {
    local input="$1" total
    if [[ -z "$input" ]]; then
        echo 0
        return 0
    fi
    total=$(printf '%s\n' "$input" | wc -l)
    # Some wc implementations pad the number with spaces; strip them.
    echo "${total//[[:space:]]/}"
}

# --- Argument parsing ---
# Parse command-line options into the FIX / QUIET / JSON / KUBECONFIG_PATH
# globals and build the KUBECTL command string.
#   $@ - the script's arguments
# Exits 0 after printing help, and exits 1 on an unknown option or when
# --kubeconfig is given without a value (previously that case tripped an
# unbound-variable error instead of a usable message).
# NOTE: KUBECTL is expanded unquoted by the checks, so a kubeconfig path
# containing whitespace will be split into several words.
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --fix)        FIX=true; shift ;;
            --no-fix)     FIX=false; shift ;;
            --quiet|-q)   QUIET=true; shift ;;
            --json)       JSON=true; shift ;;
            --kubeconfig)
                if [[ $# -lt 2 || -z "${2:-}" ]]; then
                    echo "Option --kubeconfig requires a path argument" >&2
                    exit 1
                fi
                KUBECONFIG_PATH="$2"
                shift 2
                ;;
            -h|--help)
                echo "Usage: $0 [--fix|--no-fix] [--quiet|-q] [--json] [--kubeconfig <path>]"
                echo ""
                echo "Flags:"
                echo "  --fix              Auto-remediate safe issues (delete evicted pods)"
                echo "  --no-fix           Disable auto-remediation (default)"
                echo "  --quiet, -q        Only show WARN and FAIL sections"
                echo "  --json             Machine-readable JSON output"
                echo "  --kubeconfig PATH  Override kubeconfig (default: \$KUBECONFIG or ~/.kube/config, else \$(pwd)/config)"
                exit 0
                ;;
            *)
                echo "Unknown option: $1" >&2
                exit 1
                ;;
        esac
    done
    KUBECTL="kubectl --kubeconfig $KUBECONFIG_PATH"
}

# --- 1. Node Status ---
# Check 1: FAIL when any node's STATUS column is not exactly "Ready" (or the
# cluster cannot be listed), WARN when nodes report more than one distinct
# value in column 5 (version), PASS otherwise.
check_nodes() {
    section 1 "Node Status"
    local listing unready node_versions version_count flat_versions summary=""

    if ! listing=$($KUBECTL get nodes --no-headers 2>&1); then
        fail "Cannot reach cluster"
        json_add "node_status" "FAIL" "Cannot reach cluster"
        return 0
    fi

    unready=$(awk '$2 != "Ready" {print $1}' <<< "$listing" || true)
    node_versions=$(awk '{print $5}' <<< "$listing" | sort -u)
    version_count=$(wc -l <<< "$node_versions" | tr -d ' ')

    if [[ -n "$unready" ]]; then
        [[ "$QUIET" == true ]] && section_always 1 "Node Status"
        fail "NotReady nodes: $unready"
        summary="NotReady: $unready"
        json_add "node_status" "FAIL" "$summary"
        return
    fi

    if [[ "$version_count" -gt 1 ]]; then
        [[ "$QUIET" == true ]] && section_always 1 "Node Status"
        # One version per line -> single space-separated line for the report.
        flat_versions=$(tr '\n' ' ' <<< "$node_versions")
        warn "Version mismatch across nodes: $flat_versions"
        summary="Version mismatch: $flat_versions"
        json_add "node_status" "WARN" "$summary"
        return
    fi

    pass "All nodes Ready, version ${node_versions%%$'\n'*}"
    summary="All nodes Ready"
    json_add "node_status" "PASS" "$summary"
}

# --- 2. Node Resources ---
# Check 2: read per-node CPU% (column 3) and memory% (column 5) from
# `kubectl top nodes`. Above 90% on either is a FAIL, above 80% a WARN.
# Rows whose percentages contain "unknown" are noted in the detail string
# and otherwise skipped. A failing `kubectl top` is reported as FAIL.
check_resources() {
    section 2 "Node Resources"
    local metrics summary="" flagged=false verdict="PASS"
    local row node cpu_pct mem_pct

    if ! metrics=$($KUBECTL top nodes --no-headers 2>&1); then
        fail "metrics-server unavailable"
        json_add "node_resources" "FAIL" "metrics-server unavailable"
        return 0
    fi

    while IFS= read -r row; do
        # Split the row on whitespace; unused columns land in "_".
        read -r node _ cpu_pct _ mem_pct _ <<< "$row" || true
        cpu_pct=${cpu_pct//[%]/}
        mem_pct=${mem_pct//[%]/}

        # Skip nodes where metrics are not yet available
        if [[ "$cpu_pct" == *"unknown"* ]] || [[ "$mem_pct" == *"unknown"* ]]; then
            summary+="$node metrics unavailable; "
            continue
        fi

        if [[ "$cpu_pct" -gt 90 ]] || [[ "$mem_pct" -gt 90 ]]; then
            if [[ "$flagged" == false && "$QUIET" == true ]]; then
                section_always 2 "Node Resources"
            fi
            fail "$node: CPU ${cpu_pct}%, Mem ${mem_pct}%"
            summary+="$node CPU=${cpu_pct}% Mem=${mem_pct}% [FAIL]; "
            flagged=true
            verdict="FAIL"
        elif [[ "$cpu_pct" -gt 80 ]] || [[ "$mem_pct" -gt 80 ]]; then
            if [[ "$flagged" == false && "$QUIET" == true ]]; then
                section_always 2 "Node Resources"
            fi
            warn "$node: CPU ${cpu_pct}%, Mem ${mem_pct}%"
            summary+="$node CPU=${cpu_pct}% Mem=${mem_pct}% [WARN]; "
            flagged=true
            # A WARN never downgrades an earlier FAIL.
            if [[ "$verdict" != "FAIL" ]]; then
                verdict="WARN"
            fi
        else
            summary+="$node CPU=${cpu_pct}% Mem=${mem_pct}% [OK]; "
        fi
    done <<< "$metrics"

    if [[ "$flagged" == false ]]; then
        pass "All nodes below 80% CPU and memory"
    fi
    json_add "node_resources" "$verdict" "$summary"
}

# --- 3. Node Conditions ---
# Check 3: FAIL once per node condition of type MemoryPressure, DiskPressure
# or PIDPressure whose status is "True"; PASS when none are reported.
# stderr of the pipeline is captured too, so anything the Python snippet
# prints on error also ends up being reported as a FAIL line.
check_conditions() {
    section 3 "Node Conditions"
    local pressure entry

    pressure=$($KUBECTL get nodes -o json | python3 -c '
import json, sys
data = json.load(sys.stdin)
for node in data["items"]:
    name = node["metadata"]["name"]
    for c in node["status"]["conditions"]:
        if c["type"] in ("MemoryPressure","DiskPressure","PIDPressure") and c["status"] == "True":
            print(name + ": " + c["type"])
' 2>&1) || true

    if [[ -z "$pressure" ]]; then
        pass "No pressure conditions on any node"
        json_add "node_conditions" "PASS" "No pressure conditions"
        return
    fi

    [[ "$QUIET" == true ]] && section_always 3 "Node Conditions"
    while IFS= read -r entry; do
        fail "$entry"
    done <<< "$pressure"
    json_add "node_conditions" "FAIL" "$pressure"
}

# --- 4. Problematic Pods ---
# Check 4: list pods stuck in CrashLoopBackOff / Error / Pending / Init: /
# ImagePullBackOff / ErrImagePull, ignoring pods owned by a Job.
# PASS with none, WARN for 1-10, FAIL for more than 10.
#
# Fixes relative to the previous version:
#  * The Python snippet used \" inside an f-string expression, which is a
#    SyntaxError; because stderr was discarded the Job-owned list was always
#    empty and the filter never ran. It now uses plain concatenation.
#  * The Job-owned list is handed to awk as a file instead of through
#    `-v`, since some awk implementations reject newlines in -v values.
check_pods() {
    section 4 "Problematic Pods"
    local bad count detail="" status="PASS"

    # Skip pods owned by Jobs (which are owned by CronJobs). A failed CronJob
    # retry isn't a problematic pod — the next CronJob fire will replace it.
    # Real problems are deployments / statefulsets / daemonsets in trouble.
    local job_owned_pods
    job_owned_pods=$($KUBECTL get pods -A -o json 2>/dev/null | python3 -c '
import json, sys
d = json.load(sys.stdin)
for p in d["items"]:
    meta = p["metadata"]
    owners = meta.get("ownerReferences", [])
    if any(o.get("kind") == "Job" for o in owners):
        print(meta["namespace"] + " " + meta["name"])
' 2>/dev/null || true)

    # Two listings are merged: non-Running/non-Succeeded pods in a bad state,
    # plus Running pods whose STATUS column shows a crash/pull loop. The awk
    # stage de-duplicates on namespace+name; sed drops blank lines.
    bad=$( {
        $KUBECTL get pods -A --no-headers --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null \
            | grep -E 'CrashLoopBackOff|Error|Pending|Init:|ImagePullBackOff|ErrImagePull' || true
        $KUBECTL get pods -A --no-headers 2>/dev/null \
            | grep -E 'CrashLoopBackOff|ImagePullBackOff|ErrImagePull' || true
    } | awk '!seen[$1,$2]++' | sed '/^$/d') || true

    # Filter out Job-owned pods ("namespace name" keys from the first input).
    if [[ -n "$job_owned_pods" && -n "$bad" ]]; then
        bad=$(awk '
            NR == FNR { skip[$0] = 1; next }
            { key = $1 " " $2; if (!(key in skip)) print }
        ' <(printf '%s\n' "$job_owned_pods") <(printf '%s\n' "$bad"))
    fi

    count=$(count_lines "$bad")

    if [[ "$count" -eq 0 ]]; then
        pass "No problematic pods"
        detail="None"
    elif [[ "$count" -le 10 ]]; then
        [[ "$QUIET" == true ]] && section_always 4 "Problematic Pods"
        warn "$count problematic pod(s):"
        [[ "$JSON" != true ]] && echo "$bad" | while IFS= read -r line; do echo "    $line"; done
        detail="$count pods"
        status="WARN"
    else
        [[ "$QUIET" == true ]] && section_always 4 "Problematic Pods"
        fail "$count problematic pods (showing first 10):"
        [[ "$JSON" != true ]] && echo "$bad" | head -10 | while IFS= read -r line; do echo "    $line"; done
        detail="$count pods"
        status="FAIL"
    fi
    json_add "problematic_pods" "$status" "$detail"
}

# --- 5. Evicted/Failed Pods ---
# Check 5: count pods in phase Failed that are not owned by a Job.
# PASS with none, WARN for 1-50, FAIL above 50. With --fix, the counted pods
# are deleted.
#
# Fixes relative to the previous version:
#  * The Python snippet used \" inside an f-string expression, which is a
#    SyntaxError; with stderr discarded the list was always empty, so the
#    check always passed and --fix never ran. It now joins plain strings.
#  * --fix used to delete every Failed pod cluster-wide, including the
#    Job-owned pods this check deliberately excludes (kept for log
#    inspection). It now deletes exactly the pods that were counted.
check_evicted() {
    section 5 "Evicted/Failed Pods"
    local evicted count detail="" status="PASS"

    # Exclude pods owned by Jobs — those are CronJob retries that K8s leaves
    # behind for log inspection. They're not "evicted" in the cluster-health
    # sense and the next CronJob fire replaces them.
    # Output: one "namespace<TAB>name<TAB>reason" line per pod.
    evicted=$($KUBECTL get pods -A -o json --field-selector=status.phase=Failed 2>/dev/null | python3 -c '
import json, sys
try:
    d = json.load(sys.stdin)
except Exception:
    sys.exit(0)
for p in d.get("items", []):
    meta = p["metadata"]
    owners = meta.get("ownerReferences", [])
    if any(o.get("kind") == "Job" for o in owners):
        continue
    reason = p.get("status", {}).get("reason") or ""
    print("\t".join([meta["namespace"], meta["name"], str(reason)]))
' 2>/dev/null || true)
    count=$(count_lines "$evicted")

    if [[ "$count" -eq 0 ]]; then
        pass "No evicted or failed pods"
        detail="0"
    elif [[ "$count" -le 50 ]]; then
        [[ "$QUIET" == true ]] && section_always 5 "Evicted/Failed Pods"
        warn "$count evicted/failed pod(s)"
        detail="$count pods"
        status="WARN"
    else
        [[ "$QUIET" == true ]] && section_always 5 "Evicted/Failed Pods"
        fail "$count evicted/failed pods"
        detail="$count pods"
        status="FAIL"
    fi

    if [[ "$FIX" == true && "$count" -gt 0 ]]; then
        info "Deleting $count evicted/failed pods..."
        local pod_ns pod_name
        while IFS=$'\t' read -r pod_ns pod_name _; do
            [[ -n "$pod_ns" && -n "$pod_name" ]] || continue
            # Best effort: a pod may already be gone by the time we get here.
            $KUBECTL delete pod -n "$pod_ns" "$pod_name" 2>/dev/null || true
        done <<< "$evicted"
        info "Deleted evicted/failed pods"
    fi
    json_add "evicted_pods" "$status" "$detail"
}

# --- 6. DaemonSets ---
# Check 6: FAIL for every DaemonSet whose DESIRED (column 3) differs from
# READY (column 5) in `kubectl get daemonsets -A`; PASS otherwise.
# A failing kubectl call is reported as FAIL.
#
# Because stderr is captured, kubectl's "No resources found" notice used to
# be parsed as a DaemonSet row and reported as a failure. It is now treated
# as "nothing to check", matching check_pvcs and check_hpa.
check_daemonsets() {
    section 6 "DaemonSets"
    local ds detail="" had_issue=false
    local line ns name desired ready

    ds=$($KUBECTL get daemonsets -A --no-headers 2>&1) || { fail "Cannot list DaemonSets"; json_add "daemonsets" "FAIL" "Cannot list"; return 0; }

    if [[ -z "$ds" || "$ds" == *"No resources found"* ]]; then
        pass "No DaemonSets in cluster"
        json_add "daemonsets" "PASS" "No DaemonSets"
        return 0
    fi

    while IFS= read -r line; do
        # Ignore blank rows rather than comparing two empty fields.
        [[ -z "${line//[[:space:]]/}" ]] && continue
        # One read splits the row; unused columns land in "_".
        read -r ns name desired _ ready _ <<< "$line" || true

        if [[ "$desired" != "$ready" ]]; then
            [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 6 "DaemonSets"
            fail "$ns/$name: desired=$desired ready=$ready"
            detail+="$ns/$name desired=$desired ready=$ready; "
            had_issue=true
        fi
    done <<< "$ds"

    if [[ "$had_issue" == false ]]; then
        pass "All DaemonSets healthy (desired == ready)"
        json_add "daemonsets" "PASS" "All healthy"
    else
        json_add "daemonsets" "FAIL" "$detail"
    fi
}

# --- 7. Deployments ---
# Check 7: FAIL for every Deployment whose READY column (column 3, in the
# form "current/desired") has current != desired; PASS otherwise.
# A failing kubectl call is reported as FAIL.
#
# Because stderr is captured, kubectl's "No resources found" notice used to
# be parsed as a Deployment row and reported as a failure. It is now treated
# as "nothing to check", matching check_pvcs and check_hpa.
check_deployments() {
    section 7 "Deployments"
    local deps detail="" had_issue=false
    local line ns name ready current desired

    deps=$($KUBECTL get deployments -A --no-headers 2>&1) || { fail "Cannot list Deployments"; json_add "deployments" "FAIL" "Cannot list"; return 0; }

    if [[ -z "$deps" || "$deps" == *"No resources found"* ]]; then
        pass "No Deployments in cluster"
        json_add "deployments" "PASS" "No Deployments"
        return 0
    fi

    while IFS= read -r line; do
        # Ignore blank rows rather than comparing two empty fields.
        [[ -z "${line//[[:space:]]/}" ]] && continue
        read -r ns name ready _ <<< "$line" || true
        # Split "current/desired" without forking cut.
        current="${ready%%/*}"
        desired="${ready#*/}"

        if [[ "$current" != "$desired" ]]; then
            [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 7 "Deployments"
            fail "$ns/$name: $current/$desired ready"
            detail+="$ns/$name $current/$desired; "
            had_issue=true
        fi
    done <<< "$deps"

    if [[ "$had_issue" == false ]]; then
        pass "All deployments fully available"
        json_add "deployments" "PASS" "All available"
    else
        json_add "deployments" "FAIL" "$detail"
    fi
}

# --- 8. PVC Status ---
# Check 8: FAIL for every PVC whose STATUS (column 3 of
# `kubectl get pvc -A`) is not "Bound". Empty output or kubectl's
# "No resources found" notice counts as PASS with nothing to check.
check_pvcs() {
    section 8 "PVC Status"
    local listing row summary="" flagged=false
    local ns name phase

    listing=$($KUBECTL get pvc -A --no-headers 2>&1) || true
    case "$listing" in
        ""|*"No resources found"*)
            pass "No PVCs in cluster"
            json_add "pvcs" "PASS" "No PVCs"
            return 0
            ;;
    esac

    while IFS= read -r row; do
        # Split the row on whitespace; columns after STATUS land in "_".
        read -r ns name phase _ <<< "$row" || true
        if [[ "$phase" == "Bound" ]]; then
            continue
        fi
        if [[ "$flagged" == false && "$QUIET" == true ]]; then
            section_always 8 "PVC Status"
        fi
        fail "$ns/$name: $phase"
        summary+="$ns/$name=$phase; "
        flagged=true
    done <<< "$listing"

    if [[ "$flagged" == true ]]; then
        json_add "pvcs" "FAIL" "$summary"
    else
        pass "All PVCs Bound"
        json_add "pvcs" "PASS" "All Bound"
    fi
}

# --- 9. HPA Health ---
# Check 9: inspect the TARGETS of every HorizontalPodAutoscaler.
#   FAIL - targets contain "<unknown>" or a current utilisation above 150%
#   WARN - a current utilisation above 100%
#   PASS - otherwise, or when no HPAs exist
#
# Fix: with `-A` the row layout is
#   NAMESPACE NAME REFERENCE TARGETS MINPODS MAXPODS REPLICAS AGE
# so column 3 is REFERENCE, not TARGETS — the old code inspected the wrong
# column and could never flag anything. TARGETS may itself contain spaces
# ("45%/80%, 30%/50%"), so it is taken as everything from column 4 up to
# the four trailing columns.
check_hpa() {
    section 9 "HPA Health"
    local hpas detail="" had_issue=false status="PASS"

    hpas=$($KUBECTL get hpa -A --no-headers 2>&1) || true
    if [[ -z "$hpas" || "$hpas" == *"No resources found"* ]]; then
        pass "No HPAs configured"
        json_add "hpa" "PASS" "No HPAs"
        return 0
    fi

    local line ns name targets pcts pct
    local -a fields
    while IFS= read -r line; do
        fields=()
        read -r -a fields <<< "$line" || true
        ns="${fields[0]:-}"
        name="${fields[1]:-}"
        if (( ${#fields[@]} >= 8 )); then
            # Fields 4 .. N-4 (1-based), re-joined with single spaces.
            targets="${fields[*]:3:${#fields[@]}-7}"
        else
            targets="${fields[3]:-}"
        fi

        if [[ "$targets" == *"<unknown>"* ]]; then
            [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 9 "HPA Health"
            fail "$ns/$name: targets=$targets (unknown metrics)"
            detail+="$ns/$name=unknown; "
            had_issue=true
            status="FAIL"
        else
            # Parse percentage values from targets like "45%/80%, 30%/50%".
            # Only the "current" side (the number before "%/") is extracted.
            pcts=$(echo "$targets" | grep -oE '[0-9]+%/' | tr -d '%/' || true)
            if [[ -n "$pcts" ]]; then
                while IFS= read -r pct; do
                    [[ -z "$pct" ]] && continue
                    if [[ "$pct" -gt 150 ]]; then
                        [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 9 "HPA Health"
                        fail "$ns/$name: utilization at ${pct}%"
                        detail+="$ns/$name=${pct}%; "
                        had_issue=true
                        status="FAIL"
                        # Report each HPA at most once.
                        break
                    elif [[ "$pct" -gt 100 ]]; then
                        [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 9 "HPA Health"
                        warn "$ns/$name: utilization at ${pct}%"
                        detail+="$ns/$name=${pct}%; "
                        had_issue=true
                        [[ "$status" != "FAIL" ]] && status="WARN"
                        break
                    fi
                done <<< "$pcts"
            fi
        fi
    done <<< "$hpas"

    if [[ "$had_issue" == false ]]; then
        pass "All HPAs healthy"
    fi
    json_add "hpa" "$status" "${detail:-All healthy}"
}

# --- 10. CronJob Failures ---
# Check 10: FAIL when any Job owned by a CronJob has a "Failed" condition
# (status "True") whose lastTransitionTime falls within the last 24 hours;
# PASS otherwise. A timestamp that cannot be parsed or compared is treated
# as recent so the failure is still reported.
#
# Fix: the Python snippet used \" inside f-string expressions, which is a
# SyntaxError; with stderr discarded and `|| true`, the failure list was
# always empty and the check always passed. It now builds the line with
# plain concatenation. The bare `except:` is narrowed to the errors the
# timestamp handling can raise.
check_cronjobs() {
    section 10 "CronJob Failures"
    local failures detail=""

    failures=$($KUBECTL get jobs -A -o json 2>/dev/null | python3 -c '
import json, sys
from datetime import datetime, timezone, timedelta

data = json.load(sys.stdin)
cutoff = datetime.now(timezone.utc) - timedelta(hours=24)

for job in data.get("items", []):
    meta = job.get("metadata", {})
    ns = meta.get("namespace", "")
    name = meta.get("name", "")

    owners = meta.get("ownerReferences", [])
    is_cronjob = any(o.get("kind") == "CronJob" for o in owners)
    if not is_cronjob:
        continue

    conditions = job.get("status", {}).get("conditions", [])
    for c in conditions:
        if c.get("type") == "Failed" and c.get("status") == "True":
            ts = c.get("lastTransitionTime", "")
            if not ts:
                continue
            label = ns + "/" + name + ": " + str(c.get("reason") or "Unknown")
            try:
                recent = datetime.fromisoformat(ts.replace("Z", "+00:00")) > cutoff
            except (ValueError, TypeError):
                recent = True
            if recent:
                print(label)
' 2>/dev/null) || true

    if [[ -z "$failures" ]]; then
        pass "No CronJob failures in last 24h"
        json_add "cronjob_failures" "PASS" "None"
    else
        [[ "$QUIET" == true ]] && section_always 10 "CronJob Failures"
        local count
        count=$(count_lines "$failures")
        fail "$count CronJob failure(s) in last 24h:"
        [[ "$JSON" != true ]] && echo "$failures" | while IFS= read -r line; do echo "    $line"; done
        json_add "cronjob_failures" "FAIL" "$count failures"
    fi
}

# --- 11. CrowdSec ---
# Check 11: inspect pods in the "crowdsec" namespace.
#   WARN - no pods listed (namespace missing, empty, or kubectl failed)
#   FAIL - any pod whose STATUS (column 3) is neither Running nor Completed
#   PASS - otherwise
check_crowdsec() {
    section 11 "CrowdSec Agents"
    local pod_rows unhealthy entry pod_total

    pod_rows=$($KUBECTL get pods -n crowdsec --no-headers 2>/dev/null || true)
    if [[ -z "$pod_rows" ]]; then
        [[ "$QUIET" == true ]] && section_always 11 "CrowdSec Agents"
        warn "CrowdSec namespace not found or empty"
        json_add "crowdsec" "WARN" "No CrowdSec pods found"
        return 0
    fi

    unhealthy=$(awk '$3 != "Running" && $3 != "Completed" {print $1 ": " $3}' <<< "$pod_rows" || true)
    if [[ -z "$unhealthy" ]]; then
        pod_total=$(count_lines "$pod_rows")
        pass "All $pod_total CrowdSec pods running"
        json_add "crowdsec" "PASS" "$pod_total pods running"
        return
    fi

    [[ "$QUIET" == true ]] && section_always 11 "CrowdSec Agents"
    while IFS= read -r entry; do
        fail "CrowdSec pod not running: $entry"
    done <<< "$unhealthy"
    json_add "crowdsec" "FAIL" "$unhealthy"
}

# --- 12. Ingress ---
# Check 12: FAIL for every Ingress without a load-balancer address and when
# the traefik/traefik Service has no LoadBalancer IP; PASS otherwise.
#
# Fixes relative to the previous version:
#  * The old code tested column 5 of the table output for emptiness, but a
#    blank ADDRESS column simply disappears under awk's whitespace
#    splitting (PORTS shifts into its place), so a missing address was never
#    detected. The address is now read from .status.loadBalancer via
#    jsonpath, one "namespace/name<TAB>addresses" line per Ingress.
#  * The detail string now separates the "Missing LB" part from the Traefik
#    part with "; ".
check_ingresses() {
    section 12 "Ingress Routes"
    local ingresses no_lb detail="" had_issue=false line

    ingresses=$($KUBECTL get ingress -A \
        -o jsonpath='{range .items[*]}{.metadata.namespace}{"/"}{.metadata.name}{"\t"}{.status.loadBalancer.ingress[*].ip}{.status.loadBalancer.ingress[*].hostname}{"\n"}{end}' \
        2>/dev/null || true)
    if [[ -n "$ingresses" ]]; then
        # Keep rows whose address field (after the tab) is empty.
        no_lb=$(awk -F'\t' 'NF && $2 == "" {print $1}' <<< "$ingresses" || true)
        if [[ -n "$no_lb" ]]; then
            [[ "$QUIET" == true ]] && section_always 12 "Ingress Routes"
            while IFS= read -r line; do
                fail "Ingress missing LB IP: $line"
            done <<< "$no_lb"
            detail="Missing LB: $no_lb; "
            had_issue=true
        fi
    fi

    # Check Traefik LB service
    local traefik_svc_ip
    traefik_svc_ip=$($KUBECTL get svc -n traefik traefik -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || true)
    if [[ -z "$traefik_svc_ip" ]]; then
        [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 12 "Ingress Routes"
        fail "Traefik LoadBalancer has no external IP"
        detail+="Traefik LB missing IP; "
        had_issue=true
    else
        detail+="Traefik LB=$traefik_svc_ip; "
    fi

    if [[ "$had_issue" == false ]]; then
        pass "All ingresses have LB assignment (Traefik LB=$traefik_svc_ip)"
        json_add "ingresses" "PASS" "$detail"
    else
        json_add "ingresses" "FAIL" "$detail"
    fi
}

# --- 13. Prometheus Alerts ---
# Check 13: count firing Prometheus alerts of severity warning or critical.
#   WARN - neither endpoint could be queried, or the payload did not parse
#   PASS - no actionable alerts
#   WARN - 1 to 3 actionable alerts
#   FAIL - more than 3
# Alertmanager is queried first; the Prometheus server is the fallback.
check_alerts() {
    section 13 "Prometheus Alerts"
    local payload tally

    # Try alertmanager first, then prometheus server
    payload=$($KUBECTL exec -n monitoring deploy/prometheus-alertmanager -- \
        wget -q -O- http://localhost:9093/api/v2/alerts 2>/dev/null || true)
    [[ -n "$payload" ]] || payload=$($KUBECTL exec -n monitoring deploy/prometheus-server -- \
        wget -q -O- http://localhost:9090/api/v1/alerts 2>/dev/null || true)

    if [[ -z "$payload" ]]; then
        [[ "$QUIET" == true ]] && section_always 13 "Prometheus Alerts"
        warn "Could not query Prometheus/Alertmanager"
        json_add "prometheus_alerts" "WARN" "Cannot query"
        return 0
    fi

    # Only count warning + critical alerts. Info-level alerts (RecentNodeReboot,
    # PVAutoExpanding, etc.) are informational by design and shouldn't be
    # treated as a script-level WARN — the alert rules themselves already
    # encode the severity.
    # The snippet prints "<count>:<comma-separated names>", or "-1:" when the
    # payload cannot be parsed.
    tally=$(python3 -c '
import json, sys
ACTIONABLE = {"warning", "critical"}
def actionable(labels):
    return labels.get("severity", "info").lower() in ACTIONABLE
try:
    data = json.load(sys.stdin)
    if isinstance(data, list):
        active = [a for a in data if a.get("status", {}).get("state") == "active" and actionable(a.get("labels", {}))]
        count = len(active)
        names = [a.get("labels", {}).get("alertname", "?") for a in active]
        print(f"{count}:" + ",".join(names) if count > 0 else "0:")
    elif isinstance(data, dict) and "data" in data:
        alerts_list = data["data"].get("alerts", [])
        firing = [a for a in alerts_list if a.get("state") == "firing" and actionable(a.get("labels", {}))]
        count = len(firing)
        names = [a.get("labels", {}).get("alertname", "?") for a in firing]
        print(f"{count}:" + ",".join(names) if count > 0 else "0:")
    else:
        print("0:")
except:
    print("-1:")
' <<< "$payload" 2>/dev/null || echo "-1:")

    # Split "<count>:<names>" at the first colon.
    local count="${tally%%:*}"
    local names="${tally#*:}"

    if [[ "$count" != "-1" && "$count" -eq 0 ]]; then
        pass "No firing alerts"
        json_add "prometheus_alerts" "PASS" "0 firing"
        return
    fi

    # Every remaining outcome is a WARN or FAIL, so the header is needed in
    # quiet mode regardless of which one it turns out to be.
    [[ "$QUIET" == true ]] && section_always 13 "Prometheus Alerts"
    if [[ "$count" == "-1" ]]; then
        warn "Failed to parse alert data"
        json_add "prometheus_alerts" "WARN" "Parse error"
    elif [[ "$count" -le 3 ]]; then
        warn "$count firing alert(s): $names"
        json_add "prometheus_alerts" "WARN" "$count firing: $names"
    else
        fail "$count firing alerts: $names"
        json_add "prometheus_alerts" "FAIL" "$count firing: $names"
    fi
}

# --- 14. Uptime Kuma ---
# Check 14: query Uptime Kuma (through a temporary kubectl port-forward) and
# report monitors whose latest heartbeat is not "up".
#   WARN - password unavailable, client library missing, or connection failed
#   PASS - no active monitor is down
#   WARN - 1 to 3 monitors down
#   FAIL - more than 3 monitors down
# Monitors whose name starts with "[External] " are tallied separately from
# the rest; inactive (paused) monitors are only counted.
# The password comes from $UPTIME_KUMA_PASSWORD, falling back to Vault.
#
# Fix: the "password not available" WARN now prints the section header in
# quiet mode, like every other WARN/FAIL path in this script.
check_uptime_kuma() {
    section 14 "Uptime Kuma Monitors"
    local result

    # Get password from Vault (or env var fallback)
    local uk_pass="${UPTIME_KUMA_PASSWORD:-}"
    if [[ -z "$uk_pass" ]]; then
        uk_pass=$(vault kv get -field=uptime_kuma_admin_password secret/viktor 2>/dev/null) || true
    fi
    if [[ -z "$uk_pass" ]]; then
        [[ "$QUIET" == true ]] && section_always 14 "Uptime Kuma Monitors"
        warn "Uptime Kuma: password not available (set UPTIME_KUMA_PASSWORD or vault login)"
        json_add "uptime_kuma" "WARN" "password not available"
        return 0
    fi

    # Connect via kubectl port-forward to the internal Service. The public
    # URL (uptime.viktorbarzin.me) is behind Authentik forward-auth, which
    # 302-redirects the Socket.IO handshake the library uses — there's no
    # way for an unauthenticated script to complete the OAuth dance.
    # Port-forward gives us a direct path to the in-cluster ClusterIP
    # service and works from any host with kubectl access.
    local pf_port=18444 pf_pid
    $KUBECTL port-forward -n uptime-kuma svc/uptime-kuma "$pf_port:80" >/dev/null 2>&1 &
    pf_pid=$!
    # Detach from job control so bash doesn't print "Killed" to stderr
    # when we SIGKILL the port-forward at the end of this check — that
    # message corrupts stdout when stderr is merged for JSON parsing.
    disown "$pf_pid" 2>/dev/null || true
    # Wait up to 5s for the local listener to come up.
    local i
    for i in 1 2 3 4 5; do
        if (echo >"/dev/tcp/127.0.0.1/$pf_port") 2>/dev/null; then break; fi
        sleep 1
    done

    # The snippet prints exactly one line:
    #   ERROR:<msg>       client library not importable
    #   CONN_ERROR:<msg>  login or query failed
    #   <int_down>:<int_up>:<ext_down>:<ext_up>:<paused>:<int names>|<ext names>
    result=$(UPTIME_KUMA_PASSWORD="$uk_pass" UK_URL="http://127.0.0.1:$pf_port" \
        ~/.venvs/claude/bin/python3 -c '
import sys, os, time
try:
    from uptime_kuma_api import UptimeKumaApi
except ImportError:
    print("ERROR:uptime-kuma-api not installed")
    sys.exit(0)

# Retry up to 3 times — the Socket.IO handshake is occasionally flaky
# even against the internal service during cluster churn.
last_exc = None
api = None
for attempt in range(3):
    try:
        api = UptimeKumaApi(os.environ["UK_URL"], timeout=120, wait_events=0.2)
        api.login("admin", os.environ["UPTIME_KUMA_PASSWORD"])
        break
    except Exception as e:
        last_exc = e
        try: api.disconnect()
        except Exception: pass
        api = None
        time.sleep(2 * (attempt + 1))
if api is None:
    print(f"CONN_ERROR:{last_exc}")
    sys.exit(0)

try:

    monitors = api.get_monitors()
    heartbeats = api.get_heartbeats()

    # Separate internal and external monitors
    internal_up = 0
    internal_down = []
    external_up = 0
    external_down = []
    paused_count = 0

    for m in monitors:
        mid = m.get("id")
        name = m.get("name", "unknown")
        active = m.get("active", True)
        is_external = name.startswith("[External] ")

        if not active:
            paused_count += 1
            continue

        beats = heartbeats.get(mid, [])
        if beats:
            last_beat = beats[-1]
            if isinstance(last_beat, list):
                last_beat = last_beat[-1] if last_beat else {}
            status = last_beat.get("status", 0) if isinstance(last_beat, dict) else 0
            if hasattr(status, "value"):
                status = status.value
            is_up = (status == 1)
        else:
            is_up = False

        if is_external:
            if is_up:
                external_up += 1
            else:
                external_down.append(name.replace("[External] ", ""))
        else:
            if is_up:
                internal_up += 1
            else:
                internal_down.append(name)

    api.disconnect()

    int_down_names = ", ".join(internal_down) if internal_down else ""
    ext_down_names = ", ".join(external_down) if external_down else ""
    # Format: int_down:int_up:ext_down:ext_up:paused:int_down_names|ext_down_names
    print(f"{len(internal_down)}:{internal_up}:{len(external_down)}:{external_up}:{paused_count}:{int_down_names}|{ext_down_names}")
except Exception as e:
    print(f"CONN_ERROR:{e}")
' 2>/dev/null) || result="CONN_ERROR:python execution failed"

    # Always tear down the port-forward. Use SIGKILL directly — kubectl
    # port-forward sometimes ignores SIGTERM during teardown and we don't
    # need a graceful exit for a localhost listener. Skip `wait` because
    # in `set -m` mode the backgrounded child may not be reapable here,
    # causing the script to hang indefinitely; the shell reaps it on exit.
    kill -9 "$pf_pid" 2>/dev/null || true

    if [[ "$result" == "ERROR:"* ]]; then
        [[ "$QUIET" == true ]] && section_always 14 "Uptime Kuma Monitors"
        warn "Uptime Kuma: ${result#ERROR:}"
        json_add "uptime_kuma" "WARN" "${result#ERROR:}"
    elif [[ "$result" == "CONN_ERROR:"* ]]; then
        [[ "$QUIET" == true ]] && section_always 14 "Uptime Kuma Monitors"
        warn "Cannot connect to Uptime Kuma: ${result#CONN_ERROR:}"
        json_add "uptime_kuma" "WARN" "Connection failed"
    else
        # Fields 1-5 are counters; field 6 onwards is the name list, taken
        # with -f6- so colons inside monitor names survive.
        local int_down int_up ext_down ext_up paused_count down_details
        int_down=$(echo "$result" | cut -d: -f1)
        int_up=$(echo "$result" | cut -d: -f2)
        ext_down=$(echo "$result" | cut -d: -f3)
        ext_up=$(echo "$result" | cut -d: -f4)
        paused_count=$(echo "$result" | cut -d: -f5)
        down_details=$(echo "$result" | cut -d: -f6-)
        # Internal names sit before the first "|", external names after it.
        local int_down_names="${down_details%%|*}"
        local ext_down_names="${down_details#*|}"

        local total_down=$((int_down + ext_down))
        local total_up=$((int_up + ext_up))
        local total_active=$((total_up + total_down))

        if [[ "$total_down" -eq 0 ]]; then
            pass "All monitors up — internal: ${int_up}, external: ${ext_up} ($paused_count paused)"
            json_add "uptime_kuma" "PASS" "internal: $int_up up, external: $ext_up up, $paused_count paused"
        else
            [[ "$QUIET" == true ]] && section_always 14 "Uptime Kuma Monitors"
            local details=""
            [[ "$int_down" -gt 0 ]] && details="internal down($int_down): $int_down_names"
            [[ "$ext_down" -gt 0 ]] && { [[ -n "$details" ]] && details="$details; "; details="${details}external down($ext_down): $ext_down_names"; }
            if [[ "$total_down" -le 3 ]]; then
                warn "$total_down/$total_active down: $details"
                json_add "uptime_kuma" "WARN" "$details"
            else
                fail "$total_down/$total_active down: $details"
                json_add "uptime_kuma" "FAIL" "$details"
            fi
        fi
    fi
}

# --- 15. ResourceQuota Pressure ---
# Flags ResourceQuotas whose cpu, memory, storage or pods usage exceeds 80% of
# the hard limit: WARN above 80%, FAIL above 95%. Other quota resources are
# ignored.
# Globals: KUBECTL, QUIET (read)
# Outputs: one pass/warn/fail line per finding plus a single json_add record
check_resourcequota() {
    section 15 "ResourceQuota Pressure"
    local quotas detail="" status="PASS"

    # A failing kubectl call means the quotas could not be read at all, which is
    # not the same as "no quotas configured", so it must not be reported as PASS.
    quotas=$($KUBECTL get resourcequota -A -o json 2>/dev/null) || {
        [[ "$QUIET" == true ]] && section_always 15 "ResourceQuota Pressure"
        warn "Cannot list ResourceQuotas"
        json_add "resourcequota" "WARN" "Cannot list"
        return 0
    }

    # The helper prints one "LEVEL:namespace/quota:resource:NN%" line per quota
    # resource above 80%. It is best-effort: if it fails, nothing is reported.
    local pressure
    pressure=$(echo "$quotas" | python3 -c '
import json, sys

BINARY = {"Ki": 1024, "Mi": 1024**2, "Gi": 1024**3,
          "Ti": 1024**4, "Pi": 1024**5, "Ei": 1024**6}
DECIMAL = {"m": 1e-3, "k": 1e3, "M": 1e6, "G": 1e9,
           "T": 1e12, "P": 1e15, "E": 1e18}

def parse_quantity(val):
    """Convert a Kubernetes quantity (500m, 2, 10G, 512Mi, 1e3) to a float."""
    val = str(val).strip()
    try:
        return float(val)
    except ValueError:
        pass
    # Two-letter binary suffixes first so "Mi" is not mistaken for "M".
    for table in (BINARY, DECIMAL):
        for suffix, mult in table.items():
            if val.endswith(suffix):
                return float(val[:-len(suffix)]) * mult
    raise ValueError(val)

def tracked(resource):
    return ("cpu" in resource or "memory" in resource
            or "storage" in resource or resource == "pods")

data = json.load(sys.stdin)
for item in data.get("items", []):
    ns = item["metadata"]["namespace"]
    name = item["metadata"]["name"]
    status = item.get("status", {})
    hard = status.get("hard", {})
    used = status.get("used", {})

    for resource, hard_val in hard.items():
        if not tracked(resource):
            continue
        try:
            h = parse_quantity(hard_val)
            u = parse_quantity(used.get(resource, "0"))
        except ValueError:
            # Unparseable quantity: skip this resource, keep checking the rest.
            continue
        if h <= 0:
            continue
        # Only the ratio matters, so both values just need the same unit.
        pct = (u / h) * 100
        if pct > 80:
            level = "FAIL" if pct > 95 else "WARN"
            print(f"{level}:{ns}/{name}:{resource}:{pct:.0f}%")
' 2>/dev/null) || true

    if [[ -z "$pressure" ]]; then
        pass "All ResourceQuotas below 80% usage"
        json_add "resourcequota" "PASS" "All below 80%"
    else
        [[ "$QUIET" == true ]] && section_always 15 "ResourceQuota Pressure"
        local level ns_res resource pct
        while IFS=: read -r level ns_res resource pct; do
            if [[ "$level" == "FAIL" ]]; then
                fail "$ns_res: $resource at $pct"
                status="FAIL"
            else
                warn "$ns_res: $resource at $pct"
                # A WARN never downgrades an earlier FAIL.
                [[ "$status" != "FAIL" ]] && status="WARN"
            fi
            detail+="$ns_res $resource=$pct; "
        done <<< "$pressure"
        json_add "resourcequota" "$status" "$detail"
    fi
}

# --- 16. StatefulSets ---
# Fails every StatefulSet whose READY column (ready/desired) is not fully
# satisfied.
# Globals: KUBECTL, QUIET (read)
# Outputs: pass/warn/fail lines plus a single json_add record
check_statefulsets() {
    section 16 "StatefulSets"
    local sts detail="" had_issue=false

    # stderr is discarded rather than merged into the output: merging it made
    # kubectl error text look like StatefulSet rows, which could then be parsed
    # into a bogus PASS. The exit status tells us whether the listing worked.
    if ! sts=$($KUBECTL get statefulsets -A --no-headers 2>/dev/null); then
        [[ "$QUIET" == true ]] && section_always 16 "StatefulSets"
        warn "Cannot list StatefulSets"
        json_add "statefulsets" "WARN" "Cannot list"
        return 0
    fi

    if [[ -z "$sts" ]]; then
        pass "No StatefulSets in cluster"
        json_add "statefulsets" "PASS" "No StatefulSets"
        return 0
    fi

    # Columns with -A --no-headers: NAMESPACE NAME READY AGE
    local ns name ready current desired
    while read -r ns name ready _; do
        [[ -n "$ns" ]] || continue
        current=${ready%%/*}
        desired=${ready#*/}

        if [[ "$current" != "$desired" ]]; then
            # In quiet mode the header is printed once, before the first finding.
            [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 16 "StatefulSets"
            fail "$ns/$name: $current/$desired ready"
            detail+="$ns/$name $current/$desired; "
            had_issue=true
        fi
    done <<< "$sts"

    if [[ "$had_issue" == false ]]; then
        pass "All StatefulSets fully available"
        json_add "statefulsets" "PASS" "All available"
    else
        json_add "statefulsets" "FAIL" "$detail"
    fi
}

# --- 17. Node Disk Usage ---
# Compares each node's ephemeral-storage capacity with its allocatable amount
# and reports nodes where (capacity - allocatable) / capacity exceeds 80%
# (WARN) or 90% (FAIL).
# NOTE(review): this ratio is the share of capacity that is not allocatable,
# not live filesystem usage - confirm that is the intended signal.
# Globals: KUBECTL, QUIET (read)
# Outputs: pass/warn/fail lines plus a single json_add record
check_node_disk() {
    section 17 "Node Disk Usage"
    local node_json detail="" status="PASS"

    node_json=$($KUBECTL get nodes -o json 2>/dev/null) || {
        # Keep the FAIL line attributable to this check in quiet mode.
        [[ "$QUIET" == true ]] && section_always 17 "Node Disk Usage"
        fail "Cannot get node info"
        json_add "node_disk" "FAIL" "Cannot get nodes"
        return 0
    }

    # The helper prints one "LEVEL:node:percent" line per node above 80%.
    local disk_info
    disk_info=$(echo "$node_json" | python3 -c '
import json, sys

BINARY = {"Ki": 1024, "Mi": 1024**2, "Gi": 1024**3,
          "Ti": 1024**4, "Pi": 1024**5, "Ei": 1024**6}
DECIMAL = {"m": 1e-3, "k": 1e3, "M": 1e6, "G": 1e9,
           "T": 1e12, "P": 1e15, "E": 1e18}

def parse_storage(val):
    """Convert a Kubernetes storage quantity to bytes."""
    val = str(val).strip()
    try:
        return float(val)
    except ValueError:
        pass
    # Two-letter binary suffixes first so "Gi" is not mistaken for "G".
    for table in (BINARY, DECIMAL):
        for suffix, mult in table.items():
            if val.endswith(suffix):
                return float(val[:-len(suffix)]) * mult
    raise ValueError(val)

data = json.load(sys.stdin)
for node in data["items"]:
    name = node["metadata"]["name"]
    cap = node["status"].get("capacity", {})
    alloc = node["status"].get("allocatable", {})
    try:
        c = parse_storage(cap.get("ephemeral-storage", "0"))
        a = parse_storage(alloc.get("ephemeral-storage", "0"))
    except ValueError:
        # Unparseable quantity: skip this node, keep checking the others.
        continue
    if c <= 0:
        continue
    used_pct = ((c - a) / c) * 100
    if used_pct > 80:
        level = "FAIL" if used_pct > 90 else "WARN"
        print(f"{level}:{name}:{used_pct:.0f}")
' 2>/dev/null) || true

    if [[ -z "$disk_info" ]]; then
        pass "All nodes below 80% ephemeral-storage usage"
        json_add "node_disk" "PASS" "All below 80%"
    else
        [[ "$QUIET" == true ]] && section_always 17 "Node Disk Usage"
        local level node pct
        while IFS=: read -r level node pct; do
            if [[ "$level" == "FAIL" ]]; then
                fail "$node: ephemeral-storage at ${pct}%"
                status="FAIL"
            else
                warn "$node: ephemeral-storage at ${pct}%"
                # A WARN never downgrades an earlier FAIL.
                [[ "$status" != "FAIL" ]] && status="WARN"
            fi
            detail+="$node=${pct}%; "
        done <<< "$disk_info"
        json_add "node_disk" "$status" "$detail"
    fi
}

# --- 18. Helm Release Health ---
# Reports every Helm release whose status is not "deployed": FAIL for
# pending-* states, WARN for anything else.
# Globals: KUBECONFIG_PATH, QUIET (read)
# Outputs: pass/warn/fail lines plus a single json_add record
check_helm_releases() {
    section 18 "Helm Release Health"
    local releases detail="" status="PASS"

    # --all is required: without it "helm list" only returns deployed and failed
    # releases, so stuck pending-install / pending-upgrade releases (the FAIL
    # case below) would never be listed.
    releases=$(helm list -A --all --kubeconfig "$KUBECONFIG_PATH" -o json 2>/dev/null) || {
        [[ "$QUIET" == true ]] && section_always 18 "Helm Release Health"
        warn "Cannot list Helm releases"
        json_add "helm_releases" "WARN" "Cannot list"
        return 0
    }

    # The helper prints one "LEVEL:namespace/name:status" line per release that
    # is not in the deployed state.
    local bad_releases
    bad_releases=$(echo "$releases" | python3 -c '
import json, sys
data = json.load(sys.stdin)
for r in data:
    name = r.get("name", "?")
    ns = r.get("namespace", "?")
    st = r.get("status", "unknown")
    if st != "deployed":
        level = "FAIL" if st.startswith("pending") else "WARN"
        print(f"{level}:{ns}/{name}:{st}")
' 2>/dev/null) || true

    if [[ -z "$bad_releases" ]]; then
        pass "All Helm releases in deployed state"
        json_add "helm_releases" "PASS" "All deployed"
    else
        [[ "$QUIET" == true ]] && section_always 18 "Helm Release Health"
        local level release_name release_status
        while IFS=: read -r level release_name release_status; do
            if [[ "$level" == "FAIL" ]]; then
                fail "Helm release $release_name: $release_status (blocks terraform)"
                status="FAIL"
            else
                warn "Helm release $release_name: $release_status"
                # A WARN never downgrades an earlier FAIL.
                [[ "$status" != "FAIL" ]] && status="WARN"
            fi
            detail+="$release_name=$release_status; "
        done <<< "$bad_releases"
        json_add "helm_releases" "$status" "$detail"
    fi
}

# --- 19. Kyverno Policy Engine ---
# Fails when the kyverno namespace has no pods, or when any pod's STATUS
# column is neither Running nor Completed.
check_kyverno() {
    section 19 "Kyverno Policy Engine"
    local pod_table unhealthy pod_total entry

    pod_table=$($KUBECTL get pods -n kyverno --no-headers 2>/dev/null || true)
    if [[ -z "$pod_table" ]]; then
        [[ "$QUIET" == true ]] && section_always 19 "Kyverno Policy Engine"
        fail "Kyverno namespace not found or empty — policy engine down, cascading cluster impact"
        json_add "kyverno" "FAIL" "No Kyverno pods found"
        return 0
    fi

    # One "name: status" line per pod that is not Running/Completed.
    unhealthy=$(awk '$3 != "Running" && $3 != "Completed" {print $1 ": " $3}' <<< "$pod_table" || true)

    # Healthy path first; everything below it deals with broken pods.
    if [[ -z "$unhealthy" ]]; then
        pod_total=$(count_lines "$pod_table")
        pass "All $pod_total Kyverno pods running"
        json_add "kyverno" "PASS" "$pod_total pods running"
        return 0
    fi

    [[ "$QUIET" == true ]] && section_always 19 "Kyverno Policy Engine"
    while IFS= read -r entry; do
        fail "Kyverno pod not running: $entry"
    done <<< "$unhealthy"
    json_add "kyverno" "FAIL" "$unhealthy"
}

# --- 20. NFS Connectivity ---
# Probes the NFS server: first by listing its exports with showmount, then by
# a plain TCP connect to port 2049. Either success is a PASS.
# Globals: QUIET (read); NFS_SERVER (optional env override of the server
#          address, defaults to 192.168.1.127)
check_nfs() {
    section 20 "NFS Connectivity"
    local nfs_host="${NFS_SERVER:-192.168.1.127}"

    if showmount -e "$nfs_host" &>/dev/null; then
        pass "NFS server $nfs_host (Proxmox) reachable (exports listed)"
        json_add "nfs" "PASS" "NFS reachable"
    # -G is the connect timeout of the macOS nc; other netcat builds reject it
    # or give it a different meaning, so retry with the widely supported -w.
    elif nc -z -G 3 "$nfs_host" 2049 &>/dev/null || nc -z -w 3 "$nfs_host" 2049 &>/dev/null; then
        pass "NFS server $nfs_host port 2049 open"
        json_add "nfs" "PASS" "NFS port open"
    else
        [[ "$QUIET" == true ]] && section_always 20 "NFS Connectivity"
        fail "NFS server $nfs_host (Proxmox) unreachable — 30+ services depend on NFS"
        json_add "nfs" "FAIL" "NFS unreachable"
    fi
}

# --- 21. DNS Resolution ---
# Runs nslookup inside the first Technitium pod for one internal and one
# external name. PASS when both resolve, WARN when exactly one does, FAIL when
# neither does (including when no pod could be found).
# Globals: KUBECTL, QUIET (read)
check_dns() {
    section 21 "DNS Resolution"
    local internal_ok=false external_ok=false detail=""

    # Test DNS from inside the cluster via kubectl exec (MetalLB IPs may not be
    # reachable from outside the L2 network)
    local dns_pod
    # "|| true" keeps a failed lookup (API error, or no pod matching the
    # selector) from aborting the whole script under "set -e"; an empty
    # dns_pod simply falls through to the FAIL branch below.
    dns_pod=$($KUBECTL get pods -n technitium -l app=technitium -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)

    if [[ -n "$dns_pod" ]]; then
        if $KUBECTL exec -n technitium "$dns_pod" -- nslookup viktorbarzin.me 127.0.0.1 &>/dev/null; then
            internal_ok=true
        fi
        if $KUBECTL exec -n technitium "$dns_pod" -- nslookup google.com 127.0.0.1 &>/dev/null; then
            external_ok=true
        fi
    fi

    if [[ "$internal_ok" == true && "$external_ok" == true ]]; then
        pass "DNS resolves both internal (viktorbarzin.me) and external (google.com)"
        json_add "dns" "PASS" "Both resolve"
    elif [[ "$internal_ok" == true || "$external_ok" == true ]]; then
        [[ "$QUIET" == true ]] && section_always 21 "DNS Resolution"
        if [[ "$internal_ok" == false ]]; then
            warn "DNS: internal (viktorbarzin.me) failed, external (google.com) OK"
            detail="Internal failed"
        else
            warn "DNS: internal (viktorbarzin.me) OK, external (google.com) failed"
            detail="External failed"
        fi
        json_add "dns" "WARN" "$detail"
    else
        [[ "$QUIET" == true ]] && section_always 21 "DNS Resolution"
        fail "DNS server (Technitium) not resolving — both internal and external failed"
        json_add "dns" "FAIL" "Both failed"
    fi
}

# --- 22. TLS Certificate Expiry ---
# Reads the notAfter date of tls.crt in every kubernetes.io/tls secret (via
# openssl) and reports certificates expiring within 14 days (WARN) or 7 days
# (FAIL). A certificate stored in several secrets is only reported once.
# Globals: KUBECTL, QUIET (read)
# Outputs: pass/warn/fail lines plus a single json_add record
check_tls_certs() {
    section 22 "TLS Certificate Expiry"
    local secrets detail="" status="PASS"

    # Only TLS secrets are needed, so let the API server filter by type instead
    # of downloading every secret in the cluster.
    secrets=$($KUBECTL get secrets -A --field-selector type=kubernetes.io/tls -o json 2>/dev/null) || {
        [[ "$QUIET" == true ]] && section_always 22 "TLS Certificate Expiry"
        warn "Cannot list secrets"
        json_add "tls_certs" "WARN" "Cannot list secrets"
        return 0
    }

    # The helper prints one "LEVEL:namespace/name:<days>d" line per certificate
    # that is at or below the 14 day threshold.
    local cert_issues
    cert_issues=$(echo "$secrets" | python3 -c '
import json, sys, base64, binascii, subprocess, hashlib
from datetime import datetime, timezone

data = json.load(sys.stdin)
seen_fingerprints = set()
now = datetime.now(timezone.utc)

for item in data.get("items", []):
    if item.get("type") != "kubernetes.io/tls":
        continue
    ns = item["metadata"]["namespace"]
    name = item["metadata"]["name"]
    cert_data = (item.get("data") or {}).get("tls.crt", "")
    if not cert_data:
        continue

    # One undecodable secret must not abort the scan of all the others.
    try:
        raw = base64.b64decode(cert_data)
    except (binascii.Error, ValueError):
        continue

    # Deduplicate by cert fingerprint
    fp = hashlib.sha256(raw).hexdigest()[:16]
    if fp in seen_fingerprints:
        continue
    seen_fingerprints.add(fp)

    # Parse certificate expiry with openssl (best-effort per certificate)
    try:
        result = subprocess.run(
            ["openssl", "x509", "-noout", "-enddate"],
            input=raw, capture_output=True, timeout=5
        )
    except Exception:
        continue

    for line in result.stdout.decode(errors="replace").splitlines():
        if not line.startswith("notAfter="):
            continue
        date_str = line.split("=", 1)[1].strip()
        # openssl date format: "Mon DD HH:MM:SS YYYY GMT"
        try:
            expiry = datetime.strptime(date_str, "%b %d %H:%M:%S %Y %Z")
        except ValueError:
            continue
        days_left = (expiry.replace(tzinfo=timezone.utc) - now).days
        # Threshold rationale (lowered from 30d):
        # - cnpg-webhook-cert: CNPG operator auto-rotates at 7d before expiry
        # - kyverno-*-tls-pair: Kyverno auto-rotates at 15d before expiry
        # - viktorbarzin.me Lets Encrypt wildcard: renewed weekly via Woodpecker
        # Anything still <14d at check time is genuinely worth surfacing.
        if days_left <= 7:
            print(f"FAIL:{ns}/{name}:{days_left}d")
        elif days_left <= 14:
            print(f"WARN:{ns}/{name}:{days_left}d")
' 2>/dev/null) || true

    if [[ -z "$cert_issues" ]]; then
        pass "All TLS certificates valid for >14 days"
        json_add "tls_certs" "PASS" "All valid >14d"
    else
        [[ "$QUIET" == true ]] && section_always 22 "TLS Certificate Expiry"
        local level cert_name days
        while IFS=: read -r level cert_name days; do
            if [[ "$level" == "FAIL" ]]; then
                fail "TLS cert $cert_name expires in $days"
                status="FAIL"
            else
                warn "TLS cert $cert_name expires in $days"
                # A WARN never downgrades an earlier FAIL.
                [[ "$status" != "FAIL" ]] && status="WARN"
            fi
            detail+="$cert_name=$days; "
        done <<< "$cert_issues"
        json_add "tls_certs" "$status" "$detail"
    fi
}

# --- 23. GPU Health ---
# Looks at pod status in the nvidia namespace. A device-plugin pod that is not
# Running/Completed is a FAIL; any other pod in that state is a WARN; an empty
# namespace is a WARN.
check_gpu() {
    section 23 "GPU Health"
    local gpu_pods

    gpu_pods=$($KUBECTL get pods -n nvidia --no-headers 2>/dev/null || true)
    if [[ -z "$gpu_pods" ]]; then
        [[ "$QUIET" == true ]] && section_always 23 "GPU Health"
        warn "NVIDIA namespace not found or empty"
        json_add "gpu" "WARN" "No GPU pods found"
        return 0
    fi

    # The device-plugin is tracked separately because it is the pod that is
    # critical for GPU scheduling.
    local plugin_broken=false misc_broken=false detail=""
    local pod_name pod_status

    # Columns: NAME READY STATUS ...
    while read -r pod_name _ pod_status _; do
        case "$pod_status" in
            Running | Completed) continue ;;
        esac
        if [[ "$pod_name" == *device-plugin* ]]; then
            plugin_broken=true
            detail+="device-plugin $pod_name: $pod_status; "
        else
            misc_broken=true
            detail+="$pod_name: $pod_status; "
        fi
    done <<< "$gpu_pods"

    # Healthy path first.
    if [[ "$plugin_broken" == false && "$misc_broken" == false ]]; then
        local pod_total
        pod_total=$(count_lines "$gpu_pods")
        pass "All $pod_total GPU pods running"
        json_add "gpu" "PASS" "$pod_total pods running"
        return 0
    fi

    [[ "$QUIET" == true ]] && section_always 23 "GPU Health"
    if [[ "$plugin_broken" == true ]]; then
        fail "GPU device-plugin is down — GPU workloads cannot schedule"
        json_add "gpu" "FAIL" "$detail"
    else
        warn "Some GPU pods not running: $detail"
        json_add "gpu" "WARN" "$detail"
    fi
}

# --- 24. Cloudflare Tunnel ---
# Counts Running pods in the cloudflared namespace: none (or no pods at all)
# is a FAIL, some-but-not-all is a WARN, all is a PASS.
check_cloudflare_tunnel() {
    section 24 "Cloudflare Tunnel"
    local pod_table up all

    pod_table=$($KUBECTL get pods -n cloudflared --no-headers 2>/dev/null || true)
    if [[ -z "$pod_table" ]]; then
        [[ "$QUIET" == true ]] && section_always 24 "Cloudflare Tunnel"
        fail "Cloudflare tunnel namespace not found or empty — external access broken"
        json_add "cloudflare_tunnel" "FAIL" "No pods found"
        return 0
    fi

    all=$(count_lines "$pod_table")
    # Third column is the pod STATUS; n+0 prints 0 when nothing matched.
    up=$(awk '$3 == "Running" { n++ } END { print n + 0 }' <<< "$pod_table")

    # Fully healthy: nothing else to report.
    if (( up > 0 && up >= all )); then
        pass "Cloudflare tunnel: all $all pods running"
        json_add "cloudflare_tunnel" "PASS" "$all pods running"
        return 0
    fi

    [[ "$QUIET" == true ]] && section_always 24 "Cloudflare Tunnel"
    if (( up == 0 )); then
        fail "Cloudflare tunnel: 0/$all pods running — external access broken"
        json_add "cloudflare_tunnel" "FAIL" "0/$all running"
    else
        warn "Cloudflare tunnel: $up/$all pods running (degraded)"
        json_add "cloudflare_tunnel" "WARN" "$up/$all running"
    fi
}

# --- 25. Resource Usage ---
# Evaluates "kubectl top nodes" per node: FAIL when CPU% or MEMORY% is above
# 90, WARN above 80, PASS otherwise. Nodes whose percentages are not numeric
# (no metrics reported for them) are a WARN.
# Globals: KUBECTL, QUIET (read)
# Outputs: one pass/warn/fail line per node plus a single json_add record
check_overcommit() {
    section 25 "Resource Usage"
    local detail="" had_issue=false status="PASS"

    local usage
    usage=$($KUBECTL top nodes --no-headers 2>/dev/null) || {
        [[ "$QUIET" == true ]] && section_always 25 "Resource Usage"
        fail "Cannot get node metrics"
        json_add "overcommit" "FAIL" "No metrics"
        return 0
    }

    if [[ -z "$usage" ]]; then
        [[ "$QUIET" == true ]] && section_always 25 "Resource Usage"
        fail "metrics-server returned no data"
        json_add "overcommit" "FAIL" "No data"
        return 0
    fi

    local name cpu_cores cpu_pct mem_bytes mem_pct level node_detail
    local numeric_re='^[0-9]+$'

    # Columns: NAME CPU(cores) CPU% MEMORY(bytes) MEMORY%
    while read -r name cpu_cores cpu_pct mem_bytes mem_pct _; do
        [[ -n "$name" ]] || continue
        cpu_pct=${cpu_pct%\%}
        mem_pct=${mem_pct%\%}

        if ! [[ "$cpu_pct" =~ $numeric_re && "$mem_pct" =~ $numeric_re ]]; then
            # Non-numeric values (e.g. "<unknown>") would make the integer
            # comparisons below error out and the node would count as healthy.
            level="WARN"
            node_detail="${name}: metrics unavailable (cpu ${cpu_pct:-?}, mem ${mem_pct:-?})"
        else
            if [[ "$cpu_pct" -gt 90 || "$mem_pct" -gt 90 ]]; then
                level="FAIL"
            elif [[ "$cpu_pct" -gt 80 || "$mem_pct" -gt 80 ]]; then
                level="WARN"
            else
                level="OK"
            fi
            node_detail="${name}: cpu ${cpu_cores} (${cpu_pct}%), mem ${mem_bytes} (${mem_pct}%)"
        fi

        if [[ "$level" == "OK" ]]; then
            pass "$node_detail"
        else
            # In quiet mode the header is printed once, before the first finding.
            [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 25 "Resource Usage"
            had_issue=true
            if [[ "$level" == "FAIL" ]]; then
                fail "$node_detail"
                status="FAIL"
            else
                warn "$node_detail"
                # A WARN never downgrades an earlier FAIL.
                [[ "$status" != "FAIL" ]] && status="WARN"
            fi
        fi
        detail+="$node_detail; "
    done <<< "$usage"

    json_add "overcommit" "$status" "$detail"
}

# --- HA helpers ---
# Path of the temp directory holding cached HA API responses; empty until
# ha_sofia_fetch_cache has created it.
HA_CACHE_DIR=""

# Makes sure the HA Sofia URL and API token are exported.
# The URL falls back to a built-in default; a missing token is looked up in
# Vault when the vault CLI is available.
# Returns: 0 when a token is available, 1 otherwise.
ha_sofia_available() {
    [[ -n "${HOME_ASSISTANT_SOFIA_URL:-}" ]] \
        || export HOME_ASSISTANT_SOFIA_URL="https://ha-sofia.viktorbarzin.me"

    # Token already provided by the environment: nothing to look up.
    if [[ -n "${HOME_ASSISTANT_SOFIA_TOKEN:-}" ]]; then
        return 0
    fi

    local vault_token=""
    if command -v vault >/dev/null 2>&1 && [[ -n "${VAULT_TOKEN:-}${HOME:-}" ]]; then
        # Best-effort: a failed or empty lookup just leaves the token unset.
        vault_token=$(vault kv get -field=haos_api_token secret/viktor 2>/dev/null || true)
    fi

    if [[ -z "$vault_token" ]]; then
        return 1
    fi
    export HOME_ASSISTANT_SOFIA_TOKEN="$vault_token"
    return 0
}

# Fetch all HA data once and cache in temp files.
# On first call creates a temp dir (exported as HA_CACHE_DIR, removed on exit)
# and stores states.json, entries.json and config.json in it; fetch failures
# are recorded as "<name>:<error>" lines in errors.txt. Later calls are no-ops.
# Globals: HA_CACHE_DIR (written); HOME_ASSISTANT_SOFIA_URL and
#          HOME_ASSISTANT_SOFIA_TOKEN (read by the Python helper)
# Returns: 0 even when fetching fails - callers test for the cached files.
# NOTE(review): this installs the script's EXIT trap and would replace any
# EXIT trap set elsewhere in the file - confirm none exists.
ha_sofia_fetch_cache() {
    if [[ -n "$HA_CACHE_DIR" ]]; then
        return 0
    fi
    HA_CACHE_DIR=$(mktemp -d)
    export HA_CACHE_DIR
    # Single quotes defer expansion to exit time and keep the path quoted.
    trap '[[ -n "${HA_CACHE_DIR:-}" ]] && rm -rf -- "$HA_CACHE_DIR"' EXIT

    # The helper is best-effort: a crash (python3 missing, "requests" not
    # installed, ...) must not abort the whole health check under "set -e".
    local fetch_rc=0
    python3 << 'HA_FETCH_EOF' || fetch_rc=$?
import os, json, requests, sys

url = os.environ["HOME_ASSISTANT_SOFIA_URL"]
token = os.environ["HOME_ASSISTANT_SOFIA_TOKEN"]
cache = os.environ["HA_CACHE_DIR"]
headers = {"Authorization": f"Bearer {token}"}

errors = []

# Fetch states (used by checks 26, 28)
try:
    resp = requests.get(f"{url}/api/states", headers=headers, timeout=30)
    resp.raise_for_status()
    with open(f"{cache}/states.json", "w") as f:
        json.dump(resp.json(), f)
except Exception as e:
    errors.append(f"states:{e}")

# Fetch config entries (used by check 27)
try:
    resp = requests.get(f"{url}/api/config/config_entries/entry", headers=headers, timeout=30)
    resp.raise_for_status()
    with open(f"{cache}/entries.json", "w") as f:
        json.dump(resp.json(), f)
except Exception as e:
    errors.append(f"entries:{e}")

# Fetch config (used by check 29)
try:
    resp = requests.get(f"{url}/api/config", headers=headers, timeout=10)
    resp.raise_for_status()
    with open(f"{cache}/config.json", "w") as f:
        json.dump(resp.json(), f)
except Exception as e:
    errors.append(f"config:{e}")

if errors:
    with open(f"{cache}/errors.txt", "w") as f:
        f.write("\n".join(errors))
HA_FETCH_EOF

    if [[ "$fetch_rc" -ne 0 ]]; then
        # Recorded under the "states:" prefix so the entity check can show it.
        printf '%s\n' "states:fetch helper exited with status $fetch_rc (python3 or the requests module may be missing)" \
            >> "$HA_CACHE_DIR/errors.txt"
    fi
    return 0
}

# --- 26. HA Entity Availability ---
# Counts Home Assistant entities that have been "unavailable"/"unknown" for
# longer than STALE_HOURS (after the noise filters below), based on the cached
# states.json. PASS at 0, WARN for 1-10, FAIL above 10. A missing token,
# unreachable API or helper failure is a WARN.
# Globals: HA_CACHE_DIR, QUIET, JSON (read)
check_ha_entities() {
    section 26 "HA Sofia — Entity Availability"

    if ! ha_sofia_available; then
        # Print the header in quiet mode too, otherwise this warning cannot be
        # told apart from the identical one emitted by the other HA checks.
        [[ "$QUIET" == true ]] && section_always 26 "HA Sofia — Entity Availability"
        warn "HA Sofia token not configured — skipping"
        json_add "ha_entities" "WARN" "Token not configured"
        return 0
    fi

    ha_sofia_fetch_cache

    if [[ ! -f "$HA_CACHE_DIR/states.json" ]]; then
        local err=""
        if [[ -f "$HA_CACHE_DIR/errors.txt" ]]; then
            # "|| true": no matching line must not trip errexit/pipefail.
            err=$(grep "^states:" "$HA_CACHE_DIR/errors.txt" | head -1 || true)
        fi
        [[ "$QUIET" == true ]] && section_always 26 "HA Sofia — Entity Availability"
        warn "HA Sofia API unreachable: ${err:-unknown error}"
        json_add "ha_entities" "WARN" "API unreachable"
        return 0
    fi

    # Helper output: first line "count:total:summary", followed by one
    # "ENTITY:<entity_id>" line per counted entity.
    local result
    result=$(export HA_CACHE_DIR; python3 << 'PYEOF'
import os, json
from datetime import datetime, timezone, timedelta

# Noise filter rationale:
# * The HA "unavailable" state covers everything from "the iDRAC scrape failed
#   30 seconds ago" to "this iPhone hasn't checked in in 6 hours" to
#   "this YAML rest sensor has been broken for a week". Counting all of them
#   produces 400+ alerts that are mostly expected (phones in standby, lights
#   off, TVs idle).
# * Three filters dramatically cut noise without hiding real outages:
#     1. SKIP_DOMAINS — domains that go unavailable transiently by design
#        (mobile_app on backgrounded apps, notify per-device, button/scene/
#        event are momentary).
#     2. STALE_HOURS — only count entities that have been unavailable for
#        this long. A flapping integration that recovers in <24h is noise;
#        one stuck for >24h is real.
#     3. SKIP_DEVICE_HINTS — friendly-name substrings for things that come
#        and go (laptops, phones, TVs, vacuums, washers).
SKIP_DOMAINS = {"mobile_app", "device_tracker", "notify", "button", "scene",
                "event", "image", "update"}
# Substring matches, so "laptop" also covers names such as "laptop2".
SKIP_DEVICE_HINTS = ("iphone", "ipad", "macbook", "mac mini", "tv", "bravia",
                     "playstation", "switch", "roomba", "vacuum", "rumi",
                     "laptop", "phone", "перална", "сушилня", "миялна")
STALE_HOURS = 24

cache = os.environ["HA_CACHE_DIR"]
with open(f"{cache}/states.json") as f:
    states = json.load(f)

now = datetime.now(timezone.utc)
threshold = now - timedelta(hours=STALE_HOURS)

def is_stale(s):
    if s.get("state") not in ("unavailable", "unknown"):
        return False
    domain = s["entity_id"].split(".")[0]
    if domain in SKIP_DOMAINS:
        return False
    name = (s.get("attributes", {}).get("friendly_name") or "").lower()
    if any(h in name for h in SKIP_DEVICE_HINTS):
        return False
    # last_changed = when the state last flipped. If it flipped to unavailable
    # >24h ago and stayed there, the integration is genuinely broken.
    lc = s.get("last_changed") or s.get("last_updated")
    if not lc:
        return True  # no timestamp = treat as old
    try:
        dt = datetime.fromisoformat(lc.replace("Z", "+00:00"))
    except ValueError:
        return True
    if dt.tzinfo is None:
        # A timestamp without an offset cannot be compared with the aware
        # threshold; assume UTC instead of crashing the whole check.
        dt = dt.replace(tzinfo=timezone.utc)
    return dt < threshold

unavail = [s for s in states if is_stale(s)]
domains = {}
for s in unavail:
    d = s["entity_id"].split(".")[0]
    domains[d] = domains.get(d, 0) + 1

total = len(states)
count = len(unavail)
summary = ", ".join(f"{d}:{n}" for d, n in sorted(domains.items(), key=lambda x: -x[1]))
entity_list = "\n".join("ENTITY:" + s["entity_id"] for s in unavail)
print(f"{count}:{total}:{summary}")
if entity_list:
    print(entity_list)
PYEOF
) || result="ERROR:python execution failed"

    if [[ "$result" == "ERROR:"* ]]; then
        [[ "$QUIET" == true ]] && section_always 26 "HA Sofia — Entity Availability"
        warn "HA Sofia: ${result#ERROR:}"
        json_add "ha_entities" "WARN" "${result#ERROR:}"
        return 0
    fi

    local first_line count total summary
    first_line=$(echo "$result" | head -1)
    count=$(echo "$first_line" | cut -d: -f1)
    total=$(echo "$first_line" | cut -d: -f2)
    # The summary itself contains colons ("sensor:3, light:1"), hence -f3-.
    summary=$(echo "$first_line" | cut -d: -f3-)

    if [[ "$count" -eq 0 ]]; then
        pass "All $total HA entities available"
        json_add "ha_entities" "PASS" "0/$total unavailable"
    elif [[ "$count" -le 10 ]]; then
        [[ "$QUIET" == true ]] && section_always 26 "HA Sofia — Entity Availability"
        warn "$count/$total entities unavailable ($summary)"
        if [[ "$JSON" != true && "$QUIET" != true ]]; then
            echo "$result" | grep "^ENTITY:" | sed 's/^ENTITY:/    /'
        fi
        json_add "ha_entities" "WARN" "$count/$total: $summary"
    else
        [[ "$QUIET" == true ]] && section_always 26 "HA Sofia — Entity Availability"
        fail "$count/$total entities unavailable ($summary)"
        if [[ "$JSON" != true && "$QUIET" != true ]]; then
            # Cap the listing at 20 entities and summarise the remainder.
            echo "$result" | grep "^ENTITY:" | head -20 | sed 's/^ENTITY:/    /'
            local entity_count
            entity_count=$(echo "$result" | grep -c "^ENTITY:" || true)
            if [[ "$entity_count" -gt 20 ]]; then
                echo "    ... and $((entity_count - 20)) more"
            fi
        fi
        json_add "ha_entities" "FAIL" "$count/$total: $summary"
    fi
}

# --- 27. HA Integration Health ---
# Classifies Home Assistant config entries from the cached entries.json:
# FAIL when any entry is in setup_error / setup_retry, WARN when any entry is
# not_loaded, PASS otherwise. A missing token, missing cache file or helper
# failure is a WARN.
# Globals: HA_CACHE_DIR, QUIET (read)
check_ha_integrations() {
    section 27 "HA Sofia — Integration Health"

    if ! ha_sofia_available; then
        # Print the header in quiet mode too, otherwise this warning cannot be
        # told apart from the identical one emitted by the other HA checks.
        [[ "$QUIET" == true ]] && section_always 27 "HA Sofia — Integration Health"
        warn "HA Sofia token not configured — skipping"
        json_add "ha_integrations" "WARN" "Token not configured"
        return 0
    fi

    ha_sofia_fetch_cache

    if [[ ! -f "$HA_CACHE_DIR/entries.json" ]]; then
        [[ "$QUIET" == true ]] && section_always 27 "HA Sofia — Integration Health"
        warn "HA Sofia config entries API unavailable"
        json_add "ha_integrations" "WARN" "API unavailable"
        return 0
    fi

    # Helper output: a single TAB-separated line
    #   total, error_count, unloaded_count, error_names, unloaded_names
    # TAB is used instead of ":" because entry titles may contain colons
    # (e.g. "host:port"), which would shift the fields.
    local result
    result=$(export HA_CACHE_DIR; python3 << 'PYEOF'
import os, json

cache = os.environ["HA_CACHE_DIR"]
with open(f"{cache}/entries.json") as f:
    entries = json.load(f)

def clean(text):
    # Keep the field and line separators out of the free-text values.
    return str(text).replace("\t", " ").replace("\n", " ")

total = len(entries)
not_loaded = []
setup_error = []
for e in entries:
    state = e.get("state", "loaded")
    label = clean(f"{e.get('domain', '?')} ({e.get('title', '?')})")
    if state in ("setup_error", "setup_retry"):
        setup_error.append(label)
    elif state == "not_loaded":
        not_loaded.append(label)

fields = [str(total), str(len(setup_error)), str(len(not_loaded)),
          "; ".join(setup_error), "; ".join(not_loaded)]
print("\t".join(fields))
PYEOF
) || result="ERROR:python execution failed"

    if [[ "$result" == "ERROR:"* ]]; then
        [[ "$QUIET" == true ]] && section_always 27 "HA Sofia — Integration Health"
        warn "HA Sofia: ${result#ERROR:}"
        json_add "ha_integrations" "WARN" "${result#ERROR:}"
        return 0
    fi

    # cut splits on TAB by default and keeps empty fields in place.
    local total error_count unloaded_count error_names unloaded_names
    total=$(printf '%s\n' "$result" | cut -f1)
    error_count=$(printf '%s\n' "$result" | cut -f2)
    unloaded_count=$(printf '%s\n' "$result" | cut -f3)
    error_names=$(printf '%s\n' "$result" | cut -f4)
    unloaded_names=$(printf '%s\n' "$result" | cut -f5)

    if [[ "$error_count" -gt 0 ]]; then
        [[ "$QUIET" == true ]] && section_always 27 "HA Sofia — Integration Health"
        fail "$error_count integration(s) in error state: $error_names"
        json_add "ha_integrations" "FAIL" "$error_count errors: $error_names"
    elif [[ "$unloaded_count" -gt 0 ]]; then
        [[ "$QUIET" == true ]] && section_always 27 "HA Sofia — Integration Health"
        warn "$unloaded_count integration(s) not loaded: $unloaded_names"
        json_add "ha_integrations" "WARN" "$unloaded_count not loaded: $unloaded_names"
    else
        pass "All $total integrations loaded"
        json_add "ha_integrations" "PASS" "All $total loaded"
    fi
}

# --- 28. HA Automation Status ---
# Reports Home Assistant ("HA Sofia") automations that look abandoned, using
# the states.json snapshot in $HA_CACHE_DIR. Two conditions raise a WARN:
#   * disabled: state is "off" and last_changed is missing, unparseable, or
#     older than the disabled threshold;
#   * stale: state is not "off" and last_triggered is older than the stale
#     threshold (automations without a last_triggered value are not counted).
# This check never emits FAIL. Every path records one "ha_automations" JSON
# result and returns 0.
check_ha_automations() {
    section 28 "HA Sofia — Automation Status"

    # Thresholds are defined once here and handed to the Python snippet via
    # the environment, so the number printed in the warning below cannot
    # drift from the number actually used for filtering.
    local disabled_stale_days=180 stale_days=180

    if ! ha_sofia_available; then
        warn "HA Sofia token not configured — skipping"
        json_add "ha_automations" "WARN" "Token not configured"
        return 0
    fi

    ha_sofia_fetch_cache

    if [[ ! -f "$HA_CACHE_DIR/states.json" ]]; then
        [[ "$QUIET" == true ]] && section_always 28 "HA Sofia — Automation Status"
        warn "HA Sofia states API unavailable"
        json_add "ha_automations" "WARN" "API unavailable"
        return 0
    fi

    # The snippet prints a single line:
    #   total:disabled_count:stale_count:disabled_names:stale_names
    local result
    result=$(export HA_CACHE_DIR HA_DISABLED_STALE_DAYS="$disabled_stale_days" HA_STALE_DAYS="$stale_days"; python3 << 'PYEOF'
import os, json
from datetime import datetime, timezone

cache = os.environ["HA_CACHE_DIR"]
with open(f"{cache}/states.json") as f:
    states = json.load(f)

autos = [s for s in states if s["entity_id"].startswith("automation.")]
total = len(autos)

# Noise filter rationale (was: any disabled OR not-triggered-in-30d):
# * "Disabled" alone is fine — Viktor disables automations intentionally
#   (seasonal, holiday-only, paused). Only flag when ABANDONED, i.e.
#   disabled and untouched for longer than the threshold.
# * "Stale" alone is fine for low-frequency automations (annual reminders,
#   manual triggers), hence the high threshold.
# Both thresholds are supplied by the calling shell function.
DISABLED_STALE_DAYS = int(os.environ["HA_DISABLED_STALE_DAYS"])
STALE_DAYS = int(os.environ["HA_STALE_DAYS"])

now = datetime.now(timezone.utc)

def days_since(ts):
    if not ts:
        return None
    try:
        return (now - datetime.fromisoformat(ts.replace("Z", "+00:00"))).days
    except Exception:
        return None

disabled = []
stale = []
for a in autos:
    lt_days = days_since(a.get("attributes", {}).get("last_triggered"))
    changed_days = days_since(a.get("last_changed"))
    if a["state"] == "off":
        # Only flag a disabled automation if it has ALSO been untouched for
        # the threshold — i.e. genuinely abandoned, not "paused for now".
        # Use last_changed as a proxy for "user-touched recently".
        if changed_days is None or changed_days > DISABLED_STALE_DAYS:
            disabled.append(a["entity_id"])
    else:
        if lt_days is not None and lt_days > STALE_DAYS:
            stale.append(f"{a['entity_id']}={lt_days}d")

disabled_count = len(disabled)
stale_count = len(stale)
disabled_names = "; ".join(disabled)
stale_names = "; ".join(stale[:10])
print(f"{total}:{disabled_count}:{stale_count}:{disabled_names}:{stale_names}")
PYEOF
) || result="ERROR:python execution failed"

    if [[ "$result" == "ERROR:"* ]]; then
        [[ "$QUIET" == true ]] && section_always 28 "HA Sofia — Automation Status"
        warn "HA Sofia: ${result#ERROR:}"
        json_add "ha_automations" "WARN" "${result#ERROR:}"
        return 0
    fi

    # Split the colon-separated record; the last variable receives the
    # remainder of the line.
    local total disabled_count stale_count disabled_names stale_names
    IFS=: read -r total disabled_count stale_count disabled_names stale_names <<< "$result"

    local status="PASS" detail=""
    if [[ "$disabled_count" -gt 0 ]]; then
        [[ "$QUIET" == true ]] && section_always 28 "HA Sofia — Automation Status"
        warn "$disabled_count/$total automation(s) disabled"
        if [[ "$JSON" != true && "$QUIET" != true && -n "$disabled_names" ]]; then
            echo "$disabled_names" | tr ';' '\n' | sed 's/^ */    /'
        fi
        status="WARN"
        detail+="$disabled_count disabled; "
    fi

    if [[ "$stale_count" -gt 0 ]]; then
        # Print the quiet-mode header only if the disabled branch has not
        # already printed it.
        [[ "$status" == "PASS" && "$QUIET" == true ]] && section_always 28 "HA Sofia — Automation Status"
        warn "$stale_count automation(s) not triggered in ${stale_days}+ days"
        if [[ "$JSON" != true && "$QUIET" != true && -n "$stale_names" ]]; then
            echo "$stale_names" | tr ';' '\n' | sed 's/^ */    /'
        fi
        [[ "$status" == "PASS" ]] && status="WARN"
        detail+="$stale_count stale; "
    fi

    if [[ "$status" == "PASS" ]]; then
        pass "All $total automations enabled and recently active"
        json_add "ha_automations" "PASS" "All $total active"
    else
        json_add "ha_automations" "$status" "$detail"
    fi
}

# --- 29. HA System Resources ---
# Reads CPU / memory / disk percentage sensors and the HA version from the
# cached states.json and config.json in $HA_CACHE_DIR. Any value above 90
# yields FAIL, any value above 80 yields WARN, otherwise PASS. Sensors that
# cannot be found or parsed are left out of the detail string. Every path
# records one "ha_system" JSON result and returns 0.
check_ha_system() {
    local heading="HA Sofia — System Resources"
    section 29 "$heading"

    if ! ha_sofia_available; then
        warn "HA Sofia token not configured — skipping"
        json_add "ha_system" "WARN" "Token not configured"
        return 0
    fi

    ha_sofia_fetch_cache

    # Both cache files are required.
    if ! [[ -f "$HA_CACHE_DIR/states.json" && -f "$HA_CACHE_DIR/config.json" ]]; then
        [[ "$QUIET" == true ]] && section_always 29 "$heading"
        warn "HA Sofia API unavailable for system check"
        json_add "ha_system" "WARN" "API unavailable"
        return 0
    fi

    # The snippet prints "<LEVEL>:version=...[:cpu=N][:mem=N][:disk=N]".
    local report
    report=$(export HA_CACHE_DIR; python3 << 'PYEOF'
import json
import os

cache_dir = os.environ["HA_CACHE_DIR"]
with open(f"{cache_dir}/states.json") as fh:
    state_rows = json.load(fh)
with open(f"{cache_dir}/config.json") as fh:
    ha_config = json.load(fh)

ha_version = ha_config.get("version", "unknown")
by_id = {row["entity_id"]: row for row in state_rows}

CANDIDATES = (
    ("cpu", ["sensor.processor_use", "sensor.system_monitor_processor_use"]),
    ("mem", ["sensor.memory_use_percent", "sensor.system_monitor_memory_use_percent"]),
    ("disk", ["sensor.disk_use_percent", "sensor.disk_use_percent_", "sensor.system_monitor_disk_use_percent"]),
)


def as_float(raw):
    """Return raw as a float, or None when it is not numeric."""
    try:
        return float(raw)
    except (ValueError, TypeError):
        return None


def lookup(names):
    """Exact entity-id match first, then a substring match on *percent* ids."""
    for name in names:
        if name in by_id:
            value = as_float(by_id[name]["state"])
            if value is not None:
                return value
    for entity_id, row in by_id.items():
        for name in names:
            if name.rstrip("_") in entity_id and "percent" in entity_id:
                value = as_float(row["state"])
                if value is not None:
                    return value
    return None


readings = [(label, lookup(names)) for label, names in CANDIDATES]

fields = ["version=" + ha_version]
for label, value in readings:
    if value is not None:
        fields.append(label + "=" + str(int(value)))

verdict = "PASS"
for _, value in readings:
    if value is None:
        continue
    if value > 90:
        verdict = "FAIL"
        break
    if value > 80:
        verdict = "WARN"

print(verdict + ":" + ":".join(fields))
PYEOF
) || report="ERROR:python execution failed"

    if [[ "$report" == "ERROR:"* ]]; then
        [[ "$QUIET" == true ]] && section_always 29 "$heading"
        warn "HA Sofia: ${report#ERROR:}"
        json_add "ha_system" "WARN" "${report#ERROR:}"
        return 0
    fi

    # First field is the verdict; everything after the first colon is detail.
    local level="${report%%:*}"
    local detail="${report#*:}"

    case "$level" in
        FAIL)
            [[ "$QUIET" == true ]] && section_always 29 "$heading"
            fail "HA Sofia resources critical: $detail"
            json_add "ha_system" "FAIL" "$detail"
            ;;
        WARN)
            [[ "$QUIET" == true ]] && section_always 29 "$heading"
            warn "HA Sofia resources elevated: $detail"
            json_add "ha_system" "WARN" "$detail"
            ;;
        *)
            pass "HA Sofia healthy ($detail)"
            json_add "ha_system" "PASS" "$detail"
            ;;
    esac
}

# --- 30. Hardware Exporters ---
# Two-part check:
#   1. every exporter in the list has at least one pod, and no pod is in a
#      phase other than Running/Completed (FAIL otherwise);
#   2. Prometheus reports up == 1 for each expected scrape job (WARN when the
#      job is absent from the query result, FAIL when any series for the job
#      is not "1", WARN when Prometheus cannot be queried at all).
# Records one "hardware_exporters" JSON result.
check_hardware_exporters() {
    section 30 "Hardware Exporters"
    local detail="" had_issue=false status="PASS"

    # Entries are "<namespace>:<app name>".
    local exporters=(
        "monitoring:snmp-exporter"
        "monitoring:idrac-redfish-exporter"
        "monitoring:proxmox-exporter"
        "tuya-bridge:tuya-bridge"
    )

    local entry
    for entry in "${exporters[@]}"; do
        local ns="${entry%%:*}"
        local name="${entry##*:}"
        local pods
        pods=$($KUBECTL get pods -n "$ns" -l "app=$name" --no-headers 2>/dev/null || true)

        # If label selector returns nothing, try matching by deployment name prefix
        if [[ -z "$pods" ]]; then
            pods=$($KUBECTL get pods -n "$ns" --no-headers 2>/dev/null | grep "^${name}-" || true)
        fi

        if [[ -z "$pods" ]]; then
            [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 30 "Hardware Exporters"
            fail "$ns/$name: no pods found"
            detail+="$ns/$name=missing; "
            had_issue=true
            status="FAIL"
            continue
        fi

        # Column 3 of `kubectl get pods --no-headers` is the pod status.
        local not_running
        not_running=$(echo "$pods" | awk '$3 != "Running" && $3 != "Completed" {print $1 ": " $3}' || true)
        if [[ -n "$not_running" ]]; then
            [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 30 "Hardware Exporters"
            fail "$ns/$name pod not running: $not_running"
            detail+="$ns/$name=not-running; "
            had_issue=true
            status="FAIL"
        fi
    done

    # Check Prometheus scrape targets for hardware exporters
    local prom_jobs=("snmp-idrac" "snmp-ups" "redfish-idrac" "proxmox-host")
    local up_result
    up_result=$($KUBECTL exec -n monitoring deploy/prometheus-server -- \
        wget -q -O- 'http://localhost:9090/api/v1/query?query=up' 2>/dev/null || true)

    if [[ -n "$up_result" ]]; then
        local job
        for job in "${prom_jobs[@]}"; do
            # The job name is passed as an argument rather than spliced into
            # the Python source. All series for the job are inspected: the
            # job counts as up only if every series reports "1"; otherwise
            # the first non-"1" value is printed.
            local job_up
            job_up=$(python3 -c '
import json, sys
job = sys.argv[1]
data = json.load(sys.stdin)
values = [
    str(r.get("value", [0, "0"])[1])
    for r in data.get("data", {}).get("result", [])
    if r.get("metric", {}).get("job") == job
]
if not values:
    print("missing")
elif all(v == "1" for v in values):
    print("1")
else:
    print(next(v for v in values if v != "1"))
' "$job" <<< "$up_result" 2>/dev/null) || job_up="error"

            if [[ "$job_up" == "1" ]]; then
                detail+="$job=up; "
            elif [[ "$job_up" == "missing" ]]; then
                [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 30 "Hardware Exporters"
                warn "Prometheus target '$job' not found"
                detail+="$job=missing; "
                had_issue=true
                [[ "$status" != "FAIL" ]] && status="WARN"
            else
                [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 30 "Hardware Exporters"
                fail "Prometheus target '$job' is down (up=$job_up)"
                detail+="$job=down; "
                had_issue=true
                status="FAIL"
            fi
        done
    else
        [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 30 "Hardware Exporters"
        warn "Cannot query Prometheus for exporter targets"
        detail+="prometheus-query-failed; "
        had_issue=true
        [[ "$status" != "FAIL" ]] && status="WARN"
    fi

    if [[ "$had_issue" == false ]]; then
        pass "All hardware exporters running and scraped by Prometheus"
    fi
    json_add "hardware_exporters" "$status" "${detail:-All healthy}"
}

# Probe for the cert-manager Certificate CRD. Produces no output; the exit
# status is whatever the kubectl lookup returns (0 when the CRD exists).
cert_manager_installed() {
    local -a crd_lookup=(get crd certificates.cert-manager.io -o name)
    $KUBECTL "${crd_lookup[@]}" &>/dev/null
}

# --- 31. cert-manager: Certificate Readiness ---
# FAILs for every Certificate CR whose Ready condition is absent or not
# "True". Reports PASS ("N/A") when the cert-manager CRD is not installed and
# WARN when the API query or the JSON parsing fails. Records one
# "certmanager_certificates" JSON result and returns 0.
check_cert_manager_certificates() {
    section 31 "cert-manager — Certificate Readiness"
    local certs not_ready detail="" status="PASS"

    if ! cert_manager_installed; then
        pass "cert-manager not installed — N/A"
        json_add "certmanager_certificates" "PASS" "N/A (cert-manager not installed)"
        return 0
    fi

    certs=$($KUBECTL get certificates.cert-manager.io -A -o json 2>/dev/null) || {
        [[ "$QUIET" == true ]] && section_always 31 "cert-manager — Certificate Readiness"
        warn "cert-manager CRDs installed but API query failed"
        json_add "certmanager_certificates" "WARN" "API query failed"
        return 0
    }

    # One "<ns>/<name>:<reason>" line per Certificate that is not Ready.
    # A parser failure must not be mistaken for "nothing to report", so it is
    # surfaced as WARN instead of falling through to PASS.
    not_ready=$(python3 -c '
import json, sys
data = json.load(sys.stdin)
for item in data.get("items", []):
    ns = item["metadata"]["namespace"]
    name = item["metadata"]["name"]
    conds = item.get("status", {}).get("conditions", [])
    ready = next((c for c in conds if c.get("type") == "Ready"), None)
    if not ready or ready.get("status") != "True":
        reason = ready.get("reason", "NoCondition") if ready else "NoCondition"
        print(f"{ns}/{name}:{reason}")
' <<< "$certs" 2>/dev/null) || {
        [[ "$QUIET" == true ]] && section_always 31 "cert-manager — Certificate Readiness"
        warn "Failed to parse Certificate list"
        json_add "certmanager_certificates" "WARN" "Parse error"
        return 0
    }

    if [[ -z "$not_ready" ]]; then
        pass "All Certificate CRs Ready"
        json_add "certmanager_certificates" "PASS" "All Ready"
    else
        [[ "$QUIET" == true ]] && section_always 31 "cert-manager — Certificate Readiness"
        local count line
        count=$(count_lines "$not_ready")
        while IFS= read -r line; do
            fail "Certificate not Ready: $line"
            detail+="$line; "
        done <<< "$not_ready"
        status="FAIL"
        json_add "certmanager_certificates" "$status" "$count not Ready: $detail"
    fi
}

# --- 32. cert-manager: Certificate Expiry (<14d) ---
# Looks at status.notAfter of every Certificate CR. Certificates expiring
# within 14 days WARN; those with 3 or fewer whole days left (including ones
# already expired) FAIL. Certificates without a parseable notAfter are
# ignored. Reports PASS ("N/A") when cert-manager is not installed and WARN
# when the API query or the JSON parsing fails. Records one
# "certmanager_expiry" JSON result and returns 0.
check_cert_manager_expiry() {
    section 32 "cert-manager — Certificate Expiry (<14d)"
    local certs expiring detail="" status="PASS"

    if ! cert_manager_installed; then
        pass "cert-manager not installed — N/A"
        json_add "certmanager_expiry" "PASS" "N/A (cert-manager not installed)"
        return 0
    fi

    certs=$($KUBECTL get certificates.cert-manager.io -A -o json 2>/dev/null) || {
        [[ "$QUIET" == true ]] && section_always 32 "cert-manager — Certificate Expiry (<14d)"
        warn "cert-manager CRDs installed but API query failed"
        json_add "certmanager_expiry" "WARN" "API query failed"
        return 0
    }

    # One "<LEVEL>:<ns>/<name>:<days>" line per certificate inside the window.
    # A parser failure is reported as WARN rather than silently read as
    # "nothing expiring".
    expiring=$(python3 -c '
import json, sys
from datetime import datetime, timezone, timedelta
data = json.load(sys.stdin)
now = datetime.now(timezone.utc)
cutoff = now + timedelta(days=14)
for item in data.get("items", []):
    ns = item["metadata"]["namespace"]
    name = item["metadata"]["name"]
    not_after = item.get("status", {}).get("notAfter")
    if not not_after:
        continue
    try:
        expiry = datetime.fromisoformat(not_after.replace("Z", "+00:00"))
    except ValueError:
        continue
    if expiry.tzinfo is None:
        # A timestamp without an offset cannot be compared with an aware
        # datetime; treat it as UTC.
        expiry = expiry.replace(tzinfo=timezone.utc)
    if expiry < cutoff:
        days = (expiry - now).days
        level = "FAIL" if days <= 3 else "WARN"
        print(f"{level}:{ns}/{name}:{days}")
' <<< "$certs" 2>/dev/null) || {
        [[ "$QUIET" == true ]] && section_always 32 "cert-manager — Certificate Expiry (<14d)"
        warn "Failed to parse Certificate list"
        json_add "certmanager_expiry" "WARN" "Parse error"
        return 0
    }

    if [[ -z "$expiring" ]]; then
        pass "No Certificate CRs expiring within 14 days"
        json_add "certmanager_expiry" "PASS" "None expiring <14d"
    else
        [[ "$QUIET" == true ]] && section_always 32 "cert-manager — Certificate Expiry (<14d)"
        local line level cert_name days
        while IFS= read -r line; do
            IFS=: read -r level cert_name days <<< "$line"
            if [[ "$days" == -* ]]; then
                # Negative day count: notAfter is already in the past.
                fail "Certificate $cert_name has expired"
                status="FAIL"
            elif [[ "$level" == "FAIL" ]]; then
                fail "Certificate $cert_name expires in ${days}d"
                status="FAIL"
            else
                warn "Certificate $cert_name expires in ${days}d"
                [[ "$status" != "FAIL" ]] && status="WARN"
            fi
            detail+="$cert_name=${days}d; "
        done <<< "$expiring"
        json_add "certmanager_expiry" "$status" "$detail"
    fi
}

# --- 33. cert-manager: Failed CertificateRequests ---
# FAILs for every CertificateRequest carrying a Ready condition with
# status "False" and reason "Failed". Reports PASS ("N/A") when cert-manager
# is not installed and WARN when the API query or the JSON parsing fails.
# Records one "certmanager_requests" JSON result and returns 0.
check_cert_manager_requests() {
    section 33 "cert-manager — Failed CertificateRequests"
    local requests failed detail="" status="PASS"

    if ! cert_manager_installed; then
        pass "cert-manager not installed — N/A"
        json_add "certmanager_requests" "PASS" "N/A (cert-manager not installed)"
        return 0
    fi

    requests=$($KUBECTL get certificaterequests.cert-manager.io -A -o json 2>/dev/null) || {
        [[ "$QUIET" == true ]] && section_always 33 "cert-manager — Failed CertificateRequests"
        warn "cert-manager CRDs installed but API query failed"
        json_add "certmanager_requests" "WARN" "API query failed"
        return 0
    }

    # One "<ns>/<name>:<message, max 80 chars>" line per failed request.
    # The message is computed outside the f-string: escaped quotes inside an
    # f-string expression are a SyntaxError, which previously made this
    # snippet fail on every run and the check report PASS unconditionally.
    # Whitespace in the message is collapsed so each request stays on one line.
    failed=$(python3 -c '
import json, sys
data = json.load(sys.stdin)
for item in data.get("items", []):
    ns = item["metadata"]["namespace"]
    name = item["metadata"]["name"]
    conds = item.get("status", {}).get("conditions", [])
    for c in conds:
        if c.get("type") == "Ready" and c.get("status") == "False" and c.get("reason") == "Failed":
            message = " ".join(str(c.get("message") or "").split())[:80]
            print(f"{ns}/{name}:{message}")
            break
' <<< "$requests" 2>/dev/null) || {
        [[ "$QUIET" == true ]] && section_always 33 "cert-manager — Failed CertificateRequests"
        warn "Failed to parse CertificateRequest list"
        json_add "certmanager_requests" "WARN" "Parse error"
        return 0
    }

    if [[ -z "$failed" ]]; then
        pass "No failed CertificateRequests"
        json_add "certmanager_requests" "PASS" "None failed"
    else
        [[ "$QUIET" == true ]] && section_always 33 "cert-manager — Failed CertificateRequests"
        local count line
        count=$(count_lines "$failed")
        while IFS= read -r line; do
            fail "CertificateRequest failed: $line"
            detail+="$line; "
        done <<< "$failed"
        status="FAIL"
        json_add "certmanager_requests" "$status" "$count failed: $detail"
    fi
}

# --- 34. Backup Freshness: Per-DB Dumps ---
# FAILs when a per-database backup CronJob has no lastSuccessfulTime, when
# that timestamp cannot be parsed, or when the last success is more than
# 25 hours old. Records one "backup_per_db" JSON result.
check_backup_per_db() {
    section 34 "Backup Freshness — Per-DB Dumps"
    local detail="" had_issue=false status="PASS"

    # Freshness threshold: 25 hours
    local now_epoch max_age_sec
    now_epoch=$(date -u +%s)
    max_age_sec=$((25 * 3600))

    # Helper: evaluate one CronJob. Reads now_epoch/max_age_sec and updates
    # detail/had_issue/status of the enclosing function (bash dynamic scope).
    #   $1 namespace, $2 CronJob name, $3 short label used in messages.
    _check_cronjob_fresh() {
        local ns="$1" cj="$2" label="$3"
        local ts ts_epoch age_sec age_h
        ts=$($KUBECTL get cronjob -n "$ns" "$cj" -o jsonpath='{.status.lastSuccessfulTime}' 2>/dev/null || true)
        if [[ -z "$ts" ]]; then
            [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 34 "Backup Freshness — Per-DB Dumps"
            fail "$label: CronJob $ns/$cj has no lastSuccessfulTime"
            detail+="${label}=no-success; "
            had_issue=true
            status="FAIL"
            return 0
        fi

        # Report an unparseable timestamp as such instead of falling back to
        # epoch 0, which would show up as a meaningless age of many years.
        if ! ts_epoch=$(date -u -d "$ts" +%s 2>/dev/null); then
            [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 34 "Backup Freshness — Per-DB Dumps"
            fail "$label: cannot parse lastSuccessfulTime '$ts'"
            detail+="${label}=bad-timestamp; "
            had_issue=true
            status="FAIL"
            return 0
        fi

        age_sec=$((now_epoch - ts_epoch))
        age_h=$((age_sec / 3600))
        if [[ "$age_sec" -gt "$max_age_sec" ]]; then
            [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 34 "Backup Freshness — Per-DB Dumps"
            fail "$label: last success ${age_h}h ago (>25h)"
            had_issue=true
            status="FAIL"
        fi
        detail+="${label}=${age_h}h; "
    }

    _check_cronjob_fresh dbaas mysql-backup-per-db mysql
    _check_cronjob_fresh dbaas postgresql-backup-per-db pg

    [[ "$had_issue" == false ]] && pass "Per-DB dumps fresh — $detail"
    json_add "backup_per_db" "$status" "$detail"
}

# --- 35. Backup Freshness: Offsite Sync ---
# Reads backup_last_success_timestamp for "offsite-backup-sync" from the
# Pushgateway /metrics page (fetched via the prometheus-server deployment).
# FAILs when the metric is missing or older than 27 hours, WARNs when the
# Pushgateway cannot be reached or the value cannot be parsed. Records one
# "backup_offsite_sync" JSON result and returns 0.
check_backup_offsite_sync() {
    section 35 "Backup Freshness — Offsite Sync"
    local metrics detail="" status="PASS"

    metrics=$($KUBECTL exec -n monitoring deploy/prometheus-server -- \
        wget -qO- "http://prometheus-prometheus-pushgateway:9091/metrics" 2>/dev/null || true)

    if [[ -z "$metrics" ]]; then
        [[ "$QUIET" == true ]] && section_always 35 "Backup Freshness — Offsite Sync"
        warn "Cannot query Pushgateway"
        json_add "backup_offsite_sync" "WARN" "Pushgateway unreachable"
        return 0
    fi

    # Prints the age in hours with one decimal, or "missing" when no matching
    # sample line is found.
    local age_hours
    age_hours=$(python3 -c '
import sys, re, time
ts = None
for line in sys.stdin:
    if line.startswith("#"):
        continue
    if "backup_last_success_timestamp" in line and "offsite-backup-sync" in line:
        m = re.search(r"\s([0-9.eE+]+)\s*$", line.strip())
        if m:
            try:
                ts = float(m.group(1))
                break
            except ValueError:
                pass
if ts is None:
    print("missing")
else:
    age = (time.time() - ts) / 3600
    print(f"{age:.1f}")
' <<< "$metrics" 2>/dev/null) || age_hours="error"

    if [[ "$age_hours" == "missing" ]]; then
        [[ "$QUIET" == true ]] && section_always 35 "Backup Freshness — Offsite Sync"
        fail "backup_last_success_timestamp metric missing for offsite-backup-sync"
        json_add "backup_offsite_sync" "FAIL" "Metric missing"
    elif [[ "$age_hours" == "error" ]]; then
        [[ "$QUIET" == true ]] && section_always 35 "Backup Freshness — Offsite Sync"
        warn "Failed to parse Pushgateway metric"
        json_add "backup_offsite_sync" "WARN" "Parse error"
    else
        # Compare the fractional value directly. Rounding to an integer first
        # would let ages such as 27.3h pass the ">27h" threshold.
        if LC_ALL=C awk -v age="$age_hours" 'BEGIN { exit !(age + 0 > 27) }'; then
            [[ "$QUIET" == true ]] && section_always 35 "Backup Freshness — Offsite Sync"
            fail "Offsite sync last success ${age_hours}h ago (>27h)"
            status="FAIL"
        else
            pass "Offsite sync last success ${age_hours}h ago"
        fi
        detail="age=${age_hours}h"
        json_add "backup_offsite_sync" "$status" "$detail"
    fi
}

# --- 36. Backup Freshness: LVM PVC Snapshots ---
# Lists logical volumes whose name contains "_snap" on the host at
# 192.168.1.127 over SSH and FAILs when the newest one is more than 25 hours
# old. WARNs when SSH fails, no snapshots are listed, or no timestamp can be
# parsed. Records one "backup_lvm_snapshots" JSON result and returns 0.
check_backup_lvm_snapshots() {
    section 36 "Backup Freshness — LVM PVC Snapshots"
    local snap_output detail="" status="PASS"

    # NOTE(review): StrictHostKeyChecking=no skips host-key verification for
    # this root login — confirm that is acceptable on this network.
    snap_output=$(ssh -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no \
        root@192.168.1.127 "lvs -o lv_name,lv_time --noheadings 2>/dev/null | grep _snap" 2>/dev/null || true)

    if [[ -z "$snap_output" ]]; then
        [[ "$QUIET" == true ]] && section_always 36 "Backup Freshness — LVM PVC Snapshots"
        warn "No LVM PVC snapshots found or SSH to 192.168.1.127 failed (BatchMode)"
        json_add "backup_lvm_snapshots" "WARN" "SSH failed or no snapshots"
        return 0
    fi

    # Prints the age of the newest snapshot in hours (one decimal), or
    # "parse_error" when no line carries a recognisable timestamp.
    local newest_age_hours
    newest_age_hours=$(python3 -c '
import sys, re, time
from datetime import datetime
newest = None
for line in sys.stdin:
    line = line.strip()
    if not line:
        continue
    parts = line.split(None, 1)
    if len(parts) < 2:
        continue
    date_str = parts[1].strip()
    # lv_time format: "2026-04-19 03:00:01 +0000" or similar
    for fmt in ("%Y-%m-%d %H:%M:%S %z", "%Y-%m-%d %H:%M:%S"):
        try:
            dt = datetime.strptime(date_str, fmt)
            ts = dt.timestamp()
            if newest is None or ts > newest:
                newest = ts
            break
        except ValueError:
            continue
if newest is None:
    print("parse_error")
else:
    age = (time.time() - newest) / 3600
    print(f"{age:.1f}")
' <<< "$snap_output" 2>/dev/null) || newest_age_hours="error"

    if [[ "$newest_age_hours" == "parse_error" || "$newest_age_hours" == "error" ]]; then
        [[ "$QUIET" == true ]] && section_always 36 "Backup Freshness — LVM PVC Snapshots"
        warn "Could not parse LVM snapshot timestamps"
        json_add "backup_lvm_snapshots" "WARN" "Parse error"
    else
        local count
        count=$(count_lines "$snap_output")
        # Compare the fractional value directly. Rounding to an integer first
        # would let ages such as 25.3h pass the ">25h" threshold.
        if LC_ALL=C awk -v age="$newest_age_hours" 'BEGIN { exit !(age + 0 > 25) }'; then
            [[ "$QUIET" == true ]] && section_always 36 "Backup Freshness — LVM PVC Snapshots"
            fail "Newest LVM snapshot ${newest_age_hours}h old (>25h); $count total"
            status="FAIL"
        else
            pass "LVM snapshots fresh — $count total, newest ${newest_age_hours}h old"
        fi
        detail="count=$count newest=${newest_age_hours}h"
        json_add "backup_lvm_snapshots" "$status" "$detail"
    fi
}

# --- 37. Monitoring: Prometheus + Alertmanager ---
# FAILs when the Prometheus /-/ready endpoint does not answer with text
# containing "ready" (case-insensitive), or when no pod in the monitoring
# namespace whose listing line contains "alertmanager" is in status Running.
# Records one "monitoring_prom_am" JSON result.
check_monitoring_prom_am() {
    section 37 "Monitoring — Prometheus + Alertmanager"
    local detail="" had_issue=false status="PASS"

    # Prometheus /-/ready
    local prom_ready
    prom_ready=$($KUBECTL exec -n monitoring deploy/prometheus-server -- \
        wget -qO- "http://localhost:9090/-/ready" 2>/dev/null || true)
    if grep -qi "ready" <<< "$prom_ready"; then
        detail+="prometheus=ready; "
    else
        [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 37 "Monitoring — Prometheus + Alertmanager"
        fail "Prometheus /-/ready returned no Ready response"
        detail+="prometheus=not-ready; "
        had_issue=true
        status="FAIL"
    fi

    # Alertmanager running pod count.
    # Counting is done in a single awk that always prints a number. A
    # `grep | awk | wc -l` pipeline returns non-zero under `set -o pipefail`
    # when grep matches nothing, and with `set -e` that aborts the script in
    # exactly the situation this check is meant to report.
    local am_running
    am_running=$($KUBECTL get pods -n monitoring --no-headers 2>/dev/null \
        | awk '/alertmanager/ && $3 == "Running" { n++ } END { print n + 0 }' || true)
    [[ "$am_running" =~ ^[0-9]+$ ]] || am_running=0
    if [[ "$am_running" -gt 0 ]]; then
        detail+="alertmanager=${am_running} running; "
    else
        [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 37 "Monitoring — Prometheus + Alertmanager"
        fail "Alertmanager: 0 Running pods"
        detail+="alertmanager=none-running; "
        had_issue=true
        status="FAIL"
    fi

    [[ "$had_issue" == false ]] && pass "Prometheus Ready, $am_running Alertmanager pod(s) Running"
    json_add "monitoring_prom_am" "$status" "$detail"
}

# --- 38. Monitoring: Vault Sealed Status ---
# Runs `vault status` inside pod vault-0 and inspects the "Sealed" line:
# false -> PASS, true -> FAIL, anything else -> WARN. An entirely empty
# reply is a FAIL. Records one "monitoring_vault" JSON result.
check_monitoring_vault() {
    local heading="Monitoring — Vault Sealed Status"
    section 38 "$heading"
    local output detail="" status="PASS"

    # stderr is captured as well and the exit status is ignored.
    output=$($KUBECTL exec -n vault vault-0 -- \
        sh -c 'VAULT_ADDR=http://127.0.0.1:8200 vault status' 2>&1 || true)

    if [[ -z "$output" ]]; then
        [[ "$QUIET" == true ]] && section_always 38 "$heading"
        fail "Cannot exec vault status on vault-0"
        json_add "monitoring_vault" "FAIL" "Exec failed"
        return 0
    fi

    # Classify the reply first, then report.
    local seal_state="unknown"
    if grep -qi "^Sealed[[:space:]]*false" <<< "$output"; then
        seal_state="open"
    elif grep -qi "^Sealed[[:space:]]*true" <<< "$output"; then
        seal_state="sealed"
    fi

    case "$seal_state" in
        open)
            pass "Vault unsealed"
            detail="sealed=false"
            json_add "monitoring_vault" "PASS" "$detail"
            ;;
        sealed)
            [[ "$QUIET" == true ]] && section_always 38 "$heading"
            fail "Vault is SEALED — secrets unavailable"
            detail="sealed=true"
            status="FAIL"
            json_add "monitoring_vault" "$status" "$detail"
            ;;
        *)
            [[ "$QUIET" == true ]] && section_always 38 "$heading"
            warn "Cannot parse vault status output"
            json_add "monitoring_vault" "WARN" "Parse error"
            ;;
    esac
}

# --- 39. Monitoring: ClusterSecretStore Ready ---
# FAILs for every ClusterSecretStore whose Ready condition is absent or not
# "True". WARNs when the resource cannot be listed or the JSON cannot be
# parsed. Records one "monitoring_css" JSON result and returns 0.
check_monitoring_css() {
    section 39 "Monitoring — ClusterSecretStore Ready"
    local css not_ready detail="" status="PASS"

    css=$($KUBECTL get clustersecretstore -o json 2>/dev/null) || {
        [[ "$QUIET" == true ]] && section_always 39 "Monitoring — ClusterSecretStore Ready"
        warn "ClusterSecretStore CRD not installed"
        json_add "monitoring_css" "WARN" "CRD missing"
        return 0
    }

    # One "<name>:<reason>" line per store that is not Ready.
    # The reason is computed outside the f-string: escaped quotes inside an
    # f-string expression are a SyntaxError, which previously made this
    # snippet fail on every run so that not-ready stores were never reported.
    not_ready=$(python3 -c '
import json, sys
data = json.load(sys.stdin)
for item in data.get("items", []):
    name = item["metadata"]["name"]
    conds = item.get("status", {}).get("conditions", [])
    ready = next((c for c in conds if c.get("type") == "Ready"), None)
    if not ready or ready.get("status") != "True":
        reason = ready.get("reason", "NoCondition") if ready else "NoCondition"
        print(f"{name}:{reason}")
' <<< "$css" 2>/dev/null) || {
        [[ "$QUIET" == true ]] && section_always 39 "Monitoring — ClusterSecretStore Ready"
        warn "Failed to parse ClusterSecretStore list"
        json_add "monitoring_css" "WARN" "Parse error"
        return 0
    }

    if [[ -z "$not_ready" ]]; then
        local total
        total=$(python3 -c 'import json,sys; print(len(json.load(sys.stdin).get("items",[])))' <<< "$css" 2>/dev/null || echo "?")
        pass "All $total ClusterSecretStores Ready"
        json_add "monitoring_css" "PASS" "$total Ready"
    else
        [[ "$QUIET" == true ]] && section_always 39 "Monitoring — ClusterSecretStore Ready"
        local line
        while IFS= read -r line; do
            fail "ClusterSecretStore not Ready: $line"
            detail+="$line; "
        done <<< "$not_ready"
        status="FAIL"
        json_add "monitoring_css" "$status" "$detail"
    fi
}

# --- 40. External Reachability: Cloudflared + Authentik Replicas ---
# Compares status.readyReplicas with spec.replicas for the cloudflared and
# goauthentik-server deployments. A mismatch is a FAIL; a missing cloudflared
# deployment is a FAIL, a missing goauthentik-server deployment a WARN.
# Records one "external_replicas" JSON result.
check_external_replicas() {
    local heading="External — Cloudflared + Authentik Replicas"
    section 40 "$heading"
    local detail="" had_issue=false status="PASS"

    # One-liners that pull a replica count out of a deployment JSON on stdin.
    local py_ready='import json,sys; print(json.load(sys.stdin).get("status",{}).get("readyReplicas",0) or 0)'
    local py_desired='import json,sys; print(json.load(sys.stdin).get("spec",{}).get("replicas",0) or 0)'

    # Cloudflared
    local tunnel_json tunnel_ready tunnel_desired
    tunnel_json=$($KUBECTL get deployment cloudflared -n cloudflared -o json 2>/dev/null || true)
    if [[ -n "$tunnel_json" ]]; then
        tunnel_ready=$(python3 -c "$py_ready" <<< "$tunnel_json" 2>/dev/null || echo "0")
        tunnel_desired=$(python3 -c "$py_desired" <<< "$tunnel_json" 2>/dev/null || echo "0")
        if [[ "$tunnel_ready" == "$tunnel_desired" ]]; then
            detail+="cloudflared=${tunnel_ready}/${tunnel_desired}; "
        else
            if [[ "$had_issue" == false && "$QUIET" == true ]]; then
                section_always 40 "$heading"
            fi
            fail "Cloudflared: $tunnel_ready/$tunnel_desired ready (external access degraded)"
            detail+="cloudflared=${tunnel_ready}/${tunnel_desired}; "
            had_issue=true
            status="FAIL"
        fi
    else
        if [[ "$had_issue" == false && "$QUIET" == true ]]; then
            section_always 40 "$heading"
        fi
        fail "Cloudflared deployment not found"
        detail+="cloudflared=missing; "
        had_issue=true
        status="FAIL"
    fi

    # Authentik server (Helm chart names the deployment goauthentik-server)
    local idp_json idp_ready idp_desired
    idp_json=$($KUBECTL get deployment goauthentik-server -n authentik -o json 2>/dev/null || true)
    if [[ -n "$idp_json" ]]; then
        idp_ready=$(python3 -c "$py_ready" <<< "$idp_json" 2>/dev/null || echo "0")
        idp_desired=$(python3 -c "$py_desired" <<< "$idp_json" 2>/dev/null || echo "0")
        if [[ "$idp_ready" == "$idp_desired" ]]; then
            detail+="authentik=${idp_ready}/${idp_desired}; "
        else
            if [[ "$had_issue" == false && "$QUIET" == true ]]; then
                section_always 40 "$heading"
            fi
            fail "goauthentik-server: $idp_ready/$idp_desired ready (auth degraded)"
            detail+="authentik=${idp_ready}/${idp_desired}; "
            had_issue=true
            status="FAIL"
        fi
    else
        if [[ "$had_issue" == false && "$QUIET" == true ]]; then
            section_always 40 "$heading"
        fi
        warn "goauthentik-server deployment not found in authentik namespace"
        detail+="authentik=missing; "
        had_issue=true
        # A missing auth deployment only downgrades to WARN; never mask FAIL.
        if [[ "$status" != "FAIL" ]]; then
            status="WARN"
        fi
    fi

    if [[ "$had_issue" == false ]]; then
        pass "Cloudflared + authentik-server at full replicas ($detail)"
    fi
    json_add "external_replicas" "$status" "$detail"
}

# --- 41. External Reachability: ExternalAccessDivergence Alert ---
# Queries the Prometheus alerts API and FAILs when at least one
# ExternalAccessDivergence alert is in state "firing", naming each target by
# its host label (falling back to service, then "?"). WARNs when the API
# cannot be queried or its reply cannot be parsed. Records one
# "external_divergence" JSON result and returns 0.
check_external_divergence() {
    local heading="External — ExternalAccessDivergence Alert"
    section 41 "$heading"
    local alerts result detail="" status="PASS"

    alerts=$($KUBECTL exec -n monitoring deploy/prometheus-server -- \
        wget -qO- "http://localhost:9090/api/v1/alerts" 2>/dev/null || true)

    if [[ -z "$alerts" ]]; then
        [[ "$QUIET" == true ]] && section_always 41 "$heading"
        warn "Cannot query Prometheus alerts"
        json_add "external_divergence" "WARN" "Cannot query"
        return 0
    fi

    # Prints "<count>:<comma-separated targets>", or "error:<reason>".
    result=$(python3 -c '
import json, sys
try:
    payload = json.load(sys.stdin)
    if isinstance(payload, dict):
        candidates = payload.get("data", {}).get("alerts", [])
    else:
        candidates = payload
    targets = []
    for alert in candidates:
        labels = alert.get("labels", {})
        if labels.get("alertname") == "ExternalAccessDivergence" and alert.get("state") == "firing":
            targets.append(labels.get("host") or labels.get("service") or "?")
    if targets:
        print(f"{len(targets)}:" + ",".join(targets))
    else:
        print("0:")
except Exception as exc:
    print(f"error:{exc}")
' <<< "$alerts" 2>/dev/null) || result="error:parse"

    if [[ "$result" == error:* ]]; then
        [[ "$QUIET" == true ]] && section_always 41 "$heading"
        warn "Failed to parse alerts JSON: ${result#error:}"
        json_add "external_divergence" "WARN" "Parse error"
        return 0
    fi

    # Text before the first colon is the count, the rest the target list.
    local count="${result%%:*}"
    local names="${result#*:}"

    if [[ "$count" -ne 0 ]]; then
        [[ "$QUIET" == true ]] && section_always 41 "$heading"
        fail "ExternalAccessDivergence firing for $count target(s): $names"
        status="FAIL"
        detail="$count firing: $names"
        json_add "external_divergence" "$status" "$detail"
    else
        pass "ExternalAccessDivergence not firing"
        json_add "external_divergence" "PASS" "Not firing"
    fi
}

# --- 43. PVE Host Thermals ---
# Grades the CPU package temperature of the PVE host (192.168.1.127).
#
# Reads the hwmon temperature sensors over a single SSH round-trip, extracts
# the "Package id" reading and the hottest "Core N" reading, and reports:
#   PASS  package < 65°C
#   WARN  package 65-82°C, SSH/hwmon read failure, or no package sensor found
#   FAIL  package >= 83°C
# Globals read: QUIET. Every outcome is also recorded via json_add under the
# "pve_thermals" key.
check_pve_thermals() {
    section 43 "PVE Host Thermals — Xeon E5-2699v4 package + per-core temps"
    local raw

    # Read all hwmon temp inputs in one SSH round-trip. Output: one line per
    # sensor, "<sensor_label> <celsius>". Falls back gracefully on missing
    # labels (Xeon coretemp driver exposes both `Package id 0` and `Core N`).
    #
    # hwmonN numbering follows driver probe order and can change across
    # reboots/kernel updates, so prefer the device whose `name` is coretemp
    # and only fall back to hwmon0 when none is found.
    raw=$(ssh -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no \
        root@192.168.1.127 '
        hwdir=/sys/class/hwmon/hwmon0
        for cand in /sys/class/hwmon/hwmon*; do
            if [[ "$(cat "$cand/name" 2>/dev/null)" == coretemp ]]; then
                hwdir=$cand
                break
            fi
        done
        cd "$hwdir" 2>/dev/null || exit 1
        for tfile in temp*_input; do
            [[ -e "$tfile" ]] || continue
            base=${tfile%_input}
            label=$(cat "${base}_label" 2>/dev/null || echo "$base")
            val=$(cat "$tfile" 2>/dev/null)
            [[ -n "$val" ]] && echo "$label $((val/1000))"
        done
        ' 2>/dev/null || true)

    if [[ -z "$raw" ]]; then
        [[ "$QUIET" == true ]] && section_always 43 "PVE Host Thermals"
        warn "Could not read hwmon temps from 192.168.1.127 (SSH BatchMode failed or path missing)"
        json_add "pve_thermals" "WARN" "SSH failed or hwmon path missing"
        return 0
    fi

    local pkg_temp max_core_temp="" max_core_label=""
    pkg_temp=$(awk '/^Package id/{print $NF; exit}' <<<"$raw")
    # One pass over the Core lines yields "<max temp> <label>"; both stay
    # empty when the output contains no Core sensors.
    read -r max_core_temp max_core_label < <(
        awk '/^Core/{if($NF>m){m=$NF; lbl=$1" "$2}} END{print m, lbl}' <<<"$raw"
    ) || true

    # Healthy baseline for this R730 (verified Apr 20-May 8 2026 from
    # Prometheus): peak 61-69°C, avg 51-55°C. Treat anything above 65°C
    # as a signal that some VM/workload is using too much CPU and warrants
    # investigation, even though the Xeon E5-2699v4 has TjMax=83°C /
    # Tcrit=93°C. This catches load creep early, well before throttling.
    #   PASS  < 65°C package    (within baseline 55-65 °C band)
    #   WARN  65-82°C package   (elevated — investigate top CPU consumer)
    #   FAIL  >= 83°C package   (at/above TjMax — throttling imminent)
    local detail="package=${pkg_temp}°C max_core=${max_core_temp}°C (${max_core_label})"
    if [[ -z "$pkg_temp" ]]; then
        [[ "$QUIET" == true ]] && section_always 43 "PVE Host Thermals"
        warn "Package temp not found in hwmon output"
        json_add "pve_thermals" "WARN" "$detail"
    elif [[ "$pkg_temp" -ge 83 ]]; then
        [[ "$QUIET" == true ]] && section_always 43 "PVE Host Thermals"
        fail "PVE package temp ${pkg_temp}°C >= TjMax (83°C) — throttling imminent. $detail"
        json_add "pve_thermals" "FAIL" "$detail"
    elif [[ "$pkg_temp" -ge 65 ]]; then
        [[ "$QUIET" == true ]] && section_always 43 "PVE Host Thermals"
        warn "PVE package temp ${pkg_temp}°C above baseline (>65°C) — some VM is using too much CPU; check top kvm processes. $detail"
        json_add "pve_thermals" "WARN" "$detail"
    else
        pass "PVE package ${pkg_temp}°C, hottest core ${max_core_temp}°C (${max_core_label}) — within 55-65°C baseline"
        json_add "pve_thermals" "PASS" "$detail"
    fi
}

# --- 44. PVE Host Load ---
# Grades the 5-minute load average of the PVE host (192.168.1.127).
#
# Reads /proc/loadavg over SSH and compares the integer part of the 5-minute
# figure against the thresholds below.
#   PASS  load_5 < 30
#   WARN  30 <= load_5 < 38, SSH failure, or unparseable output
#   FAIL  load_5 >= 38
# Globals read: QUIET. Every outcome is also recorded via json_add under the
# "pve_load" key.
check_pve_load() {
    section 44 "PVE Host Load — load avg vs 44-thread capacity"
    local raw load_1="" load_5="" load_15="" _rest=""

    raw=$(ssh -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no \
        root@192.168.1.127 'cat /proc/loadavg' 2>/dev/null || true)

    if [[ -z "$raw" ]]; then
        [[ "$QUIET" == true ]] && section_always 44 "PVE Host Load"
        warn "Could not read /proc/loadavg from 192.168.1.127"
        json_add "pve_load" "WARN" "SSH failed"
        return 0
    fi

    # The first three whitespace-separated fields of the first line are the
    # 1/5/15-minute averages; anything after them is ignored.
    read -r load_1 load_5 load_15 _rest <<<"$raw" || true

    # Refuse to grade anything that is not a plain decimal number (e.g. a
    # login banner leaking into the output) instead of erroring out later.
    local number_re='^[0-9]+(\.[0-9]+)?$'
    if [[ ! "$load_5" =~ $number_re ]]; then
        [[ "$QUIET" == true ]] && section_always 44 "PVE Host Load"
        warn "Unexpected /proc/loadavg output from 192.168.1.127: $raw"
        json_add "pve_load" "WARN" "Unparseable loadavg"
        return 0
    fi

    # Truncate (not round) load_5 for integer comparison, so 37.60 stays in
    # the WARN band and 29.50 stays PASS. Parameter expansion avoids both a
    # bc dependency and printf's locale-dependent float parsing.
    local load_5_int=${load_5%%.*}

    # R730: 44 hw threads (22c × HT). Healthy avg ~ 15-22 (~30-50% utilisation
    # of thread count). Warn when sustained 5-min above 30 (~70% threads
    # busy). Fail when 5-min above 38 (~85% — close to scheduler saturation).
    #   PASS  load_5 < 30
    #   WARN  30 <= load_5 < 38
    #   FAIL  load_5 >= 38
    local detail="1m=${load_1} 5m=${load_5} 15m=${load_15}"
    if [[ "$load_5_int" -ge 38 ]]; then
        [[ "$QUIET" == true ]] && section_always 44 "PVE Host Load"
        fail "PVE 5-min load ${load_5} >= 38 of 44 threads — saturation. $detail"
        json_add "pve_load" "FAIL" "$detail"
    elif [[ "$load_5_int" -ge 30 ]]; then
        [[ "$QUIET" == true ]] && section_always 44 "PVE Host Load"
        warn "PVE 5-min load ${load_5} in warn band (30-37 of 44 threads). $detail"
        json_add "pve_load" "WARN" "$detail"
    else
        pass "PVE load avg $detail (< 30/44 threads)"
        json_add "pve_load" "PASS" "$detail"
    fi
}

# --- 42. External Reachability: Traefik 5xx Rate ---
# Grades the 15-minute 5xx request rate of Traefik services.
#
# Runs a topk(10, rate(traefik_service_requests_total{code=~"5.."}[15m]))
# instant query against Prometheus (wget exec-ed inside
# deploy/prometheus-server, namespace monitoring) and keeps the services
# whose rate exceeds 0.01 req/s.
#   PASS  no service above 0.01 req/s
#   WARN  some service above 0.01 req/s, or the query/parse failed
#   FAIL  the busiest service exceeds 1 req/s
# Globals read: KUBECTL, QUIET. Every outcome is also recorded via json_add
# under the "external_traefik_5xx" key.
check_external_traefik_5xx() {
    local title="External — Traefik 5xx Rate (15m)"
    section 42 "$title"
    local query_result detail="" status="PASS"

    # Any exec/wget failure is swallowed; an empty result is handled below.
    query_result=$($KUBECTL exec -n monitoring deploy/prometheus-server -- \
        wget -qO- 'http://localhost:9090/api/v1/query?query=topk(10,rate(traefik_service_requests_total{code=~%225..%22}%5B15m%5D))' 2>/dev/null || true)

    if [[ -z "$query_result" ]]; then
        [[ "$QUIET" == true ]] && section_always 42 "$title"
        warn "Cannot query Prometheus for traefik 5xx rate"
        json_add "external_traefik_5xx" "WARN" "Query failed"
        return 0
    fi

    # The helper prints "<count>:<top rate>:<top-5 summary>" on success and
    # "error:<message>" on failure. The top rate gets its own field so it
    # never has to be scraped back out of the human-readable summary, where
    # a service name such as "api-v2.0/stable" would be mistaken for a rate.
    local parsed
    parsed=$(printf '%s\n' "$query_result" | python3 -c '
import json, sys
try:
    data = json.load(sys.stdin)
    results = data.get("data", {}).get("result", [])
    hot = [(r.get("metric", {}).get("service", "?"), float(r.get("value", [0, "0"])[1])) for r in results]
    hot = [(s, v) for s, v in hot if v > 0.01]  # ignore services at or below 0.01 req/s
    hot.sort(key=lambda x: -x[1])
    if not hot:
        print("0:0:")
    else:
        top = [f"{s}={v:.2f}/s" for s, v in hot[:5]]
        print(f"{len(hot)}:{hot[0][1]:.2f}:" + "; ".join(top))
except Exception as e:
    print(f"error:{e}")
' 2>/dev/null) || parsed="error:parse"

    if [[ "$parsed" == error:* ]]; then
        [[ "$QUIET" == true ]] && section_always 42 "$title"
        warn "Parse failed: ${parsed#error:}"
        json_add "external_traefik_5xx" "WARN" "Parse error"
        return 0
    fi

    # Split on the first two colons only; the summary keeps any later ones.
    local count="" top_rate="" top=""
    IFS=: read -r count top_rate top <<<"$parsed" || true

    if [[ "$count" -eq 0 ]]; then
        pass "No Traefik services with 5xx rate >0.01 req/s (last 15m)"
        json_add "external_traefik_5xx" "PASS" "None above threshold"
    else
        [[ "$QUIET" == true ]] && section_always 42 "$title"
        # WARN at any 5xx; FAIL if top service >1 req/s. The rate is passed
        # with -v so it is treated as data, never as awk program text.
        if awk -v rate="$top_rate" 'BEGIN { exit !(rate + 0 > 1.0) }' 2>/dev/null; then
            fail "$count Traefik service(s) with elevated 5xx: $top"
            status="FAIL"
        else
            warn "$count Traefik service(s) emitting 5xx: $top"
            status="WARN"
        fi
        detail="$count services: $top"
        json_add "external_traefik_5xx" "$status" "$detail"
    fi
}

# --- Summary ---
# Prints the final report.
#
# In JSON mode ($JSON == true) emits one JSON object with a UTC timestamp,
# the pass/warn/fail counters and the accumulated JSON_RESULTS entries, then
# returns. Otherwise prints a coloured banner with the three counters and an
# overall verdict: UNHEALTHY if anything failed, DEGRADED if anything warned,
# HEALTHY otherwise.
# Globals read: JSON, JSON_RESULTS, PASS_COUNT, WARN_COUNT, FAIL_COUNT and
# the colour variables.
print_summary() {
    if [[ "$JSON" == true ]]; then
        # Both loop helpers are local so nothing leaks into the caller's scope.
        local entry separator=""
        echo "{"
        echo "  \"timestamp\": \"$(date -u +%Y-%m-%dT%H:%M:%SZ)\","
        echo "  \"pass\": $PASS_COUNT,"
        echo "  \"warn\": $WARN_COUNT,"
        echo "  \"fail\": $FAIL_COUNT,"
        echo "  \"checks\": ["
        # The ${arr[@]+...} form keeps an empty array from tripping `set -u`
        # on bash releases older than 4.4.
        for entry in ${JSON_RESULTS[@]+"${JSON_RESULTS[@]}"}; do
            # Every entry after the first is prefixed with a comma.
            printf '    %s%s\n' "$separator" "$entry"
            separator=","
        done
        echo "  ]"
        echo "}"
        return 0
    fi

    echo ""
    echo -e "${BOLD}═══════════════════════════════════════${NC}"
    echo -e "${BOLD}  Cluster Health Summary${NC}"
    echo -e "${BOLD}═══════════════════════════════════════${NC}"
    echo -e "  ${GREEN}PASS${NC}: $PASS_COUNT    ${YELLOW}WARN${NC}: $WARN_COUNT    ${RED}FAIL${NC}: $FAIL_COUNT"
    echo ""

    if [[ "$FAIL_COUNT" -gt 0 ]]; then
        echo -e "  Overall: ${RED}UNHEALTHY${NC}"
    elif [[ "$WARN_COUNT" -gt 0 ]]; then
        echo -e "  Overall: ${YELLOW}DEGRADED${NC}"
    else
        echo -e "  Overall: ${GREEN}HEALTHY${NC}"
    fi
    echo ""
}

# --- Main ---
# Entry point: parses the command line, prints the banner (suppressed in JSON
# mode), runs every check in ascending section order, prints the summary and
# exits with a status that reflects the worst outcome:
#   0  no warnings or failures
#   1  at least one warning, no failures
#   2  at least one failure
main() {
    parse_args "$@"

    if [[ "$JSON" != true ]]; then
        echo -e "${BOLD}Cluster Health Check${NC} — $(date '+%Y-%m-%d %H:%M:%S')"
        echo -e "Kubeconfig: $KUBECONFIG_PATH"
        if [[ "$FIX" == true ]]; then
            echo -e "${YELLOW}Auto-fix mode enabled${NC}"
        fi
    fi

    check_nodes
    check_resources
    check_conditions
    check_pods
    check_evicted
    check_daemonsets
    check_deployments
    check_pvcs
    check_hpa
    check_cronjobs
    check_crowdsec
    check_ingresses
    check_alerts
    check_uptime_kuma
    check_resourcequota
    check_statefulsets
    check_node_disk
    check_helm_releases
    check_kyverno
    check_nfs
    check_dns
    check_tls_certs
    check_gpu
    check_cloudflare_tunnel
    check_overcommit
    check_ha_entities
    check_ha_integrations
    check_ha_automations
    check_ha_system
    check_hardware_exporters
    check_cert_manager_certificates
    check_cert_manager_expiry
    check_cert_manager_requests
    check_backup_per_db
    check_backup_offsite_sync
    check_backup_lvm_snapshots
    check_monitoring_prom_am
    check_monitoring_vault
    check_monitoring_css
    check_external_replicas
    check_external_divergence
    # Section 42 runs before the PVE checks (43, 44) so the numbered report
    # is printed in ascending order.
    check_external_traefik_5xx
    check_pve_thermals
    check_pve_load
    print_summary

    # Exit code: 2 for failures, 1 for warnings, 0 for clean
    if [[ "$FAIL_COUNT" -gt 0 ]]; then
        exit 2
    elif [[ "$WARN_COUNT" -gt 0 ]]; then
        exit 1
    fi
    exit 0
}

main "$@"