infra/.claude/cluster-health.sh
Viktor Barzin 24a23709a5 fix: update healthcheck to report internal and external monitors separately
- Increase Uptime Kuma API timeout to 120s with wait_events=0.2
- Remove hardcoded password, use Vault or UPTIME_KUMA_PASSWORD env var
- Report internal and external monitor status separately
- Install uptime-kuma-api in local venv

[ci skip]

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-14 19:44:20 +00:00

#!/usr/bin/env bash
# Cluster health check script (pod-compatible version).
# Runs 26 diagnostic checks against the Kubernetes cluster and prints
# a colour-coded report with PASS / WARN / FAIL for each section.
# Optionally posts results to Slack.
#
# Usage: ./cluster-health.sh [--fix] [--quiet|-q] [--json] [--kubeconfig <path>] [--no-slack]
#
# Environment:
# KUBECONFIG — path to kubeconfig (used in pod environment)
# SLACK_WEBHOOK_URL — Slack incoming webhook URL (required unless --no-slack)
# UPTIME_KUMA_PASSWORD — Uptime Kuma admin password
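#
# Examples (illustrative):
# ./cluster-health.sh --quiet --no-slack # local run, show issues only
# ./cluster-health.sh --json | jq .fail # machine-readable fail count (assumes jq)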
set -euo pipefail
# --- Colors ---
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
BLUE='\033[0;34m'
BOLD='\033[1m'
NC='\033[0m'
# --- Globals ---
PASS_COUNT=0
WARN_COUNT=0
FAIL_COUNT=0
FIX=false
QUIET=false
JSON=false
SEND_SLACK=true
KUBECONFIG_PATH="${KUBECONFIG:-$(pwd)/config}"
KUBECTL=""
JSON_RESULTS=()
TOTAL_CHECKS=26
# --- Helpers ---
info() { [[ "$JSON" == true ]] && return 0; echo -e "${BLUE}[INFO]${NC} $*"; }
pass() { PASS_COUNT=$((PASS_COUNT + 1)); [[ "$JSON" == true ]] && return 0; [[ "$QUIET" == true ]] && return 0; echo -e " ${GREEN}[PASS]${NC} $*"; }
warn() { WARN_COUNT=$((WARN_COUNT + 1)); [[ "$JSON" == true ]] && return 0; echo -e " ${YELLOW}[WARN]${NC} $*"; }
fail() { FAIL_COUNT=$((FAIL_COUNT + 1)); [[ "$JSON" == true ]] && return 0; echo -e " ${RED}[FAIL]${NC} $*"; }
section() {
local num="$1" title="$2"
[[ "$JSON" == true ]] && return 0
[[ "$QUIET" == true ]] && return 0
echo ""
echo -e "${BOLD}[$num/$TOTAL_CHECKS] $title${NC}"
}
section_always() {
local num="$1" title="$2"
[[ "$JSON" == true ]] && return 0
echo ""
echo -e "${BOLD}[$num/$TOTAL_CHECKS] $title${NC}"
}
json_add() {
local name="$1" status="$2" detail="$3"
local escaped
escaped=$(echo "$detail" | python3 -c 'import json,sys; print(json.dumps(sys.stdin.read().strip()))')
JSON_RESULTS+=("{\"check\":\"$name\",\"status\":\"$status\",\"detail\":$escaped}")
}
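# e.g. json_add "dns" "PASS" "Both resolve" appends:
# {"check":"dns","status":"PASS","detail":"Both resolve"}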
# count lines in a variable, returning 0 for empty strings
count_lines() {
local input="$1"
if [[ -z "$input" ]]; then
echo 0
else
echo "$input" | wc -l | tr -d ' '
fi
}
# --- Argument parsing ---
parse_args() {
while [[ $# -gt 0 ]]; do
case "$1" in
--fix) FIX=true; shift ;;
--quiet|-q) QUIET=true; shift ;;
--json) JSON=true; shift ;;
--no-slack) SEND_SLACK=false; shift ;;
--kubeconfig) KUBECONFIG_PATH="$2"; shift 2 ;;
-h|--help)
echo "Usage: $0 [--fix] [--quiet|-q] [--json] [--kubeconfig <path>] [--no-slack]"
echo ""
echo "Flags:"
echo " --fix Auto-remediate safe issues (delete evicted/CrashLoopBackOff pods)"
echo " --quiet, -q Only show WARN and FAIL sections"
echo " --json Machine-readable JSON output"
echo " --kubeconfig PATH Override kubeconfig (default: \$KUBECONFIG or \$(pwd)/config)"
echo " --no-slack Skip Slack notification"
exit 0
;;
*)
echo "Unknown option: $1" >&2
exit 1
;;
esac
done
KUBECTL="kubectl --kubeconfig $KUBECONFIG_PATH"
# Auto-source UPTIME_KUMA_PASSWORD from terraform.tfvars if not set
if [[ -z "${UPTIME_KUMA_PASSWORD:-}" ]]; then
local script_dir tfvars_file
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
tfvars_file="${script_dir}/../terraform.tfvars"
if [[ -f "$tfvars_file" ]]; then
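# Expects a tfvars line of the form: uptime_kuma_password = "..."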
UPTIME_KUMA_PASSWORD=$(grep 'uptime_kuma_password' "$tfvars_file" | head -1 | sed 's/.*= *"\(.*\)"/\1/')
export UPTIME_KUMA_PASSWORD
fi
fi
}
# --- 1. Node Status ---
check_nodes() {
section 1 "Node Status"
local nodes not_ready versions unique_versions detail=""
nodes=$($KUBECTL get nodes --no-headers 2>&1) || { fail "Cannot reach cluster"; json_add "node_status" "FAIL" "Cannot reach cluster"; return 0; }
not_ready=$(echo "$nodes" | awk '$2 != "Ready" {print $1}' || true)
versions=$(echo "$nodes" | awk '{print $5}' | sort -u)
unique_versions=$(echo "$versions" | wc -l | tr -d ' ')
if [[ -n "$not_ready" ]]; then
[[ "$QUIET" == true ]] && section_always 1 "Node Status"
fail "NotReady nodes: $not_ready"
detail="NotReady: $not_ready"
json_add "node_status" "FAIL" "$detail"
elif [[ "$unique_versions" -gt 1 ]]; then
[[ "$QUIET" == true ]] && section_always 1 "Node Status"
warn "Version mismatch across nodes: $(echo "$versions" | tr '\n' ' ')"
detail="Version mismatch: $(echo "$versions" | tr '\n' ' ')"
json_add "node_status" "WARN" "$detail"
else
pass "All nodes Ready, version $(echo "$versions" | head -1)"
detail="All nodes Ready"
json_add "node_status" "PASS" "$detail"
fi
}
# --- 2. Node Resources ---
check_resources() {
section 2 "Node Resources"
local top detail="" had_issue=false status="PASS"
top=$($KUBECTL top nodes --no-headers 2>&1) || { fail "metrics-server unavailable"; json_add "node_resources" "FAIL" "metrics-server unavailable"; return 0; }
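# `kubectl top nodes` columns: NAME CPU(cores) CPU% MEMORY(bytes) MEMORY%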
while IFS= read -r line; do
local node cpu_pct mem_pct
node=$(echo "$line" | awk '{print $1}')
cpu_pct=$(echo "$line" | awk '{print $3}' | tr -d '%')
mem_pct=$(echo "$line" | awk '{print $5}' | tr -d '%')
# Skip nodes where metrics are not yet available
if [[ "$cpu_pct" == *"unknown"* ]] || [[ "$mem_pct" == *"unknown"* ]]; then
detail+="$node metrics unavailable; "
continue
fi
if [[ "$cpu_pct" -gt 90 ]] || [[ "$mem_pct" -gt 90 ]]; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 2 "Node Resources"
fail "$node: CPU ${cpu_pct}%, Mem ${mem_pct}%"
detail+="$node CPU=${cpu_pct}% Mem=${mem_pct}% [FAIL]; "
had_issue=true
status="FAIL"
elif [[ "$cpu_pct" -gt 80 ]] || [[ "$mem_pct" -gt 80 ]]; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 2 "Node Resources"
warn "$node: CPU ${cpu_pct}%, Mem ${mem_pct}%"
detail+="$node CPU=${cpu_pct}% Mem=${mem_pct}% [WARN]; "
had_issue=true
[[ "$status" != "FAIL" ]] && status="WARN"
else
detail+="$node CPU=${cpu_pct}% Mem=${mem_pct}% [OK]; "
fi
done <<< "$top"
[[ "$had_issue" == false ]] && pass "All nodes below 80% CPU and memory"
json_add "node_resources" "$status" "$detail"
}
# --- 3. Node Conditions ---
check_conditions() {
section 3 "Node Conditions"
local conditions detail=""
conditions=$($KUBECTL get nodes -o json | python3 -c '
import json, sys
data = json.load(sys.stdin)
for node in data["items"]:
name = node["metadata"]["name"]
for c in node["status"]["conditions"]:
if c["type"] in ("MemoryPressure","DiskPressure","PIDPressure") and c["status"] == "True":
print(name + ": " + c["type"])
' 2>&1) || true
if [[ -n "$conditions" ]]; then
[[ "$QUIET" == true ]] && section_always 3 "Node Conditions"
while IFS= read -r line; do
fail "$line"
done <<< "$conditions"
detail="$conditions"
json_add "node_conditions" "FAIL" "$detail"
else
pass "No pressure conditions on any node"
json_add "node_conditions" "PASS" "No pressure conditions"
fi
}
# --- 4. Problematic Pods ---
check_pods() {
section 4 "Problematic Pods"
local bad count detail="" status="PASS"
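# Merge two listings: the field selector misses CrashLoopBackOff pods (their
# phase is still Running), so a plain listing is grepped too, then duplicate
# namespace/pod pairs are dropped.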
bad=$( {
$KUBECTL get pods -A --no-headers --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null \
| grep -E 'CrashLoopBackOff|Error|Pending|Init:|ImagePullBackOff|ErrImagePull' || true
$KUBECTL get pods -A --no-headers 2>/dev/null \
| grep -E 'CrashLoopBackOff|ImagePullBackOff|ErrImagePull' || true
} | awk '!seen[$1,$2]++' | sed '/^$/d') || true
count=$(count_lines "$bad")
# Auto-fix CrashLoopBackOff pods with >10 restarts when --fix is enabled
if [[ "$FIX" == true && "$count" -gt 0 ]]; then
local fixed_count=0
while IFS= read -r line; do
[[ -z "$line" ]] && continue
local ns pod pod_status restarts restarts_clean
ns=$(echo "$line" | awk '{print $1}')
pod=$(echo "$line" | awk '{print $2}')
pod_status=$(echo "$line" | awk '{print $4}')
restarts=$(echo "$line" | awk '{print $5}')
restarts_clean=$(echo "$restarts" | grep -oE '^[0-9]+' || echo "0")
if [[ "$pod_status" == "CrashLoopBackOff" ]] && [[ "$restarts_clean" -gt 10 ]]; then
info "Deleting CrashLoopBackOff pod $ns/$pod (restarts: $restarts_clean)"
$KUBECTL delete pod -n "$ns" "$pod" --grace-period=0 2>/dev/null || true
fixed_count=$((fixed_count + 1))
fi
done <<< "$bad"
if [[ "$fixed_count" -gt 0 ]]; then
info "Deleted $fixed_count CrashLoopBackOff pod(s) with >10 restarts"
fi
fi
if [[ "$count" -eq 0 ]]; then
pass "No problematic pods"
detail="None"
elif [[ "$count" -le 10 ]]; then
[[ "$QUIET" == true ]] && section_always 4 "Problematic Pods"
warn "$count problematic pod(s):"
[[ "$JSON" != true ]] && echo "$bad" | while IFS= read -r line; do echo " $line"; done
detail="$count pods"
status="WARN"
else
[[ "$QUIET" == true ]] && section_always 4 "Problematic Pods"
fail "$count problematic pods (showing first 10):"
[[ "$JSON" != true ]] && echo "$bad" | head -10 | while IFS= read -r line; do echo " $line"; done
detail="$count pods"
status="FAIL"
fi
json_add "problematic_pods" "$status" "$detail"
}
# --- 5. Evicted/Failed Pods ---
check_evicted() {
section 5 "Evicted/Failed Pods"
local evicted count detail="" status="PASS"
evicted=$($KUBECTL get pods -A --no-headers --field-selector=status.phase=Failed 2>/dev/null || true)
count=$(count_lines "$evicted")
if [[ "$count" -eq 0 ]]; then
pass "No evicted or failed pods"
detail="0"
elif [[ "$count" -le 50 ]]; then
[[ "$QUIET" == true ]] && section_always 5 "Evicted/Failed Pods"
warn "$count evicted/failed pod(s)"
detail="$count pods"
status="WARN"
else
[[ "$QUIET" == true ]] && section_always 5 "Evicted/Failed Pods"
fail "$count evicted/failed pods"
detail="$count pods"
status="FAIL"
fi
if [[ "$FIX" == true && "$count" -gt 0 ]]; then
info "Deleting $count evicted/failed pods..."
$KUBECTL delete pods -A --field-selector=status.phase=Failed 2>/dev/null || true
info "Deleted evicted/failed pods"
fi
json_add "evicted_pods" "$status" "$detail"
}
# --- 6. DaemonSets ---
check_daemonsets() {
section 6 "DaemonSets"
local ds detail="" had_issue=false
ds=$($KUBECTL get daemonsets -A --no-headers 2>&1) || { fail "Cannot list DaemonSets"; json_add "daemonsets" "FAIL" "Cannot list"; return 0; }
while IFS= read -r line; do
local ns name desired ready
ns=$(echo "$line" | awk '{print $1}')
name=$(echo "$line" | awk '{print $2}')
desired=$(echo "$line" | awk '{print $3}')
ready=$(echo "$line" | awk '{print $5}')
if [[ "$desired" != "$ready" ]]; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 6 "DaemonSets"
fail "$ns/$name: desired=$desired ready=$ready"
detail+="$ns/$name desired=$desired ready=$ready; "
had_issue=true
fi
done <<< "$ds"
if [[ "$had_issue" == false ]]; then
pass "All DaemonSets healthy (desired == ready)"
json_add "daemonsets" "PASS" "All healthy"
else
json_add "daemonsets" "FAIL" "$detail"
fi
}
# --- 7. Deployments ---
check_deployments() {
section 7 "Deployments"
local deps detail="" had_issue=false
deps=$($KUBECTL get deployments -A --no-headers 2>&1) || { fail "Cannot list Deployments"; json_add "deployments" "FAIL" "Cannot list"; return 0; }
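# READY column is "current/desired", e.g. "2/3"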
while IFS= read -r line; do
local ns name ready current desired
ns=$(echo "$line" | awk '{print $1}')
name=$(echo "$line" | awk '{print $2}')
ready=$(echo "$line" | awk '{print $3}')
current=$(echo "$ready" | cut -d/ -f1)
desired=$(echo "$ready" | cut -d/ -f2)
if [[ "$current" != "$desired" ]]; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 7 "Deployments"
fail "$ns/$name: $current/$desired ready"
detail+="$ns/$name $current/$desired; "
had_issue=true
fi
done <<< "$deps"
if [[ "$had_issue" == false ]]; then
pass "All deployments fully available"
json_add "deployments" "PASS" "All available"
else
json_add "deployments" "FAIL" "$detail"
fi
}
# --- 8. PVC Status ---
check_pvcs() {
section 8 "PVC Status"
local pvcs detail="" had_issue=false
pvcs=$($KUBECTL get pvc -A --no-headers 2>&1) || true
if [[ -z "$pvcs" || "$pvcs" == *"No resources found"* ]]; then
pass "No PVCs in cluster"
json_add "pvcs" "PASS" "No PVCs"
return 0
fi
while IFS= read -r line; do
local ns name status
ns=$(echo "$line" | awk '{print $1}')
name=$(echo "$line" | awk '{print $2}')
status=$(echo "$line" | awk '{print $3}')
if [[ "$status" != "Bound" ]]; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 8 "PVC Status"
fail "$ns/$name: $status"
detail+="$ns/$name=$status; "
had_issue=true
fi
done <<< "$pvcs"
if [[ "$had_issue" == false ]]; then
pass "All PVCs Bound"
json_add "pvcs" "PASS" "All Bound"
else
json_add "pvcs" "FAIL" "$detail"
fi
}
# --- 9. HPA Health ---
check_hpa() {
section 9 "HPA Health"
local hpas detail="" had_issue=false status="PASS"
hpas=$($KUBECTL get hpa -A --no-headers 2>&1) || true
if [[ -z "$hpas" || "$hpas" == *"No resources found"* ]]; then
pass "No HPAs configured"
json_add "hpa" "PASS" "No HPAs"
return 0
fi
while IFS= read -r line; do
local ns name targets
ns=$(echo "$line" | awk '{print $1}')
name=$(echo "$line" | awk '{print $2}')
targets=$(echo "$line" | awk '{print $3}')
if echo "$targets" | grep -q '<unknown>'; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 9 "HPA Health"
fail "$ns/$name: targets=$targets (unknown metrics)"
detail+="$ns/$name=unknown; "
had_issue=true
status="FAIL"
else
# Parse percentage values from targets like "45%/80%, 30%/50%"
local pcts
pcts=$(echo "$targets" | grep -oE '[0-9]+%/' | tr -d '%/' || true)
if [[ -n "$pcts" ]]; then
while IFS= read -r pct; do
[[ -z "$pct" ]] && continue
if [[ "$pct" -gt 150 ]]; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 9 "HPA Health"
fail "$ns/$name: utilization at ${pct}%"
detail+="$ns/$name=${pct}%; "
had_issue=true
status="FAIL"
break
elif [[ "$pct" -gt 100 ]]; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 9 "HPA Health"
warn "$ns/$name: utilization at ${pct}%"
detail+="$ns/$name=${pct}%; "
had_issue=true
[[ "$status" != "FAIL" ]] && status="WARN"
break
fi
done <<< "$pcts"
fi
fi
done <<< "$hpas"
[[ "$had_issue" == false ]] && pass "All HPAs healthy"
json_add "hpa" "$status" "${detail:-All healthy}"
}
# --- 10. CronJob Failures ---
check_cronjobs() {
section 10 "CronJob Failures"
local failures detail=""
failures=$($KUBECTL get jobs -A -o json 2>/dev/null | python3 -c '
import json, sys
from datetime import datetime, timezone, timedelta
data = json.load(sys.stdin)
cutoff = datetime.now(timezone.utc) - timedelta(hours=24)
for job in data.get("items", []):
meta = job.get("metadata", {})
ns = meta.get("namespace", "")
name = meta.get("name", "")
owners = meta.get("ownerReferences", [])
is_cronjob = any(o.get("kind") == "CronJob" for o in owners)
if not is_cronjob:
continue
conditions = job.get("status", {}).get("conditions", [])
for c in conditions:
if c.get("type") == "Failed" and c.get("status") == "True":
ts = c.get("lastTransitionTime", "")
if ts:
try:
t = datetime.fromisoformat(ts.replace("Z", "+00:00"))
if t > cutoff:
print(f"{ns}/{name}: {c.get(\"reason\", \"Unknown\")}")
except:
print(f"{ns}/{name}: {c.get(\"reason\", \"Unknown\")}")
' 2>/dev/null) || true
if [[ -z "$failures" ]]; then
pass "No CronJob failures in last 24h"
json_add "cronjob_failures" "PASS" "None"
else
[[ "$QUIET" == true ]] && section_always 10 "CronJob Failures"
local count
count=$(count_lines "$failures")
fail "$count CronJob failure(s) in last 24h:"
[[ "$JSON" != true ]] && echo "$failures" | while IFS= read -r line; do echo " $line"; done
json_add "cronjob_failures" "FAIL" "$count failures"
fi
}
# --- 11. CrowdSec ---
check_crowdsec() {
section 11 "CrowdSec Agents"
local cs_pods not_running
cs_pods=$($KUBECTL get pods -n crowdsec --no-headers 2>/dev/null || true)
if [[ -z "$cs_pods" ]]; then
[[ "$QUIET" == true ]] && section_always 11 "CrowdSec Agents"
warn "CrowdSec namespace not found or empty"
json_add "crowdsec" "WARN" "No CrowdSec pods found"
return 0
fi
not_running=$(echo "$cs_pods" | awk '$3 != "Running" && $3 != "Completed" {print $1 ": " $3}' || true)
if [[ -n "$not_running" ]]; then
[[ "$QUIET" == true ]] && section_always 11 "CrowdSec Agents"
while IFS= read -r line; do
fail "CrowdSec pod not running: $line"
done <<< "$not_running"
json_add "crowdsec" "FAIL" "$not_running"
else
local total
total=$(count_lines "$cs_pods")
pass "All $total CrowdSec pods running"
json_add "crowdsec" "PASS" "$total pods running"
fi
}
# --- 12. Ingress ---
check_ingresses() {
section 12 "Ingress Routes"
local ingresses no_lb detail="" had_issue=false
ingresses=$($KUBECTL get ingress -A --no-headers 2>/dev/null || true)
if [[ -n "$ingresses" ]]; then
no_lb=$(echo "$ingresses" | awk '{if ($5 == "" || $5 == "<none>") print $1"/"$2}' || true)
if [[ -n "$no_lb" ]]; then
[[ "$QUIET" == true ]] && section_always 12 "Ingress Routes"
while IFS= read -r line; do
fail "Ingress missing LB IP: $line"
done <<< "$no_lb"
detail="Missing LB: $no_lb"
had_issue=true
fi
fi
# Check Traefik LB service
local traefik_svc_ip
traefik_svc_ip=$($KUBECTL get svc -n traefik traefik -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || true)
if [[ -z "$traefik_svc_ip" ]]; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 12 "Ingress Routes"
fail "Traefik LoadBalancer has no external IP"
detail+="Traefik LB missing IP; "
had_issue=true
else
detail+="Traefik LB=$traefik_svc_ip; "
fi
if [[ "$had_issue" == false ]]; then
pass "All ingresses have LB assignment (Traefik LB=$traefik_svc_ip)"
json_add "ingresses" "PASS" "$detail"
else
json_add "ingresses" "FAIL" "$detail"
fi
}
# --- 13. Prometheus Alerts ---
check_alerts() {
section 13 "Prometheus Alerts"
local alerts firing_count
# Try alertmanager first, then prometheus server
alerts=$($KUBECTL exec -n monitoring deploy/prometheus-alertmanager -- \
wget -q -O- http://localhost:9093/api/v2/alerts 2>/dev/null || true)
if [[ -z "$alerts" ]]; then
alerts=$($KUBECTL exec -n monitoring deploy/prometheus-server -- \
wget -q -O- http://localhost:9090/api/v1/alerts 2>/dev/null || true)
fi
if [[ -z "$alerts" ]]; then
[[ "$QUIET" == true ]] && section_always 13 "Prometheus Alerts"
warn "Could not query Prometheus/Alertmanager"
json_add "prometheus_alerts" "WARN" "Cannot query"
return 0
fi
firing_count=$(echo "$alerts" | python3 -c '
import json, sys
try:
data = json.load(sys.stdin)
if isinstance(data, list):
active = [a for a in data if a.get("status", {}).get("state") == "active"]
count = len(active)
names = [a.get("labels", {}).get("alertname", "?") for a in active]
print(f"{count}:" + ",".join(names) if count > 0 else "0:")
elif isinstance(data, dict) and "data" in data:
alerts_list = data["data"].get("alerts", [])
firing = [a for a in alerts_list if a.get("state") == "firing"]
count = len(firing)
names = [a.get("labels", {}).get("alertname", "?") for a in firing]
print(f"{count}:" + ",".join(names) if count > 0 else "0:")
else:
print("0:")
except:
print("-1:")
' 2>/dev/null || echo "-1:")
local count names
count=$(echo "$firing_count" | cut -d: -f1)
names=$(echo "$firing_count" | cut -d: -f2-)
if [[ "$count" == "-1" ]]; then
[[ "$QUIET" == true ]] && section_always 13 "Prometheus Alerts"
warn "Failed to parse alert data"
json_add "prometheus_alerts" "WARN" "Parse error"
elif [[ "$count" -eq 0 ]]; then
pass "No firing alerts"
json_add "prometheus_alerts" "PASS" "0 firing"
elif [[ "$count" -le 3 ]]; then
[[ "$QUIET" == true ]] && section_always 13 "Prometheus Alerts"
warn "$count firing alert(s): $names"
json_add "prometheus_alerts" "WARN" "$count firing: $names"
else
[[ "$QUIET" == true ]] && section_always 13 "Prometheus Alerts"
fail "$count firing alerts: $names"
json_add "prometheus_alerts" "FAIL" "$count firing: $names"
fi
}
# --- 14. Uptime Kuma ---
check_uptime_kuma() {
section 14 "Uptime Kuma Monitors"
local result
result=$(python3 -c '
import sys, os
try:
from uptime_kuma_api import UptimeKumaApi
except ImportError:
print("ERROR:uptime-kuma-api not installed")
sys.exit(0)
try:
password = os.environ.get("UPTIME_KUMA_PASSWORD", "")
if not password:
print("ERROR:UPTIME_KUMA_PASSWORD not set")
sys.exit(0)
api = UptimeKumaApi("https://uptime.viktorbarzin.me", timeout=120, wait_events=0.2)
api.login("admin", password)
monitors = api.get_monitors()
heartbeats = api.get_heartbeats()
internal_up = 0
internal_down = []
external_up = 0
external_down = []
paused_count = 0
for m in monitors:
mid = m.get("id")
name = m.get("name", "unknown")
active = m.get("active", True)
is_external = name.startswith("[External] ")
if not active:
paused_count += 1
continue
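# get_heartbeats() maps monitor id -> recent beats; status 1 means up
# (newer library versions return an enum, unwrapped via .value below)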
beats = heartbeats.get(mid, [])
if beats:
last_beat = beats[-1]
if isinstance(last_beat, list):
last_beat = last_beat[-1] if last_beat else {}
status = last_beat.get("status", 0) if isinstance(last_beat, dict) else 0
if hasattr(status, "value"):
status = status.value
is_up = (status == 1)
else:
is_up = False
if is_external:
if is_up:
external_up += 1
else:
external_down.append(name.replace("[External] ", ""))
else:
if is_up:
internal_up += 1
else:
internal_down.append(name)
api.disconnect()
int_down_names = ", ".join(internal_down) if internal_down else ""
ext_down_names = ", ".join(external_down) if external_down else ""
print(f"{len(internal_down)}:{internal_up}:{len(external_down)}:{external_up}:{paused_count}:{int_down_names}|{ext_down_names}")
except Exception as e:
print(f"CONN_ERROR:{e}")
' 2>/dev/null) || result="CONN_ERROR:python execution failed"
if [[ "$result" == "ERROR:"* ]]; then
[[ "$QUIET" == true ]] && section_always 14 "Uptime Kuma Monitors"
warn "Uptime Kuma: ${result#ERROR:}"
json_add "uptime_kuma" "WARN" "${result#ERROR:}"
elif [[ "$result" == "CONN_ERROR:"* ]]; then
[[ "$QUIET" == true ]] && section_always 14 "Uptime Kuma Monitors"
warn "Cannot connect to Uptime Kuma: ${result#CONN_ERROR:}"
json_add "uptime_kuma" "WARN" "Connection failed"
else
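# Success line format (printed by the python above):
# int_down:int_up:ext_down:ext_up:paused:int_names|ext_names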
local int_down int_up ext_down ext_up paused_count down_details
int_down=$(echo "$result" | cut -d: -f1)
int_up=$(echo "$result" | cut -d: -f2)
ext_down=$(echo "$result" | cut -d: -f3)
ext_up=$(echo "$result" | cut -d: -f4)
paused_count=$(echo "$result" | cut -d: -f5)
down_details=$(echo "$result" | cut -d: -f6-)
local int_down_names="${down_details%%|*}"
local ext_down_names="${down_details#*|}"
local total_down=$((int_down + ext_down))
local total_up=$((int_up + ext_up))
local total_active=$((total_up + total_down))
if [[ "$total_down" -eq 0 ]]; then
pass "All monitors up — internal: ${int_up}, external: ${ext_up} ($paused_count paused)"
json_add "uptime_kuma" "PASS" "internal: $int_up up, external: $ext_up up, $paused_count paused"
else
[[ "$QUIET" == true ]] && section_always 14 "Uptime Kuma Monitors"
local details=""
[[ "$int_down" -gt 0 ]] && details="internal down($int_down): $int_down_names"
[[ "$ext_down" -gt 0 ]] && { [[ -n "$details" ]] && details="$details; "; details="${details}external down($ext_down): $ext_down_names"; }
if [[ "$total_down" -le 3 ]]; then
warn "$total_down/$total_active down: $details"
json_add "uptime_kuma" "WARN" "$details"
else
fail "$total_down/$total_active down: $details"
json_add "uptime_kuma" "FAIL" "$details"
fi
fi
fi
}
# --- 15. ResourceQuota Pressure ---
check_resourcequota() {
section 15 "ResourceQuota Pressure"
local quotas detail="" had_issue=false status="PASS"
quotas=$($KUBECTL get resourcequota -A -o json 2>/dev/null) || { pass "No ResourceQuotas configured"; json_add "resourcequota" "PASS" "No quotas"; return 0; }
local pressure
pressure=$(echo "$quotas" | python3 -c '
import json, sys, re
def parse_cpu(val):
"""Convert CPU value to millicores."""
val = str(val)
if val.endswith("m"):
return float(val[:-1])
return float(val) * 1000
def parse_mem(val):
"""Convert memory value to bytes."""
val = str(val)
units = {"Ki": 1024, "Mi": 1024**2, "Gi": 1024**3, "Ti": 1024**4}
for suffix, mult in units.items():
if val.endswith(suffix):
return float(val[:-len(suffix)]) * mult
# Plain bytes or numeric
return float(val)
data = json.load(sys.stdin)
for item in data.get("items", []):
ns = item["metadata"]["namespace"]
name = item["metadata"]["name"]
status = item.get("status", {})
hard = status.get("hard", {})
used = status.get("used", {})
for resource, hard_val in hard.items():
used_val = used.get(resource, "0")
try:
if "cpu" in resource:
h = parse_cpu(hard_val)
u = parse_cpu(used_val)
elif "memory" in resource or "storage" in resource:
h = parse_mem(hard_val)
u = parse_mem(used_val)
elif resource == "pods":
h = float(hard_val)
u = float(used_val)
else:
continue
if h <= 0:
continue
pct = (u / h) * 100
if pct > 80:
level = "FAIL" if pct > 95 else "WARN"
print(f"{level}:{ns}/{name}:{resource}:{pct:.0f}%")
except (ValueError, ZeroDivisionError):
pass
' 2>/dev/null) || true
if [[ -z "$pressure" ]]; then
pass "All ResourceQuotas below 80% usage"
json_add "resourcequota" "PASS" "All below 80%"
else
[[ "$QUIET" == true ]] && section_always 15 "ResourceQuota Pressure"
while IFS= read -r line; do
local level ns_res resource pct
level=$(echo "$line" | cut -d: -f1)
ns_res=$(echo "$line" | cut -d: -f2)
resource=$(echo "$line" | cut -d: -f3)
pct=$(echo "$line" | cut -d: -f4)
if [[ "$level" == "FAIL" ]]; then
fail "$ns_res: $resource at $pct"
status="FAIL"
else
warn "$ns_res: $resource at $pct"
[[ "$status" != "FAIL" ]] && status="WARN"
fi
detail+="$ns_res $resource=$pct; "
had_issue=true
done <<< "$pressure"
json_add "resourcequota" "$status" "$detail"
fi
}
# --- 16. StatefulSets ---
check_statefulsets() {
section 16 "StatefulSets"
local sts detail="" had_issue=false
sts=$($KUBECTL get statefulsets -A --no-headers 2>&1) || true
if [[ -z "$sts" || "$sts" == *"No resources found"* ]]; then
pass "No StatefulSets in cluster"
json_add "statefulsets" "PASS" "No StatefulSets"
return 0
fi
while IFS= read -r line; do
local ns name ready current desired
ns=$(echo "$line" | awk '{print $1}')
name=$(echo "$line" | awk '{print $2}')
ready=$(echo "$line" | awk '{print $3}')
current=$(echo "$ready" | cut -d/ -f1)
desired=$(echo "$ready" | cut -d/ -f2)
if [[ "$current" != "$desired" ]]; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 16 "StatefulSets"
fail "$ns/$name: $current/$desired ready"
detail+="$ns/$name $current/$desired; "
had_issue=true
fi
done <<< "$sts"
if [[ "$had_issue" == false ]]; then
pass "All StatefulSets fully available"
json_add "statefulsets" "PASS" "All available"
else
json_add "statefulsets" "FAIL" "$detail"
fi
}
# --- 17. Node Disk Usage ---
check_node_disk() {
section 17 "Node Disk Usage"
local node_json detail="" had_issue=false status="PASS"
node_json=$($KUBECTL get nodes -o json 2>/dev/null) || { fail "Cannot get node info"; json_add "node_disk" "FAIL" "Cannot get nodes"; return 0; }
local disk_info
disk_info=$(echo "$node_json" | python3 -c '
import json, sys
def parse_storage(val):
"""Convert storage value to bytes."""
val = str(val)
units = {"Ki": 1024, "Mi": 1024**2, "Gi": 1024**3, "Ti": 1024**4}
for suffix, mult in units.items():
if val.endswith(suffix):
return float(val[:-len(suffix)]) * mult
return float(val)
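# NOTE: capacity - allocatable is the kubelet-reserved share of
# ephemeral-storage, a rough proxy rather than live disk usage.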
data = json.load(sys.stdin)
for node in data["items"]:
name = node["metadata"]["name"]
cap = node["status"].get("capacity", {})
alloc = node["status"].get("allocatable", {})
es_cap = cap.get("ephemeral-storage", "0")
es_alloc = alloc.get("ephemeral-storage", "0")
try:
c = parse_storage(es_cap)
a = parse_storage(es_alloc)
if c > 0:
used_pct = ((c - a) / c) * 100
if used_pct > 70: # Lower threshold after node2 containerd corruption incident
if used_pct > 85:
level = "FAIL" # Critical: Risk of containerd corruption
elif used_pct > 75:
level = "WARN" # Warning: Monitor closely
else:
level = "WARN" # Early warning
print(f"{level}:{name}:{used_pct:.0f}")
except (ValueError, ZeroDivisionError):
pass
' 2>/dev/null) || true
if [[ -z "$disk_info" ]]; then
pass "All nodes below 70% ephemeral-storage usage"
json_add "node_disk" "PASS" "All below 70%"
else
[[ "$QUIET" == true ]] && section_always 17 "Node Disk Usage"
while IFS= read -r line; do
local level node pct
level=$(echo "$line" | cut -d: -f1)
node=$(echo "$line" | cut -d: -f2)
pct=$(echo "$line" | cut -d: -f3)
if [[ "$level" == "FAIL" ]]; then
fail "$node: ephemeral-storage at ${pct}%"
status="FAIL"
else
warn "$node: ephemeral-storage at ${pct}%"
[[ "$status" != "FAIL" ]] && status="WARN"
fi
detail+="$node=${pct}%; "
had_issue=true
done <<< "$disk_info"
json_add "node_disk" "$status" "$detail"
fi
}
# --- 18. Helm Release Health ---
check_helm_releases() {
section 18 "Helm Release Health"
# Helm may not be available in the pod environment
if ! command -v helm &>/dev/null; then
pass "Helm not available (skipped)"
json_add "helm_releases" "PASS" "Helm not available"
return 0
fi
local releases detail="" had_issue=false status="PASS"
releases=$(helm list --all-namespaces --kubeconfig "$KUBECONFIG_PATH" -o json 2>/dev/null) || {
[[ "$QUIET" == true ]] && section_always 18 "Helm Release Health"
warn "Cannot list Helm releases"
json_add "helm_releases" "WARN" "Cannot list"
return 0
}
local bad_releases
bad_releases=$(echo "$releases" | python3 -c '
import json, sys
data = json.load(sys.stdin)
for r in data:
name = r.get("name", "?")
ns = r.get("namespace", "?")
st = r.get("status", "unknown")
if st != "deployed":
level = "FAIL" if st.startswith("pending") else "WARN"
print(f"{level}:{ns}/{name}:{st}")
' 2>/dev/null) || true
if [[ -z "$bad_releases" ]]; then
pass "All Helm releases in deployed state"
json_add "helm_releases" "PASS" "All deployed"
else
[[ "$QUIET" == true ]] && section_always 18 "Helm Release Health"
while IFS= read -r line; do
local level release_name release_status
level=$(echo "$line" | cut -d: -f1)
release_name=$(echo "$line" | cut -d: -f2)
release_status=$(echo "$line" | cut -d: -f3)
if [[ "$level" == "FAIL" ]]; then
fail "Helm release $release_name: $release_status (blocks terraform)"
status="FAIL"
else
warn "Helm release $release_name: $release_status"
[[ "$status" != "FAIL" ]] && status="WARN"
fi
detail+="$release_name=$release_status; "
had_issue=true
done <<< "$bad_releases"
json_add "helm_releases" "$status" "$detail"
fi
}
# --- 19. Kyverno Policy Engine ---
check_kyverno() {
section 19 "Kyverno Policy Engine"
local kv_pods not_running
kv_pods=$($KUBECTL get pods -n kyverno --no-headers 2>/dev/null || true)
if [[ -z "$kv_pods" ]]; then
[[ "$QUIET" == true ]] && section_always 19 "Kyverno Policy Engine"
fail "Kyverno namespace not found or empty — policy engine down, cascading cluster impact"
json_add "kyverno" "FAIL" "No Kyverno pods found"
return 0
fi
not_running=$(echo "$kv_pods" | awk '$3 != "Running" && $3 != "Completed" {print $1 ": " $3}' || true)
if [[ -n "$not_running" ]]; then
[[ "$QUIET" == true ]] && section_always 19 "Kyverno Policy Engine"
while IFS= read -r line; do
fail "Kyverno pod not running: $line"
done <<< "$not_running"
json_add "kyverno" "FAIL" "$not_running"
else
local total
total=$(count_lines "$kv_pods")
pass "All $total Kyverno pods running"
json_add "kyverno" "PASS" "$total pods running"
fi
}
# --- 20. NFS Connectivity ---
check_nfs() {
section 20 "NFS Connectivity"
# Try native tools first (available locally), fall back to kubectl-based check (pod environment)
if command -v showmount &>/dev/null; then
if showmount -e 192.168.1.127 &>/dev/null; then
pass "NFS server 192.168.1.127 reachable (exports listed)"
json_add "nfs" "PASS" "NFS reachable"
return 0
fi
fi
if command -v nc &>/dev/null; then
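# -z: probe only; -G 3: 3s connect timeout (BSD/macOS netcat flag; GNU/OpenBSD
# nc uses -w, in which case this probe fails and the pod fallback below runs)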
if nc -z -G 3 192.168.1.127 2049 &>/dev/null; then
pass "NFS server 192.168.1.127 port 2049 open"
json_add "nfs" "PASS" "NFS port open"
return 0
fi
fi
# Fallback: check if NFS-backed pods are running (works in pod environment)
local nfs_pods
nfs_pods=$($KUBECTL get pods -A -o json 2>/dev/null | python3 -c '
import json, sys
data = json.load(sys.stdin)
count = 0
for pod in data.get("items", []):
for vol in pod.get("spec", {}).get("volumes", []):
if "nfs" in vol:
if pod.get("status", {}).get("phase") == "Running":
count += 1
break
print(count)
' 2>/dev/null) || nfs_pods="0"
if [[ "$nfs_pods" -gt 0 ]]; then
pass "NFS healthy ($nfs_pods pods using NFS volumes are running)"
json_add "nfs" "PASS" "$nfs_pods NFS pods running"
else
[[ "$QUIET" == true ]] && section_always 20 "NFS Connectivity"
warn "Cannot verify NFS (showmount not available, no NFS pods found)"
json_add "nfs" "WARN" "Cannot verify"
fi
}
# --- 21. DNS Resolution ---
check_dns() {
section 21 "DNS Resolution"
local internal_ok=false external_ok=false detail=""
# Try dig first (available locally), fall back to python3 (pod environment)
# Use system resolver (no @server) so it works from any host or pod
if command -v dig &>/dev/null; then
if dig viktorbarzin.me +short +time=3 +tries=1 2>/dev/null | grep -q .; then
internal_ok=true
fi
if dig google.com +short +time=3 +tries=1 2>/dev/null | grep -q .; then
external_ok=true
fi
else
# Fallback: use python3 for DNS resolution (works in pod environment)
local result
result=$(python3 -c "
import socket
try:
socket.getaddrinfo('viktorbarzin.me', 443)
print('INTERNAL_OK')
except Exception:
print('INTERNAL_FAIL')
try:
socket.getaddrinfo('google.com', 443)
print('EXTERNAL_OK')
except Exception:
print('EXTERNAL_FAIL')
" 2>/dev/null) || result=""
if echo "$result" | grep -q "INTERNAL_OK"; then
internal_ok=true
fi
if echo "$result" | grep -q "EXTERNAL_OK"; then
external_ok=true
fi
fi
if [[ "$internal_ok" == true && "$external_ok" == true ]]; then
pass "DNS resolves both internal (viktorbarzin.me) and external (google.com)"
json_add "dns" "PASS" "Both resolve"
elif [[ "$internal_ok" == true || "$external_ok" == true ]]; then
[[ "$QUIET" == true ]] && section_always 21 "DNS Resolution"
if [[ "$internal_ok" == false ]]; then
warn "DNS: internal (viktorbarzin.me) failed, external (google.com) OK"
detail="Internal failed"
else
warn "DNS: internal (viktorbarzin.me) OK, external (google.com) failed"
detail="External failed"
fi
json_add "dns" "WARN" "$detail"
else
[[ "$QUIET" == true ]] && section_always 21 "DNS Resolution"
fail "DNS not resolving — both internal and external failed"
json_add "dns" "FAIL" "Both failed"
fi
}
# --- 22. TLS Certificate Expiry ---
check_tls_certs() {
section 22 "TLS Certificate Expiry"
local secrets detail="" had_issue=false status="PASS"
secrets=$($KUBECTL get secrets -A -o json 2>/dev/null) || {
[[ "$QUIET" == true ]] && section_always 22 "TLS Certificate Expiry"
warn "Cannot list secrets"
json_add "tls_certs" "WARN" "Cannot list secrets"
return 0
}
local cert_issues
cert_issues=$(echo "$secrets" | python3 -c '
import json, sys, base64, subprocess, hashlib
from datetime import datetime, timezone
data = json.load(sys.stdin)
seen_fingerprints = set()
results = []
for item in data.get("items", []):
if item.get("type") != "kubernetes.io/tls":
continue
ns = item["metadata"]["namespace"]
name = item["metadata"]["name"]
cert_data = item.get("data", {}).get("tls.crt", "")
if not cert_data:
continue
# Deduplicate by cert fingerprint
raw = base64.b64decode(cert_data)
fp = hashlib.sha256(raw).hexdigest()[:16]
if fp in seen_fingerprints:
continue
seen_fingerprints.add(fp)
# Parse certificate expiry with openssl
try:
result = subprocess.run(
["openssl", "x509", "-noout", "-enddate", "-subject"],
input=raw, capture_output=True, timeout=5
)
output = result.stdout.decode()
for line in output.splitlines():
if line.startswith("notAfter="):
date_str = line.split("=", 1)[1]
# Parse openssl date format: "Mon DD HH:MM:SS YYYY GMT"
try:
expiry = datetime.strptime(date_str.strip(), "%b %d %H:%M:%S %Y %Z")
expiry = expiry.replace(tzinfo=timezone.utc)
days_left = (expiry - datetime.now(timezone.utc)).days
if days_left <= 7:
print(f"FAIL:{ns}/{name}:{days_left}d")
elif days_left <= 30:
print(f"WARN:{ns}/{name}:{days_left}d")
except ValueError:
pass
except (subprocess.TimeoutExpired, Exception):
pass
' 2>/dev/null) || true
if [[ -z "$cert_issues" ]]; then
pass "All TLS certificates valid for >30 days"
json_add "tls_certs" "PASS" "All valid >30d"
else
[[ "$QUIET" == true ]] && section_always 22 "TLS Certificate Expiry"
while IFS= read -r line; do
local level cert_name days
level=$(echo "$line" | cut -d: -f1)
cert_name=$(echo "$line" | cut -d: -f2)
days=$(echo "$line" | cut -d: -f3)
if [[ "$level" == "FAIL" ]]; then
fail "TLS cert $cert_name expires in $days"
status="FAIL"
else
warn "TLS cert $cert_name expires in $days"
[[ "$status" != "FAIL" ]] && status="WARN"
fi
detail+="$cert_name=$days; "
had_issue=true
done <<< "$cert_issues"
json_add "tls_certs" "$status" "$detail"
fi
}
# --- 23. GPU Health ---
check_gpu() {
section 23 "GPU Health"
local gpu_pods not_running
gpu_pods=$($KUBECTL get pods -n nvidia --no-headers 2>/dev/null || true)
if [[ -z "$gpu_pods" ]]; then
[[ "$QUIET" == true ]] && section_always 23 "GPU Health"
warn "NVIDIA namespace not found or empty"
json_add "gpu" "WARN" "No GPU pods found"
return 0
fi
# Check specifically for device-plugin (critical for GPU scheduling)
local device_plugin_down=false
local other_down=false
local detail=""
while IFS= read -r line; do
local pod_name pod_status
pod_name=$(echo "$line" | awk '{print $1}')
pod_status=$(echo "$line" | awk '{print $3}')
if [[ "$pod_status" != "Running" && "$pod_status" != "Completed" ]]; then
if echo "$pod_name" | grep -q "device-plugin"; then
device_plugin_down=true
detail+="device-plugin $pod_name: $pod_status; "
else
other_down=true
detail+="$pod_name: $pod_status; "
fi
fi
done <<< "$gpu_pods"
if [[ "$device_plugin_down" == true ]]; then
[[ "$QUIET" == true ]] && section_always 23 "GPU Health"
fail "GPU device-plugin is down — GPU workloads cannot schedule"
json_add "gpu" "FAIL" "$detail"
elif [[ "$other_down" == true ]]; then
[[ "$QUIET" == true ]] && section_always 23 "GPU Health"
warn "Some GPU pods not running: $detail"
json_add "gpu" "WARN" "$detail"
else
local total
total=$(count_lines "$gpu_pods")
pass "All $total GPU pods running"
json_add "gpu" "PASS" "$total pods running"
fi
}
# --- 24. Cloudflare Tunnel ---
check_cloudflare_tunnel() {
section 24 "Cloudflare Tunnel"
local cf_pods running_count total_count
cf_pods=$($KUBECTL get pods -n cloudflared --no-headers 2>/dev/null || true)
if [[ -z "$cf_pods" ]]; then
[[ "$QUIET" == true ]] && section_always 24 "Cloudflare Tunnel"
fail "Cloudflare tunnel namespace not found or empty — external access broken"
json_add "cloudflare_tunnel" "FAIL" "No pods found"
return 0
fi
total_count=$(count_lines "$cf_pods")
running_count=$(echo "$cf_pods" | awk '$3 == "Running"' | wc -l | tr -d ' ')
if [[ "$running_count" -eq 0 ]]; then
[[ "$QUIET" == true ]] && section_always 24 "Cloudflare Tunnel"
fail "Cloudflare tunnel: 0/$total_count pods running — external access broken"
json_add "cloudflare_tunnel" "FAIL" "0/$total_count running"
elif [[ "$running_count" -lt "$total_count" ]]; then
[[ "$QUIET" == true ]] && section_always 24 "Cloudflare Tunnel"
warn "Cloudflare tunnel: $running_count/$total_count pods running (degraded)"
json_add "cloudflare_tunnel" "WARN" "$running_count/$total_count running"
else
pass "Cloudflare tunnel: all $total_count pods running"
json_add "cloudflare_tunnel" "PASS" "$total_count pods running"
fi
}
# --- 25. Advanced CPU Monitoring (Prometheus) ---
check_prometheus_cpu() {
section 25 "Advanced CPU Monitoring"
local cpu_query="100%20-%20(avg%20by%20(instance)%20(irate(node_cpu_seconds_total%7Bmode%3D%22idle%22%7D%5B5m%5D))%20*%20100)"
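# URL-decoded: 100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)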
local detail="" had_issue=false status="PASS"
# Start port-forward to Prometheus if not using in-cluster DNS
local prom_url pf_pid=""
if curl -s --connect-timeout 2 "http://prometheus-server.monitoring.svc.cluster.local/api/v1/query?query=up" &>/dev/null; then
prom_url="http://prometheus-server.monitoring.svc.cluster.local/api/v1/query"
else
local pf_port
pf_port=$(python3 -c 'import socket; s=socket.socket(); s.bind(("",0)); print(s.getsockname()[1]); s.close()')
$KUBECTL port-forward -n monitoring svc/prometheus-server "$pf_port:80" &>/dev/null &
pf_pid=$!
sleep 2
prom_url="http://127.0.0.1:${pf_port}/api/v1/query"
fi
# Cleanup port-forward on exit from this function
trap '[[ -n "$pf_pid" ]] && kill $pf_pid 2>/dev/null || true' RETURN
# Try to query Prometheus for CPU metrics
local cpu_data
cpu_data=$(curl -s --connect-timeout 10 "${prom_url}?query=${cpu_query}" 2>/dev/null) || {
warn "Prometheus not accessible for CPU monitoring"
json_add "prometheus_cpu" "WARN" "Prometheus unreachable"
return 0
}
# Parse JSON and check CPU usage
local cpu_results
cpu_results=$(echo "$cpu_data" | python3 -c "
import json, sys
try:
data = json.load(sys.stdin)
if data.get('status') == 'success':
for result in data['data']['result']:
instance = result['metric']['instance']
usage = float(result['value'][1])
# Map IP to node name
if '10.0.20.100' in instance:
node = 'k8s-master'
elif '10.0.20.101' in instance:
node = 'k8s-node1'
elif '10.0.20.102' in instance:
node = 'k8s-node2'
elif '10.0.20.103' in instance:
node = 'k8s-node3'
elif '10.0.20.104' in instance:
node = 'k8s-node4'
elif 'pve-node' in instance:
node = 'proxmox-host'
else:
node = instance
print(f'{node}:{usage:.1f}')
except Exception as e:
print(f'ERROR:{e}')
" 2>/dev/null) || true
if [[ "$cpu_results" == *"ERROR"* || -z "$cpu_results" ]]; then
warn "Failed to parse Prometheus CPU data"
json_add "prometheus_cpu" "WARN" "Parse failed"
return 0
fi
# Check CPU thresholds
while IFS=':' read -r node usage; do
[[ -z "$node" || -z "$usage" ]] && continue
usage_int=${usage%.*} # Remove decimal
if [[ "$usage_int" -gt 85 ]]; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 25 "Advanced CPU Monitoring"
fail "$node: ${usage}% CPU (critical)"
detail+="$node=${usage}% [CRIT]; "
had_issue=true
status="FAIL"
elif [[ "$usage_int" -gt 70 ]]; then
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 25 "Advanced CPU Monitoring"
warn "$node: ${usage}% CPU (high)"
detail+="$node=${usage}% [HIGH]; "
had_issue=true
[[ "$status" != "FAIL" ]] && status="WARN"
else
detail+="$node=${usage}% [OK]; "
fi
done <<< "$cpu_results"
[[ "$had_issue" == false ]] && pass "All nodes below 70% CPU usage (5m avg)"
json_add "prometheus_cpu" "$status" "$detail"
}
# --- 26. Power Monitoring ---
check_power_monitoring() {
section 26 "Power Monitoring"
local detail="" had_issue=false status="PASS"
# Start port-forward to Prometheus if not using in-cluster DNS
local prom_url pf_pid=""
if curl -s --connect-timeout 2 "http://prometheus-server.monitoring.svc.cluster.local/api/v1/query?query=up" &>/dev/null; then
prom_url="http://prometheus-server.monitoring.svc.cluster.local/api/v1/query"
else
local pf_port
pf_port=$(python3 -c 'import socket; s=socket.socket(); s.bind(("",0)); print(s.getsockname()[1]); s.close()')
$KUBECTL port-forward -n monitoring svc/prometheus-server "$pf_port:80" &>/dev/null &
pf_pid=$!
sleep 2
prom_url="http://127.0.0.1:${pf_port}/api/v1/query"
fi
trap '[[ -n "$pf_pid" ]] && kill $pf_pid 2>/dev/null || true' RETURN
# GPU Power monitoring
local gpu_query="DCGM_FI_DEV_POWER_USAGE"
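# DCGM exporter gauge: per-GPU power draw in watts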
local gpu_data
gpu_data=$(curl -s --connect-timeout 10 "${prom_url}?query=${gpu_query}" 2>/dev/null) || {
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 26 "Power Monitoring"
warn "GPU power metrics unavailable"
detail+="GPU metrics unavailable; "
had_issue=true
status="WARN"
}
if [[ -n "$gpu_data" && "$gpu_data" != *"error"* ]]; then
local gpu_results
gpu_results=$(echo "$gpu_data" | python3 -c "
import json, sys
try:
data = json.load(sys.stdin)
if data.get('status') == 'success':
for result in data['data']['result']:
hostname = result['metric'].get('Hostname', 'unknown')
power = float(result['value'][1])
print(f'{hostname}:{power:.1f}')
except Exception:
pass
" 2>/dev/null) || true
# Check GPU power thresholds (Tesla T4 TDP is ~70W)
while IFS=':' read -r node power; do
[[ -z "$node" || -z "$power" ]] && continue
power_int=${power%.*}
if [[ "$power_int" -gt 65 ]]; then # > 90% of T4 TDP
[[ "$had_issue" == false && "$QUIET" == true ]] && section_always 26 "Power Monitoring"
warn "GPU $node: ${power}W (high power draw)"
detail+="GPU-$node=${power}W [HIGH]; "
had_issue=true
[[ "$status" != "FAIL" ]] && status="WARN"
elif [[ "$power_int" -gt 50 ]]; then # > 70% of T4 TDP
detail+="GPU-$node=${power}W [ACTIVE]; "
else
detail+="GPU-$node=${power}W [IDLE]; "
fi
done <<< "$gpu_results"
fi
[[ "$had_issue" == false ]] && pass "Power consumption within normal ranges"
json_add "power_monitoring" "$status" "$detail"
}
# --- Summary ---
print_summary() {
if [[ "$JSON" == true ]]; then
echo "{"
echo " \"timestamp\": \"$(date -u +%Y-%m-%dT%H:%M:%SZ)\","
echo " \"pass\": $PASS_COUNT,"
echo " \"warn\": $WARN_COUNT,"
echo " \"fail\": $FAIL_COUNT,"
echo " \"checks\": ["
local first=true
for r in "${JSON_RESULTS[@]}"; do
if [[ "$first" == true ]]; then
echo " $r"
first=false
else
echo " ,$r"
fi
done
echo " ]"
echo "}"
return 0
fi
echo ""
echo -e "${BOLD}═══════════════════════════════════════${NC}"
echo -e "${BOLD} Cluster Health Summary${NC}"
echo -e "${BOLD}═══════════════════════════════════════${NC}"
echo -e " ${GREEN}PASS${NC}: $PASS_COUNT ${YELLOW}WARN${NC}: $WARN_COUNT ${RED}FAIL${NC}: $FAIL_COUNT"
echo ""
if [[ "$FAIL_COUNT" -gt 0 ]]; then
echo -e " Overall: ${RED}UNHEALTHY${NC}"
elif [[ "$WARN_COUNT" -gt 0 ]]; then
echo -e " Overall: ${YELLOW}DEGRADED${NC}"
else
echo -e " Overall: ${GREEN}HEALTHY${NC}"
fi
echo ""
}
# --- Slack Notification ---
# Human-readable check name mapping
friendly_check_name() {
case "$1" in
node_status) echo "Node Status" ;;
node_resources) echo "Node Resources" ;;
node_conditions) echo "Node Conditions" ;;
problematic_pods) echo "Problematic Pods" ;;
evicted_pods) echo "Evicted Pods" ;;
daemonsets) echo "DaemonSets" ;;
deployments) echo "Deployments" ;;
pvcs) echo "PVCs" ;;
hpa) echo "HPAs" ;;
cronjob_failures) echo "CronJob Failures" ;;
crowdsec) echo "CrowdSec" ;;
ingresses) echo "Ingresses" ;;
prometheus_alerts) echo "Prometheus Alerts" ;;
uptime_kuma) echo "Uptime Kuma" ;;
resourcequota) echo "Resource Quotas" ;;
statefulsets) echo "StatefulSets" ;;
node_disk) echo "Node Disk" ;;
helm_releases) echo "Helm Releases" ;;
kyverno) echo "Kyverno" ;;
nfs) echo "NFS Storage" ;;
dns) echo "DNS Resolution" ;;
tls_certs) echo "TLS Certificates" ;;
gpu) echo "GPU" ;;
cloudflare_tunnel) echo "Cloudflare Tunnel" ;;
prometheus_cpu) echo "Advanced CPU Monitoring" ;;
power_monitoring) echo "Power Monitoring" ;;
*) echo "$1" ;;
esac
}
send_slack() {
if [[ "$SEND_SLACK" != true ]]; then
return 0
fi
if [[ -z "${SLACK_WEBHOOK_URL:-}" ]]; then
[[ "$JSON" != true ]] && echo "WARNING: SLACK_WEBHOOK_URL not set, skipping Slack notification"
return 0
fi
# Gather stats for summary line
local node_count pod_count
node_count=$($KUBECTL get nodes --no-headers 2>/dev/null | wc -l | tr -d ' ')
pod_count=$($KUBECTL get pods -A --no-headers --field-selector=status.phase=Running 2>/dev/null | wc -l | tr -d ' ')
local total_checks=$((PASS_COUNT + WARN_COUNT + FAIL_COUNT))
# Use python3 to build the entire Slack payload from JSON_RESULTS
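# Block Kit layout: header section, optional divider + Failed/Warnings
# sections, and a context footer with node/pod counts and a timestamp.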
local json_results_str
json_results_str=$(printf '%s\n' "${JSON_RESULTS[@]}")
local json_payload
json_payload=$(echo "$json_results_str" | python3 -c "
import json, sys
CHECK_NAMES = {
'node_status': 'Node Status',
'node_resources': 'Node Resources',
'node_conditions': 'Node Conditions',
'problematic_pods': 'Problematic Pods',
'evicted_pods': 'Evicted Pods',
'daemonsets': 'DaemonSets',
'deployments': 'Deployments',
'pvcs': 'PVCs',
'hpa': 'HPAs',
'cronjob_failures': 'CronJob Failures',
'crowdsec': 'CrowdSec',
'ingresses': 'Ingresses',
'prometheus_alerts': 'Prometheus Alerts',
'uptime_kuma': 'Uptime Kuma',
'resourcequota': 'Resource Quotas',
'statefulsets': 'StatefulSets',
'node_disk': 'Node Disk',
'helm_releases': 'Helm Releases',
'kyverno': 'Kyverno',
'nfs': 'NFS Storage',
'dns': 'DNS Resolution',
'tls_certs': 'TLS Certificates',
'gpu': 'GPU',
'cloudflare_tunnel': 'Cloudflare Tunnel',
'prometheus_cpu': 'Advanced CPU Monitoring',
'power_monitoring': 'Power Monitoring',
}
def format_detail(check, detail):
\"\"\"Format detail text for readability. Truncate long lists, split semicolons.\"\"\"
detail = detail.rstrip('; ').strip()
# For checks with long comma-separated lists (e.g. Uptime Kuma down monitors),
# truncate to first 5 items with a count
if check == 'uptime_kuma' and ': ' in detail:
prefix, names_str = detail.split(': ', 1)
names = [n.strip() for n in names_str.split(',') if n.strip()]
if len(names) > 5:
shown = ', '.join(names[:5])
detail = f'{prefix}: {shown} (+{len(names) - 5} more)'
elif names:
detail = prefix + ': ' + ', '.join(names)
# For resource quotas and similar semicolon-separated items,
# split into separate lines
if '; ' in detail:
parts = [p.strip() for p in detail.split(';') if p.strip()]
if len(parts) > 1:
lines = '\\n'.join(f' \u2022 {p}' for p in parts)
return lines
return detail
# Parse results
fails = []
warns = []
for line in sys.stdin:
line = line.strip()
if not line:
continue
try:
d = json.loads(line)
except json.JSONDecodeError:
continue
status = d.get('status', '')
check = d.get('check', '')
detail = d.get('detail', '')
name = CHECK_NAMES.get(check, check)
formatted = format_detail(check, detail)
if status == 'FAIL':
fails.append((name, formatted))
elif status == 'WARN':
warns.append((name, formatted))
pass_count = ${PASS_COUNT}
warn_count = ${WARN_COUNT}
fail_count = ${FAIL_COUNT}
total = ${total_checks}
nodes = '${node_count}'
pods = '${pod_count}'
blocks = []
# Header block
if fail_count == 0 and warn_count == 0:
header = f':white_check_mark: *Cluster Health Check \u2014 All Clear*'
summary = f'{total}/{total} checks passed \u2022 {nodes} nodes \u2022 {pods} pods'
blocks.append({'type': 'section', 'text': {'type': 'mrkdwn', 'text': f'{header}\n{summary}'}})
else:
issue_count = fail_count + warn_count
emoji = ':rotating_light:' if fail_count > 0 else ':warning:'
header = f'{emoji} *Cluster Health Check \u2014 {issue_count} Issue(s)*'
summary = f':white_check_mark: {pass_count} passed \u2022 :warning: {warn_count} warnings \u2022 :x: {fail_count} failed'
blocks.append({'type': 'section', 'text': {'type': 'mrkdwn', 'text': f'{header}\n{summary}'}})
# Failed section
if fails:
blocks.append({'type': 'divider'})
lines = [':x: *Failed*']
for name, detail in fails:
if '\\n' in detail:
lines.append(f'\u2022 *{name}*:')
lines.append(detail)
else:
lines.append(f'\u2022 *{name}*: {detail}')
blocks.append({'type': 'section', 'text': {'type': 'mrkdwn', 'text': '\\n'.join(lines)}})
# Warnings section
if warns:
blocks.append({'type': 'divider'})
lines = [':warning: *Warnings*']
for name, detail in warns:
if '\\n' in detail:
lines.append(f'\u2022 *{name}*:')
lines.append(detail)
else:
lines.append(f'\u2022 *{name}*: {detail}')
blocks.append({'type': 'section', 'text': {'type': 'mrkdwn', 'text': '\\n'.join(lines)}})
# Footer with timestamp
from datetime import datetime, timezone
ts = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')
blocks.append({'type': 'context', 'elements': [{'type': 'mrkdwn', 'text': f'{nodes} nodes \u2022 {pods} pods \u2022 {ts}'}]})
payload = {'blocks': blocks}
print(json.dumps(payload))
")
if curl -s -X POST "$SLACK_WEBHOOK_URL" \
-H 'Content-Type: application/json' \
-d "$json_payload" >/dev/null 2>&1; then
[[ "$JSON" != true ]] && echo "Slack notification sent."
else
[[ "$JSON" != true ]] && echo "WARNING: Failed to send Slack notification"
fi
# Return success explicitly so `set -e` does not abort main() when the
# status echo is suppressed (e.g. under --json).
return 0
}
# --- Main ---
main() {
parse_args "$@"
if [[ "$JSON" != true ]]; then
echo -e "${BOLD}Cluster Health Check${NC}$(date '+%Y-%m-%d %H:%M:%S')"
echo -e "Kubeconfig: $KUBECONFIG_PATH"
if [[ "$FIX" == true ]]; then
echo -e "${YELLOW}Auto-fix mode enabled${NC}"
fi
fi
check_nodes
check_resources
check_conditions
check_pods
check_evicted
check_daemonsets
check_deployments
check_pvcs
check_hpa
check_cronjobs
check_crowdsec
check_ingresses
check_alerts
check_uptime_kuma
check_resourcequota
check_statefulsets
check_node_disk
check_helm_releases
check_kyverno
check_nfs
check_dns
check_tls_certs
check_gpu
check_cloudflare_tunnel
check_prometheus_cpu
check_power_monitoring
print_summary
send_slack
# Always exit 0 — reporting is done via Slack notification.
# Non-zero exits mark the CronJob as Failed, which triggers Prometheus
# JobFailed alerts, creating a circular alert loop.
exit 0
}
main "$@"