From a0d770d9a720b23fe9db138abd8443ac046a19d0 Mon Sep 17 00:00:00 2001
From: Viktor Barzin <vbarzin@gmail.com>
Date: Sun, 19 Apr 2026 15:13:03 +0000
Subject: [PATCH] [cluster-health] Expand to 42 checks, remove pod CronJob path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- scripts/cluster_healthcheck.sh: add 12 new checks (cert-manager
  readiness/expiry/requests, backup freshness per-DB/offsite/LVM,
  monitoring prom+AM/vault-sealed/CSS, external reachability cloudflared
  +authentik/ExternalAccessDivergence/traefik-5xx). Bump TOTAL_CHECKS
  to 42, add --no-fix flag.
- Remove the duplicate pod-version .claude/cluster-health.sh (1728
  lines) and the openclaw cluster_healthcheck CronJob (local CLI is
  now the single authoritative runner). Keep the healthcheck SA +
  Role + RoleBinding — still reused by task_processor CronJob.
- Remove SLACK_WEBHOOK_URL env from openclaw deployment and delete
  the unused setup-monitoring.sh.
- Rewrite .claude/skills/cluster-health/SKILL.md: mandate running
  the script first, refresh the 42-check table, drop stale
  CronJob/Slack/post-mortem sections, and document the
  monorepo-canonical + hardlink layout. The file is hardlinked to
  /home/wizard/code/.claude/skills/cluster-health/SKILL.md for
  dual discovery.
- AGENTS.md + k8s-portal agent page: 25-check → 42-check.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .claude/cluster-health.sh                     | 1728 -----------------
 .claude/skills/cluster-health/SKILL.md        |  505 +++--
 AGENTS.md                                     |    2 +-
 scripts/cluster_healthcheck.sh                |  607 +++++-
 setup-monitoring.sh                           |   29 -
 .../files/src/routes/agent/+server.ts         |    2 +-
 stacks/openclaw/main.tf                       |   85 +-
 7 files changed, 853 insertions(+), 2105 deletions(-)
 delete mode 100755 .claude/cluster-health.sh
 delete mode 100755 setup-monitoring.sh
diff --git a/.claude/cluster-health.sh b/.claude/cluster-health.sh
deleted file mode 100755
index 001d8ebb..00000000
--- a/.claude/cluster-health.sh
+++ /dev/null
@@ -1,1728 +0,0 @@
-#!/usr/bin/env bash
-
-# Cluster health check script (pod-compatible version).
-# Runs 24 diagnostic checks against the Kubernetes cluster and prints
-# a colour-coded report with PASS / WARN / FAIL for each section.
-# Optionally posts results to Slack.
-#
-# Usage: ./cluster-health.sh [--fix] [--quiet|-q] [--json] [--kubeconfig <path>] [--no-slack]
-#
-# Environment:
-#   KUBECONFIG         — path to kubeconfig (used in pod environment)
-#   SLACK_WEBHOOK_URL  — Slack incoming webhook URL (required unless --no-slack)
-#   UPTIME_KUMA_PASSWORD — Uptime Kuma admin password
-
-set -euo pipefail
-
-# --- Colors ---
-RED='\033[0;31m'
-GREEN='\033[0;32m'
-YELLOW='\033[0;33m'
-BLUE='\033[0;34m'
-BOLD='\033[1m'
-NC='\033[0m'
-
-# --- Globals ---
-PASS_COUNT=0
-WARN_COUNT=0
-FAIL_COUNT=0
-FIX=false
-QUIET=false
-JSON=false
-SEND_SLACK=true
-KUBECONFIG_PATH="${KUBECONFIG:-$(pwd)/config}"
-KUBECTL=""
-JSON_RESULTS=()
-TOTAL_CHECKS=24
-
-# --- Helpers ---
-info()  { [[ "$JSON" == true ]] && return 0; echo -e "${BLUE}[INFO]${NC} $*"; }
-pass()  { PASS_COUNT=$((PASS_COUNT + 1)); [[ "$JSON" == true ]] && return 0; [[ "$QUIET" == true ]] && return 0; echo -e "  ${GREEN}[PASS]${NC} $*"; }
-warn()  { WARN_COUNT=$((WARN_COUNT + 1)); [[ "$JSON" == true ]] && return 0; echo -e "  ${YELLOW}[WARN]${NC} $*"; }
-fail()  { FAIL_COUNT=$((FAIL_COUNT + 1)); [[ "$JSON" == true ]] && return 0; echo -e "  ${RED}[FAIL]${NC} $*"; }
-
-section() {
-    local num="$1" title="$2"
-    [[ "$JSON" == true ]] && return 0
-    [[ "$QUIET" == true ]] && return 0
-    echo ""
-    echo -e "${BOLD}[$num/$TOTAL_CHECKS] $title${NC}"
-}
-
-section_always() {
-    local num="$1" title="$2"
-    [[ "$JSON" == true ]] && return 0
-    echo ""
-    echo -e "${BOLD}[$num/$TOTAL_CHECKS] $title${NC}"
-}
-
-json_add() {
-    local name="$1" status="$2" detail="$3"
-    local escaped
-    escaped=$(echo "$detail" | python3 -c 'import json,sys; print(json.dumps(sys.stdin.read().strip()))')
-    JSON_RESULTS+=("{\"check\":\"$name\",\"status\":\"$status\",\"detail\":$escaped}")
-}
-
-# count lines in a variable, returning 0 for empty strings
-count_lines() {
-    local input="$1"
-    if [[ -z "$input" ]]; then
-        echo 0
-    else
-        echo "$input" | wc -l | tr -d ' '
-    fi
-}
-
-# --- Argument parsing ---
-parse_args() {
-    while [[ $# -gt 0 ]]; do
-        case "$1" in
-            --fix)        FIX=true; shift ;;
-            --quiet|-q)   QUIET=true; shift ;;
-            --json)       JSON=true; shift ;;
-            --no-slack)   SEND_SLACK=false; shift ;;
-            --kubeconfig) KUBECONFIG_PATH="$2"; shift 2 ;;
-            -h|--help)
-                echo "Usage: $0 [--fix] [--quiet|-q] [--json] [--kubeconfig <path>] [--no-slack]"
-                echo ""
-                echo "Flags:"
-                echo "  --fix              Auto-remediate safe issues (delete evicted/CrashLoopBackOff pods)"
-                echo "  --quiet, -q        Only show WARN and FAIL sections"
-                echo "  --json             Machine-readable JSON output"
-                echo "  --kubeconfig PATH  Override kubeconfig (default: \$KUBECONFIG or \$(pwd)/config)"
-                echo "  --no-slack         Skip Slack notification"
-                exit 0
-                ;;
-            *)
-                echo "Unknown option: $1" >&2
-                exit 1
-                ;;
-        esac
-    done
-    KUBECTL="kubectl --kubeconfig $KUBECONFIG_PATH"
-
-    # Auto-source UPTIME_KUMA_PASSWORD from terraform.tfvars if not set
-    if [[ -z "${UPTIME_KUMA_PASSWORD:-}" ]]; then
-        local script_dir tfvars_file
-        script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-        tfvars_file="${script_dir}/../terraform.tfvars"
-        if [[ -f "$tfvars_file" ]]; then
-            UPTIME_KUMA_PASSWORD=$(grep 'uptime_kuma_password' "$tfvars_file" | head -1 | sed 's/.*= *"\(.*\)"/\1/')
-            export UPTIME_KUMA_PASSWORD
-        fi
-    fi
-}
-
-# --- 1. Node Status ---
-check_nodes() {
-    section 1 "Node Status"
-    local nodes not_ready versions unique_versions detail=""
-
-    nodes=$($KUBECTL get nodes --no-headers 2>&1) || { fail "Cannot reach cluster"; json_add "node_status" "FAIL" "Cannot reach cluster"; return 0; }
-    not_ready=$(echo "$nodes" | awk '$2 != "Ready" {print $1}' || true)
-    versions=$(echo "$nodes" | awk '{print $5}' | sort -u)
-    unique_versions=$(echo "$versions" | wc -l | tr -d ' ')
-
-    if [[ -n "$not_ready" ]]; then
-        [[ "$QUIET" == true ]] && section_always 1 "Node Status"
-        fail "NotReady nodes: $not_ready"
-        detail="NotReady: $not_ready"
-        json_add "node_status" "FAIL" "$detail"
-    elif [[ "$unique_versions" -gt 1 ]]; then
-        [[ "$QUIET" == true ]] && section_always 1 "Node Status"
-        warn "Version mismatch across nodes: $(echo "$versions" | tr '\n' ' ')"
-        detail="Version mismatch: $(echo "$versions" | tr '\n' ' ')"
-        json_add "node_status" "WARN" "$detail"
-    else
-        pass "All nodes Ready, version $(echo "$versions" | head -1)"
-        detail="All nodes Ready"
-        json_add "node_status" "PASS" "$detail"
-    fi
-}
-
-# --- 2. Node Resources ---
-check_resources() {
-    section 2 "Node Resources"
-    local top detail="" had_issue=false status="PASS"
-
-    top=$($KUBECTL top nodes --no-headers 2>&1) || { fail "metrics-server unavailable"; json_add "node_resources" "FAIL" "metrics-server unavailable"; return 0; }
-
-    while IFS= read -r line; do
-        local node cpu_pct mem_pct
-        node=$(echo "$line" | awk '{print $1}')
-        cpu_pct=$(echo "$line" | awk '{print $3}' | tr -d '%')
-        mem_pct=$(echo "$line" | awk '{print $5}' | tr -d '%')
-
-        # Skip nodes where metrics are not yet available
-        if [[ "$cpu_pct" == *"unknown"* ]] || [[ "$mem_pct" == *"unknown"* ]]; then
-            detail+="$node metrics unavailable; "
-            continue
-        fi
-
-        if [[ "$cpu_pct" -gt 90 ]] || [[ "$mem_pct" -gt 90 ]]; then
-            [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 2 "Node Resources"
-            fail "$node: CPU ${cpu_pct}%, Mem ${mem_pct}%"
-            detail+="$node CPU=${cpu_pct}% Mem=${mem_pct}% [FAIL]; "
-            had_issue=true
-            status="FAIL"
-        elif [[ "$cpu_pct" -gt 80 ]] || [[ "$mem_pct" -gt 80 ]]; then
-            [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 2 "Node Resources"
-            warn "$node: CPU ${cpu_pct}%, Mem ${mem_pct}%"
-            detail+="$node CPU=${cpu_pct}% Mem=${mem_pct}% [WARN]; "
-            had_issue=true
-            [[ "$status" != "FAIL" ]] && status="WARN"
-        else
-            detail+="$node CPU=${cpu_pct}% Mem=${mem_pct}% [OK]; "
-        fi
-    done <<< "$top"
-
-    [[ "$had_issue" == false ]] && pass "All nodes below 80% CPU and memory"
-    json_add "node_resources" "$status" "$detail"
-}
-
-# --- 3. Node Conditions ---
-check_conditions() {
-    section 3 "Node Conditions"
-    local conditions detail=""
-
-    conditions=$($KUBECTL get nodes -o json | python3 -c '
-import json, sys
-data = json.load(sys.stdin)
-for node in data["items"]:
-    name = node["metadata"]["name"]
-    for c in node["status"]["conditions"]:
-        if c["type"] in ("MemoryPressure","DiskPressure","PIDPressure") and c["status"] == "True":
-            print(name + ": " + c["type"])
-' 2>&1) || true
-
-    if [[ -n "$conditions" ]]; then
-        [[ "$QUIET" == true ]] && section_always 3 "Node Conditions"
-        while IFS= read -r line; do
-            fail "$line"
-        done <<< "$conditions"
-        detail="$conditions"
-        json_add "node_conditions" "FAIL" "$detail"
-    else
-        pass "No pressure conditions on any node"
-        json_add "node_conditions" "PASS" "No pressure conditions"
-    fi
-}
-
-# --- 4. Problematic Pods ---
-check_pods() {
-    section 4 "Problematic Pods"
-    local bad count detail="" status="PASS"
-
-    bad=$( {
-        $KUBECTL get pods -A --no-headers --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null \
-            | grep -E 'CrashLoopBackOff|Error|Pending|Init:|ImagePullBackOff|ErrImagePull' || true
-        $KUBECTL get pods -A --no-headers 2>/dev/null \
-            | grep -E 'CrashLoopBackOff|ImagePullBackOff|ErrImagePull' || true
-    } | awk '!seen[$1,$2]++' | sed '/^$/d') || true
-
-    count=$(count_lines "$bad")
-
-    # Auto-fix CrashLoopBackOff pods with >10 restarts when --fix is enabled
-    if [[ "$FIX" == true && "$count" -gt 0 ]]; then
-        local fixed_count=0
-        while IFS= read -r line; do
-            [[ -z "$line" ]] && continue
-            local ns pod pod_status restarts restarts_clean
-            ns=$(echo "$line" | awk '{print $1}')
-            pod=$(echo "$line" | awk '{print $2}')
-            pod_status=$(echo "$line" | awk '{print $4}')
-            restarts=$(echo "$line" | awk '{print $5}')
-            restarts_clean=$(echo "$restarts" | grep -oE '^[0-9]+' || echo "0")
-
-            if [[ "$pod_status" == "CrashLoopBackOff" ]] && [[ "$restarts_clean" -gt 10 ]]; then
-                info "Deleting CrashLoopBackOff pod $ns/$pod (restarts: $restarts_clean)"
-                $KUBECTL delete pod -n "$ns" "$pod" --grace-period=0 2>/dev/null || true
-                fixed_count=$((fixed_count + 1))
-            fi
-        done <<< "$bad"
-        if [[ "$fixed_count" -gt 0 ]]; then
-            info "Deleted $fixed_count CrashLoopBackOff pod(s) with >10 restarts"
-        fi
-    fi
-
-    if [[ "$count" -eq 0 ]]; then
-        pass "No problematic pods"
-        detail="None"
-    elif [[ "$count" -le 10 ]]; then
-        [[ "$QUIET" == true ]] && section_always 4 "Problematic Pods"
-        warn "$count problematic pod(s):"
-        [[ "$JSON" != true ]] && echo "$bad" | while IFS= read -r line; do echo "    $line"; done
-        detail="$count pods"
-        status="WARN"
-    else
-        [[ "$QUIET" == true ]] && section_always 4 "Problematic Pods"
-        fail "$count problematic pods (showing first 10):"
-        [[ "$JSON" != true ]] && echo "$bad" | head -10 | while IFS= read -r line; do echo "    $line"; done
-        detail="$count pods"
-        status="FAIL"
-    fi
-    json_add "problematic_pods" "$status" "$detail"
-}
-
-# --- 5. Evicted/Failed Pods ---
-check_evicted() {
-    section 5 "Evicted/Failed Pods"
-    local evicted count detail="" status="PASS"
-
-    evicted=$($KUBECTL get pods -A --no-headers --field-selector=status.phase=Failed 2>/dev/null || true)
-    count=$(count_lines "$evicted")
-
-    if [[ "$count" -eq 0 ]]; then
-        pass "No evicted or failed pods"
-        detail="0"
-    elif [[ "$count" -le 50 ]]; then
-        [[ "$QUIET" == true ]] && section_always 5 "Evicted/Failed Pods"
-        warn "$count evicted/failed pod(s)"
-        detail="$count pods"
-        status="WARN"
-    else
-        [[ "$QUIET" == true ]] && section_always 5 "Evicted/Failed Pods"
-        fail "$count evicted/failed pods"
-        detail="$count pods"
-        status="FAIL"
-    fi
-
-    if [[ "$FIX" == true && "$count" -gt 0 ]]; then
-        info "Deleting $count evicted/failed pods..."
-        $KUBECTL delete pods -A --field-selector=status.phase=Failed 2>/dev/null || true
-        info "Deleted evicted/failed pods"
-    fi
-    json_add "evicted_pods" "$status" "$detail"
-}
-
-# --- 6. DaemonSets ---
-check_daemonsets() {
-    section 6 "DaemonSets"
-    local ds detail="" had_issue=false
-
-    ds=$($KUBECTL get daemonsets -A --no-headers 2>&1) || { fail "Cannot list DaemonSets"; json_add "daemonsets" "FAIL" "Cannot list"; return 0; }
-
-    while IFS= read -r line; do
-        local ns name desired ready
-        ns=$(echo "$line" | awk '{print $1}')
-        name=$(echo "$line" | awk '{print $2}')
-        desired=$(echo "$line" | awk '{print $3}')
-        ready=$(echo "$line" | awk '{print $5}')
-
-        if [[ "$desired" != "$ready" ]]; then
-            [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 6 "DaemonSets"
-            fail "$ns/$name: desired=$desired ready=$ready"
-            detail+="$ns/$name desired=$desired ready=$ready; "
-            had_issue=true
-        fi
-    done <<< "$ds"
-
-    if [[ "$had_issue" == false ]]; then
-        pass "All DaemonSets healthy (desired == ready)"
-        json_add "daemonsets" "PASS" "All healthy"
-    else
-        json_add "daemonsets" "FAIL" "$detail"
-    fi
-}
-
-# --- 7. Deployments ---
-check_deployments() {
-    section 7 "Deployments"
-    local deps detail="" had_issue=false
-
-    deps=$($KUBECTL get deployments -A --no-headers 2>&1) || { fail "Cannot list Deployments"; json_add "deployments" "FAIL" "Cannot list"; return 0; }
-
-    while IFS= read -r line; do
-        local ns name ready current desired
-        ns=$(echo "$line" | awk '{print $1}')
-        name=$(echo "$line" | awk '{print $2}')
-        ready=$(echo "$line" | awk '{print $3}')
-        current=$(echo "$ready" | cut -d/ -f1)
-        desired=$(echo "$ready" | cut -d/ -f2)
-
-        if [[ "$current" != "$desired" ]]; then
-            [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 7 "Deployments"
-            fail "$ns/$name: $current/$desired ready"
-            detail+="$ns/$name $current/$desired; "
-            had_issue=true
-        fi
-    done <<< "$deps"
-
-    if [[ "$had_issue" == false ]]; then
-        pass "All deployments fully available"
-        json_add "deployments" "PASS" "All available"
-    else
-        json_add "deployments" "FAIL" "$detail"
-    fi
-}
-
-# --- 8. PVC Status ---
-check_pvcs() {
-    section 8 "PVC Status"
-    local pvcs detail="" had_issue=false
-
-    pvcs=$($KUBECTL get pvc -A --no-headers 2>&1) || true
-    if [[ -z "$pvcs" || "$pvcs" == *"No resources found"* ]]; then
-        pass "No PVCs in cluster"
-        json_add "pvcs" "PASS" "No PVCs"
-        return 0
-    fi
-
-    while IFS= read -r line; do
-        local ns name status
-        ns=$(echo "$line" | awk '{print $1}')
-        name=$(echo "$line" | awk '{print $2}')
-        status=$(echo "$line" | awk '{print $3}')
-
-        if [[ "$status" != "Bound" ]]; then
-            [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 8 "PVC Status"
-            fail "$ns/$name: $status"
-            detail+="$ns/$name=$status; "
-            had_issue=true
-        fi
-    done <<< "$pvcs"
-
-    if [[ "$had_issue" == false ]]; then
-        pass "All PVCs Bound"
-        json_add "pvcs" "PASS" "All Bound"
-    else
-        json_add "pvcs" "FAIL" "$detail"
-    fi
-}
-
-# --- 9. HPA Health ---
-check_hpa() {
-    section 9 "HPA Health"
-    local hpas detail="" had_issue=false status="PASS"
-
-    hpas=$($KUBECTL get hpa -A --no-headers 2>&1) || true
-    if [[ -z "$hpas" || "$hpas" == *"No resources found"* ]]; then
-        pass "No HPAs configured"
-        json_add "hpa" "PASS" "No HPAs"
-        return 0
-    fi
-
-    while IFS= read -r line; do
-        local ns name targets
-        ns=$(echo "$line" | awk '{print $1}')
-        name=$(echo "$line" | awk '{print $2}')
-        targets=$(echo "$line" | awk '{print $3}')
-
-        if echo "$targets" | grep -q '<unknown>'; then
-            [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 9 "HPA Health"
-            fail "$ns/$name: targets=$targets (unknown metrics)"
-            detail+="$ns/$name=unknown; "
-            had_issue=true
-            status="FAIL"
-        else
-            # Parse percentage values from targets like "45%/80%, 30%/50%"
-            local pcts
-            pcts=$(echo "$targets" | grep -oE '[0-9]+%/' | tr -d '%/' || true)
-            if [[ -n "$pcts" ]]; then
-                while IFS= read -r pct; do
-                    [[ -z "$pct" ]] && continue
-                    if [[ "$pct" -gt 150 ]]; then
-                        [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 9 "HPA Health"
-                        fail "$ns/$name: utilization at ${pct}%"
-                        detail+="$ns/$name=${pct}%; "
-                        had_issue=true
-                        status="FAIL"
-                        break
-                    elif [[ "$pct" -gt 100 ]]; then
-                        [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 9 "HPA Health"
-                        warn "$ns/$name: utilization at ${pct}%"
-                        detail+="$ns/$name=${pct}%; "
-                        had_issue=true
-                        [[ "$status" != "FAIL" ]] && status="WARN"
-                        break
-                    fi
-                done <<< "$pcts"
-            fi
-        fi
-    done <<< "$hpas"
-
-    [[ "$had_issue" == false ]] && pass "All HPAs healthy"
-    json_add "hpa" "$status" "${detail:-All healthy}"
-}
-
-# --- 10. CronJob Failures ---
-check_cronjobs() {
-    section 10 "CronJob Failures"
-    local failures detail=""
-
-    failures=$($KUBECTL get jobs -A -o json 2>/dev/null | python3 -c '
-import json, sys
-from datetime import datetime, timezone, timedelta
-
-data = json.load(sys.stdin)
-cutoff = datetime.now(timezone.utc) - timedelta(hours=24)
-
-for job in data.get("items", []):
-    meta = job.get("metadata", {})
-    ns = meta.get("namespace", "")
-    name = meta.get("name", "")
-
-    owners = meta.get("ownerReferences", [])
-    is_cronjob = any(o.get("kind") == "CronJob" for o in owners)
-    if not is_cronjob:
-        continue
-
-    conditions = job.get("status", {}).get("conditions", [])
-    for c in conditions:
-        if c.get("type") == "Failed" and c.get("status") == "True":
-            ts = c.get("lastTransitionTime", "")
-            if ts:
-                try:
-                    t = datetime.fromisoformat(ts.replace("Z", "+00:00"))
-                    if t > cutoff:
-                        print(f"{ns}/{name}: {c.get(\"reason\", \"Unknown\")}")
-                except:
-                    print(f"{ns}/{name}: {c.get(\"reason\", \"Unknown\")}")
-' 2>/dev/null) || true
-
-    if [[ -z "$failures" ]]; then
-        pass "No CronJob failures in last 24h"
-        json_add "cronjob_failures" "PASS" "None"
-    else
-        [[ "$QUIET" == true ]] && section_always 10 "CronJob Failures"
-        local count
-        count=$(count_lines "$failures")
-        fail "$count CronJob failure(s) in last 24h:"
-        [[ "$JSON" != true ]] && echo "$failures" | while IFS= read -r line; do echo "    $line"; done
-        json_add "cronjob_failures" "FAIL" "$count failures"
-    fi
-}
-
-# --- 11. CrowdSec ---
-check_crowdsec() {
-    section 11 "CrowdSec Agents"
-    local cs_pods not_running
-
-    cs_pods=$($KUBECTL get pods -n crowdsec --no-headers 2>/dev/null || true)
-    if [[ -z "$cs_pods" ]]; then
-        [[ "$QUIET" == true ]] && section_always 11 "CrowdSec Agents"
-        warn "CrowdSec namespace not found or empty"
-        json_add "crowdsec" "WARN" "No CrowdSec pods found"
-        return 0
-    fi
-
-    not_running=$(echo "$cs_pods" | awk '$3 != "Running" && $3 != "Completed" {print $1 ": " $3}' || true)
-    if [[ -n "$not_running" ]]; then
-        [[ "$QUIET" == true ]] && section_always 11 "CrowdSec Agents"
-        while IFS= read -r line; do
-            fail "CrowdSec pod not running: $line"
-        done <<< "$not_running"
-        json_add "crowdsec" "FAIL" "$not_running"
-    else
-        local total
-        total=$(count_lines "$cs_pods")
-        pass "All $total CrowdSec pods running"
-        json_add "crowdsec" "PASS" "$total pods running"
-    fi
-}
-
-# --- 12. Ingress ---
-check_ingresses() {
-    section 12 "Ingress Routes"
-    local ingresses no_lb detail="" had_issue=false
-
-    ingresses=$($KUBECTL get ingress -A --no-headers 2>/dev/null || true)
-    if [[ -n "$ingresses" ]]; then
-        no_lb=$(echo "$ingresses" | awk '{if ($5 == "" || $5 == "<none>") print $1"/"$2}' || true)
-        if [[ -n "$no_lb" ]]; then
-            [[ "$QUIET" == true ]] && section_always 12 "Ingress Routes"
-            while IFS= read -r line; do
-                fail "Ingress missing LB IP: $line"
-            done <<< "$no_lb"
-            detail="Missing LB: $no_lb"
-            had_issue=true
-        fi
-    fi
-
-    # Check Traefik LB service
-    local traefik_svc_ip
-    traefik_svc_ip=$($KUBECTL get svc -n traefik traefik -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || true)
-    if [[ -z "$traefik_svc_ip" ]]; then
-        [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 12 "Ingress Routes"
-        fail "Traefik LoadBalancer has no external IP"
-        detail+="Traefik LB missing IP; "
-        had_issue=true
-    else
-        detail+="Traefik LB=$traefik_svc_ip; "
-    fi
-
-    if [[ "$had_issue" == false ]]; then
-        pass "All ingresses have LB assignment (Traefik LB=$traefik_svc_ip)"
-        json_add "ingresses" "PASS" "$detail"
-    else
-        json_add "ingresses" "FAIL" "$detail"
-    fi
-}
-
-# --- 13. Prometheus Alerts ---
-check_alerts() {
-    section 13 "Prometheus Alerts"
-    local alerts firing_count
-
-    # Try alertmanager first, then prometheus server
-    alerts=$($KUBECTL exec -n monitoring deploy/prometheus-alertmanager -- \
-        wget -q -O- http://localhost:9093/api/v2/alerts 2>/dev/null || true)
-
-    if [[ -z "$alerts" ]]; then
-        alerts=$($KUBECTL exec -n monitoring deploy/prometheus-server -- \
-            wget -q -O- http://localhost:9090/api/v1/alerts 2>/dev/null || true)
-    fi
-
-    if [[ -z "$alerts" ]]; then
-        [[ "$QUIET" == true ]] && section_always 13 "Prometheus Alerts"
-        warn "Could not query Prometheus/Alertmanager"
-        json_add "prometheus_alerts" "WARN" "Cannot query"
-        return 0
-    fi
-
-    firing_count=$(echo "$alerts" | python3 -c '
-import json, sys
-try:
-    data = json.load(sys.stdin)
-    if isinstance(data, list):
-        active = [a for a in data if a.get("status", {}).get("state") == "active"]
-        count = len(active)
-        names = [a.get("labels", {}).get("alertname", "?") for a in active]
-        print(f"{count}:" + ",".join(names) if count > 0 else "0:")
-    elif isinstance(data, dict) and "data" in data:
-        alerts_list = data["data"].get("alerts", [])
-        firing = [a for a in alerts_list if a.get("state") == "firing"]
-        count = len(firing)
-        names = [a.get("labels", {}).get("alertname", "?") for a in firing]
-        print(f"{count}:" + ",".join(names) if count > 0 else "0:")
-    else:
-        print("0:")
-except:
-    print("-1:")
-' 2>/dev/null || echo "-1:")
-
-    local count names
-    count=$(echo "$firing_count" | cut -d: -f1)
-    names=$(echo "$firing_count" | cut -d: -f2-)
-
-    if [[ "$count" == "-1" ]]; then
-        [[ "$QUIET" == true ]] && section_always 13 "Prometheus Alerts"
-        warn "Failed to parse alert data"
-        json_add "prometheus_alerts" "WARN" "Parse error"
-    elif [[ "$count" -eq 0 ]]; then
-        pass "No firing alerts"
-        json_add "prometheus_alerts" "PASS" "0 firing"
-    elif [[ "$count" -le 3 ]]; then
-        [[ "$QUIET" == true ]] && section_always 13 "Prometheus Alerts"
-        warn "$count firing alert(s): $names"
-        json_add "prometheus_alerts" "WARN" "$count firing: $names"
-    else
-        [[ "$QUIET" == true ]] && section_always 13 "Prometheus Alerts"
-        fail "$count firing alerts: $names"
-        json_add "prometheus_alerts" "FAIL" "$count firing: $names"
-    fi
-}
-
-# --- 14. Uptime Kuma ---
-check_uptime_kuma() {
-    section 14 "Uptime Kuma Monitors"
-    local result
-
-    result=$(python3 -c '
-import sys, os
-try:
-    from uptime_kuma_api import UptimeKumaApi
-except ImportError:
-    print("ERROR:uptime-kuma-api not installed")
-    sys.exit(0)
-
-try:
-    password = os.environ.get("UPTIME_KUMA_PASSWORD", "")
-    if not password:
-        print("ERROR:UPTIME_KUMA_PASSWORD not set")
-        sys.exit(0)
-    api = UptimeKumaApi("https://uptime.viktorbarzin.me", timeout=120, wait_events=0.2)
-    api.login("admin", password)
-
-    monitors = api.get_monitors()
-    heartbeats = api.get_heartbeats()
-
-    internal_up = 0
-    internal_down = []
-    external_up = 0
-    external_down = []
-    paused_count = 0
-
-    for m in monitors:
-        mid = m.get("id")
-        name = m.get("name", "unknown")
-        active = m.get("active", True)
-        is_external = name.startswith("[External] ")
-
-        if not active:
-            paused_count += 1
-            continue
-
-        beats = heartbeats.get(mid, [])
-        if beats:
-            last_beat = beats[-1]
-            if isinstance(last_beat, list):
-                last_beat = last_beat[-1] if last_beat else {}
-            status = last_beat.get("status", 0) if isinstance(last_beat, dict) else 0
-            if hasattr(status, "value"):
-                status = status.value
-            is_up = (status == 1)
-        else:
-            is_up = False
-
-        if is_external:
-            if is_up:
-                external_up += 1
-            else:
-                external_down.append(name.replace("[External] ", ""))
-        else:
-            if is_up:
-                internal_up += 1
-            else:
-                internal_down.append(name)
-
-    api.disconnect()
-
-    int_down_names = ", ".join(internal_down) if internal_down else ""
-    ext_down_names = ", ".join(external_down) if external_down else ""
-    print(f"{len(internal_down)}:{internal_up}:{len(external_down)}:{external_up}:{paused_count}:{int_down_names}|{ext_down_names}")
-except Exception as e:
-    print(f"CONN_ERROR:{e}")
-' 2>/dev/null) || result="CONN_ERROR:python execution failed"
-
-    if [[ "$result" == "ERROR:"* ]]; then
-        [[ "$QUIET" == true ]] && section_always 14 "Uptime Kuma Monitors"
-        warn "Uptime Kuma: ${result#ERROR:}"
-        json_add "uptime_kuma" "WARN" "${result#ERROR:}"
-    elif [[ "$result" == "CONN_ERROR:"* ]]; then
-        [[ "$QUIET" == true ]] && section_always 14 "Uptime Kuma Monitors"
-        warn "Cannot connect to Uptime Kuma: ${result#CONN_ERROR:}"
-        json_add "uptime_kuma" "WARN" "Connection failed"
-    else
-        local int_down int_up ext_down ext_up paused_count down_details
-        int_down=$(echo "$result" | cut -d: -f1)
-        int_up=$(echo "$result" | cut -d: -f2)
-        ext_down=$(echo "$result" | cut -d: -f3)
-        ext_up=$(echo "$result" | cut -d: -f4)
-        paused_count=$(echo "$result" | cut -d: -f5)
-        down_details=$(echo "$result" | cut -d: -f6-)
-        local int_down_names="${down_details%%|*}"
-        local ext_down_names="${down_details#*|}"
-
-        local total_down=$((int_down + ext_down))
-        local total_up=$((int_up + ext_up))
-        local total_active=$((total_up + total_down))
-
-        if [[ "$total_down" -eq 0 ]]; then
-            pass "All monitors up — internal: ${int_up}, external: ${ext_up} ($paused_count paused)"
-            json_add "uptime_kuma" "PASS" "internal: $int_up up, external: $ext_up up, $paused_count paused"
-        else
-            [[ "$QUIET" == true ]] && section_always 14 "Uptime Kuma Monitors"
-            local details=""
-            [[ "$int_down" -gt 0 ]] && details="internal down($int_down): $int_down_names"
-            [[ "$ext_down" -gt 0 ]] && { [[ -n "$details" ]] && details="$details; "; details="${details}external down($ext_down): $ext_down_names"; }
-            if [[ "$total_down" -le 3 ]]; then
-                warn "$total_down/$total_active down: $details"
-                json_add "uptime_kuma" "WARN" "$details"
-            else
-                fail "$total_down/$total_active down: $details"
-                json_add "uptime_kuma" "FAIL" "$details"
-            fi
-        fi
-    fi
-}
-
-# --- 15. ResourceQuota Pressure ---
-check_resourcequota() {
-    section 15 "ResourceQuota Pressure"
-    local quotas detail="" had_issue=false status="PASS"
-
-    quotas=$($KUBECTL get resourcequota -A -o json 2>/dev/null) || { pass "No ResourceQuotas configured"; json_add "resourcequota" "PASS" "No quotas"; return 0; }
-
-    local pressure
-    pressure=$(echo "$quotas" | python3 -c '
-import json, sys, re
-
-def parse_cpu(val):
-    """Convert CPU value to millicores."""
-    val = str(val)
-    if val.endswith("m"):
-        return float(val[:-1])
-    return float(val) * 1000
-
-def parse_mem(val):
-    """Convert memory value to bytes."""
-    val = str(val)
-    units = {"Ki": 1024, "Mi": 1024**2, "Gi": 1024**3, "Ti": 1024**4}
-    for suffix, mult in units.items():
-        if val.endswith(suffix):
-            return float(val[:-len(suffix)]) * mult
-    # Plain bytes or numeric
-    return float(val)
-
-data = json.load(sys.stdin)
-for item in data.get("items", []):
-    ns = item["metadata"]["namespace"]
-    name = item["metadata"]["name"]
-    status = item.get("status", {})
-    hard = status.get("hard", {})
-    used = status.get("used", {})
-
-    for resource, hard_val in hard.items():
-        used_val = used.get(resource, "0")
-        try:
-            if "cpu" in resource:
-                h = parse_cpu(hard_val)
-                u = parse_cpu(used_val)
-            elif "memory" in resource or "storage" in resource:
-                h = parse_mem(hard_val)
-                u = parse_mem(used_val)
-            elif resource == "pods":
-                h = float(hard_val)
-                u = float(used_val)
-            else:
-                continue
-            if h <= 0:
-                continue
-            pct = (u / h) * 100
-            if pct > 80:
-                level = "FAIL" if pct > 95 else "WARN"
-                print(f"{level}:{ns}/{name}:{resource}:{pct:.0f}%")
-        except (ValueError, ZeroDivisionError):
-            pass
-' 2>/dev/null) || true
-
-    if [[ -z "$pressure" ]]; then
-        pass "All ResourceQuotas below 80% usage"
-        json_add "resourcequota" "PASS" "All below 80%"
-    else
-        [[ "$QUIET" == true ]] && section_always 15 "ResourceQuota Pressure"
-        while IFS= read -r line; do
-            local level ns_res resource pct
-            level=$(echo "$line" | cut -d: -f1)
-            ns_res=$(echo "$line" | cut -d: -f2)
-            resource=$(echo "$line" | cut -d: -f3)
-            pct=$(echo "$line" | cut -d: -f4)
-            if [[ "$level" == "FAIL" ]]; then
-                fail "$ns_res: $resource at $pct"
-                status="FAIL"
-            else
-                warn "$ns_res: $resource at $pct"
-                [[ "$status" != "FAIL" ]] && status="WARN"
-            fi
-            detail+="$ns_res $resource=$pct; "
-            had_issue=true
-        done <<< "$pressure"
-        json_add "resourcequota" "$status" "$detail"
-    fi
-}
-
-# --- 16. StatefulSets ---
-check_statefulsets() {
-    section 16 "StatefulSets"
-    local sts detail="" had_issue=false
-
-    sts=$($KUBECTL get statefulsets -A --no-headers 2>&1) || true
-    if [[ -z "$sts" || "$sts" == *"No resources found"* ]]; then
-        pass "No StatefulSets in cluster"
-        json_add "statefulsets" "PASS" "No StatefulSets"
-        return 0
-    fi
-
-    while IFS= read -r line; do
-        local ns name ready current desired
-        ns=$(echo "$line" | awk '{print $1}')
-        name=$(echo "$line" | awk '{print $2}')
-        ready=$(echo "$line" | awk '{print $3}')
-        current=$(echo "$ready" | cut -d/ -f1)
-        desired=$(echo "$ready" | cut -d/ -f2)
-
-        if [[ "$current" != "$desired" ]]; then
-            [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 16 "StatefulSets"
-            fail "$ns/$name: $current/$desired ready"
-            detail+="$ns/$name $current/$desired; "
-            had_issue=true
-        fi
-    done <<< "$sts"
-
-    if [[ "$had_issue" == false ]]; then
-        pass "All StatefulSets fully available"
-        json_add "statefulsets" "PASS" "All available"
-    else
-        json_add "statefulsets" "FAIL" "$detail"
-    fi
-}
-
-# --- 17. Node Disk Usage ---
-check_node_disk() {
-    section 17 "Node Disk Usage"
-    local node_json detail="" had_issue=false status="PASS"
-
-    node_json=$($KUBECTL get nodes -o json 2>/dev/null) || { fail "Cannot get node info"; json_add "node_disk" "FAIL" "Cannot get nodes"; return 0; }
-
-    local disk_info
-    disk_info=$(echo "$node_json" | python3 -c '
-import json, sys
-
-def parse_storage(val):
-    """Convert storage value to bytes."""
-    val = str(val)
-    units = {"Ki": 1024, "Mi": 1024**2, "Gi": 1024**3, "Ti": 1024**4}
-    for suffix, mult in units.items():
-        if val.endswith(suffix):
-            return float(val[:-len(suffix)]) * mult
-    return float(val)
-
-data = json.load(sys.stdin)
-for node in data["items"]:
-    name = node["metadata"]["name"]
-    cap = node["status"].get("capacity", {})
-    alloc = node["status"].get("allocatable", {})
-    es_cap = cap.get("ephemeral-storage", "0")
-    es_alloc = alloc.get("ephemeral-storage", "0")
-    try:
-        c = parse_storage(es_cap)
-        a = parse_storage(es_alloc)
-        if c > 0:
-            used_pct = ((c - a) / c) * 100
-            if used_pct > 70:  # Lower threshold after node2 containerd corruption incident
-                if used_pct > 85:
-                    level = "FAIL"  # Critical: Risk of containerd corruption
-                elif used_pct > 75:
-                    level = "WARN"  # Warning: Monitor closely
-                else:
-                    level = "WARN"  # Early warning
-                print(f"{level}:{name}:{used_pct:.0f}")
-    except (ValueError, ZeroDivisionError):
-        pass
-' 2>/dev/null) || true
-
-    if [[ -z "$disk_info" ]]; then
-        pass "All nodes below 70% ephemeral-storage usage"
-        json_add "node_disk" "PASS" "All below 70%"
-    else
-        [[ "$QUIET" == true ]] && section_always 17 "Node Disk Usage"
-        while IFS= read -r line; do
-            local level node pct
-            level=$(echo "$line" | cut -d: -f1)
-            node=$(echo "$line" | cut -d: -f2)
-            pct=$(echo "$line" | cut -d: -f3)
-            if [[ "$level" == "FAIL" ]]; then
-                fail "$node: ephemeral-storage at ${pct}%"
-                status="FAIL"
-            else
-                warn "$node: ephemeral-storage at ${pct}%"
-                [[ "$status" != "FAIL" ]] && status="WARN"
-            fi
-            detail+="$node=${pct}%; "
-            had_issue=true
-        done <<< "$disk_info"
-        json_add "node_disk" "$status" "$detail"
-    fi
-}
-
-# --- 18. Helm Release Health ---
-check_helm_releases() {
-    section 18 "Helm Release Health"
-
-    # Helm may not be available in the pod environment
-    if ! command -v helm &>/dev/null; then
-        pass "Helm not available (skipped)"
-        json_add "helm_releases" "PASS" "Helm not available"
-        return 0
-    fi
-
-    local releases detail="" had_issue=false status="PASS"
-
-    releases=$(helm list --all-namespaces --kubeconfig "$KUBECONFIG_PATH" -o json 2>/dev/null) || {
-        [[ "$QUIET" == true ]] && section_always 18 "Helm Release Health"
-        warn "Cannot list Helm releases"
-        json_add "helm_releases" "WARN" "Cannot list"
-        return 0
-    }
-
-    local bad_releases
-    bad_releases=$(echo "$releases" | python3 -c '
-import json, sys
-data = json.load(sys.stdin)
-for r in data:
-    name = r.get("name", "?")
-    ns = r.get("namespace", "?")
-    st = r.get("status", "unknown")
-    if st != "deployed":
-        level = "FAIL" if st.startswith("pending") else "WARN"
-        print(f"{level}:{ns}/{name}:{st}")
-' 2>/dev/null) || true
-
-    if [[ -z "$bad_releases" ]]; then
-        pass "All Helm releases in deployed state"
-        json_add "helm_releases" "PASS" "All deployed"
-    else
-        [[ "$QUIET" == true ]] && section_always 18 "Helm Release Health"
-        while IFS= read -r line; do
-            local level release_name release_status
-            level=$(echo "$line" | cut -d: -f1)
-            release_name=$(echo "$line" | cut -d: -f2)
-            release_status=$(echo "$line" | cut -d: -f3)
-            if [[ "$level" == "FAIL" ]]; then
-                fail "Helm release $release_name: $release_status (blocks terraform)"
-                status="FAIL"
-            else
-                warn "Helm release $release_name: $release_status"
-                [[ "$status" != "FAIL" ]] && status="WARN"
-            fi
-            detail+="$release_name=$release_status; "
-            had_issue=true
-        done <<< "$bad_releases"
-        json_add "helm_releases" "$status" "$detail"
-    fi
-}
-
-# --- 19. Kyverno Policy Engine ---
-check_kyverno() {
-    section 19 "Kyverno Policy Engine"
-    local kv_pods not_running
-
-    kv_pods=$($KUBECTL get pods -n kyverno --no-headers 2>/dev/null || true)
-    if [[ -z "$kv_pods" ]]; then
-        [[ "$QUIET" == true ]] && section_always 19 "Kyverno Policy Engine"
-        fail "Kyverno namespace not found or empty — policy engine down, cascading cluster impact"
-        json_add "kyverno" "FAIL" "No Kyverno pods found"
-        return 0
-    fi
-
-    not_running=$(echo "$kv_pods" | awk '$3 != "Running" && $3 != "Completed" {print $1 ": " $3}' || true)
-    if [[ -n "$not_running" ]]; then
-        [[ "$QUIET" == true ]] && section_always 19 "Kyverno Policy Engine"
-        while IFS= read -r line; do
-            fail "Kyverno pod not running: $line"
-        done <<< "$not_running"
-        json_add "kyverno" "FAIL" "$not_running"
-    else
-        local total
-        total=$(count_lines "$kv_pods")
-        pass "All $total Kyverno pods running"
-        json_add "kyverno" "PASS" "$total pods running"
-    fi
-}
-
-# --- 20. NFS Connectivity ---
-check_nfs() {
-    section 20 "NFS Connectivity"
-
-    # Try native tools first (available locally), fall back to kubectl-based check (pod environment)
-    if command -v showmount &>/dev/null; then
-        if showmount -e 192.168.1.127 &>/dev/null; then
-            pass "NFS server 192.168.1.127 reachable (exports listed)"
-            json_add "nfs" "PASS" "NFS reachable"
-            return 0
-        fi
-    fi
-
-    if command -v nc &>/dev/null; then
-        if nc -z -G 3 192.168.1.127 2049 &>/dev/null; then
-            pass "NFS server 192.168.1.127 port 2049 open"
-            json_add "nfs" "PASS" "NFS port open"
-            return 0
-        fi
-    fi
-
-    # Fallback: check if NFS-backed pods are running (works in pod environment)
-    local nfs_pods
-    nfs_pods=$($KUBECTL get pods -A -o json 2>/dev/null | python3 -c '
-import json, sys
-data = json.load(sys.stdin)
-count = 0
-for pod in data.get("items", []):
-    for vol in pod.get("spec", {}).get("volumes", []):
-        if "nfs" in vol:
-            if pod.get("status", {}).get("phase") == "Running":
-                count += 1
-            break
-print(count)
-' 2>/dev/null) || nfs_pods="0"
-
-    if [[ "$nfs_pods" -gt 0 ]]; then
-        pass "NFS healthy ($nfs_pods pods using NFS volumes are running)"
-        json_add "nfs" "PASS" "$nfs_pods NFS pods running"
-    else
-        [[ "$QUIET" == true ]] && section_always 20 "NFS Connectivity"
-        warn "Cannot verify NFS (showmount not available, no NFS pods found)"
-        json_add "nfs" "WARN" "Cannot verify"
-    fi
-}
-
-# --- 21. DNS Resolution ---
-check_dns() {
-    section 21 "DNS Resolution"
-    local internal_ok=false external_ok=false detail=""
-
-    # Try dig first (available locally), fall back to python3 (pod environment)
-    # Use system resolver (no @server) so it works from any host or pod
-    if command -v dig &>/dev/null; then
-        if dig viktorbarzin.me +short +time=3 +tries=1 2>/dev/null | grep -q .; then
-            internal_ok=true
-        fi
-        if dig google.com +short +time=3 +tries=1 2>/dev/null | grep -q .; then
-            external_ok=true
-        fi
-    else
-        # Fallback: use python3 for DNS resolution (works in pod environment)
-        local result
-        result=$(python3 -c "
-import socket
-try:
-    socket.getaddrinfo('viktorbarzin.me', 443)
-    print('INTERNAL_OK')
-except Exception:
-    print('INTERNAL_FAIL')
-try:
-    socket.getaddrinfo('google.com', 443)
-    print('EXTERNAL_OK')
-except Exception:
-    print('EXTERNAL_FAIL')
-" 2>/dev/null) || result=""
-
-        if echo "$result" | grep -q "INTERNAL_OK"; then
-            internal_ok=true
-        fi
-        if echo "$result" | grep -q "EXTERNAL_OK"; then
-            external_ok=true
-        fi
-    fi
-
-    if [[ "$internal_ok" == true && "$external_ok" == true ]]; then
-        pass "DNS resolves both internal (viktorbarzin.me) and external (google.com)"
-        json_add "dns" "PASS" "Both resolve"
-    elif [[ "$internal_ok" == true || "$external_ok" == true ]]; then
-        [[ "$QUIET" == true ]] && section_always 21 "DNS Resolution"
-        if [[ "$internal_ok" == false ]]; then
-            warn "DNS: internal (viktorbarzin.me) failed, external (google.com) OK"
-            detail="Internal failed"
-        else
-            warn "DNS: internal (viktorbarzin.me) OK, external (google.com) failed"
-            detail="External failed"
-        fi
-        json_add "dns" "WARN" "$detail"
-    else
-        [[ "$QUIET" == true ]] && section_always 21 "DNS Resolution"
-        fail "DNS not resolving — both internal and external failed"
-        json_add "dns" "FAIL" "Both failed"
-    fi
-}
-
-# --- 22. TLS Certificate Expiry ---
-check_tls_certs() {
-    section 22 "TLS Certificate Expiry"
-    local secrets detail="" had_issue=false status="PASS"
-
-    secrets=$($KUBECTL get secrets -A -o json 2>/dev/null) || {
-        [[ "$QUIET" == true ]] && section_always 22 "TLS Certificate Expiry"
-        warn "Cannot list secrets"
-        json_add "tls_certs" "WARN" "Cannot list secrets"
-        return 0
-    }
-
-    local cert_issues
-    cert_issues=$(echo "$secrets" | python3 -c '
-import json, sys, base64, subprocess, hashlib
-from datetime import datetime, timezone
-
-data = json.load(sys.stdin)
-seen_fingerprints = set()
-results = []
-
-for item in data.get("items", []):
-    if item.get("type") != "kubernetes.io/tls":
-        continue
-    ns = item["metadata"]["namespace"]
-    name = item["metadata"]["name"]
-    cert_data = item.get("data", {}).get("tls.crt", "")
-    if not cert_data:
-        continue
-
-    # Deduplicate by cert fingerprint
-    raw = base64.b64decode(cert_data)
-    fp = hashlib.sha256(raw).hexdigest()[:16]
-    if fp in seen_fingerprints:
-        continue
-    seen_fingerprints.add(fp)
-
-    # Parse certificate expiry with openssl
-    try:
-        result = subprocess.run(
-            ["openssl", "x509", "-noout", "-enddate", "-subject"],
-            input=raw, capture_output=True, timeout=5
-        )
-        output = result.stdout.decode()
-        for line in output.splitlines():
-            if line.startswith("notAfter="):
-                date_str = line.split("=", 1)[1]
-                # Parse openssl date format: "Mon DD HH:MM:SS YYYY GMT"
-                try:
-                    expiry = datetime.strptime(date_str.strip(), "%b %d %H:%M:%S %Y %Z")
-                    expiry = expiry.replace(tzinfo=timezone.utc)
-                    days_left = (expiry - datetime.now(timezone.utc)).days
-                    if days_left <= 7:
-                        print(f"FAIL:{ns}/{name}:{days_left}d")
-                    elif days_left <= 30:
-                        print(f"WARN:{ns}/{name}:{days_left}d")
-                except ValueError:
-                    pass
-    except (subprocess.TimeoutExpired, Exception):
-        pass
-' 2>/dev/null) || true
-
-    if [[ -z "$cert_issues" ]]; then
-        pass "All TLS certificates valid for >30 days"
-        json_add "tls_certs" "PASS" "All valid >30d"
-    else
-        [[ "$QUIET" == true ]] && section_always 22 "TLS Certificate Expiry"
-        while IFS= read -r line; do
-            local level cert_name days
-            level=$(echo "$line" | cut -d: -f1)
-            cert_name=$(echo "$line" | cut -d: -f2)
-            days=$(echo "$line" | cut -d: -f3)
-            if [[ "$level" == "FAIL" ]]; then
-                fail "TLS cert $cert_name expires in $days"
-                status="FAIL"
-            else
-                warn "TLS cert $cert_name expires in $days"
-                [[ "$status" != "FAIL" ]] && status="WARN"
-            fi
-            detail+="$cert_name=$days; "
-            had_issue=true
-        done <<< "$cert_issues"
-        json_add "tls_certs" "$status" "$detail"
-    fi
-}
-
-# --- 23. GPU Health ---
-check_gpu() {
-    section 23 "GPU Health"
-    local gpu_pods not_running
-
-    gpu_pods=$($KUBECTL get pods -n nvidia --no-headers 2>/dev/null || true)
-    if [[ -z "$gpu_pods" ]]; then
-        [[ "$QUIET" == true ]] && section_always 23 "GPU Health"
-        warn "NVIDIA namespace not found or empty"
-        json_add "gpu" "WARN" "No GPU pods found"
-        return 0
-    fi
-
-    # Check specifically for device-plugin (critical for GPU scheduling)
-    local device_plugin_down=false
-    local other_down=false
-    local detail=""
-
-    while IFS= read -r line; do
-        local pod_name pod_status
-        pod_name=$(echo "$line" | awk '{print $1}')
-        pod_status=$(echo "$line" | awk '{print $3}')
-        if [[ "$pod_status" != "Running" && "$pod_status" != "Completed" ]]; then
-            if echo "$pod_name" | grep -q "device-plugin"; then
-                device_plugin_down=true
-                detail+="device-plugin $pod_name: $pod_status; "
-            else
-                other_down=true
-                detail+="$pod_name: $pod_status; "
-            fi
-        fi
-    done <<< "$gpu_pods"
-
-    if [[ "$device_plugin_down" == true ]]; then
-        [[ "$QUIET" == true ]] && section_always 23 "GPU Health"
-        fail "GPU device-plugin is down — GPU workloads cannot schedule"
-        json_add "gpu" "FAIL" "$detail"
-    elif [[ "$other_down" == true ]]; then
-        [[ "$QUIET" == true ]] && section_always 23 "GPU Health"
-        warn "Some GPU pods not running: $detail"
-        json_add "gpu" "WARN" "$detail"
-    else
-        local total
-        total=$(count_lines "$gpu_pods")
-        pass "All $total GPU pods running"
-        json_add "gpu" "PASS" "$total pods running"
-    fi
-}
-
-# --- 24. Cloudflare Tunnel ---
-check_cloudflare_tunnel() {
-    section 24 "Cloudflare Tunnel"
-    local cf_pods running_count total_count
-
-    cf_pods=$($KUBECTL get pods -n cloudflared --no-headers 2>/dev/null || true)
-    if [[ -z "$cf_pods" ]]; then
-        [[ "$QUIET" == true ]] && section_always 24 "Cloudflare Tunnel"
-        fail "Cloudflare tunnel namespace not found or empty — external access broken"
-        json_add "cloudflare_tunnel" "FAIL" "No pods found"
-        return 0
-    fi
-
-    total_count=$(count_lines "$cf_pods")
-    running_count=$(echo "$cf_pods" | awk '$3 == "Running"' | wc -l | tr -d ' ')
-
-    if [[ "$running_count" -eq 0 ]]; then
-        [[ "$QUIET" == true ]] && section_always 24 "Cloudflare Tunnel"
-        fail "Cloudflare tunnel: 0/$total_count pods running — external access broken"
-        json_add "cloudflare_tunnel" "FAIL" "0/$total_count running"
-    elif [[ "$running_count" -lt "$total_count" ]]; then
-        [[ "$QUIET" == true ]] && section_always 24 "Cloudflare Tunnel"
-        warn "Cloudflare tunnel: $running_count/$total_count pods running (degraded)"
-        json_add "cloudflare_tunnel" "WARN" "$running_count/$total_count running"
-    else
-        pass "Cloudflare tunnel: all $total_count pods running"
-        json_add "cloudflare_tunnel" "PASS" "$total_count pods running"
-    fi
-}
-
-# --- 25. Advanced CPU Monitoring (Prometheus) ---
-check_prometheus_cpu() {
-    section 25 "Advanced CPU Monitoring"
-    local cpu_query="100%20-%20(avg%20by%20(instance)%20(irate(node_cpu_seconds_total%7Bmode%3D%22idle%22%7D%5B5m%5D))%20*%20100)"
-    local detail="" had_issue=false status="PASS"
-
-    # Start port-forward to Prometheus if not using in-cluster DNS
-    local prom_url pf_pid=""
-    if curl -s --connect-timeout 2 "http://prometheus-server.monitoring.svc.cluster.local/api/v1/query?query=up" &>/dev/null; then
-        prom_url="http://prometheus-server.monitoring.svc.cluster.local/api/v1/query"
-    else
-        local pf_port
-        pf_port=$(python3 -c 'import socket; s=socket.socket(); s.bind(("",0)); print(s.getsockname()[1]); s.close()')
-        $KUBECTL port-forward -n monitoring svc/prometheus-server "$pf_port:80" &>/dev/null &
-        pf_pid=$!
-        sleep 2
-        prom_url="http://127.0.0.1:${pf_port}/api/v1/query"
-    fi
-    # Cleanup port-forward on exit from this function
-    trap '[[ -n "$pf_pid" ]] && kill $pf_pid 2>/dev/null || true' RETURN
-
-    # Try to query Prometheus for CPU metrics
-    local cpu_data
-    cpu_data=$(curl -s --connect-timeout 10 "${prom_url}?query=${cpu_query}" 2>/dev/null) || {
-        warn "Prometheus not accessible for CPU monitoring"
-        json_add "prometheus_cpu" "WARN" "Prometheus unreachable"
-        return 0
-    }
-    
-    # Parse JSON and check CPU usage
-    local cpu_results
-    cpu_results=$(echo "$cpu_data" | python3 -c "
-import json, sys
-try:
-    data = json.load(sys.stdin)
-    if data.get('status') == 'success':
-        for result in data['data']['result']:
-            instance = result['metric']['instance']
-            usage = float(result['value'][1])
-            # Map IP to node name  
-            if '10.0.20.100' in instance:
-                node = 'k8s-master'
-            elif '10.0.20.101' in instance:
-                node = 'k8s-node1' 
-            elif '10.0.20.102' in instance:
-                node = 'k8s-node2'
-            elif '10.0.20.103' in instance:
-                node = 'k8s-node3'
-            elif '10.0.20.104' in instance:
-                node = 'k8s-node4'
-            elif 'pve-node' in instance:
-                node = 'proxmox-host'
-            else:
-                node = instance
-            print(f'{node}:{usage:.1f}')
-except Exception as e:
-    print(f'ERROR:{e}')
-" 2>/dev/null) || true
-    
-    if [[ "$cpu_results" == *"ERROR"* || -z "$cpu_results" ]]; then
-        warn "Failed to parse Prometheus CPU data"
-        json_add "prometheus_cpu" "WARN" "Parse failed"
-        return 0
-    fi
-    
-    # Check CPU thresholds
-    while IFS=':' read -r node usage; do
-        [[ -z "$node" || -z "$usage" ]] && continue
-        usage_int=${usage%.*}  # Remove decimal
-        
-        if [[ "$usage_int" -gt 85 ]]; then
-            [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 25 "Advanced CPU Monitoring"
-            fail "$node: ${usage}% CPU (critical)"
-            detail+="$node=${usage}% [CRIT]; "
-            had_issue=true
-            status="FAIL"
-        elif [[ "$usage_int" -gt 70 ]]; then
-            [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 25 "Advanced CPU Monitoring" 
-            warn "$node: ${usage}% CPU (high)"
-            detail+="$node=${usage}% [HIGH]; "
-            had_issue=true
-            [[ "$status" != "FAIL" ]] && status="WARN"
-        else
-            detail+="$node=${usage}% [OK]; "
-        fi
-    done <<< "$cpu_results"
-    
-    [[ "$had_issue" == false ]] && pass "All nodes below 70% CPU usage (5m avg)"
-    json_add "prometheus_cpu" "$status" "$detail"
-}
-
-# --- 26. Power Monitoring ---
-check_power_monitoring() {
-    section 26 "Power Monitoring"
-    local detail="" had_issue=false status="PASS"
-
-    # Start port-forward to Prometheus if not using in-cluster DNS
-    local prom_url pf_pid=""
-    if curl -s --connect-timeout 2 "http://prometheus-server.monitoring.svc.cluster.local/api/v1/query?query=up" &>/dev/null; then
-        prom_url="http://prometheus-server.monitoring.svc.cluster.local/api/v1/query"
-    else
-        local pf_port
-        pf_port=$(python3 -c 'import socket; s=socket.socket(); s.bind(("",0)); print(s.getsockname()[1]); s.close()')
-        $KUBECTL port-forward -n monitoring svc/prometheus-server "$pf_port:80" &>/dev/null &
-        pf_pid=$!
-        sleep 2
-        prom_url="http://127.0.0.1:${pf_port}/api/v1/query"
-    fi
-    trap '[[ -n "$pf_pid" ]] && kill $pf_pid 2>/dev/null || true' RETURN
-
-    # GPU Power monitoring
-    local gpu_query="DCGM_FI_DEV_POWER_USAGE"
-    local gpu_data
-    gpu_data=$(curl -s --connect-timeout 10 "${prom_url}?query=${gpu_query}" 2>/dev/null) || {
-        [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 26 "Power Monitoring"
-        warn "GPU power metrics unavailable"
-        detail+="GPU metrics unavailable; "
-        had_issue=true
-        status="WARN"
-    }
-    
-    if [[ -n "$gpu_data" && "$gpu_data" != *"error"* ]]; then
-        local gpu_results
-        gpu_results=$(echo "$gpu_data" | python3 -c "
-import json, sys
-try:
-    data = json.load(sys.stdin)
-    if data.get('status') == 'success':
-        for result in data['data']['result']:
-            hostname = result['metric'].get('Hostname', 'unknown')
-            power = float(result['value'][1])
-            print(f'{hostname}:{power:.1f}')
-except Exception:
-    pass
-" 2>/dev/null) || true
-        
-        # Check GPU power thresholds (Tesla T4 TDP is ~70W)
-        while IFS=':' read -r node power; do
-            [[ -z "$node" || -z "$power" ]] && continue
-            power_int=${power%.*}
-            
-            if [[ "$power_int" -gt 65 ]]; then  # > 90% of T4 TDP
-                [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 26 "Power Monitoring"
-                warn "GPU $node: ${power}W (high power draw)"
-                detail+="GPU-$node=${power}W [HIGH]; "
-                had_issue=true
-                [[ "$status" != "FAIL" ]] && status="WARN"
-            elif [[ "$power_int" -gt 50 ]]; then  # > 70% of T4 TDP
-                detail+="GPU-$node=${power}W [ACTIVE]; "
-            else
-                detail+="GPU-$node=${power}W [IDLE]; "
-            fi
-        done <<< "$gpu_results"
-    fi
-    
-    [[ "$had_issue" == false ]] && pass "Power consumption within normal ranges"
-    json_add "power_monitoring" "$status" "$detail"
-}
-
-# --- Summary ---
-print_summary() {
-    if [[ "$JSON" == true ]]; then
-        echo "{"
-        echo "  \"timestamp\": \"$(date -u +%Y-%m-%dT%H:%M:%SZ)\","
-        echo "  \"pass\": $PASS_COUNT,"
-        echo "  \"warn\": $WARN_COUNT,"
-        echo "  \"fail\": $FAIL_COUNT,"
-        echo "  \"checks\": ["
-        local first=true
-        for r in "${JSON_RESULTS[@]}"; do
-            if [[ "$first" == true ]]; then
-                echo "    $r"
-                first=false
-            else
-                echo "    ,$r"
-            fi
-        done
-        echo "  ]"
-        echo "}"
-        return 0
-    fi
-
-    echo ""
-    echo -e "${BOLD}═══════════════════════════════════════${NC}"
-    echo -e "${BOLD}  Cluster Health Summary${NC}"
-    echo -e "${BOLD}═══════════════════════════════════════${NC}"
-    echo -e "  ${GREEN}PASS${NC}: $PASS_COUNT    ${YELLOW}WARN${NC}: $WARN_COUNT    ${RED}FAIL${NC}: $FAIL_COUNT"
-    echo ""
-
-    if [[ "$FAIL_COUNT" -gt 0 ]]; then
-        echo -e "  Overall: ${RED}UNHEALTHY${NC}"
-    elif [[ "$WARN_COUNT" -gt 0 ]]; then
-        echo -e "  Overall: ${YELLOW}DEGRADED${NC}"
-    else
-        echo -e "  Overall: ${GREEN}HEALTHY${NC}"
-    fi
-    echo ""
-}
-
-# --- Slack Notification ---
-
-# Human-readable check name mapping
-friendly_check_name() {
-    case "$1" in
-        node_status)        echo "Node Status" ;;
-        node_resources)     echo "Node Resources" ;;
-        node_conditions)    echo "Node Conditions" ;;
-        problematic_pods)   echo "Problematic Pods" ;;
-        evicted_pods)       echo "Evicted Pods" ;;
-        daemonsets)         echo "DaemonSets" ;;
-        deployments)        echo "Deployments" ;;
-        pvcs)               echo "PVCs" ;;
-        hpa)                echo "HPAs" ;;
-        cronjob_failures)   echo "CronJob Failures" ;;
-        crowdsec)           echo "CrowdSec" ;;
-        ingresses)          echo "Ingresses" ;;
-        prometheus_alerts)  echo "Prometheus Alerts" ;;
-        uptime_kuma)        echo "Uptime Kuma" ;;
-        resourcequota)      echo "Resource Quotas" ;;
-        statefulsets)       echo "StatefulSets" ;;
-        node_disk)          echo "Node Disk" ;;
-        helm_releases)      echo "Helm Releases" ;;
-        kyverno)            echo "Kyverno" ;;
-        nfs)                echo "NFS Storage" ;;
-        dns)                echo "DNS Resolution" ;;
-        tls_certs)          echo "TLS Certificates" ;;
-        gpu)                echo "GPU" ;;
-        cloudflare_tunnel)  echo "Cloudflare Tunnel" ;;
-        prometheus_cpu)     echo "Advanced CPU Monitoring" ;;
-        power_monitoring)   echo "Power Monitoring" ;;
-        *)                  echo "$1" ;;
-    esac
-}
-
-send_slack() {
-    if [[ "$SEND_SLACK" != true ]]; then
-        return 0
-    fi
-    if [[ -z "${SLACK_WEBHOOK_URL:-}" ]]; then
-        [[ "$JSON" != true ]] && echo "WARNING: SLACK_WEBHOOK_URL not set, skipping Slack notification"
-        return 0
-    fi
-
-    # Gather stats for summary line
-    local node_count pod_count
-    node_count=$($KUBECTL get nodes --no-headers 2>/dev/null | wc -l | tr -d ' ')
-    pod_count=$($KUBECTL get pods -A --no-headers --field-selector=status.phase=Running 2>/dev/null | wc -l | tr -d ' ')
-
-    local total_checks=$((PASS_COUNT + WARN_COUNT + FAIL_COUNT))
-
-    # Use python3 to build the entire Slack payload from JSON_RESULTS
-    local json_results_str
-    json_results_str=$(printf '%s\n' "${JSON_RESULTS[@]}")
-
-    local json_payload
-    json_payload=$(echo "$json_results_str" | python3 -c "
-import json, sys
-
-CHECK_NAMES = {
-    'node_status': 'Node Status',
-    'node_resources': 'Node Resources',
-    'node_conditions': 'Node Conditions',
-    'problematic_pods': 'Problematic Pods',
-    'evicted_pods': 'Evicted Pods',
-    'daemonsets': 'DaemonSets',
-    'deployments': 'Deployments',
-    'pvcs': 'PVCs',
-    'hpa': 'HPAs',
-    'cronjob_failures': 'CronJob Failures',
-    'crowdsec': 'CrowdSec',
-    'ingresses': 'Ingresses',
-    'prometheus_alerts': 'Prometheus Alerts',
-    'uptime_kuma': 'Uptime Kuma',
-    'resourcequota': 'Resource Quotas',
-    'statefulsets': 'StatefulSets',
-    'node_disk': 'Node Disk',
-    'helm_releases': 'Helm Releases',
-    'kyverno': 'Kyverno',
-    'nfs': 'NFS Storage',
-    'dns': 'DNS Resolution',
-    'tls_certs': 'TLS Certificates',
-    'gpu': 'GPU',
-    'cloudflare_tunnel': 'Cloudflare Tunnel',
-    'prometheus_cpu': 'Advanced CPU Monitoring',
-    'power_monitoring': 'Power Monitoring',
-}
-
-def format_detail(check, detail):
-    \"\"\"Format detail text for readability. Truncate long lists, split semicolons.\"\"\"
-    detail = detail.rstrip('; ').strip()
-
-    # For checks with long comma-separated lists (e.g. Uptime Kuma down monitors),
-    # truncate to first 5 items with a count
-    if check == 'uptime_kuma' and ': ' in detail:
-        prefix, names_str = detail.split(': ', 1)
-        names = [n.strip() for n in names_str.split(',') if n.strip()]
-        if len(names) > 5:
-            shown = ', '.join(names[:5])
-            detail = f'{prefix}: {shown} (+{len(names) - 5} more)'
-        elif names:
-            detail = prefix + ': ' + ', '.join(names)
-
-    # For resource quotas and similar semicolon-separated items,
-    # split into separate lines
-    if '; ' in detail:
-        parts = [p.strip() for p in detail.split(';') if p.strip()]
-        if len(parts) > 1:
-            lines = '\\n'.join(f'     \u2022 {p}' for p in parts)
-            return lines
-
-    return detail
-
-# Parse results
-fails = []
-warns = []
-for line in sys.stdin:
-    line = line.strip()
-    if not line:
-        continue
-    try:
-        d = json.loads(line)
-    except json.JSONDecodeError:
-        continue
-    status = d.get('status', '')
-    check = d.get('check', '')
-    detail = d.get('detail', '')
-    name = CHECK_NAMES.get(check, check)
-    formatted = format_detail(check, detail)
-
-    if status == 'FAIL':
-        fails.append((name, formatted))
-    elif status == 'WARN':
-        warns.append((name, formatted))
-
-pass_count = ${PASS_COUNT}
-warn_count = ${WARN_COUNT}
-fail_count = ${FAIL_COUNT}
-total = ${total_checks}
-nodes = '${node_count}'
-pods = '${pod_count}'
-
-blocks = []
-
-# Header block
-if fail_count == 0 and warn_count == 0:
-    header = f':white_check_mark: *Cluster Health Check \u2014 All Clear*'
-    summary = f'{total}/{total} checks passed \u2022 {nodes} nodes \u2022 {pods} pods'
-    blocks.append({'type': 'section', 'text': {'type': 'mrkdwn', 'text': f'{header}\n{summary}'}})
-else:
-    issue_count = fail_count + warn_count
-    emoji = ':rotating_light:' if fail_count > 0 else ':warning:'
-    header = f'{emoji} *Cluster Health Check \u2014 {issue_count} Issue(s)*'
-    summary = f':white_check_mark: {pass_count} passed \u2022 :warning: {warn_count} warnings \u2022 :x: {fail_count} failed'
-    blocks.append({'type': 'section', 'text': {'type': 'mrkdwn', 'text': f'{header}\n{summary}'}})
-
-# Failed section
-if fails:
-    blocks.append({'type': 'divider'})
-    lines = [':x: *Failed*']
-    for name, detail in fails:
-        if '\\n' in detail:
-            lines.append(f'\u2022 *{name}*:')
-            lines.append(detail)
-        else:
-            lines.append(f'\u2022 *{name}*: {detail}')
-    blocks.append({'type': 'section', 'text': {'type': 'mrkdwn', 'text': '\\n'.join(lines)}})
-
-# Warnings section
-if warns:
-    blocks.append({'type': 'divider'})
-    lines = [':warning: *Warnings*']
-    for name, detail in warns:
-        if '\\n' in detail:
-            lines.append(f'\u2022 *{name}*:')
-            lines.append(detail)
-        else:
-            lines.append(f'\u2022 *{name}*: {detail}')
-    blocks.append({'type': 'section', 'text': {'type': 'mrkdwn', 'text': '\\n'.join(lines)}})
-
-# Footer with timestamp
-from datetime import datetime, timezone
-ts = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')
-blocks.append({'type': 'context', 'elements': [{'type': 'mrkdwn', 'text': f'{nodes} nodes \u2022 {pods} pods \u2022 {ts}'}]})
-
-payload = {'blocks': blocks}
-print(json.dumps(payload))
-")
-
-    curl -s -X POST "$SLACK_WEBHOOK_URL" \
-        -H 'Content-Type: application/json' \
-        -d "$json_payload" >/dev/null 2>&1 || {
-        [[ "$JSON" != true ]] && echo "WARNING: Failed to send Slack notification"
-    }
-
-    [[ "$JSON" != true ]] && echo "Slack notification sent."
-}
-
-# --- Main ---
-main() {
-    parse_args "$@"
-
-    if [[ "$JSON" != true ]]; then
-        echo -e "${BOLD}Cluster Health Check${NC} — $(date '+%Y-%m-%d %H:%M:%S')"
-        echo -e "Kubeconfig: $KUBECONFIG_PATH"
-        if [[ "$FIX" == true ]]; then
-            echo -e "${YELLOW}Auto-fix mode enabled${NC}"
-        fi
-    fi
-
-    check_nodes
-    check_resources
-    check_conditions
-    check_pods
-    check_evicted
-    check_daemonsets
-    check_deployments
-    check_pvcs
-    check_hpa
-    check_cronjobs
-    check_crowdsec
-    check_ingresses
-    check_alerts
-    check_uptime_kuma
-    check_resourcequota
-    check_statefulsets
-    check_node_disk
-    check_helm_releases
-    check_kyverno
-    check_nfs
-    check_dns
-    check_tls_certs
-    check_gpu
-    check_cloudflare_tunnel
-    check_prometheus_cpu
-    check_power_monitoring
-    print_summary
-    send_slack
-
-    # Always exit 0 — reporting is done via Slack notification.
-    # Non-zero exits mark the CronJob as Failed, which triggers Prometheus
-    # JobFailed alerts, creating a circular alert loop.
-    exit 0
-}
-
-main "$@"
diff --git a/.claude/skills/cluster-health/SKILL.md b/.claude/skills/cluster-health/SKILL.md
index be18fc9f..ef3ae25f 100644
--- a/.claude/skills/cluster-health/SKILL.md
+++ b/.claude/skills/cluster-health/SKILL.md
@@ -7,339 +7,314 @@ description: |
   (3) User asks to fix stuck pods, evicted pods, or CrashLoopBackOff,
   (4) User mentions "health check", "cluster status", "cluster health",
   (5) User asks "is everything running" or "any problems".
-  Runs 8 standard K8s health checks with safe auto-fix for evicted pods
-  and stuck CrashLoopBackOff pods.
+  Runs 42 cluster-wide checks (nodes, workloads, monitoring, certs,
+  backups, external reachability) with safe auto-fix for evicted pods.
 author: Claude Code
-version: 1.0.0
-date: 2026-02-21
+version: 2.0.0
+date: 2026-04-19
 ---
 
 # Cluster Health Check
 
-## Overview
+## MANDATORY: Run the script first
 
-- **Script**: `/workspace/infra/.claude/cluster-health.sh`
-- **Schedule**: CronJob runs every 30 minutes in the `openclaw` namespace
-- **Slack notifications**: Posts results to the webhook URL in `$SLACK_WEBHOOK_URL`
-- **Auto-fix**: Automatically deletes evicted/failed pods and CrashLoopBackOff pods with >10 restarts
-- **Exit code**: 0 = healthy, 1 = issues found
-
-## Quick Check
-
-Run the health check interactively:
+When this skill is invoked, your **first action** must be to run the
+cluster health check script and reason over its output before doing
+anything else. Do not improvise individual `kubectl` calls — the
+script is the authoritative surface.
 
 ```bash
-# Report only, no Slack notification
-bash /workspace/infra/.claude/cluster-health.sh --no-slack
-
-# Full run with Slack notification
-bash /workspace/infra/.claude/cluster-health.sh
-
-# Report only, no auto-fix and no Slack
-bash /workspace/infra/.claude/cluster-health.sh --no-fix --no-slack
+cd /home/wizard/code
+bash infra/scripts/cluster_healthcheck.sh --json | tee /tmp/cluster-health.json
 ```
 
-## What It Checks
+If the session is rooted elsewhere, fall back to the absolute path:
 
-| # | Check | Auto-Fix | Alerts |
-|---|-------|----------|--------|
-| 1 | **Node Health** — NotReady nodes, MemoryPressure, DiskPressure, PIDPressure | No | Yes |
-| 2 | **Pod Health** — CrashLoopBackOff, ImagePullBackOff, ErrImagePull, Error | Yes (CrashLoop >10 restarts) | Yes |
-| 3 | **Evicted/Failed Pods** — Pods in `Failed` phase | Yes (deletes all) | Yes |
-| 4 | **Failed Deployments** — Deployments with ready != desired replicas | No | Yes |
-| 5 | **Pending PVCs** — PersistentVolumeClaims not in `Bound` state | No | Yes |
-| 6 | **Resource Pressure** — Node CPU or memory >80% (warn) or >90% (issue) | No | Yes |
-| 7 | **CronJob Failures** — Failed CronJob-owned Jobs in the last 24h | No | Yes |
-| 8 | **DaemonSet Health** — DaemonSets with desired != ready | No | Yes |
+```bash
+bash /home/wizard/code/infra/scripts/cluster_healthcheck.sh --json
+```
+
+Then:
+
+1. Parse the JSON. Report the PASS/WARN/FAIL counts + overall verdict.
+2. Iterate every FAIL and WARN check, describe what tripped, and propose
+   the remediation path (use the recipes below).
+3. Only reach for ad-hoc `kubectl` commands when investigating a
+   specific failure beyond what the script reported.
+
+Exit codes: `0` = healthy, `1` = warnings only, `2` = failures.
+
+## Quick flags
+
+```bash
+# Human-readable report (default), no auto-fix
+bash infra/scripts/cluster_healthcheck.sh
+
+# Machine-readable JSON summary
+bash infra/scripts/cluster_healthcheck.sh --json
+
+# Only show WARN + FAIL (suppress PASS noise)
+bash infra/scripts/cluster_healthcheck.sh --quiet
+
+# Enable auto-fix (delete evicted pods, kick stuck CrashLoop pods)
+bash infra/scripts/cluster_healthcheck.sh --fix
+
+# Combined: quiet JSON without auto-fix
+bash infra/scripts/cluster_healthcheck.sh --no-fix --quiet --json
+
+# Custom kubeconfig
+bash infra/scripts/cluster_healthcheck.sh --kubeconfig /path/to/config
+```
+
+## What It Checks (42 checks)
+
+| # | Check | Notes |
+|---|-------|-------|
+| 1 | Node Status | NotReady nodes, version drift |
+| 2 | Node Resources | CPU/mem >80% (warn) / >90% (fail) |
+| 3 | Node Conditions | MemoryPressure / DiskPressure / PIDPressure |
+| 4 | Problematic Pods | CrashLoopBackOff / Error / ImagePullBackOff |
+| 5 | Evicted/Failed Pods | `status.phase=Failed` |
+| 6 | DaemonSets | desired == ready |
+| 7 | Deployments | ready == desired replicas |
+| 8 | PVC Status | all Bound |
+| 9 | HPA Health | targets not `<unknown>`, utilization <100% |
+| 10 | CronJob Failures | job conditions `Failed=True` in last 24h |
+| 11 | CrowdSec Agents | all pods Running |
+| 12 | Ingress Routes | every ingress has an LB IP + Traefik LB |
+| 13 | Prometheus Alerts | count of firing alerts |
+| 14 | Uptime Kuma Monitors | internal + external monitors up |
+| 15 | ResourceQuota Pressure | any quota >80% used |
+| 16 | StatefulSets | ready == desired |
+| 17 | Node Disk Usage | ephemeral-storage <80% |
+| 18 | Helm Release Health | all `deployed` (no `pending-*`) |
+| 19 | Kyverno Policy Engine | all pods Running |
+| 20 | NFS Connectivity | 192.168.1.127 showmount / port 2049 |
+| 21 | DNS Resolution | Technitium resolves internal + external |
+| 22 | TLS Certificate Expiry | TLS `Secret` certs >30d valid |
+| 23 | GPU Health | nvidia namespace + device-plugin Running |
+| 24 | Cloudflare Tunnel | pods Running |
+| 25 | Resource Usage | node CPU/mem headroom |
+| 26 | HA Sofia — Entity Availability | Home Assistant unavailable/unknown count |
+| 27 | HA Sofia — Integration Health | config entries setup_error / not_loaded |
+| 28 | HA Sofia — Automation Status | disabled / stale (>30d) automations |
+| 29 | HA Sofia — System Resources | HA CPU / mem / disk |
+| 30 | Hardware Exporters | snmp / idrac-redfish / proxmox / tuya pods + scrapes |
+| 31 | cert-manager — Certificate Readiness | Certificate CRs with `Ready!=True` |
+| 32 | cert-manager — Certificate Expiry (<14d) | notAfter within 14d |
+| 33 | cert-manager — Failed CertificateRequests | `Ready=False, reason=Failed` |
+| 34 | Backup Freshness — Per-DB Dumps | MySQL + PG dumps within 25h |
+| 35 | Backup Freshness — Offsite Sync | Pushgateway `backup_last_success_timestamp` <27h |
+| 36 | Backup Freshness — LVM PVC Snapshots | newest thin snapshot <25h (SSH PVE) |
+| 37 | Monitoring — Prometheus + Alertmanager | `/-/ready` + AM pods Running |
+| 38 | Monitoring — Vault Sealed Status | `vault status` reports `Sealed: false` |
+| 39 | Monitoring — ClusterSecretStore Ready | `vault-kv` + `vault-database` Ready |
+| 40 | External — Cloudflared + Authentik Replicas | deployments fully ready |
+| 41 | External — ExternalAccessDivergence Alert | alert not firing |
+| 42 | External — Traefik 5xx Rate (15m) | top-10 services emitting 5xx |
 
 ## Safe Auto-Fix Rules
 
-### Safe to auto-fix (the script does these automatically)
+`--fix` only performs operations that are genuinely reversible and
+observable. Nothing here rewrites Terraform state or mutates the cluster
+beyond "delete pod".
 
-1. **Evicted/Failed pods** — These are already terminated and just cluttering the namespace:
-   ```bash
-   kubectl delete pods -A --field-selector=status.phase=Failed
-   ```
+### Done automatically by `--fix`
 
-2. **CrashLoopBackOff pods with >10 restarts** — The pod is stuck in a crash loop; deleting lets the controller recreate it with a fresh backoff timer:
-   ```bash
-   kubectl delete pod -n <namespace> <pod-name> --grace-period=0
-   ```
+- **Evicted / Failed pods** — delete them; they are already terminated, and any owning controller has already created replacements.
+  ```bash
+  kubectl delete pods -A --field-selector=status.phase=Failed
+  ```
+- **CrashLoopBackOff pods with >10 restarts** — delete once to reset
+  backoff timer.
 
 ### NEVER auto-fix (requires human investigation)
 
-- **NotReady nodes** — Could be network, kubelet, or hardware issue; needs SSH investigation
-- **DiskPressure / MemoryPressure / PIDPressure** — Root cause must be identified
-- **ImagePullBackOff** — Usually a wrong image tag or registry issue; needs config fix
-- **Failed deployments** — Could be resource limits, bad config, missing secrets
-- **Pending PVCs** — Usually NFS export missing or storage class issue
-- **Resource pressure >90%** — Need to identify which pods are consuming resources
-- **CronJob failures** — Need to check job logs to understand why it failed
-- **DaemonSet issues** — Could be node taints, resource limits, or image issues
+- NotReady nodes
+- MemoryPressure / DiskPressure / PIDPressure
+- ImagePullBackOff (usually a bad tag / registry credential)
+- Deployment ready-replica mismatch
+- Pending PVCs
+- Node CPU/memory >90%
+- CronJob failures
+- DaemonSet desired != ready
+- Vault sealed
+- ClusterSecretStore not Ready
+- cert-manager Certificate failures
+- Backup freshness regressions
+- Any external-reachability failure
 
-## Deep Investigation
+## Deep-investigation recipes per failure mode
 
-When the health check reports issues, use these commands to investigate further.
-
-### Node Issues
+### Node Issues (checks 1, 2, 3, 17, 25)
 
 ```bash
-# Describe the problematic node (events, conditions, capacity)
-kubectl describe node <node-name>
-
-# Check resource usage across all nodes
+kubectl describe node <node>
 kubectl top nodes
-
-# Check recent events on a specific node
-kubectl get events --field-selector involvedObject.name=<node-name> --sort-by='.lastTimestamp'
-
-# SSH to the node for direct inspection
-ssh root@<node-ip>
+kubectl get events --field-selector involvedObject.name=<node> --sort-by='.lastTimestamp'
+# SSH to the node
+ssh root@10.0.20.10X
 systemctl status kubelet
 journalctl -u kubelet --since "30 minutes ago" | tail -100
-df -h
-free -h
+df -h ; free -h
 ```
 
-### Pod Issues
+Node IPs: `10.0.20.100` master, `.101` node1 (GPU), `.102` node2,
+`.103` node3, `.104` node4.
+
+### Pod Issues (checks 4, 5, 11, 19)
 
 ```bash
-# Describe the pod (events, conditions, container statuses)
-kubectl describe pod -n <namespace> <pod-name>
-
-# Check current logs
-kubectl logs -n <namespace> <pod-name> --tail=100
-
-# Check logs from the previous crashed container
-kubectl logs -n <namespace> <pod-name> --previous --tail=100
-
-# Check events in the namespace
-kubectl get events -n <namespace> --sort-by='.lastTimestamp' | tail -20
-
-# Check all pods in a namespace
-kubectl get pods -n <namespace> -o wide
+kubectl describe pod -n <ns> <pod>
+kubectl logs -n <ns> <pod> --tail=200
+kubectl logs -n <ns> <pod> --previous --tail=200
+kubectl get events -n <ns> --sort-by='.lastTimestamp' | tail -20
 ```
 
-### Deployment Issues
+Common failure causes: OOMKilled (raise mem limit in Terraform), bad
+config / missing env var, DB connection failure (check `dbaas` pods),
+NFS mount failure (`showmount -e 192.168.1.127`), stale
+imagePullSecret.
+
+### Deployment / StatefulSet / DaemonSet (checks 6, 7, 16)
 
 ```bash
-# Describe the deployment (strategy, conditions, events)
-kubectl describe deployment -n <namespace> <deployment-name>
-
-# Check rollout status
-kubectl rollout status deployment -n <namespace> <deployment-name>
-
-# Check rollout history
-kubectl rollout history deployment -n <namespace> <deployment-name>
-
-# Check the replicaset
-kubectl get rs -n <namespace> -l app=<app-label>
+kubectl describe deployment -n <ns> <name>
+kubectl rollout status deployment -n <ns> <name>
+kubectl rollout history deployment -n <ns> <name>
+kubectl get rs -n <ns> -l app=<app>
 ```
 
-### PVC Issues
+### PVC (check 8)
 
 ```bash
-# Describe the PVC (events, status, storage class)
-kubectl describe pvc -n <namespace> <pvc-name>
-
-# Check PVs
-kubectl get pv
-
-# Check events related to PVCs
-kubectl get events -n <namespace> --field-selector reason=FailedMount --sort-by='.lastTimestamp'
-
-# Verify NFS export exists
-showmount -e 10.0.10.15 | grep <service-name>
+kubectl describe pvc -n <ns> <pvc>
+kubectl get events -n <ns> --field-selector reason=FailedMount --sort-by='.lastTimestamp'
+kubectl get pv | grep <pvc>
+showmount -e 192.168.1.127
 ```
 
-### Resource Pressure
+### cert-manager (checks 31, 32, 33)
 
 ```bash
-# Top nodes (CPU and memory usage)
-kubectl top nodes
-
-# Top pods sorted by memory (cluster-wide)
-kubectl top pods -A --sort-by=memory | head -20
-
-# Top pods sorted by CPU (cluster-wide)
-kubectl top pods -A --sort-by=cpu | head -20
-
-# Check resource requests/limits in a namespace
-kubectl describe resourcequota -n <namespace>
-kubectl describe limitrange -n <namespace>
+kubectl get certificate -A
+kubectl describe certificate -n <ns> <name>
+kubectl get certificaterequest -A
+kubectl describe certificaterequest -n <ns> <name>
+kubectl logs -n cert-manager deploy/cert-manager | tail -50
 ```
 
-## Common Remediation
+Common causes: ACME HTTP-01 challenge blocked, ClusterIssuer missing
+DNS provider secret, rate-limit from Let's Encrypt.
 
-### Persistent CrashLoopBackOff
+### Backups (checks 34, 35, 36)
 
-A pod keeps crashing even after the auto-fix deletes it.
+```bash
+# Per-DB dumps (inside the DB pod)
+kubectl exec -n dbaas mysql-standalone-0 -- ls -lah /backup/per-db/
+kubectl exec -n dbaas pg-cluster-0 -- ls -lah /backup/per-db/
 
-1. **Check logs from the crashed container**:
-   ```bash
-   kubectl logs -n <namespace> <pod-name> --previous --tail=200
-   ```
+# Pushgateway metrics
+kubectl exec -n monitoring deploy/prometheus-server -- \
+    wget -qO- http://prometheus-prometheus-pushgateway:9091/metrics | \
+    grep backup_last_success_timestamp
 
-2. **Check the pod description for clues**:
-   ```bash
-   kubectl describe pod -n <namespace> <pod-name>
-   ```
-   Look for:
-   - `OOMKilled` in Last State — the container ran out of memory
-   - `Error` with exit code 1 — application error (bad config, missing env var, DB connection failure)
-   - `Error` with exit code 137 — killed by OOM killer or liveness probe
-   - `Error` with exit code 143 — SIGTERM (graceful shutdown failure)
+# LVM snapshots on PVE host
+ssh -o BatchMode=yes root@192.168.1.127 \
+    'lvs -o lv_name,lv_time,lv_size --noheadings | grep snap'
+```
 
-3. **Common causes**:
-   - **OOMKilled**: Increase memory limits in Terraform (see below)
-   - **Bad config**: Check environment variables, secrets, config maps
-   - **DB connection failure**: Verify the database pod is running (`kubectl get pods -n dbaas`)
-   - **NFS mount failure**: Verify NFS export exists (`showmount -e 10.0.10.15`)
-   - **Missing secret**: Check if TLS secret or other secrets exist in the namespace
+If offsite sync is stale, the common cause is the
+`offsite-sync-backup.service` systemd unit on the PVE host failing.
+`ssh root@192.168.1.127 'systemctl status offsite-sync-backup'`.
 
-### OOMKilled
+### Monitoring stack (checks 37, 38, 39)
 
-The container was killed because it exceeded its memory limit.
+```bash
+# Prometheus
+kubectl exec -n monitoring deploy/prometheus-server -- wget -qO- http://localhost:9090/-/ready
+kubectl logs -n monitoring deploy/prometheus-server --tail=100
 
-1. **Check current limits**:
-   ```bash
-   kubectl describe pod -n <namespace> <pod-name> | grep -A 5 "Limits"
-   ```
+# Alertmanager
+kubectl get pods -n monitoring | grep alertmanager
+kubectl logs -n monitoring -l app=prometheus-alertmanager --tail=100
 
-2. **Fix in Terraform** — Edit `modules/kubernetes/<service>/main.tf` and increase the memory limit:
-   ```hcl
-   resources {
-     limits = {
-       memory = "2Gi"  # Increase from current value
-     }
-   }
-   ```
+# Vault
+kubectl exec -n vault vault-0 -- sh -c 'VAULT_ADDR=http://127.0.0.1:8200 vault status'
+# If sealed: unseal first (`vault operator unseal`), then verify raft peers with `vault operator raft list-peers` (it only works on an unsealed Vault).
 
-3. **Apply the change**:
-   ```bash
-   cd /workspace/infra
-   terraform apply -target=module.kubernetes_cluster.module.<service> -auto-approve
-   ```
+# ClusterSecretStore
+kubectl get clustersecretstore
+kubectl describe clustersecretstore vault-kv vault-database
+kubectl logs -n external-secrets deploy/external-secrets --tail=100
+```
 
-### ImagePullBackOff
+### External reachability (checks 40, 41, 42)
 
-The container image cannot be pulled.
+```bash
+# Cloudflared
+kubectl get pods -n cloudflared
+kubectl logs -n cloudflared -l app=cloudflared --tail=100
 
-1. **Check the exact error**:
-   ```bash
-   kubectl describe pod -n <namespace> <pod-name> | grep -A 5 "Events"
-   ```
+# Authentik
+kubectl get pods -n authentik -l app=authentik-server
+kubectl logs -n authentik -l app=authentik-server --tail=100
 
-2. **Common causes**:
-   - **Wrong image tag**: Verify the tag exists on the registry (Docker Hub, ghcr.io, etc.)
-   - **Private registry without credentials**: Check if imagePullSecrets are configured
-   - **Pull-through cache issue**: The registry cache at `10.0.20.10` may have a stale entry
-     ```bash
-     # Check pull-through cache ports:
-     # 5000 = docker.io, 5010 = ghcr.io, 5020 = quay.io, 5030 = registry.k8s.io
-     curl -s http://10.0.20.10:5000/v2/_catalog | python3 -m json.tool
-     ```
-   - **Registry rate limit**: Docker Hub free tier has pull limits; pull-through cache helps avoid this
+# ExternalAccessDivergence alert
+kubectl exec -n monitoring deploy/prometheus-server -- \
+    wget -qO- 'http://localhost:9090/api/v1/alerts' | \
+    python3 -m json.tool | grep -A 5 ExternalAccessDivergence
 
-3. **Fix**: Update the image tag in the service's Terraform module and re-apply.
+# Traefik 5xx — find the hot service
+kubectl exec -n monitoring deploy/prometheus-server -- \
+    wget -qO- 'http://localhost:9090/api/v1/query?query=topk(10,rate(traefik_service_requests_total{code=~%225..%22}%5B15m%5D))' \
+    | python3 -m json.tool
+```
 
-### Node NotReady
+### OOMKilled remediation
 
-A node has gone NotReady.
+1. `kubectl describe pod -n <ns> <pod> | grep -A 5 Limits`
+2. Edit `infra/modules/kubernetes/<service>/main.tf` and raise
+   `resources.limits.memory`.
+3. `cd /home/wizard/code/infra && scripts/tg apply` (Tier 1) or
+   `terraform apply -target=module.<service>` as appropriate.
 
-1. **Check node conditions**:
-   ```bash
-   kubectl describe node <node-name> | grep -A 20 "Conditions"
-   ```
+### ImagePullBackOff remediation
 
-2. **SSH to the node and check kubelet**:
-   ```bash
-   ssh root@<node-ip>
-   systemctl status kubelet
-   journalctl -u kubelet --since "10 minutes ago" | tail -50
-   ```
+1. `kubectl describe pod -n <ns> <pod> | grep -A 5 Events`
+2. Verify tag exists on the source registry.
+3. Check pull-through cache at `10.0.20.10:{5000,5010,5020,5030}`.
+4. Update the image tag in Terraform + re-apply.
 
-3. **Check resources**:
-   ```bash
-   # On the node
-   df -h          # Disk space
-   free -h        # Memory
-   top -bn1       # CPU/processes
-   ```
+### Persistent CrashLoopBackOff after auto-fix
 
-4. **Node IPs** (for SSH):
-   - `10.0.20.100` — k8s-master
-   - `10.0.20.101` — k8s-node1 (GPU)
-   - `10.0.20.102` — k8s-node2
-   - `10.0.20.103` — k8s-node3
-   - `10.0.20.104` — k8s-node4
+1. `kubectl logs -n <ns> <pod> --previous --tail=200`
+2. `kubectl describe pod -n <ns> <pod>` and check Last State:
+   - `OOMKilled` → raise memory limit
+   - Exit code 137 → OOM or probe killed
+   - Exit code 143 → SIGTERM / graceful shutdown failed
+3. Cross-check dbaas + NFS + secrets are healthy.
 
-## Slack Webhook
+## Notes on the canonical / hardlink setup
 
-The script posts results to the Slack incoming webhook URL in `$SLACK_WEBHOOK_URL`. The message format uses Slack mrkdwn:
-- All clear: green checkmark with node/pod count
-- Warnings only: warning icon with details
-- Issues found: red alert icon with auto-fixes applied and remaining issues
+The authoritative copy of this SKILL.md lives at
+`/home/wizard/code/.claude/skills/cluster-health/SKILL.md`. A hardlink
+at `/home/wizard/code/infra/.claude/skills/cluster-health/SKILL.md`
+points to the same inode so infra-rooted sessions also discover the
+skill.
 
-The webhook URL is passed as an environment variable from `openclaw_skill_secrets` in `terraform.tfvars`.
+To verify the hardlink is intact:
 
-## Infrastructure
+```bash
+stat -c '%i %n' \
+    /home/wizard/code/.claude/skills/cluster-health/SKILL.md \
+    /home/wizard/code/infra/.claude/skills/cluster-health/SKILL.md
+```
 
-| Component | Path / Location |
-|-----------|----------------|
-| Health check script | `/workspace/infra/.claude/cluster-health.sh` (in-pod) or `.claude/cluster-health.sh` (repo) |
-| Terraform module | `modules/kubernetes/openclaw/main.tf` |
-| CronJob definition | Defined in the OpenClaw Terraform module |
-| Existing full healthcheck | `scripts/cluster_healthcheck.sh` (local-only, 24 checks with color output) |
-| Infra repo (in pod) | `/workspace/infra` |
-| kubectl (in pod) | `/tools/kubectl` |
-| terraform (in pod) | `/tools/terraform` |
+Both should print the same inode number. If they diverge (e.g. `git
+checkout` replaced the file rather than updating it), re-link:
 
-## Auto-File Incidents for SEV1/SEV2
-
-After running health checks, if **SEV1 or SEV2 issues** are found (node down, multiple services affected, core service outage, or single important service down), auto-file a GitHub Issue:
-
-### Severity Classification
-- **SEV1**: Node NotReady, multiple services down, data at risk, core service outage (DNS, auth, ingress, databases)
-- **SEV2**: Single non-core service down, degraded performance, persistent CrashLoopBackOff
-- **SEV3**: Warnings only, resource pressure <90%, cosmetic — do NOT auto-file
-
-### Workflow
-1. **Dedup check**: Before filing, query open incidents:
-   ```bash
-   GITHUB_TOKEN=$(vault kv get -field=github_pat secret/viktor)
-   curl -s -H "Authorization: token $GITHUB_TOKEN" \
-     "https://api.github.com/repos/ViktorBarzin/infra/issues?labels=incident&state=open&per_page=50"
-   ```
-   If an open issue already covers the same service/namespace, **skip filing**.
-
-2. **File the issue** with labels `incident`, `sev1` or `sev2`, `postmortem-required`:
-   - Title: `[AUTO] <Service/Namespace> — <brief symptom>`
-   - Body: full diagnostic dump (pod status, events, alerts, node state)
-   - The issue-automation GHA workflow will trigger the post-mortem pipeline automatically
-
-3. **Auto-close recovered services**: If a service that previously had an auto-filed incident is now healthy:
-   ```bash
-   # Comment and close
-   curl -s -X POST -H "Authorization: token $GITHUB_TOKEN" \
-     "https://api.github.com/repos/ViktorBarzin/infra/issues/<N>/comments" \
-     -d '{"body": "**Resolved** — Service recovered. Auto-closed by cluster health check."}'
-   curl -s -X PATCH -H "Authorization: token $GITHUB_TOKEN" \
-     "https://api.github.com/repos/ViktorBarzin/infra/issues/<N>" \
-     -d '{"state": "closed"}'
-   ```
-
-## Post-Mortem Auto-Suggest
-
-After running a healthcheck, if the cluster has **recovered from an unhealthy state** (previous run showed FAIL items that are now resolved), suggest writing a post-mortem:
-
-> The cluster has recovered from the previous unhealthy state. Would you like me to write a post-mortem? Run `/post-mortem` to generate one.
-
-This ensures incidents are documented while context is fresh.
-
-## Notes
-
-1. This script is designed to run inside the OpenClaw pod where kubectl is pre-configured via the ServiceAccount
-2. The full `scripts/cluster_healthcheck.sh` script runs 24 checks and is meant for local interactive use; this skill's script runs 8 core checks optimized for automated CronJob execution
-3. When investigating issues interactively, prefer running commands directly rather than re-running the script
-4. All Terraform changes must go through the `.tf` files — never use `kubectl apply/edit/patch` for persistent changes
+```bash
+ln -f /home/wizard/code/.claude/skills/cluster-health/SKILL.md \
+      /home/wizard/code/infra/.claude/skills/cluster-health/SKILL.md
+```
diff --git a/AGENTS.md b/AGENTS.md
index 2a885021..98e0bd89 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -99,7 +99,7 @@ Terragrunt-based homelab managing a Kubernetes cluster (5 nodes, v1.34.2) on Pro
 - `config.tfvars` — non-secret configuration (plaintext)
 - `secrets.sops.json` — all secrets (SOPS-encrypted JSON)
 - `terraform.tfvars` — legacy secrets file (git-crypt, kept for reference)
-- `scripts/cluster_healthcheck.sh` — 25-check cluster health script
+- `scripts/cluster_healthcheck.sh` — 42-check cluster health script (nodes, workloads, monitoring, certs, backups, external reachability)
 
 ## Storage
 - **NFS** (`nfs-proxmox` StorageClass): For app data. Use the `nfs_volume` module, never inline `nfs {}` blocks.
diff --git a/scripts/cluster_healthcheck.sh b/scripts/cluster_healthcheck.sh
index 8a65839a..997c0b7d 100755
--- a/scripts/cluster_healthcheck.sh
+++ b/scripts/cluster_healthcheck.sh
@@ -1,7 +1,7 @@
 #!/usr/bin/env bash
 
 # Cluster health check script.
-# Runs 24 diagnostic checks against the Kubernetes cluster and prints
+# Runs 42 diagnostic checks against the Kubernetes cluster and prints
 # a colour-coded report with PASS / WARN / FAIL for each section.
 #
 # Usage: ./scripts/cluster_healthcheck.sh [--fix] [--quiet|-q] [--json] [--kubeconfig <path>]
@@ -26,7 +26,7 @@ JSON=false
 KUBECONFIG_PATH="$(pwd)/config"
 KUBECTL=""
 JSON_RESULTS=()
-TOTAL_CHECKS=30
+TOTAL_CHECKS=42
 
 # --- Helpers ---
 info()  { [[ "$JSON" == true ]] && return 0; echo -e "${BLUE}[INFO]${NC} $*"; }
@@ -71,14 +71,16 @@ parse_args() {
     while [[ $# -gt 0 ]]; do
         case "$1" in
             --fix)        FIX=true; shift ;;
+            --no-fix)     FIX=false; shift ;;
             --quiet|-q)   QUIET=true; shift ;;
             --json)       JSON=true; shift ;;
             --kubeconfig) KUBECONFIG_PATH="$2"; shift 2 ;;
             -h|--help)
-                echo "Usage: $0 [--fix] [--quiet|-q] [--json] [--kubeconfig <path>]"
+                echo "Usage: $0 [--fix|--no-fix] [--quiet|-q] [--json] [--kubeconfig <path>]"
                 echo ""
                 echo "Flags:"
                 echo "  --fix              Auto-remediate safe issues (delete evicted pods)"
+                echo "  --no-fix           Disable auto-remediation (default)"
                 echo "  --quiet, -q        Only show WARN and FAIL sections"
                 echo "  --json             Machine-readable JSON output"
                 echo "  --kubeconfig PATH  Override kubeconfig (default: \$(pwd)/config)"
@@ -1750,6 +1752,593 @@ else:
     json_add "hardware_exporters" "$status" "${detail:-All healthy}"
 }
 
+# --- 31. cert-manager: Certificate Readiness ---
+check_cert_manager_certificates() {
+    section 31 "cert-manager — Certificate Readiness"
+    local certs not_ready detail="" status="PASS"
+
+    certs=$($KUBECTL get certificates.cert-manager.io -A -o json 2>/dev/null) || {
+        warn "cert-manager CRDs not installed or inaccessible"
+        json_add "certmanager_certificates" "WARN" "CRDs unavailable"
+        return 0
+    }
+
+    not_ready=$(echo "$certs" | python3 -c '
+import json, sys
+data = json.load(sys.stdin)
+for item in data.get("items", []):
+    ns = item["metadata"]["namespace"]
+    name = item["metadata"]["name"]
+    conds = item.get("status", {}).get("conditions", [])
+    ready = next((c for c in conds if c.get("type") == "Ready"), None)
+    if not ready or ready.get("status") != "True":
+        reason = ready.get("reason", "NoCondition") if ready else "NoCondition"
+        print(f"{ns}/{name}:{reason}")
+' 2>/dev/null) || true
+
+    if [[ -z "$not_ready" ]]; then
+        pass "All Certificate CRs Ready"
+        json_add "certmanager_certificates" "PASS" "All Ready"
+    else
+        [[ "$QUIET" == true ]] && section_always 31 "cert-manager — Certificate Readiness"
+        local count
+        count=$(count_lines "$not_ready")
+        while IFS= read -r line; do
+            fail "Certificate not Ready: $line"
+            detail+="$line; "
+        done <<< "$not_ready"
+        status="FAIL"
+        json_add "certmanager_certificates" "$status" "$count not Ready: $detail"
+    fi
+}
+
+# --- 32. cert-manager: Certificate Expiry (<14d) ---
+check_cert_manager_expiry() {
+    section 32 "cert-manager — Certificate Expiry (<14d)"
+    local certs expiring detail="" status="PASS"
+
+    certs=$($KUBECTL get certificates.cert-manager.io -A -o json 2>/dev/null) || {
+        warn "cert-manager CRDs not installed or inaccessible"
+        json_add "certmanager_expiry" "WARN" "CRDs unavailable"
+        return 0
+    }
+
+    expiring=$(echo "$certs" | python3 -c '
+import json, sys
+from datetime import datetime, timezone, timedelta
+data = json.load(sys.stdin)
+cutoff = datetime.now(timezone.utc) + timedelta(days=14)
+for item in data.get("items", []):
+    ns = item["metadata"]["namespace"]
+    name = item["metadata"]["name"]
+    not_after = item.get("status", {}).get("notAfter")
+    if not not_after:
+        continue
+    try:
+        expiry = datetime.fromisoformat(not_after.replace("Z", "+00:00"))
+        if expiry < cutoff:
+            days = (expiry - datetime.now(timezone.utc)).days
+            level = "FAIL" if days <= 3 else "WARN"
+            print(f"{level}:{ns}/{name}:{days}")
+    except ValueError:
+        pass
+' 2>/dev/null) || true
+
+    if [[ -z "$expiring" ]]; then
+        pass "No Certificate CRs expiring within 14 days"
+        json_add "certmanager_expiry" "PASS" "None expiring <14d"
+    else
+        [[ "$QUIET" == true ]] && section_always 32 "cert-manager — Certificate Expiry (<14d)"
+        while IFS= read -r line; do
+            local level cert_name days
+            level=$(echo "$line" | cut -d: -f1)
+            cert_name=$(echo "$line" | cut -d: -f2)
+            days=$(echo "$line" | cut -d: -f3)
+            if [[ "$level" == "FAIL" ]]; then
+                fail "Certificate $cert_name expires in ${days}d"
+                status="FAIL"
+            else
+                warn "Certificate $cert_name expires in ${days}d"
+                [[ "$status" != "FAIL" ]] && status="WARN"
+            fi
+            detail+="$cert_name=${days}d; "
+        done <<< "$expiring"
+        json_add "certmanager_expiry" "$status" "$detail"
+    fi
+}
+
+# --- 33. cert-manager: Failed CertificateRequests ---
+check_cert_manager_requests() {
+    section 33 "cert-manager — Failed CertificateRequests"
+    local requests failed detail="" status="PASS"
+    # Emits one FAIL per CertificateRequest whose Ready condition is False with reason "Failed".
+    requests=$($KUBECTL get certificaterequests.cert-manager.io -A -o json 2>/dev/null) || {
+        warn "cert-manager CRDs not installed or inaccessible"
+        json_add "certmanager_requests" "WARN" "CRDs unavailable"
+        return 0
+    }
+    # The Python below lives in a bash single-quoted string: no backslash-escaped quotes, no single quotes.
+    failed=$(echo "$requests" | python3 -c '
+import json, sys
+data = json.load(sys.stdin)
+for item in data.get("items", []):
+    ns = item["metadata"]["namespace"]
+    name = item["metadata"]["name"]
+    conds = item.get("status", {}).get("conditions", [])
+    for c in conds:
+        if c.get("type") == "Ready" and c.get("status") == "False" and c.get("reason") == "Failed":
+            print(ns + "/" + name + ":" + " ".join(str(c.get("message") or "").split())[:80])
+            break
+' 2>/dev/null) || true
+
+    if [[ -z "$failed" ]]; then
+        pass "No failed CertificateRequests"
+        json_add "certmanager_requests" "PASS" "None failed"
+    else
+        [[ "$QUIET" == true ]] && section_always 33 "cert-manager — Failed CertificateRequests"
+        local count
+        count=$(count_lines "$failed")
+        while IFS= read -r line; do
+            fail "CertificateRequest failed: $line"
+            detail+="$line; "
+        done <<< "$failed"
+        status="FAIL"
+        json_add "certmanager_requests" "$status" "$count failed: $detail"
+    fi
+}
+
+# --- 34. Backup Freshness: Per-DB Dumps ---
+check_backup_per_db() {
+    section 34 "Backup Freshness — Per-DB Dumps"
+    local detail="" had_issue=false status="PASS"
+
+    # Freshness threshold: 25 hours
+    local now_epoch max_age_sec
+    now_epoch=$(date -u +%s)
+    max_age_sec=$((25 * 3600))
+
+    _check_cronjob_fresh() {
+        local ns="$1" cj="$2" label="$3"
+        local ts age_sec
+        ts=$($KUBECTL get cronjob -n "$ns" "$cj" -o jsonpath='{.status.lastSuccessfulTime}' 2>/dev/null || true)
+        if [[ -z "$ts" ]]; then
+            [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 34 "Backup Freshness — Per-DB Dumps"
+            fail "$label: CronJob $ns/$cj has no lastSuccessfulTime"
+            detail+="${label}=no-success; "
+            had_issue=true
+            status="FAIL"
+            return 0
+        fi
+        local ts_epoch
+        ts_epoch=$(date -u -d "$ts" +%s 2>/dev/null || echo 0)
+        age_sec=$((now_epoch - ts_epoch))
+        if [[ "$age_sec" -gt "$max_age_sec" ]]; then
+            [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 34 "Backup Freshness — Per-DB Dumps"
+            local age_h=$((age_sec / 3600))
+            fail "$label: last success ${age_h}h ago (>25h)"
+            detail+="${label}=${age_h}h; "
+            had_issue=true
+            status="FAIL"
+        else
+            local age_h=$((age_sec / 3600))
+            detail+="${label}=${age_h}h; "
+        fi
+    }
+
+    _check_cronjob_fresh dbaas mysql-backup-per-db mysql
+    _check_cronjob_fresh dbaas postgresql-backup-per-db pg
+
+    [[ "$had_issue" == false ]] && pass "Per-DB dumps fresh — $detail"
+    json_add "backup_per_db" "$status" "$detail"
+}
+
+# --- 35. Backup Freshness: Offsite Sync ---
+check_backup_offsite_sync() {
+    section 35 "Backup Freshness — Offsite Sync"
+    local metrics detail="" status="PASS"
+
+    metrics=$($KUBECTL exec -n monitoring deploy/prometheus-server -- \
+        wget -qO- "http://prometheus-prometheus-pushgateway:9091/metrics" 2>/dev/null || true)
+
+    if [[ -z "$metrics" ]]; then
+        [[ "$QUIET" == true ]] && section_always 35 "Backup Freshness — Offsite Sync"
+        warn "Cannot query Pushgateway"
+        json_add "backup_offsite_sync" "WARN" "Pushgateway unreachable"
+        return 0
+    fi
+
+    local age_hours
+    age_hours=$(echo "$metrics" | python3 -c '
+import sys, re, time
+ts = None
+for line in sys.stdin:
+    if line.startswith("#"):
+        continue
+    if "backup_last_success_timestamp" in line and "offsite-backup-sync" in line:
+        m = re.search(r"\s([0-9.eE+]+)\s*$", line.strip())
+        if m:
+            try:
+                ts = float(m.group(1))
+                break
+            except ValueError:
+                pass
+if ts is None:
+    print("missing")
+else:
+    age = (time.time() - ts) / 3600
+    print(f"{age:.1f}")
+' 2>/dev/null) || age_hours="error"
+
+    if [[ "$age_hours" == "missing" ]]; then
+        [[ "$QUIET" == true ]] && section_always 35 "Backup Freshness — Offsite Sync"
+        fail "backup_last_success_timestamp metric missing for offsite-backup-sync"
+        json_add "backup_offsite_sync" "FAIL" "Metric missing"
+    elif [[ "$age_hours" == "error" ]]; then
+        [[ "$QUIET" == true ]] && section_always 35 "Backup Freshness — Offsite Sync"
+        warn "Failed to parse Pushgateway metric"
+        json_add "backup_offsite_sync" "WARN" "Parse error"
+    else
+        local age_int
+        age_int=$(printf '%.0f' "$age_hours")
+        if [[ "$age_int" -gt 27 ]]; then
+            [[ "$QUIET" == true ]] && section_always 35 "Backup Freshness — Offsite Sync"
+            fail "Offsite sync last success ${age_hours}h ago (>27h)"
+            status="FAIL"
+        else
+            pass "Offsite sync last success ${age_hours}h ago"
+        fi
+        detail="age=${age_hours}h"
+        json_add "backup_offsite_sync" "$status" "$detail"
+    fi
+}
+
+# --- 36. Backup Freshness: LVM PVC Snapshots ---
+check_backup_lvm_snapshots() {
+    section 36 "Backup Freshness — LVM PVC Snapshots"
+    local snap_output detail="" status="PASS"
+
+    snap_output=$(ssh -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no \
+        root@192.168.1.127 "lvs -o lv_name,lv_time --noheadings 2>/dev/null | grep -- -snap" 2>/dev/null || true)
+
+    if [[ -z "$snap_output" ]]; then
+        [[ "$QUIET" == true ]] && section_always 36 "Backup Freshness — LVM PVC Snapshots"
+        warn "No LVM PVC snapshots found or SSH to 192.168.1.127 failed (BatchMode)"
+        json_add "backup_lvm_snapshots" "WARN" "SSH failed or no snapshots"
+        return 0
+    fi
+
+    local newest_age_hours
+    newest_age_hours=$(echo "$snap_output" | python3 -c '
+import sys, re, time
+from datetime import datetime
+newest = None
+for line in sys.stdin:
+    line = line.strip()
+    if not line:
+        continue
+    parts = line.split(None, 1)
+    if len(parts) < 2:
+        continue
+    date_str = parts[1].strip()
+    # lv_time format: "2026-04-19 03:00:01 +0000" or similar
+    for fmt in ("%Y-%m-%d %H:%M:%S %z", "%Y-%m-%d %H:%M:%S"):
+        try:
+            dt = datetime.strptime(date_str, fmt)
+            ts = dt.timestamp()
+            if newest is None or ts > newest:
+                newest = ts
+            break
+        except ValueError:
+            continue
+if newest is None:
+    print("parse_error")
+else:
+    age = (time.time() - newest) / 3600
+    print(f"{age:.1f}")
+' 2>/dev/null) || newest_age_hours="error"
+
+    if [[ "$newest_age_hours" == "parse_error" || "$newest_age_hours" == "error" ]]; then
+        [[ "$QUIET" == true ]] && section_always 36 "Backup Freshness — LVM PVC Snapshots"
+        warn "Could not parse LVM snapshot timestamps"
+        json_add "backup_lvm_snapshots" "WARN" "Parse error"
+    else
+        local count age_int
+        count=$(count_lines "$snap_output")
+        age_int=$(printf '%.0f' "$newest_age_hours")
+        if [[ "$age_int" -gt 25 ]]; then
+            [[ "$QUIET" == true ]] && section_always 36 "Backup Freshness — LVM PVC Snapshots"
+            fail "Newest LVM snapshot ${newest_age_hours}h old (>25h); $count total"
+            status="FAIL"
+        else
+            pass "LVM snapshots fresh — $count total, newest ${newest_age_hours}h old"
+        fi
+        detail="count=$count newest=${newest_age_hours}h"
+        json_add "backup_lvm_snapshots" "$status" "$detail"
+    fi
+}
+
+# --- 37. Monitoring: Prometheus + Alertmanager ---
+check_monitoring_prom_am() {
+    section 37 "Monitoring — Prometheus + Alertmanager"
+    local detail="" had_issue=false status="PASS"
+
+    # Prometheus /-/ready
+    local prom_ready
+    prom_ready=$($KUBECTL exec -n monitoring deploy/prometheus-server -- \
+        wget -qO- "http://localhost:9090/-/ready" 2>/dev/null || true)
+    if echo "$prom_ready" | grep -qi "ready"; then
+        detail+="prometheus=ready; "
+    else
+        [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 37 "Monitoring — Prometheus + Alertmanager"
+        fail "Prometheus /-/ready returned no Ready response"
+        detail+="prometheus=not-ready; "
+        had_issue=true
+        status="FAIL"
+    fi
+
+    # Alertmanager running pod count
+    local am_running
+    am_running=$($KUBECTL get pods -n monitoring --no-headers 2>/dev/null | \
+        grep alertmanager | awk '$3 == "Running"' | wc -l | tr -d ' ')
+    if [[ "$am_running" -gt 0 ]]; then
+        detail+="alertmanager=${am_running} running; "
+    else
+        [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 37 "Monitoring — Prometheus + Alertmanager"
+        fail "Alertmanager: 0 Running pods"
+        detail+="alertmanager=none-running; "
+        had_issue=true
+        status="FAIL"
+    fi
+
+    [[ "$had_issue" == false ]] && pass "Prometheus Ready, $am_running Alertmanager pod(s) Running"
+    json_add "monitoring_prom_am" "$status" "$detail"
+}
+
+# --- 38. Monitoring: Vault Sealed Status ---
+check_monitoring_vault() {
+    section 38 "Monitoring — Vault Sealed Status"
+    local output detail="" status="PASS"
+
+    output=$($KUBECTL exec -n vault vault-0 -- \
+        sh -c 'VAULT_ADDR=http://127.0.0.1:8200 vault status' 2>&1 || true)
+
+    if [[ -z "$output" ]]; then
+        [[ "$QUIET" == true ]] && section_always 38 "Monitoring — Vault Sealed Status"
+        fail "Cannot exec vault status on vault-0"
+        json_add "monitoring_vault" "FAIL" "Exec failed"
+        return 0
+    fi
+
+    if echo "$output" | grep -qi "^Sealed[[:space:]]*false"; then
+        pass "Vault unsealed"
+        detail="sealed=false"
+        json_add "monitoring_vault" "PASS" "$detail"
+    elif echo "$output" | grep -qi "^Sealed[[:space:]]*true"; then
+        [[ "$QUIET" == true ]] && section_always 38 "Monitoring — Vault Sealed Status"
+        fail "Vault is SEALED — secrets unavailable"
+        detail="sealed=true"
+        status="FAIL"
+        json_add "monitoring_vault" "$status" "$detail"
+    else
+        [[ "$QUIET" == true ]] && section_always 38 "Monitoring — Vault Sealed Status"
+        warn "Cannot parse vault status output"
+        json_add "monitoring_vault" "WARN" "Parse error"
+    fi
+}
+
+# --- 39. Monitoring: ClusterSecretStore Ready ---
+check_monitoring_css() {
+    section 39 "Monitoring — ClusterSecretStore Ready"
+    local css not_ready detail="" status="PASS"
+    # Emits one FAIL per ClusterSecretStore whose Ready condition is absent or not "True".
+    css=$($KUBECTL get clustersecretstore -o json 2>/dev/null) || {
+        [[ "$QUIET" == true ]] && section_always 39 "Monitoring — ClusterSecretStore Ready"
+        warn "ClusterSecretStore CRD not installed"
+        json_add "monitoring_css" "WARN" "CRD missing"
+        return 0
+    }
+    # The Python below lives in a bash single-quoted string: no backslash-escaped quotes, no single quotes.
+    not_ready=$(echo "$css" | python3 -c '
+import json, sys
+data = json.load(sys.stdin)
+for item in data.get("items", []):
+    name = item["metadata"]["name"]
+    conds = item.get("status", {}).get("conditions", [])
+    ready = next((c for c in conds if c.get("type") == "Ready"), None)
+    if not ready or ready.get("status") != "True":
+        print(name + ":" + (ready.get("reason", "NoCondition") if ready else "NoCondition"))
+' 2>/dev/null) || true
+
+    if [[ -z "$not_ready" ]]; then
+        local total
+        total=$(echo "$css" | python3 -c 'import json,sys; print(len(json.load(sys.stdin).get("items",[])))' 2>/dev/null || echo "?")
+        pass "All $total ClusterSecretStores Ready"
+        json_add "monitoring_css" "PASS" "$total Ready"
+    else
+        [[ "$QUIET" == true ]] && section_always 39 "Monitoring — ClusterSecretStore Ready"
+        while IFS= read -r line; do
+            fail "ClusterSecretStore not Ready: $line"
+            detail+="$line; "
+        done <<< "$not_ready"
+        status="FAIL"
+        json_add "monitoring_css" "$status" "$detail"
+    fi
+}
+
+# --- 40. External Reachability: Cloudflared + Authentik Replicas ---
+check_external_replicas() {
+    section 40 "External — Cloudflared + Authentik Replicas"
+    local detail="" had_issue=false status="PASS"
+
+    # Cloudflared
+    local cf_json cf_ready cf_desired
+    cf_json=$($KUBECTL get deployment cloudflared -n cloudflared -o json 2>/dev/null || true)
+    if [[ -z "$cf_json" ]]; then
+        [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 40 "External — Cloudflared + Authentik Replicas"
+        fail "Cloudflared deployment not found"
+        detail+="cloudflared=missing; "
+        had_issue=true
+        status="FAIL"
+    else
+        cf_ready=$(echo "$cf_json" | python3 -c 'import json,sys; print(json.load(sys.stdin).get("status",{}).get("readyReplicas",0) or 0)' 2>/dev/null || echo "0")
+        cf_desired=$(echo "$cf_json" | python3 -c 'import json,sys; print(json.load(sys.stdin).get("spec",{}).get("replicas",0) or 0)' 2>/dev/null || echo "0")
+        if [[ "$cf_ready" != "$cf_desired" ]]; then
+            [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 40 "External — Cloudflared + Authentik Replicas"
+            fail "Cloudflared: $cf_ready/$cf_desired ready (external access degraded)"
+            detail+="cloudflared=${cf_ready}/${cf_desired}; "
+            had_issue=true
+            status="FAIL"
+        else
+            detail+="cloudflared=${cf_ready}/${cf_desired}; "
+        fi
+    fi
+
+    # Authentik server (Helm chart names the deployment goauthentik-server)
+    local auth_json auth_ready auth_desired
+    auth_json=$($KUBECTL get deployment goauthentik-server -n authentik -o json 2>/dev/null || true)
+    if [[ -z "$auth_json" ]]; then
+        [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 40 "External — Cloudflared + Authentik Replicas"
+        warn "goauthentik-server deployment not found in authentik namespace"
+        detail+="authentik=missing; "
+        had_issue=true
+        [[ "$status" != "FAIL" ]] && status="WARN"
+    else
+        auth_ready=$(echo "$auth_json" | python3 -c 'import json,sys; print(json.load(sys.stdin).get("status",{}).get("readyReplicas",0) or 0)' 2>/dev/null || echo "0")
+        auth_desired=$(echo "$auth_json" | python3 -c 'import json,sys; print(json.load(sys.stdin).get("spec",{}).get("replicas",0) or 0)' 2>/dev/null || echo "0")
+        if [[ "$auth_ready" != "$auth_desired" ]]; then
+            [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 40 "External — Cloudflared + Authentik Replicas"
+            fail "goauthentik-server: $auth_ready/$auth_desired ready (auth degraded)"
+            detail+="authentik=${auth_ready}/${auth_desired}; "
+            had_issue=true
+            status="FAIL"
+        else
+            detail+="authentik=${auth_ready}/${auth_desired}; "
+        fi
+    fi
+
+    [[ "$had_issue" == false ]] && pass "Cloudflared + authentik-server at full replicas ($detail)"
+    json_add "external_replicas" "$status" "$detail"
+}
+
+# --- 41. External Reachability: ExternalAccessDivergence Alert ---
+check_external_divergence() {
+    section 41 "External — ExternalAccessDivergence Alert"
+    local alerts result detail="" status="PASS"
+
+    alerts=$($KUBECTL exec -n monitoring deploy/prometheus-server -- \
+        wget -qO- "http://localhost:9090/api/v1/alerts" 2>/dev/null || true)
+
+    if [[ -z "$alerts" ]]; then
+        [[ "$QUIET" == true ]] && section_always 41 "External — ExternalAccessDivergence Alert"
+        warn "Cannot query Prometheus alerts"
+        json_add "external_divergence" "WARN" "Cannot query"
+        return 0
+    fi
+
+    result=$(echo "$alerts" | python3 -c '
+import json, sys
+try:
+    data = json.load(sys.stdin)
+    alerts = data.get("data", {}).get("alerts", []) if isinstance(data, dict) else data
+    firing = [a for a in alerts
+              if a.get("labels", {}).get("alertname") == "ExternalAccessDivergence"
+              and a.get("state") == "firing"]
+    if firing:
+        hosts = [a.get("labels", {}).get("host") or a.get("labels", {}).get("service") or "?" for a in firing]
+        print(f"{len(firing)}:" + ",".join(hosts))
+    else:
+        print("0:")
+except Exception as e:
+    print(f"error:{e}")
+' 2>/dev/null) || result="error:parse"
+
+    if [[ "$result" == error:* ]]; then
+        [[ "$QUIET" == true ]] && section_always 41 "External — ExternalAccessDivergence Alert"
+        warn "Failed to parse alerts JSON: ${result#error:}"
+        json_add "external_divergence" "WARN" "Parse error"
+        return 0
+    fi
+
+    local count names
+    count=$(echo "$result" | cut -d: -f1)
+    names=$(echo "$result" | cut -d: -f2-)
+
+    if [[ "$count" -eq 0 ]]; then
+        pass "ExternalAccessDivergence not firing"
+        json_add "external_divergence" "PASS" "Not firing"
+    else
+        [[ "$QUIET" == true ]] && section_always 41 "External — ExternalAccessDivergence Alert"
+        fail "ExternalAccessDivergence firing for $count target(s): $names"
+        status="FAIL"
+        detail="$count firing: $names"
+        json_add "external_divergence" "$status" "$detail"
+    fi
+}
+
+# --- 42. External Reachability: Traefik 5xx Rate ---
+check_external_traefik_5xx() {
+    section 42 "External — Traefik 5xx Rate (15m)"
+    local query_result detail="" status="PASS"
+
+    query_result=$($KUBECTL exec -n monitoring deploy/prometheus-server -- \
+        wget -qO- 'http://localhost:9090/api/v1/query?query=topk(10,rate(traefik_service_requests_total{code=~%225..%22}%5B15m%5D))' 2>/dev/null || true)
+
+    if [[ -z "$query_result" ]]; then
+        [[ "$QUIET" == true ]] && section_always 42 "External — Traefik 5xx Rate (15m)"
+        warn "Cannot query Prometheus for traefik 5xx rate"
+        json_add "external_traefik_5xx" "WARN" "Query failed"
+        return 0
+    fi
+
+    local parsed
+    parsed=$(echo "$query_result" | python3 -c '
+import json, sys
+try:
+    data = json.load(sys.stdin)
+    results = data.get("data", {}).get("result", [])
+    hot = [(r.get("metric", {}).get("service", "?"), float(r.get("value", [0, "0"])[1])) for r in results]
+    hot = [(s, v) for s, v in hot if v > 0.01]  # 1% req/s threshold
+    hot.sort(key=lambda x: -x[1])
+    if not hot:
+        print("0:")
+    else:
+        top = [f"{s}={v:.2f}/s" for s, v in hot[:5]]
+        print(f"{len(hot)}:" + "; ".join(top))
+except Exception as e:
+    print(f"error:{e}")
+' 2>/dev/null) || parsed="error:parse"
+
+    if [[ "$parsed" == error:* ]]; then
+        [[ "$QUIET" == true ]] && section_always 42 "External — Traefik 5xx Rate (15m)"
+        warn "Parse failed: ${parsed#error:}"
+        json_add "external_traefik_5xx" "WARN" "Parse error"
+        return 0
+    fi
+
+    local count top
+    count=$(echo "$parsed" | cut -d: -f1)
+    top=$(echo "$parsed" | cut -d: -f2-)
+
+    if [[ "$count" -eq 0 ]]; then
+        pass "No Traefik services with 5xx rate >0.01 req/s (last 15m)"
+        json_add "external_traefik_5xx" "PASS" "None above threshold"
+    else
+        [[ "$QUIET" == true ]] && section_always 42 "External — Traefik 5xx Rate (15m)"
+        # WARN at any 5xx; FAIL if top service >1 req/s
+        local top_rate
+        top_rate=$(echo "$top" | grep -oE '[0-9.]+/s' | head -1 | tr -d '/s')
+        if awk "BEGIN{exit !($top_rate > 1.0)}" 2>/dev/null; then
+            fail "$count Traefik service(s) with elevated 5xx: $top"
+            status="FAIL"
+        else
+            warn "$count Traefik service(s) emitting 5xx: $top"
+            status="WARN"
+        fi
+        detail="$count services: $top"
+        json_add "external_traefik_5xx" "$status" "$detail"
+    fi
+}
+
 # --- Summary ---
 print_summary() {
     if [[ "$JSON" == true ]]; then
@@ -1832,6 +2421,18 @@ main() {
     check_ha_automations
     check_ha_system
     check_hardware_exporters
+    check_cert_manager_certificates
+    check_cert_manager_expiry
+    check_cert_manager_requests
+    check_backup_per_db
+    check_backup_offsite_sync
+    check_backup_lvm_snapshots
+    check_monitoring_prom_am
+    check_monitoring_vault
+    check_monitoring_css
+    check_external_replicas
+    check_external_divergence
+    check_external_traefik_5xx
     print_summary
 
     # Exit code: 2 for failures, 1 for warnings, 0 for clean
diff --git a/setup-monitoring.sh b/setup-monitoring.sh
deleted file mode 100755
index a7e3caf7..00000000
--- a/setup-monitoring.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/bin/bash
-# Setup script for automated monitoring environment
-# Ensures health check scripts have access to kubeconfig
-
-echo "=== Setting up automated monitoring environment ==="
-
-# Copy kubeconfig to location expected by health check scripts
-if [ -f /home/node/.openclaw/kubeconfig ]; then
-    cp /home/node/.openclaw/kubeconfig /workspace/infra/config
-    echo "✅ Kubeconfig copied to /workspace/infra/config"
-else
-    echo "❌ Source kubeconfig not found at /home/node/.openclaw/kubeconfig"
-    exit 1
-fi
-
-# Test health check access
-echo ""
-echo "Testing health check script access..."
-cd /workspace/infra
-if KUBECONFIG="" timeout 30 bash .claude/cluster-health.sh --quiet > /dev/null 2>&1; then
-    echo "✅ Health check script can access cluster"
-else
-    echo "❌ Health check script cannot access cluster"
-    exit 1
-fi
-
-echo ""
-echo "✅ Automated monitoring environment setup complete"
-echo "📊 Cron health checks will now work properly"
\ No newline at end of file
diff --git a/stacks/k8s-portal/modules/k8s-portal/files/src/routes/agent/+server.ts b/stacks/k8s-portal/modules/k8s-portal/files/src/routes/agent/+server.ts
index 3d0fa891..f96f4d56 100644
--- a/stacks/k8s-portal/modules/k8s-portal/files/src/routes/agent/+server.ts
+++ b/stacks/k8s-portal/modules/k8s-portal/files/src/routes/agent/+server.ts
@@ -83,7 +83,7 @@ For secrets requiring admin access (shared infra passwords, API keys):
 | \`modules/kubernetes/nfs_volume/\` | NFS volume module (CSI-backed, soft mount) |
 | \`config.tfvars\` | Non-secret configuration (plaintext) |
 | \`secrets.sops.json\` | All secrets (SOPS-encrypted JSON) |
-| \`scripts/cluster_healthcheck.sh\` | 25-check cluster health script |
+| \`scripts/cluster_healthcheck.sh\` | 42-check cluster health script |
 | \`AGENTS.md\` | Full AI agent instructions (auto-loaded by most agents) |
 
 ### Tier System
diff --git a/stacks/openclaw/main.tf b/stacks/openclaw/main.tf
index 2a611be9..5d5f9a1d 100644
--- a/stacks/openclaw/main.tf
+++ b/stacks/openclaw/main.tf
@@ -441,11 +441,6 @@ resource "kubernetes_deployment" "openclaw" {
             name  = "UPTIME_KUMA_PASSWORD"
             value = local.skill_secrets["uptime_kuma_password"]
           }
-          # Skill secrets - Slack
-          env {
-            name  = "SLACK_WEBHOOK_URL"
-            value = local.skill_secrets["slack_webhook"]
-          }
           # Memory API
           env {
             name  = "MEMORY_API_URL"
@@ -846,7 +841,10 @@ module "task_webhook_ingress" {
   external_monitor = false
 }
 
-# --- CronJob: Scheduled cluster health check ---
+# --- Shared ServiceAccount: grants pod-exec into the openclaw pod ---
+# Used by the task_processor CronJob (below). Previously also used by the
+# cluster_healthcheck CronJob, which has been decommissioned — the local
+# `scripts/cluster_healthcheck.sh` is now the single authoritative runner.
 
 resource "kubernetes_service_account" "healthcheck" {
   metadata {
@@ -889,76 +887,6 @@ resource "kubernetes_role_binding" "healthcheck_exec" {
   }
 }
 
-resource "kubernetes_cron_job_v1" "cluster_healthcheck" {
-  metadata {
-    name      = "cluster-healthcheck"
-    namespace = kubernetes_namespace.openclaw.metadata[0].name
-    labels = {
-      app  = "cluster-healthcheck"
-      tier = local.tiers.aux
-    }
-  }
-  spec {
-    schedule                      = "0 */8 * * *"
-    concurrency_policy            = "Forbid"
-    failed_jobs_history_limit     = 3
-    successful_jobs_history_limit = 3
-
-    job_template {
-      metadata {
-        labels = {
-          app = "cluster-healthcheck"
-        }
-      }
-      spec {
-        active_deadline_seconds = 300
-        backoff_limit           = 0
-        template {
-          metadata {
-            labels = {
-              app = "cluster-healthcheck"
-            }
-          }
-          spec {
-            service_account_name = kubernetes_service_account.healthcheck.metadata[0].name
-            restart_policy       = "Never"
-
-            container {
-              name  = "healthcheck"
-              image = "bitnami/kubectl:latest"
-              command = ["bash", "-c", <<-EOF
-                # Find the openclaw pod
-                POD=$(kubectl get pods -n openclaw -l app=openclaw -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)
-                if [ -z "$POD" ]; then
-                  echo "ERROR: OpenClaw pod not found"
-                  exit 1
-                fi
-                echo "Executing health check in pod $POD..."
-                kubectl exec -n openclaw "$POD" -c openclaw -- bash /workspace/infra/.claude/cluster-health.sh
-              EOF
-              ]
-
-              resources {
-                requests = {
-                  cpu    = "50m"
-                  memory = "64Mi"
-                }
-                limits = {
-                  memory = "64Mi"
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-  lifecycle {
-    # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
-    ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config]
-  }
-}
-
 # --- CronJob: Task processor — polls Forgejo issues and triggers OpenClaw ---
 
 resource "kubernetes_cron_job_v1" "task_processor" {
@@ -983,8 +911,9 @@ resource "kubernetes_cron_job_v1" "task_processor" {
         }
       }
       spec {
-        active_deadline_seconds = 600
-        backoff_limit           = 0
+        active_deadline_seconds    = 600
+        backoff_limit              = 0
+        ttl_seconds_after_finished = 86400
         template {
           metadata {
             labels = {