feat: add hardware exporter checks to cluster healthcheck (check #30)

Verifies snmp-exporter, idrac-redfish-exporter, proxmox-exporter, and
tuya-bridge pods are running, plus checks Prometheus scrape targets
(snmp-idrac, snmp-ups, redfish-idrac, proxmox-host) are UP.
This commit is contained in:
Viktor Barzin 2026-04-06 14:58:46 +03:00
parent a4c80adbce
commit 9e2ac5fbb5

View file

@ -26,7 +26,7 @@ JSON=false
KUBECONFIG_PATH="$(pwd)/config"
KUBECTL=""
JSON_RESULTS=()
TOTAL_CHECKS=29
TOTAL_CHECKS=30
# --- Helpers ---
# Print an informational message to stdout; suppressed entirely in JSON mode.
info() {
  if [[ "$JSON" == true ]]; then
    return 0
  fi
  echo -e "${BLUE}[INFO]${NC} $*"
}
@ -1626,6 +1626,100 @@ PYEOF
fi
}
# --- 30. Hardware Exporters ---
#######################################
# Check #30: hardware exporters.
# Verifies the hardware-monitoring exporter pods are Running, then queries
# Prometheus for the hardware scrape targets and confirms they are up.
# Globals:   KUBECTL, QUIET (read)
# Calls:     section, section_always, pass, warn, fail, json_add
# Outputs:   per-issue messages via logging helpers; one json_add entry
#######################################
check_hardware_exporters() {
    section 30 "Hardware Exporters"
    local detail="" had_issue=false status="PASS"

    # Exporter pods to verify, as "namespace:app-label" pairs.
    local exporters=(
        "monitoring:snmp-exporter"
        "monitoring:idrac-redfish-exporter"
        "monitoring:proxmox-exporter"
        "tuya-bridge:tuya-bridge"
    )
    for entry in "${exporters[@]}"; do
        local ns="${entry%%:*}"
        local name="${entry##*:}"
        local pods
        pods=$($KUBECTL get pods -n "$ns" -l "app=$name" --no-headers 2>/dev/null || true)
        # If label selector returns nothing, try matching by deployment name prefix
        if [[ -z "$pods" ]]; then
            pods=$($KUBECTL get pods -n "$ns" --no-headers 2>/dev/null | grep "^${name}-" || true)
        fi
        if [[ -z "$pods" ]]; then
            # In --quiet mode the section header is deferred until the first issue.
            [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 30 "Hardware Exporters"
            fail "$ns/$name: no pods found"
            detail+="$ns/$name=missing; "
            had_issue=true
            status="FAIL"
            continue
        fi
        # STATUS is column 3 of `kubectl get pods` output; Completed pods
        # (e.g. init/one-shot jobs) are not a failure.
        local not_running
        not_running=$(echo "$pods" | awk '$3 != "Running" && $3 != "Completed" {print $1 ": " $3}' || true)
        if [[ -n "$not_running" ]]; then
            [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 30 "Hardware Exporters"
            fail "$ns/$name pod not running: $not_running"
            detail+="$ns/$name=not-running; "
            had_issue=true
            status="FAIL"
        fi
    done

    # Check Prometheus scrape targets for hardware exporters.
    local prom_jobs=("snmp-idrac" "snmp-ups" "redfish-idrac" "proxmox-host")
    local up_result
    up_result=$($KUBECTL exec -n monitoring deploy/prometheus-server -- \
        wget -q -O- 'http://localhost:9090/api/v1/query?query=up' 2>/dev/null || true)
    if [[ -n "$up_result" ]]; then
        for job in "${prom_jobs[@]}"; do
            local job_up
            # Pass the job name via argv (not interpolated into the Python
            # source), and inspect ALL instances of the job: a job counts as
            # up only if every instance reports up=1, so one healthy instance
            # cannot mask another that is down.
            job_up=$(echo "$up_result" | python3 -c "
import json, sys
job = sys.argv[1]
data = json.load(sys.stdin)
vals = [r.get('value', [0, '0'])[1]
        for r in data.get('data', {}).get('result', [])
        if r.get('metric', {}).get('job') == job]
if not vals:
    print('missing')
elif all(v == '1' for v in vals):
    print('1')
else:
    print('0')
" "$job" 2>/dev/null) || job_up="error"
            if [[ "$job_up" == "1" ]]; then
                detail+="$job=up; "
            elif [[ "$job_up" == "missing" ]]; then
                [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 30 "Hardware Exporters"
                warn "Prometheus target '$job' not found"
                detail+="$job=missing; "
                had_issue=true
                [[ "$status" != "FAIL" ]] && status="WARN"
            else
                [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 30 "Hardware Exporters"
                fail "Prometheus target '$job' is down (up=$job_up)"
                detail+="$job=down; "
                had_issue=true
                status="FAIL"
            fi
        done
    else
        [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 30 "Hardware Exporters"
        warn "Cannot query Prometheus for exporter targets"
        detail+="prometheus-query-failed; "
        had_issue=true
        [[ "$status" != "FAIL" ]] && status="WARN"
    fi

    if [[ "$had_issue" == false ]]; then
        pass "All hardware exporters running and scraped by Prometheus"
    fi
    json_add "hardware_exporters" "$status" "${detail:-All healthy}"
}
# --- Summary ---
print_summary() {
if [[ "$JSON" == true ]]; then
@ -1707,6 +1801,7 @@ main() {
check_ha_integrations
check_ha_automations
check_ha_system
check_hardware_exporters
print_summary
# Exit code: 2 for failures, 1 for warnings, 0 for clean