feat: add hardware exporter checks to cluster healthcheck (check #30)
Verifies that the snmp-exporter, idrac-redfish-exporter, proxmox-exporter, and tuya-bridge pods are running, and that the Prometheus scrape targets (snmp-idrac, snmp-ups, redfish-idrac, proxmox-host) are UP.
This commit is contained in:
parent
a4c80adbce
commit
9e2ac5fbb5
1 changed files with 96 additions and 1 deletions
|
|
@ -26,7 +26,7 @@ JSON=false
|
||||||
KUBECONFIG_PATH="$(pwd)/config"
|
KUBECONFIG_PATH="$(pwd)/config"
|
||||||
KUBECTL=""
|
KUBECTL=""
|
||||||
JSON_RESULTS=()
|
JSON_RESULTS=()
|
||||||
TOTAL_CHECKS=29
|
TOTAL_CHECKS=30
|
||||||
|
|
||||||
# --- Helpers ---
|
# --- Helpers ---
|
||||||
info() { [[ "$JSON" == true ]] && return 0; echo -e "${BLUE}[INFO]${NC} $*"; }
|
# Print an [INFO]-prefixed message built from all arguments.
# Emits nothing (and returns 0) when JSON output mode is active.
info() {
  if [[ "$JSON" != true ]]; then
    echo -e "${BLUE}[INFO]${NC} $*"
  fi
}
|
||||||
|
|
@ -1626,6 +1626,100 @@ PYEOF
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# --- 30. Hardware Exporters ---
|
||||||
|
#######################################
# Record one problem found by check_hardware_exporters.
# Relies on bash dynamic scoping: it reads and updates the caller's
# locals `detail`, `had_issue` and `status`, so it must only be called
# from check_hardware_exporters.
# Globals:   QUIET (read)
# Arguments:
#   $1 - severity: "fail" or "warn"
#   $2 - human-readable message handed to fail/warn
#   $3 - short token appended to the JSON detail string
# Returns:   0
#######################################
_hw_exporters_report() {
  local severity=$1 message=$2 token=$3

  # section_always is invoked only for the first issue when QUIET is true
  # (presumably so the section header still appears in quiet mode).
  if [[ "$had_issue" == false && "$QUIET" == true ]]; then
    section_always 30 "Hardware Exporters"
  fi

  if [[ "$severity" == "fail" ]]; then
    fail "$message"
    status="FAIL"
  else
    warn "$message"
    # A warning must never downgrade an earlier failure.
    if [[ "$status" != "FAIL" ]]; then
      status="WARN"
    fi
  fi

  detail+="${token}; "
  had_issue=true
  return 0
}

#######################################
# Check #30: hardware exporters.
# 1. Verifies that each exporter has at least one pod and that every pod
#    is in the Running (or Completed) state.
# 2. Queries Prometheus for the `up` metric and verifies that every
#    expected scrape job exists and that all of its targets report 1.
# Globals:   KUBECTL (read), QUIET (read)
# Arguments: none
# Outputs:   reports through section/pass/warn/fail and records the
#            overall result with json_add "hardware_exporters".
#######################################
check_hardware_exporters() {
  section 30 "Hardware Exporters"
  local detail="" had_issue=false status="PASS"
  # Loop variables are declared local so they do not leak into the script.
  local entry ns name pods not_running
  local job job_up up_result

  # Entries are "<namespace>:<app name>".
  local -a exporters=(
    "monitoring:snmp-exporter"
    "monitoring:idrac-redfish-exporter"
    "monitoring:proxmox-exporter"
    "tuya-bridge:tuya-bridge"
  )

  # --- Exporter pods must exist and be Running ---
  # $KUBECTL is left unquoted, as in the original, so a value that carries
  # extra arguments is word-split into separate words.
  for entry in "${exporters[@]}"; do
    ns="${entry%%:*}"
    name="${entry##*:}"

    # shellcheck disable=SC2086
    pods=$($KUBECTL get pods -n "$ns" -l "app=$name" --no-headers 2>/dev/null || true)

    # If the label selector returns nothing, fall back to matching pods by
    # the deployment name prefix.
    if [[ -z "$pods" ]]; then
      # shellcheck disable=SC2086
      pods=$($KUBECTL get pods -n "$ns" --no-headers 2>/dev/null \
        | grep -- "^${name}-" || true)
    fi

    if [[ -z "$pods" ]]; then
      _hw_exporters_report fail "$ns/$name: no pods found" "$ns/$name=missing"
      continue
    fi

    # Column 3 of `kubectl get pods --no-headers` is the pod STATUS.
    not_running=$(awk '$3 != "Running" && $3 != "Completed" {print $1 ": " $3}' \
      <<<"$pods" || true)
    if [[ -n "$not_running" ]]; then
      _hw_exporters_report fail \
        "$ns/$name pod not running: $not_running" "$ns/$name=not-running"
    fi
  done

  # --- Prometheus scrape targets for the hardware exporters ---
  local -a prom_jobs=("snmp-idrac" "snmp-ups" "redfish-idrac" "proxmox-host")
  # shellcheck disable=SC2086
  up_result=$($KUBECTL exec -n monitoring deploy/prometheus-server -- \
    wget -q -O- 'http://localhost:9090/api/v1/query?query=up' 2>/dev/null || true)

  if [[ -n "$up_result" ]]; then
    for job in "${prom_jobs[@]}"; do
      # The job name is passed as an argument rather than spliced into the
      # Python source. Every series of the job is inspected, so a job with
      # several targets is only "1" when all of them are up.
      job_up=$(python3 -c '
import json, sys

wanted = sys.argv[1]
data = json.load(sys.stdin)
values = [
    r.get("value", [0, "0"])[1]
    for r in data.get("data", {}).get("result", [])
    if r.get("metric", {}).get("job") == wanted
]
if not values:
    print("missing")
elif all(v == "1" for v in values):
    print("1")
else:
    print("0")
' "$job" <<<"$up_result" 2>/dev/null) || job_up="error"

      if [[ "$job_up" == "1" ]]; then
        detail+="$job=up; "
      elif [[ "$job_up" == "missing" ]]; then
        _hw_exporters_report warn "Prometheus target '$job' not found" "$job=missing"
      else
        # Covers up=0 as well as "error" (python3 missing or bad JSON).
        _hw_exporters_report fail \
          "Prometheus target '$job' is down (up=$job_up)" "$job=down"
      fi
    done
  else
    _hw_exporters_report warn \
      "Cannot query Prometheus for exporter targets" "prometheus-query-failed"
  fi

  if [[ "$had_issue" == false ]]; then
    pass "All hardware exporters running and scraped by Prometheus"
  fi
  json_add "hardware_exporters" "$status" "${detail:-All healthy}"
}
|
||||||
|
|
||||||
# --- Summary ---
|
# --- Summary ---
|
||||||
print_summary() {
|
print_summary() {
|
||||||
if [[ "$JSON" == true ]]; then
|
if [[ "$JSON" == true ]]; then
|
||||||
|
|
@ -1707,6 +1801,7 @@ main() {
|
||||||
check_ha_integrations
|
check_ha_integrations
|
||||||
check_ha_automations
|
check_ha_automations
|
||||||
check_ha_system
|
check_ha_system
|
||||||
|
check_hardware_exporters
|
||||||
print_summary
|
print_summary
|
||||||
|
|
||||||
# Exit code: 2 for failures, 1 for warnings, 0 for clean
|
# Exit code: 2 for failures, 1 for warnings, 0 for clean
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue