cluster-health: add checks 43 + 44 (PVE host thermals + load)

Both new checks SSH read-only to the PVE host and emit PASS/WARN/FAIL
via the standard healthcheck output + JSON. They run alongside the
existing 42 checks and surface the same alerts the 2026-05-20/21
optimization session had to gather by hand.

#43 PVE Host Thermals — Xeon E5-2699v4 package + per-core temps
  Reads every /sys/class/hwmon/hwmon0/temp*_input in one SSH round-trip.
  Thresholds tuned to the live TjMax=83 / Tcrit=93:
    PASS  < 75 °C package
    WARN  75-82 °C  (approaching max, action time)
    FAIL  >= 83 °C  (at/above TjMax, throttling imminent)
  Reports hottest core label too so a single hot core doesn't hide in
  the package average.

#44 PVE Host Load — load avg vs 44-thread capacity
  Reads /proc/loadavg, compares 5-min to thread count (44):
    PASS  load_5 < 30   (< 70% threads busy)
    WARN  30-37         (oversubscribed but not saturating)
    FAIL  >= 38         (~85%+ threads busy — scheduler saturation)
  Uses 5-min so brief work spikes don't false-fail.

Both gracefully WARN-degrade if SSH BatchMode fails, matching the
existing check 36 (LVM PVC snapshots) pattern. TOTAL_CHECKS bumped
42 -> 44 and the dispatcher updated.
This commit is contained in:
Viktor Barzin 2026-05-22 09:55:11 +00:00
parent 1b21d4819e
commit 8228171104

View file

@ -27,7 +27,7 @@ KUBECONFIG_PATH="${KUBECONFIG:-${HOME}/.kube/config}"
[[ -f "$KUBECONFIG_PATH" ]] || KUBECONFIG_PATH="$(pwd)/config"
KUBECTL=""
JSON_RESULTS=()
TOTAL_CHECKS=42
TOTAL_CHECKS=44
# --- Helpers ---
info() { [[ "$JSON" == true ]] && return 0; echo -e "${BLUE}[INFO]${NC} $*"; }
@ -2465,6 +2465,105 @@ except Exception as e:
}
# --- 42. External Reachability: Traefik 5xx Rate ---
check_pve_thermals() {
section 43 "PVE Host Thermals — Xeon E5-2699v4 package + per-core temps"
local raw status="PASS"
# Read all hwmon temp inputs in one SSH round-trip. Output: one line per
# sensor, "<sensor_label> <celsius>". Falls back gracefully on missing
# labels (Xeon coretemp driver exposes both `Package id 0` and `Core N`).
raw=$(ssh -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no \
root@192.168.1.127 '
cd /sys/class/hwmon/hwmon0 2>/dev/null || exit 1
for tfile in temp*_input; do
[[ -e "$tfile" ]] || continue
base=${tfile%_input}
label=$(cat "${base}_label" 2>/dev/null || echo "$base")
val=$(cat "$tfile" 2>/dev/null)
[[ -n "$val" ]] && echo "$label $((val/1000))"
done
' 2>/dev/null || true)
if [[ -z "$raw" ]]; then
[[ "$QUIET" == true ]] && section_always 43 "PVE Host Thermals"
warn "Could not read hwmon temps from 192.168.1.127 (SSH BatchMode failed or path missing)"
json_add "pve_thermals" "WARN" "SSH failed or hwmon path missing"
return 0
fi
local pkg_temp max_core_temp max_core_label
pkg_temp=$(echo "$raw" | awk '/^Package id/{print $NF; exit}')
max_core_temp=$(echo "$raw" | awk '/^Core/{if($NF>m){m=$NF; lbl=$1" "$2}} END{print m}')
max_core_label=$(echo "$raw" | awk '/^Core/{if($NF>m){m=$NF; lbl=$1" "$2}} END{print lbl}')
# Xeon E5-2699v4: TjMax = 83°C reported max, 93°C critical (from earlier
# session: max=83, crit=93). Set WARN well below the manufacturer max so
# action can be taken before throttling onset.
# PASS < 75°C package
# WARN 75-82°C package (approaching max)
# FAIL >= 83°C package (at/above max — throttling imminent)
local detail="package=${pkg_temp}°C max_core=${max_core_temp}°C (${max_core_label})"
if [[ -z "$pkg_temp" ]]; then
[[ "$QUIET" == true ]] && section_always 43 "PVE Host Thermals"
warn "Package temp not found in hwmon output"
json_add "pve_thermals" "WARN" "$detail"
elif [[ "$pkg_temp" -ge 83 ]]; then
[[ "$QUIET" == true ]] && section_always 43 "PVE Host Thermals"
fail "PVE package temp ${pkg_temp}°C >= TjMax (83°C) — throttling imminent. $detail"
json_add "pve_thermals" "FAIL" "$detail"
status="FAIL"
elif [[ "$pkg_temp" -ge 75 ]]; then
[[ "$QUIET" == true ]] && section_always 43 "PVE Host Thermals"
warn "PVE package temp ${pkg_temp}°C in warn band (75-82°C). $detail"
json_add "pve_thermals" "WARN" "$detail"
else
pass "PVE package ${pkg_temp}°C, hottest core ${max_core_temp}°C (${max_core_label})"
json_add "pve_thermals" "PASS" "$detail"
fi
}
check_pve_load() {
section 44 "PVE Host Load — load avg vs 44-thread capacity"
local raw load_1 load_5 load_15
raw=$(ssh -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no \
root@192.168.1.127 'cat /proc/loadavg' 2>/dev/null || true)
if [[ -z "$raw" ]]; then
[[ "$QUIET" == true ]] && section_always 44 "PVE Host Load"
warn "Could not read /proc/loadavg from 192.168.1.127"
json_add "pve_load" "WARN" "SSH failed"
return 0
fi
load_1=$(echo "$raw" | awk '{print $1}')
load_5=$(echo "$raw" | awk '{print $2}')
load_15=$(echo "$raw" | awk '{print $3}')
# Round load_5 down for integer comparison (avoid bc dep)
local load_5_int
load_5_int=$(printf '%.0f' "$load_5")
# R730: 44 hw threads (22c × HT). Healthy avg ~ 15-22 (~30-50% utilisation
# of thread count). Warn when sustained 5-min above 30 (~70% threads
# busy). Fail when 5-min above 38 (~85% — close to scheduler saturation).
# PASS load_5 < 30
# WARN 30 <= load_5 < 38
# FAIL load_5 >= 38
local detail="1m=${load_1} 5m=${load_5} 15m=${load_15}"
if [[ "$load_5_int" -ge 38 ]]; then
[[ "$QUIET" == true ]] && section_always 44 "PVE Host Load"
fail "PVE 5-min load ${load_5} >= 38 of 44 threads — saturation. $detail"
json_add "pve_load" "FAIL" "$detail"
elif [[ "$load_5_int" -ge 30 ]]; then
[[ "$QUIET" == true ]] && section_always 44 "PVE Host Load"
warn "PVE 5-min load ${load_5} in warn band (30-37 of 44 threads). $detail"
json_add "pve_load" "WARN" "$detail"
else
pass "PVE load avg $detail (< 30/44 threads)"
json_add "pve_load" "PASS" "$detail"
fi
}
check_external_traefik_5xx() {
section 42 "External — Traefik 5xx Rate (15m)"
local query_result detail="" status="PASS"
@ -2621,6 +2720,8 @@ main() {
check_monitoring_css
check_external_replicas
check_external_divergence
check_pve_thermals
check_pve_load
check_external_traefik_5xx
print_summary