From 6950b8f1973f56cbdddd16a06914fcdcec9f36c3 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Fri, 22 May 2026 14:09:08 +0000 Subject: [PATCH] cluster-health #43: tighten PVE thermal threshold to 65 C MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per Viktor: healthy baseline range is 55-65 C; anything at or above 65 C is a signal a VM/workload is using too much CPU and warrants investigation. Previous thresholds were calibrated to the hardware's TjMax (75/83 C) — that was too lax, since cluster-load-driven elevation arrives a long time before throttling. The 65 C cutoff matches the live Prometheus baseline (Apr 20-May 8 2026: peak 61-69 C, avg 51-55 C) and the session-observed correlation: 65 C or higher means the cluster is doing sustained work that should be looked at, even if hardware is still nowhere near its limit. Updated: PASS < 65 C (baseline; 65 C itself already counts as WARN) WARN 65-82 C (elevated; check top kvm processes for the culprit) FAIL >= 83 C (at/above TjMax — throttling imminent) Verified live: 67 C now WARN (was PASS under the 75 C threshold). --- scripts/cluster_healthcheck.sh | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/scripts/cluster_healthcheck.sh b/scripts/cluster_healthcheck.sh index 799703cf..fa5cf5e4 100755 --- a/scripts/cluster_healthcheck.sh +++ b/scripts/cluster_healthcheck.sh @@ -2496,12 +2496,14 @@ check_pve_thermals() { max_core_temp=$(echo "$raw" | awk '/^Core/{if($NF>m){m=$NF; lbl=$1" "$2}} END{print m}') max_core_label=$(echo "$raw" | awk '/^Core/{if($NF>m){m=$NF; lbl=$1" "$2}} END{print lbl}') - # Xeon E5-2699v4: TjMax = 83°C reported max, 93°C critical (from earlier - # session: max=83, crit=93). Set WARN well below the manufacturer max so - # action can be taken before throttling onset. 
- # PASS < 75°C package - # WARN 75-82°C package (approaching max) - # FAIL >= 83°C package (at/above max — throttling imminent) + # Healthy baseline for this R730 (verified Apr 20-May 8 2026 from + # Prometheus): peak 61-69°C, avg 51-55°C. Treat anything above 65°C + # as a signal that some VM/workload is using too much CPU and warrants + # investigation, even though the Xeon E5-2699v4 has TjMax=83°C / + # Tcrit=93°C. This catches load creep early, well before throttling. + # PASS < 65°C package (within baseline 55-65 °C band) + # WARN 65-82°C package (elevated — investigate top CPU consumer) + # FAIL >= 83°C package (at/above TjMax — throttling imminent) local detail="package=${pkg_temp}°C max_core=${max_core_temp}°C (${max_core_label})" if [[ -z "$pkg_temp" ]]; then [[ "$QUIET" == true ]] && section_always 43 "PVE Host Thermals" @@ -2512,12 +2514,12 @@ check_pve_thermals() { fail "PVE package temp ${pkg_temp}°C >= TjMax (83°C) — throttling imminent. $detail" json_add "pve_thermals" "FAIL" "$detail" status="FAIL" - elif [[ "$pkg_temp" -ge 75 ]]; then + elif [[ "$pkg_temp" -ge 65 ]]; then [[ "$QUIET" == true ]] && section_always 43 "PVE Host Thermals" - warn "PVE package temp ${pkg_temp}°C in warn band (75-82°C). $detail" + warn "PVE package temp ${pkg_temp}°C above baseline (>65°C) — some VM is using too much CPU; check top kvm processes. $detail" json_add "pve_thermals" "WARN" "$detail" else - pass "PVE package ${pkg_temp}°C, hottest core ${max_core_temp}°C (${max_core_label})" + pass "PVE package ${pkg_temp}°C, hottest core ${max_core_temp}°C (${max_core_label}) — within 55-65°C baseline" json_add "pve_thermals" "PASS" "$detail" fi }