cluster-health #43: tighten PVE thermal threshold to 65 C
Per Viktor: the healthy baseline range is 55-65 C; anything above 65 C is a signal that a VM/workload is using too much CPU and warrants investigation. The previous thresholds (WARN at 75 C, FAIL at 83 C) were calibrated against the hardware's TjMax of 83 C — that was too lax, since cluster-load-driven elevation shows up long before throttling. The 65 C cutoff matches the live Prometheus baseline (Apr 20-May 8 2026: peak 61-69 C, avg 51-55 C) and the session-observed correlation: above 65 C means the cluster is doing sustained work that should be looked at, even if the hardware is still nowhere near its limit. Updated thresholds: PASS < 65 C (within the 55-65 C baseline); WARN 65-82 C (elevated; check top kvm processes for the culprit); FAIL >= 83 C (at/above TjMax — throttling imminent). Verified live: 67 C is now WARN (was PASS under the 75 C threshold).
This commit is contained in:
parent
dbb3dc04d3
commit
6950b8f197
1 changed files with 11 additions and 9 deletions
|
|
@ -2496,12 +2496,14 @@ check_pve_thermals() {
|
|||
max_core_temp=$(echo "$raw" | awk '/^Core/{if($NF>m){m=$NF; lbl=$1" "$2}} END{print m}')
|
||||
max_core_label=$(echo "$raw" | awk '/^Core/{if($NF>m){m=$NF; lbl=$1" "$2}} END{print lbl}')
|
||||
|
||||
# Xeon E5-2699v4: TjMax = 83°C reported max, 93°C critical (from earlier
|
||||
# session: max=83, crit=93). Set WARN well below the manufacturer max so
|
||||
# action can be taken before throttling onset.
|
||||
# PASS < 75°C package
|
||||
# WARN 75-82°C package (approaching max)
|
||||
# FAIL >= 83°C package (at/above max — throttling imminent)
|
||||
# Healthy baseline for this R730 (verified Apr 20-May 8 2026 from
|
||||
# Prometheus): peak 61-69°C, avg 51-55°C. Treat anything above 65°C
|
||||
# as a signal that some VM/workload is using too much CPU and warrants
|
||||
# investigation, even though the Xeon E5-2699v4 has TjMax=83°C /
|
||||
# Tcrit=93°C. This catches load creep early, well before throttling.
|
||||
# PASS < 65°C package (within the 55-65°C baseline band)
|
||||
# WARN 65-82°C package (elevated — investigate top CPU consumer)
|
||||
# FAIL >= 83°C package (at/above TjMax — throttling imminent)
|
||||
local detail="package=${pkg_temp}°C max_core=${max_core_temp}°C (${max_core_label})"
|
||||
if [[ -z "$pkg_temp" ]]; then
|
||||
[[ "$QUIET" == true ]] && section_always 43 "PVE Host Thermals"
|
||||
|
|
@ -2512,12 +2514,12 @@ check_pve_thermals() {
|
|||
fail "PVE package temp ${pkg_temp}°C >= TjMax (83°C) — throttling imminent. $detail"
|
||||
json_add "pve_thermals" "FAIL" "$detail"
|
||||
status="FAIL"
|
||||
elif [[ "$pkg_temp" -ge 75 ]]; then
|
||||
elif [[ "$pkg_temp" -ge 65 ]]; then
|
||||
[[ "$QUIET" == true ]] && section_always 43 "PVE Host Thermals"
|
||||
warn "PVE package temp ${pkg_temp}°C in warn band (75-82°C). $detail"
|
||||
warn "PVE package temp ${pkg_temp}°C above baseline (>65°C) — some VM is using too much CPU; check top kvm processes. $detail"
|
||||
json_add "pve_thermals" "WARN" "$detail"
|
||||
else
|
||||
pass "PVE package ${pkg_temp}°C, hottest core ${max_core_temp}°C (${max_core_label})"
|
||||
pass "PVE package ${pkg_temp}°C, hottest core ${max_core_temp}°C (${max_core_label}) — within 55-65°C baseline"
|
||||
json_add "pve_thermals" "PASS" "$detail"
|
||||
fi
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue