cluster-health #43: tighten PVE thermal threshold to 65 C
Per Viktor: the healthy baseline range is 55-65 C; anything above 65 C is a signal that a VM/workload is using too much CPU and warrants investigation. The previous thresholds were calibrated to the hardware's TjMax (75/83 C), which was too lax, since cluster-load-driven elevation appears long before throttling. The 65 C cutoff matches the live Prometheus baseline (Apr 20-May 8 2026: peak 61-69 C, avg 51-55 C) and the session-observed correlation: above 65 C means the cluster is doing sustained work that should be looked at, even if the hardware is still nowhere near its limit. Updated thresholds: PASS < 65 C (within the 55-65 C baseline); WARN 65-82 C (elevated; check top kvm processes for the culprit); FAIL >= 83 C (at/above TjMax; throttling imminent). Verified live: 67 C is now WARN (was PASS under the 75 C threshold).
This commit is contained in:
parent
282d7f6182
commit
4830230984
1 changed files with 11 additions and 9 deletions
|
|
@ -2496,12 +2496,14 @@ check_pve_thermals() {
|
||||||
max_core_temp=$(echo "$raw" | awk '/^Core/{if($NF>m){m=$NF; lbl=$1" "$2}} END{print m}')
|
max_core_temp=$(echo "$raw" | awk '/^Core/{if($NF>m){m=$NF; lbl=$1" "$2}} END{print m}')
|
||||||
max_core_label=$(echo "$raw" | awk '/^Core/{if($NF>m){m=$NF; lbl=$1" "$2}} END{print lbl}')
|
max_core_label=$(echo "$raw" | awk '/^Core/{if($NF>m){m=$NF; lbl=$1" "$2}} END{print lbl}')
|
||||||
|
|
||||||
# Xeon E5-2699v4: TjMax = 83°C reported max, 93°C critical (from earlier
|
# Healthy baseline for this R730 (verified Apr 20-May 8 2026 from
|
||||||
# session: max=83, crit=93). Set WARN well below the manufacturer max so
|
# Prometheus): peak 61-69°C, avg 51-55°C. Treat anything above 65°C
|
||||||
# action can be taken before throttling onset.
|
# as a signal that some VM/workload is using too much CPU and warrants
|
||||||
# PASS < 75°C package
|
# investigation, even though the Xeon E5-2699v4 has TjMax=83°C /
|
||||||
# WARN 75-82°C package (approaching max)
|
# Tcrit=93°C. This catches load creep early, well before throttling.
|
||||||
# FAIL >= 83°C package (at/above max — throttling imminent)
|
# PASS < 65°C package (within baseline 55-65 °C band)
|
||||||
|
# WARN 65-82°C package (elevated — investigate top CPU consumer)
|
||||||
|
# FAIL >= 83°C package (at/above TjMax — throttling imminent)
|
||||||
local detail="package=${pkg_temp}°C max_core=${max_core_temp}°C (${max_core_label})"
|
local detail="package=${pkg_temp}°C max_core=${max_core_temp}°C (${max_core_label})"
|
||||||
if [[ -z "$pkg_temp" ]]; then
|
if [[ -z "$pkg_temp" ]]; then
|
||||||
[[ "$QUIET" == true ]] && section_always 43 "PVE Host Thermals"
|
[[ "$QUIET" == true ]] && section_always 43 "PVE Host Thermals"
|
||||||
|
|
@ -2512,12 +2514,12 @@ check_pve_thermals() {
|
||||||
fail "PVE package temp ${pkg_temp}°C >= TjMax (83°C) — throttling imminent. $detail"
|
fail "PVE package temp ${pkg_temp}°C >= TjMax (83°C) — throttling imminent. $detail"
|
||||||
json_add "pve_thermals" "FAIL" "$detail"
|
json_add "pve_thermals" "FAIL" "$detail"
|
||||||
status="FAIL"
|
status="FAIL"
|
||||||
elif [[ "$pkg_temp" -ge 75 ]]; then
|
elif [[ "$pkg_temp" -ge 65 ]]; then
|
||||||
[[ "$QUIET" == true ]] && section_always 43 "PVE Host Thermals"
|
[[ "$QUIET" == true ]] && section_always 43 "PVE Host Thermals"
|
||||||
warn "PVE package temp ${pkg_temp}°C in warn band (75-82°C). $detail"
|
warn "PVE package temp ${pkg_temp}°C above baseline (>65°C) — some VM is using too much CPU; check top kvm processes. $detail"
|
||||||
json_add "pve_thermals" "WARN" "$detail"
|
json_add "pve_thermals" "WARN" "$detail"
|
||||||
else
|
else
|
||||||
pass "PVE package ${pkg_temp}°C, hottest core ${max_core_temp}°C (${max_core_label})"
|
pass "PVE package ${pkg_temp}°C, hottest core ${max_core_temp}°C (${max_core_label}) — within 55-65°C baseline"
|
||||||
json_add "pve_thermals" "PASS" "$detail"
|
json_add "pve_thermals" "PASS" "$detail"
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue