fan-control: continuous linear curve (replaces discrete step-bands)

Replace the step-band fan curve with a continuous linear ramp — the bands
flapped at edges (e.g. 45<->65%). Web-researched: linear + 2-3C hysteresis
is the homelab standard; PID is overkill for this slow thermal loop.
fan% now interpolates between env-tunable anchors:
  COOL  50C/30% -> 83C/100% (~2.1%/C; ~51% at the ~60C equilibrium)
  QUIET 68C/20% -> 83C/100% (near-silent until ~70C)
Both reach 100% at the 83C ceiling. Anti-oscillation: asymmetric
hysteresis (fc_decide) + a MIN_STEP (3%) min-change threshold.
41 bash tests green; deployed + verified live (59C -> 49%, smooth).

[ci skip]

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Viktor Barzin 2026-06-05 10:29:35 +00:00
parent 945c1936e3
commit 324f2dc3bf
4 changed files with 67 additions and 62 deletions

View file

@ -47,28 +47,31 @@ set -uo pipefail
: "${DRY_RUN:=0}" # 1 => log IPMI actions instead of executing
: "${RUN_ONCE:=0}" # 1 => one iteration then exit (testing)
# Curves as "min_temp:pct" entries, descending; first whose min_temp <= temp wins.
# COOL is power-tuned (2026-06-05 power/temp sweep): the cooling-per-watt knee is
# ~60% — beyond it airflow buys almost nothing (60->70% = +21W/-2°C, 70->100% =
# +54W/0°C; the CPU floors ~59°C at cluster load). So the normal band caps at 60%
# (~303W, ~61°C); 80/100% are a high-load safety ramp before the 83°C ceiling.
COOL_CURVE=(79:100 73:80 64:60 55:50 0:30)
QUIET_CURVE=(82:100 78:65 73:40 0:20)
# Linear fan curve (2026-06-05). Fan% rises in proportion to CPU temperature
# between the anchor points (T_LO,P_LO) and (T_HI,P_HI) and stays flat outside
# them. It supersedes the earlier discrete step-bands, which flapped at band
# edges (e.g. 45<->65%). Both modes hit 100% exactly at the 83°C ceiling.
#   COOL  (garage empty):  30% @50°C .. 100% @83°C (~2.1%/°C; equilibrium ~60°C/~51%)
#   QUIET (someone there): 20% @68°C .. 100% @83°C (near-silent until ~70°C)
# Rationale (web-researched): a linear curve + 2-3°C hysteresis is the homelab
# standard; PID is overkill for this slow thermal loop.
# See docs/plans/2026-06-04-pve-fan-control-design.md.
# Each anchor keeps a non-empty value from the environment; otherwise the
# default below is assigned.

# COOL mode anchors
: "${COOL_T_LO:=50}"
: "${COOL_P_LO:=30}"
: "${COOL_T_HI:=83}"
: "${COOL_P_HI:=100}"

# QUIET mode anchors
: "${QUIET_T_LO:=68}"
: "${QUIET_P_LO:=20}"
: "${QUIET_T_HI:=83}"
: "${QUIET_P_HI:=100}"

# Smallest fan-% change worth an IPMI write (anti-jitter on the smooth curve).
: "${MIN_STEP:=3}"
# log <message...>: write the arguments as one line on stdout, prefixed with a
# local timestamp of the form YYYY-MM-DDTHH:MM:SS+zzzz.
log() {
  local stamp
  stamp="$(date '+%Y-%m-%dT%H:%M:%S%z')"
  printf '%s %s\n' "$stamp" "$*"
}
# ---- pure functions (no side effects; unit-tested) ----
# fc_curve <mode> <temp> -> fan percent
# Continuous linear interpolation between the per-mode anchors
# (T_LO,P_LO)..(T_HI,P_HI), clamped flat outside that range.
#   mode: "quiet" selects the QUIET_* anchors; any other value selects COOL_*.
#   temp: integer CPU temperature in °C (evaluated with shell arithmetic).
# Prints the integer fan percent on stdout. Reads the *_T_LO/_P_LO/_T_HI/_P_HI
# globals at call time; no other side effects.
fc_curve() {
  local mode="$1" temp="$2" tlo plo thi phi
  if [[ "$mode" == "quiet" ]]; then
    tlo=$QUIET_T_LO; plo=$QUIET_P_LO; thi=$QUIET_T_HI; phi=$QUIET_P_HI
  else
    tlo=$COOL_T_LO; plo=$COOL_P_LO; thi=$COOL_T_HI; phi=$COOL_P_HI
  fi
  # Flat clamps outside the ramp. Passing both also implies tlo < temp < thi,
  # so the divisor below is always positive.
  if (( temp <= tlo )); then echo "$plo"; return 0; fi
  if (( temp >= thi )); then echo "$phi"; return 0; fi
  # Integer interpolation; adding half the span before dividing rounds to the
  # nearest percent for a rising curve (P_HI >= P_LO).
  echo $(( plo + ( (temp - tlo) * (phi - plo) + (thi - tlo) / 2 ) / (thi - tlo) ))
}
# fc_decide <mode> <temp> <current_pct> <deadband> -> fan percent
@ -230,7 +233,9 @@ main() {
local presence="cool"; [[ "$ha_mode" == "auto" ]] && presence="$(get_presence)"
local eff; if [[ "$ha_mode" == "manual" ]]; then eff="manual"; elif [[ "$ha_mode" == "auto" ]]; then eff="$presence"; else eff="$ha_mode"; fi
local pct; pct="$(fc_resolve "$ha_mode" "$temp" "$manual_pct" "$presence" "$current" "$DEADBAND")"
if (( pct != current )); then
# Only write when first-run or the change clears MIN_STEP (kills 1-2% jitter
# on the continuous curve; fc_decide already gives asymmetric hysteresis).
if (( current < 0 || pct - current >= MIN_STEP || current - pct >= MIN_STEP )); then
if set_manual "$pct"; then log "temp=${temp}C ha_mode=${ha_mode} eff=${eff} fan=${pct}% (was ${current}%)"; current="$pct"
else log "WARN set_manual ${pct}% failed"; fi
fi

View file

@ -1,6 +1,6 @@
#!/usr/bin/env bash
# Unit tests for the pure functions in fan-control.sh.
# Sources the script (main is guarded), exercises curve/decide/presence/parse.
# Sources the script (main is guarded), exercises curve/decide/resolve/presence/parse.
# Run: bash infra/scripts/test-fan-control.sh
set -uo pipefail
@ -15,35 +15,31 @@ eq() { # <description> <expected> <actual>
fi
}
# --- COOL curve (power-tuned 2026-06-05: knee at 60%) ---
eq "cool 40 -> 30" 30 "$(fc_curve cool 40)"
eq "cool 54 -> 30" 30 "$(fc_curve cool 54)"
eq "cool 55 -> 50" 50 "$(fc_curve cool 55)"
eq "cool 63 -> 50" 50 "$(fc_curve cool 63)"
eq "cool 64 -> 60" 60 "$(fc_curve cool 64)"
eq "cool 72 -> 60" 60 "$(fc_curve cool 72)"
eq "cool 73 -> 80" 80 "$(fc_curve cool 73)"
eq "cool 78 -> 80" 80 "$(fc_curve cool 78)"
eq "cool 79 -> 100" 100 "$(fc_curve cool 79)"
eq "cool 91 -> 100" 100 "$(fc_curve cool 91)"
# --- COOL curve (continuous linear: 30% @50C .. 100% @83C) ---
eq "cool <=T_LO clamps" 30 "$(fc_curve cool 40)"
eq "cool 50 -> 30" 30 "$(fc_curve cool 50)"
eq "cool 55 -> 41" 41 "$(fc_curve cool 55)"
eq "cool 60 -> 51" 51 "$(fc_curve cool 60)"
eq "cool 64 -> 60" 60 "$(fc_curve cool 64)"
eq "cool 70 -> 72" 72 "$(fc_curve cool 70)"
eq "cool 75 -> 83" 83 "$(fc_curve cool 75)"
eq "cool 83 -> 100" 100 "$(fc_curve cool 83)"
eq "cool >=T_HI clamps" 100 "$(fc_curve cool 90)"
# --- QUIET curve ---
eq "quiet 50 -> 20" 20 "$(fc_curve quiet 50)"
eq "quiet 72 -> 20" 20 "$(fc_curve quiet 72)"
eq "quiet 73 -> 40" 40 "$(fc_curve quiet 73)"
eq "quiet 77 -> 40" 40 "$(fc_curve quiet 77)"
eq "quiet 78 -> 65" 65 "$(fc_curve quiet 78)"
eq "quiet 81 -> 65" 65 "$(fc_curve quiet 81)"
eq "quiet 82 -> 100" 100 "$(fc_curve quiet 82)"
# --- QUIET curve (continuous linear: 20% @68C .. 100% @83C) ---
eq "quiet <=T_LO clamps" 20 "$(fc_curve quiet 60)"
eq "quiet 68 -> 20" 20 "$(fc_curve quiet 68)"
eq "quiet 70 -> 31" 31 "$(fc_curve quiet 70)"
eq "quiet 75 -> 57" 57 "$(fc_curve quiet 75)"
eq "quiet 80 -> 84" 84 "$(fc_curve quiet 80)"
eq "quiet 83 -> 100" 100 "$(fc_curve quiet 83)"
# --- decide: hysteresis ---
eq "decide uninit -> target" 60 "$(fc_decide cool 68 -1 3)"
eq "decide ramp up now" 60 "$(fc_decide cool 68 25 3)"
eq "decide equal holds" 60 "$(fc_decide cool 64 60 3)"
eq "decide down held in band" 80 "$(fc_decide cool 70 80 3)" # 70+3=73 still 80% -> hold
eq "decide down past band" 60 "$(fc_decide cool 69 80 3)" # 69+3=72 -> 60% < 80 -> drop
eq "decide 100 holds" 100 "$(fc_decide cool 77 100 3)" # 77+3=80 -> 100 -> hold
eq "decide 100 drops" 80 "$(fc_decide cool 75 100 3)" # 75+3=78 -> 80 < 100 -> drop
# --- decide: asymmetric hysteresis (ramp up now, ease down only past the deadband) ---
eq "decide uninit -> target" 68 "$(fc_decide cool 68 -1 3)"
eq "decide ramp up now" 68 "$(fc_decide cool 68 25 3)"
eq "decide equal holds" 62 "$(fc_decide cool 65 62 3)"
eq "decide down held" 72 "$(fc_decide cool 68 72 3)" # curve(68)=68<72 but curve(71)=75 !<72 -> hold
eq "decide down past" 60 "$(fc_decide cool 64 72 3)" # curve(64)=60, curve(67)=66<72 -> drop
# --- fc_clamp / fc_resolve: HA mode resolution ---
eq "clamp over 100" 100 "$(fc_clamp 150)"
@ -51,11 +47,10 @@ eq "clamp under 0" 0 "$(fc_clamp -5)"
eq "clamp passthrough" 45 "$(fc_clamp 45)"
eq "resolve manual=slider" 42 "$(fc_resolve manual 64 42 cool -1 3)"
eq "resolve manual clamped" 100 "$(fc_resolve manual 64 150 cool -1 3)"
eq "resolve cool=cool curve" 60 "$(fc_resolve cool 64 0 cool -1 3)"
eq "resolve quiet=quiet curve" 65 "$(fc_resolve quiet 80 0 cool -1 3)"
eq "resolve auto+empty=cool" 60 "$(fc_resolve auto 64 0 cool -1 3)"
eq "resolve auto+present=quiet" 20 "$(fc_resolve auto 64 0 quiet -1 3)"
eq "resolve cool hysteresis" 60 "$(fc_resolve cool 69 0 cool 80 3)"
eq "resolve cool=cool curve" 51 "$(fc_resolve cool 60 0 cool -1 3)"
eq "resolve quiet=quiet curve" 73 "$(fc_resolve quiet 78 0 cool -1 3)"
eq "resolve auto+empty=cool" 51 "$(fc_resolve auto 60 0 cool -1 3)"
eq "resolve auto+present=quiet" 31 "$(fc_resolve auto 70 0 quiet -1 3)"
# --- presence ---
now=1000000