From 324f2dc3bfc6d473a4f9ff9e05b27371a4dbb93a Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Fri, 5 Jun 2026 10:29:35 +0000 Subject: [PATCH] fan-control: continuous linear curve (replaces discrete step-bands) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the step-band fan curve with a continuous linear ramp — the bands flapped at edges (e.g. 45<->65%). Web-researched: linear + 2-3C hysteresis is the homelab standard; PID is overkill for this slow thermal loop. fan% now interpolates between env-tunable anchors: COOL 50C/30% -> 83C/100% (~2.1%/C; ~51% at the ~60C equilibrium) QUIET 68C/20% -> 83C/100% (near-silent until ~70C) Both reach 100% at the 83C ceiling. Anti-oscillation: asymmetric hysteresis (fc_decide) + a MIN_STEP (3%) min-change threshold. 41 bash tests green; deployed + verified live (59C -> 49%, smooth). [ci skip] Co-Authored-By: Claude Opus 4.8 --- .../2026-06-04-pve-fan-control-design.md | 27 ++++---- docs/runbooks/fan-control.md | 2 +- scripts/fan-control.sh | 39 ++++++------ scripts/test-fan-control.sh | 61 +++++++++---------- 4 files changed, 67 insertions(+), 62 deletions(-) diff --git a/docs/plans/2026-06-04-pve-fan-control-design.md b/docs/plans/2026-06-04-pve-fan-control-design.md index 6dc0bc31..6e140330 100644 --- a/docs/plans/2026-06-04-pve-fan-control-design.md +++ b/docs/plans/2026-06-04-pve-fan-control-design.md @@ -70,19 +70,24 @@ ceiling. QUIET is unchanged (already at the low-power floor: 20 % / 4,800 RPM / `HOLD_SECS` (15 min) ⇒ someone's around ⇒ QUIET; otherwise COOL. `house_mode` was rejected — it tracks *apartment* occupancy, irrelevant to garage noise. -4. **Two curves**, picked by presence (COOL power-tuned 2026-06-05 — see - "Power characterization" below): +4. **Two continuous LINEAR curves**, picked by presence. (Originally discrete + step-bands; replaced 2026-06-05 — the bands flapped at edges, e.g. 45↔65%. 
+ Web research: a linear curve + 2–3°C hysteresis is the homelab standard; PID + is overkill for this slow thermal loop and even PID projects "only lower, don't + chase a setpoint".) fan% interpolates between per-mode anchors, clamped flat + outside; both reach 100% right at the 83°C ceiling: - | CPU °C | COOL % (empty) | CPU °C | QUIET % (occupied) | - |--------|----------------|--------|--------------------| - | ≤54 | 30 | ≤72 | 20 (≈silent floor) | - | 55–63 | 50 | 73–77 | 40 | - | 64–72 | 60 (knee) | 78–81 | 65 | - | 73–78 | 80 | ≥82 | 100 | - | ≥79 | 100 | | | + | Mode | T_LO → P_LO | T_HI → P_HI | slope | + |------|-------------|-------------|-------| + | COOL (garage empty) | 50°C → 30% | 83°C → 100% | ~2.1%/°C (≈51% at the ~60°C equilibrium) | + | QUIET (occupied) | 68°C → 20% | 83°C → 100% | ~5.3%/°C (near-silent until ~70°C) | - 3°C downward hysteresis prevents flapping at band edges (ramp up immediately, - step down only once the curve still wants lower 3°C hotter). + Anchors are env-tunable (`COOL_T_LO/P_LO/T_HI/P_HI`, `QUIET_*`). Under normal + load the COOL equilibrium (~60°C → ~51%) sits near the measured ~60% power + knee; the ramp toward 100% only engages at genuinely high temp (safety). + Anti-oscillation: asymmetric hysteresis (ramp up immediately, ease down only + once the curve evaluated 3°C hotter still wants lower) **plus** a `MIN_STEP` (3%) min-change + threshold so 1–2% wiggles don't churn IPMI writes. ## Safety diff --git a/docs/runbooks/fan-control.md b/docs/runbooks/fan-control.md index 1f5d2ea7..9c4df935 100644 --- a/docs/runbooks/fan-control.md +++ b/docs/runbooks/fan-control.md @@ -55,7 +55,7 @@ Edit `/etc/fan-control.env` on the host, then `systemctl restart fan-control`. Common knobs: - `HOLD_SECS` — how long to stay quiet after the garage door last moved (default 900 = 15 min). - `CEILING` — temp at which we abandon manual control and let the firmware take over (default 83). 
-- Curves themselves are arrays (`COOL_CURVE`, `QUIET_CURVE`) near the top of the script. +- Curve shape: **linear anchors** near the top of the script — `COOL_T_LO/COOL_P_LO/COOL_T_HI/COOL_P_HI` (default 50°C/30% → 83°C/100%) and `QUIET_*` (68°C/20% → 83°C/100%); fan% interpolates linearly between them (replaced the old discrete step-bands). `MIN_STEP` (default 3%) = smallest fan-% change worth an IPMI write (anti-jitter); `DEADBAND` (3°C) = ease-down hysteresis. Lower `COOL_P_HI` or raise `COOL_T_HI` to run the top end quieter; raise `COOL_P_LO` / lower `COOL_T_LO` to run more fan at low-to-mid temps (this lifts the curve but flattens its slope, it does not steepen it). ## Deploy / update diff --git a/scripts/fan-control.sh b/scripts/fan-control.sh index e2a7ac83..e60ae1e6 100644 --- a/scripts/fan-control.sh +++ b/scripts/fan-control.sh @@ -47,28 +47,31 @@ set -uo pipefail : "${DRY_RUN:=0}" # 1 => log IPMI actions instead of executing : "${RUN_ONCE:=0}" # 1 => one iteration then exit (testing) -# Curves as "min_temp:pct" entries, descending; first whose min_temp <= temp wins. -# COOL is power-tuned (2026-06-05 power/temp sweep): the cooling-per-watt knee is -# ~60% — beyond it airflow buys almost nothing (60->70% = +21W/-2°C, 70->100% = -# +54W/0°C; the CPU floors ~59°C at cluster load). So the normal band caps at 60% -# (~303W, ~61°C); 80/100% are a high-load safety ramp before the 83°C ceiling. -COOL_CURVE=(79:100 73:80 64:60 55:50 0:30) -QUIET_CURVE=(82:100 78:65 73:40 0:20) +# Continuous LINEAR fan curve (2026-06-05): fan% ramps proportionally with CPU +# temp between (T_LO,P_LO) and (T_HI,P_HI), clamped flat outside. Replaces the old +# discrete step-bands (which flapped at band edges — e.g. 45<->65%). Both modes +# reach 100% right at the 83°C ceiling. Anchors are env-tunable. +# COOL (garage empty): 30% @50°C .. 100% @83°C (~2.1%/°C; equilibrium ~60°C/~51%) +# QUIET (someone there): 20% @68°C .. 
100% @83°C (near-silent until ~70°C) +# Web-researched: a linear curve + 2-3°C hysteresis is the homelab standard; PID is +# overkill for this slow thermal loop. See docs/plans/2026-06-04-pve-fan-control-design.md. +: "${COOL_T_LO:=50}"; : "${COOL_P_LO:=30}"; : "${COOL_T_HI:=83}"; : "${COOL_P_HI:=100}" +: "${QUIET_T_LO:=68}"; : "${QUIET_P_LO:=20}"; : "${QUIET_T_HI:=83}"; : "${QUIET_P_HI:=100}" +: "${MIN_STEP:=3}" # min fan-% change worth an IPMI write (anti-jitter on the smooth curve) log() { printf '%s %s\n' "$(date '+%Y-%m-%dT%H:%M:%S%z')" "$*"; } # ---- pure functions (no side effects; unit-tested) ---- -# fc_curve -> fan percent +# fc_curve -> fan percent (continuous linear interpolation between +# the per-mode (T_LO,P_LO)..(T_HI,P_HI) anchors; clamped flat outside the range). fc_curve() { - local mode="$1" temp="$2" - local -a curve - if [[ "$mode" == "quiet" ]]; then curve=("${QUIET_CURVE[@]}"); else curve=("${COOL_CURVE[@]}"); fi - local entry - for entry in "${curve[@]}"; do - if (( temp >= ${entry%%:*} )); then echo "${entry##*:}"; return 0; fi - done - echo "${curve[-1]##*:}" + local mode="$1" temp="$2" tlo plo thi phi + if [[ "$mode" == "quiet" ]]; then tlo=$QUIET_T_LO; plo=$QUIET_P_LO; thi=$QUIET_T_HI; phi=$QUIET_P_HI + else tlo=$COOL_T_LO; plo=$COOL_P_LO; thi=$COOL_T_HI; phi=$COOL_P_HI; fi + if (( temp <= tlo )); then echo "$plo"; return 0; fi + if (( temp >= thi )); then echo "$phi"; return 0; fi + echo $(( plo + ( (temp - tlo) * (phi - plo) + (thi - tlo) / 2 ) / (thi - tlo) )) # rounded } # fc_decide -> fan percent @@ -230,7 +233,9 @@ main() { local presence="cool"; [[ "$ha_mode" == "auto" ]] && presence="$(get_presence)" local eff; if [[ "$ha_mode" == "manual" ]]; then eff="manual"; elif [[ "$ha_mode" == "auto" ]]; then eff="$presence"; else eff="$ha_mode"; fi local pct; pct="$(fc_resolve "$ha_mode" "$temp" "$manual_pct" "$presence" "$current" "$DEADBAND")" - if (( pct != current )); then + # Only write when first-run or the change clears MIN_STEP 
(kills 1-2% jitter + # on the continuous curve; fc_decide already gives asymmetric hysteresis). + if (( current < 0 || pct - current >= MIN_STEP || current - pct >= MIN_STEP )); then if set_manual "$pct"; then log "temp=${temp}C ha_mode=${ha_mode} eff=${eff} fan=${pct}% (was ${current}%)"; current="$pct" else log "WARN set_manual ${pct}% failed"; fi fi diff --git a/scripts/test-fan-control.sh b/scripts/test-fan-control.sh index 2ed45d9c..660a4147 100644 --- a/scripts/test-fan-control.sh +++ b/scripts/test-fan-control.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash # Unit tests for the pure functions in fan-control.sh. -# Sources the script (main is guarded), exercises curve/decide/presence/parse. +# Sources the script (main is guarded), exercises curve/decide/resolve/presence/parse. # Run: bash infra/scripts/test-fan-control.sh set -uo pipefail @@ -15,35 +15,31 @@ eq() { # fi } -# --- COOL curve (power-tuned 2026-06-05: knee at 60%) --- -eq "cool 40 -> 30" 30 "$(fc_curve cool 40)" -eq "cool 54 -> 30" 30 "$(fc_curve cool 54)" -eq "cool 55 -> 50" 50 "$(fc_curve cool 55)" -eq "cool 63 -> 50" 50 "$(fc_curve cool 63)" -eq "cool 64 -> 60" 60 "$(fc_curve cool 64)" -eq "cool 72 -> 60" 60 "$(fc_curve cool 72)" -eq "cool 73 -> 80" 80 "$(fc_curve cool 73)" -eq "cool 78 -> 80" 80 "$(fc_curve cool 78)" -eq "cool 79 -> 100" 100 "$(fc_curve cool 79)" -eq "cool 91 -> 100" 100 "$(fc_curve cool 91)" +# --- COOL curve (continuous linear: 30% @50C .. 
100% @83C) --- +eq "cool <=T_LO clamps" 30 "$(fc_curve cool 40)" +eq "cool 50 -> 30" 30 "$(fc_curve cool 50)" +eq "cool 55 -> 41" 41 "$(fc_curve cool 55)" +eq "cool 60 -> 51" 51 "$(fc_curve cool 60)" +eq "cool 64 -> 60" 60 "$(fc_curve cool 64)" +eq "cool 70 -> 72" 72 "$(fc_curve cool 70)" +eq "cool 75 -> 83" 83 "$(fc_curve cool 75)" +eq "cool 83 -> 100" 100 "$(fc_curve cool 83)" +eq "cool >=T_HI clamps" 100 "$(fc_curve cool 90)" -# --- QUIET curve --- -eq "quiet 50 -> 20" 20 "$(fc_curve quiet 50)" -eq "quiet 72 -> 20" 20 "$(fc_curve quiet 72)" -eq "quiet 73 -> 40" 40 "$(fc_curve quiet 73)" -eq "quiet 77 -> 40" 40 "$(fc_curve quiet 77)" -eq "quiet 78 -> 65" 65 "$(fc_curve quiet 78)" -eq "quiet 81 -> 65" 65 "$(fc_curve quiet 81)" -eq "quiet 82 -> 100" 100 "$(fc_curve quiet 82)" +# --- QUIET curve (continuous linear: 20% @68C .. 100% @83C) --- +eq "quiet <=T_LO clamps" 20 "$(fc_curve quiet 60)" +eq "quiet 68 -> 20" 20 "$(fc_curve quiet 68)" +eq "quiet 70 -> 31" 31 "$(fc_curve quiet 70)" +eq "quiet 75 -> 57" 57 "$(fc_curve quiet 75)" +eq "quiet 80 -> 84" 84 "$(fc_curve quiet 80)" +eq "quiet 83 -> 100" 100 "$(fc_curve quiet 83)" -# --- decide: hysteresis --- -eq "decide uninit -> target" 60 "$(fc_decide cool 68 -1 3)" -eq "decide ramp up now" 60 "$(fc_decide cool 68 25 3)" -eq "decide equal holds" 60 "$(fc_decide cool 64 60 3)" -eq "decide down held in band" 80 "$(fc_decide cool 70 80 3)" # 70+3=73 still 80% -> hold -eq "decide down past band" 60 "$(fc_decide cool 69 80 3)" # 69+3=72 -> 60% < 80 -> drop -eq "decide 100 holds" 100 "$(fc_decide cool 77 100 3)" # 77+3=80 -> 100 -> hold -eq "decide 100 drops" 80 "$(fc_decide cool 75 100 3)" # 75+3=78 -> 80 < 100 -> drop +# --- decide: asymmetric hysteresis (ramp up now, ease down only past the deadband) --- +eq "decide uninit -> target" 68 "$(fc_decide cool 68 -1 3)" +eq "decide ramp up now" 68 "$(fc_decide cool 68 25 3)" +eq "decide equal holds" 62 "$(fc_decide cool 65 62 3)" +eq "decide down held" 72 "$(fc_decide cool 68 
72 3)" # curve(68)=68<72 but curve(71)=75 !<72 -> hold +eq "decide down past" 60 "$(fc_decide cool 64 72 3)" # curve(64)=60, curve(67)=66<72 -> drop # --- fc_clamp / fc_resolve: HA mode resolution --- eq "clamp over 100" 100 "$(fc_clamp 150)" @@ -51,11 +47,10 @@ eq "clamp under 0" 0 "$(fc_clamp -5)" eq "clamp passthrough" 45 "$(fc_clamp 45)" eq "resolve manual=slider" 42 "$(fc_resolve manual 64 42 cool -1 3)" eq "resolve manual clamped" 100 "$(fc_resolve manual 64 150 cool -1 3)" -eq "resolve cool=cool curve" 60 "$(fc_resolve cool 64 0 cool -1 3)" -eq "resolve quiet=quiet curve" 65 "$(fc_resolve quiet 80 0 cool -1 3)" -eq "resolve auto+empty=cool" 60 "$(fc_resolve auto 64 0 cool -1 3)" -eq "resolve auto+present=quiet" 20 "$(fc_resolve auto 64 0 quiet -1 3)" -eq "resolve cool hysteresis" 60 "$(fc_resolve cool 69 0 cool 80 3)" +eq "resolve cool=cool curve" 51 "$(fc_resolve cool 60 0 cool -1 3)" +eq "resolve quiet=quiet curve" 73 "$(fc_resolve quiet 78 0 cool -1 3)" +eq "resolve auto+empty=cool" 51 "$(fc_resolve auto 60 0 cool -1 3)" +eq "resolve auto+present=quiet" 31 "$(fc_resolve auto 70 0 quiet -1 3)" # --- presence --- now=1000000