From cd13b9d0625eb823b21f6e3a297f266ba44bf7f7 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Mon, 11 May 2026 20:13:42 +0000 Subject: [PATCH] =?UTF-8?q?monitoring:=20drop=20PVAutoExpanding=20alert=20?= =?UTF-8?q?=E2=80=94=20info-only=20noise,=20not=20actionable?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PVAutoExpanding fired at >80% used (info severity), but pvc-autoresizer's threshold is 10% free (= 90% used) — the alert always fired ~10 percentage points before any action would have been taken, and there was nothing for an operator to do during that window either. It was a "heads up" that didn't surface a problem. Real failure modes are already covered: * PVFillingUp (critical, >95% for 10m) — autoresizer didn't keep up * PVPredictedFull (warning, predict_linear over a 6h window projected 24h ahead) — trend toward exhaustion Sharpened PVFillingUp's annotation to spell out the likely causes (storage_limit reached, expansion failing, or missing autoresizer annotations) so the responder doesn't have to recall the runbook. 
--- .../modules/monitoring/prometheus_chart_values.tpl | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl index be600107..4f8b4f8f 100755 --- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl +++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl @@ -1134,20 +1134,18 @@ serverFiles: severity: warning annotations: summary: "Disk {{ $labels.mountpoint }} on {{ $labels.instance }}: {{ $value | printf \"%.1f\" }}% free (threshold: 10%)" - - alert: PVAutoExpanding - expr: (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes) * 100 > 80 and kubelet_volume_stats_capacity_bytes < 1099511627776 - for: 5m - labels: - severity: info - annotations: - summary: "PV {{ $labels.persistentvolumeclaim }} in {{ $labels.namespace }}: {{ $value | printf \"%.0f\" }}% used — auto-expansion should trigger" + # PVAutoExpanding removed — was info-only at >80% used, but + # pvc-autoresizer's threshold is 10% free (= 90% used), so the + # alert always fired ~10 percentage points before any action + # was needed. Real failures are caught by PVFillingUp (autoresizer + # didn't keep up) and PVPredictedFull (trend toward exhaustion). 
- alert: PVFillingUp expr: (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes) * 100 > 95 and kubelet_volume_stats_capacity_bytes < 1099511627776 for: 10m labels: severity: critical annotations: - summary: "PV {{ $labels.persistentvolumeclaim }} in {{ $labels.namespace }}: {{ $value | printf \"%.0f\" }}% used — auto-expansion may have failed" + summary: "PV {{ $labels.persistentvolumeclaim }} in {{ $labels.namespace }}: {{ $value | printf \"%.0f\" }}% used — pvc-autoresizer didn't expand in time (storage_limit reached, expansion failing, or no autoresizer annotations)" - alert: PVPredictedFull expr: predict_linear(kubelet_volume_stats_used_bytes[6h], 3600*24) > kubelet_volume_stats_capacity_bytes and kubelet_volume_stats_capacity_bytes < 1099511627776 for: 1h