From b5689afe6de810074adf254d090f2f9b0803585f Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Mon, 6 Apr 2026 15:39:23 +0300 Subject: [PATCH] fix(monitoring): tune alert thresholds to reduce false positives MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - HighPowerUsage: raise from 200W to 300W (R730 idles at ~230W) - HighServiceLatency: exclude headscale (WebSocket) and authentik (SSO) from latency checks — both have inherently high avg response times --- .../modules/monitoring/prometheus_chart_values.tpl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl index d1f16d29..227b8194 100755 --- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl +++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl @@ -756,12 +756,12 @@ serverFiles: annotations: summary: "Power outage - input voltage: {{ $value | printf \"%.0f\" }}V (threshold: <150V)" - alert: HighPowerUsage - expr: r730_idrac_idrac_power_control_consumed_watts > 200 + expr: r730_idrac_idrac_power_control_consumed_watts > 300 for: 60m labels: severity: info annotations: - summary: "Server power: {{ $value | printf \"%.0f\" }}W (threshold: 200W)" + summary: "Server power: {{ $value | printf \"%.0f\" }}W (threshold: 300W)" - alert: UsingInverterEnergyForTooLong expr: automatic_transfer_switch_power_mode > 0 # 1 = Inverter; 0 = Grid for: 24h @@ -1577,10 +1577,10 @@ serverFiles: - alert: HighServiceLatency expr: | ( - sum(rate(traefik_service_request_duration_seconds_sum{service!~".*idrac.*"}[5m])) by (service) - / sum(rate(traefik_service_request_duration_seconds_count{service!~".*idrac.*"}[5m])) by (service) + sum(rate(traefik_service_request_duration_seconds_sum{service!~".*idrac.*|.*headscale.*|.*authentik.*"}[5m])) by (service) + / sum(rate(traefik_service_request_duration_seconds_count{service!~".*idrac.*|.*headscale.*|.*authentik.*"}[5m])) by (service) ) > 10 - and sum(rate(traefik_service_request_duration_seconds_count{service!~".*idrac.*"}[5m])) by (service) > 0.01 + and sum(rate(traefik_service_request_duration_seconds_count{service!~".*idrac.*|.*headscale.*|.*authentik.*"}[5m])) by (service) > 0.01 and on() (time() - process_start_time_seconds{job="prometheus"}) > 900 for: 5m labels: