From fc5a4b66ada47e8db935e0885fa900c8d9527c02 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Wed, 27 May 2026 19:46:18 +0000 Subject: [PATCH] monitoring: exclude catchall-error-pages from HighService4xxRate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The catchall-error-pages IngressRoute matches HostRegexp(^(.+\.)?viktorbarzin\.me$) at priority=1 — it's the wildcard handler that returns 404 for any unmatched hostname (typos + scanner traffic). By design its 4xx rate sits at ~100%, so HighService4xxRate was a permanent false positive for traefik-catchall-error-pages-*@kubernetescrd. Same exclusion pattern as nextcloud/grafana/linkwarden/claude-memory (services with legitimately high 4xx counts). --- .../modules/monitoring/prometheus_chart_values.tpl | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl index d27c17f5..12fd8884 100755 --- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl +++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl @@ -2141,13 +2141,18 @@ serverFiles: annotations: summary: "5xx rate on {{ $labels.service }}: {{ $value | printf \"%.1f\" }}% (threshold: 10%)" - alert: HighService4xxRate + # `.*catchall-error-pages.*` is excluded because that IngressRoute + # is the wildcard `HostRegexp(^(.+\.)?viktorbarzin\.me$)` handler + # — its entire purpose is to return 404 for unmatched hostnames + # (typos + scanner traffic), so its 4xx rate is permanently ~100%. + # Without this exclusion the alert is a perpetual false positive. 
expr: | ( - sum(rate(traefik_service_requests_total{code=~"4..", service!~".*nextcloud.*|.*grafana.*|.*linkwarden.*|.*claude-memory.*"}[5m])) by (service) - / sum(rate(traefik_service_requests_total{service!~".*nextcloud.*|.*grafana.*|.*linkwarden.*|.*claude-memory.*"}[5m])) by (service) + sum(rate(traefik_service_requests_total{code=~"4..", service!~".*nextcloud.*|.*grafana.*|.*linkwarden.*|.*claude-memory.*|.*catchall-error-pages.*"}[5m])) by (service) + / sum(rate(traefik_service_requests_total{service!~".*nextcloud.*|.*grafana.*|.*linkwarden.*|.*claude-memory.*|.*catchall-error-pages.*"}[5m])) by (service) * 100 ) > 30 - and sum(rate(traefik_service_requests_total{service!~".*nextcloud.*|.*grafana.*|.*linkwarden.*|.*claude-memory.*"}[5m])) by (service) > 0.1 + and sum(rate(traefik_service_requests_total{service!~".*nextcloud.*|.*grafana.*|.*linkwarden.*|.*claude-memory.*|.*catchall-error-pages.*"}[5m])) by (service) > 0.1 and on() (time() - process_start_time_seconds{job="prometheus"}) > 900 for: 15m labels: