From 63ee655c08acb90307294c1c55744060af4d2378 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Thu, 4 Jun 2026 08:07:58 +0000 Subject: [PATCH] monitoring: fix PrometheusBackupStale false-fire (32d->40d threshold) The prometheus-backup sidecar runs monthly on the 1st SUNDAY 04:00 UTC. Consecutive first-Sundays are either 28 or 35 days apart (e.g. May 3 -> Jun 7 = 35d), but the alert threshold was 32d (2764800s), so it false-fired in every 35-day gap (four or five times a year) for the ~3 days between day 32 and the next run. Raised to 40d (3456000s): clears the max first-Sunday spacing (35d) with margin, still catches a genuinely missed monthly backup. Backup itself is healthy (last May 3, next Jun 7). Verified: live rule now > 3.456e6, alert state inactive. --- .../modules/monitoring/prometheus_chart_values.tpl | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl index 91145a33..009f2798 100755 --- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl +++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl @@ -1554,12 +1554,18 @@ serverFiles: annotations: summary: "Redis backup CronJob has never completed successfully" - alert: PrometheusBackupStale - expr: (time() - prometheus_backup_last_success_timestamp{job="prometheus-backup"}) > 2764800 + # The backup sidecar runs monthly on the 1st SUNDAY 04:00 UTC. + # Consecutive first-Sundays are either 28 or 35 days apart (e.g. + # May 3 → Jun 7 = 35d), so a 32d threshold false-fired in every + # 35-day gap before the next run. 40d (3456000s) clears the max + # first-Sunday spacing with margin while still catching a genuinely + # missed monthly backup. 
+ expr: (time() - prometheus_backup_last_success_timestamp{job="prometheus-backup"}) > 3456000 for: 30m labels: severity: critical annotations: - summary: "Prometheus backup is {{ $value | humanizeDuration }} old (threshold: 32d)" + summary: "Prometheus backup is {{ $value | humanizeDuration }} old (threshold: 40d)" - alert: PrometheusBackupNeverRun expr: absent(prometheus_backup_last_success_timestamp{job="prometheus-backup"}) for: 32d