From 416c2a0468fcbcd4ed770cfbee1872398c799003 Mon Sep 17 00:00:00 2001
From: Viktor Barzin
Date: Sun, 24 May 2026 11:55:19 +0000
Subject: [PATCH] monitoring: add AncaElementsMirror{Stale,Failing} alerts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Layer 3a (anca-elements local mirror) now has the same alert coverage as
offsite-sync-backup:
- AncaElementsMirrorStale fires if last_run_timestamp > 16d (2 weekly
  cycles, matches the 8d → 9d slack used elsewhere)
- AncaElementsMirrorFailing fires if last_status != 0

BackupDiskFull (existing) covers the sda fill-up risk at 85%.

Not applied this commit — pick up on next monitoring stack apply.
---
Reviewer notes (this area is not part of the commit message):
- 1382400 s = 16 * 86400 s = 16 d, matching the threshold quoted in the
  AncaElementsMirrorStale summary.
- NOTE(review): leading whitespace in the hunk below was reconstructed
  from a whitespace-collapsed copy of this patch; confirm the rule
  indentation against the target file before applying.

 .../modules/monitoring/prometheus_chart_values.tpl | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
index f8561b8c..d590b435 100755
--- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
+++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
@@ -1562,6 +1562,20 @@ serverFiles:
               severity: warning
             annotations:
               summary: "Offsite backup sync is {{ $value | humanizeDuration }} old (threshold: 9d)"
+          - alert: AncaElementsMirrorStale
+            expr: (time() - anca_elements_mirror_last_run_timestamp{job="anca-elements-mirror"}) > 1382400
+            for: 30m
+            labels:
+              severity: warning
+            annotations:
+              summary: "anca-elements mirror is {{ $value | humanizeDuration }} old (threshold: 16d / 2 weekly cycles)"
+          - alert: AncaElementsMirrorFailing
+            expr: anca_elements_mirror_last_status{job="anca-elements-mirror"} != 0
+            for: 0m
+            labels:
+              severity: warning
+            annotations:
+              summary: "anca-elements mirror last run failed (status={{ $value }})"
           - alert: BackupDiskFull
             expr: (1 - node_filesystem_avail_bytes{job="proxmox-host", mountpoint="/mnt/backup"} / node_filesystem_size_bytes{job="proxmox-host", mountpoint="/mnt/backup"}) > 0.85
             for: 15m