From e49c91e60c8509032219c9a7987a48304aacdf9e Mon Sep 17 00:00:00 2001
From: Viktor Barzin <vbarzin@gmail.com>
Date: Wed, 10 Jun 2026 09:10:46 +0000
Subject: [PATCH] monitoring: VzdumpBackup{Stale,NeverRun,Failing} alerts for
 the new VM-image backup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

vzdump-vms pushes vzdump_last_{run,success}_timestamp + vzdump_last_status to
Pushgateway job vzdump-backup, but nothing alerted on them — a stopped/failing VM
backup would be silent (exactly how the nfs-mirror reaping went unnoticed until I
re-verified). Add the trio to the 3-2-1 group in prometheus_chart_values.tpl,
mirroring the LVM/pfSense/nfs-mirror alerts. Stale = >~50h since last success.

NOT [ci]-applied: this is a Terraform stack change — arms on the next
`scripts/tg apply` of the monitoring stack (metrics already flow, so it arms
immediately once applied). Admin-gated apply per org policy.

[ci skip]

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 docs/architecture/backup-dr.md                |  2 +-
 .../monitoring/prometheus_chart_values.tpl    | 22 +++++++++++++++++++
 2 files changed, 23 insertions(+), 1 deletion(-)
diff --git a/docs/architecture/backup-dr.md b/docs/architecture/backup-dr.md
index df30dea2..c4d0092f 100644
--- a/docs/architecture/backup-dr.md
+++ b/docs/architecture/backup-dr.md
@@ -367,7 +367,7 @@ The hand-managed Linux VMs are **intentionally not in Terraform** (telmate/bpg p
 **Retention**: `KEEP=3` newest dumps per VMID on sda (`/mnt/backup/vzdump/`); each devvm image is ~35-50 GB zstd.
 **Critical dependency**: `nfs-mirror` MUST keep `--exclude='/vzdump/'`. Its nightly `rsync -rlt --delete /srv/nfs/ → /mnt/backup/` treats any `/mnt/backup` dir with no `/srv/nfs` counterpart as an orphan and deletes it — this silently reaped the first two vzdump images at 02:00 on 2026-06-10 before the exclude was added (same reason `pvc-data`/`pfsense`/`pve-config`/`sqlite-backup` are excluded).
 **Offsite**: deliberately **NOT** appended to the incremental offsite manifest — it never deletes, so daily multi-GB images would accumulate unbounded on Synology. Instead the **monthly offsite-sync full pass (days 1-7)** mirrors all of `/mnt/backup` (including `vzdump/`) to Synology with `--delete`, bounded to local retention. So Copy 2 (sda) refreshes **daily**; Copy 3 (Synology) refreshes **monthly**.
-**Monitoring**: pushes `vzdump_last_run_timestamp` / `vzdump_last_status` / `vzdump_last_success_timestamp` to Pushgateway job `vzdump-backup`. A `VzdumpBackupStale` / `VzdumpBackupFailing` alert in `stacks/monitoring` (mirroring the LVM/pfSense backup alerts) is the recommended next addition.
+**Monitoring**: pushes `vzdump_last_run_timestamp` / `vzdump_last_status` / `vzdump_last_success_timestamp` to Pushgateway job `vzdump-backup`. Alerts `VzdumpBackupStale` (>~50h since last success), `VzdumpBackupNeverRun` (run-timestamp metric absent for 48h), `VzdumpBackupFailing` (status≠0) are defined in `stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl` (the 3-2-1 group) — **effective on the next `monitoring` stack apply** (metrics already flow, so the alerts arm immediately once applied).
 **Restore**: on the PVE host, `qmrestore /mnt/backup/vzdump/vzdump-qemu-<vmid>-<ts>.vma.zst <vmid>` — restore to a spare VMID first if the original still exists, then swap disks; or use the PVE UI (add `/mnt/backup` as a dir storage with content=backup → Restore).
 
 ### Layer 2: Weekly File-Level Backup (sda Backup Disk)
diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
index b3c53062..732b4e68 100755
--- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
+++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
@@ -1696,6 +1696,28 @@ serverFiles:
               severity: warning
             annotations:
               summary: "NFS local mirror last run failed (status={{ $value }})"
+          - alert: VzdumpBackupStale  # warns when the newest successful vzdump run is older than the threshold in expr
+            expr: (time() - vzdump_last_success_timestamp{job="vzdump-backup"}) > 180000  # left side is the age of the last success in seconds; 180000 s = 50 h
+            for: 30m  # the age must stay above the threshold for 30 minutes before the alert fires
+            labels:
+              severity: warning
+            annotations:
+              summary: "vzdump VM image backup is {{ $value | humanizeDuration }} old (threshold: ~50h / 2 daily cycles)"  # $value is the age in seconds produced by expr
+              description: "vzdump-vms.timer on 192.168.1.127 hasn't produced a fresh devvm image. Check: ssh root@192.168.1.127 systemctl status vzdump-vms. Runbook: docs/architecture/backup-dr.md (VM Image Backups)."
+          - alert: VzdumpBackupNeverRun  # warns when the run-timestamp series for job vzdump-backup is missing
+            expr: absent(vzdump_last_run_timestamp{job="vzdump-backup"})  # absent() returns a value only when no matching series exists
+            for: 48h  # the series must stay missing for 48 hours before the alert fires
+            labels:
+              severity: warning
+            annotations:
+              summary: "vzdump VM image backup job has never reported metrics to Pushgateway"  # NOTE(review): absent() also matches a series that existed and later disappeared - confirm the wording is intended
+          - alert: VzdumpBackupFailing  # warns when the last reported vzdump status is non-zero
+            expr: vzdump_last_status{job="vzdump-backup"} != 0  # any non-zero status sample matches
+            for: 0m  # no hold period: fires on the first rule evaluation that matches
+            labels:
+              severity: warning
+            annotations:
+              summary: "vzdump VM image backup last run failed (status={{ $value }})"  # $value is the non-zero status sample selected by expr
           - alert: BackupDiskFull
             expr: (1 - node_filesystem_avail_bytes{job="proxmox-host", mountpoint="/mnt/backup"} / node_filesystem_size_bytes{job="proxmox-host", mountpoint="/mnt/backup"}) > 0.85
             for: 15m