From d39770b30db31a336de1f46c920694dd80fff76e Mon Sep 17 00:00:00 2001
From: Viktor Barzin
Date: Wed, 22 Apr 2026 08:54:37 +0000
Subject: [PATCH] monitoring: tighten LVMSnapshotStale to 30h for
 daily-cadence detection
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The threshold was 48h (plus the 30m `for:` window) for a job that runs
daily. There is no need to wait 2.5 days to detect a broken timer; bring
it down to 30h plus the same 30m `for:`, which is just over one day of
cadence with grace for minor drift and retries. Also add a `description`
annotation pointing at the restore runbook so the alert text surfaces
the fix path directly.

Threshold change: 172800s → 108000s. Docs in backup-dr.md synced.

This also re-triggers a default.yml apply now that ci/Dockerfile has
been rebuilt with the vault CLI; it is the first commit touching this
stack that should actually succeed since the e80b2f02 regression.

Co-Authored-By: Claude Opus 4.7
---
 docs/architecture/backup-dr.md                                | 4 ++--
 .../monitoring/modules/monitoring/prometheus_chart_values.tpl | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/docs/architecture/backup-dr.md b/docs/architecture/backup-dr.md
index 2c992c20..eb1d11d7 100644
--- a/docs/architecture/backup-dr.md
+++ b/docs/architecture/backup-dr.md
@@ -226,7 +226,7 @@ Native LVM thin snapshots provide crash-consistent point-in-time recovery for 62
 - They already have app-level dumps (Layer 2)
 - Including them causes ~36% write amplification; excluding them reduces overhead to ~0%
 
-**Monitoring**: Pushes metrics to Pushgateway via NodePort (30091). Alerts: `LVMSnapshotStale` (>24h), `LVMSnapshotFailing`, `LVMThinPoolLow` (<15% free).
+**Monitoring**: Pushes metrics to Pushgateway via NodePort (30091). Alerts: `LVMSnapshotStale` (>30h since last run + 30m `for:`), `LVMSnapshotFailing`, `LVMThinPoolLow` (<15% free).
 
 **Restore**: `lvm-pvc-snapshot restore ` — auto-discovers K8s workload, scales down, swaps LVs, scales back up. See `docs/runbooks/restore-lvm-snapshot.md`.
 
@@ -673,7 +673,7 @@ module "nfs_backup" {
 │ ~~CloudSyncNeverRun~~     REMOVED (TrueNAS decommissioned) │
 │ ~~CloudSyncFailing~~      REMOVED (TrueNAS decommissioned) │
 │ VaultwardenIntegrityFail  integrity_ok == 0                │
-│ LVMSnapshotStale          > 24h since last snapshot        │
+│ LVMSnapshotStale          > 30h since last snapshot        │
 │ LVMSnapshotFailing        snapshot creation failed         │
 │ LVMThinPoolLow            < 15% free space in thin pool    │
 │ WeeklyBackupStale         > 8d since last success          │
diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
index 87903ff6..ae5d6d6f 100755
--- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
+++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
@@ -1249,12 +1249,13 @@ serverFiles:
         annotations:
           summary: "Backup job failed: {{ $labels.namespace }}/{{ $labels.job_name }}"
       - alert: LVMSnapshotStale
-        expr: (time() - lvm_snapshot_last_run_timestamp{job="lvm-pvc-snapshot"}) > 172800
+        expr: (time() - lvm_snapshot_last_run_timestamp{job="lvm-pvc-snapshot"}) > 108000
         for: 30m
         labels:
           severity: critical
         annotations:
           summary: "LVM PVC snapshots are {{ $value | humanizeDuration }} old (expected daily)"
+          description: "Timer lvm-pvc-snapshot.timer on 192.168.1.127 hasn't pushed fresh metrics. Runbook: docs/runbooks/restore-lvm-snapshot.md"
       - alert: LVMSnapshotNeverRun
         expr: absent(lvm_snapshot_last_run_timestamp{job="lvm-pvc-snapshot"})
         for: 48h
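
Reviewer note, not part of the patch: the new boundary can be exercised
with `promtool test rules` before merging. Below is a minimal sketch that
assumes the rendered LVMSnapshotStale rule has been extracted from the
Helm values template into a standalone rules file; the file names
alerts.yml and alerts_test.yml are hypothetical. Pinning the last-run
timestamp at 0 makes `time() - 0` cross 108000s exactly at the 30h mark,
so with the 30m `for:` the alert should be absent at 30h and firing by
31h.

# alerts_test.yml (hypothetical); run with: promtool test rules alerts_test.yml
rule_files:
  - alerts.yml            # assumed to hold the rendered LVMSnapshotStale rule
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      # Last-run timestamp pinned at 0, so staleness equals the eval time.
      - series: 'lvm_snapshot_last_run_timestamp{job="lvm-pvc-snapshot"}'
        values: '0+0x1900'
    alert_rule_test:
      # At 30h the expression sits on the boundary (not strictly greater),
      # so nothing is pending or firing yet.
      - eval_time: 30h
        alertname: LVMSnapshotStale
        exp_alerts: []
      # At 31h the expression has been true for over 30m: the alert fires.
      - eval_time: 31h
        alertname: LVMSnapshotStale
        exp_alerts:
          - exp_labels:
              severity: critical
              job: lvm-pvc-snapshot

An empty exp_alerts asserts the alert is neither pending-complete nor
firing at that instant, which pins down the 30h boundary as well as the
30m hold.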