From d39770b30db31a336de1f46c920694dd80fff76e Mon Sep 17 00:00:00 2001
From: Viktor Barzin
Date: Wed, 22 Apr 2026 08:54:37 +0000
Subject: [PATCH] monitoring: tighten LVMSnapshotStale to 30h for
 daily-cadence detection
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The threshold was 48h (plus the 30m `for:` window) for a job that runs
daily. There is no need to wait 2.5 days to detect a broken timer; bring
it down to 30h plus the same 30m `for:`, which is just over one day of
cadence with grace for minor drift and retries. Also add a `description`
annotation pointing at the restore runbook so the alert text surfaces
the fix path directly.

Threshold change: 172800s → 108000s. Docs in backup-dr.md synced.

This also re-triggers a default.yml apply now that ci/Dockerfile has
been rebuilt with the vault CLI; it is the first commit touching this
stack that should actually succeed since the e80b2f02 regression.

Co-Authored-By: Claude Opus 4.7
---
 docs/architecture/backup-dr.md                                | 4 ++--
 .../monitoring/modules/monitoring/prometheus_chart_values.tpl | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/docs/architecture/backup-dr.md b/docs/architecture/backup-dr.md
index 2c992c20..eb1d11d7 100644
--- a/docs/architecture/backup-dr.md
+++ b/docs/architecture/backup-dr.md
@@ -226,7 +226,7 @@ Native LVM thin snapshots provide crash-consistent point-in-time recovery for 62
 - They already have app-level dumps (Layer 2)
 - Including them causes ~36% write amplification; excluding them reduces overhead to ~0%
 
-**Monitoring**: Pushes metrics to Pushgateway via NodePort (30091). Alerts: `LVMSnapshotStale` (>24h), `LVMSnapshotFailing`, `LVMThinPoolLow` (<15% free).
+**Monitoring**: Pushes metrics to Pushgateway via NodePort (30091). Alerts: `LVMSnapshotStale` (>30h since last run + 30m `for:`), `LVMSnapshotFailing`, `LVMThinPoolLow` (<15% free).
 
 **Restore**: `lvm-pvc-snapshot restore ` — auto-discovers K8s workload, scales down, swaps LVs, scales back up. See `docs/runbooks/restore-lvm-snapshot.md`.
 
@@ -673,7 +673,7 @@ module "nfs_backup" {
 │ ~~CloudSyncNeverRun~~     REMOVED (TrueNAS decommissioned) │
 │ ~~CloudSyncFailing~~      REMOVED (TrueNAS decommissioned) │
 │ VaultwardenIntegrityFail  integrity_ok == 0                │
-│ LVMSnapshotStale          > 24h since last snapshot        │
+│ LVMSnapshotStale          > 30h since last snapshot        │
 │ LVMSnapshotFailing        snapshot creation failed         │
 │ LVMThinPoolLow            < 15% free space in thin pool    │
 │ WeeklyBackupStale         > 8d since last success          │
diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
index 87903ff6..ae5d6d6f 100755
--- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
+++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
@@ -1249,12 +1249,13 @@ serverFiles:
         annotations:
           summary: "Backup job failed: {{ $labels.namespace }}/{{ $labels.job_name }}"
       - alert: LVMSnapshotStale
-        expr: (time() - lvm_snapshot_last_run_timestamp{job="lvm-pvc-snapshot"}) > 172800
+        expr: (time() - lvm_snapshot_last_run_timestamp{job="lvm-pvc-snapshot"}) > 108000
         for: 30m
         labels:
           severity: critical
         annotations:
           summary: "LVM PVC snapshots are {{ $value | humanizeDuration }} old (expected daily)"
+          description: "Timer lvm-pvc-snapshot.timer on 192.168.1.127 hasn't pushed fresh metrics. Runbook: docs/runbooks/restore-lvm-snapshot.md"
       - alert: LVMSnapshotNeverRun
         expr: absent(lvm_snapshot_last_run_timestamp{job="lvm-pvc-snapshot"})
         for: 48h
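
Reviewer note, not part of the patch: the new boundary can be exercised
with `promtool test rules` before merging. Below is a minimal sketch that
assumes the rendered LVMSnapshotStale rule has been extracted from the
Helm values template into a standalone rules file; the file names
alerts.yml and alerts_test.yml are hypothetical. Pinning the last-run
timestamp at 0 makes `time() - 0` cross 108000s exactly at the 30h mark,
so with the 30m `for:` the alert should be absent at 30h and firing by
31h.

# alerts_test.yml (hypothetical); run with: promtool test rules alerts_test.yml
rule_files:
  - alerts.yml            # assumed to hold the rendered LVMSnapshotStale rule
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      # Last-run timestamp pinned at 0, so staleness equals the eval time.
      - series: 'lvm_snapshot_last_run_timestamp{job="lvm-pvc-snapshot"}'
        values: '0+0x1900'
    alert_rule_test:
      # At 30h the expression sits on the boundary (not strictly greater),
      # so nothing is pending or firing yet.
      - eval_time: 30h
        alertname: LVMSnapshotStale
        exp_alerts: []
      # At 31h the expression has been true for over 30m: the alert fires.
      - eval_time: 31h
        alertname: LVMSnapshotStale
        exp_alerts:
          - exp_labels:
              severity: critical
              job: lvm-pvc-snapshot

An empty exp_alerts asserts the alert is neither pending-complete nor
firing at that instant, which pins down the 30h boundary as well as the
30m hold.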