diff --git a/docs/architecture/backup-dr.md b/docs/architecture/backup-dr.md index 2c992c20..eb1d11d7 100644 --- a/docs/architecture/backup-dr.md +++ b/docs/architecture/backup-dr.md @@ -226,7 +226,7 @@ Native LVM thin snapshots provide crash-consistent point-in-time recovery for 62 - They already have app-level dumps (Layer 2) - Including them causes ~36% write amplification; excluding them reduces overhead to ~0% -**Monitoring**: Pushes metrics to Pushgateway via NodePort (30091). Alerts: `LVMSnapshotStale` (>24h), `LVMSnapshotFailing`, `LVMThinPoolLow` (<15% free). +**Monitoring**: Pushes metrics to Pushgateway via NodePort (30091). Alerts: `LVMSnapshotStale` (>30h since last run + 30m `for:`), `LVMSnapshotFailing`, `LVMThinPoolLow` (<15% free). **Restore**: `lvm-pvc-snapshot restore ` — auto-discovers K8s workload, scales down, swaps LVs, scales back up. See `docs/runbooks/restore-lvm-snapshot.md`. @@ -673,7 +673,7 @@ module "nfs_backup" { │ ~~CloudSyncNeverRun~~ REMOVED (TrueNAS decommissioned) │ │ ~~CloudSyncFailing~~ REMOVED (TrueNAS decommissioned) │ │ VaultwardenIntegrityFail integrity_ok == 0 │ -│ LVMSnapshotStale > 24h since last snapshot │ +│ LVMSnapshotStale > 30h since last snapshot │ │ LVMSnapshotFailing snapshot creation failed │ │ LVMThinPoolLow < 15% free space in thin pool │ │ WeeklyBackupStale > 8d since last success │ diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl index 87903ff6..ae5d6d6f 100755 --- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl +++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl @@ -1249,12 +1249,13 @@ serverFiles: annotations: summary: "Backup job failed: {{ $labels.namespace }}/{{ $labels.job_name }}" - alert: LVMSnapshotStale - expr: (time() - lvm_snapshot_last_run_timestamp{job="lvm-pvc-snapshot"}) > 172800 + expr: (time() - lvm_snapshot_last_run_timestamp{job="lvm-pvc-snapshot"}) > 108000 for: 30m labels: severity: critical annotations: summary: "LVM PVC snapshots are {{ $value | humanizeDuration }} old (expected daily)" + description: "Timer lvm-pvc-snapshot.timer on 192.168.1.127 hasn't pushed fresh metrics. Runbook: docs/runbooks/restore-lvm-snapshot.md" - alert: LVMSnapshotNeverRun expr: absent(lvm_snapshot_last_run_timestamp{job="lvm-pvc-snapshot"}) for: 48h