add 3-2-1 backup pipeline: weekly PVC file copy, NFS mirror, pfsense, offsite sync
- weekly-backup.sh: mounts LVM thin snapshots read-only, rsyncs files to /mnt/backup/pvc-data with --link-dest versioning (4 weeks). Also mirrors NFS backup dirs from TrueNAS, backs up pfsense (config.xml + full tar) and PVE host config, and prunes snapshots older than 7 days.
- offsite-sync-backup.sh: rsync --files-from manifest to Synology (no full directory walk). Monthly full --delete sync on the 1st Sunday. After=weekly-backup.service dependency.
- lvm-pvc-snapshot.timer: changed to daily at 03:00 (was 2x daily).
- Prometheus alerts: WeeklyBackupStale, WeeklyBackupFailing, PfsenseBackupStale, OffsiteBackupSyncStale, BackupDiskFull. LVMSnapshotStale threshold 24h→48h.
This commit is contained in:
parent
09b4bad958
commit
d009f9a0f2
8 changed files with 422 additions and 2 deletions
|
|
@@ -1074,12 +1074,12 @@ serverFiles:
|
|||
annotations:
|
||||
summary: "Backup job failed: {{ $labels.namespace }}/{{ $labels.job_name }}"
|
||||
- alert: LVMSnapshotStale
|
||||
expr: (time() - lvm_snapshot_last_run_timestamp{job="lvm-pvc-snapshot"}) > 86400
|
||||
expr: (time() - lvm_snapshot_last_run_timestamp{job="lvm-pvc-snapshot"}) > 172800
|
||||
for: 30m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "LVM PVC snapshots are {{ $value | humanizeDuration }} old (expected every 12h)"
|
||||
summary: "LVM PVC snapshots are {{ $value | humanizeDuration }} old (expected daily)"
|
||||
- alert: LVMSnapshotNeverRun
|
||||
expr: absent(lvm_snapshot_last_run_timestamp{job="lvm-pvc-snapshot"})
|
||||
for: 48h
|
||||
|
|
@@ -1101,6 +1101,42 @@ serverFiles:
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: "LVM thin pool has only {{ $value }}% free — snapshot overhead may cause pool exhaustion"
|
||||
# --- 3-2-1 Backup Pipeline Alerts ---
|
||||
- alert: WeeklyBackupStale
|
||||
expr: (time() - weekly_backup_last_run_timestamp{job="weekly-backup"}) > 777600
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Weekly backup is {{ $value | humanizeDuration }} old (threshold: 9d)"
|
||||
- alert: WeeklyBackupFailing
|
||||
expr: weekly_backup_last_status{job="weekly-backup"} != 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Weekly backup completed with errors (status={{ $value }})"
|
||||
- alert: PfsenseBackupStale
|
||||
expr: (time() - backup_last_success_timestamp{job="pfsense-backup"}) > 777600
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "pfsense backup is {{ $value | humanizeDuration }} old (threshold: 9d)"
|
||||
- alert: OffsiteBackupSyncStale
|
||||
expr: (time() - backup_last_success_timestamp{job="offsite-backup-sync"}) > 777600
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Offsite backup sync is {{ $value | humanizeDuration }} old (threshold: 9d)"
|
||||
- alert: BackupDiskFull
|
||||
expr: (1 - node_filesystem_avail_bytes{job="proxmox-host", mountpoint="/mnt/backup"} / node_filesystem_size_bytes{job="proxmox-host", mountpoint="/mnt/backup"}) > 0.85
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Backup disk /mnt/backup is {{ $value | humanizePercentage }} full"
|
||||
- alert: NewTailscaleClient
|
||||
expr: irate(headscale_machine_registrations_total{action="reauth"}[5m]) > 0
|
||||
for: 5m
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue