From 82f674a0b4414c551a2aa005498e06be743be567 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Mon, 13 Apr 2026 18:37:04 +0000 Subject: [PATCH] =?UTF-8?q?rename=20weekly-backup=20=E2=86=92=20daily-back?= =?UTF-8?q?up=20across=20scripts,=20timers,=20services,=20and=20docs=20[ci?= =?UTF-8?q?=20skip]?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reflects the schedule change from weekly to daily. All references updated: - scripts/weekly-backup.{sh,timer,service} → daily-backup.* - Pushgateway job name: weekly-backup → daily-backup - Prometheus metric names: weekly_backup_* → daily_backup_* - All docs, runbooks, AGENTS.md, CLAUDE.md, proxmox-inventory - offsite-sync dependency: After=daily-backup.service Co-Authored-By: Claude Opus 4.6 (1M context) --- .claude/CLAUDE.md | 4 +- .claude/reference/proxmox-inventory.md | 4 +- .claude/scripts/backup-verify.sh | 54 +++++++++---------- AGENTS.md | 4 +- docs/architecture/backup-dr.md | 38 ++++++------- docs/runbooks/restore-full-cluster.md | 2 +- docs/runbooks/restore-pvc-from-backup.md | 4 +- ...ly-backup.service => daily-backup.service} | 4 +- scripts/{weekly-backup.sh => daily-backup.sh} | 18 +++---- ...weekly-backup.timer => daily-backup.timer} | 0 scripts/offsite-sync-backup.service | 2 +- scripts/offsite-sync-backup.sh | 2 +- .../monitoring/prometheus_chart_values.tpl | 8 +-- 13 files changed, 72 insertions(+), 72 deletions(-) rename scripts/{weekly-backup.service => daily-backup.service} (73%) rename scripts/{weekly-backup.sh => daily-backup.sh} (94%) rename scripts/{weekly-backup.timer => daily-backup.timer} (100%) diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md index 469bc9dc..afba135a 100755 --- a/.claude/CLAUDE.md +++ b/.claude/CLAUDE.md @@ -188,8 +188,8 @@ resource "kubernetes_persistent_volume_claim" "data_proxmox" { **Copy 3**: Synology NAS offsite (two-tier: sda + NFS) **PVE host scripts** (source: `infra/scripts/`): -- `/usr/local/bin/weekly-backup` — Sunday 
05:00. Mounts LVM thin snapshots ro → rsyncs FILES to `/mnt/backup/pvc-data////` with `--link-dest` versioning (4 weeks). Auto SQLite backup (magic number check, `?mode=ro`). Auto-discovered BACKUP_DIRS (glob, not hardcoded). Also backs up pfSense (config.xml + tar), PVE config. Prunes snapshots >7d. -- `/usr/local/bin/offsite-sync-backup` — Sunday 08:00 (After=weekly-backup). Step 1: sda → Synology `pve-backup/` (PVC snapshots, pfSense, PVE config). Step 2: NFS → Synology `nfs/` + `nfs-ssd/` via inotify change-tracked `rsync --files-from`. Monthly full `rsync --delete` on 1st Sunday. +- `/usr/local/bin/daily-backup` — Daily 05:00. Mounts LVM thin snapshots ro → rsyncs FILES to `/mnt/backup/pvc-data////` with `--link-dest` versioning (4 weeks). Auto SQLite backup (magic number check, `?mode=ro`). Auto-discovered BACKUP_DIRS (glob, not hardcoded). Also backs up pfSense (config.xml + tar), PVE config. Prunes snapshots >7d. +- `/usr/local/bin/offsite-sync-backup` — Daily 06:00 (After=daily-backup). Step 1: sda → Synology `pve-backup/` (PVC snapshots, pfSense, PVE config). Step 2: NFS → Synology `nfs/` + `nfs-ssd/` via inotify change-tracked `rsync --files-from`. Monthly full `rsync --delete` on 1st Sunday. - `/usr/local/bin/lvm-pvc-snapshot` — Daily 03:00. Thin snapshots of all PVCs except dbaas+monitoring. 7-day retention. Instant restore: `lvm-pvc-snapshot restore `. - `nfs-change-tracker.service` — Continuous inotifywait on `/srv/nfs` + `/srv/nfs-ssd`. Logs changed file paths to `/mnt/backup/.nfs-changes.log`. Consumed by offsite-sync-backup for incremental rsync (completes in seconds instead of 30+ minutes). 
diff --git a/.claude/reference/proxmox-inventory.md b/.claude/reference/proxmox-inventory.md index fc2f79cc..e2d8308d 100644 --- a/.claude/reference/proxmox-inventory.md +++ b/.claude/reference/proxmox-inventory.md @@ -118,8 +118,8 @@ Channel 3: A4 [32G] ──── A8 [32G] ──── A12[ 8G ] = 72 GB | Unit | Type | Schedule | Purpose | |------|------|----------|---------| | `lvm-pvc-snapshot.timer` | Timer | Daily 03:00 | LVM thin snapshots of all PVCs (7-day retention) | -| `weekly-backup.timer` | Timer | Sunday 05:00 | PVC file backup, auto SQLite backup, pfSense, PVE config | -| `offsite-sync-backup.timer` | Timer | Sunday 08:00 | Two-step rsync to Synology (sda + NFS via inotify) | +| `daily-backup.timer` | Timer | Daily 05:00 | PVC file backup, auto SQLite backup, pfSense, PVE config | +| `offsite-sync-backup.timer` | Timer | Daily 06:00 | Two-step rsync to Synology (sda + NFS via inotify) | | `nfs-change-tracker.service` | Service | Continuous | inotifywait on `/srv/nfs` + `/srv/nfs-ssd`, logs to `/mnt/backup/.nfs-changes.log` | ## GPU Node (k8s-node1) diff --git a/.claude/scripts/backup-verify.sh b/.claude/scripts/backup-verify.sh index 37ee409b..d72d5f4b 100755 --- a/.claude/scripts/backup-verify.sh +++ b/.claude/scripts/backup-verify.sh @@ -148,15 +148,15 @@ check_lvm_snapshot_timer() { # LAYER 2: Weekly Backup (sda) # ============================================================ -check_weekly_backup_freshness() { - if $DRY_RUN; then add_check "weekly-backup-freshness" "ok" "DRY RUN"; return; fi - if ! $PVE_REACHABLE; then add_check "weekly-backup-freshness" "fail" "PVE unreachable"; return; fi +check_daily_backup_freshness() { + if $DRY_RUN; then add_check "daily-backup-freshness" "ok" "DRY RUN"; return; fi + if ! 
$PVE_REACHABLE; then add_check "daily-backup-freshness" "fail" "PVE unreachable"; return; fi

    local ts
-    ts=$($PVE_SSH "curl -s http://10.0.20.100:30091/metrics 2>/dev/null | grep '^weekly_backup_last_run_timestamp' | head -1 | awk '{print \$2}'" 2>/dev/null) || true
+    ts=$($PVE_SSH "curl -s http://10.0.20.100:30091/metrics 2>/dev/null | grep '^daily_backup_last_run_timestamp' | head -1 | awk '{print \$2}'" 2>/dev/null) || true

    if [ -z "$ts" ]; then
-        add_check "weekly-backup-freshness" "fail" "No weekly backup metric — may have never run"
+        add_check "daily-backup-freshness" "fail" "No daily backup metric — may have never run"
        return
    fi

@@ -165,44 +165,44 @@ check_weekly_backup_freshness() {
    age_h=$(python3 -c "print(f'{($now - $ts) / 3600:.1f}')" 2>/dev/null)

    if python3 -c "exit(0 if ($now - $ts) < 777600 else 1)" 2>/dev/null; then # 9d
-        add_check "weekly-backup-freshness" "ok" "Last run ${age_h}h ago"
+        add_check "daily-backup-freshness" "ok" "Last run ${age_h}h ago"
    else
-        add_check "weekly-backup-freshness" "fail" "Weekly backup stale: ${age_h}h ago (threshold: 9d)"
+        add_check "daily-backup-freshness" "fail" "Daily backup stale: ${age_h}h ago (threshold: 9d)"
    fi
}

-check_weekly_backup_status() {
-    if $DRY_RUN; then add_check "weekly-backup-status" "ok" "DRY RUN"; return; fi
-    if ! $PVE_REACHABLE; then add_check "weekly-backup-status" "fail" "PVE unreachable"; return; fi
+check_daily_backup_status() {
+    if $DRY_RUN; then add_check "daily-backup-status" "ok" "DRY RUN"; return; fi
+    if ! 
$PVE_REACHABLE; then add_check "daily-backup-status" "fail" "PVE unreachable"; return; fi

    local status
-    status=$($PVE_SSH "curl -s http://10.0.20.100:30091/metrics 2>/dev/null | grep '^weekly_backup_last_status' | head -1 | awk '{print \$2}'" 2>/dev/null) || true
+    status=$($PVE_SSH "curl -s http://10.0.20.100:30091/metrics 2>/dev/null | grep '^daily_backup_last_status' | head -1 | awk '{print \$2}'" 2>/dev/null) || true

    if [ "$status" = "0" ] || [ "$status" = "0.0" ]; then
-        add_check "weekly-backup-status" "ok" "Last weekly backup succeeded"
+        add_check "daily-backup-status" "ok" "Last daily backup succeeded"
    elif [ -z "$status" ]; then
-        add_check "weekly-backup-status" "warn" "No status metric found"
+        add_check "daily-backup-status" "warn" "No status metric found"
    else
-        add_check "weekly-backup-status" "fail" "Last weekly backup failed (status=$status)"
+        add_check "daily-backup-status" "fail" "Last daily backup failed (status=$status)"
    fi
}

-check_weekly_backup_timer() {
-    if $DRY_RUN; then add_check "weekly-backup-timer" "ok" "DRY RUN"; return; fi
-    if ! $PVE_REACHABLE; then add_check "weekly-backup-timer" "fail" "PVE unreachable"; return; fi
+check_daily_backup_timer() {
+    if $DRY_RUN; then add_check "daily-backup-timer" "ok" "DRY RUN"; return; fi
+    if ! 
$PVE_REACHABLE; then add_check "daily-backup-timer" "fail" "PVE unreachable"; return; fi local active enabled - active=$($PVE_SSH "systemctl is-active weekly-backup.timer 2>/dev/null" 2>/dev/null) || active="unknown" - enabled=$($PVE_SSH "systemctl is-enabled weekly-backup.timer 2>/dev/null" 2>/dev/null) || enabled="unknown" + active=$($PVE_SSH "systemctl is-active daily-backup.timer 2>/dev/null" 2>/dev/null) || active="unknown" + enabled=$($PVE_SSH "systemctl is-enabled daily-backup.timer 2>/dev/null" 2>/dev/null) || enabled="unknown" if [ "$active" = "active" ] && [ "$enabled" = "enabled" ]; then - add_check "weekly-backup-timer" "ok" "Timer active and enabled" + add_check "daily-backup-timer" "ok" "Timer active and enabled" else - add_check "weekly-backup-timer" "fail" "Timer: active=$active enabled=$enabled" + add_check "daily-backup-timer" "fail" "Timer: active=$active enabled=$enabled" if $FIX; then - $PVE_SSH "systemctl enable --now weekly-backup.timer" 2>/dev/null && \ - add_check "weekly-backup-timer-fix" "ok" "AUTO-FIX: Timer re-enabled" || \ - add_check "weekly-backup-timer-fix" "fail" "AUTO-FIX: Failed to re-enable timer" + $PVE_SSH "systemctl enable --now daily-backup.timer" 2>/dev/null && \ + add_check "daily-backup-timer-fix" "ok" "AUTO-FIX: Timer re-enabled" || \ + add_check "daily-backup-timer-fix" "fail" "AUTO-FIX: Failed to re-enable timer" fi fi } @@ -529,9 +529,9 @@ check_lvm_thinpool_free check_lvm_snapshot_timer # Layer 2: Weekly Backup (sda) -check_weekly_backup_freshness -check_weekly_backup_status -check_weekly_backup_timer +check_daily_backup_freshness +check_daily_backup_status +check_daily_backup_timer check_sda_mount check_sda_disk_usage check_pvc_data_freshness diff --git a/AGENTS.md b/AGENTS.md index d8463b0e..f159d7e2 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -66,8 +66,8 @@ Terragrunt-based homelab managing a Kubernetes cluster (5 nodes, v1.34.2) on Pro - **NFS mount options**: Always `soft,timeo=30,retrans=3` to prevent 
uninterruptible sleep (D state). - **NFS export directory must exist** on the Proxmox host before Terraform can create the PV. - **Backup (3-2-1)**: Copy 1 = live PVCs on sdc. Copy 2 = sda `/mnt/backup` (PVC file backups, auto SQLite backups, pfSense, PVE config). Copy 3 = Synology offsite (two-tier: sda→`pve-backup/`, NFS→`nfs/`+`nfs-ssd/` via inotify change tracking). -- **weekly-backup** (Sunday 05:00): Auto-discovered BACKUP_DIRS (glob), auto SQLite backup (magic number + `?mode=ro`), pfSense, PVE config. No NFS mirror step (NFS syncs directly to Synology via inotify). -- **offsite-sync-backup** (Sunday 08:00): Step 1: sda→Synology `pve-backup/`. Step 2: NFS→Synology `nfs/`+`nfs-ssd/` via `rsync --files-from` (inotify change log). Monthly full `--delete`. +- **daily-backup** (Daily 05:00): Auto-discovered BACKUP_DIRS (glob), auto SQLite backup (magic number + `?mode=ro`), pfSense, PVE config. No NFS mirror step (NFS syncs directly to Synology via inotify). +- **offsite-sync-backup** (Daily 06:00): Step 1: sda→Synology `pve-backup/`. Step 2: NFS→Synology `nfs/`+`nfs-ssd/` via `rsync --files-from` (inotify change log). Monthly full `--delete`. - **nfs-change-tracker.service**: inotifywait on `/srv/nfs` + `/srv/nfs-ssd`, logs to `/mnt/backup/.nfs-changes.log`. Incremental syncs complete in seconds. - **Synology layout** (`/volume1/Backup/Viki/`): `pve-backup/` (from sda), `nfs/` (from `/srv/nfs`), `nfs-ssd/` (from `/srv/nfs-ssd`). `truenas/` renamed to `nfs/`, `pve-backup/nfs-mirror/` removed. diff --git a/docs/architecture/backup-dr.md b/docs/architecture/backup-dr.md index b64bdcbf..b052cb9a 100644 --- a/docs/architecture/backup-dr.md +++ b/docs/architecture/backup-dr.md @@ -29,7 +29,7 @@ graph TB end subgraph Layer2["Layer 2: Weekly File Backup"] - PVCBackup["PVC File Copy
Sunday 05:00
4 weekly versions
/mnt/backup/pvc-data//"] + PVCBackup["PVC File Copy
Daily 05:00
4 weekly versions
/mnt/backup/pvc-data//"] SQLiteBackup["Auto SQLite Backup
magic number check + ?mode=ro
from PVC snapshots"] PfsenseBackup["pfSense Backup
config.xml + full tar
4 weekly versions"] PVEConfig["PVE Config
/etc/pve + scripts"] @@ -56,7 +56,7 @@ graph TB end subgraph Layer3["Layer 3: Offsite Sync"] - PVEOffsite["Step 1: sda → Synology
Sunday 08:00
pve-backup/ only"] + PVEOffsite["Step 1: sda → Synology
Daily 06:00
pve-backup/ only"] NFSOffsite["Step 2: NFS → Synology
inotify change-tracked
rsync --files-from
nfs/ + nfs-ssd/"] end @@ -94,7 +94,7 @@ graph LR S02["02:00 Vault backup
(CronJob)"] S03a["03:00 Redis backup
(CronJob)"] S03b["03:00 LVM snapshots
(lvm-pvc-snapshot timer)"] - S05["05:00 Weekly backup
(weekly-backup timer)
1. PVC file copy (auto-discovered BACKUP_DIRS)
2. Auto SQLite backup (magic number + ?mode=ro)
3. pfSense backup
4. PVE config
5. Prune snapshots"] + S05["05:00 Daily backup
(daily-backup timer)
1. PVC file copy (auto-discovered BACKUP_DIRS)
2. Auto SQLite backup (magic number + ?mode=ro)
3. pfSense backup
4. PVE config
5. Prune snapshots"]
-    S08["08:00 Offsite sync&#10;(offsite-sync-backup timer)&#10;Step 1: sda → Synology pve-backup/&#10;Step 2: NFS → Synology nfs/ + nfs-ssd/&#10;(inotify change-tracked)"]
+    S08["06:00 Offsite sync&#10;(offsite-sync-backup timer)&#10;Step 1: sda → Synology pve-backup/&#10;Step 2: NFS → Synology nfs/ + nfs-ssd/&#10;(inotify change-tracked)"]
  end
@@ -195,11 +195,11 @@
 | Component | Version/Schedule | Location | Purpose |
 |-----------|-----------------|----------|---------|
 | LVM Thin Snapshots | Daily 03:00, 7d retention | PVE host: `lvm-pvc-snapshot` | CoW snapshots of 62 proxmox-lvm PVCs |
-| Weekly PVC Backup | Sunday 05:00, 4 weeks | PVE host: `weekly-backup` | File-level PVC copy to sda |
-| Auto SQLite Backup | Sunday 05:00 + weekly-backup | PVE host: magic number check + ?mode=ro | Safe SQLite backup from PVC snapshots |
+| Daily PVC Backup | Daily 05:00, 4 weeks | PVE host: `daily-backup` | File-level PVC copy to sda |
+| Auto SQLite Backup | Daily 05:00 + daily-backup | PVE host: magic number check + ?mode=ro | Safe SQLite backup from PVC snapshots |
 | NFS Change Tracker | Continuous (inotifywait) | PVE host: `nfs-change-tracker.service` | Logs changed NFS file paths to `/mnt/backup/.nfs-changes.log` |
-| pfSense Backup | Sunday 05:00 + weekly-backup | PVE host: SSH + API | config.xml + full filesystem tar |
-| Offsite Sync | Sunday 08:00 (after weekly-backup) | PVE host: `offsite-sync-backup` | Two-step: sda→pve-backup + NFS→nfs/nfs-ssd via inotify |
+| pfSense Backup | Daily 05:00 + daily-backup | PVE host: SSH + API | config.xml + full filesystem tar |
+| Offsite Sync | Daily 06:00 (after daily-backup) | PVE host: `offsite-sync-backup` | Two-step: sda→pve-backup + NFS→nfs/nfs-ssd via inotify |
 | PostgreSQL Backup | Daily 00:00, 14d retention | CronJob in `dbaas` namespace | pg_dumpall for all databases |
 | MySQL Backup | Daily 00:30, 14d retention | CronJob in `dbaas` namespace | mysqldump for all databases |
 | etcd Backup | Weekly Sunday 01:00, 30d | CronJob in `kube-system` | etcdctl snapshot |
@@ -232,8 +232,8 @@ Native LVM thin snapshots provide crash-consistent point-in-time recovery for 62
 
 **Backup disk**: sda (1.1TB RAID1 SAS) → VG `backup` → LV `data` → ext4 → mounted at `/mnt/backup` on PVE host. Dedicated backup disk, independent of live storage. 
-**Script**: `/usr/local/bin/weekly-backup` on PVE host (source: `infra/scripts/weekly-backup`)
-**Schedule**: Sunday 05:00 via systemd timer
+**Script**: `/usr/local/bin/daily-backup` on PVE host (source: `infra/scripts/daily-backup`)
+**Schedule**: Daily 05:00 via systemd timer
 **Retention**: 4 weekly versions (weeks 0-3 via `--link-dest` hardlink dedup)
 
 #### What Gets Backed Up
@@ -280,7 +280,7 @@ K8s CronJobs run inside the cluster, dumping database/state to NFS-exported back
 - **PostgreSQL** (`pg_dumpall`): Dumps all databases to `/mnt/main/postgresql-backup/`. Command: `pg_dumpall -h pg-cluster-rw.dbaas -U postgres | gzip -9 > backup-$(date +%Y%m%d).sql.gz`. 14-day rotation via `find -mtime +14 -delete`.
 - **MySQL** (`mysqldump`): Dumps all databases. Command: `mysqldump -h mysql-primary.dbaas --all-databases --single-transaction | gzip -9 > backup-$(date +%Y%m%d).sql.gz`. 14-day rotation.
 
 **Weekly backups (Sunday 01:00-04:00)**:
 - **etcd**: `etcdctl snapshot save /mnt/main/etcd-backup/snapshot-$(date +%Y%m%d).db`. 30-day retention. Critical for cluster recovery.
 - **Vaultwarden**: See "Vaultwarden Enhanced Protection" below. 30-day retention.
 - **Vault**: `vault operator raft snapshot save /mnt/main/vault-backup/snapshot-$(date +%Y%m%d).snap`. 30-day retention. 
@@ -308,7 +308,7 @@ This provides both frequent backups (every 6h) AND continuous integrity monitori ### Layer 3: Offsite Sync to Synology NAS **Script**: `/usr/local/bin/offsite-sync-backup` on PVE host (source: `infra/scripts/offsite-sync-backup`) -**Schedule**: Sunday 08:00 via systemd timer (After=weekly-backup.service) +**Schedule**: Daily 06:00 via systemd timer (After=daily-backup.service) Two-step offsite sync: @@ -346,14 +346,14 @@ Two-step offsite sync: | Path | Purpose | |------|---------| | `/usr/local/bin/lvm-pvc-snapshot` | PVE host: LVM snapshot creation + restore | -| `/usr/local/bin/weekly-backup` | PVE host: PVC file copy + auto SQLite backup + pfSense | +| `/usr/local/bin/daily-backup` | PVE host: PVC file copy + auto SQLite backup + pfSense | | `/usr/local/bin/offsite-sync-backup` | PVE host: two-step rsync to Synology (sda + NFS via inotify) | | `/mnt/backup/` | PVE host: sda mount point (1.1TB backup disk) | | `/mnt/backup/.nfs-changes.log` | NFS change log from inotifywait, consumed by offsite-sync | | `/etc/systemd/system/nfs-change-tracker.service` | inotifywait watcher for `/srv/nfs` + `/srv/nfs-ssd` | | `/etc/systemd/system/lvm-pvc-snapshot.timer` | Daily 03:00 (LVM snapshots) | -| `/etc/systemd/system/weekly-backup.timer` | Sunday 05:00 (file backup) | -| `/etc/systemd/system/offsite-sync-backup.timer` | Sunday 08:00 (offsite sync) | +| `/etc/systemd/system/daily-backup.timer` | Daily 05:00 (file backup) | +| `/etc/systemd/system/offsite-sync-backup.timer` | Daily 06:00 (offsite sync) | | `stacks/dbaas/` | Terraform: PostgreSQL/MySQL backup CronJobs | | `stacks/vault/` | Terraform: Vault backup CronJob | | `stacks/vaultwarden/` | Terraform: Vaultwarden backup + integrity CronJobs | @@ -466,8 +466,8 @@ See `docs/runbooks/restore-lvm-snapshot.md`. 
**Diagnosis**: ```bash ssh root@192.168.1.127 -systemctl status weekly-backup.service -journalctl -u weekly-backup.service --since "7 days ago" +systemctl status daily-backup.service +journalctl -u daily-backup.service --since "7 days ago" df -h /mnt/backup ``` @@ -480,7 +480,7 @@ df -h /mnt/backup 1. If disk full: Clean up old weekly versions manually, adjust retention 2. If LV mount failed: `lvchange -ay backup/data && mount /mnt/backup` 3. If NFS failed: Check Proxmox NFS availability (`showmount -e 192.168.1.127`), verify exports -4. Manually trigger: `systemctl start weekly-backup.service` +4. Manually trigger: `systemctl start daily-backup.service` ### Offsite Sync Failing @@ -550,7 +550,7 @@ kubectl exec -n vaultwarden deployment/vaultwarden -- sqlite3 /data/db.sqlite3 " **Diagnosis**: ```bash ssh root@192.168.1.127 -systemctl status weekly-backup.service | grep -A5 pfsense +systemctl status daily-backup.service | grep -A5 pfsense ``` **Common causes**: @@ -683,7 +683,7 @@ module "nfs_backup" { **Metrics sources**: - Backup CronJobs: Push `backup_last_success_timestamp` to Pushgateway on completion - LVM snapshot script: Pushes `lvm_snapshot_last_success_timestamp`, `lvm_snapshot_count`, `lvm_thin_pool_free_percent` -- Weekly backup script: Pushes `backup_weekly_last_success_timestamp`, `backup_disk_usage_percent` +- Daily backup script: Pushes `backup_weekly_last_success_timestamp`, `backup_disk_usage_percent` - Offsite sync script: Pushes `offsite_backup_sync_last_success_timestamp` - ~~CloudSync monitor~~: Removed (TrueNAS decommissioned) - Vaultwarden integrity: Pushes `vaultwarden_sqlite_integrity_ok` hourly diff --git a/docs/runbooks/restore-full-cluster.md b/docs/runbooks/restore-full-cluster.md index 0316514d..56f6adfd 100644 --- a/docs/runbooks/restore-full-cluster.md +++ b/docs/runbooks/restore-full-cluster.md @@ -91,7 +91,7 @@ lvchange -an pve/$LV_NAME **Note on pfSense restore**: If pfSense needs restoration, restore `config.xml` from 
`/mnt/backup/pfsense//config.xml` via web UI, or full filesystem tar for custom scripts. -**Note on PVE config restore**: If custom scripts/timers are lost, restore from `/mnt/backup/pve-config/` (weekly-backup, offsite-sync-backup, lvm-pvc-snapshot scripts + timers). +**Note on PVE config restore**: If custom scripts/timers are lost, restore from `/mnt/backup/pve-config/` (daily-backup, offsite-sync-backup, lvm-pvc-snapshot scripts + timers). ### Phase 4: Vault (secrets foundation) ```bash diff --git a/docs/runbooks/restore-pvc-from-backup.md b/docs/runbooks/restore-pvc-from-backup.md index 62132091..8e99ecb9 100644 --- a/docs/runbooks/restore-pvc-from-backup.md +++ b/docs/runbooks/restore-pvc-from-backup.md @@ -193,10 +193,10 @@ For databases (MySQL, PostgreSQL), prefer the app-level backup restore (see `res | Problem | Cause | Fix | |---------|-------|-----| | "LV is active" during mount | Workload pod still running or stuck | `kubectl get pods -A | grep `, delete pod if stuck | -| "No such file or directory" in backup | PVC not backed up (in excluded namespace) | Check `weekly-backup` script EXCLUDE_NAMESPACES | +| "No such file or directory" in backup | PVC not backed up (in excluded namespace) | Check `daily-backup` script EXCLUDE_NAMESPACES | | rsync shows 0 files transferred | Wrong backup week or PVC name | Double-check paths: `ls /mnt/backup/pvc-data////` | | Pod stuck in ContainerCreating after restore | LV still active on PVE host | `lvchange -an pve/`, wait 30s, check pod again | -| Backup week missing | Weekly backup hasn't run for that week | Check `systemctl status weekly-backup.service`, verify retention | +| Backup week missing | Daily backup hasn't run for that week | Check `systemctl status daily-backup.service`, verify retention | ## Restore from Synology (if PVE host sda is unavailable) diff --git a/scripts/weekly-backup.service b/scripts/daily-backup.service similarity index 73% rename from scripts/weekly-backup.service rename to 
scripts/daily-backup.service index 3321d2c6..a2bf2d85 100644 --- a/scripts/weekly-backup.service +++ b/scripts/daily-backup.service @@ -4,8 +4,8 @@ After=network-online.target [Service] Type=oneshot -ExecStart=/usr/local/bin/weekly-backup +ExecStart=/usr/local/bin/daily-backup StandardOutput=journal StandardError=journal -SyslogIdentifier=weekly-backup +SyslogIdentifier=daily-backup TimeoutStartSec=3600 diff --git a/scripts/weekly-backup.sh b/scripts/daily-backup.sh similarity index 94% rename from scripts/weekly-backup.sh rename to scripts/daily-backup.sh index 0a324c1d..de7508b9 100644 --- a/scripts/weekly-backup.sh +++ b/scripts/daily-backup.sh @@ -1,15 +1,15 @@ #!/usr/bin/env bash -# weekly-backup — 3-2-1 backup: PVC file copy + SQLite + pfsense + PVE config to sda -# Deploy to PVE host at /usr/local/bin/weekly-backup +# daily-backup — 3-2-1 backup: PVC file copy + SQLite + pfsense + PVE config to sda +# Deploy to PVE host at /usr/local/bin/daily-backup # Schedule: Daily 05:00 via systemd timer set -euo pipefail # --- Configuration --- BACKUP_ROOT="/mnt/backup" PVC_MOUNT="/tmp/pvc-mount" -PUSHGATEWAY="${WEEKLY_BACKUP_PUSHGATEWAY:-http://10.0.20.100:30091}" -PUSHGATEWAY_JOB="weekly-backup" -LOCKFILE="/run/weekly-backup.lock" +PUSHGATEWAY="${DAILY_BACKUP_PUSHGATEWAY:-http://10.0.20.100:30091}" +PUSHGATEWAY_JOB="daily-backup" +LOCKFILE="/run/daily-backup.lock" MANIFEST="${BACKUP_ROOT}/.changed-files" MAPPING_CACHE="${BACKUP_ROOT}/.lv-pvc-mapping.json" KUBECONFIG="${KUBECONFIG:-/root/.kube/config}" @@ -34,9 +34,9 @@ fi push_metrics() { local status="${1:-0}" bytes="${2:-0}" cat </dev/null || true -weekly_backup_last_run_timestamp $(date +%s) -weekly_backup_last_status ${status} -weekly_backup_bytes_synced ${bytes} +daily_backup_last_run_timestamp $(date +%s) +daily_backup_last_status ${status} +daily_backup_bytes_synced ${bytes} EOF } @@ -204,7 +204,7 @@ fi log "--- Step 4: PVE host config ---" mkdir -p "${BACKUP_ROOT}/pve-config/scripts" rsync -az --delete 
/etc/pve/ "${BACKUP_ROOT}/pve-config/etc-pve/" 2>&1 || { warn "Failed to sync /etc/pve"; STATUS=1; } -for script in /usr/local/bin/lvm-pvc-snapshot /usr/local/bin/weekly-backup /usr/local/bin/offsite-sync-backup; do +for script in /usr/local/bin/lvm-pvc-snapshot /usr/local/bin/daily-backup /usr/local/bin/offsite-sync-backup; do [ -f "${script}" ] && cp "${script}" "${BACKUP_ROOT}/pve-config/scripts/" 2>/dev/null || true done find "${BACKUP_ROOT}/pve-config" -type f 2>/dev/null | sed "s|^${BACKUP_ROOT}/||" >> "${MANIFEST}" diff --git a/scripts/weekly-backup.timer b/scripts/daily-backup.timer similarity index 100% rename from scripts/weekly-backup.timer rename to scripts/daily-backup.timer diff --git a/scripts/offsite-sync-backup.service b/scripts/offsite-sync-backup.service index e795ba92..d9ecd931 100644 --- a/scripts/offsite-sync-backup.service +++ b/scripts/offsite-sync-backup.service @@ -1,6 +1,6 @@ [Unit] Description=Daily offsite sync: sda + NFS changes to Synology -After=network-online.target weekly-backup.service +After=network-online.target daily-backup.service [Service] Type=oneshot diff --git a/scripts/offsite-sync-backup.sh b/scripts/offsite-sync-backup.sh index 727ba806..9485ff83 100644 --- a/scripts/offsite-sync-backup.sh +++ b/scripts/offsite-sync-backup.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash # offsite-sync-backup — Sync backups to Synology NAS # Deploy to PVE host at /usr/local/bin/offsite-sync-backup -# Schedule: Daily 06:00 via systemd timer (After=weekly-backup.service) +# Schedule: Daily 06:00 via systemd timer (After=daily-backup.service) # # Two sync paths: # Step 1: sda (/mnt/backup) → Synology pve-backup/ (PVC snapshots, pfsense, pve-config, sqlite) diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl index 7572d4d2..86920720 100755 --- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl +++ 
b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
@@ -1282,19 +1282,19 @@ serverFiles:
           summary: "LVM thin pool has only {{ $value }}% free — snapshot overhead may cause pool exhaustion"
 
       # --- 3-2-1 Backup Pipeline Alerts ---
-      - alert: WeeklyBackupStale
-        expr: (time() - weekly_backup_last_run_timestamp{job="weekly-backup"}) > 777600
+      - alert: DailyBackupStale
+        expr: (time() - daily_backup_last_run_timestamp{job="daily-backup"}) > 777600
         for: 30m
         labels:
           severity: warning
         annotations:
-          summary: "Weekly backup is {{ $value | humanizeDuration }} old (threshold: 9d)"
+          summary: "Daily backup is {{ $value | humanizeDuration }} old (threshold: 9d)"
 
-      - alert: WeeklyBackupFailing
-        expr: weekly_backup_last_status{job="weekly-backup"} != 0
+      - alert: DailyBackupFailing
+        expr: daily_backup_last_status{job="daily-backup"} != 0
         for: 0m
         labels:
           severity: warning
         annotations:
-          summary: "Weekly backup completed with errors (status={{ $value }})"
+          summary: "Daily backup completed with errors (status={{ $value }})"
 
       - alert: PfsenseBackupStale
         expr: (time() - backup_last_success_timestamp{job="pfsense-backup"}) > 777600
         for: 30m