diff --git a/docs/architecture/backup-dr.md b/docs/architecture/backup-dr.md index 9fa30480..84ae792c 100644 --- a/docs/architecture/backup-dr.md +++ b/docs/architecture/backup-dr.md @@ -333,7 +333,9 @@ Two-step offsite sync: **Change tracking**: `nfs-change-tracker.service` (systemd, inotifywait) on PVE host watches `/srv/nfs` and `/srv/nfs-ssd` continuously. Changed file paths are logged to `/mnt/backup/.nfs-changes.log`. The offsite sync reads this log and transfers only changed files. Incremental syncs complete in seconds instead of 30+ minutes. **Monthly full sync**: On 1st Sunday of month, runs `rsync --delete` for cleanup (removes orphaned files on Synology). -**Path exclusions**: `/srv/nfs/anca-elements/` (~770G) is excluded from both layers — it is itself a downstream replica of `Synology:/volume1/Backup/Anca/Elements` (synced in via `anca-elements-sync.sh`), so backing it up to Synology would be a self-duplicate. The exclusion lives in `nfs-change-tracker.service` (inotify `--exclude` regex) and `offsite-sync-backup` (rsync `--exclude` on full sync + `grep -v` on the incremental files-from list). +**Path exclusions**: `/srv/nfs/anca-elements/` (~770G) is excluded from both layers. From 2026-05-24 onward `/srv/nfs/anca-elements` is the source of truth for this archive — the Synology copy at `/volume1/Backup/Anca/Elements` was deleted (it had been the upstream source, but anca-elements-sync.sh's role inverted: PVE now writes, Synology no longer holds it). Single-disk-failure protection is provided by a SEPARATE local mirror on sda (`anca-elements-mirror.{service,timer}`, weekly Mon 04:00) — not by Synology. The Synology exclusion lives in `nfs-change-tracker.service` (inotify `--exclude` regex) and `offsite-sync-backup` (rsync `--exclude` on full sync + `grep -v` on the incremental files-from list). + +**Layer 3a: anca-elements local mirror (sda)**: `/usr/local/bin/anca-elements-mirror` rsyncs `/srv/nfs/anca-elements/` → `/mnt/backup/anca-elements/` weekly. 
`rsync -rlt --delete -H --no-perms --no-owner --no-group`. Idempotent; subsequent runs only transfer changes. Pushes `anca_elements_mirror_last_run_timestamp` + `anca_elements_mirror_last_status` to Pushgateway. No offsite copy — by design; the archive is single-disk-failure tolerant only. **Destination**: - `Synology/Backup/Viki/nfs/` — mirrors `/srv/nfs` @@ -360,6 +362,8 @@ Two-step offsite sync: | `/etc/systemd/system/lvm-pvc-snapshot.timer` | Daily 03:00 (LVM snapshots) | | `/etc/systemd/system/daily-backup.timer` | Daily 05:00 (file backup) | | `/etc/systemd/system/offsite-sync-backup.timer` | Daily 06:00 (offsite sync) | +| `/usr/local/bin/anca-elements-mirror` | PVE host: weekly mirror of /srv/nfs/anca-elements → sda /mnt/backup/anca-elements | +| `/etc/systemd/system/anca-elements-mirror.timer` | Weekly Mon 04:00 (anca-elements mirror) | | `stacks/dbaas/` | Terraform: PostgreSQL/MySQL backup CronJobs | | `stacks/vault/` | Terraform: Vault backup CronJob | | `stacks/vaultwarden/` | Terraform: Vaultwarden backup + integrity CronJobs | diff --git a/scripts/anca-elements-mirror.service b/scripts/anca-elements-mirror.service new file mode 100644 index 00000000..db1bf270 --- /dev/null +++ b/scripts/anca-elements-mirror.service @@ -0,0 +1,15 @@ +[Unit] +Description=Mirror /srv/nfs/anca-elements to /mnt/backup (single-disk-failure protection) +After=network-online.target local-fs.target +Wants=network-online.target + +[Service] +Type=oneshot +ExecStart=/usr/local/bin/anca-elements-mirror +StandardOutput=journal +StandardError=journal +SyslogIdentifier=anca-elements-mirror +# Big sustained IO — don't compete with foreground services. 
+Nice=10
+IOSchedulingClass=idle
+TimeoutStartSec=18000
diff --git a/scripts/anca-elements-mirror.sh b/scripts/anca-elements-mirror.sh
new file mode 100644
index 00000000..4ce61ca2
--- /dev/null
+++ b/scripts/anca-elements-mirror.sh
@@ -0,0 +1,117 @@
+#!/usr/bin/env bash
+# anca-elements-mirror — single-disk-failure mirror of /srv/nfs/anca-elements → /mnt/backup
+#
+# Deploy to PVE host at /usr/local/bin/anca-elements-mirror.
+# Schedule: weekly Mon 04:00 via systemd timer (anca-elements-mirror.timer).
+#
+# WHY: /srv/nfs/anca-elements lives on the sdc thin pool. Synology no longer
+# holds the original (deleted after this mirror was verified). sda /mnt/backup
+# is the only other local disk with room (~770G) — this gives us a single-
+# disk-failure copy. No offsite for this archive (intentional, see backup-dr.md).
+#
+# Idempotent: `rsync -rlt -H --delete` makes destination content match source
+# (permissions/ownership are deliberately not copied). Re-runs only transfer
+# changed files.
+#
+# Environment:
+#   ANCA_MIRROR_PUSHGATEWAY  Pushgateway base URL (default http://10.0.20.100:30091)
+#
+# Exit status: 0 on success; 1 on a failed precondition (lock held, backup disk
+# not mounted, source missing or empty); rsync's exit code if rsync fails;
+# 143 when interrupted by TERM/INT.
+
+set -euo pipefail
+
+SRC=/srv/nfs/anca-elements
+DST=/mnt/backup/anca-elements
+LOG=/var/log/anca-elements-mirror.log
+LOCKFILE=/run/anca-elements-mirror.lock
+PUSHGATEWAY="${ANCA_MIRROR_PUSHGATEWAY:-http://10.0.20.100:30091}"
+PUSHGATEWAY_JOB=anca-elements-mirror
+
+# log MESSAGE... — timestamped (UTC) line to stdout and appended to $LOG.
+log() { printf '[%s] %s\n' "$(date -u '+%Y-%m-%dT%H:%M:%SZ')" "$*" | tee -a "$LOG"; }
+# warn MESSAGE... — same as log, prefixed with "WARN: ".
+warn() { log "WARN: $*"; }
+
+# push_metrics [STATUS] [BYTES] — best-effort push of run metrics to Pushgateway.
+#   STATUS  0 = ok, 1 = failed, 2 = aborted by signal (default 0)
+#   BYTES   size of the mirror in bytes (default 0)
+# Never fails the script: a down or unreachable Pushgateway is ignored.
+push_metrics() {
+  local status="${1:-0}" bytes="${2:-0}"
+  cat <<EOF | curl -s --max-time 10 --data-binary @- "${PUSHGATEWAY}/metrics/job/${PUSHGATEWAY_JOB}" >/dev/null || true
+anca_elements_mirror_last_run_timestamp $(date +%s)
+anca_elements_mirror_last_status ${status}
+anca_elements_mirror_bytes ${bytes}
+EOF
+}
+
+KILLED=""
+LOCK_HELD=""
+
+# EXIT handler: release the lock only if this process took it (otherwise a
+# second invocation that bailed out would delete the running instance's lock),
+# and report an aborted run when we were interrupted by a signal.
+cleanup() {
+  if [ -n "$LOCK_HELD" ]; then
+    rm -f "$LOCKFILE"
+  fi
+  if [ -n "$KILLED" ]; then
+    push_metrics 2 0 # status=2 → aborted (matches lvm-pvc-snapshot convention)
+  fi
+}
+trap cleanup EXIT
+trap 'KILLED=1; exit 143' TERM INT
+
+# noclobber makes the redirect fail if the lock file already exists, so
+# creating the file and taking the lock is one atomic step.
+if ! ( set -o noclobber; echo $$ > "$LOCKFILE" ) 2>/dev/null; then
+  log "FATAL: another instance running (pid $(cat "$LOCKFILE" 2>/dev/null || echo unknown))"
+  exit 1
+fi
+LOCK_HELD=1
+
+mountpoint -q /mnt/backup || { log "FATAL: /mnt/backup not mounted"; push_metrics 1 0; exit 1; }
+[ -d "$SRC" ] || { log "FATAL: source $SRC missing"; push_metrics 1 0; exit 1; }
+# An existing-but-empty source (e.g. the underlying volume is not mounted)
+# combined with --delete would erase the only other copy; refuse to run.
+if [ -z "$(find "$SRC" -mindepth 1 -maxdepth 1 -print -quit 2>/dev/null)" ]; then
+  log "FATAL: source $SRC is empty or unreadable; refusing to mirror with --delete"
+  push_metrics 1 0
+  exit 1
+fi
+
+mkdir -p "$DST"
+
+log "=== mirror starting: $SRC → $DST ==="
+# Size figures are informational only; a du error (unreadable or vanishing
+# entry) must not abort the run under `set -e` / pipefail.
+SRC_SIZE_GB=$(du -sBG "$SRC" 2>/dev/null | awk '{print $1}') || true
+log "source size: ${SRC_SIZE_GB:-unknown}"
+
+# -H preserves hardlinks (probably none here, cheap insurance).
+# --info=stats2 emits a final transfer summary into the log.
+# --no-perms / --no-owner / --no-group: source has root:www-data 2775 and
+# we don't need to perfectly preserve those on the mirror copy — dest will
+# inherit /mnt/backup's defaults. (Symmetric with anca-elements-sync.sh's
+# choice when copying FROM Synology.)
+# pipefail makes the pipeline fail when rsync does; PIPESTATUS[0] then holds
+# rsync's own exit code rather than tee's.
+RSYNC_RC=0
+rsync \
+  -rlt --delete -H \
+  --no-perms --no-owner --no-group \
+  --info=stats2 \
+  "$SRC/" "$DST/" 2>&1 | tee -a "$LOG" || RSYNC_RC=${PIPESTATUS[0]}
+
+DST_BYTES=$(du -sb "$DST" 2>/dev/null | awk '{print $1}') || true
+
+if [ "$RSYNC_RC" -eq 0 ]; then
+  log "=== mirror complete; dest size: $(du -sh "$DST" | cut -f1) ==="
+  push_metrics 0 "$DST_BYTES"
+else
+  log "=== mirror failed: rsync exited $RSYNC_RC ==="
+  push_metrics 1 "$DST_BYTES"
+  exit "$RSYNC_RC"
+fi
diff --git a/scripts/anca-elements-mirror.timer b/scripts/anca-elements-mirror.timer
new file mode 100644
index 00000000..642a7773
--- /dev/null
+++ b/scripts/anca-elements-mirror.timer
@@ -0,0 +1,10 @@
+[Unit]
+Description=Weekly anca-elements mirror to /mnt/backup
+
+[Timer]
+OnCalendar=Mon *-*-* 04:00:00
+Persistent=true
+RandomizedDelaySec=15min
+
+[Install]
+WantedBy=timers.target