backup: fix daily-backup silent failures, postiz pg_dump CronJob, doc reconcile
daily-backup ran out of its 1h budget and was SIGTERMed for 10 days straight (Apr 30 → May 9). Each failed run left its snapshot mount stacked on /tmp/pvc-mount, which blocked the next run from completing — root cause of the WeeklyBackupStale alert going silent (the metric never reached its end-of-script push).

Fixes:
- TimeoutStartSec 1h → 4h (current workload of 118 PVCs needs ~1.5h, was hitting the wall during week 18 runs)
- Recursive umount + LUKS cleanup on EXIT trap, plus the same at script start as belt-and-braces for any inherited stuck state from a prior crashed run
- TERM/INT trap pushes status=2 metric so WeeklyBackupFailing fires instead of the alert going blind on systemd kills
- pfsense metric pushed in BOTH success and failure paths (was only on success; any ssh-to-pfsense outage made PfsenseBackupStale silent until the alert threshold expired)

Postiz backup CronJob: bundled bitnami PG/Redis live on local-path (K8s node OS disk) — outside Layer 1+2 of the 3-2-1 pipeline. Added postiz-postgres-backup that pg_dumps postiz + temporal + temporal_visibility daily 03:00 to /srv/nfs/postiz-backup, getting Layer 3 offsite coverage. Verified end-to-end: 3 dumps written, Pushgateway metric received. Note: bitnamilegacy/postgresql image is stripped (no curl/wget/python) — switched to docker.io/library/postgres matching the dbaas/postgresql-backup pattern with apt-installed curl.

Doc reconcile (backup-dr.md): metric names had drifted (e.g. the docs claimed backup_weekly_last_success_timestamp but the script pushes daily_backup_last_run_timestamp). Updated to match what's actually emitted, and added a "default-covered" footnote to the Service Protection Matrix so the ~40 services with PVCs not enumerated in the table are no longer ambiguous.

Manual PVE-host actions (out-of-band, not in TF):
- unmounted 6 stacked snapshots from /tmp/pvc-mount
- pruned 5 stale snapshots on vm-9999-pvc-67c90b6b... (origin LV that the loop got SIGTERMed against repeatedly, so prune kept failing)
- created /srv/nfs/postiz-backup directory
- triggered a one-shot daily-backup run with the new TimeoutStartSec to validate the fix end-to-end

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
8c619278d3
commit
0d8e0ca6fc
4 changed files with 174 additions and 12 deletions
|
|
@ -8,4 +8,7 @@ ExecStart=/usr/local/bin/daily-backup
|
|||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
SyslogIdentifier=daily-backup
|
||||
TimeoutStartSec=3600
|
||||
# 4h budget — the snapshot mount + LUKS decrypt + rsync + sqlite scan loop
|
||||
# scales with the number of PVCs (118 today). Hit the 1h ceiling around week
|
||||
# 18 of 2026 and silently SIGTERM'd for 10 days. Bumped to 4h with margin.
|
||||
TimeoutStartSec=14400
|
||||
|
|
|
|||
|
|
@ -21,15 +21,48 @@ warn() { log "WARN: $*" >&2; }
|
|||
# die MESSAGE...
# Fatal-error exit path: writes "FATAL: <message>" through log (redirected to
# stderr), reports the run via push_metrics with status=1 and bytes=0, then
# terminates the script with exit code 1.
die() {
  log "FATAL: $*" >&2
  push_metrics 1 0
  exit 1
}
|
||||
|
||||
# --- Locking ---
|
||||
# Track whether we got SIGTERM/SIGINT so cleanup can push a non-success metric.
|
||||
# Without this, a systemd timeout-kill leaves WeeklyBackupFailing alerts blind:
|
||||
# the script never reaches the success push at the end and the metric goes stale
|
||||
# silently. (Root cause of 2026-04-30 → 2026-05-09 silent-failure run.)
|
||||
KILLED=""
|
||||
|
||||
# cleanup — EXIT-trap handler that tears down the state left by a run.
#
# Globals read: LOCKFILE, PVC_MOUNT, KILLED, TOTAL_BYTES
# Calls:        push_metrics (only when KILLED is non-empty)
#
# Teardown (umount, LUKS close, lockfile removal) is skipped when LOCKFILE
# names a different PID that is still alive. The EXIT trap is installed before
# the lock is acquired, so the "Another instance is running" die path also
# lands here; without this guard it would unmount the running instance's
# snapshots, close its LUKS mappers and delete its lock out from under it.
# A lock left behind by a dead PID is still cleared, so a crashed run cannot
# wedge every later run.
cleanup() {
  local holder m foreign_live=""

  holder="$(cat "${LOCKFILE}" 2>/dev/null || true)"
  case "${holder}" in
    '' | *[!0-9]* | "$$")
      # No lock, unparseable content, or our own PID: safe to tear down.
      ;;
    *)
      # Signal 0 only probes for existence; nothing is delivered.
      if kill -0 "${holder}" 2>/dev/null; then
        foreign_live=1
      fi
      ;;
  esac

  if [ -z "${foreign_live}" ]; then
    # Peel off every mount stacked on PVC_MOUNT: previously SIGTERM'd runs left
    # snapshot mounts piled up here. Fall back to a lazy umount when the
    # regular one fails, and give up if even that fails so we never spin.
    while mountpoint -q "${PVC_MOUNT}" 2>/dev/null; do
      umount "${PVC_MOUNT}" 2>/dev/null || umount -l "${PVC_MOUNT}" 2>/dev/null || break
    done

    # Close any LUKS mappers we opened (or that were left over from a prior crash).
    for m in /dev/mapper/pvc-snap-*; do
      [ -e "$m" ] || continue   # unmatched glob stays literal; skip it
      cryptsetup close "$(basename "$m")" 2>/dev/null || true
    done

    rm -f "${LOCKFILE}"
  fi

  if [ -n "${KILLED:-}" ]; then
    # status=2 = aborted (matches lvm-pvc-snapshot's convention)
    push_metrics 2 "${TOTAL_BYTES:-0}"
  fi
}
|
||||
# Run cleanup on every exit path, including the signal-driven exits below.
trap cleanup EXIT
# On a termination signal, flag the abort for cleanup (it checks KILLED) and
# leave with the conventional 128+signal status: 143 for SIGTERM (15) and
# 130 for SIGINT (2). Previously both signals exited 143, which made a Ctrl-C
# indistinguishable from a systemd timeout kill.
trap 'KILLED=1; exit 143' TERM
trap 'KILLED=1; exit 130' INT

# Take the lock atomically: with noclobber set, the redirect fails if LOCKFILE
# already exists. The subshell keeps noclobber from leaking into the rest of
# the script; $$ still expands to the main shell's PID inside it.
if ! ( set -o noclobber; echo $$ > "${LOCKFILE}" ) 2>/dev/null; then
  die "Another instance is running (PID $(cat "${LOCKFILE}" 2>/dev/null || echo unknown))"
fi
|
||||
|
||||
# Startup sweep (belt-and-braces): a previous run that was SIGTERM'd before
# its trap completed may have left mounts stacked on PVC_MOUNT and stale
# pvc-snap-* LUKS mappers behind. This runs after the lock check above, so
# the original author's reasoning is that no other instance can be using them.
while mountpoint -q "${PVC_MOUNT}" 2>/dev/null; do
  if ! umount "${PVC_MOUNT}" 2>/dev/null; then
    # Regular umount refused; try a lazy detach, and stop if that fails too
    # rather than looping forever on a mount we cannot remove.
    umount -l "${PVC_MOUNT}" 2>/dev/null || break
  fi
done
for m in /dev/mapper/pvc-snap-*; do
  # An unmatched glob stays as the literal pattern; only act on real entries.
  if [ -e "$m" ]; then
    cryptsetup close "${m##*/}" 2>/dev/null || true
  fi
done
|
||||
|
||||
# --- Metrics ---
|
||||
push_metrics() {
|
||||
local status="${1:-0}" bytes="${2:-0}"
|
||||
|
|
@ -243,6 +276,7 @@ fi
|
|||
log "--- Step 3: pfsense backup ---"
|
||||
PFSENSE_DEST="${BACKUP_ROOT}/pfsense"
|
||||
DATE=$(date +%Y%m%d)
|
||||
PFSENSE_STATUS=0
|
||||
mkdir -p "${PFSENSE_DEST}"
|
||||
|
||||
if timeout 10 ssh -o BatchMode=yes -o ConnectTimeout=5 root@10.0.20.1 true 2>/dev/null; then
|
||||
|
|
@ -253,6 +287,7 @@ if timeout 10 ssh -o BatchMode=yes -o ConnectTimeout=5 root@10.0.20.1 true 2>/de
|
|||
else
|
||||
warn "Failed to copy pfsense config.xml"
|
||||
STATUS=1
|
||||
PFSENSE_STATUS=1
|
||||
fi
|
||||
|
||||
# Full filesystem tar
|
||||
|
|
@ -264,21 +299,28 @@ if timeout 10 ssh -o BatchMode=yes -o ConnectTimeout=5 root@10.0.20.1 true 2>/de
|
|||
else
|
||||
warn "Failed to tar pfsense filesystem"
|
||||
STATUS=1
|
||||
PFSENSE_STATUS=1
|
||||
fi
|
||||
|
||||
# Retention: keep 4 weekly copies
|
||||
ls -t "${PFSENSE_DEST}"/config-*.xml 2>/dev/null | tail -n +5 | xargs rm -f 2>/dev/null || true
|
||||
ls -t "${PFSENSE_DEST}"/pfsense-full-*.tar.gz 2>/dev/null | tail -n +5 | xargs rm -f 2>/dev/null || true
|
||||
|
||||
# Push pfsense-specific metric
|
||||
echo "backup_last_success_timestamp $(date +%s)" | \
|
||||
curl -s --connect-timeout 5 --max-time 10 --data-binary @- \
|
||||
"${PUSHGATEWAY}/metrics/job/pfsense-backup" 2>/dev/null || true
|
||||
else
|
||||
warn "Cannot SSH to pfsense (10.0.20.1) — skipping"
|
||||
STATUS=1
|
||||
PFSENSE_STATUS=1
|
||||
fi
|
||||
|
||||
# Report the pfsense step to the Pushgateway whether it succeeded or failed,
# so PfsenseBackupStale + PfsenseBackupFailing alerts can fire instead of going
# silent when ssh-to-pfsense is broken. The run timestamp and status are always
# sent; the success timestamp is only sent when PFSENSE_STATUS is 0. The push
# is best-effort: curl errors are discarded and never fail the script.
{
  echo "backup_last_run_timestamp $(date +%s)"
  echo "backup_last_status ${PFSENSE_STATUS}"
  if [ "${PFSENSE_STATUS}" -eq 0 ]; then
    echo "backup_last_success_timestamp $(date +%s)"
  fi
} | curl -s --connect-timeout 5 --max-time 10 --data-binary @- \
  "${PUSHGATEWAY}/metrics/job/pfsense-backup" 2>/dev/null || true
|
||||
|
||||
# ============================================================
|
||||
# STEP 4: PVE host config backup
|
||||
# ============================================================
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue