scripts: timeout rsync + sqlite calls in daily-backup
Per-PVC rsync had no timeout, so any single hung PVC (e.g. on a corrupted snapshot or a SQLite database held open by a writer) blocked the whole script until systemd's 4h TimeoutStartSec kicked in, leaving every later PVC silently not backed up. Today's run hung on mailserver/roundcubemail-enigma-encrypted at 05:09 and didn't recover — hence the WeeklyBackupFailing alert.

Now:
- rsync per PVC: timeout 30 min, exit 124 logged separately
- sqlite3 per database: timeout 5 min
- /etc/pve rsync: timeout 5 min

Each timed-out PVC bumps PVC_FAIL but the loop keeps moving.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
fddf168ecb
commit
d2be0921e8
1 changed files with 21 additions and 3 deletions
|
|
@ -207,7 +207,15 @@ else
|
|||
dst="${BACKUP_ROOT}/pvc-data/${WEEK}/${ns_pvc}"
|
||||
mkdir -p "${dst}"
|
||||
rsync_rc=0
|
||||
rsync -az --delete \
|
||||
# Per-PVC rsync timeout (30 min). Without this, a single hung
|
||||
# PVC blocks the entire backup until systemd's TimeoutStartSec
|
||||
# kills the script (4h ceiling), leaving every later PVC
|
||||
# unbacked and silently triggering WeeklyBackupFailing. Picked
|
||||
# 30 min because it is well above the largest PVC's normal copy time
|
||||
# (immich-postgres ~10 GiB, ~3 min on local ext4) and well
|
||||
# below the unit-level budget so we still have headroom to
|
||||
# finish the rest.
|
||||
timeout 1800 rsync -az --delete \
|
||||
${PREV:+--link-dest="${PREV}/${ns_pvc}/"} \
|
||||
"${PVC_MOUNT}/" "${dst}/" 2>&1 || rsync_rc=$?
|
||||
if [ "$rsync_rc" -eq 0 ]; then
|
||||
|
|
@ -217,6 +225,12 @@ else
|
|||
# (in-flight writes have corrupt metadata from skipped journal replay)
|
||||
PVC_COUNT=$((PVC_COUNT + 1))
|
||||
log " partial rsync (LUKS noload) for ${ns_pvc} — OK"
|
||||
elif [ "$rsync_rc" -eq 124 ]; then
|
||||
# `timeout` exit 124 = wall-clock killed the rsync. Track
|
||||
# separately so the next run still produces a metric and
|
||||
# doesn't pretend nothing happened.
|
||||
warn "rsync timed out for ${ns_pvc} after 30 min — moving on"
|
||||
PVC_FAIL=$((PVC_FAIL + 1))
|
||||
else
|
||||
warn "rsync failed for ${ns_pvc} (rc=$rsync_rc)"
|
||||
PVC_FAIL=$((PVC_FAIL + 1))
|
||||
|
|
@ -232,7 +246,11 @@ else
|
|||
relpath="${dbfile#${PVC_MOUNT}/}"
|
||||
dest_file="${BACKUP_ROOT}/sqlite-backup/${WEEK}/${ns_pvc}/${relpath}"
|
||||
mkdir -p "$(dirname "${dest_file}")"
|
||||
if sqlite3 "file://${dbfile}?mode=ro" ".backup '${dest_file}'" 2>/dev/null; then
|
||||
# 5-min sqlite timeout — same hang-prevention idea
|
||||
# as rsync above. A corrupted SQLite database or one held
|
||||
# open by a writer in the snapshot can otherwise
|
||||
# block .backup indefinitely.
|
||||
if timeout 300 sqlite3 "file://${dbfile}?mode=ro" ".backup '${dest_file}'" 2>/dev/null; then
|
||||
log " SQLite: ${ns_pvc}/${relpath}"
|
||||
else
|
||||
cp "${dbfile}" "${dest_file}" 2>/dev/null || true
|
||||
|
|
@ -326,7 +344,7 @@ fi
|
|||
# ============================================================
|
||||
log "--- Step 4: PVE host config ---"
|
||||
mkdir -p "${BACKUP_ROOT}/pve-config/scripts"
|
||||
rsync -az --delete /etc/pve/ "${BACKUP_ROOT}/pve-config/etc-pve/" 2>&1 || { warn "Failed to sync /etc/pve"; STATUS=1; }
|
||||
timeout 300 rsync -az --delete /etc/pve/ "${BACKUP_ROOT}/pve-config/etc-pve/" 2>&1 || { warn "Failed to sync /etc/pve"; STATUS=1; }
|
||||
for script in /usr/local/bin/lvm-pvc-snapshot /usr/local/bin/daily-backup /usr/local/bin/offsite-sync-backup; do
|
||||
[ -f "${script}" ] && cp "${script}" "${BACKUP_ROOT}/pve-config/scripts/" 2>/dev/null || true
|
||||
done
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue