scripts: timeout rsync + sqlite calls in daily-backup
Per-PVC rsync had no timeout, so any single hung PVC (e.g. on a corrupted snapshot or a sqlite held open by a writer) blocked the whole script until systemd's 4h TimeoutStartSec kicked in, leaving every later PVC silently unbacked. Today's run hung on mailserver/roundcubemail-enigma-encrypted at 05:09 and didn't recover — hence WeeklyBackupFailing alert. Now: - rsync per PVC: timeout 30 min, exit 124 logged separately - sqlite3 per database: timeout 5 min - /etc/pve rsync: timeout 5 min Each timed-out PVC bumps PVC_FAIL but the loop keeps moving. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
fddf168ecb
commit
d2be0921e8
1 changed files with 21 additions and 3 deletions
|
|
@ -207,7 +207,15 @@ else
|
||||||
dst="${BACKUP_ROOT}/pvc-data/${WEEK}/${ns_pvc}"
|
dst="${BACKUP_ROOT}/pvc-data/${WEEK}/${ns_pvc}"
|
||||||
mkdir -p "${dst}"
|
mkdir -p "${dst}"
|
||||||
rsync_rc=0
|
rsync_rc=0
|
||||||
rsync -az --delete \
|
# Per-PVC rsync timeout (30 min). Without this, a single hung
|
||||||
|
# PVC blocks the entire backup until systemd's TimeoutStartSec
|
||||||
|
# kills the script (4h ceiling), leaving every later PVC
|
||||||
|
# unbacked and silently triggering WeeklyBackupFailing. Picked
|
||||||
|
# 30 min as well above the largest PVC's normal copy time
|
||||||
|
# (immich-postgres ~10 GiB, ~3 min on local ext4) and well
|
||||||
|
# below the unit-level budget so we still have headroom to
|
||||||
|
# finish the rest.
|
||||||
|
timeout 1800 rsync -az --delete \
|
||||||
${PREV:+--link-dest="${PREV}/${ns_pvc}/"} \
|
${PREV:+--link-dest="${PREV}/${ns_pvc}/"} \
|
||||||
"${PVC_MOUNT}/" "${dst}/" 2>&1 || rsync_rc=$?
|
"${PVC_MOUNT}/" "${dst}/" 2>&1 || rsync_rc=$?
|
||||||
if [ "$rsync_rc" -eq 0 ]; then
|
if [ "$rsync_rc" -eq 0 ]; then
|
||||||
|
|
@ -217,6 +225,12 @@ else
|
||||||
# (in-flight writes have corrupt metadata from skipped journal replay)
|
# (in-flight writes have corrupt metadata from skipped journal replay)
|
||||||
PVC_COUNT=$((PVC_COUNT + 1))
|
PVC_COUNT=$((PVC_COUNT + 1))
|
||||||
log " partial rsync (LUKS noload) for ${ns_pvc} — OK"
|
log " partial rsync (LUKS noload) for ${ns_pvc} — OK"
|
||||||
|
elif [ "$rsync_rc" -eq 124 ]; then
|
||||||
|
# `timeout` exit 124 = wall-clock killed the rsync. Track
|
||||||
|
# separately so the next run still produces a metric and
|
||||||
|
# doesn't pretend nothing happened.
|
||||||
|
warn "rsync timed out for ${ns_pvc} after 30 min — moving on"
|
||||||
|
PVC_FAIL=$((PVC_FAIL + 1))
|
||||||
else
|
else
|
||||||
warn "rsync failed for ${ns_pvc} (rc=$rsync_rc)"
|
warn "rsync failed for ${ns_pvc} (rc=$rsync_rc)"
|
||||||
PVC_FAIL=$((PVC_FAIL + 1))
|
PVC_FAIL=$((PVC_FAIL + 1))
|
||||||
|
|
@ -232,7 +246,11 @@ else
|
||||||
relpath="${dbfile#${PVC_MOUNT}/}"
|
relpath="${dbfile#${PVC_MOUNT}/}"
|
||||||
dest_file="${BACKUP_ROOT}/sqlite-backup/${WEEK}/${ns_pvc}/${relpath}"
|
dest_file="${BACKUP_ROOT}/sqlite-backup/${WEEK}/${ns_pvc}/${relpath}"
|
||||||
mkdir -p "$(dirname "${dest_file}")"
|
mkdir -p "$(dirname "${dest_file}")"
|
||||||
if sqlite3 "file://${dbfile}?mode=ro" ".backup '${dest_file}'" 2>/dev/null; then
|
# 5-min sqlite timeout — same hang-prevention idea
|
||||||
|
# as rsync above. A corrupted SQLite or one held
|
||||||
|
# open by a writer in the snapshot can otherwise
|
||||||
|
# block .backup indefinitely.
|
||||||
|
if timeout 300 sqlite3 "file://${dbfile}?mode=ro" ".backup '${dest_file}'" 2>/dev/null; then
|
||||||
log " SQLite: ${ns_pvc}/${relpath}"
|
log " SQLite: ${ns_pvc}/${relpath}"
|
||||||
else
|
else
|
||||||
cp "${dbfile}" "${dest_file}" 2>/dev/null || true
|
cp "${dbfile}" "${dest_file}" 2>/dev/null || true
|
||||||
|
|
@ -326,7 +344,7 @@ fi
|
||||||
# ============================================================
|
# ============================================================
|
||||||
log "--- Step 4: PVE host config ---"
|
log "--- Step 4: PVE host config ---"
|
||||||
mkdir -p "${BACKUP_ROOT}/pve-config/scripts"
|
mkdir -p "${BACKUP_ROOT}/pve-config/scripts"
|
||||||
rsync -az --delete /etc/pve/ "${BACKUP_ROOT}/pve-config/etc-pve/" 2>&1 || { warn "Failed to sync /etc/pve"; STATUS=1; }
|
timeout 300 rsync -az --delete /etc/pve/ "${BACKUP_ROOT}/pve-config/etc-pve/" 2>&1 || { warn "Failed to sync /etc/pve"; STATUS=1; }
|
||||||
for script in /usr/local/bin/lvm-pvc-snapshot /usr/local/bin/daily-backup /usr/local/bin/offsite-sync-backup; do
|
for script in /usr/local/bin/lvm-pvc-snapshot /usr/local/bin/daily-backup /usr/local/bin/offsite-sync-backup; do
|
||||||
[ -f "${script}" ] && cp "${script}" "${BACKUP_ROOT}/pve-config/scripts/" 2>/dev/null || true
|
[ -f "${script}" ] && cp "${script}" "${BACKUP_ROOT}/pve-config/scripts/" 2>/dev/null || true
|
||||||
done
|
done
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue