diff --git a/scripts/nfs-mirror.sh b/scripts/nfs-mirror.sh index 882a8a9c..3e293c03 100644 --- a/scripts/nfs-mirror.sh +++ b/scripts/nfs-mirror.sh @@ -93,6 +93,16 @@ EXCLUDES=( --exclude='*@synoeastream' --exclude='/.DS_Store' --exclude='/Thumbs.db' + + # ---- transient SQLite sidecars (WAL + rollback journal) ---- + # Created/checkpointed/deleted constantly, so they vanish mid-rsync and trip + # exit code 24 (root cause of NfsMirrorFailing on calibre-web-automated's + # queue.db, 2026-05/06). They must NEVER be in a raw mirror anyway: a -wal/-shm/-journal + # without an atomic .db snapshot is useless to restore from. Consistent SQLite + # copies are made separately by daily-backup (SQLite backup API). + --exclude='*-wal' + --exclude='*-shm' + --exclude='*-journal' ) log() { echo "[$(date -u '+%Y-%m-%dT%H:%M:%SZ')] $*" | tee -a "$LOG"; } @@ -155,7 +165,12 @@ rsync \ DST_BYTES=$(df -B1 --output=used /mnt/backup | tail -1) -if [ "$RSYNC_RC" -eq 0 ]; then +# rsync exit 24 = "some source files vanished before transfer" — benign for a +# backup mirror: everything else copied; the vanished files are transient (e.g. +# SQLite WAL/SHM, now mostly caught by the excludes above). Treat as success so +# the offsite manifest still updates and NfsMirrorFailing doesn't false-fire. +if [ "$RSYNC_RC" -eq 0 ] || [ "$RSYNC_RC" -eq 24 ]; then + [ "$RSYNC_RC" -eq 24 ] && warn "rsync exited 24 (source files vanished mid-transfer) — treating as success" # Capture files that rsync created/modified and feed them to the offsite-sync # manifest so daily Step 1 incremental picks them up tomorrow morning. 
# Use -cnewer (ctime), not -newer (mtime): rsync -t preserves SOURCE mtime diff --git a/stacks/chrome-service/main.tf b/stacks/chrome-service/main.tf index 30210808..d0db5c97 100644 --- a/stacks/chrome-service/main.tf +++ b/stacks/chrome-service/main.tf @@ -445,6 +445,10 @@ resource "kubernetes_deployment" "chrome_service" { # clobber to the novnc image stick (chromium-not-found crashloop 2026-06-16) # because TF could not revert the ignored field. Removed so TF re-asserts the # pinned image. Keel is inert (keel.sh/policy=never) and no deploy step touches these. + # NOTE: the LIVE pod's container order had drifted to [novnc, chrome-service, + # snapshot] vs this file's [chrome-service, novnc, snapshot]; a TF apply reorders + # them to match here (harmless), so `containers[0]` differs between live and TF + # until the next apply lands — don't be alarmed reading it back mid-reconcile. spec[0].template[0].spec[0].init_container[0].image, metadata[0].annotations["kubernetes.io/change-cause"], metadata[0].annotations["deployment.kubernetes.io/revision"], diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl index a86c832f..4ca6667c 100755 --- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl +++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl @@ -2840,6 +2840,7 @@ serverFiles: annotations: summary: "MAM ratio is {{ $value | printf \"%.2f\" }} for 24h (target: >= 1.0)" - alert: MAMFarmingStuck + # Metric source: stacks/servarr/mam-farming/files/freeleech-grabber.py # Heartbeat-based: fires only when the grabber CronJob has not COMPLETED # a run in >4h (the original failure mode: Forbid-blocked / wedged in # ContainerCreating). The grabber heartbeats mam_grabber_last_run_timestamp