t3-safe-restart: extract shared safe-restart library from t3-autoupdate

Pull the per-unit backup->restart->verify->recover routine (and the small helpers it needs) out of t3-autoupdate.sh into a sourced library, so a second job (the upcoming idle migrator) can reuse the exact same audited recovery path instead of forking safety-critical code. safe_restart_unit returns non-zero on failure (after recovery+freeze) rather than exiting, so callers control flow. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-21 12:27:37 +00:00 · 2026-06-21 12:27:37 +00:00 · 2ab5b94748
commit 2ab5b94748
parent 0cebeeb0ee
1 changed files with 96 additions and 0 deletions
--- a/scripts/t3-safe-restart.sh
+++ b/scripts/t3-safe-restart.sh
@ -0,0 +1,96 @@
 #!/usr/bin/env bash
 # t3-safe-restart.sh — SOURCED library (not executed). Shared by t3-autoupdate.sh
 # (daily gated tracker) and t3-migrate-idle.sh (overnight deferral drainer).
 #
 # Holds the per-unit "dangerous" routine — backup -> restart -> verify pairing ->
 # recover (restore DB + roll global binary back to last-good + freeze) — extracted
 # verbatim from t3-autoupdate.sh step 6, plus the small helpers it depends on.
 # The only change from the inline original: safe_restart_unit RETURNS non-zero on
 # failure (after performing recovery+freeze) instead of `exit 1`, so the CALLER
 # decides what to do (the daily job exits; the idle job stops draining).
 #
 # Callers must set, before calling safe_restart_unit: $target (version being moved
 # TO, for log lines + the prebump filename) and $last_good (rollback target).
 # Set $LOG_TAG before sourcing to tag syslog ("t3-autoupdate" / "t3-migrate-idle").
 # ---- shared config defaults (honour the original T3_* override names) -----------
 : "${LOG_TAG:=t3-safe-restart}"
 : "${FREEZE_FILE:=${T3_FREEZE_FILE:-/etc/t3-autoupdate.freeze}}"
 : "${STATE_DIR:=${T3_STATE_DIR:-/var/lib/t3-autoupdate}}"
 : "${LAST_GOOD_FILE:=$STATE_DIR/last-good}"
 : "${DEFER_DIR:=$STATE_DIR/deferred}"
 : "${BACKUP_DIR:=${T3_BACKUP_DEST:-/var/backups/t3-state}}"
 : "${DISPATCH:=${T3_DISPATCH:-127.0.0.1:3780}}"
 : "${USER_MAP:=${T3_USER_MAP:-/etc/ttyd-user-map}}"
 : "${T3_BACKUP_TIMEOUT:=900}"
 LOG() { logger -t "$LOG_TAG" "$*"; echo "$LOG_TAG: $*"; }
 ver() { t3 --version 2>/dev/null | awk '{print $NF}' | sed 's/^v//'; }
 # OS users owning a ~/.t3 (RHS of each non-comment "authentik=os_user" map line).
 osusers() { awk -F= '!/^[[:space:]]*#/&&NF==2{gsub(/[[:space:]]/,"",$2);print $2}' "$USER_MAP" 2>/dev/null | sort -u; }
 # authentik username for an OS user (reverse map; first match) — for dispatch verify.
 ak_for() { awk -F= -v u="$1" '!/^[[:space:]]*#/&&NF==2{gsub(/[[:space:]]/,"",$1);gsub(/[[:space:]]/,"",$2);if($2==u){print $1;exit}}' "$USER_MAP" 2>/dev/null; }
 # Online consistent snapshot of ONE user's state.sqlite (run AS the owner so the
 # WAL stays owned; never stops the serve). Uses global $target for the filename.
 # Echoes the backup path on success; non-zero on failure.
 backup_user() {
  local u="$1" src out dst ts
  src="/home/$u/.t3/userdata/state.sqlite"; [ -f "$src" ] || return 1
  ts="$(date +%Y%m%d-%H%M%S)"
  out="$BACKUP_DIR/$u"; dst="$out/state-prebump-$target-$ts.sqlite"
  install -d -o "$u" -g "$u" -m700 "$out" 2>/dev/null || mkdir -p "$out"
  if runuser -u "$u" -- timeout "$T3_BACKUP_TIMEOUT" sqlite3 "$src" "VACUUM INTO '$dst'" 2>/dev/null && [ -s "$dst" ]; then
    printf '%s\n' "$dst"; return 0
  fi
  rm -f "$dst"; return 1
 }
 # newest pre-bump backup for a user taken for the current $target (restore source).
 prebump_of() { ls -1t "$BACKUP_DIR/$1/state-prebump-$target-"*.sqlite 2>/dev/null | head -1; }
 # roll the GLOBAL binary back to last-good. In the idle path last_good==installed,
 # so this is a harmless no-op reinstall (does NOT downgrade other users).
 rollback_binary() {
  LOG "rolling back binary $target -> $last_good"
  if npm i -g "t3@$last_good" >/dev/null 2>&1; then LOG "rolled back to $last_good"; return 0; fi
  LOG "ROLLBACK FAILED — could not reinstall t3@$last_good (t3 may be broken; manual fix per runbook)"; return 1
 }
 # verify a user's pairing through the REAL dispatch (mint -> exchange -> cookie).
 verify_pairing() {
  local u="$1" ak out; ak="$(ak_for "$u")"; [ -n "$ak" ] || { LOG "no authentik mapping for $u — skipping dispatch verify"; return 0; }
  out="$(curl -s -i --max-time 10 -H "X-authentik-username: $ak" -H 'Sec-Fetch-Dest: document' "http://$DISPATCH/" 2>/dev/null)"
  printf '%s' "$out" | grep -qi '^set-cookie:[[:space:]]*t3_session='
 }
 # safe_restart_unit <unit> <user>: restart the unit, verify pairing; on failure
 # restore the user's DB from its pre-restart backup, roll the binary back, freeze.
 # Assumes a pre-restart backup already exists for <user> at the current $target
 # (the daily job's backup_all, or the idle job's backup_user, takes it first).
 # Returns 0 on verified success, non-zero after recovery+freeze on failure.
 safe_restart_unit() {
  local unit="$1" u="$2" ok=0 _ bak
  systemctl restart "$unit" || LOG "WARN: systemctl restart $unit returned non-zero"
  for _ in $(seq 1 15); do
    if verify_pairing "$u"; then ok=1; break; fi
    sleep 2
  done
  if [ "$ok" = "1" ]; then
    LOG "restarted $unit -> $target (pairing verified via dispatch)"; return 0
  fi
  LOG "HEALTH-CHECK FAILED: $u pairing broken AFTER restart onto $target — rolling back + restoring its DB"
  rollback_binary
  bak="$(prebump_of "$u")"
  if [ -n "$bak" ]; then
    systemctl stop "$unit" 2>/dev/null
    if install -o "$u" -g "$u" -m600 "$bak" "/home/$u/.t3/userdata/state.sqlite" 2>/dev/null; then
      rm -f "/home/$u/.t3/userdata/state.sqlite-wal" "/home/$u/.t3/userdata/state.sqlite-shm"
      LOG "restored $u state.sqlite from $bak"
    fi
    systemctl start "$unit" 2>/dev/null
  fi
  touch "$FREEZE_FILE" 2>/dev/null
  LOG "FROZEN ($FREEZE_FILE) after $u failed on $target; last_good stays $last_good — investigate, then remove the freeze file to resume"
  return 1
 }