t3-autoupdate: source the shared safe-restart lib + record deferrals
Behavior-preserving refactor: the per-unit restart/recover body and small helpers now come from t3-safe-restart.sh (one audited copy).

Additionally, when a unit is deferred for an active agent, write a marker under /var/lib/t3-autoupdate/deferred/ so the new idle migrator can drain it later; clear the marker on a successful restart. Install/health-gate/canary logic is unchanged.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
2ab5b94748
commit
de97696ff0
1 changed file with 17 additions and 61 deletions
|
|
@@ -21,7 +21,7 @@
|
||||||
# - canary rollout: restart idle instances ONE AT A TIME, verifying pairing
|
# - canary rollout: restart idle instances ONE AT A TIME, verifying pairing
|
||||||
# through the real dispatch after each, and roll back (binary + that user's DB)
|
# through the real dispatch after each, and roll back (binary + that user's DB)
|
||||||
# + self-freeze on the first failure — active-agent instances are deferred,
|
# + self-freeze on the first failure — active-agent instances are deferred,
|
||||||
# never killed;
|
# never killed (deferred instances are recorded for t3-migrate-idle to drain);
|
||||||
# - rollback target is the recorded LAST-GOOD build, not "whatever was installed".
|
# - rollback target is the recorded LAST-GOOD build, not "whatever was installed".
|
||||||
# Detection backstop (real-user pairing failure/fallback) lives in the dispatch
|
# Detection backstop (real-user pairing failure/fallback) lives in the dispatch
|
||||||
# logs + Loki alerts (T3PairingBroken / T3PairFallbackHigh / T3AutoUpdate*).
|
# logs + Loki alerts (T3PairingBroken / T3PairFallbackHigh / T3AutoUpdate*).
|
||||||
|
|
@@ -29,24 +29,17 @@
|
||||||
# Full procedure + manual rollback: docs/runbooks/t3-version-bump.md.
|
# Full procedure + manual rollback: docs/runbooks/t3-version-bump.md.
|
||||||
set -uo pipefail
|
set -uo pipefail
|
||||||
|
|
||||||
|
# ---- autoupdate-specific config (shared config + helpers come from the lib) -----
|
||||||
T3_TRACK="${T3_TRACK:-nightly}" # npm dist-tag to follow (nightly | latest)
|
T3_TRACK="${T3_TRACK:-nightly}" # npm dist-tag to follow (nightly | latest)
|
||||||
T3_PIN="${T3_PIN:-}" # optional HARD pin to an exact version (disables tracking)
|
T3_PIN="${T3_PIN:-}" # optional HARD pin to an exact version (disables tracking)
|
||||||
FREEZE_FILE="${T3_FREEZE_FILE:-/etc/t3-autoupdate.freeze}"
|
|
||||||
STATE_DIR="${T3_STATE_DIR:-/var/lib/t3-autoupdate}"
|
|
||||||
LAST_GOOD_FILE="$STATE_DIR/last-good"
|
|
||||||
BACKUP_DIR="${T3_BACKUP_DEST:-/var/backups/t3-state}"
|
|
||||||
SMOKE_PORT="${T3_SMOKE_PORT:-3799}"
|
SMOKE_PORT="${T3_SMOKE_PORT:-3799}"
|
||||||
DISPATCH="${T3_DISPATCH:-127.0.0.1:3780}"
|
|
||||||
USER_MAP="${T3_USER_MAP:-/etc/ttyd-user-map}"
|
|
||||||
DRY_RUN="${T3_DRY_RUN:-0}"
|
DRY_RUN="${T3_DRY_RUN:-0}"
|
||||||
TMPROOT="${T3_TMPDIR:-/var/tmp}" # health-check scratch on DISK — /tmp is a 2G tmpfs and a populated state.sqlite (~hundreds of MB) overflows it
|
TMPROOT="${T3_TMPDIR:-/var/tmp}" # health-check scratch on DISK — /tmp is a 2G tmpfs and a populated state.sqlite (~hundreds of MB) overflows it
|
||||||
|
|
||||||
LOG() { logger -t t3-autoupdate "$*"; echo "t3-autoupdate: $*"; }
|
LOG_TAG=t3-autoupdate
|
||||||
ver() { t3 --version 2>/dev/null | awk '{print $NF}' | sed 's/^v//'; }
|
# shellcheck source=scripts/t3-safe-restart.sh
|
||||||
# OS users owning a ~/.t3 (RHS of each non-comment "authentik=os_user" map line).
|
. "${T3_SAFE_RESTART_LIB:-/usr/local/lib/t3-safe-restart.sh}"
|
||||||
osusers() { awk -F= '!/^[[:space:]]*#/&&NF==2{gsub(/[[:space:]]/,"",$2);print $2}' "$USER_MAP" 2>/dev/null | sort -u; }
|
|
||||||
# authentik username for an OS user (reverse map; first match) — for dispatch verify.
|
|
||||||
ak_for() { awk -F= -v u="$1" '!/^[[:space:]]*#/&&NF==2{gsub(/[[:space:]]/,"",$1);gsub(/[[:space:]]/,"",$2);if($2==u){print $1;exit}}' "$USER_MAP" 2>/dev/null; }
|
|
||||||
# is $1 a strictly-newer version than $2 (version-sort)?
|
# is $1 a strictly-newer version than $2 (version-sort)?
|
||||||
newer() { [ "$1" != "$2" ] && [ "$(printf '%s\n%s\n' "$1" "$2" | sort -V | tail -1)" = "$1" ]; }
|
newer() { [ "$1" != "$2" ] && [ "$(printf '%s\n%s\n' "$1" "$2" | sort -V | tail -1)" = "$1" ]; }
|
||||||
|
|
||||||
|
|
@@ -86,27 +79,21 @@ LOG "candidate: $current -> $target (track=$T3_TRACK, last_good=$last_good, dry_
|
||||||
# ---- helpers: backup, health-check, rollback, restart-verify --------------------
|
# ---- helpers: backup, health-check, rollback, restart-verify --------------------
|
||||||
# Online consistent per-user snapshot (run AS the owner so WAL stays owned; never
|
# Online consistent per-user snapshot (run AS the owner so WAL stays owned; never
|
||||||
# stops the serve). Sets $ADMIN_SEED to wizard's backup for the migration health
|
# stops the serve). Sets $ADMIN_SEED to wizard's backup for the migration health
|
||||||
# check. Mirrors t3-backup-state.sh.
|
# check. Mirrors t3-backup-state.sh. (backup_user lives in the shared lib.)
|
||||||
ADMIN_SEED=""
|
ADMIN_SEED=""
|
||||||
backup_all() {
|
backup_all() {
|
||||||
local u src out dst ts; ts="$(date +%Y%m%d-%H%M%S)"
|
local u dst
|
||||||
for u in $(osusers); do
|
for u in $(osusers); do
|
||||||
src="/home/$u/.t3/userdata/state.sqlite"; [ -f "$src" ] || continue
|
if dst="$(backup_user "$u")"; then
|
||||||
out="$BACKUP_DIR/$u"; dst="$out/state-prebump-$target-$ts.sqlite"
|
|
||||||
install -d -o "$u" -g "$u" -m700 "$out" 2>/dev/null || mkdir -p "$out"
|
|
||||||
if runuser -u "$u" -- timeout "${T3_BACKUP_TIMEOUT:-900}" sqlite3 "$src" "VACUUM INTO '$dst'" 2>/dev/null && [ -s "$dst" ]; then
|
|
||||||
LOG "pre-bump backup: $u -> $dst ($(stat -c%s "$dst" 2>/dev/null) bytes)"
|
LOG "pre-bump backup: $u -> $dst ($(stat -c%s "$dst" 2>/dev/null) bytes)"
|
||||||
[ "$u" = "wizard" ] && ADMIN_SEED="$dst"
|
[ "$u" = "wizard" ] && ADMIN_SEED="$dst"
|
||||||
else
|
else
|
||||||
LOG "WARN: pre-bump backup FAILED for $u ($src)"; rm -f "$dst"
|
LOG "WARN: pre-bump backup FAILED for $u (/home/$u/.t3/userdata/state.sqlite)"
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
[ -n "$ADMIN_SEED" ] || ADMIN_SEED="$(ls -1t "$BACKUP_DIR"/*/"state-prebump-$target-"*.sqlite 2>/dev/null | head -1)"
|
[ -n "$ADMIN_SEED" ] || ADMIN_SEED="$(ls -1t "$BACKUP_DIR"/*/"state-prebump-$target-"*.sqlite 2>/dev/null | head -1)"
|
||||||
}
|
}
|
||||||
|
|
||||||
# newest pre-bump backup taken THIS run for a user (for restore-on-rollback).
|
|
||||||
prebump_of() { ls -1t "$BACKUP_DIR/$1/state-prebump-$target-"*.sqlite 2>/dev/null | head -1; }
|
|
||||||
|
|
||||||
# health_check <t3bin> [seed_db]: start a throwaway serve (seeded with a copy of a
|
# health_check <t3bin> [seed_db]: start a throwaway serve (seeded with a copy of a
|
||||||
# real populated DB if given, so the forward migration runs on real data), then do
|
# real populated DB if given, so the forward migration runs on real data), then do
|
||||||
# the real mint -> credential-exchange -> t3_session pairing handshake with the
|
# the real mint -> credential-exchange -> t3_session pairing handshake with the
|
||||||
|
|
@@ -143,27 +130,12 @@ health_check() {
|
||||||
rm -rf "$dir"; return 1
|
rm -rf "$dir"; return 1
|
||||||
}
|
}
|
||||||
|
|
||||||
# roll the GLOBAL binary back to last-good. Pre-restart failures need only this
|
|
||||||
# (no real DB migrated yet); post-restart failures also restore the user's DB.
|
|
||||||
rollback_binary() {
|
|
||||||
LOG "rolling back binary $target -> $last_good"
|
|
||||||
if npm i -g "t3@$last_good" >/dev/null 2>&1; then LOG "rolled back to $last_good"; return 0; fi
|
|
||||||
LOG "ROLLBACK FAILED — could not reinstall t3@$last_good (t3 may be broken; manual fix per runbook)"; return 1
|
|
||||||
}
|
|
||||||
|
|
||||||
# is this t3-serve@<unit> running an active agent (claude/codex/opencode)? never restart those.
|
# is this t3-serve@<unit> running an active agent (claude/codex/opencode)? never restart those.
|
||||||
unit_busy() {
|
unit_busy() {
|
||||||
local unit="$1" pid; pid="$(systemctl show -p MainPID --value "$unit" 2>/dev/null)"
|
local unit="$1" pid; pid="$(systemctl show -p MainPID --value "$unit" 2>/dev/null)"
|
||||||
[ -n "$pid" ] && [ "$pid" != "0" ] && pgrep -aP "$pid" 2>/dev/null | grep -qiE 'claude|codex|opencode'
|
[ -n "$pid" ] && [ "$pid" != "0" ] && pgrep -aP "$pid" 2>/dev/null | grep -qiE 'claude|codex|opencode'
|
||||||
}
|
}
|
||||||
|
|
||||||
# verify a user's pairing through the REAL dispatch (mint -> exchange -> cookie).
|
|
||||||
verify_pairing() {
|
|
||||||
local u="$1" ak out; ak="$(ak_for "$u")"; [ -n "$ak" ] || { LOG "no authentik mapping for $u — skipping dispatch verify"; return 0; }
|
|
||||||
out="$(curl -s -i --max-time 10 -H "X-authentik-username: $ak" -H 'Sec-Fetch-Dest: document' "http://$DISPATCH/" 2>/dev/null)"
|
|
||||||
printf '%s' "$out" | grep -qi '^set-cookie:[[:space:]]*t3_session='
|
|
||||||
}
|
|
||||||
|
|
||||||
# ---- 3. DRY RUN: preview only (install candidate to temp prefix, gate it) -------
|
# ---- 3. DRY RUN: preview only (install candidate to temp prefix, gate it) -------
|
||||||
if [ "$DRY_RUN" = "1" ]; then
|
if [ "$DRY_RUN" = "1" ]; then
|
||||||
LOG "DRY_RUN: would back up [$(osusers | tr '\n' ' ')]; testing candidate $target in a temp prefix (no global change, no restarts)"
|
LOG "DRY_RUN: would back up [$(osusers | tr '\n' ' ')]; testing candidate $target in a temp prefix (no global change, no restarts)"
|
||||||
|
|
@@ -196,31 +168,15 @@ restarted=0; deferred=0
|
||||||
for unit in $(systemctl list-units --type=service --state=running --no-legend 't3-serve@*' 2>/dev/null | awk '{print $1}'); do
|
for unit in $(systemctl list-units --type=service --state=running --no-legend 't3-serve@*' 2>/dev/null | awk '{print $1}'); do
|
||||||
u="$(printf '%s' "$unit" | sed -n 's/^t3-serve@\(.*\)\.service$/\1/p')"; [ -n "$u" ] || continue
|
u="$(printf '%s' "$unit" | sed -n 's/^t3-serve@\(.*\)\.service$/\1/p')"; [ -n "$u" ] || continue
|
||||||
if unit_busy "$unit"; then
|
if unit_busy "$unit"; then
|
||||||
LOG "deferring $unit (active agent) — migrates on its next idle restart"; deferred=$((deferred+1)); continue
|
LOG "deferring $unit (active agent) — migrates on its next idle restart"
|
||||||
|
mkdir -p "$DEFER_DIR" 2>/dev/null && printf '%s\n' "$target" >"$DEFER_DIR/$u" # record for t3-migrate-idle
|
||||||
|
deferred=$((deferred+1)); continue
|
||||||
fi
|
fi
|
||||||
systemctl restart "$unit" || LOG "WARN: systemctl restart $unit returned non-zero"
|
if safe_restart_unit "$unit" "$u"; then
|
||||||
ok=0
|
restarted=$((restarted+1))
|
||||||
for _ in $(seq 1 15); do
|
rm -f "$DEFER_DIR/$u" 2>/dev/null # now current — clear any stale marker
|
||||||
if verify_pairing "$u"; then ok=1; break; fi
|
|
||||||
sleep 2
|
|
||||||
done
|
|
||||||
if [ "$ok" = "1" ]; then
|
|
||||||
LOG "restarted $unit -> $target (pairing verified via dispatch)"; restarted=$((restarted+1))
|
|
||||||
else
|
else
|
||||||
LOG "HEALTH-CHECK FAILED: $u pairing broken AFTER restart onto $target — rolling back + restoring its DB"
|
exit 1 # frozen by safe_restart_unit — preserve today's behavior
|
||||||
rollback_binary
|
|
||||||
bak="$(prebump_of "$u")"
|
|
||||||
if [ -n "$bak" ]; then
|
|
||||||
systemctl stop "$unit" 2>/dev/null
|
|
||||||
if install -o "$u" -g "$u" -m600 "$bak" "/home/$u/.t3/userdata/state.sqlite" 2>/dev/null; then
|
|
||||||
rm -f "/home/$u/.t3/userdata/state.sqlite-wal" "/home/$u/.t3/userdata/state.sqlite-shm"
|
|
||||||
LOG "restored $u state.sqlite from $bak"
|
|
||||||
fi
|
|
||||||
systemctl start "$unit" 2>/dev/null
|
|
||||||
fi
|
|
||||||
touch "$FREEZE_FILE" 2>/dev/null
|
|
||||||
LOG "FROZEN ($FREEZE_FILE) after canary $u failed on $target; last_good stays $last_good — investigate, then remove the freeze file to resume"
|
|
||||||
exit 1
|
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue