t3: prepare to adopt 0.0.25 — version-agnostic dispatch + real pairing health-check + state backup [ci skip]
Investigated the 0.0.25 break: it is ONLY an endpoint rename
(/api/auth/bootstrap -> /api/auth/browser-session). The rest of the pairing
contract (credential payload, t3_session cookie, /api/auth/session) is
byte-identical, verified in isolated 0.0.24-vs-0.0.25 sandbox serves. So a
future pin bump is now safe + reversible (pin STAYS 0.0.24 — this is prep):
- t3-dispatch: autoPair tries /api/auth/browser-session, falls back to
/api/auth/bootstrap on 404 — one binary pairs across both versions and any
rolling-restart skew. TDD via TestAutoPairAcrossVersions (red on 0.0.25
before, green after). Built, deployed, verified live on 0.0.24 (all three
users still 302 + t3_session via the fallback).
- t3-autoupdate.sh: health-check now exercises the REAL mint->credential->cookie
handshake (was GET / -> 200, which passed the pairing-broken nightly). A bad
build now auto-rolls-back. Validated against both versions.
- t3-backup-state.{sh,service,timer}: daily online VACUUM INTO of each ~/.t3
state.sqlite (previously the only copy, with no backup), so undoing the one-way
forward schema migration becomes a restore, not sqlite surgery. Timeout-guarded.
- runbooks/t3-version-bump.md: the reversible cutover checklist.
- post-mortem #5 (health-check) DONE + #6 added; service-catalog updated.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
5ea238c707
commit
bccaa08d8e
9 changed files with 311 additions and 19 deletions
|
|
@@ -9,8 +9,10 @@
|
|||
# To move the pin: bump T3_PIN AND first verify t3-dispatch's bootstrap flow against the
|
||||
# new build (curl the dispatch -> expect 302 + Set-Cookie t3_session). See post-mortem
|
||||
# 2026-06-09-t3-nightly-autoupdate-auth-outage.md.
|
||||
# CAVEAT: the health-check below only probes GET / (200) — it does NOT exercise the
|
||||
# mint/bootstrap/pairing path, so it will NOT catch an auth regression on its own.
|
||||
# The health-check below exercises the REAL pairing handshake (mint -> credential
|
||||
# exchange -> t3_session cookie), mirroring t3-dispatch's endpoint fallback — so a
|
||||
# build that renames or breaks the pairing API fails the check and auto-rolls-back
|
||||
# (closes the 2026-06-09 miss, where a GET / probe passed a pairing-broken build).
|
||||
set -uo pipefail
|
||||
T3_PIN="${T3_PIN:-0.0.24}" # known-good, t3-dispatch-compatible (2026-06-09 post-mortem)
|
||||
LOG() { logger -t t3-autoupdate "$*"; echo "t3-autoupdate: $*"; }
|
||||
|
|
@@ -27,17 +29,34 @@ fi
|
|||
LOG "re-pinned to $after (was $before); health-checking…"
|
||||
|
||||
# Health-check the NEW binary on a throwaway port/base-dir before trusting it.
|
||||
# Gate 1 = liveness (GET / -> 200); Gate 2 = the REAL pairing handshake t3-dispatch
|
||||
# performs (mint -> POST credential -> 200 + t3_session cookie), trying the same
|
||||
# endpoint fallback. Gate 2 catches a bootstrap-API rename / pairing regression.
|
||||
SMOKE_PORT=3799; SMOKE_DIR=$(mktemp -d)
|
||||
t3 serve --host 127.0.0.1 --port "$SMOKE_PORT" --base-dir "$SMOKE_DIR" >/dev/null 2>&1 &
|
||||
smoke=$!; ok=0
|
||||
smoke=$!; live=0; pair_ok=0
|
||||
for _ in $(seq 1 15); do
|
||||
[[ "$(curl -s -o /dev/null -w '%{http_code}' --max-time 5 "http://127.0.0.1:$SMOKE_PORT/" 2>/dev/null)" == "200" ]] && { ok=1; break; }
|
||||
[[ "$(curl -s -o /dev/null -w '%{http_code}' --max-time 5 "http://127.0.0.1:$SMOKE_PORT/" 2>/dev/null)" == "200" ]] && { live=1; break; }
|
||||
sleep 2
|
||||
done
|
||||
if [[ "$live" == "1" ]]; then
|
||||
cred=$(t3 auth pairing create --base-dir "$SMOKE_DIR" --ttl 5m --json 2>/dev/null \
|
||||
| tr -d '\n ' | sed -n 's/.*"credential":"\([^"]*\)".*/\1/p')
|
||||
if [[ -n "$cred" ]]; then
|
||||
for ep in /api/auth/browser-session /api/auth/bootstrap; do # mirror t3-dispatch's fallback
|
||||
hdr=$(curl -s -i --max-time 5 -X POST -H 'Content-Type: application/json' \
|
||||
-d "{\"credential\":\"$cred\"}" "http://127.0.0.1:$SMOKE_PORT$ep" 2>/dev/null)
|
||||
code=$(printf '%s' "$hdr" | sed -n '1s#.* \([0-9][0-9][0-9]\).*#\1#p')
|
||||
[[ "$code" == "404" ]] && continue # endpoint absent in this build — try the next
|
||||
printf '%s' "$hdr" | grep -qi '^set-cookie:[[:space:]]*t3_session=' && pair_ok=1
|
||||
break
|
||||
done
|
||||
fi
|
||||
fi
|
||||
kill "$smoke" 2>/dev/null; wait "$smoke" 2>/dev/null; rm -rf "$SMOKE_DIR"
|
||||
|
||||
if [[ "$ok" != "1" ]]; then
|
||||
LOG "HEALTH-CHECK FAILED for $after — rolling back to $before"
|
||||
if [[ "$live" != "1" || "$pair_ok" != "1" ]]; then
|
||||
LOG "HEALTH-CHECK FAILED for $after (live=$live pair=$pair_ok) — rolling back to $before"
|
||||
if [[ -n "$before" ]] && npm i -g "t3@$before" >/dev/null 2>&1; then
|
||||
LOG "rolled back to $before"
|
||||
else
|
||||
|
|
@@ -45,7 +64,7 @@ if [[ "$ok" != "1" ]]; then
|
|||
fi
|
||||
exit 1
|
||||
fi
|
||||
LOG "health OK; restarting idle instances"
|
||||
LOG "health OK (live + pairing handshake); restarting idle instances"
|
||||
|
||||
# Restart only IDLE per-user instances; defer any with an active agent child.
|
||||
for unit in $(systemctl list-units --type=service --state=running --no-legend 't3-serve@*' | awk '{print $1}'); do
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue