t3: prepare to adopt 0.0.25 — version-agnostic dispatch + real pairing health-check + state backup [ci skip]

Investigated the 0.0.25 break: it is ONLY an endpoint rename
(/api/auth/bootstrap -> /api/auth/browser-session). The rest of the pairing
contract (credential payload, t3_session cookie, /api/auth/session) is
byte-identical, verified in isolated 0.0.24-vs-0.0.25 sandbox serves. So a
future pin bump is now safe + reversible (pin STAYS 0.0.24 — this is prep):

- t3-dispatch: autoPair tries /api/auth/browser-session, falls back to
  /api/auth/bootstrap on 404 — one binary pairs across both versions and any
  rolling-restart skew. TDD via TestAutoPairAcrossVersions (red on 0.0.25
  before, green after). Built, deployed, verified live on 0.0.24 (all three
  users still 302 + t3_session via the fallback).
- t3-autoupdate.sh: health-check now exercises the REAL mint->credential->cookie
  handshake (was GET / -> 200, which passed the pairing-broken nightly). A bad
  build now auto-rolls-back. Validated against both versions.
- t3-backup-state.{sh,service,timer}: daily online VACUUM INTO of each ~/.t3
  state.sqlite (previously the only copy, with no backup) -> undoing the one-way
  forward schema migration becomes a restore, not sqlite surgery. timeout-guarded.
- runbooks/t3-version-bump.md: the reversible cutover checklist.
- post-mortem #5 (health-check) DONE + #6 added; service-catalog updated.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Viktor Barzin 2026-06-09 20:00:11 +00:00
parent 5ea238c707
commit bccaa08d8e
9 changed files with 311 additions and 19 deletions

View file

@ -9,8 +9,10 @@
# To move the pin: bump T3_PIN AND first verify t3-dispatch's bootstrap flow against the
# new build (curl the dispatch -> expect 302 + Set-Cookie t3_session). See post-mortem
# 2026-06-09-t3-nightly-autoupdate-auth-outage.md.
# CAVEAT: the health-check below only probes GET / (200) — it does NOT exercise the
# mint/bootstrap/pairing path, so it will NOT catch an auth regression on its own.
# The health-check below exercises the REAL pairing handshake (mint -> credential
# exchange -> t3_session cookie), mirroring t3-dispatch's endpoint fallback — so a
# build that renames or breaks the pairing API fails the check and auto-rolls-back
# (closes the 2026-06-09 miss, where a GET / probe passed a pairing-broken build).
set -uo pipefail
T3_PIN="${T3_PIN:-0.0.24}" # known-good, t3-dispatch-compatible (2026-06-09 post-mortem)
LOG() { logger -t t3-autoupdate "$*"; echo "t3-autoupdate: $*"; }
@ -27,17 +29,34 @@ fi
LOG "re-pinned to $after (was $before); health-checking…"
# Health-check the NEW binary on a throwaway port/base-dir before trusting it.
# Gate 1 = liveness (GET / -> 200); Gate 2 = the REAL pairing handshake t3-dispatch
# performs (mint -> POST credential -> 200 + t3_session cookie), trying the same
# endpoint fallback. Gate 2 catches a bootstrap-API rename / pairing regression.
SMOKE_PORT=3799; SMOKE_DIR=$(mktemp -d)
t3 serve --host 127.0.0.1 --port "$SMOKE_PORT" --base-dir "$SMOKE_DIR" >/dev/null 2>&1 &
smoke=$!; ok=0
smoke=$!; live=0; pair_ok=0
for _ in $(seq 1 15); do
[[ "$(curl -s -o /dev/null -w '%{http_code}' --max-time 5 "http://127.0.0.1:$SMOKE_PORT/" 2>/dev/null)" == "200" ]] && { ok=1; break; }
[[ "$(curl -s -o /dev/null -w '%{http_code}' --max-time 5 "http://127.0.0.1:$SMOKE_PORT/" 2>/dev/null)" == "200" ]] && { live=1; break; }
sleep 2
done
if [[ "$live" == "1" ]]; then
cred=$(t3 auth pairing create --base-dir "$SMOKE_DIR" --ttl 5m --json 2>/dev/null \
| tr -d '\n ' | sed -n 's/.*"credential":"\([^"]*\)".*/\1/p')
if [[ -n "$cred" ]]; then
for ep in /api/auth/browser-session /api/auth/bootstrap; do # mirror t3-dispatch's fallback
hdr=$(curl -s -i --max-time 5 -X POST -H 'Content-Type: application/json' \
-d "{\"credential\":\"$cred\"}" "http://127.0.0.1:$SMOKE_PORT$ep" 2>/dev/null)
code=$(printf '%s' "$hdr" | sed -n '1s#.* \([0-9][0-9][0-9]\).*#\1#p')
[[ "$code" == "404" ]] && continue # endpoint absent in this build — try the next
printf '%s' "$hdr" | grep -qi '^set-cookie:[[:space:]]*t3_session=' && pair_ok=1
break
done
fi
fi
kill "$smoke" 2>/dev/null; wait "$smoke" 2>/dev/null; rm -rf "$SMOKE_DIR"
if [[ "$ok" != "1" ]]; then
LOG "HEALTH-CHECK FAILED for $after — rolling back to $before"
if [[ "$live" != "1" || "$pair_ok" != "1" ]]; then
LOG "HEALTH-CHECK FAILED for $after (live=$live pair=$pair_ok) — rolling back to $before"
if [[ -n "$before" ]] && npm i -g "t3@$before" >/dev/null 2>&1; then
LOG "rolled back to $before"
else
@ -45,7 +64,7 @@ if [[ "$ok" != "1" ]]; then
fi
exit 1
fi
LOG "health OK; restarting idle instances"
LOG "health OK (live + pairing handshake); restarting idle instances"
# Restart only IDLE per-user instances; defer any with an active agent child.
for unit in $(systemctl list-units --type=service --state=running --no-legend 't3-serve@*' | awk '{print $1}'); do

View file

@ -0,0 +1,6 @@
# Runs the t3 state backup program once per activation.
# NOTE(review): there is no [Install] section, so this unit cannot be enabled on
# its own — presumably it is only ever started by the matching .timer; confirm.
[Unit]
Description=Consistent backup of per-user t3 ~/.t3 state.sqlite (history + auth)
[Service]
# oneshot: the unit counts as started only once ExecStart has exited.
Type=oneshot
ExecStart=/usr/local/bin/t3-backup-state

View file

@ -0,0 +1,43 @@
#!/usr/bin/env bash
# Consistent online backup of each t3 user's ~/.t3 state.sqlite (chat/session
# history AND auth tables). ~/.t3 lives on the devvm local disk — NOT a K8s PVC and
# NOT in the 3-2-1 pipeline — so without this it is the only copy and a rebuild
# loses it. It also makes a t3 version bump REVERSIBLE: 0.0.25+ migrate the schema
# FORWARD (a one-way door), so a clean pre-bump backup turns rollback into a restore
# instead of per-user sqlite surgery (see runbooks/t3-version-bump.md). Runs as root
# via t3-backup-state.timer; the per-user VACUUM INTO runs AS the owning user so the
# live WAL/-shm files keep their owner and the running t3-serve is never perturbed.
#
# Environment (all optional):
#   T3_BACKUP_DEST     destination root            (default /var/backups/t3-state)
#   T3_BACKUP_KEEP     snapshots kept per user, positive integer   (default 14)
#   T3_BACKUP_TIMEOUT  per-user copy limit handed to timeout(1)    (default 900)
# Exit status: 0 when every attempted backup succeeded (or there was nothing to
# do), 1 when at least one user was rejected or failed.
set -uo pipefail
DEST="${T3_BACKUP_DEST:-/var/backups/t3-state}"
KEEP="${T3_BACKUP_KEEP:-14}"
TIMEOUT="${T3_BACKUP_TIMEOUT:-900}"
MAP=/etc/ttyd-user-map
LOG() { logger -t t3-backup-state "$*"; echo "t3-backup-state: $*"; }

# A zero/negative/non-numeric KEEP would make the retention step below delete every
# snapshot (including the one just taken) or fail outright — fall back to the default.
if [[ ! "$KEEP" =~ ^[1-9][0-9]*$ ]]; then
  LOG "WARN: invalid T3_BACKUP_KEEP='$KEEP'; using 14"
  KEEP=14
fi

ts=$(date +%Y%m%d-%H%M%S)
# User names end up in root-run paths and inside a SQL string literal, so accept
# only plain account names (no '/', quotes, whitespace, leading '-' or '.').
valid_user='^[A-Za-z0-9_][A-Za-z0-9_.-]*$'
sq="'"

# RHS of each non-comment "authentik=os_user" line = an OS user owning a ~/.t3.
# Lines with an empty RHS are dropped.
mapfile -t users < <(awk -F= '!/^[[:space:]]*#/ && NF==2 { gsub(/[[:space:]]/,"",$2); if ($2 != "") print $2 }' "$MAP" 2>/dev/null | sort -u)
[[ ${#users[@]} -gt 0 ]] || { LOG "no users in $MAP; nothing to back up"; exit 0; }

rc=0
for u in "${users[@]}"; do
  if [[ ! "$u" =~ $valid_user ]]; then
    LOG "WARN: skipping invalid user name '$u' from $MAP"; rc=1; continue
  fi
  # Resolve the real home directory from passwd; fall back to /home/$u when the
  # lookup yields nothing.
  home=$(getent passwd "$u" 2>/dev/null | cut -d: -f6)
  src="${home:-/home/$u}/.t3/userdata/state.sqlite"
  if [[ ! -f "$src" ]]; then LOG "skip $u (no state.sqlite)"; continue; fi
  out="$DEST/$u"; dst="$out/state-$ts.sqlite"
  if ! install -d -o "$u" -g "$u" -m 0700 "$out"; then
    LOG "WARN: backup FAILED for $u (cannot create $out)"; rc=1; continue
  fi
  # VACUUM INTO takes a consistent read-snapshot copy — unlike .backup it does NOT
  # restart when the source is written mid-copy, so it finishes in a single pass even
  # for the actively-used instance (the admin's own live session, which .backup would
  # loop on forever). Run as the owning user so WAL access keeps the live serve happy.
  # timeout caps a pathologically-slow copy (huge DB + concurrent writes on a contended
  # disk) so the daily run can never wedge — it just logs + retries next cycle. The
  # daily 03:30 slot normally finds instances idle, where even a large DB copies fast.
  # Single quotes in the target path are doubled so they stay inside the SQL literal;
  # stderr is captured (stdout dropped) so a failure is logged with its reason.
  sql_dst=${dst//$sq/$sq$sq}
  if err=$(runuser -u "$u" -- timeout "$TIMEOUT" sqlite3 "$src" "VACUUM INTO '$sql_dst'" 2>&1 >/dev/null) \
      && [[ -s "$dst" ]]; then
    LOG "backed up $u -> $dst ($(stat -c%s "$dst" 2>/dev/null) bytes)"
  else
    err=${err//$'\n'/ }
    LOG "WARN: backup FAILED for $u ($src): ${err:-no error output}"; rc=1; rm -f -- "$dst"
  fi
  # retention: keep newest $KEEP per user (by mtime). Built from find output rather
  # than parsed ls, and newline-delimited into xargs so odd DEST paths stay intact.
  find "$out" -maxdepth 1 -type f -name 'state-*.sqlite' -printf '%T@ %p\n' 2>/dev/null \
    | sort -rn \
    | tail -n "+$((KEEP + 1))" \
    | cut -d' ' -f2- \
    | xargs -r -d '\n' rm -f --
done
LOG "done (rc=$rc)"
exit "$rc"

View file

@ -0,0 +1,10 @@
# Schedules the daily t3 state backup.
# NOTE(review): no Unit= is set, so systemd activates the service whose name
# matches this timer's file name — confirm the two units share a base name.
[Unit]
Description=Daily t3 state.sqlite backup (the only copy of ~/.t3; enables version-bump rollback)
[Timer]
# Fire once a day at 03:30.
OnCalendar=*-*-* 03:30:00
# Delay each run by a random amount of up to 20 minutes.
RandomizedDelaySec=20m
# Catch up on a run that was missed while the timer was inactive (e.g. host off).
Persistent=true
[Install]
WantedBy=timers.target

View file

@ -113,9 +113,42 @@ func isDocumentNav(r *http.Request) bool {
return strings.Contains(r.Header.Get("Accept"), "text/html")
}
// pairEndpoints are the instance's session-bootstrap paths in preference order;
// exchangeCredential walks this slice front to back.
// t3 renamed /api/auth/bootstrap -> /api/auth/browser-session in 0.0.25; trying the
// new name first and falling back to the old lets ONE dispatch binary pair against
// either version — so the t3 pin can move forward (and survive a rolling-restart
// skew where some instances are already on the new version) without a 502 storm.
var pairEndpoints = []string{"/api/auth/browser-session", "/api/auth/bootstrap"}
// exchangeCredential sends the pairing credential as a JSON POST to the instance
// listening on 127.0.0.1:port, walking pairEndpoints in order.
//
// Per endpoint:
//   - transport error: remembered, then the next endpoint is tried;
//   - 404: treated as "this t3 version lacks the endpoint"; the body is closed
//     and the next endpoint is tried;
//   - any other status: the response is returned untouched and the caller must
//     close resp.Body.
//
// When no endpoint yields a usable response, the last transport error is returned
// if there was one; otherwise an error stating that every endpoint returned 404.
func exchangeCredential(port int, credential string) (*http.Response, error) {
	payload, _ := json.Marshal(map[string]string{"credential": credential})

	var transportErr error
	for _, path := range pairEndpoints {
		target := fmt.Sprintf("http://127.0.0.1:%d%s", port, path)
		res, postErr := http.Post(target, "application/json", bytes.NewReader(payload))
		switch {
		case postErr != nil:
			transportErr = postErr
		case res.StatusCode == http.StatusNotFound:
			res.Body.Close() // endpoint absent in this t3 version — move on
		default:
			return res, nil
		}
	}

	if transportErr == nil {
		return nil, fmt.Errorf("no pairing endpoint accepted the request (all returned 404)")
	}
	return nil, transportErr
}
// autoPair mints a one-time pairing token for the user's instance (as that OS
// user, via the scoped sudoers entry) and exchanges it at the instance's
// /api/auth/bootstrap, relaying the returned t3_session Set-Cookie to the browser.
// user, via the scoped sudoers entry) and exchanges it at the instance's pairing
// endpoint, relaying the returned t3_session Set-Cookie to the browser.
func autoPair(e entry, w http.ResponseWriter, r *http.Request) {
// t3-mint (root, via scoped sudoers) validates the OS user is in
// /etc/ttyd-user-map, then mints as that user. The dispatch service itself
@ -133,16 +166,15 @@ func autoPair(e entry, w http.ResponseWriter, r *http.Request) {
http.Error(w, "unparseable pairing output", http.StatusInternalServerError)
return
}
body, _ := json.Marshal(map[string]string{"credential": pc.Credential})
resp, err := http.Post(fmt.Sprintf("http://127.0.0.1:%d/api/auth/bootstrap", e.Port),
"application/json", bytes.NewReader(body))
resp, err := exchangeCredential(e.Port, pc.Credential)
if err != nil {
log.Printf("pairing exchange for %s failed: %v", e.OsUser, err)
http.Error(w, "bootstrap request failed", http.StatusBadGateway)
return
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
log.Printf("bootstrap for %s returned %d", e.OsUser, resp.StatusCode)
log.Printf("pairing for %s returned %d", e.OsUser, resp.StatusCode)
http.Error(w, "bootstrap rejected", http.StatusBadGateway)
return
}

View file

@ -117,6 +117,8 @@ func fakeInstance(authenticated bool, bootstrapCalled *bool) *httptest.Server {
}
http.SetCookie(w, &http.Cookie{Name: cookieName, Value: "fresh", Path: "/"})
_, _ = w.Write([]byte(`{"authenticated":true}`))
case "/api/auth/browser-session":
http.NotFound(w, r) // models a 0.0.24 instance: the 0.0.25 endpoint is absent
default:
_, _ = w.Write([]byte("APP"))
}
@ -198,3 +200,61 @@ func TestHandlerProxiesXHREvenIfCookieInvalid(t *testing.T) {
t.Fatalf("XHR should proxy through, got %d %q", w.Code, w.Body.String())
}
}
// pairInstance starts a test server standing in for a t3 instance that serves
// pairing on exactly one of the two known paths (/api/auth/browser-session or
// /api/auth/bootstrap). A request to pairPath gets 200, the session cookie with
// value "fresh" and an authenticated JSON body; every other path — including the
// other known pairing path — gets 404. When hit is non-nil, the path that served
// the pairing request is stored through it.
func pairInstance(pairPath string, hit *string) *httptest.Server {
	knownPairPaths := map[string]bool{
		"/api/auth/browser-session": true,
		"/api/auth/bootstrap":       true,
	}
	serve := func(w http.ResponseWriter, r *http.Request) {
		requested := r.URL.Path
		if !knownPairPaths[requested] || requested != pairPath {
			// unknown route, or a pairing endpoint absent in this t3 version
			http.NotFound(w, r)
			return
		}
		if hit != nil {
			*hit = requested
		}
		http.SetCookie(w, &http.Cookie{Name: cookieName, Value: "fresh", Path: "/"})
		_, _ = w.Write([]byte(`{"authenticated":true}`))
	}
	return httptest.NewServer(http.HandlerFunc(serve))
}
// TestAutoPairAcrossVersions verifies that one dispatch build pairs against an
// instance that serves only the 0.0.24 path (/api/auth/bootstrap) as well as one
// that serves only the 0.0.25 path (/api/auth/browser-session). With mintToken
// stubbed, each subtest sends a cookie-less document request and expects a 302,
// the pairing request landing on the path the instance serves, and the "fresh"
// cookie relayed in the response.
func TestAutoPairAcrossVersions(t *testing.T) {
	savedMint := mintToken
	defer func() { mintToken = savedMint }()
	mintToken = func(string) ([]byte, error) { return []byte(`{"credential":"tok"}`), nil }

	type versionCase struct{ name, pairPath string }
	cases := []versionCase{
		{name: "0.0.25 browser-session", pairPath: "/api/auth/browser-session"},
		{name: "0.0.24 bootstrap", pairPath: "/api/auth/bootstrap"},
	}

	for _, vc := range cases {
		t.Run(vc.name, func(t *testing.T) {
			var served string
			inst := pairInstance(vc.pairPath, &served)
			defer inst.Close()
			setTable(portOf(t, inst))

			req := httptest.NewRequest("GET", "/", nil)
			req.Header.Set("X-authentik-username", "vbarzin@gmail.com") // no cookie -> autoPair
			rec := httptest.NewRecorder()
			handler(rec, req)

			if rec.Code != http.StatusFound {
				t.Fatalf("want 302 re-pair, got %d body=%q", rec.Code, rec.Body.String())
			}
			if served != vc.pairPath {
				t.Fatalf("want pairing via %s, hit=%q", vc.pairPath, served)
			}
			cookies := rec.Result().Cookies()
			if len(cookies) == 0 || cookies[0].Value != "fresh" {
				t.Fatalf("want fresh t3_session relayed, got %+v", cookies)
			}
		})
	}
}