t3: prepare to adopt 0.0.25 — version-agnostic dispatch + real pairing health-check + state backup [ci skip]
Investigated the 0.0.25 break: it is ONLY an endpoint rename
(/api/auth/bootstrap -> /api/auth/browser-session). The rest of the pairing
contract (credential payload, t3_session cookie, /api/auth/session) is
byte-identical, verified in isolated 0.0.24-vs-0.0.25 sandbox serves. So a
future pin bump is now safe + reversible (pin STAYS 0.0.24 — this is prep):
- t3-dispatch: autoPair tries /api/auth/browser-session, falls back to
/api/auth/bootstrap on 404 — one binary pairs across both versions and any
rolling-restart skew. TDD via TestAutoPairAcrossVersions (red on 0.0.25
before, green after). Built, deployed, verified live on 0.0.24 (all three
users still 302 + t3_session via the fallback).
- t3-autoupdate.sh: health-check now exercises the REAL mint->credential->cookie
handshake (was GET / -> 200, which passed the pairing-broken nightly). A bad
build now auto-rolls-back. Validated against both versions.
- t3-backup-state.{sh,service,timer}: daily online VACUUM INTO of each ~/.t3
state.sqlite (previously the only copy, with no backup) -> rolling back the one-way
forward schema migration becomes a restore, not sqlite surgery. timeout-guarded.
- runbooks/t3-version-bump.md: the reversible cutover checklist.
- post-mortem #5 (health-check) DONE + #6 added; service-catalog updated.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
5ea238c707
commit
bccaa08d8e
9 changed files with 311 additions and 19 deletions
|
|
@ -9,8 +9,10 @@
|
|||
# To move the pin: bump T3_PIN AND first verify t3-dispatch's bootstrap flow against the
|
||||
# new build (curl the dispatch -> expect 302 + Set-Cookie t3_session). See post-mortem
|
||||
# 2026-06-09-t3-nightly-autoupdate-auth-outage.md.
|
||||
# CAVEAT: the health-check below only probes GET / (200) — it does NOT exercise the
|
||||
# mint/bootstrap/pairing path, so it will NOT catch an auth regression on its own.
|
||||
# The health-check below exercises the REAL pairing handshake (mint -> credential
|
||||
# exchange -> t3_session cookie), mirroring t3-dispatch's endpoint fallback — so a
|
||||
# build that renames or breaks the pairing API fails the check and auto-rolls-back
|
||||
# (closes the 2026-06-09 miss, where a GET / probe passed a pairing-broken build).
|
||||
set -uo pipefail
|
||||
T3_PIN="${T3_PIN:-0.0.24}" # known-good, t3-dispatch-compatible (2026-06-09 post-mortem)
|
||||
LOG() { logger -t t3-autoupdate "$*"; echo "t3-autoupdate: $*"; }
|
||||
|
|
@ -27,17 +29,34 @@ fi
|
|||
LOG "re-pinned to $after (was $before); health-checking…"
|
||||
|
||||
# Health-check the NEW binary on a throwaway port/base-dir before trusting it.
|
||||
# Gate 1 = liveness (GET / -> 200); Gate 2 = the REAL pairing handshake t3-dispatch
|
||||
# performs (mint -> POST credential -> 200 + t3_session cookie), trying the same
|
||||
# endpoint fallback. Gate 2 catches a bootstrap-API rename / pairing regression.
|
||||
SMOKE_PORT=3799; SMOKE_DIR=$(mktemp -d)
|
||||
t3 serve --host 127.0.0.1 --port "$SMOKE_PORT" --base-dir "$SMOKE_DIR" >/dev/null 2>&1 &
|
||||
smoke=$!; ok=0
|
||||
smoke=$!; live=0; pair_ok=0
|
||||
for _ in $(seq 1 15); do
|
||||
[[ "$(curl -s -o /dev/null -w '%{http_code}' --max-time 5 "http://127.0.0.1:$SMOKE_PORT/" 2>/dev/null)" == "200" ]] && { ok=1; break; }
|
||||
[[ "$(curl -s -o /dev/null -w '%{http_code}' --max-time 5 "http://127.0.0.1:$SMOKE_PORT/" 2>/dev/null)" == "200" ]] && { live=1; break; }
|
||||
sleep 2
|
||||
done
|
||||
if [[ "$live" == "1" ]]; then
|
||||
cred=$(t3 auth pairing create --base-dir "$SMOKE_DIR" --ttl 5m --json 2>/dev/null \
|
||||
| tr -d '\n ' | sed -n 's/.*"credential":"\([^"]*\)".*/\1/p')
|
||||
if [[ -n "$cred" ]]; then
|
||||
for ep in /api/auth/browser-session /api/auth/bootstrap; do # mirror t3-dispatch's fallback
|
||||
hdr=$(curl -s -i --max-time 5 -X POST -H 'Content-Type: application/json' \
|
||||
-d "{\"credential\":\"$cred\"}" "http://127.0.0.1:$SMOKE_PORT$ep" 2>/dev/null)
|
||||
code=$(printf '%s' "$hdr" | sed -n '1s#.* \([0-9][0-9][0-9]\).*#\1#p')
|
||||
[[ "$code" == "404" ]] && continue # endpoint absent in this build — try the next
|
||||
printf '%s' "$hdr" | grep -qi '^set-cookie:[[:space:]]*t3_session=' && pair_ok=1
|
||||
break
|
||||
done
|
||||
fi
|
||||
fi
|
||||
kill "$smoke" 2>/dev/null; wait "$smoke" 2>/dev/null; rm -rf "$SMOKE_DIR"
|
||||
|
||||
if [[ "$ok" != "1" ]]; then
|
||||
LOG "HEALTH-CHECK FAILED for $after — rolling back to $before"
|
||||
if [[ "$live" != "1" || "$pair_ok" != "1" ]]; then
|
||||
LOG "HEALTH-CHECK FAILED for $after (live=$live pair=$pair_ok) — rolling back to $before"
|
||||
if [[ -n "$before" ]] && npm i -g "t3@$before" >/dev/null 2>&1; then
|
||||
LOG "rolled back to $before"
|
||||
else
|
||||
|
|
@ -45,7 +64,7 @@ if [[ "$ok" != "1" ]]; then
|
|||
fi
|
||||
exit 1
|
||||
fi
|
||||
LOG "health OK; restarting idle instances"
|
||||
LOG "health OK (live + pairing handshake); restarting idle instances"
|
||||
|
||||
# Restart only IDLE per-user instances; defer any with an active agent child.
|
||||
for unit in $(systemctl list-units --type=service --state=running --no-legend 't3-serve@*' | awk '{print $1}'); do
|
||||
|
|
|
|||
6
scripts/t3-backup-state.service
Normal file
6
scripts/t3-backup-state.service
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
[Unit]
|
||||
Description=Consistent backup of per-user t3 ~/.t3 state.sqlite (history + auth)
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart=/usr/local/bin/t3-backup-state
|
||||
43
scripts/t3-backup-state.sh
Normal file
43
scripts/t3-backup-state.sh
Normal file
|
|
@ -0,0 +1,43 @@
|
|||
#!/usr/bin/env bash
# Consistent online backup of each t3 user's ~/.t3 state.sqlite (chat/session
# history AND auth tables). ~/.t3 lives on the devvm local disk — NOT a K8s PVC and
# NOT in the 3-2-1 pipeline — so without this it is the only copy and a rebuild
# loses it. It also makes a t3 version bump REVERSIBLE: 0.0.25+ migrate the schema
# FORWARD (a one-way door), so a clean pre-bump backup turns rollback into a restore
# instead of per-user sqlite surgery (see runbooks/t3-version-bump.md). Runs as root
# via t3-backup-state.timer; the per-user VACUUM INTO runs AS the owning user so the
# live WAL/-shm files keep their owner and the running t3-serve is never perturbed.
#
# Environment overrides:
#   T3_BACKUP_DEST     backup root directory (default /var/backups/t3-state)
#   T3_BACKUP_KEEP     snapshots kept per user (default 14; must be an integer >= 1)
#   T3_BACKUP_TIMEOUT  per-user copy limit handed to timeout(1) (default 900)
# Exit status: 0 when every attempted backup succeeded (or there was nothing to back
# up), 1 when at least one user's backup failed.
set -uo pipefail
DEST="${T3_BACKUP_DEST:-/var/backups/t3-state}"
KEEP="${T3_BACKUP_KEEP:-14}"
MAP=/etc/ttyd-user-map
LOG() { logger -t t3-backup-state "$*"; echo "t3-backup-state: $*"; }

# A KEEP of 0 would make the retention step delete the snapshot just taken, and a
# non-numeric KEEP would break the arithmetic below — fall back to the default.
if ! [[ "$KEEP" =~ ^[1-9][0-9]*$ ]]; then
  LOG "WARN: invalid T3_BACKUP_KEEP='$KEEP'; using 14"
  KEEP=14
fi

ts=$(date +%Y%m%d-%H%M%S)
sq="'"   # a literal single quote, used to escape paths inside the SQL string literal

# RHS of each non-comment "authentik=os_user" line = an OS user owning a ~/.t3.
# Lines with an empty RHS are dropped so we never iterate over an empty username.
mapfile -t users < <(awk -F= '!/^[[:space:]]*#/ && NF==2 { gsub(/[[:space:]]/,"",$2); if ($2 != "") print $2 }' "$MAP" 2>/dev/null | sort -u)
[[ ${#users[@]} -gt 0 ]] || { LOG "no users in $MAP; nothing to back up"; exit 0; }

rc=0
for u in "${users[@]}"; do
  src="/home/$u/.t3/userdata/state.sqlite"
  if [[ ! -f "$src" ]]; then LOG "skip $u (no state.sqlite)"; continue; fi
  out="$DEST/$u"; dst="$out/state-$ts.sqlite"
  if ! install -d -o "$u" -g "$u" -m 0700 "$out"; then
    LOG "WARN: backup FAILED for $u (cannot create $out)"; rc=1; continue
  fi
  # VACUUM INTO takes a consistent read-snapshot copy — unlike .backup it does NOT
  # restart when the source is written mid-copy, so it finishes in a single pass even
  # for the actively-used instance (the admin's own live session, which .backup would
  # loop on forever). Run as the owning user so WAL access keeps the live serve happy.
  # timeout caps a pathologically-slow copy (huge DB + concurrent writes on a contended
  # disk) so the daily run can never wedge — it just logs + retries next cycle. The
  # daily 03:30 slot normally finds instances idle, where even a large DB copies fast.
  # The destination path is embedded in a SQL string literal, so double any single
  # quote in it; sqlite3's diagnostics are captured so a failure is explainable.
  dst_sql=${dst//$sq/$sq$sq}
  if err=$(runuser -u "$u" -- timeout "${T3_BACKUP_TIMEOUT:-900}" sqlite3 "$src" "VACUUM INTO '$dst_sql'" 2>&1) && [[ -s "$dst" ]]; then
    LOG "backed up $u -> $dst ($(stat -c%s "$dst" 2>/dev/null) bytes)"
  else
    err=${err//$'\n'/ }
    LOG "WARN: backup FAILED for $u ($src)${err:+: $err}"; rc=1; rm -f "$dst"
  fi
  # retention: keep newest $KEEP per user (newline-delimited so paths with spaces survive)
  ls -1t "$out"/state-*.sqlite 2>/dev/null | tail -n +$((KEEP+1)) | xargs -r -d '\n' rm -f
done
LOG "done (rc=$rc)"
exit $rc
|
||||
10
scripts/t3-backup-state.timer
Normal file
10
scripts/t3-backup-state.timer
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
[Unit]
|
||||
Description=Daily t3 state.sqlite backup (the only copy of ~/.t3; enables version-bump rollback)
|
||||
|
||||
[Timer]
|
||||
OnCalendar=*-*-* 03:30:00
|
||||
RandomizedDelaySec=20m
|
||||
Persistent=true
|
||||
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
|
|
@ -113,9 +113,42 @@ func isDocumentNav(r *http.Request) bool {
|
|||
return strings.Contains(r.Header.Get("Accept"), "text/html")
|
||||
}
|
||||
|
||||
// pairEndpoints are the instance's session-bootstrap paths in preference order.
|
||||
// t3 renamed /api/auth/bootstrap -> /api/auth/browser-session in 0.0.25; trying the
|
||||
// new name first and falling back to the old lets ONE dispatch binary pair against
|
||||
// either version — so the t3 pin can move forward (and survive a rolling-restart
|
||||
// skew where some instances are already on the new version) without a 502 storm.
|
||||
var pairEndpoints = []string{"/api/auth/browser-session", "/api/auth/bootstrap"}
|
||||
|
||||
// exchangeCredential POSTs the pairing credential to the user's instance, trying
|
||||
// each pairEndpoint in turn. A 404 means "absent in this t3 version" -> try the
|
||||
// next; any other status is that endpoint's verdict, returned as-is. Caller owns
|
||||
// resp.Body.
|
||||
func exchangeCredential(port int, credential string) (*http.Response, error) {
|
||||
body, _ := json.Marshal(map[string]string{"credential": credential})
|
||||
var lastErr error
|
||||
for _, ep := range pairEndpoints {
|
||||
resp, err := http.Post(fmt.Sprintf("http://127.0.0.1:%d%s", port, ep),
|
||||
"application/json", bytes.NewReader(body))
|
||||
if err != nil {
|
||||
lastErr = err
|
||||
continue
|
||||
}
|
||||
if resp.StatusCode == http.StatusNotFound {
|
||||
resp.Body.Close() // endpoint absent in this t3 version — try the next
|
||||
continue
|
||||
}
|
||||
return resp, nil
|
||||
}
|
||||
if lastErr != nil {
|
||||
return nil, lastErr
|
||||
}
|
||||
return nil, fmt.Errorf("no pairing endpoint accepted the request (all returned 404)")
|
||||
}
|
||||
|
||||
// autoPair mints a one-time pairing token for the user's instance (as that OS
|
||||
// user, via the scoped sudoers entry) and exchanges it at the instance's
|
||||
// /api/auth/bootstrap, relaying the returned t3_session Set-Cookie to the browser.
|
||||
// user, via the scoped sudoers entry) and exchanges it at the instance's pairing
|
||||
// endpoint, relaying the returned t3_session Set-Cookie to the browser.
|
||||
func autoPair(e entry, w http.ResponseWriter, r *http.Request) {
|
||||
// t3-mint (root, via scoped sudoers) validates the OS user is in
|
||||
// /etc/ttyd-user-map, then mints as that user. The dispatch service itself
|
||||
|
|
@ -133,16 +166,15 @@ func autoPair(e entry, w http.ResponseWriter, r *http.Request) {
|
|||
http.Error(w, "unparseable pairing output", http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
body, _ := json.Marshal(map[string]string{"credential": pc.Credential})
|
||||
resp, err := http.Post(fmt.Sprintf("http://127.0.0.1:%d/api/auth/bootstrap", e.Port),
|
||||
"application/json", bytes.NewReader(body))
|
||||
resp, err := exchangeCredential(e.Port, pc.Credential)
|
||||
if err != nil {
|
||||
log.Printf("pairing exchange for %s failed: %v", e.OsUser, err)
|
||||
http.Error(w, "bootstrap request failed", http.StatusBadGateway)
|
||||
return
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
log.Printf("bootstrap for %s returned %d", e.OsUser, resp.StatusCode)
|
||||
log.Printf("pairing for %s returned %d", e.OsUser, resp.StatusCode)
|
||||
http.Error(w, "bootstrap rejected", http.StatusBadGateway)
|
||||
return
|
||||
}
|
||||
|
|
|
|||
|
|
@ -117,6 +117,8 @@ func fakeInstance(authenticated bool, bootstrapCalled *bool) *httptest.Server {
|
|||
}
|
||||
http.SetCookie(w, &http.Cookie{Name: cookieName, Value: "fresh", Path: "/"})
|
||||
_, _ = w.Write([]byte(`{"authenticated":true}`))
|
||||
case "/api/auth/browser-session":
|
||||
http.NotFound(w, r) // models a 0.0.24 instance: the 0.0.25 endpoint is absent
|
||||
default:
|
||||
_, _ = w.Write([]byte("APP"))
|
||||
}
|
||||
|
|
@ -198,3 +200,61 @@ func TestHandlerProxiesXHREvenIfCookieInvalid(t *testing.T) {
|
|||
t.Fatalf("XHR should proxy through, got %d %q", w.Code, w.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
// pairInstance simulates a t3 instance that exposes pairing at exactly one path
|
||||
// (200 + t3_session) and 404s the other known path — modeling the 0.0.25 rename of
|
||||
// /api/auth/bootstrap -> /api/auth/browser-session. records which path was hit.
|
||||
func pairInstance(pairPath string, hit *string) *httptest.Server {
|
||||
return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
switch r.URL.Path {
|
||||
case "/api/auth/browser-session", "/api/auth/bootstrap":
|
||||
if r.URL.Path != pairPath {
|
||||
http.NotFound(w, r) // endpoint absent in this t3 version
|
||||
return
|
||||
}
|
||||
if hit != nil {
|
||||
*hit = r.URL.Path
|
||||
}
|
||||
http.SetCookie(w, &http.Cookie{Name: cookieName, Value: "fresh", Path: "/"})
|
||||
_, _ = w.Write([]byte(`{"authenticated":true}`))
|
||||
default:
|
||||
http.NotFound(w, r)
|
||||
}
|
||||
}))
|
||||
}
|
||||
|
||||
// TestAutoPairAcrossVersions: one dispatch binary must pair against BOTH the
|
||||
// 0.0.24 endpoint (/api/auth/bootstrap) and the 0.0.25 one (/api/auth/browser-session),
|
||||
// so the pin can move forward (and survive rolling-restart skew) without a 502 storm.
|
||||
func TestAutoPairAcrossVersions(t *testing.T) {
|
||||
orig := mintToken
|
||||
mintToken = func(string) ([]byte, error) { return []byte(`{"credential":"tok"}`), nil }
|
||||
defer func() { mintToken = orig }()
|
||||
|
||||
for _, tc := range []struct{ name, pairPath string }{
|
||||
{"0.0.25 browser-session", "/api/auth/browser-session"},
|
||||
{"0.0.24 bootstrap", "/api/auth/bootstrap"},
|
||||
} {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
var hit string
|
||||
ts := pairInstance(tc.pairPath, &hit)
|
||||
defer ts.Close()
|
||||
setTable(portOf(t, ts))
|
||||
|
||||
r := httptest.NewRequest("GET", "/", nil)
|
||||
r.Header.Set("X-authentik-username", "vbarzin@gmail.com") // no cookie -> autoPair
|
||||
w := httptest.NewRecorder()
|
||||
handler(w, r)
|
||||
|
||||
if w.Code != http.StatusFound {
|
||||
t.Fatalf("want 302 re-pair, got %d body=%q", w.Code, w.Body.String())
|
||||
}
|
||||
if hit != tc.pairPath {
|
||||
t.Fatalf("want pairing via %s, hit=%q", tc.pairPath, hit)
|
||||
}
|
||||
if cs := w.Result().Cookies(); len(cs) == 0 || cs[0].Value != "fresh" {
|
||||
t.Fatalf("want fresh t3_session relayed, got %+v", cs)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue