t3: session-auth detection for the gated nightly tracker (dispatch fallback logging + Loki alerts)
Some checks failed
ci/woodpecker/push/default Pipeline failed

Before auto-tracking t3 nightly builds (Viktor's call, risk accepted), stand up
the detection that was missing on 2026-06-09 — when an auto-pulled nightly broke
pairing for ALL users and nothing alerted. Viktor's explicit requirement: make
sure session auth keeps working and revert if the pairing fallback/failure rate
climbs. This is phase 0 (detection) of that work.

- t3-dispatch: exchangeCredential now reports WHICH pairing endpoint answered,
  and autoPair logs every outcome (paired user=.. endpoint=.. fallback=..) — so
  the real-user browser-session->bootstrap fallback rate is observable. A
  non-zero rate flags that a build moved the pairing API (the 2026-06-09 class).
- Loki ruler alerts (devvm journal -> Alertmanager -> Slack): T3PairingBroken
  (real users failing to pair), T3PairFallbackHigh (build moved the pairing API),
  T3AutoUpdateRolledBack / RollbackFailed / Frozen (enforcer outcomes). Closes
  the post-mortem's open "nothing monitors end-to-end pairing" detection gap.

The existing t3-probe only checks GET /api/auth/session==200, which stays 200
even when pairing is dead, so it never caught the outage class.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Viktor Barzin 2026-06-16 09:56:55 +00:00
parent e783cae2cb
commit 994d305d04
3 changed files with 132 additions and 8 deletions

View file

@ -122,9 +122,11 @@ var pairEndpoints = []string{"/api/auth/browser-session", "/api/auth/bootstrap"}
// exchangeCredential POSTs the pairing credential to the user's instance, trying
// each pairEndpoint in turn. A 404 means "absent in this t3 version" -> try the
// next; any other status is that endpoint's verdict, returned as-is. Caller owns
// resp.Body.
func exchangeCredential(port int, credential string) (*http.Response, error) {
// next; any other status is that endpoint's verdict, returned as-is. It also
// returns WHICH endpoint answered, so the caller can log the browser-session ->
// bootstrap fallback rate (a non-zero rate flags that the running t3 build moved
// the pairing API — the 2026-06-09 contract-drift class). Caller owns resp.Body.
func exchangeCredential(port int, credential string) (*http.Response, string, error) {
body, _ := json.Marshal(map[string]string{"credential": credential})
var lastErr error
for _, ep := range pairEndpoints {
@ -138,12 +140,12 @@ func exchangeCredential(port int, credential string) (*http.Response, error) {
resp.Body.Close() // endpoint absent in this t3 version — try the next
continue
}
return resp, nil
return resp, ep, nil
}
if lastErr != nil {
return nil, lastErr
return nil, "", lastErr
}
return nil, fmt.Errorf("no pairing endpoint accepted the request (all returned 404)")
return nil, "", fmt.Errorf("no pairing endpoint accepted the request (all returned 404)")
}
// autoPair mints a one-time pairing token for the user's instance (as that OS
@ -166,7 +168,7 @@ func autoPair(e entry, w http.ResponseWriter, r *http.Request) {
http.Error(w, "unparseable pairing output", http.StatusInternalServerError)
return
}
resp, err := exchangeCredential(e.Port, pc.Credential)
resp, ep, err := exchangeCredential(e.Port, pc.Credential)
if err != nil {
log.Printf("pairing exchange for %s failed: %v", e.OsUser, err)
http.Error(w, "bootstrap request failed", http.StatusBadGateway)
@ -174,13 +176,17 @@ func autoPair(e entry, w http.ResponseWriter, r *http.Request) {
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
log.Printf("pairing for %s returned %d", e.OsUser, resp.StatusCode)
log.Printf("pairing for %s returned %d (endpoint=%s)", e.OsUser, resp.StatusCode, ep)
http.Error(w, "bootstrap rejected", http.StatusBadGateway)
return
}
for _, c := range resp.Cookies() {
http.SetCookie(w, c) // relays t3_session (HttpOnly; Path=/; SameSite=Lax)
}
// Success line is the steady-state signal: endpoint= which pairing path won,
// fallback=true iff we fell back off the first-preference endpoint (running
// t3 build moved the pairing API). t3-probe / Loki alert on the fallback rate.
log.Printf("paired user=%s endpoint=%s fallback=%t", e.OsUser, ep, ep != pairEndpoints[0])
http.Redirect(w, r, "/", http.StatusFound)
}

View file

@ -262,6 +262,42 @@ func TestAutoPairAcrossVersions(t *testing.T) {
}
}
// TestExchangeCredentialReportsEndpoint verifies that exchangeCredential hands
// back, next to the response, the pairing endpoint that answered. The dispatch
// logs that endpoint so the browser-session -> bootstrap fallback rate can be
// alerted on: a non-zero rate means the running t3 build moved or renamed the
// pairing API (contract drift, the 2026-06-09 failure class).
//
// "Fallback" is derived the same way the caller derives it: the reported
// endpoint differs from pairEndpoints[0], the first-preference endpoint.
func TestExchangeCredentialReportsEndpoint(t *testing.T) {
	type scenario struct {
		name         string // subtest name
		pairPath     string // path handed to the pairInstance stub
		wantEP       string // endpoint exchangeCredential must report
		wantFallback bool   // whether wantEP counts as a fallback
	}
	scenarios := []scenario{
		{
			name:         "0.0.25 browser-session (primary)",
			pairPath:     "/api/auth/browser-session",
			wantEP:       "/api/auth/browser-session",
			wantFallback: false,
		},
		{
			name:         "0.0.24 bootstrap (fallback)",
			pairPath:     "/api/auth/bootstrap",
			wantEP:       "/api/auth/bootstrap",
			wantFallback: true,
		},
	}

	for _, sc := range scenarios {
		t.Run(sc.name, func(t *testing.T) {
			// hit is handed to the stub (presumably to record the path it
			// served — confirm against pairInstance); it is not inspected here.
			var hit string
			srv := pairInstance(sc.pairPath, &hit)
			defer srv.Close()

			resp, gotEP, err := exchangeCredential(portOf(t, srv), "tok")
			if err != nil {
				t.Fatalf("exchangeCredential: %v", err)
			}
			defer resp.Body.Close()

			if resp.StatusCode != http.StatusOK {
				t.Fatalf("status = %d, want 200", resp.StatusCode)
			}
			if gotEP != sc.wantEP {
				t.Fatalf("endpoint = %q, want %q", gotEP, sc.wantEP)
			}

			// Mirror the production rule: anything other than the
			// first-preference endpoint is a fallback.
			gotFallback := gotEP != pairEndpoints[0]
			if gotFallback != sc.wantFallback {
				t.Fatalf("fallback = %v, want %v", gotFallback, sc.wantFallback)
			}
		})
	}
}
func TestProbeHealthz(t *testing.T) {
mux := http.NewServeMux()
registerProbe(mux)