t3: session-auth detection for the gated nightly tracker (dispatch fallback logging + Loki alerts)
Some checks failed
ci/woodpecker/push/default Pipeline failed
Some checks failed
ci/woodpecker/push/default Pipeline failed
Before auto-tracking t3 nightly builds (Viktor's call, risk accepted), stand up the detection that was missing on 2026-06-09 — when an auto-pulled nightly broke pairing for ALL users and nothing alerted. Viktor's explicit requirement: make sure session auth keeps working and revert if the pairing fallback/failure rate climbs. This is phase 0 (detection) of that work. - t3-dispatch: exchangeCredential now reports WHICH pairing endpoint answered, and autoPair logs every outcome (paired user=.. endpoint=.. fallback=..) — so the real-user browser-session->bootstrap fallback rate is observable. A non-zero rate flags that a build moved the pairing API (the 2026-06-09 class). - Loki ruler alerts (devvm journal -> Alertmanager -> Slack): T3PairingBroken (real users failing to pair), T3PairFallbackHigh (build moved the pairing API), T3AutoUpdateRolledBack / RollbackFailed / Frozen (enforcer outcomes). Closes the post-mortem's open "nothing monitors end-to-end pairing" detection gap. The existing t3-probe only checks GET /api/auth/session==200, which stays 200 even when pairing is dead, so it never caught the outage class. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
e783cae2cb
commit
994d305d04
3 changed files with 132 additions and 8 deletions
|
|
@ -122,9 +122,11 @@ var pairEndpoints = []string{"/api/auth/browser-session", "/api/auth/bootstrap"}
|
|||
|
||||
// exchangeCredential POSTs the pairing credential to the user's instance, trying
|
||||
// each pairEndpoint in turn. A 404 means "absent in this t3 version" -> try the
|
||||
// next; any other status is that endpoint's verdict, returned as-is. Caller owns
|
||||
// resp.Body.
|
||||
func exchangeCredential(port int, credential string) (*http.Response, error) {
|
||||
// next; any other status is that endpoint's verdict, returned as-is. It also
|
||||
// returns WHICH endpoint answered, so the caller can log the browser-session ->
|
||||
// bootstrap fallback rate (a non-zero rate flags that the running t3 build moved
|
||||
// the pairing API — the 2026-06-09 contract-drift class). Caller owns resp.Body.
|
||||
func exchangeCredential(port int, credential string) (*http.Response, string, error) {
|
||||
body, _ := json.Marshal(map[string]string{"credential": credential})
|
||||
var lastErr error
|
||||
for _, ep := range pairEndpoints {
|
||||
|
|
@ -138,12 +140,12 @@ func exchangeCredential(port int, credential string) (*http.Response, error) {
|
|||
resp.Body.Close() // endpoint absent in this t3 version — try the next
|
||||
continue
|
||||
}
|
||||
return resp, nil
|
||||
return resp, ep, nil
|
||||
}
|
||||
if lastErr != nil {
|
||||
return nil, lastErr
|
||||
return nil, "", lastErr
|
||||
}
|
||||
return nil, fmt.Errorf("no pairing endpoint accepted the request (all returned 404)")
|
||||
return nil, "", fmt.Errorf("no pairing endpoint accepted the request (all returned 404)")
|
||||
}
|
||||
|
||||
// autoPair mints a one-time pairing token for the user's instance (as that OS
|
||||
|
|
@ -166,7 +168,7 @@ func autoPair(e entry, w http.ResponseWriter, r *http.Request) {
|
|||
http.Error(w, "unparseable pairing output", http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
resp, err := exchangeCredential(e.Port, pc.Credential)
|
||||
resp, ep, err := exchangeCredential(e.Port, pc.Credential)
|
||||
if err != nil {
|
||||
log.Printf("pairing exchange for %s failed: %v", e.OsUser, err)
|
||||
http.Error(w, "bootstrap request failed", http.StatusBadGateway)
|
||||
|
|
@ -174,13 +176,17 @@ func autoPair(e entry, w http.ResponseWriter, r *http.Request) {
|
|||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
log.Printf("pairing for %s returned %d", e.OsUser, resp.StatusCode)
|
||||
log.Printf("pairing for %s returned %d (endpoint=%s)", e.OsUser, resp.StatusCode, ep)
|
||||
http.Error(w, "bootstrap rejected", http.StatusBadGateway)
|
||||
return
|
||||
}
|
||||
for _, c := range resp.Cookies() {
|
||||
http.SetCookie(w, c) // relays t3_session (HttpOnly; Path=/; SameSite=Lax)
|
||||
}
|
||||
// Success line is the steady-state signal: endpoint= which pairing path won,
|
||||
// fallback=true iff we fell back off the first-preference endpoint (running
|
||||
// t3 build moved the pairing API). The Loki T3PairFallbackHigh rule alerts on fallback=true lines.
|
||||
log.Printf("paired user=%s endpoint=%s fallback=%t", e.OsUser, ep, ep != pairEndpoints[0])
|
||||
http.Redirect(w, r, "/", http.StatusFound)
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -262,6 +262,42 @@ func TestAutoPairAcrossVersions(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
// TestExchangeCredentialReportsEndpoint: exchangeCredential must report WHICH
|
||||
// pairing endpoint accepted the credential, so the dispatch can log it and we
|
||||
// can alert on the browser-session -> bootstrap fallback rate (a non-zero rate
|
||||
// means the running t3 build moved/renamed the pairing API — contract drift, the
|
||||
// 2026-06-09 failure class). fallback = endpoint is not the first-preference one.
|
||||
func TestExchangeCredentialReportsEndpoint(t *testing.T) {
|
||||
for _, tc := range []struct {
|
||||
name, pairPath, wantEP string
|
||||
wantFallback bool
|
||||
}{
|
||||
{"0.0.25 browser-session (primary)", "/api/auth/browser-session", "/api/auth/browser-session", false},
|
||||
{"0.0.24 bootstrap (fallback)", "/api/auth/bootstrap", "/api/auth/bootstrap", true},
|
||||
} {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
var hit string
|
||||
ts := pairInstance(tc.pairPath, &hit)
|
||||
defer ts.Close()
|
||||
|
||||
resp, ep, err := exchangeCredential(portOf(t, ts), "tok")
|
||||
if err != nil {
|
||||
t.Fatalf("exchangeCredential: %v", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
t.Fatalf("status = %d, want 200", resp.StatusCode)
|
||||
}
|
||||
if ep != tc.wantEP {
|
||||
t.Fatalf("endpoint = %q, want %q", ep, tc.wantEP)
|
||||
}
|
||||
if gotFallback := ep != pairEndpoints[0]; gotFallback != tc.wantFallback {
|
||||
t.Fatalf("fallback = %v, want %v", gotFallback, tc.wantFallback)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestProbeHealthz(t *testing.T) {
|
||||
mux := http.NewServeMux()
|
||||
registerProbe(mux)
|
||||
|
|
|
|||
|
|
@ -194,6 +194,88 @@ resource "kubernetes_config_map" "loki_alert_rules" {
|
|||
},
|
||||
]
|
||||
},
|
||||
{
|
||||
# t3 session-auth + auto-upgrade health (devvm host scripts → journald →
|
||||
# Loki). Backstops the gated-nightly t3 tracker: the dispatch logs every
|
||||
# real-user pairing outcome (success endpoint + fallback) and the enforcer
|
||||
# logs every rollback/freeze. These catch a bad nightly that broke pairing
|
||||
# for real users between the tracker's own bump-time gate runs — the
|
||||
# 2026-06-09 failure class (mint/bootstrap broke, all users on the pair
|
||||
# prompt). Route: Loki ruler → Alertmanager → default #alerts Slack.
|
||||
# Runbook: docs/runbooks/t3-version-bump.md.
|
||||
name = "t3 Auth & Upgrades"
|
||||
rules = [
|
||||
{
|
||||
# Real users failing to pair: mint error, exchange transport error, or
|
||||
# a non-200 from the instance pairing API. Threshold >3/10m rides out a
|
||||
# benign single-instance restart race; sustained = pairing is broken.
|
||||
alert = "T3PairingBroken"
|
||||
expr = "sum(count_over_time({job=\"devvm-journal\", unit=\"t3-dispatch.service\"} |~ \"mint for .* failed|pairing exchange for .* failed|pairing for .* returned [0-9]\" [10m])) > 3"
|
||||
for = "5m"
|
||||
labels = { severity = "critical" }
|
||||
annotations = {
|
||||
summary = "t3 dispatch pairing is failing for real users (>3/10m)"
|
||||
description = "t3-dispatch is failing to mint/exchange session cookies — users land on the t3 pair prompt instead of their workspace. Likely a bad t3 build broke the pairing API/schema (2026-06-09 class). Freeze the tracker (touch /etc/t3-autoupdate.freeze) and roll back per the runbook."
|
||||
runbook = "docs/runbooks/t3-version-bump.md"
|
||||
}
|
||||
},
|
||||
{
|
||||
# The dispatch fell back off its first-preference pairing endpoint
|
||||
# (browser-session) to the legacy one — the running build moved/renamed
|
||||
# the pairing API. Pin-compatible today (the fallback works), but it
|
||||
# signals contract drift that a future build could break entirely.
|
||||
alert = "T3PairFallbackHigh"
|
||||
expr = "sum(count_over_time({job=\"devvm-journal\", unit=\"t3-dispatch.service\"} |~ \"paired .* fallback=true\" [30m])) > 0"
|
||||
for = "0m"
|
||||
labels = { severity = "warning" }
|
||||
annotations = {
|
||||
summary = "t3 dispatch is using the FALLBACK pairing endpoint — t3 moved the pairing API"
|
||||
description = "A t3 build is pairing via the legacy /api/auth/bootstrap because the preferred /api/auth/browser-session 404s. Still works via fallback, but add the new endpoint to pairEndpoints in scripts/t3-dispatch/main.go before a future build drops the legacy one."
|
||||
runbook = "docs/runbooks/t3-version-bump.md"
|
||||
}
|
||||
},
|
||||
{
|
||||
# The enforcer's health-check failed a build and auto-rolled-back the
|
||||
# binary. The gate worked — but a bad nightly shipped, so you should know.
|
||||
alert = "T3AutoUpdateRolledBack"
|
||||
expr = "sum(count_over_time({job=\"devvm-journal\", identifier=\"t3-autoupdate\"} |~ \"rolling back|rolled back\" [15m])) > 0"
|
||||
for = "0m"
|
||||
labels = { severity = "warning" }
|
||||
annotations = {
|
||||
summary = "t3 auto-update rolled back a bad build (gate worked)"
|
||||
description = "The t3 enforcer installed a new build, its pairing health-check failed, and it auto-rolled-back. Investigate the bad build before the next cycle retries it; pin T3_PIN to a known-good if it recurs."
|
||||
runbook = "docs/runbooks/t3-version-bump.md"
|
||||
}
|
||||
},
|
||||
{
|
||||
# Rollback itself failed (npm couldn't reinstall the previous build):
|
||||
# the box may be left on a broken t3. Manual fix needed.
|
||||
alert = "T3AutoUpdateRollbackFailed"
|
||||
expr = "sum(count_over_time({job=\"devvm-journal\", identifier=\"t3-autoupdate\"} |~ \"ROLLBACK FAILED\" [15m])) > 0"
|
||||
for = "0m"
|
||||
labels = { severity = "critical" }
|
||||
annotations = {
|
||||
summary = "t3 auto-update rollback FAILED — t3 may be broken on the devvm"
|
||||
description = "The enforcer detected a bad build but could not reinstall the previous version. t3 may be broken for all users. Fix manually per the runbook (set T3_PIN to last-good, npm i -g, restore state if migrated)."
|
||||
runbook = "docs/runbooks/t3-version-bump.md"
|
||||
}
|
||||
},
|
||||
{
|
||||
# The tracker refused to advance (pre-run auth gate tripped, or the
|
||||
# /etc/t3-autoupdate.freeze switch is set). Surfaces a stuck-on-purpose
|
||||
# tracker so it isn't silently frozen forever.
|
||||
alert = "T3AutoUpdateFrozen"
|
||||
expr = "sum(count_over_time({job=\"devvm-journal\", identifier=\"t3-autoupdate\"} |~ \"FROZEN\" [25h])) > 0"
|
||||
for = "0m"
|
||||
labels = { severity = "warning" }
|
||||
annotations = {
|
||||
summary = "t3 auto-update is FROZEN (not tracking nightly)"
|
||||
description = "The t3 tracker froze — either the pre-run pairing gate tripped or /etc/t3-autoupdate.freeze is set. t3 is held at the last-good pin and is NOT picking up new builds until cleared. Confirm pairing is healthy, then remove the freeze."
|
||||
runbook = "docs/runbooks/t3-version-bump.md"
|
||||
}
|
||||
},
|
||||
]
|
||||
},
|
||||
{
|
||||
# Wave 1 security alerts (beads code-8ywc). Routed via Loki ruler →
|
||||
# prometheus-alertmanager → #security Slack receiver. Allowlist CIDRs:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue