diff --git a/scripts/t3-dispatch/main.go b/scripts/t3-dispatch/main.go index 6b85ad54..05e59304 100644 --- a/scripts/t3-dispatch/main.go +++ b/scripts/t3-dispatch/main.go @@ -122,9 +122,11 @@ var pairEndpoints = []string{"/api/auth/browser-session", "/api/auth/bootstrap"} // exchangeCredential POSTs the pairing credential to the user's instance, trying // each pairEndpoint in turn. A 404 means "absent in this t3 version" -> try the -// next; any other status is that endpoint's verdict, returned as-is. Caller owns -// resp.Body. -func exchangeCredential(port int, credential string) (*http.Response, error) { +// next; any other status is that endpoint's verdict, returned as-is. It also +// returns WHICH endpoint answered, so the caller can log the browser-session -> +// bootstrap fallback rate (a non-zero rate flags that the running t3 build moved +// the pairing API — the 2026-06-09 contract-drift class). Caller owns resp.Body. +func exchangeCredential(port int, credential string) (*http.Response, string, error) { body, _ := json.Marshal(map[string]string{"credential": credential}) var lastErr error for _, ep := range pairEndpoints { @@ -138,12 +140,12 @@ func exchangeCredential(port int, credential string) (*http.Response, error) { resp.Body.Close() // endpoint absent in this t3 version — try the next continue } - return resp, nil + return resp, ep, nil } if lastErr != nil { - return nil, lastErr + return nil, "", lastErr } - return nil, fmt.Errorf("no pairing endpoint accepted the request (all returned 404)") + return nil, "", fmt.Errorf("no pairing endpoint accepted the request (all returned 404)") } // autoPair mints a one-time pairing token for the user's instance (as that OS @@ -166,7 +168,7 @@ func autoPair(e entry, w http.ResponseWriter, r *http.Request) { http.Error(w, "unparseable pairing output", http.StatusInternalServerError) return } - resp, err := exchangeCredential(e.Port, pc.Credential) + resp, ep, err := exchangeCredential(e.Port, pc.Credential) if 
err != nil { log.Printf("pairing exchange for %s failed: %v", e.OsUser, err) http.Error(w, "bootstrap request failed", http.StatusBadGateway) @@ -174,13 +176,17 @@ func autoPair(e entry, w http.ResponseWriter, r *http.Request) { } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { - log.Printf("pairing for %s returned %d", e.OsUser, resp.StatusCode) + log.Printf("pairing for %s returned %d (endpoint=%s)", e.OsUser, resp.StatusCode, ep) http.Error(w, "bootstrap rejected", http.StatusBadGateway) return } for _, c := range resp.Cookies() { http.SetCookie(w, c) // relays t3_session (HttpOnly; Path=/; SameSite=Lax) } + // Success line is the steady-state signal: endpoint= which pairing path won, + // fallback=true iff we fell back off the first-preference endpoint (running + // t3 build moved the pairing API). t3-probe / Loki alert on the fallback rate. + log.Printf("paired user=%s endpoint=%s fallback=%t", e.OsUser, ep, ep != pairEndpoints[0]) http.Redirect(w, r, "/", http.StatusFound) } diff --git a/scripts/t3-dispatch/main_test.go b/scripts/t3-dispatch/main_test.go index 407bdd92..8d021a24 100644 --- a/scripts/t3-dispatch/main_test.go +++ b/scripts/t3-dispatch/main_test.go @@ -262,6 +262,42 @@ func TestAutoPairAcrossVersions(t *testing.T) { } } +// TestExchangeCredentialReportsEndpoint: exchangeCredential must report WHICH +// pairing endpoint accepted the credential, so the dispatch can log it and we +// can alert on the browser-session -> bootstrap fallback rate (a non-zero rate +// means the running t3 build moved/renamed the pairing API — contract drift, the +// 2026-06-09 failure class). fallback = endpoint is not the first-preference one. 
+func TestExchangeCredentialReportsEndpoint(t *testing.T) { + for _, tc := range []struct { + name, pairPath, wantEP string + wantFallback bool + }{ + {"0.0.25 browser-session (primary)", "/api/auth/browser-session", "/api/auth/browser-session", false}, + {"0.0.24 bootstrap (fallback)", "/api/auth/bootstrap", "/api/auth/bootstrap", true}, + } { + t.Run(tc.name, func(t *testing.T) { + var hit string + ts := pairInstance(tc.pairPath, &hit) + defer ts.Close() + + resp, ep, err := exchangeCredential(portOf(t, ts), "tok") + if err != nil { + t.Fatalf("exchangeCredential: %v", err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + t.Fatalf("status = %d, want 200", resp.StatusCode) + } + if ep != tc.wantEP { + t.Fatalf("endpoint = %q, want %q", ep, tc.wantEP) + } + if gotFallback := ep != pairEndpoints[0]; gotFallback != tc.wantFallback { + t.Fatalf("fallback = %v, want %v", gotFallback, tc.wantFallback) + } + }) + } +} + func TestProbeHealthz(t *testing.T) { mux := http.NewServeMux() registerProbe(mux) diff --git a/stacks/monitoring/modules/monitoring/loki.tf b/stacks/monitoring/modules/monitoring/loki.tf index bf333463..d9d7f7a7 100644 --- a/stacks/monitoring/modules/monitoring/loki.tf +++ b/stacks/monitoring/modules/monitoring/loki.tf @@ -194,6 +194,88 @@ resource "kubernetes_config_map" "loki_alert_rules" { }, ] }, + { + # t3 session-auth + auto-upgrade health (devvm host scripts → journald → + # Loki). Backstops the gated-nightly t3 tracker: the dispatch logs every + # real-user pairing outcome (success endpoint + fallback) and the enforcer + # logs every rollback/freeze. These catch a bad nightly that broke pairing + # for real users between the tracker's own bump-time gate runs — the + # 2026-06-09 failure class (mint/bootstrap broke, all users on the pair + # prompt). Route: Loki ruler → Alertmanager → default #alerts Slack. + # Runbook: docs/runbooks/t3-version-bump.md. 
+ name = "t3 Auth & Upgrades" + rules = [ + { + # Real users failing to pair: mint error, exchange transport error, or + # a non-200 status from the instance pairing API. Threshold >3/10m rides out a + # benign single-instance restart race; sustained = pairing is broken. + alert = "T3PairingBroken" + expr = "sum(count_over_time({job=\"devvm-journal\", unit=\"t3-dispatch.service\"} |~ \"mint for .* failed|pairing exchange for .* failed|pairing for .* returned [0-9]\" [10m])) > 3" + for = "5m" + labels = { severity = "critical" } + annotations = { + summary = "t3 dispatch pairing is failing for real users (>3/10m)" + description = "t3-dispatch is failing to mint/exchange session cookies — users land on the t3 pair prompt instead of their workspace. Likely a bad t3 build broke the pairing API/schema (2026-06-09 class). Freeze the tracker (touch /etc/t3-autoupdate.freeze) and roll back per the runbook." + runbook = "docs/runbooks/t3-version-bump.md" + } + }, + { + # The dispatch fell back off its first-preference pairing endpoint + # (browser-session) to the legacy one — the running build moved/renamed + # the pairing API. Pin-compatible today (the fallback works), but it + # signals contract drift that a future build could break entirely. + alert = "T3PairFallbackHigh" + expr = "sum(count_over_time({job=\"devvm-journal\", unit=\"t3-dispatch.service\"} |~ \"paired .* fallback=true\" [30m])) > 0" + for = "0m" + labels = { severity = "warning" } + annotations = { + summary = "t3 dispatch is using the FALLBACK pairing endpoint — t3 moved the pairing API" + description = "A t3 build is pairing via the legacy /api/auth/bootstrap because the preferred /api/auth/browser-session 404s. Still works via fallback, but add the new endpoint to pairEndpoints in scripts/t3-dispatch/main.go before a future build drops the legacy one." + runbook = "docs/runbooks/t3-version-bump.md" + } + }, + { + # The enforcer's health-check failed a build and auto-rolled-back the + # binary.
The gate worked — but a bad nightly shipped, so you should know. + alert = "T3AutoUpdateRolledBack" + expr = "sum(count_over_time({job=\"devvm-journal\", identifier=\"t3-autoupdate\"} |~ \"rolling back|rolled back\" [15m])) > 0" + for = "0m" + labels = { severity = "warning" } + annotations = { + summary = "t3 auto-update rolled back a bad build (gate worked)" + description = "The t3 enforcer installed a new build, its pairing health-check failed, and it auto-rolled-back. Investigate the bad build before the next cycle retries it; pin T3_PIN to a known-good if it recurs." + runbook = "docs/runbooks/t3-version-bump.md" + } + }, + { + # Rollback itself failed (npm couldn't reinstall the previous build): + # the box may be left on a broken t3. Manual fix needed. + alert = "T3AutoUpdateRollbackFailed" + expr = "sum(count_over_time({job=\"devvm-journal\", identifier=\"t3-autoupdate\"} |~ \"ROLLBACK FAILED\" [15m])) > 0" + for = "0m" + labels = { severity = "critical" } + annotations = { + summary = "t3 auto-update rollback FAILED — t3 may be broken on the devvm" + description = "The enforcer detected a bad build but could not reinstall the previous version. t3 may be broken for all users. Fix manually per the runbook (set T3_PIN to last-good, npm i -g, restore state if migrated)." + runbook = "docs/runbooks/t3-version-bump.md" + } + }, + { + # The tracker refused to advance (pre-run auth gate tripped, or the + # /etc/t3-autoupdate.freeze switch is set). Surfaces a stuck-on-purpose + # tracker so it isn't silently frozen forever. + alert = "T3AutoUpdateFrozen" + expr = "sum(count_over_time({job=\"devvm-journal\", identifier=\"t3-autoupdate\"} |~ \"FROZEN\" [25h])) > 0" + for = "0m" + labels = { severity = "warning" } + annotations = { + summary = "t3 auto-update is FROZEN (not tracking nightly)" + description = "The t3 tracker froze — either the pre-run pairing gate tripped or /etc/t3-autoupdate.freeze is set. 
t3 is held at the last-good pin and is NOT picking up new builds until cleared. Confirm pairing is healthy, then remove the freeze." + runbook = "docs/runbooks/t3-version-bump.md" + } + }, + ] + }, { # Wave 1 security alerts (beads code-8ywc). Routed via Loki ruler → # prometheus-alertmanager → #security Slack receiver. Allowlist CIDRs: