infra/scripts/t3-dispatch/main.go
Viktor Barzin 9b55d53be0 t3: differential drop-attribution probe + devvm metrics
Closes the loop on Viktor's ask to find the t3 disconnect root cause and
definitively rule infra in or out. Server logs alone cannot separate
'client network broke' from 'Cloudflare/tunnel broke' from 't3-serve
stalled' — every cause collapses into the same 20s-watchdog reconnect.

The t3-probe (stacks/t3code) holds three permanent legs that differ only
in path segment: 'cloudflare' (WS via DoH-resolved public DNS -> WAN ->
CF edge -> tunnel -> Traefik -> dispatch), 'internal' (same WS pinned to
the Traefik LB, no Cloudflare), 't3serve' (HTTP straight to the serve
process). Whichever leg drops convicts its segment; all legs clean while
a user drops exonerates infra with data. Dispatch gains an
unauthenticated /probe/ws echo + /probe/healthz (gorilla/websocket,
test-first) behind an auth=none path carve-out, guarded by the
authentik-walloff probe.

Also starts scraping devvm's node_exporter (job 'devvm') — it ran
unscraped, so the box whose memory/IO stalls cause the drops had zero
pressure history. Alerts T3ProbeLegDown + T3ProbeDropBurst; runbook
docs/runbooks/t3-drop-attribution.md.
2026-06-10 21:11:29 +00:00

235 lines
7.8 KiB
Go

// t3-dispatch: per-user dispatch + auto-pair for t3code.
// Sits behind Traefik+Authentik (which injects X-authentik-username) and routes
// each authenticated user to their own `t3 serve` instance. On a user's first
// visit (no t3 session cookie) it mints a pairing token for that user's instance
// and exchanges it for the session cookie, which it injects into the browser —
// so an Authentik login lands straight in the user's workspace.
package main
import (
"bytes"
"encoding/json"
"fmt"
"log"
"net/http"
"net/http/httputil"
"net/url"
"os"
"os/exec"
"strings"
"sync"
"time"
)
// entry is one row of the dispatch table: the OS account and loopback port of
// the `t3 serve` instance that a given user is routed to. It is decoded from
// the JSON object stored in dispatchFile.
type entry struct {
OsUser string `json:"os_user"` // account name handed to mintToken when pairing
Port int `json:"port"` // 127.0.0.1 port the instance is reached on
}
// Fixed runtime configuration for the dispatch service.
const (
// cookieName is the session cookie whose presence handler checks before
// deciding whether a request needs pairing.
cookieName = "t3_session" // discovered: apps/server/src/auth/utils.ts (web mode)
// listenAddr is the address the dispatch HTTP server binds to.
listenAddr = ":3780"
// dispatchFile is the JSON file loadTable reads the dispatch table from.
dispatchFile = "/etc/t3-serve/dispatch.json"
)
// Dispatch table state. table maps a username (the local part of the Authentik
// username, as computed by handler) to that user's entry. loadTable replaces it
// wholesale and lookup reads it; mu guards both.
var (
mu sync.RWMutex
table map[string]entry
)
// loadTable reads dispatchFile, decodes it as a JSON object of username ->
// entry, and installs the result as the live dispatch table. The swap happens
// under the write lock. If reading or decoding fails, the current table is
// left untouched and the underlying error is returned unchanged.
func loadTable() error {
	raw, readErr := os.ReadFile(dispatchFile)
	if readErr != nil {
		return readErr
	}

	fresh := make(map[string]entry)
	if decodeErr := json.Unmarshal(raw, &fresh); decodeErr != nil {
		return decodeErr
	}

	mu.Lock()
	defer mu.Unlock()
	table = fresh
	return nil
}
// lookup returns the dispatch entry registered under ak and whether one
// exists. The table is read under the shared (read) lock so it can run
// concurrently with other lookups while loadTable swaps the table.
func lookup(ak string) (entry, bool) {
	mu.RLock()
	found, present := table[ak]
	mu.RUnlock()
	return found, present
}
// mintToken produces a one-time pairing token for osUser by running
// /usr/local/bin/t3-mint through non-interactive sudo (the scoped sudoers entry
// is the only thing the dispatch service may invoke). It returns the command's
// stdout. It is a package-level variable rather than a plain function so tests
// can replace the privileged exec with a stub.
var mintToken = func(osUser string) ([]byte, error) {
	cmd := exec.Command("sudo", "-n", "/usr/local/bin/t3-mint", osUser)
	return cmd.Output()
}
// sessionClient is the HTTP client sessionValid uses to query an instance's
// session endpoint; its 5-second timeout bounds how long that check may take.
var sessionClient = &http.Client{Timeout: 5 * time.Second}
// sessionValid checks the presented t3_session cookie against the user's own
// instance by calling GET /api/auth/session on 127.0.0.1:<e.Port> with the
// cookie attached.
//
// The server-side session can disappear independently of the 30-day cookie
// (for example an auth-schema rollback that drops every session row), which
// leaves the browser holding a cookie that looks live but is dead.
//
// The check fails OPEN: a request-construction error, transport error, non-200
// status, or undecodable body all yield true so the request is still proxied.
// Only a definitive `"authenticated": false` answer yields false and thereby
// forces a re-pair.
func sessionValid(e entry, c *http.Cookie) bool {
	endpoint := fmt.Sprintf("http://127.0.0.1:%d/api/auth/session", e.Port)

	probe, err := http.NewRequest(http.MethodGet, endpoint, nil)
	if err != nil {
		return true
	}
	probe.AddCookie(c)

	answer, err := sessionClient.Do(probe)
	if err != nil {
		return true
	}
	defer answer.Body.Close()

	if answer.StatusCode != http.StatusOK {
		return true
	}

	verdict := struct {
		Authenticated bool `json:"authenticated"`
	}{}
	if err := json.NewDecoder(answer.Body).Decode(&verdict); err != nil {
		return true
	}
	return verdict.Authenticated
}
// isDocumentNav reports whether r is a top-level browser document navigation
// rather than an XHR/fetch/asset/WebSocket sub-request. Only a document
// navigation may safely be answered with a re-pair 302; redirecting a
// sub-resource would break the SPA's fetch/WebSocket contract.
//
// Decision order:
//   - anything other than GET is never a document navigation;
//   - if Sec-Fetch-Dest is present (all modern browsers send it) it is trusted,
//     and only the value "document" qualifies;
//   - otherwise fall back to whether the Accept header mentions text/html.
func isDocumentNav(r *http.Request) bool {
	switch fetchDest := r.Header.Get("Sec-Fetch-Dest"); {
	case r.Method != http.MethodGet:
		return false
	case fetchDest != "":
		return fetchDest == "document"
	default:
		return strings.Contains(r.Header.Get("Accept"), "text/html")
	}
}
// pairEndpoints are the instance's session-bootstrap paths in preference order.
// t3 renamed /api/auth/bootstrap -> /api/auth/browser-session in 0.0.25; trying the
// new name first and falling back to the old lets ONE dispatch binary pair against
// either version — so the t3 pin can move forward (and survive a rolling-restart
// skew where some instances are already on the new version) without a 502 storm.
var pairEndpoints = []string{"/api/auth/browser-session", "/api/auth/bootstrap"}

// pairClient performs the credential exchange. It carries an explicit timeout
// because the package-level http.Post helper uses http.DefaultClient, which has
// none: a stalled instance would otherwise pin the browser's request (and its
// goroutine) indefinitely instead of failing with a 502.
var pairClient = &http.Client{Timeout: 10 * time.Second}

// exchangeCredential POSTs the pairing credential, as {"credential": "..."}
// JSON, to the instance listening on 127.0.0.1:port, trying each pairEndpoint
// in turn.
//
// A 404 means "absent in this t3 version" and moves on to the next endpoint; a
// transport error (including a pairClient timeout) is remembered and the next
// endpoint is tried. Any other status is that endpoint's verdict and the
// response is returned as-is — the caller owns and must close resp.Body.
//
// If no endpoint produced a verdict, the last transport error is returned, or
// a descriptive error when every endpoint answered 404.
func exchangeCredential(port int, credential string) (*http.Response, error) {
	// Marshalling a map[string]string cannot fail, so the error is dropped.
	body, _ := json.Marshal(map[string]string{"credential": credential})
	var lastErr error
	for _, ep := range pairEndpoints {
		resp, err := pairClient.Post(fmt.Sprintf("http://127.0.0.1:%d%s", port, ep),
			"application/json", bytes.NewReader(body))
		if err != nil {
			lastErr = err
			continue
		}
		if resp.StatusCode == http.StatusNotFound {
			resp.Body.Close() // endpoint absent in this t3 version — try the next
			continue
		}
		return resp, nil
	}
	if lastErr != nil {
		return nil, lastErr
	}
	return nil, fmt.Errorf("no pairing endpoint accepted the request (all returned 404)")
}
// autoPair mints a one-time pairing token for the user's instance (as that OS
// user, via the scoped sudoers entry) and exchanges it at the instance's pairing
// endpoint, relaying the returned t3_session Set-Cookie to the browser and then
// redirecting it to "/".
//
// Failure responses:
//   - mint command failed                 -> 500 "pairing mint failed"
//   - mint output not JSON / no credential -> 500 "unparseable pairing output"
//   - exchange transport error            -> 502 "bootstrap request failed"
//   - instance answered non-200           -> 502 "bootstrap rejected"
//
// Every failure path is logged with the OS user so an operator can tell which
// instance is misbehaving. The mint output itself is never logged because it
// contains the pairing credential.
func autoPair(e entry, w http.ResponseWriter, r *http.Request) {
	// t3-mint (root, via scoped sudoers) validates the OS user is in
	// /etc/ttyd-user-map, then mints as that user. The dispatch service itself
	// runs unprivileged and can invoke nothing else.
	out, err := mintToken(e.OsUser)
	if err != nil {
		log.Printf("mint for %s failed: %v", e.OsUser, err)
		http.Error(w, "pairing mint failed", http.StatusInternalServerError)
		return
	}

	var pc struct {
		Credential string `json:"credential"` // CLI returns the token under "credential"
	}
	if err := json.Unmarshal(out, &pc); err != nil {
		log.Printf("mint output for %s is not valid JSON: %v", e.OsUser, err)
		http.Error(w, "unparseable pairing output", http.StatusInternalServerError)
		return
	}
	if pc.Credential == "" {
		log.Printf("mint output for %s carried no credential", e.OsUser)
		http.Error(w, "unparseable pairing output", http.StatusInternalServerError)
		return
	}

	resp, err := exchangeCredential(e.Port, pc.Credential)
	if err != nil {
		log.Printf("pairing exchange for %s failed: %v", e.OsUser, err)
		http.Error(w, "bootstrap request failed", http.StatusBadGateway)
		return
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		log.Printf("pairing for %s returned %d", e.OsUser, resp.StatusCode)
		http.Error(w, "bootstrap rejected", http.StatusBadGateway)
		return
	}
	for _, c := range resp.Cookies() {
		http.SetCookie(w, c) // relays t3_session (HttpOnly; Path=/; SameSite=Lax)
	}
	http.Redirect(w, r, "/", http.StatusFound)
}
// handler is the catch-all route: it maps the Authentik-authenticated user to
// their own `t3 serve` instance and either pairs the browser with it or
// reverse-proxies the request to it.
//
//   - Unknown user (no dispatch entry)              -> 403.
//   - No t3 session cookie                          -> autoPair.
//   - Cookie present, top-level navigation, and the
//     instance says the session is dead             -> autoPair.
//   - Otherwise                                     -> reverse-proxy.
func handler(w http.ResponseWriter, r *http.Request) {
	// Authentik injects the full email (e.g. vbarzin@gmail.com); /etc/ttyd-user-map
	// (and thus dispatch.json) keys on the local part. Keep only what precedes the
	// first '@', matching the terminal stack's tmux-attach.sh (`${auth_user%%@*}`).
	user := strings.SplitN(r.Header.Get("X-authentik-username"), "@", 2)[0]

	inst, provisioned := lookup(user)
	if !provisioned {
		http.Error(w, "no t3 instance provisioned for this user", http.StatusForbidden)
		return
	}

	// A missing cookie always pairs. A present cookie can still be
	// server-side-invalid (sessions wiped/expired while the 30-day cookie
	// lingers), so on a top-level navigation it is verified and re-paired if
	// dead — otherwise the instance would just render its pair page. The
	// validity check is gated to document navs so an XHR/asset/WebSocket
	// sub-request is never answered with a 302.
	cookie, cookieErr := r.Cookie(cookieName)
	needsPairing := cookieErr != nil || (isDocumentNav(r) && !sessionValid(inst, cookie))
	if needsPairing {
		autoPair(inst, w, r)
		return
	}

	// Steady state: reverse-proxy (incl. WebSocket upgrade) to the user's instance.
	backend, _ := url.Parse(fmt.Sprintf("http://127.0.0.1:%d", inst.Port))
	proxy := httputil.NewSingleHostReverseProxy(backend)
	proxy.ServeHTTP(w, r)
}
// main loads the dispatch table (exiting if the initial load fails), starts a
// background reload every 60 seconds, registers the routes, and serves HTTP on
// listenAddr until the server fails.
func main() {
	if err := loadTable(); err != nil {
		log.Fatalf("load %s: %v", dispatchFile, err)
	}

	// Periodic reload for the lifetime of the process. A failed reload is only
	// logged: loadTable leaves the previous table in place on error.
	go func() {
		for range time.Tick(60 * time.Second) {
			if err := loadTable(); err != nil {
				log.Printf("reload %s: %v", dispatchFile, err)
			}
		}
	}()

	mux := http.NewServeMux()
	mux.HandleFunc("/healthz", func(w http.ResponseWriter, _ *http.Request) { _, _ = w.Write([]byte("ok\n")) })
	registerProbe(mux)
	mux.HandleFunc("/", handler)

	// Use an explicit http.Server instead of http.ListenAndServe, which has no
	// timeouts at all. Only ReadHeaderTimeout is set: it bounds how long a
	// client may take to send request headers, without limiting the duration
	// of proxied WebSocket or streaming responses the way ReadTimeout or
	// WriteTimeout would.
	srv := &http.Server{
		Addr:              listenAddr,
		Handler:           mux,
		ReadHeaderTimeout: 10 * time.Second,
	}
	log.Printf("t3-dispatch listening on %s", listenAddr)
	log.Fatal(srv.ListenAndServe())
}