t3: differential drop-attribution probe + devvm metrics

Closes the loop on Viktor's ask to find the t3 disconnect root cause and
definitively rule infra in or out. Server logs alone cannot separate
'client network broke' from 'Cloudflare/tunnel broke' from 't3-serve
stalled' — every cause collapses into the same 20s-watchdog reconnect.

The t3-probe (stacks/t3code) holds three permanent legs that differ only
in path segment: 'cloudflare' (WS via DoH-resolved public DNS -> WAN ->
CF edge -> tunnel -> Traefik -> dispatch), 'internal' (same WS pinned to
the Traefik LB, no Cloudflare), 't3serve' (HTTP straight to the serve
process). Whichever leg drops convicts its segment; all legs clean while
a user drops exonerates infra with data. Dispatch gains an
unauthenticated /probe/ws echo + /probe/healthz (gorilla/websocket,
test-first) behind an auth=none path carve-out, guarded by the
authentik-walloff probe.

Also starts scraping devvm's node_exporter (job 'devvm') — it ran
unscraped, so the box whose memory/IO stalls cause the drops had zero
pressure history. Alerts T3ProbeLegDown + T3ProbeDropBurst; runbook
docs/runbooks/t3-drop-attribution.md.
This commit is contained in:
Viktor Barzin 2026-06-10 21:11:29 +00:00
parent b5c6639272
commit 9b55d53be0
11 changed files with 548 additions and 1 deletions

View file

@ -1,3 +1,5 @@
module t3-dispatch
go 1.22
require github.com/gorilla/websocket v1.5.3

View file

@ -0,0 +1,2 @@
github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg=
github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=

View file

@ -228,6 +228,7 @@ func main() {
}()
mux := http.NewServeMux()
mux.HandleFunc("/healthz", func(w http.ResponseWriter, _ *http.Request) { _, _ = w.Write([]byte("ok\n")) })
registerProbe(mux)
mux.HandleFunc("/", handler)
log.Printf("t3-dispatch listening on %s", listenAddr)
log.Fatal(http.ListenAndServe(listenAddr, mux))

View file

@ -5,7 +5,10 @@ import (
"net/http/httptest"
"net/url"
"strconv"
"strings"
"testing"
"github.com/gorilla/websocket"
)
func portOf(t *testing.T, ts *httptest.Server) int {
@ -258,3 +261,43 @@ func TestAutoPairAcrossVersions(t *testing.T) {
})
}
}
// TestProbeHealthz serves a mux populated only by registerProbe and checks
// that a GET on /probe/healthz comes back with HTTP 200.
func TestProbeHealthz(t *testing.T) {
	router := http.NewServeMux()
	registerProbe(router)

	srv := httptest.NewServer(router)
	defer srv.Close()

	target := srv.URL + "/probe/healthz"
	res, getErr := http.Get(target)
	if getErr != nil {
		t.Fatalf("GET /probe/healthz: %v", getErr)
	}
	defer res.Body.Close()

	if code := res.StatusCode; code != http.StatusOK {
		t.Errorf("status = %d, want 200", code)
	}
}
// TestProbeWSEcho dials /probe/ws on a test server and verifies that every
// text frame written is read back unchanged, across more than one round trip
// on the same connection.
func TestProbeWSEcho(t *testing.T) {
	router := http.NewServeMux()
	registerProbe(router)

	srv := httptest.NewServer(router)
	defer srv.Close()

	// The test server URL starts with "http"; replacing that prefix with "ws"
	// yields the matching WebSocket URL.
	endpoint := "ws" + strings.TrimPrefix(srv.URL, "http") + "/probe/ws"
	conn, _, dialErr := websocket.DefaultDialer.Dial(endpoint, nil)
	if dialErr != nil {
		t.Fatalf("dial %s: %v", endpoint, dialErr)
	}
	defer conn.Close()

	frames := []string{"ping 1718000000", "ping 1718000010"}
	for i := range frames {
		want := frames[i]

		writeErr := conn.WriteMessage(websocket.TextMessage, []byte(want))
		if writeErr != nil {
			t.Fatalf("write: %v", writeErr)
		}

		_, reply, readErr := conn.ReadMessage()
		if readErr != nil {
			t.Fatalf("read: %v", readErr)
		}

		if string(reply) == want {
			continue
		}
		t.Errorf("echo = %q, want %q", reply, want)
	}
}

View file

@ -0,0 +1,49 @@
// probe.go: unauthenticated path-health surface for the in-cluster t3-probe.
// /probe/* is carved out of Authentik (stacks/t3code `module "ingress_probe"`)
// so a synthetic client can hold a long-lived WebSocket here via two routes
// (Cloudflare edge vs internal Traefik) and attribute connection drops to a
// path segment. It echoes tiny frames and reaches no t3 instance — nothing
// user-grade is exposed.
package main
import (
"net/http"
"time"
"github.com/gorilla/websocket"
)
// probeIdleLimit is how long a probe connection may stay silent before it is
// reaped. The probe pings every 10s, so 90s without a frame means the peer is
// gone, even if TCP itself never noticed.
const probeIdleLimit time.Duration = 90 * time.Second
// probeUpgrader performs the HTTP -> WebSocket handshake for the probe
// endpoint. Its origin check accepts every request: an echo endpoint has no
// cookies or credentials to protect, and the probe connects without a browser
// Origin header, so enforcing one would only break the probe.
var probeUpgrader = websocket.Upgrader{
	CheckOrigin: func(_ *http.Request) bool {
		return true
	},
}
// registerProbe mounts the unauthenticated probe endpoints on mux:
//
//   - /probe/healthz answers every request with "ok\n".
//   - /probe/ws upgrades the request to a WebSocket and echoes each data
//     message back with the message type it arrived with, until a read or
//     write fails or the peer stays silent for probeIdleLimit.
//
// The echo loop caps inbound message size and puts a deadline on every write,
// so an oversized message or a peer that stops draining its socket ends the
// connection instead of pinning memory or a goroutine.
func registerProbe(mux *http.ServeMux) {
	const (
		// The probe only sends tiny frames; anything beyond this is not the
		// probe and is rejected before it can be buffered in full.
		probeMaxMessageBytes = 4 << 10
		// Upper bound for flushing one echoed frame. Without it a stalled
		// peer blocks the write forever and the read-side idle limit is
		// never reached.
		probeWriteTimeout = 10 * time.Second
	)

	mux.HandleFunc("/probe/healthz", func(w http.ResponseWriter, _ *http.Request) {
		// Best-effort body; a failed write just means the client went away.
		_, _ = w.Write([]byte("ok\n"))
	})

	mux.HandleFunc("/probe/ws", func(w http.ResponseWriter, r *http.Request) {
		c, err := probeUpgrader.Upgrade(w, r, nil)
		if err != nil {
			return // Upgrade has already written the HTTP error
		}
		defer c.Close()

		c.SetReadLimit(probeMaxMessageBytes)

		for {
			// Re-arm the idle deadline before each read so the limit measures
			// silence since the last frame, not total connection age.
			if err := c.SetReadDeadline(time.Now().Add(probeIdleLimit)); err != nil {
				return
			}
			mt, msg, err := c.ReadMessage()
			if err != nil {
				return
			}
			if err := c.SetWriteDeadline(time.Now().Add(probeWriteTimeout)); err != nil {
				return
			}
			if err := c.WriteMessage(mt, msg); err != nil {
				return
			}
		}
	})
}