t3: connection logging across the path for drop attribution
All checks were successful
ci/woodpecker/push/default Pipeline was successful
ci/woodpecker/push/build-cli Pipeline was successful

Viktor asked to add connection logs (Traefik/Cloudflare) to catch the
real-path t3 WS drops: a direct-to-t3-serve browser ran 40 min clean
while real tunnel sessions cycle every 15-35s, so the drop originates
above t3-serve and we need to see which layer cuts the socket.

Traefik (/ws duration) and cloudflared (WS close events) already ship to
Loki; the gap was the devvm side. This adds:

- t3-dispatch logs every /ws open/close with dur_ms + cause:
  downstream_closed (client/CF/Traefik hung up = last-mile/network),
  upstream_closed (t3-serve closed/reset), or graceful. Graceful closes
  previously left no trace (default ReverseProxy only logs on error), so a
  watchdog-driven reconnect was invisible. Helpers unit-tested.
- devvm-promtail.{yaml,service}: ships devvm journald (t3-dispatch +
  t3-serve@<user>) to cluster Loki as job=devvm-journal, mirroring the
  pve/rpi-sofia shippers. devvm was never in Loki (standalone VM).

Joined in Loki, the three layers attribute any future drop to a segment
with no repro needed. Runbook + service-catalog updated.
This commit is contained in:
Viktor Barzin 2026-06-11 13:48:10 +00:00
parent 933e4649fb
commit 9b19caff47
7 changed files with 231 additions and 3 deletions

View file

@ -0,0 +1,17 @@
# systemd unit for promtail on the devvm (10.0.10.10). Install to
# /etc/systemd/system/promtail.service. See scripts/devvm-promtail.yaml for the full deploy.
[Unit]
Description=Promtail (ships devvm journal -> cluster Loki)
After=network-online.target
Wants=network-online.target
[Service]
Type=simple
ExecStart=/usr/local/bin/promtail -config.file=/etc/promtail/config.yml
Restart=on-failure
RestartSec=5
User=root
Group=root
[Install]
WantedBy=multi-user.target

View file

@ -0,0 +1,59 @@
# Promtail config for the devvm (10.0.10.10) — ships the systemd journal to cluster Loki.
#
# devvm is a standalone VM (NOT a k8s node), so its journal — including the t3
# stack (t3-dispatch, t3-serve@<user>) — was never in Loki. Added 2026-06-11 for
# t3 drop forensics: t3-dispatch now logs each /ws connection's open/close with
# duration + which side hung up (downstream_closed = client/CF/Traefik went away;
# upstream_closed = t3-serve closed/reset/unreachable; graceful = clean close). Joined with
# Traefik's per-/ws duration (already in Loki) this attributes every drop to a layer.
#
# NOT Terraform-managed (devvm is outside k8s) — same hand-deployed pattern as
# scripts/pve-promtail.* and the rpi-sofia promtail. This file is source-of-truth.
#
# Deploy (on devvm, as root via sudo):
# sudo install -d -m 0755 /etc/promtail /var/lib/promtail
# sudo install -m 0644 scripts/devvm-promtail.yaml /etc/promtail/config.yml
# sudo install -m 0644 scripts/devvm-promtail.service /etc/systemd/system/promtail.service
# # Binary: grafana/loki v3.5.1 promtail-linux-amd64 -> /usr/local/bin/promtail (chmod 0755).
# sudo systemctl daemon-reload && sudo systemctl enable --now promtail
# # Loki reach: loki.viktorbarzin.lan (Technitium CNAME -> live Traefik LB; insecure cert).
#
# Streams produced:
# {job="devvm-journal"} — devvm journal, every unit except sshd (relabelled to sshd-devvm below)
# {job="devvm-journal", unit="t3-dispatch.service"} — dispatch (ws open/close lines)
# {job="devvm-journal", unit="t3-serve@wizard.service"} — per-user t3 serve
# {job="sshd-devvm"} — sshd auth lines (parity with sshd-pve)
server:
http_listen_port: 9080
grpc_listen_port: 0
log_level: warn
positions:
filename: /var/lib/promtail/positions.yaml
clients:
- url: https://loki.viktorbarzin.lan/loki/api/v1/push
tls_config:
insecure_skip_verify: true
scrape_configs:
- job_name: journal
journal:
max_age: 12h
json: false
path: /var/log/journal
labels:
host: devvm
job: devvm-journal
relabel_configs:
- source_labels: ['__journal__systemd_unit']
target_label: unit
- source_labels: ['__journal_priority_keyword']
target_label: level
- source_labels: ['__journal_syslog_identifier']
target_label: identifier
# sshd auth lines -> job=sshd-devvm (parity with the pve shipper's sshd-pve).
- source_labels: ['__journal_syslog_identifier']
regex: 'sshd.*'
target_label: job
replacement: 'sshd-devvm'

View file

@ -2,4 +2,4 @@ module t3-dispatch
go 1.22
require github.com/gorilla/websocket v1.5.3 // indirect
require github.com/gorilla/websocket v1.5.3

View file

@ -212,7 +212,64 @@ func handler(w http.ResponseWriter, r *http.Request) {
}
// Steady state: reverse-proxy (incl. WebSocket upgrade) to the user's instance.
target, _ := url.Parse(fmt.Sprintf("http://127.0.0.1:%d", e.Port))
httputil.NewSingleHostReverseProxy(target).ServeHTTP(w, r)
proxy := httputil.NewSingleHostReverseProxy(target)
// WebSocket connection logging: t3 drops manifest as the client's 20s
// heartbeat watchdog reconnecting, so a flood of short-lived /ws connections
// IS the symptom. Log each WS open + close (duration + which side hung up) so
// a drop is attributable from logs alone — graceful closes otherwise leave no
// trace (the default ReverseProxy only logs on error). cause stays "graceful"
// unless ErrorHandler fires; ErrorHandler runs within ServeHTTP, so reading
// cause after ServeHTTP returns needs no synchronisation.
if isWebSocket(r) {
start := time.Now()
ip := clientIP(r)
cause := "graceful"
proxy.ErrorHandler = func(rw http.ResponseWriter, _ *http.Request, err error) {
cause = classifyClose(err)
}
log.Printf("ws open user=%s ip=%s", e.OsUser, ip)
proxy.ServeHTTP(w, r)
log.Printf("ws close user=%s ip=%s dur_ms=%d cause=%s",
e.OsUser, ip, time.Since(start).Milliseconds(), cause)
return
}
proxy.ServeHTTP(w, r)
}
// isWebSocket reports whether r is a WebSocket upgrade request: its Upgrade
// header must equal "websocket" (compared case-insensitively) and its
// Connection header must list an "upgrade" option.
//
// Connection is treated as the comma-separated token list it is, and every
// Connection header line on the request is inspected. Header.Get alone returns
// only the first line, so an upgrade sent as "Connection: keep-alive" followed
// by "Connection: Upgrade" would otherwise be missed; matching whole tokens
// also keeps an unrelated option that merely contains the letters "upgrade"
// from being counted as an upgrade request.
func isWebSocket(r *http.Request) bool {
	if !strings.EqualFold(r.Header.Get("Upgrade"), "websocket") {
		return false
	}
	for _, line := range r.Header.Values("Connection") {
		for _, tok := range strings.Split(line, ",") {
			if strings.EqualFold(strings.TrimSpace(tok), "upgrade") {
				return true
			}
		}
	}
	return false
}
// clientIP yields the address string used to correlate a connection with a
// specific client or edge in the logs. A non-empty X-Forwarded-For header (set
// by Traefik/CF) is returned verbatim — the whole forwarded chain, not just one
// hop. Without it, the immediate peer recorded in r.RemoteAddr is returned.
func clientIP(r *http.Request) string {
	forwarded := r.Header.Get("X-Forwarded-For")
	if forwarded == "" {
		return r.RemoteAddr
	}
	return forwarded
}
// classifyClose turns the error reported for a proxied connection into a short
// cause label naming the side that ended it. Matching is done on the error
// text:
//
//   - nil error                          -> "graceful"
//   - text contains "context canceled"   -> "downstream_closed"
//     (client / CF / Traefik tore the request down)
//   - text contains "reset by peer", "broken pipe", "EOF" or
//     "connection refused"               -> "upstream_closed"
//     (the user's t3 serve closed, reset, or was unreachable)
//   - anything else                      -> the error text itself
//
// The downstream check runs first, so a message matching both groups is
// reported as downstream_closed.
func classifyClose(err error) string {
	if err == nil {
		return "graceful"
	}
	msg := err.Error()
	if strings.Contains(msg, "context canceled") {
		return "downstream_closed"
	}
	upstreamMarkers := []string{"reset by peer", "broken pipe", "EOF", "connection refused"}
	for _, marker := range upstreamMarkers {
		if strings.Contains(msg, marker) {
			return "upstream_closed"
		}
	}
	// Unrecognised errors are surfaced verbatim so they remain visible in logs.
	return msg
}
func main() {

View file

@ -301,3 +301,63 @@ func TestProbeWSEcho(t *testing.T) {
}
}
}
// TestIsWebSocket drives isWebSocket with a table of Upgrade / Connection
// header combinations. An empty upgrade value means the Upgrade header is left
// off the request entirely; Connection is always set.
func TestIsWebSocket(t *testing.T) {
	type headerCase struct {
		upgrade    string
		connection string
		expect     bool
	}
	table := []headerCase{
		{upgrade: "websocket", connection: "Upgrade", expect: true},
		{upgrade: "websocket", connection: "keep-alive, Upgrade", expect: true},
		{upgrade: "WebSocket", connection: "upgrade", expect: true},
		{upgrade: "", connection: "keep-alive", expect: false},
		{upgrade: "h2c", connection: "Upgrade", expect: false},
		{upgrade: "websocket", connection: "keep-alive", expect: false},
	}
	for i := range table {
		hc := table[i]
		req, _ := http.NewRequest("GET", "/ws", nil)
		req.Header.Set("Connection", hc.connection)
		if hc.upgrade != "" {
			req.Header.Set("Upgrade", hc.upgrade)
		}
		got := isWebSocket(req)
		if got == hc.expect {
			continue
		}
		t.Errorf("isWebSocket(up=%q conn=%q)=%v want %v", hc.upgrade, hc.connection, got, hc.expect)
	}
}
// TestClassifyClose pins the cause label classifyClose produces for a nil
// error, for each recognised error text, and for an unrecognised error (which
// is expected to come back verbatim).
func TestClassifyClose(t *testing.T) {
	table := []struct {
		err    error
		expect string
	}{
		{err: nil, expect: "graceful"},
		{err: errTest("context canceled"), expect: "downstream_closed"},
		{err: errTest("read tcp 127.0.0.1:60664->127.0.0.1:3773: read: connection reset by peer"), expect: "upstream_closed"},
		{err: errTest("write: broken pipe"), expect: "upstream_closed"},
		{err: errTest("unexpected EOF"), expect: "upstream_closed"},
		{err: errTest("dial tcp 127.0.0.1:3773: connect: connection refused"), expect: "upstream_closed"},
		{err: errTest("some novel error"), expect: "some novel error"},
	}
	for _, row := range table {
		got := classifyClose(row.err)
		if got == row.expect {
			continue
		}
		t.Errorf("classifyClose(%v)=%q want %q", row.err, got, row.expect)
	}
}
// errTest is a string-backed error used by the tests to build errors with an
// exact message.
type errTest string

// Error returns the underlying string, satisfying the error interface.
func (msg errTest) Error() string {
	return string(msg)
}
// TestClientIP checks both branches of clientIP on a single request: first
// with no X-Forwarded-For header (the peer address is expected), then with the
// header set (the forwarded chain is expected unchanged).
func TestClientIP(t *testing.T) {
	const (
		peerAddr = "10.0.0.5:1234"
		chain    = "1.2.3.4, 10.10.1.1"
	)
	req, _ := http.NewRequest("GET", "/ws", nil)
	req.RemoteAddr = peerAddr

	direct := clientIP(req)
	if direct != peerAddr {
		t.Errorf("clientIP no-xff = %q", direct)
	}

	req.Header.Set("X-Forwarded-For", chain)
	forwarded := clientIP(req)
	if forwarded != chain {
		t.Errorf("clientIP xff = %q", forwarded)
	}
}