From c7ead032ec504ca5022b40a78cc84dd106bf433d Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Mon, 22 Jun 2026 17:34:03 +0000 Subject: [PATCH] chrome-service: fix noVNC stuck-"Connecting" (x11vnc fd-sweep under nofile=2^31) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The noVNC view hung on "Connecting" forever then timed out. Root cause: x11vnc sweeps the entire fd table (fcntl per fd) on every client connection, and containerd grants pods RLIMIT_NOFILE=2^31, so the RFB handshake never completes (websockify accepts the WS and dials localhost:5900, but x11vnc never sends its banner — verified: handshake timed out at 8s, x11vnc had burned 1h41m CPU spinning). Same bug + fix the android-emulator stack already carries. Cap nofile before x11vnc starts, in two places: - files/novnc/entrypoint.sh: `ulimit -n 65536` (root fix, makes the image correct) - main.tf novnc container: `command = ["bash","-c","ulimit -n 65536; exec /entrypoint.sh"]` so the cap applies deterministically on rollout even though the image is :latest/IfNotPresent (a rebuilt entrypoint isn't guaranteed to be re-pulled). Also documents the gotcha + diagnosis in docs/architecture/chrome-service.md and notes the black-when-idle behaviour + the autoconnect URL. (A live x11vnc relaunch with the cap already unblocked the running pod; this makes it survive restarts.) Co-Authored-By: Claude Opus 4.8 --- docs/architecture/chrome-service.md | 21 ++++++++++++++++++- .../chrome-service/files/novnc/entrypoint.sh | 7 +++++++ stacks/chrome-service/main.tf | 8 +++++++ 3 files changed, 35 insertions(+), 1 deletion(-) diff --git a/docs/architecture/chrome-service.md b/docs/architecture/chrome-service.md index 000eec90..ba1a853c 100644 --- a/docs/architecture/chrome-service.md +++ b/docs/architecture/chrome-service.md @@ -167,7 +167,26 @@ minor, with Python-side bindings pre-installed. `x11vnc` (connected to Xvfb on `localhost:6099`) bridged to `websockify` on port 6080. 
Service `chrome` maps :80 → :6080 and is exposed via `ingress_factory` at `chrome.viktorbarzin.me`, - Authentik-gated. + Authentik-gated. The bare host serves `vnc.html` (image symlinks + `index.html → vnc.html`); add `?autoconnect=true&resize=scale&path=websockify` + to skip the Connect button. The view is **black when no browser window is + open** (idle) — that is normal, not a failed connection. + +### noVNC fd-sweep gotcha (stuck "Connecting") + +If the noVNC client hangs on **"Connecting" forever then times out**, the cause +is almost always x11vnc's fd-table sweep: containerd grants pods +`RLIMIT_NOFILE = 2^31`, and x11vnc `fcntl`-sweeps the **entire** fd table on +every client connection, so the RFB handshake never completes (websockify +accepts the WS and logs `connecting to: localhost:5900`, but x11vnc never sends +the `RFB 003.008` banner). Diagnose: `grep "open files" /proc/$(pgrep -n +x11vnc)/limits` (a huge value means the cap was not applied) and time the handshake from a sibling container +(`python3 -c "import socket;s=socket.socket();s.connect(('127.0.0.1',5900));print(s.recv(12))"` — +healthy <0.3s, broken hangs). **Fix: cap `ulimit -n 65536` before x11vnc starts** +— done both in `files/novnc/entrypoint.sh` (the root fix, baked into the image) and via the container `command` +wrapper in `main.tf` (so it applies deterministically even though the image is +`:latest`/`IfNotPresent` and won't re-pull a rebuilt entrypoint). Same bug + fix +as the android-emulator stack. - **snapshot-server sidecar** (`mcr.microsoft.com/playwright/python:v1.48.0-noble`) serves `GET /api/snapshot` from `/profile/snapshots/storage-state.json`, bearer-gated by `PW_TOKEN`. 
Service `chrome-snapshot` maps :8088 → :8088 diff --git a/stacks/chrome-service/files/novnc/entrypoint.sh b/stacks/chrome-service/files/novnc/entrypoint.sh index 1ec6657f..fae5c641 100644 --- a/stacks/chrome-service/files/novnc/entrypoint.sh +++ b/stacks/chrome-service/files/novnc/entrypoint.sh @@ -3,6 +3,13 @@ # and serve the noVNC HTML5 client + websockify bridge on :6080. set -e +# Containerd grants pods an effectively unbounded RLIMIT_NOFILE (2^31). x11vnc +# sweeps the WHOLE fd table with fcntl on every client connection, so each VNC +# connect hangs for ~forever and the noVNC client sits on "Connecting" until it +# times out. Cap it before launching x11vnc. (Same fix as the android-emulator +# stack; see docs/architecture/chrome-service.md "noVNC fd-sweep".) +ulimit -n 65536 + for i in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15; do if echo > /dev/tcp/127.0.0.1/6099 2>/dev/null; then echo "Xvfb TCP up after attempt $i" diff --git a/stacks/chrome-service/main.tf b/stacks/chrome-service/main.tf index d0db5c97..c1dd48a8 100644 --- a/stacks/chrome-service/main.tf +++ b/stacks/chrome-service/main.tf @@ -326,6 +326,14 @@ resource "kubernetes_deployment" "chrome_service" { # Phase 3 cutover 2026-05-07 — Forgejo registry consolidation. image = "ghcr.io/viktorbarzin/chrome-service-novnc:latest" image_pull_policy = "IfNotPresent" + # Cap RLIMIT_NOFILE before the entrypoint runs. Containerd grants pods + # nofile=2^31; x11vnc sweeps the whole fd table on each client connect, + # so every VNC connection hangs on "Connecting" until it times out + # (fd-sweep bug, same as android-emulator). entrypoint.sh now also sets + # this, but the image is :latest/IfNotPresent so a rebuilt entrypoint + # isn't guaranteed to be pulled — this wrapper applies the cap + # deterministically on every rollout off the cached image. + command = ["bash", "-c", "ulimit -n 65536; exec /entrypoint.sh"] port { name = "http" container_port = 6080