From deede6dd11eb5fb9916b1794f255888a4468b760 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Thu, 4 Jun 2026 05:15:49 +0000 Subject: [PATCH] chrome-service: switch to CDP + persistent profile + hourly snapshot pipeline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The chrome-service stack ran `playwright launch-server`, which creates ephemeral browser contexts per `connect()`. Despite the encrypted PVC mounted at /profile, no chromium user-data ever persisted — only npm cache + fontconfig. Logging in via noVNC was effectively a no-op. Refactor: - Replace launch-server with direct chromium (TCP CDP on :9223 internal), fronted by a Python HTTP+WS bridge on :9222 that rewrites the Host header to bypass Chrome's hardcoded DNS-rebinding protection (no `--remote-allow-hosts` flag exists in stock Chrome 130; verified by binary string grep). Bridge also forces Connection: close on HTTP responses so Node ws opens a fresh TCP for the WS upgrade rather than trying to reuse the dead keep-alive socket. - Add `--user-data-dir=/profile/chromium-data` so cookies/localStorage actually persist on the encrypted PVC. - New snapshot-server sidecar (stdlib python HTTP) serves GET /api/snapshot at chrome.viktorbarzin.me/api/snapshot, bearer-token-gated by the existing api_bearer_token. - New chrome-service-snapshot-harvester CronJob (hourly) connects via CDP, dumps storage_state() (cookies + localStorage), writes atomically to /profile/snapshots/storage-state.json. - NetworkPolicy: TCP/9222 (was :3000), TCP/8088 added for traefik. Caller migration: - f1-stream: `chromium.connect(ws_url)` → `chromium.connect_over_cdp(cdp_url)`, env var CHROME_WS_URL → CHROME_CDP_URL. CHROME_WS_TOKEN dropped (no longer used by code; ExternalSecret kept for symmetry with the snapshot endpoint). 
Dev-box side (out of scope for this commit — see ~/.config/systemd/user/): - playwright-mcp.service flips to `--isolated --storage-state=...` so per-Claude-Code-session ephemeral contexts seed from the snapshot. - playwright-snapshot-refresh.{service,timer} (hourly) pulls the snapshot via the bearer-gated HTTPS endpoint. Docs updated: - docs/architecture/chrome-service.md — new architecture diagram + wire protocol. - docs/runbooks/chrome-service-snapshot.md — day-2 ops (refresh, rotation, failure modes, restore). - stacks/chrome-service/README.md — connect_over_cdp recipe. Design spec at docs/superpowers/specs/2026-06-04-playwright-per-session-browser-design.md. --- docs/architecture/chrome-service.md | 151 +++++-- docs/runbooks/chrome-service-snapshot.md | 211 +++++++++ stacks/chrome-service/README.md | 109 +++-- stacks/chrome-service/files/cdp_bridge.py | 214 +++++++++ .../files/snapshot_harvester.py | 69 +++ .../chrome-service/files/snapshot_server.py | 68 +++ stacks/chrome-service/main.tf | 424 +++++++++++++++--- .../backend/extractors/chrome_browser.py | 26 +- .../files/backend/playback_verifier.py | 39 +- stacks/f1-stream/main.tf | 18 +- 10 files changed, 1152 insertions(+), 177 deletions(-) create mode 100644 docs/runbooks/chrome-service-snapshot.md create mode 100644 stacks/chrome-service/files/cdp_bridge.py create mode 100644 stacks/chrome-service/files/snapshot_harvester.py create mode 100644 stacks/chrome-service/files/snapshot_server.py diff --git a/docs/architecture/chrome-service.md b/docs/architecture/chrome-service.md index 9d8901d3..c5c43326 100644 --- a/docs/architecture/chrome-service.md +++ b/docs/architecture/chrome-service.md @@ -1,16 +1,23 @@ -# chrome-service — In-cluster headed Chromium pool +# chrome-service — In-cluster headed Chromium with persistent profile ## Overview -`chrome-service` is a single-replica, persistent-profile, bearer-token-gated -Playwright **launch-server** that exposes a headed Chromium browser over a -WebSocket. 
Sibling services connect to it instead of running their own -in-process Chromium when the upstream's anti-bot tooling -(`disable-devtool.js` redirect-to-google trap, console-clear timing tricks, -`navigator.webdriver` checks) defeats a headless browser. +`chrome-service` is a single-replica, persistent-profile, headed +Chromium browser exposed over the Chrome DevTools Protocol (CDP). It +serves two distinct populations: -Initial caller: `f1-stream`'s `playback_verifier`. Future callers attach -via the WS+token contract documented in `stacks/chrome-service/README.md`. +1. **In-cluster automation callers** (e.g. `f1-stream`'s + `playback_verifier`, `chrome_browser` extractor) — connect via + `chromium.connect_over_cdp("http://chrome-service.chrome-service.svc:9222")` + to drive a real browser when upstream anti-bot trips a headless one + (`disable-devtool.js` redirect-to-google trap, `navigator.webdriver` + checks, console-clear timing tricks). +2. **External dev-box Claude Code sessions** — pull an hourly snapshot + of cookies + localStorage from `chrome.viktorbarzin.me/api/snapshot` + (bearer-gated) and seed local `@playwright/mcp` instances in + `--isolated --storage-state=…` mode. This is how concurrent Claude + Code sessions get their own isolated browser contexts without losing + shared cookies for logged-in sites. ## Why a separate stack @@ -25,8 +32,8 @@ In-process Chromium inside `f1-stream`: `chrome-service` solves this by: -1. Running **headed** under `Xvfb :99` (via `playwright launch-server` with - a JSON config that pins `headless: false`). +1. Running **headed** under `Xvfb :99` (chromium with `DISPLAY=:99`, + not `--headless`). 2. Living in a long-lived pod so JIT browser launch latency disappears. 3. Allowing a per-context init script (`stacks/chrome-service/files/stealth.js` ~ 40 lines, vendored from @@ -35,25 +42,67 @@ In-process Chromium inside `f1-stream`: to hide the `disable-devtool-auto` script-tag attribute so the lib's IIFE exits early. 
-## Wire protocol +## Wire protocol — CDP (current, since 2026-06-04) ```text - ws://chrome-service.chrome-service.svc.cluster.local:3000/ + http://chrome-service.chrome-service.svc.cluster.local:9222 │ ┌───────────────────────────────┼───────────────────────────────┐ │ caller pod │ chrome-service pod │ (e.g. f1-stream) │ (single replica) │ │ - │ CHROME_WS_URL ──────────────┘ - │ CHROME_WS_TOKEN ─── from `secret/chrome-service.api_bearer_token` (ESO) + │ CHROME_CDP_URL ──────────────┘ │ - │ await chromium.connect(f"{ws}/{token}") - │ await ctx.add_init_script(STEALTH_JS) + │ await chromium.connect_over_cdp(cdp_url) + │ context = await browser.new_context() ← incognito (no cookies) + │ OR: context = browser.contexts[0] ← persistent (shared cookies) + │ await context.add_init_script(STEALTH_JS) │ page.goto("https://upstream.com/embed/...") │ └─── ←── pages render under Xvfb, headed Chromium ──── ─────────┘ ``` +### Wire protocol — WS (legacy, removed 2026-06-04) + +The previous design used `playwright launch-server --browser chromium` +with a path-token (`ws://...:3000/`). Callers used +`chromium.connect(ws_url)`. **Problem**: `launch-server` creates +ephemeral browser contexts per `connect()` call, so cookies never +persisted to the PVC despite the `/profile` mount. We migrated to +direct chromium launch with `--user-data-dir` + CDP exposed on :9222 +so cookies actually live across pod restarts. 
+ +## Cookie warming + snapshot pipeline + +```text +┌─────────── chrome-service pod ──────────────────────────────────────────┐ +│ │ +│ chrome-service container (chromium --user-data-dir=/profile/chromium-data +│ --remote-debugging-port=9222) │ +│ ▲ │ +│ │ user logs in via noVNC ← chrome.viktorbarzin.me (Authentik) │ +│ │ │ +│ Cookies + localStorage land in /profile/chromium-data/Default/ │ +│ │ +│ snapshot-server sidecar (python stdlib HTTP server, :8088) │ +│ ↑ serves /profile/snapshots/storage-state.json (bearer-gated) │ +└──────────────────────────────────────────────────────────────────────────┘ + ▲ + │ hourly (cron 23 * * * *) + │ +┌──────┴── chrome-service-snapshot-harvester CronJob ─────────────────────┐ +│ podAffinity → same node as chrome-service (RWO PVC) │ +│ python: connect_over_cdp + ctx.storage_state(path=...) │ +│ writes /profile/snapshots/storage-state.json (atomic rename) │ +└──────────────────────────────────────────────────────────────────────────┘ + +External caller (dev box): + systemd timer (hourly) → curl -H "Authorization: Bearer $TOKEN" + https://chrome.viktorbarzin.me/api/snapshot + -o ~/.cache/playwright-shared-storage-state.json + @playwright/mcp --isolated --storage-state ~/.cache/...storage-state.json +``` + ## Image pin Both the server image (`mcr.microsoft.com/playwright:v1.48.0-noble` in @@ -62,17 +111,17 @@ Both the server image (`mcr.microsoft.com/playwright:v1.48.0-noble` in minor-versions**. Bump in lockstep — Playwright protocol changes between minors and the client cannot connect to a mismatched server. -The Microsoft image ships only the browser binaries, not the `playwright` -npm SDK; the start command runs `npx -y playwright@1.48.0 launch-server` -which downloads the SDK on first start (cached under `$HOME/.npm` via the -PVC) and reuses it on subsequent restarts. 
+The harvester + snapshot-server sidecar use +`mcr.microsoft.com/playwright/python:v1.48.0-noble` — same playwright +minor, with Python-side bindings pre-installed. ## Storage - **`chrome-service-profile-encrypted`** (PVC, 2Gi → 10Gi autoresize, - `proxmox-lvm-encrypted`) — Chromium user-data dir + npm cache. + `proxmox-lvm-encrypted`) — Chromium user-data dir at + `/profile/chromium-data` + snapshot at `/profile/snapshots/storage-state.json`. Encrypted because cookies/localStorage may include third-party auth tokens - for sites callers drive. `HOME=/profile` so npx caches there. + for sites callers drive. - **`chrome-service-backup-host`** (NFS, RWX) — destination for a 6-hourly CronJob that `tar -czf /backup/.tar.gz -C /profile .`, retention 30 days. @@ -82,41 +131,45 @@ PVC) and reuses it on subsequent restarts. - Vault KV `secret/chrome-service.api_bearer_token` — 32-byte URL-safe random, rotated by hand: `vault kv put secret/chrome-service api_bearer_token=$(python3 -c 'import secrets; print(secrets.token_urlsafe(32))')`. -- ESO syncs into namespace-local Secret `chrome-service-secrets` - (server pod) and `chrome-service-client-secrets` (each caller pod). +- ESO syncs into namespace-local Secret `chrome-service-secrets`. The + `snapshot-server` sidecar reads it via `secret_key_ref`. +- f1-stream still imports the secret (via `chrome-service-client-secrets`) + for parity, but the CDP endpoint no longer requires it for connection — + NetworkPolicy is the gate. - Reloader (`reloader.stakater.com/auto = "true"`) cascades token rotation - to both server and any annotated caller — no manual rollout. + to the snapshot-server sidecar. +- **Dev-box cache**: each dev box keeps a local copy at + `~/.config/playwright/token` (chmod 600). Re-fetch from Vault after + rotation: `vault kv get -field=api_bearer_token secret/chrome-service > ~/.config/playwright/token`. 
## Network controls -- **`kubernetes_network_policy_v1.ws_ingress`** — two separate ingress - rules on the same policy: - - **TCP/3000** (Playwright WS): only namespaces labelled +- **`kubernetes_network_policy_v1.ws_ingress`** — three ingress rules: + - **TCP/9222** (Chromium CDP): only namespaces labelled `chrome-service.viktorbarzin.me/client = "true"` (plus an explicit - fallback for `f1-stream` by `kubernetes.io/metadata.name`). - - **TCP/6080** (noVNC HTTP+WS): only the `traefik` namespace, since - the public-facing path is `chrome.viktorbarzin.me` ingress → - Traefik → sidecar. Authentik forward-auth still gates external - access at the Traefik layer. -- **WS port 3000** is internal-only (no ingress, no Cloudflare DNS). + fallback for `f1-stream` by `kubernetes.io/metadata.name`, plus + `chrome-service`'s own namespace for the harvester CronJob). + - **TCP/6080** (noVNC HTTP+WS): only the `traefik` namespace. + - **TCP/8088** (snapshot-server): only the `traefik` namespace + (bearer-token check happens in `snapshot_server.py`). +- **CDP port 9222** is internal-only (no ingress, no Cloudflare DNS). - **noVNC sidecar** (`forgejo.viktorbarzin.me/viktor/chrome-service-novnc`) exposes a live HTML5 view of the headed Chromium session via `x11vnc` (connected to Xvfb on `localhost:6099`) bridged to `websockify` on port 6080. Service `chrome` maps :80 → :6080 and is exposed via `ingress_factory` at `chrome.viktorbarzin.me`, - Authentik-gated. Both static page and WebSocket upgrade share the - same path — Cloudflare proxy, Cloudflared tunnel, Traefik, and - Authentik forward-auth all preserve `Upgrade: websocket`. + Authentik-gated. +- **snapshot-server sidecar** (`mcr.microsoft.com/playwright/python:v1.48.0-noble`) + serves `GET /api/snapshot` from `/profile/snapshots/storage-state.json`, + bearer-gated by `PW_TOKEN`. 
Service `chrome-snapshot` maps :8088 → :8088 + and is exposed at `chrome.viktorbarzin.me/api/snapshot` via a second + `ingress_factory` call with `auth = "none"` (the bearer check is in + the sidecar, not at the ingress layer). -## Adding a new caller +## Adding a new in-cluster caller -See `stacks/chrome-service/README.md` for the four-step recipe: - -1. Label the caller's namespace. -2. Add an `ExternalSecret` pulling `secret/chrome-service`. -3. Inject `CHROME_WS_URL` + `CHROME_WS_TOKEN` env vars. -4. Vendor `stealth.js` and apply via `await context.add_init_script(...)` - after every `new_context()`. +See `stacks/chrome-service/README.md` for the recipe (label namespace, +inject `CHROME_CDP_URL`, vendor `stealth.js`). ## Limits + risks @@ -134,3 +187,9 @@ See `stacks/chrome-service/README.md` for the four-step recipe: - **No `/metrics` endpoint** — the cluster's generic `KubePodCrashLooping` rule covers basic alerting. A Prometheus scrape exporter is day-2 work. +- **Snapshot covers cookies + localStorage only** — Playwright's + `storage_state()` API doesn't capture IndexedDB or sessionStorage. + Sites that rely on those for auth won't warm via the snapshot. +- **Snapshot freshness up to 1h stale** — if a site rotates session + cookies more often than that, an on-demand refresh CLI is needed + (deferred to follow-on). diff --git a/docs/runbooks/chrome-service-snapshot.md b/docs/runbooks/chrome-service-snapshot.md new file mode 100644 index 00000000..ab065503 --- /dev/null +++ b/docs/runbooks/chrome-service-snapshot.md @@ -0,0 +1,211 @@ +# Runbook — chrome-service snapshot pipeline + +Operational playbook for the hourly cookie-snapshot pipeline that warms +external Claude Code sessions on the dev box. Architecture in +`architecture/chrome-service.md`. 
+ +## At a glance + +| Component | Where | When | What | +|---|---|---|---| +| chrome-service Deployment | `chrome-service` ns | always-on | headed chromium, CDP :9222, persistent /profile/chromium-data | +| snapshot-server sidecar | same pod | always-on | serves `/api/snapshot`, bearer-gated, port 8088 | +| snapshot-harvester CronJob | `chrome-service` ns | `23 * * * *` | dumps `storage_state()` via CDP → `/profile/snapshots/storage-state.json` | +| dev-box refresh timer | each dev box | hourly | curls `chrome.viktorbarzin.me/api/snapshot` → `~/.cache/playwright-shared-storage-state.json` | +| dev-box `playwright-mcp.service` | each dev box | always-on | `@playwright/mcp --isolated --storage-state=…` per-MCP-connection contexts | + +## Day-to-day + +### Log into a new site (warm the profile) + +1. Open `https://chrome.viktorbarzin.me/` (Authentik will gate). +2. The noVNC view of the in-cluster headed chromium loads. Click on the + browser window, navigate, log in. +3. Cookies land in `/profile/chromium-data/Default/Cookies` on the PVC. +4. Within ≤60 min, the snapshot-harvester CronJob picks them up and + writes the snapshot. Within ≤60 min after that, dev boxes pull the + new file. New Claude Code sessions see the new cookies. +5. To skip the wait: trigger the harvester now (next section). + +### Trigger snapshot harvester manually + +```bash +kubectl -n chrome-service create job \ + --from=cronjob/chrome-service-snapshot-harvester \ + snapshot-harvest-$(date +%s) + +# Watch logs +kubectl -n chrome-service logs -f -l job-name=$(kubectl -n chrome-service get jobs -o name | tail -1 | cut -d/ -f2) +``` + +Expected: `wrote snapshot (… bytes) to /profile/snapshots/storage-state.json`. 
+ +### Trigger dev-box refresh manually + +```bash +# On the dev box, as the user whose Claude Code sessions need the new state: +systemctl --user start playwright-snapshot-refresh.service + +# Or directly: +/usr/local/bin/playwright-snapshot-refresh + +# Verify +ls -la ~/.cache/playwright-shared-storage-state.json +``` + +### Inspect the current snapshot + +```bash +# In-cluster (from any pod with kubectl exec into the chrome-service pod): +kubectl -n chrome-service exec deploy/chrome-service -c snapshot-server -- \ + cat /profile/snapshots/storage-state.json | jq '.cookies | length' + +# Externally (via the bearer-gated endpoint): +TOKEN=$(vault kv get -field=api_bearer_token secret/chrome-service) +curl -fsSL -H "Authorization: Bearer $TOKEN" \ + https://chrome.viktorbarzin.me/api/snapshot | jq '.cookies | length' +``` + +## Failure modes + +### "no browser contexts found" + +The harvester reports `no browser contexts found — chrome-service may +not have launched a persistent context yet` and exits non-zero. + +**Cause**: chromium just started and hasn't created its default context +yet, or it crashed. + +**Fix**: check chrome-service pod logs (`kubectl -n chrome-service logs +deploy/chrome-service -c chrome-service`). The next hourly run will +retry. If chromium is wedged: `kubectl -n chrome-service rollout restart +deploy/chrome-service` (strategy = Recreate, brief downtime). + +### "connect_over_cdp failed" + +Harvester or any in-cluster caller can't reach the CDP endpoint. + +**Cause**: chrome-service pod not Ready, NetworkPolicy doesn't admit +the caller's namespace, or chromium isn't listening on :9222. + +**Diagnose**: +```bash +kubectl -n chrome-service get pods +kubectl -n chrome-service describe networkpolicy chrome-service-ws-ingress + +# From inside the cluster (e.g. 
a debug pod in chrome-service ns): +nc -zv chrome-service.chrome-service.svc.cluster.local 9222 +curl -fsSL http://chrome-service.chrome-service.svc.cluster.local:9222/json/version +``` + +**Fix**: depends on the diagnosis. NetworkPolicy needs the caller's +namespace label or an explicit name-fallback. If chromium isn't +binding, check the container logs. + +### Dev-box `playwright-snapshot-refresh` returns 401 + +The bearer token in `~/.config/playwright/token` doesn't match the +server's. Almost always means the Vault secret was rotated and the +local cache is stale. + +**Fix**: +```bash +vault login -method=oidc # if needed +vault kv get -field=api_bearer_token secret/chrome-service > ~/.config/playwright/token +chmod 600 ~/.config/playwright/token +systemctl --user start playwright-snapshot-refresh.service +``` + +### Dev-box `playwright-snapshot-refresh` returns 404 with "snapshot not yet available" + +The harvester hasn't run successfully yet (fresh cluster, or all +recent runs failed). Trigger it manually (see "Trigger snapshot +harvester manually"). + +### Claude Code sessions still see old cookies + +The MCP server reads the snapshot file at process start and seeds each +new context with it. **Existing MCP sessions don't hot-reload** — they +keep the cookies they were seeded with at session start. New sessions +get the fresh snapshot. + +**Fix**: restart the MCP server on the dev box to pick up the new file: +```bash +systemctl --user restart playwright-mcp.service +``` + +### Snapshot file is suspiciously small or empty cookies array + +The persistent chromium context isn't holding any cookies. Probably +means the user hasn't logged into anything via noVNC, or chromium was +relaunched without preserving `/profile/chromium-data`. 
+ +**Diagnose**: +```bash +kubectl -n chrome-service exec deploy/chrome-service -c chrome-service -- \ + ls -la /profile/chromium-data/Default/Cookies +``` + +A populated `Cookies` SQLite file should be several hundred KB once +real logins exist. If it's missing or empty, log in via noVNC. + +## Token rotation + +```bash +# Rotate Vault secret (32-byte URL-safe random). +vault kv put secret/chrome-service \ + api_bearer_token=$(python3 -c 'import secrets; print(secrets.token_urlsafe(32))') + +# Reloader auto-restarts chrome-service pod (snapshot-server picks up new token). + +# On EVERY dev box that pulls the snapshot: +vault kv get -field=api_bearer_token secret/chrome-service > ~/.config/playwright/token +chmod 600 ~/.config/playwright/token + +# Verify the next refresh succeeds: +systemctl --user start playwright-snapshot-refresh.service +journalctl --user -u playwright-snapshot-refresh.service -n 20 +``` + +## Restore from a backup tarball + +The 6-hourly backup CronJob writes `tar -czf /backup/YYYY_MM_DD_HH.tar.gz +-C /profile .` to NFS at `/srv/nfs/chrome-service-backup/`. To restore +the entire profile: + +```bash +# 1. Scale chrome-service down so its lock is released. +kubectl -n chrome-service scale deploy/chrome-service --replicas=0 + +# 2. Mount the PVC in a helper pod and restore. +kubectl -n chrome-service apply -f - < storage-state.json + +# Use the snapshot with @playwright/mcp: +npx @playwright/mcp@latest --port 8931 --host localhost \ + --headless --browser chrome \ + --isolated --storage-state ./storage-state.json +``` + +The snapshot is refreshed hourly by the `chrome-service-snapshot-harvester` +CronJob (schedule `23 * * * *`) which calls `context.storageState()` via +the CDP endpoint and writes to `/profile/snapshots/storage-state.json` +(atomic rename). The `snapshot-server` sidecar serves that file. + +## Add a new in-cluster caller 1. 
**Label the caller's namespace** so the chrome-service NetworkPolicy admits it: @@ -39,27 +65,16 @@ which ESO syncs into a per-namespace K8s Secret in each caller stack } } ``` -2. **Add an ExternalSecret** in the caller stack pulling the token: +2. **Inject `CHROME_CDP_URL`** into the caller's pod env: ```hcl - resource "kubernetes_manifest" "chrome_token" { - manifest = { - apiVersion = "external-secrets.io/v1beta1" - kind = "ExternalSecret" - metadata = { name = "chrome-service-client-secrets", namespace = "" } - spec = { - refreshInterval = "15m" - secretStoreRef = { name = "vault-kv", kind = "ClusterSecretStore" } - target = { name = "chrome-service-client-secrets" } - dataFrom = [{ extract = { key = "chrome-service" } }] - } - } + env { + name = "CHROME_CDP_URL" + value = "http://chrome-service.chrome-service.svc.cluster.local:9222" } ``` -3. **Inject `CHROME_WS_URL` + `CHROME_WS_TOKEN`** into the caller's pod env. - Use `secret_key_ref` for the token; the URL is a plain value. -4. **Vendor `stealth.js`** into the caller (or just paste — it's ~40 lines) - and apply via `await context.add_init_script(STEALTH_JS)` after every - `new_context()`. Without it, hmembeds-class anti-bot still trips. +3. **Vendor `stealth.js`** into the caller (or just paste — it's ~40 + lines) and apply via `await context.add_init_script(STEALTH_JS)` after + every `new_context()`. Without it, hmembeds-class anti-bot still trips. ## Image pin @@ -70,17 +85,20 @@ between minors. ## Operations -- **Storage**: encrypted PVC at `/profile` for cookies + npm cache. Ephemeral - contexts (`browser.new_context()`) bypass the profile; persistent contexts - share it. Backed up tar+gzip every 6h to `/srv/nfs/chrome-service-backup/`, +- **Storage**: encrypted PVC at `/profile`. Chromium user-data-dir lives + at `/profile/chromium-data` — cookies + localStorage + IndexedDB + persist here. Snapshots at `/profile/snapshots/storage-state.json`. 
+ Backed up tar+gzip every 6h to `/srv/nfs/chrome-service-backup/`, 30-day retention. -- **Probes**: TCP/3000. Playwright run-server has no HTTP `/health`; a TCP - open is the only liveness signal available without spinning a browser. -- **Health page**: visit `https://chrome.viktorbarzin.me` (Authentik-gated) - to confirm the pod is up. The WS port stays internal-only. +- **Probes**: TCP/9222. Chrome's CDP serves `/json/version` once it's + bound; TCP-open is enough for readiness. +- **Health page**: visit `https://chrome.viktorbarzin.me` (Authentik- + gated) to confirm the pod is up and to log into sites. The CDP port + stays internal-only. - **Token rotation**: `vault kv put secret/chrome-service api_bearer_token=$(python3 -c 'import secrets; print(secrets.token_urlsafe(32))')`. - Reloader cascades the rotation to both the server pod and any caller - whose secret has the `reloader.stakater.com/auto = "true"` annotation. + Reloader cascades to the snapshot-server sidecar. Update the cached + token on any dev box that pulls the snapshot: + `vault kv get -field=api_bearer_token secret/chrome-service > ~/.config/playwright/token`. ## Why headed (Xvfb) instead of headless? @@ -88,3 +106,14 @@ between minors. console-clear timing, and the `HeadlessChromium/...` user-agent suffix. Running headed inside `Xvfb :99` reports as a normal Chromium, and the stealth init script handles the JS-visible giveaways. + +## Why direct chromium (CDP) instead of `playwright launch-server`? + +`playwright launch-server` creates ephemeral browser contexts per +`connect()` call — cookies and localStorage never persist to the PVC. +The `/profile` mount only ever held npm cache + fontconfig cache +despite the original docs claiming it held "cookies, localStorage, +IndexedDB". 
Switched 2026-06-04 to direct chromium launch with +`--user-data-dir=/profile/chromium-data --remote-debugging-port=9222` +so the persistent profile actually persists, and callers migrate +`chromium.connect(ws_url)` → `chromium.connect_over_cdp(cdp_url)`. diff --git a/stacks/chrome-service/files/cdp_bridge.py b/stacks/chrome-service/files/cdp_bridge.py new file mode 100644 index 00000000..65d2cf5c --- /dev/null +++ b/stacks/chrome-service/files/cdp_bridge.py @@ -0,0 +1,214 @@ +#!/usr/bin/env python3 +"""CDP-aware proxy: 0.0.0.0:9222 → 127.0.0.1:9223 with Host header rewriting. + +Why this exists: + Stock Chrome binaries silently ignore --remote-debugging-address (the flag is + gated by a build-time switch most distributions don't set), so CDP always + binds 127.0.0.1:<port>. Worse, Chrome enforces DNS rebinding protection on + the HTTP DevTools endpoint: any Host header that isn't `localhost`, + `127.0.0.1`, or `[::1]` returns 500 "Host header is specified and is not an + IP address or localhost". There is no `--remote-allow-hosts` flag in stock + Chrome 130 (verified by binary string search). + + This means a raw TCP forwarder doesn't work — clients hitting the K8s + Service DNS get 500 because Chrome rejects the Host header. + +What this script does: + - Listens on 0.0.0.0:9222 (the public CDP port the K8s Service exposes). + - For each TCP connection from a CDP client: + 1. Read the HTTP request line + headers. + 2. Rewrite `Host: <original>` to `Host: localhost:9222`, remembering + the original value (for response rewriting). + 3. Open a connection to Chrome at 127.0.0.1:9223 and forward the + modified request line + headers + body. + 4. Read Chrome's HTTP response. If it's 101 Switching Protocols + (WebSocket upgrade), forward it as-is and switch to raw byte piping + in both directions (CDP frames are binary, no further parsing). + 5. Otherwise it's a regular HTTP/JSON response.
Substitute + `localhost:9222` (the URL Chrome composed from the rewritten Host) + back to the client's original Host header value. Forward. + - The Microsoft playwright image ships python3 but not socat, hence this + stdlib-only helper. + +Limitations: + - Only HTTP/1.x supported (CDP doesn't use HTTP/2). + - Non-WS response bodies are read up to Content-Length only; chunked + transfer encoding is not handled (CDP /json/* responses don't use it). + - No SSL/TLS — the cluster network is the trust boundary. +""" + +import os +import socket +import sys +import threading + + +LISTEN_ADDR = os.environ.get("BRIDGE_LISTEN_ADDR", "0.0.0.0") +LISTEN_PORT = int(os.environ.get("BRIDGE_LISTEN_PORT", "9222")) +TARGET_ADDR = os.environ.get("BRIDGE_TARGET_ADDR", "127.0.0.1") +TARGET_PORT = int(os.environ.get("BRIDGE_TARGET_PORT", "9223")) +INTERNAL_HOST = f"localhost:{LISTEN_PORT}" + + +def recv_until(sock: socket.socket, marker: bytes, max_bytes: int = 65536) -> bytes: + """Read from sock until marker is seen or max_bytes hit. Returns everything read.""" + buf = b"" + while marker not in buf and len(buf) < max_bytes: + chunk = sock.recv(4096) + if not chunk: + break + buf += chunk + return buf + + +def rewrite_host(headers: bytes, new_host: str) -> tuple[bytes, str | None]: + """Replace the Host header.
def pipe(src: socket.socket, dst: socket.socket) -> None:
    """Relay bytes from *src* to *dst* until EOF or a socket error.

    Used for each direction of a connection once the WebSocket upgrade has
    completed. On exit the read half of *src* and the write half of *dst*
    are shut down so the opposite direction sees EOF. Neither socket is
    closed here; ``handle`` owns and closes both.
    """
    try:
        while data := src.recv(65536):
            dst.sendall(data)
    except OSError:
        # Best-effort relay: a reset on either side simply ends this direction.
        pass
    finally:
        for sock, how in ((src, socket.SHUT_RD), (dst, socket.SHUT_WR)):
            try:
                sock.shutdown(how)
            except OSError:
                pass


def handle(client: socket.socket) -> None:
    """Serve one client connection by forwarding a single request upstream.

    The request head is read from *client*, its ``Host`` header is rewritten
    to ``INTERNAL_HOST`` and the request is sent to
    ``TARGET_ADDR:TARGET_PORT``. A ``101`` response turns the connection into
    a raw bidirectional relay (see ``pipe``). Any other response is buffered,
    has ``INTERNAL_HOST`` replaced by the caller's original host in its body,
    and is returned with a recomputed ``Content-Length`` and
    ``Connection: close``.

    Any exception is logged to stderr; both sockets are always closed.
    """
    upstream: socket.socket | None = None
    try:
        # Read until end-of-headers.
        head_buf = recv_until(client, b"\r\n\r\n")
        if b"\r\n\r\n" not in head_buf:
            return
        head, tail = head_buf.split(b"\r\n\r\n", 1)
        new_head, original_host = rewrite_host(head, INTERNAL_HOST)

        upstream = socket.create_connection((TARGET_ADDR, TARGET_PORT), timeout=5)
        # `create_connection(timeout=5)` sets the socket's timeout to 5s,
        # which then applies to all subsequent recv() calls too. After a WS
        # upgrade either side can stay silent for minutes — leave timeouts
        # off so the pipe doesn't blow up the connection on idle.
        upstream.settimeout(None)
        upstream.sendall(new_head + b"\r\n\r\n" + tail)

        # Read response headers from upstream.
        resp_head_buf = recv_until(upstream, b"\r\n\r\n")
        if b"\r\n\r\n" not in resp_head_buf:
            return
        resp_head, resp_tail = resp_head_buf.split(b"\r\n\r\n", 1)
        first_line = resp_head.split(b"\r\n", 1)[0].decode("latin-1", errors="replace")

        # Match any 101 status (Chrome's CDP says "101 WebSocket Protocol
        # Handshake", not the canonical "101 Switching Protocols"). Sniff the
        # status code from the first line, e.g. "HTTP/1.1 101 ...".
        parts = first_line.split(" ", 2)
        status_code = parts[1] if len(parts) >= 2 else ""

        if status_code == "101":
            # WS upgrade. Forward as-is and start raw pipe.
            client.sendall(resp_head + b"\r\n\r\n" + resp_tail)
            relays = [
                threading.Thread(target=pipe, args=(client, upstream), daemon=True),
                threading.Thread(target=pipe, args=(upstream, client), daemon=True),
            ]
            for relay in relays:
                relay.start()
            for relay in relays:
                relay.join()
            return

        # Regular HTTP response. Determine body length (Content-Length only —
        # CDP doesn't use chunked encoding for /json/* endpoints) and rewrite.
        content_length = 0
        for line in resp_head.split(b"\r\n"):
            if line.lower().startswith(b"content-length:"):
                try:
                    content_length = int(line.split(b":", 1)[1].strip())
                except ValueError:
                    pass
                break

        body = resp_tail
        while len(body) < content_length:
            chunk = upstream.recv(65536)
            if not chunk:
                break
            body += chunk
        # Truncate any extra bytes that came past content_length (shouldn't
        # happen with stock chrome but defensive against pipelined responses).
        if content_length and len(body) > content_length:
            body = body[:content_length]

        # Rewrite the URLs Chrome composed using its localhost Host so callers
        # can follow them back through this bridge. Latin-1 is used on both
        # sides because that is how the Host header was decoded/encoded when
        # the request was rewritten, so arbitrary header bytes round-trip.
        if original_host:
            body = body.replace(
                INTERNAL_HOST.encode("latin-1"), original_host.encode("latin-1")
            )

        # Rebuild response headers: drop any existing Content-Length / Connection
        # header and force `Connection: close` + the new Content-Length. This
        # keeps the bridge one-request-per-connection (no keep-alive); avoids a
        # whole class of upstream/downstream desync issues, especially because
        # Node's ws library will open a fresh TCP for the WS upgrade rather
        # than trying to reuse the HTTP probe's connection.
        new_lines = [
            line
            for line in resp_head.split(b"\r\n")
            if not line.lower().startswith((b"content-length:", b"connection:"))
        ]
        new_lines.append(f"Content-Length: {len(body)}".encode())
        new_lines.append(b"Connection: close")
        resp_head = b"\r\n".join(new_lines)

        client.sendall(resp_head + b"\r\n\r\n" + body)
    except Exception as e:
        # Top-level boundary of a per-connection thread: log and drop.
        sys.stderr.write(f"[cdp-bridge] handle error: {e}\n")
    finally:
        try:
            client.close()
        except OSError:
            pass
        if upstream is not None:
            try:
                upstream.close()
            except OSError:
                pass


def main() -> int:
    """Accept connections on ``LISTEN_ADDR:LISTEN_PORT`` forever.

    Each accepted connection is served by ``handle`` on its own daemon
    thread. A failing ``accept()`` (e.g. the peer reset before we accepted,
    or the process briefly ran out of file descriptors) is logged and
    retried after a short pause instead of terminating the bridge.
    """
    import time  # local import: only needed for the accept() back-off

    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as listener:
        listener.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        listener.bind((LISTEN_ADDR, LISTEN_PORT))
        listener.listen(64)
        sys.stderr.write(
            f"[cdp-bridge] HTTP-aware proxy listening on {LISTEN_ADDR}:{LISTEN_PORT} → "
            f"{TARGET_ADDR}:{TARGET_PORT} (rewriting Host → {INTERNAL_HOST})\n"
        )
        while True:
            try:
                client, _ = listener.accept()
            except OSError as e:
                sys.stderr.write(f"[cdp-bridge] accept error: {e}\n")
                time.sleep(0.1)  # avoid a hot loop on persistent errors
                continue
            threading.Thread(target=handle, args=(client,), daemon=True).start()
+""" + +import asyncio +import logging +import os +import pathlib +import sys + +logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") +log = logging.getLogger("snapshot-harvester") + +CDP_URL = os.environ.get( + "CDP_URL", "http://chrome-service.chrome-service.svc.cluster.local:9222" +) +SNAPSHOT_DIR = pathlib.Path(os.environ.get("SNAPSHOT_DIR", "/profile/snapshots")) +SNAPSHOT_FILE = SNAPSHOT_DIR / "storage-state.json" +TMP_FILE = SNAPSHOT_DIR / "storage-state.json.tmp" + + +async def main() -> int: + try: + from playwright.async_api import async_playwright + except ImportError: + log.error("playwright not installed in image") + return 2 + + SNAPSHOT_DIR.mkdir(parents=True, exist_ok=True) + + async with async_playwright() as p: + try: + browser = await p.chromium.connect_over_cdp(CDP_URL, timeout=20_000) + except Exception: + log.exception("connect_over_cdp failed (%s)", CDP_URL) + return 3 + + try: + contexts = browser.contexts + if not contexts: + log.error("no browser contexts found — chrome-service may not have launched a persistent context yet") + return 4 + ctx = contexts[0] + # storage_state writes cookies + localStorage to a JSON file. + # IndexedDB and sessionStorage are NOT included (known Playwright limitation). + await ctx.storage_state(path=str(TMP_FILE)) + os.replace(TMP_FILE, SNAPSHOT_FILE) + size = SNAPSHOT_FILE.stat().st_size + log.info("wrote snapshot (%d bytes) to %s", size, SNAPSHOT_FILE) + finally: + try: + await browser.close() + except Exception: + pass + + return 0 + + +if __name__ == "__main__": + sys.exit(asyncio.run(main())) diff --git a/stacks/chrome-service/files/snapshot_server.py b/stacks/chrome-service/files/snapshot_server.py new file mode 100644 index 00000000..c6c6f801 --- /dev/null +++ b/stacks/chrome-service/files/snapshot_server.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python3 +"""Tiny HTTP server that exposes /api/snapshot, gated by a bearer token. + +Runs as a sidecar in the chrome-service pod. 
class Handler(BaseHTTPRequestHandler):
    """Serve ``/healthz`` and the bearer-gated ``/api/snapshot`` endpoint.

    ``/api/snapshot`` returns the bytes of ``SNAPSHOT_PATH`` to callers whose
    ``Authorization`` header equals ``Bearer <TOKEN>``.
    """

    server_version = "chrome-snapshot/1"

    @staticmethod
    def _authorized(header: str, token: "str | None") -> bool:
        """Return True when *header* is exactly ``Bearer <token>``.

        An unset or empty *token* never authorizes. The comparison runs in
        constant time so response timing does not leak how much of a guessed
        token matched.
        """
        import hmac  # local import: keeps this block self-contained

        if not token:
            return False
        expected = f"Bearer {token}".encode("utf-8")
        return hmac.compare_digest(header.encode("utf-8", "replace"), expected)

    def _short(self, status: int, body: bytes = b"") -> None:
        """Send a minimal response with *status* and an optional *body*."""
        self.send_response(status)
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        if body:
            self.wfile.write(body)

    def do_GET(self):
        """Route GET requests; see the class docstring for the endpoints."""
        if self.path == "/healthz":
            self._short(200, b"ok\n")
            return
        if self.path != "/api/snapshot":
            self._short(404)
            return
        # An empty PW_TOKEN is treated like a missing one: refuse to serve
        # rather than accept a token-less "Bearer " header.
        if not TOKEN:
            self._short(503, b"{\"error\":\"token not configured\"}\n")
            return
        if not self._authorized(self.headers.get("Authorization", ""), TOKEN):
            self._short(401, b"{\"error\":\"invalid bearer\"}\n")
            return
        try:
            with open(SNAPSHOT_PATH, "rb") as f:
                data = f.read()
        except FileNotFoundError:
            self._short(404, b"{\"error\":\"snapshot not yet available\"}\n")
            return
        except OSError:
            # Permission or I/O trouble: answer explicitly instead of
            # dropping the connection with an unhandled exception.
            self._short(500, b"{\"error\":\"snapshot unreadable\"}\n")
            return
        self.send_response(200)
        self.send_header("Content-Type", "application/json")
        self.send_header("Cache-Control", "no-cache")
        self.send_header("Content-Length", str(len(data)))
        self.end_headers()
        self.wfile.write(data)

    def log_message(self, fmt, *args):
        """Write access-log lines to stderr with a ``[snapshot-server]`` prefix."""
        sys.stderr.write(
            "[snapshot-server] %s - %s\n" % (self.address_string(), fmt % args)
        )
b/stacks/chrome-service/main.tf index 97de97f2..7a063bb6 100644 --- a/stacks/chrome-service/main.tf +++ b/stacks/chrome-service/main.tf @@ -10,9 +10,14 @@ locals { app = "chrome-service" } # Pin to the same Playwright minor that the Python client requires. - # If you bump this image, also bump `playwright==X.Y.Z` in the client - # (currently f1-stream) and re-run the connect smoke test. + # If you bump this image, also bump `playwright==X.Y.Z` in callers' + # requirements (currently f1-stream, snapshot-harvester) and re-run the + # connect smoke test. Image ships chromium under /ms-playwright/. image = "mcr.microsoft.com/playwright:v1.48.0-noble" + # Python image for the snapshot-harvester CronJob and the snapshot-server + # sidecar (the latter just runs a 60-line stdlib HTTP server). + python_image = "mcr.microsoft.com/playwright/python:v1.48.0-noble" + snapshot_dir = "/profile/snapshots" } # --- Namespace --- @@ -24,7 +29,7 @@ resource "kubernetes_namespace" "chrome_service" { "istio-injection" = "disabled" tier = local.tiers.aux "chrome-service.viktorbarzin.me/server" = "true" - "keel.sh/enrolled" = "true" + "keel.sh/enrolled" = "true" } } lifecycle { @@ -177,42 +182,77 @@ resource "kubernetes_deployment" "chrome_service" { image = local.image image_pull_policy = "IfNotPresent" - # `launch-server` (not `run-server`) lets us pin headed mode + - # specific args. `run-server` defaults to headless, which the - # disable-devtool.js Performance detector trips under Playwright - # (CDP adds latency to console.log; lib detects + redirects). - # The Microsoft image ships only the browsers, not the playwright - # npm package itself — `npx -y playwright@` downloads it on - # first start (cached under $HOME/.npm via the PVC) and pins to - # the same minor as the Python client. Bump in lockstep. + # Direct chromium launch (NOT `playwright launch-server`). 
Reason: + # launch-server creates ephemeral browser contexts per `connect()` + # call, so cookies/localStorage never persist to the PVC — the + # `/profile` mount only ever held npm cache + fontconfig. + # Replaced 2026-06-04 with a CDP+persistent-profile model so the + # warm browser (where Viktor logs in via noVNC) keeps cookies, and + # the hourly snapshot-harvester CronJob can dump them via the + # CDP endpoint. Callers migrate `chromium.connect()` → + # `chromium.connect_over_cdp()` (see f1-stream's playback_verifier). + # + # --remote-debugging-port=9222 : TCP CDP (vs default pipe). + # --remote-debugging-address=0.0.0.0 : bind on all pod IFs; + # NetworkPolicy is the gate. + # --remote-allow-origins=* : Chrome 111+ requires for + # non-loopback CDP origins. + # --user-data-dir=/profile/chromium-data: persistent profile on + # the encrypted PVC. command = ["bash", "-c"] args = [ <<-EOT set -e - # `-listen tcp` enables localhost:6099 so the noVNC sidecar can - # connect over the pod's shared network namespace (Ubuntu 24.04 - # defaults Xvfb to -nolisten tcp). - # `-ac` disables X access control so the noVNC sidecar can - # attach without an MIT-MAGIC-COOKIE; safe because Xvfb only - # listens on localhost (pod's lo). + # Locate chromium in the Microsoft image. The path is + # /ms-playwright/chromium-XXXX/chrome-linux/chrome where XXXX + # is the playwright-pinned build; resolve at runtime so a minor + # bump of the image doesn't break the launch line. + CHROMIUM=$(find /ms-playwright -maxdepth 4 -name 'chrome' -type f -executable -path '*/chrome-linux/*' 2>/dev/null | head -1) + if [ -z "$CHROMIUM" ]; then + echo "ERROR: chromium binary not found under /ms-playwright" >&2 + exit 1 + fi + echo "[chrome-service] using chromium: $CHROMIUM" + + # -listen tcp enables localhost:6099 so the noVNC sidecar can + # attach over the pod's shared network ns (Ubuntu 24.04 + # defaults Xvfb to -nolisten tcp). 
-ac disables X access + # control; safe because Xvfb only listens on the pod's lo. Xvfb :99 -screen 0 1280x720x24 -listen tcp -ac & sleep 1 - cat > /tmp/launch.json < regardless of what we + # pass. The K8s liveness/readiness probe + cluster callers reach + # the pod via its pod-IP, never localhost. + # Fix: chromium listens on 127.0.0.1:9223 (hidden internal port), + # cdp_bridge.py listens on 0.0.0.0:9222 (the public CDP port) and + # transparently forwards. K8s Service, probes, NetworkPolicy all + # stay on 9222 — no caller-side changes needed. + # (Microsoft playwright image ships python3 but not socat, so the + # bridge is a tiny stdlib script — see files/cdp_bridge.py.) + python3 /scripts/cdp_bridge.py & + BRIDGE_PID=$! + trap "kill $BRIDGE_PID 2>/dev/null" EXIT + + exec "$CHROMIUM" \ + --remote-debugging-port=9223 \ + --remote-allow-origins=* \ + --user-data-dir=/profile/chromium-data \ + --no-sandbox \ + --no-first-run \ + --no-default-browser-check \ + --disable-blink-features=AutomationControlled \ + --disable-features=IsolateOrigins,site-per-process \ + --autoplay-policy=no-user-gesture-required \ + --disable-dev-shm-usage \ + --password-store=basic \ + --use-mock-keychain \ + about:blank EOT ] @@ -224,36 +264,28 @@ resource "kubernetes_deployment" "chrome_service" { name = "HOME" value = "/profile" } - env { - name = "PW_TOKEN" - value_from { - secret_key_ref { - name = "chrome-service-secrets" - key = "api_bearer_token" - } - } - } port { - name = "ws" - container_port = 3000 + name = "cdp" + container_port = 9222 protocol = "TCP" } - # Playwright run-server exposes only the WS endpoint; no /health. + # Chrome's CDP endpoint serves /json/version once it's bound; + # TCP-open is enough for readiness. 
liveness_probe { - tcp_socket { port = 3000 } + tcp_socket { port = 9222 } initial_delay_seconds = 30 period_seconds = 30 failure_threshold = 3 } readiness_probe { - tcp_socket { port = 3000 } + tcp_socket { port = 9222 } initial_delay_seconds = 10 period_seconds = 10 } startup_probe { - tcp_socket { port = 3000 } + tcp_socket { port = 9222 } period_seconds = 5 failure_threshold = 24 # up to 2 minutes } @@ -266,6 +298,13 @@ resource "kubernetes_deployment" "chrome_service" { name = "dshm" mount_path = "/dev/shm" } + # /scripts/cdp_bridge.py provides the 0.0.0.0:9222 → 127.0.0.1:9223 + # TCP forwarder (see entrypoint comment above for why). + volume_mount { + name = "scripts" + mount_path = "/scripts" + read_only = true + } resources { requests = { @@ -280,8 +319,8 @@ resource "kubernetes_deployment" "chrome_service" { # noVNC sidecar — exposes a live HTML5 view of the headed Chromium # session via x11vnc + websockify, gated by the Authentik-protected - # ingress at chrome.viktorbarzin.me. WS port 3000 (the Playwright - # endpoint) stays internal-only. + # ingress at chrome.viktorbarzin.me. CDP port 9222 (the new + # Playwright endpoint) stays internal-only. container { name = "novnc" # Phase 3 cutover 2026-05-07 — Forgejo registry consolidation. @@ -301,6 +340,75 @@ resource "kubernetes_deployment" "chrome_service" { } } + # snapshot-server sidecar — serves the hourly storage-state.json + # snapshot (written by the snapshot-harvester CronJob to the same + # PVC) over an HTTP endpoint, bearer-gated by PW_TOKEN. Mounted + # behind Traefik at chrome.viktorbarzin.me/api/snapshot with + # auth=none; the bearer check inside this server is the gate. + # Source: files/snapshot_server.py — 60 lines, stdlib only. 
+ container { + name = "snapshot-server" + image = local.python_image + image_pull_policy = "IfNotPresent" + command = ["python3", "/scripts/snapshot_server.py"] + + env { + name = "PW_TOKEN" + value_from { + secret_key_ref { + name = "chrome-service-secrets" + key = "api_bearer_token" + } + } + } + env { + name = "SNAPSHOT_PATH" + value = "${local.snapshot_dir}/storage-state.json" + } + env { + name = "PORT" + value = "8088" + } + + port { + name = "snap" + container_port = 8088 + protocol = "TCP" + } + liveness_probe { + http_get { + path = "/healthz" + port = 8088 + } + initial_delay_seconds = 5 + period_seconds = 30 + } + readiness_probe { + http_get { + path = "/healthz" + port = 8088 + } + initial_delay_seconds = 2 + period_seconds = 10 + } + + volume_mount { + name = "profile" + mount_path = "/profile" + read_only = true + } + volume_mount { + name = "scripts" + mount_path = "/scripts" + read_only = true + } + + resources { + requests = { cpu = "5m", memory = "32Mi" } + limits = { memory = "96Mi" } + } + } + volume { name = "profile" persistent_volume_claim { @@ -314,6 +422,13 @@ resource "kubernetes_deployment" "chrome_service" { size_limit = "256Mi" } } + volume { + name = "scripts" + config_map { + name = kubernetes_config_map_v1.snapshot_scripts.metadata[0].name + default_mode = "0555" + } + } } } } @@ -334,8 +449,27 @@ resource "kubernetes_deployment" "chrome_service" { } } +# --- ConfigMap: sidecar + harvester scripts --- +resource "kubernetes_config_map_v1" "snapshot_scripts" { + metadata { + name = "snapshot-scripts" + namespace = kubernetes_namespace.chrome_service.metadata[0].name + labels = local.labels + } + data = { + "snapshot_server.py" = file("${path.module}/files/snapshot_server.py") + "snapshot_harvester.py" = file("${path.module}/files/snapshot_harvester.py") + # Tiny TCP forwarder used by chrome-service container to bridge + # 0.0.0.0:9222 → 127.0.0.1:9223 (Chromium silently ignores + # --remote-debugging-address on stock builds; see 
cdp_bridge.py). + "cdp_bridge.py" = file("${path.module}/files/cdp_bridge.py") + } +} + # --- Services --- -# WS endpoint (internal only, gated by NetworkPolicy + token). +# CDP endpoint (internal only, gated by NetworkPolicy). 2026-06-04: switched +# from Playwright WS (:3000) to direct chromium CDP (:9222) so the persistent +# user-data-dir actually persists cookies; callers use `connect_over_cdp()`. resource "kubernetes_service" "chrome_service" { metadata { name = "chrome-service" @@ -346,9 +480,9 @@ resource "kubernetes_service" "chrome_service" { spec { selector = local.labels port { - name = "ws" - port = 3000 - target_port = 3000 + name = "cdp" + port = 9222 + target_port = 9222 protocol = "TCP" } } @@ -373,6 +507,27 @@ resource "kubernetes_service" "chrome_novnc" { } } +# Snapshot-server endpoint (bearer-gated, exposed via ingress sub-path +# chrome.viktorbarzin.me/api/snapshot — auth=none at the ingress layer +# because the bearer check happens inside snapshot_server.py). +resource "kubernetes_service" "chrome_snapshot" { + metadata { + name = "chrome-snapshot" + namespace = kubernetes_namespace.chrome_service.metadata[0].name + labels = local.labels + } + + spec { + selector = local.labels + port { + name = "snap" + port = 8088 + target_port = 8088 + protocol = "TCP" + } + } +} + module "ingress" { source = "../../modules/kubernetes/ingress_factory" dns_type = "proxied" @@ -391,12 +546,38 @@ module "ingress" { } } +# Second ingress on the same host (chrome.viktorbarzin.me) carving out +# /api/snapshot to the snapshot-server sidecar. Path-level carve-out +# pattern — see CLAUDE.md "For path-level carve-outs (e.g. wrongmove has +# `/` behind Anubis but `/api` direct), declare a second ingress_factory +# with `ingress_path = ["/"]` pointing at the bare backend service." 
+module "ingress_snapshot" { + source = "../../modules/kubernetes/ingress_factory" + # auth = "none": bearer-token gated inside snapshot-server.py; Authentik + # forward-auth would require an OIDC cookie that the dev-box refresh + # timer can't replay. + auth = "none" + dns_type = "none" # DNS already created by module.ingress + namespace = kubernetes_namespace.chrome_service.metadata[0].name + name = "chrome-snapshot" + host = "chrome" + service_name = kubernetes_service.chrome_snapshot.metadata[0].name + port = 8088 + ingress_path = ["/api/snapshot"] + tls_secret_name = var.tls_secret_name + extra_annotations = { + "gethomepage.dev/enabled" = "false" + } +} + # --- NetworkPolicy: scoped ingress. -# - TCP/3000 (Playwright WS): only from labelled client namespaces. -# - TCP/6080 (noVNC HTTP+WS): only from the traefik namespace, since the -# public-facing path is `chrome.viktorbarzin.me` ingress → Traefik → -# sidecar. Authentik forward-auth still gates external access at the -# Traefik layer. +# - TCP/9222 (Chromium CDP): only from labelled client namespaces. +# - TCP/6080 (noVNC HTTP+WS): only from the traefik namespace (public path +# is chrome.viktorbarzin.me → Traefik → sidecar; Authentik forward-auth +# gates external access at the Traefik layer). +# - TCP/8088 (snapshot-server): only from the traefik namespace +# (chrome.viktorbarzin.me/api/snapshot → Traefik → sidecar; bearer token +# is the gate inside snapshot-server.py). # The cluster has no default-deny, so this NP only takes effect inside # chrome-service ns — pods elsewhere remain unaffected. resource "kubernetes_network_policy_v1" "ws_ingress" { @@ -426,8 +607,17 @@ resource "kubernetes_network_policy_v1" "ws_ingress" { } } } + # Also admit chrome-service's own namespace (the snapshot-harvester + # CronJob runs here and needs to reach the CDP endpoint). 
+ from { + namespace_selector { + match_labels = { + "kubernetes.io/metadata.name" = "chrome-service" + } + } + } ports { - port = "3000" + port = "9222" protocol = "TCP" } } @@ -443,6 +633,10 @@ resource "kubernetes_network_policy_v1" "ws_ingress" { port = "6080" protocol = "TCP" } + ports { + port = "8088" + protocol = "TCP" + } } } } @@ -527,3 +721,113 @@ resource "kubernetes_cron_job_v1" "chrome_service_backup" { ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config] } } + +# --- Snapshot harvester CronJob: hourly storage_state() dump via CDP --- +# Connects to the live chrome-service CDP endpoint, accesses the +# persistent default browser context (where Viktor's noVNC logins live), +# and writes cookies + localStorage to /profile/snapshots/storage-state.json +# (atomic rename). The snapshot-server sidecar reads from the same file. +resource "kubernetes_cron_job_v1" "chrome_service_snapshot_harvester" { + metadata { + name = "chrome-service-snapshot-harvester" + namespace = kubernetes_namespace.chrome_service.metadata[0].name + } + spec { + concurrency_policy = "Replace" + failed_jobs_history_limit = 3 + successful_jobs_history_limit = 1 + # Hourly, offset from the backup CronJob (which runs at :47 every 6h) + # so they don't fight for the encrypted PVC at the same minute. + schedule = "23 * * * *" + starting_deadline_seconds = 60 + job_template { + metadata {} + spec { + backoff_limit = 2 + ttl_seconds_after_finished = 300 + template { + metadata {} + spec { + # PVC is RWO — colocate with the chrome-service pod. 
+ affinity { + pod_affinity { + required_during_scheduling_ignored_during_execution { + label_selector { + match_labels = local.labels + } + topology_key = "kubernetes.io/hostname" + } + } + } + container { + name = "harvester" + image = local.python_image + image_pull_policy = "IfNotPresent" + # The Microsoft playwright/python image ships only browsers + + # Python — the `playwright` pip package itself is NOT installed + # (it's meant for CI that brings its own requirements). We + # install at startup, caching to the PVC so subsequent runs + # are near-instant. + command = ["bash", "-c"] + args = [ + <<-EOT + set -e + export PIP_CACHE_DIR=/profile/.cache/pip + export PIP_DISABLE_PIP_VERSION_CHECK=1 + python3 -c 'import playwright' 2>/dev/null \ + || pip install --quiet --no-warn-script-location playwright==1.48.0 + exec python3 /scripts/snapshot_harvester.py + EOT + ] + env { + name = "CDP_URL" + value = "http://chrome-service.chrome-service.svc.cluster.local:9222" + } + env { + name = "SNAPSHOT_DIR" + value = local.snapshot_dir + } + # Don't try to download browsers — connect_over_cdp doesn't + # need them locally. 
+ env { + name = "PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD" + value = "1" + } + volume_mount { + name = "profile" + mount_path = "/profile" + } + volume_mount { + name = "scripts" + mount_path = "/scripts" + read_only = true + } + resources { + requests = { cpu = "20m", memory = "128Mi" } + limits = { memory = "512Mi" } + } + } + volume { + name = "profile" + persistent_volume_claim { + claim_name = kubernetes_persistent_volume_claim.profile_encrypted.metadata[0].name + } + } + volume { + name = "scripts" + config_map { + name = kubernetes_config_map_v1.snapshot_scripts.metadata[0].name + default_mode = "0555" + } + } + restart_policy = "OnFailure" + } + } + } + } + } + lifecycle { + # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2 + ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config] + } +} diff --git a/stacks/f1-stream/files/backend/extractors/chrome_browser.py b/stacks/f1-stream/files/backend/extractors/chrome_browser.py index 299790d4..ec187a1d 100644 --- a/stacks/f1-stream/files/backend/extractors/chrome_browser.py +++ b/stacks/f1-stream/files/backend/extractors/chrome_browser.py @@ -86,12 +86,16 @@ def _looks_like_hls_playlist(url: str) -> bool: return bool(_HLS_URL_RE.search(url)) -def _resolve_chrome_ws() -> str | None: - base = os.getenv("CHROME_WS_URL") - token = os.getenv("CHROME_WS_TOKEN") - if not base or not token: - return None - return f"{base.rstrip('/')}/{token}" +def _resolve_chrome_cdp() -> str | None: + """Resolve the CHROME_CDP_URL env var (set by f1-stream's TF stack). + + Migrated 2026-06-04 from CHROME_WS_URL/CHROME_WS_TOKEN. chrome-service + now runs chromium directly with CDP exposed on :9222 so its persistent + user-data-dir actually persists cookies (the old playwright launch-server + pattern created ephemeral contexts per `connect()`). NetworkPolicy + (labelled client namespaces only) is the only gate — no path token. 
+ """ + return os.getenv("CHROME_CDP_URL") class ChromeBrowserExtractor(BaseExtractor): @@ -106,10 +110,10 @@ class ChromeBrowserExtractor(BaseExtractor): return "Chrome Browser" async def extract(self) -> list[ExtractedStream]: - ws_url = _resolve_chrome_ws() - if not ws_url: + cdp_url = _resolve_chrome_cdp() + if not cdp_url: logger.warning( - "[chrome-browser] CHROME_WS_URL/TOKEN not set — extractor disabled" + "[chrome-browser] CHROME_CDP_URL not set — extractor disabled" ) return [] @@ -123,9 +127,9 @@ class ChromeBrowserExtractor(BaseExtractor): # round. Contexts are cheap; the browser is shared. async with async_playwright() as p: try: - browser = await p.chromium.connect(ws_url, timeout=15_000) + browser = await p.chromium.connect_over_cdp(cdp_url, timeout=15_000) except Exception: - logger.exception("[chrome-browser] connect to chrome-service failed") + logger.exception("[chrome-browser] CDP connect to chrome-service failed") return [] results: list[ExtractedStream] = [] diff --git a/stacks/f1-stream/files/backend/playback_verifier.py b/stacks/f1-stream/files/backend/playback_verifier.py index d6ff1d7f..661cf0a0 100644 --- a/stacks/f1-stream/files/backend/playback_verifier.py +++ b/stacks/f1-stream/files/backend/playback_verifier.py @@ -336,14 +336,32 @@ class PlaybackVerifier: logger.error("playwright not installed — playback verification disabled") return None self._playwright = await async_playwright().start() - ws_base = os.getenv("CHROME_WS_URL") - ws_token = os.getenv("CHROME_WS_TOKEN") - if ws_base and ws_token: - self._browser = await self._playwright.chromium.connect( - f"{ws_base.rstrip('/')}/{ws_token}", timeout=15_000, - ) - logger.info("connected to remote chrome-service (concurrency=%d)", MAX_CONCURRENCY) - else: + # CHROME_CDP_URL points to chrome-service's CDP endpoint + # (http://chrome-service.chrome-service.svc:9222 by default). 
+ # Migrated 2026-06-04 from `chromium.connect(ws_url)` because + # chrome-service now runs chromium directly with persistent + # user-data-dir for cookie warming — launch-server couldn't + # persist. The CDP `Browser` exposes the persistent default + # context via `browser.contexts[0]`; here we just call + # `new_context()` for incognito-style isolation per verify + # round, matching the previous behaviour. + cdp_url = os.getenv("CHROME_CDP_URL") + if cdp_url: + try: + self._browser = await self._playwright.chromium.connect_over_cdp( + cdp_url, timeout=15_000, + ) + logger.info("connected to remote chrome-service via CDP (concurrency=%d)", MAX_CONCURRENCY) + except Exception: + logger.exception( + "CDP connect failed (%s) — falling back to in-process Chromium", cdp_url, + ) + self._browser = None + if self._browser is None: + # Either CHROME_CDP_URL was unset, or CDP connect failed. + # Fall back to in-process headless so the verifier still + # returns playable/unplayable verdicts (degraded but + # functional — anti-bot pages may bypass). self._browser = await self._playwright.chromium.launch( headless=True, args=[ @@ -355,7 +373,10 @@ class PlaybackVerifier: "--autoplay-policy=no-user-gesture-required", ], ) - logger.warning("CHROME_WS_URL not set — using in-process Chromium (concurrency=%d)", MAX_CONCURRENCY) + logger.warning( + "using in-process Chromium (CHROME_CDP_URL unset or CDP connect failed) (concurrency=%d)", + MAX_CONCURRENCY, + ) return self._browser async def shutdown(self) -> None: diff --git a/stacks/f1-stream/main.tf b/stacks/f1-stream/main.tf index b3f2399f..ff64af71 100644 --- a/stacks/f1-stream/main.tf +++ b/stacks/f1-stream/main.tf @@ -148,18 +148,14 @@ resource "kubernetes_deployment" "f1-stream" { } # Verifier connects to in-cluster headed Chromium pool — see # stacks/chrome-service/. Falls back to in-process headless if unset. + # 2026-06-04: migrated WS (:3000 / path-token) → CDP (:9222 / + # NetworkPolicy-gated). 
Token is no longer needed for the + # connection itself; the chrome-service-client-secrets ExternalSecret + # below stays in place because the snapshot endpoint (dev-box only, + # not used by f1-stream) reuses the same Vault key. env { - name = "CHROME_WS_URL" - value = "ws://chrome-service.chrome-service.svc.cluster.local:3000" - } - env { - name = "CHROME_WS_TOKEN" - value_from { - secret_key_ref { - name = "chrome-service-client-secrets" - key = "api_bearer_token" - } - } + name = "CHROME_CDP_URL" + value = "http://chrome-service.chrome-service.svc.cluster.local:9222" } # The embed proxy (this pod's /embed?url=…) must be reachable from # the remote chrome-service pod. Default 127.0.0.1 only works for