From 29ad2000263e0a018a0384d473ce4da61373780c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 16 Jun 2026 17:19:55 +0000 Subject: [PATCH 1/6] priority-pass: bump image_tag to 4ce9e8e8 [ci skip] Auto-committed by ViktorBarzin/priority-pass GHA on push to main. Source: https://github.com/ViktorBarzin/priority-pass/commit/4ce9e8e8944e4bcfca246d7aebfaa5f3dba84252 --- stacks/priority-pass/terragrunt.hcl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stacks/priority-pass/terragrunt.hcl b/stacks/priority-pass/terragrunt.hcl index 34be6725..926fe6e1 100644 --- a/stacks/priority-pass/terragrunt.hcl +++ b/stacks/priority-pass/terragrunt.hcl @@ -21,5 +21,5 @@ inputs = { # priority-pass repo HEAD — auto-bumped by GHA `build-and-deploy.yml` # on every successful build. Manual edits welcome for local trials, # but CI will overwrite on the next push to main. - image_tag = "63e118c3" + image_tag = "4ce9e8e8" } From 46b5f04f67cec6100da00dd76310e7b59ff462f8 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 16 Jun 2026 17:20:08 +0000 Subject: [PATCH 2/6] priority-pass: bump image_tag to 63e118c3 [ci skip] Auto-committed by ViktorBarzin/priority-pass GHA on push to main. Source: https://github.com/ViktorBarzin/priority-pass/commit/63e118c334733dab71061f54ed620ea38c05e8af --- stacks/priority-pass/terragrunt.hcl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stacks/priority-pass/terragrunt.hcl b/stacks/priority-pass/terragrunt.hcl index 926fe6e1..34be6725 100644 --- a/stacks/priority-pass/terragrunt.hcl +++ b/stacks/priority-pass/terragrunt.hcl @@ -21,5 +21,5 @@ inputs = { # priority-pass repo HEAD — auto-bumped by GHA `build-and-deploy.yml` # on every successful build. Manual edits welcome for local trials, # but CI will overwrite on the next push to main. 
- image_tag = "4ce9e8e8" + image_tag = "63e118c3" } From d1f2e50736aef6e78bc723cd7a956c00e5eac353 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 16 Jun 2026 17:44:40 +0000 Subject: [PATCH 3/6] priority-pass: bump image_tag to 4ce9e8e8 [ci skip] Auto-committed by ViktorBarzin/priority-pass GHA on push to main. Source: https://github.com/ViktorBarzin/priority-pass/commit/4ce9e8e8944e4bcfca246d7aebfaa5f3dba84252 --- stacks/priority-pass/terragrunt.hcl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stacks/priority-pass/terragrunt.hcl b/stacks/priority-pass/terragrunt.hcl index 34be6725..926fe6e1 100644 --- a/stacks/priority-pass/terragrunt.hcl +++ b/stacks/priority-pass/terragrunt.hcl @@ -21,5 +21,5 @@ inputs = { # priority-pass repo HEAD — auto-bumped by GHA `build-and-deploy.yml` # on every successful build. Manual edits welcome for local trials, # but CI will overwrite on the next push to main. - image_tag = "63e118c3" + image_tag = "4ce9e8e8" } From eb47eb1d10ad497794ef2a8c3a6d28d7ee7ad738 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 16 Jun 2026 17:45:33 +0000 Subject: [PATCH 4/6] priority-pass: bump image_tag to 63e118c3 [ci skip] Auto-committed by ViktorBarzin/priority-pass GHA on push to main. Source: https://github.com/ViktorBarzin/priority-pass/commit/63e118c334733dab71061f54ed620ea38c05e8af --- stacks/priority-pass/terragrunt.hcl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stacks/priority-pass/terragrunt.hcl b/stacks/priority-pass/terragrunt.hcl index 926fe6e1..34be6725 100644 --- a/stacks/priority-pass/terragrunt.hcl +++ b/stacks/priority-pass/terragrunt.hcl @@ -21,5 +21,5 @@ inputs = { # priority-pass repo HEAD — auto-bumped by GHA `build-and-deploy.yml` # on every successful build. Manual edits welcome for local trials, # but CI will overwrite on the next push to main. 
- image_tag = "4ce9e8e8" + image_tag = "63e118c3" } From 0a6ed4b2fe023e4701b0259efb43e7e98a1f6ebd Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Tue, 16 Jun 2026 20:33:47 +0000 Subject: [PATCH 5/6] workstation: per-user playwright browser MCP for all users, reproducible from git MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Viktor asked that the playwright browser MCP be available for every devvm user in every directory, with each user running their own server and multiple concurrent sessions per user. Before this, playwright was hand-set-up per user (~/.config/systemd/user/ playwright-mcp.service on 8931/8932/8933) and only wizard was actually wired — emo's and anca's servers ran but their ~/.claude.json had no playwright entry, so their Claude never connected. None of it was reproducible from git (units, refresh script, and the Vault snapshot token lived only in user homes), so a devvm rebuild would silently lose it. This makes it reproducible and fixes the unwired users: - roster_engine.py: sticky per-user PLAYWRIGHT_PORT (PLAYWRIGHT_BASE_PORT=8931, allocated for every roster user incl. the admin), emitted in the derive JSON. - scripts/workstation/playwright/: system-level TEMPLATE units (playwright-mcp@.service + playwright-snapshot-refresh@.{service,timer}, User=%i — system manager, so no systemd --user / linger) + the refresh script. @playwright/mcp pinned to 0.0.76 (avoids the @latest silent-fleet-roll footgun, same rationale as T3_PIN). - setup-devvm.sh: install the templates + script (9e); stage the chrome-service snapshot bearer token from Vault to a root file (8c) — the hourly root reconcile has no Vault token, mirrors the Claude OAuth staging in 8a. - t3-provision-users.sh: install_playwright() (ALL tiers incl. 
admin) writes PLAYWRIGHT_PORT, seeds the token if-absent, wires the user-scope ~/.claude.json by running `claude mcp add` AS the user (clobber-proof + if-absent, so it fixes existing/new/admin without rewriting a populated config), and enable --now's the instances (idempotent, never restarts a running server). Also hardened the section-1 *.env scan to skip the new playwright-*.env files (no T3_PORT -> grep no-match would abort under set -e -o pipefail). - Docs: chrome-service-snapshot runbook (new Provisioning section + system-unit commands), multi-tenancy.md, and the 2026-06-07 plan Task 2.3. Supersedes the hand-made per-user --user units (one-time idle-gated migration to follow on the live host). Co-Authored-By: Claude Opus 4.8 --- docs/architecture/multi-tenancy.md | 4 +- .../2026-06-07-multi-user-workstation-plan.md | 15 ++++ docs/runbooks/chrome-service-snapshot.md | 73 +++++++++++++----- scripts/t3-provision-users.sh | 74 ++++++++++++++++++- .../playwright/playwright-mcp@.service | 35 +++++++++ .../playwright/playwright-snapshot-refresh | 57 ++++++++++++++ .../playwright-snapshot-refresh@.service | 22 ++++++ .../playwright-snapshot-refresh@.timer | 16 ++++ scripts/workstation/roster_engine.py | 39 ++++++++-- scripts/workstation/setup-devvm.sh | 20 +++++ scripts/workstation/test_roster_engine.py | 47 ++++++++++++ 11 files changed, 373 insertions(+), 29 deletions(-) create mode 100644 scripts/workstation/playwright/playwright-mcp@.service create mode 100755 scripts/workstation/playwright/playwright-snapshot-refresh create mode 100644 scripts/workstation/playwright/playwright-snapshot-refresh@.service create mode 100644 scripts/workstation/playwright/playwright-snapshot-refresh@.timer diff --git a/docs/architecture/multi-tenancy.md b/docs/architecture/multi-tenancy.md index 7764ebb1..27d856ef 100644 --- a/docs/architecture/multi-tenancy.md +++ b/docs/architecture/multi-tenancy.md @@ -547,6 +547,8 @@ Separate from the in-cluster namespace-owner model above, the 
**devvm** (`10.0.1 **Claude Code runtime — native, per-user (2026-06-15):** `claude` is the **native** install (`~/.local/bin/claude` → `~/.local/share/claude/versions/`, self-updating; `installMethod: native`) — NOT npm-global or npx. It is the runtime for both the ttyd launcher and each `t3-serve` instance. `setup-devvm.sh` installs node ONLY for the `t3` CLI (not claude); per-user native claude is provisioned by the reconcile's `install_user_claude_native` (covers terminal + t3, idempotent, skip-if-present) and self-bootstrapped by `start-claude.sh` on first launch — both via the official `https://claude.ai/install.sh`. The legacy machine-wide `npm install -g @anthropic-ai/claude-code` bootstrap and the launcher's `npx` fallback were removed; existing users had already auto-migrated to native, and the npm-global dir was empty. **PATH (`~/.local/bin`, where the native binary lives):** ensured three ways — `/etc/profile.d/10-local-bin.sh` for login shells (machine-wide, fresh-user-safe), `start-claude.sh` itself (the launcher runs in tmux's non-login env that skips the user's shell rc), and `t3-serve@.service` (`Environment=PATH=…:/home/%i/.local/bin`). +**Per-user browser MCP — playwright, reproducible from git (2026-06-16):** every user (incl. the admin) gets their OWN isolated `@playwright/mcp` server so their concurrent Claude sessions don't fight over tabs (`--isolated` → a fresh browser context per MCP connection), wired into Claude in **every directory** via a user-scope `~/.claude.json` entry (`playwright → http://localhost:/mcp`). Mechanism: **system-level template units** `playwright-mcp@.service` + `playwright-snapshot-refresh@.{service,timer}` (`User=%i`, sourced from `scripts/workstation/playwright/`, installed by `setup-devvm.sh` §9e — system manager, so NO systemd --user / linger). 
`roster_engine.py` allocates a sticky per-user `PLAYWRIGHT_PORT` (`PLAYWRIGHT_BASE_PORT=8931`); the reconcile's `install_playwright()` writes it, seeds the chrome-service snapshot token if-absent (staged from Vault `secret/chrome-service` to `/etc/t3-serve/chrome-service-token` by `setup-devvm.sh` §8c, since the hourly root reconcile has no Vault token), wires `~/.claude.json` by running `claude mcp add --scope user` AS the user (clobber-proof + if-absent, so it fixes existing/new/admin without rewriting a populated config), and `enable --now`s the instances (idempotent — never restarts a running server). The `@playwright/mcp` version is **pinned** in the unit (the `@latest`-silently-rolls-the-fleet footgun — see `T3_PIN`). Replaced the earlier hand-made `~/.config/systemd/user/playwright-*` units (one-time idle-gated migration; pre-migration emo/anca had servers running but never wired into their `.claude.json`). Cookie-warming pipeline + ops: `../runbooks/chrome-service-snapshot.md`. + **Infra access:** non-admins get their own **writable, git-crypt-LOCKED** clone of the (public) infra repo — code/docs plaintext, secret files (`*.tfvars`, `secrets/**`) stay ciphertext. Its location depends on the per-user `code_layout` in `roster.yaml`: `single` (default) puts the clone AT `~/code`; `workspace` makes `~/code` a plain directory of per-project clones — the infra clone at `~/code/infra` plus each roster `repos` entry cloned from Forgejo `viktor/` **as the user** (their PAT authenticates, so private repos work; clone failures WARN and retry next hour). Flipping a user to `workspace` auto-migrates their existing `~/code` clone to `~/code/infra` (local branches/dirty state survive; running processes follow the moved inode). ancamilea = workspace + `tripit` since 2026-06-10. The provisioner clones infra anonymously from the public GitHub mirror; **contribute access is wired per-user on top** (see below). 
The apply boundary still holds (`scripts/tg apply` needs an admin Vault token + cluster RBAC), but **pushing `master` is NOT inert** — the Forgejo→Woodpecker webhook fires `.woodpecker/default.yml` (`event: push, branch: master`, `require_approval: forks` only), which terragrunt-applies changed stacks. `master` is **branch-protected on Forgejo** (force-push disabled for everyone — history is append-only; push + merge whitelists = `viktor` + explicitly granted users, deploy keys allowed). **Allow-then-audit (Viktor, 2026-06-10):** `ebarzin` (emo) is on the whitelist and pushes straight to `master` — no PR gate. The tracking burden moves to: (a) **commit messages that record what + why** (the agent instructions in AGENTS.md and the managed claudeMd require the body to paraphrase the user's request), (b) the **`notify-nonadmin-push` Slack audit step** in `.woodpecker/default.yml` — every master push by a non-admin author is posted to Slack (admin pushes are not), and (c) non-admins **never use `[ci skip]`** so every change fires the pipeline (and thus the audit feed). Users NOT on the whitelist fall back to `/` branches + PRs. **Clones stay fresh automatically** (2026-06-10): the hourly `t3-provision-users` reconcile runs `refresh_user_clone` over every managed clone — the infra clone and any workspace repos (fetch all remotes + fast-forward `master`, ONLY when on master with a clean tree and an upstream — dirty trees and local commits are left alone with a WARN) — and also `wire_forgejo_remote`, which idempotently adds the documented `forgejo` remote + `forgejo/master` upstream to infra clones that predate that contract. `start-claude.sh` does the same freshen at session launch (10s fetch cap per repo so an offline remote never stalls the session; workspace layouts freshen each repo under `~/code`). 
**Contribute access (per non-admin, manual — the anca/tripit PAT precedent):** @@ -559,7 +561,7 @@ Separate from the in-cluster namespace-owner model above, the **devvm** (`10.0.1 **Web-terminal session persistence (2026-06-10):** the tmux-based web terminal's named sessions (each running one Claude conversation) survive devvm reboots — `tmux-persist-save.timer` (5-min) snapshots every terminal user's sessions (name, cwd, conversation uuid from argv or the cwd-slug transcript dir) to `/var/lib/tmux-persist/.tsv`, and `tmux-persist-restore.service` recreates missing sessions at boot with `claude --resume ` (per-session idempotent; also handles partial loss). The web terminal also exposes an **on-demand "Restore sessions" button** (terminal-lobby: `tmux-api` `POST /restore` → the validated root `tmux-restore-user` wrapper → `tmux-persist restore `, a single-user mode of the same script): the boot-only restore service never fires when an **OOM kills a user's tmux server *without* a reboot** (the common case under multi-user memory pressure), so the button covers that gap. This is a **tmux/terminal-surface** feature, deliberately outside the t3 namespace: the t3 chat surface persists its own threads (`~/.t3` state, plus the daily `t3-backup-state` dump), and Claude conversations themselves were always durable (`~/.claude/projects/`) — what this adds is the volatile tmux wiring. 
-**Status (2026-06-10):** built + verified on the live host — capacity (8 GiB swap), config inheritance, roster-driven provisioner, per-user locked clone, per-user OIDC kubeconfig + the `oidc-power-user-readonly` ClusterRole + emo's `k8s_users` entry (applied + impersonation-verified), the Authentik `T3 Users` edge gate, **the emo Phase-5 cutover (own clone + launcher repoint + `code-shared` removal, completed 2026-06-10) and emo's contribute access (`ebarzin` write collaborator + PAT + protected `master`)**, and **per-user `code_layout` with the ancamilea workspace cutover (infra → `~/code/infra`, `tripit` alongside, 2026-06-10)**. Per the live `/etc/skel` design, non-admin `~/.claude/{rules,skills}` symlinks into the admin base are **kept** (they ARE the shared-base delivery mechanism — the plan's step to remove them is obsolete). **Remaining (held / future):** the offboarding apply-side (Phase 7), per-user MCP/auth injection, and roster-reconciled `T3 Users` membership. See `../runbooks/offboard-user.md` for deprovisioning. +**Status (2026-06-10):** built + verified on the live host — capacity (8 GiB swap), config inheritance, roster-driven provisioner, per-user locked clone, per-user OIDC kubeconfig + the `oidc-power-user-readonly` ClusterRole + emo's `k8s_users` entry (applied + impersonation-verified), the Authentik `T3 Users` edge gate, **the emo Phase-5 cutover (own clone + launcher repoint + `code-shared` removal, completed 2026-06-10) and emo's contribute access (`ebarzin` write collaborator + PAT + protected `master`)**, and **per-user `code_layout` with the ancamilea workspace cutover (infra → `~/code/infra`, `tripit` alongside, 2026-06-10)**. Per the live `/etc/skel` design, non-admin `~/.claude/{rules,skills}` symlinks into the admin base are **kept** (they ARE the shared-base delivery mechanism — the plan's step to remove them is obsolete). 
**Remaining (held / future):** the offboarding apply-side (Phase 7), the rest of per-user MCP/auth injection (`ha` + `claude_memory` + `.credentials.json` + beads Dolt cred — **per-user playwright browser MCP done 2026-06-16**, see above), and roster-reconciled `T3 Users` membership. See `../runbooks/offboard-user.md` for deprovisioning. ## Related diff --git a/docs/plans/2026-06-07-multi-user-workstation-plan.md b/docs/plans/2026-06-07-multi-user-workstation-plan.md index f98580c7..50f788ed 100644 --- a/docs/plans/2026-06-07-multi-user-workstation-plan.md +++ b/docs/plans/2026-06-07-multi-user-workstation-plan.md @@ -129,6 +129,21 @@ users: ### Task 2.3: Inject per-user MCP + auth secrets (new users only; never clobber) +> **PARTIAL — per-user playwright browser MCP DONE (2026-06-16), reproducible from git.** +> Implemented NOT via the "write a fresh `~/.claude.json`" step below (that skips +> EXISTING users who have a `.claude.json` lacking the entry — emo + anca were +> exactly this: server running, never wired). Instead: `roster_engine.py` allocates +> a sticky per-user `PLAYWRIGHT_PORT` (`PLAYWRIGHT_BASE_PORT=8931`); `setup-devvm.sh` +> (§8c/§9e) stages the chrome-service token + installs **system-level template units** +> (`scripts/workstation/playwright/playwright-mcp@.service` + `…-snapshot-refresh@.{service,timer}`, +> no systemd --user / linger); `t3-provision-users.sh` `install_playwright()` (ALL +> tiers incl. admin) seeds the token if-absent, runs `claude mcp add --scope user +> playwright` AS the user (clobber-proof → fixes existing + new + admin), and +> `enable --now`s the instances. Replaced the hand-made `~/.config/systemd/user/playwright-*` +> units (one-time idle-gated migration). Runbook: `../runbooks/chrome-service-snapshot.md` +> → "Provisioning". **Still TODO in this task:** `ha`, `claude_memory`, +> `.credentials.json`, and the beads Dolt credential. 
+ **Files:** Modify `infra/scripts/t3-provision-users.sh` (add `install_user_secrets`) - [ ] **Step 1:** For each non-admin **without** an existing `~/.claude.json` (NEW users only — NEVER touch an existing one): write `~/.claude.json` with `playwright-shared` (localhost), `ha` (shared `ha_sofia_mcp_url` from Vault `secret/openclaw`) if HA-eligible, and `claude_memory` using a **shared/simple key (per-user memory isolation is DEFERRED — not a risk now)**. Seed `~/.claude/.credentials.json` with the shared Claude token (Vault) **or** leave absent for interactive login. **Drop the beads Dolt credential** into `~/code/.beads/` (`.beads-credential-key`, from Vault, or set `DOLT_REMOTE_PASSWORD`) so `bd` authenticates — it's git-ignored, so a fresh clone lacks it. All `0600`, owned by the user. Per-user `playwright-mcp` systemd unit on its own port (existing pattern, id=4015). diff --git a/docs/runbooks/chrome-service-snapshot.md b/docs/runbooks/chrome-service-snapshot.md index ab065503..2ebc565f 100644 --- a/docs/runbooks/chrome-service-snapshot.md +++ b/docs/runbooks/chrome-service-snapshot.md @@ -11,8 +11,36 @@ external Claude Code sessions on the dev box. 
Architecture in | chrome-service Deployment | `chrome-service` ns | always-on | headed chromium, CDP :9222, persistent /profile/chromium-data | | snapshot-server sidecar | same pod | always-on | serves `/api/snapshot`, bearer-gated, port 8088 | | snapshot-harvester CronJob | `chrome-service` ns | `23 * * * *` | dumps `storage_state()` via CDP → `/profile/snapshots/storage-state.json` | -| dev-box refresh timer | each dev box | hourly | curls `chrome.viktorbarzin.me/api/snapshot` → `~/.cache/playwright-shared-storage-state.json` | -| dev-box `playwright-mcp.service` | each dev box | always-on | `@playwright/mcp --isolated --storage-state=…` per-MCP-connection contexts | +| dev-box refresh timer | each dev box, per OS user | hourly (`*:28`) | `playwright-snapshot-refresh@.timer` curls `chrome.viktorbarzin.me/api/snapshot` → `~/.cache/playwright-shared-storage-state.json` | +| dev-box `playwright-mcp@.service` | each dev box, per OS user | always-on | pinned `@playwright/mcp@ --isolated --storage-state=…` on the user's `PLAYWRIGHT_PORT`; per-MCP-connection (per-session) contexts | + +## Provisioning (reproducible from git) + +The dev-box side is **per-OS-user** and fully reproducible — no hand-setup. +Each user gets their own isolated `@playwright/mcp` server (multiple concurrent +Claude sessions per user, isolated by `--isolated`), wired into their Claude in +**every directory** via a user-scope `~/.claude.json` entry +(`playwright → http://localhost:/mcp`). + +- **System-level template units** (NOT `systemd --user`, so no linger needed): + `playwright-mcp@.service` + `playwright-snapshot-refresh@.{service,timer}`, + sourced from `infra/scripts/workstation/playwright/`, installed to + `/etc/systemd/system/` by `setup-devvm.sh` (§9e). `User=%i`; per-user + `PLAYWRIGHT_PORT` from `/etc/t3-serve/playwright-.env`. 
+- **Port allocation**: `roster_engine.py` (`PLAYWRIGHT_BASE_PORT=8931`, sticky) + — emitted in the derive JSON, written per-user by `t3-provision-users.sh` (§5c). +- **Snapshot token**: `setup-devvm.sh` (§8c) stages Vault + `secret/chrome-service` `api_bearer_token` → root file + `/etc/t3-serve/chrome-service-token`; the provisioner copies it (if-absent, + 0600) to each user's `~/.config/playwright/token` (the hourly root reconcile + has no Vault token, hence the staging — mirrors the Claude OAuth token in §8a). +- **MCP wiring + enablement**: `t3-provision-users.sh` `install_playwright()` runs + `claude mcp add --scope user … playwright` AS the user (clobber-proof, if-absent) + and `systemctl enable --now` the system instances. Idempotent; never restarts a + running instance or rewrites an existing `~/.claude.json` entry. +- **Pinned version**: bump `@playwright/mcp@<version>` in + `scripts/workstation/playwright/playwright-mcp@.service` (the `@latest` → + silent-fleet-roll footgun is why; see the `T3_PIN` rationale in `setup-devvm.sh`). ## Day-to-day @@ -43,14 +71,14 @@ Expected: `wrote snapshot (… bytes) to /profile/snapshots/storage-state.json`. ### Trigger dev-box refresh manually ```bash -# On the dev box, as the user whose Claude Code sessions need the new state: -systemctl --user start playwright-snapshot-refresh.service +# On the dev box, refresh a specific user's snapshot (system template instance): +sudo systemctl start playwright-snapshot-refresh@<user>.service -# Or directly: -/usr/local/bin/playwright-snapshot-refresh +# Or run the script directly AS that user: +sudo -u <user> /usr/local/bin/playwright-snapshot-refresh # Verify -ls -la ~/.cache/playwright-shared-storage-state.json +sudo ls -la /home/<user>/.cache/playwright-shared-storage-state.json ``` ### Inspect the current snapshot @@ -108,12 +136,14 @@ The bearer token in `~/.config/playwright/token` doesn't match the server's. Almost always means the Vault secret was rotated and the local cache is stale.
-**Fix**: +**Fix** (re-stage centrally so a rebuild stays correct, then re-copy to the user): ```bash vault login -method=oidc # if needed -vault kv get -field=api_bearer_token secret/chrome-service > ~/.config/playwright/token -chmod 600 ~/.config/playwright/token -systemctl --user start playwright-snapshot-refresh.service +vault kv get -field=api_bearer_token secret/chrome-service \ + | sudo install -m 0600 /dev/stdin /etc/t3-serve/chrome-service-token # pipe, not <(…): sudo closes inherited fds +sudo install -o <user> -g <user> -m 0600 \ + /etc/t3-serve/chrome-service-token /home/<user>/.config/playwright/token +sudo systemctl start playwright-snapshot-refresh@<user>.service ``` ### Dev-box `playwright-snapshot-refresh` returns 404 with "snapshot not yet available" @@ -129,9 +159,9 @@ new context with it. **Existing MCP sessions don't hot-reload** — they keep the cookies they were seeded with at session start. New sessions get the fresh snapshot. -**Fix**: restart the MCP server on the dev box to pick up the new file: +**Fix**: restart the user's MCP server on the dev box to pick up the new file: ```bash -systemctl --user restart playwright-mcp.service +sudo systemctl restart playwright-mcp@<user>.service ``` ### Snapshot file is suspiciously small or empty cookies array @@ -158,13 +188,18 @@ vault kv put secret/chrome-service \ # Reloader auto-restarts chrome-service pod (snapshot-server picks up new token). -# On EVERY dev box that pulls the snapshot: -vault kv get -field=api_bearer_token secret/chrome-service > ~/.config/playwright/token -chmod 600 ~/.config/playwright/token +# On EVERY dev box: re-stage the root file, then overwrite each user's copy +# (the provisioner's per-user copy is if-absent, so a ROTATION must overwrite).
+vault kv get -field=api_bearer_token secret/chrome-service \ + | sudo install -m 0600 /dev/stdin /etc/t3-serve/chrome-service-token # pipe, not <(…): sudo closes inherited fds +for u in $(ls /etc/t3-serve/playwright-*.env 2>/dev/null | sed 's#.*/playwright-##;s#\.env##'); do + sudo install -o "$u" -g "$u" -m 0600 \ + /etc/t3-serve/chrome-service-token /home/"$u"/.config/playwright/token +done -# Verify the next refresh succeeds: -systemctl --user start playwright-snapshot-refresh.service -journalctl --user -u playwright-snapshot-refresh.service -n 20 +# Verify the next refresh succeeds for a user: +sudo systemctl start playwright-snapshot-refresh@<user>.service +sudo journalctl -u playwright-snapshot-refresh@<user>.service -n 20 ``` ## Restore from a backup tarball diff --git a/scripts/t3-provision-users.sh b/scripts/t3-provision-users.sh index c5bbe4a9..472dfc87 100644 --- a/scripts/t3-provision-users.sh +++ b/scripts/t3-provision-users.sh @@ -307,18 +307,76 @@ install_user_claude_native() { fi } +# Per-user playwright-mcp browser MCP — ALL tiers incl. admin (every user's Claude +# sessions connect to their OWN isolated server; a user's concurrent sessions are +# kept apart by the unit's --isolated). Idempotent + if-absent, so a routine +# reconcile never disturbs a live user: (1) seed the chrome-service snapshot token +# if the user has none; (2) wire the user-scope `playwright` MCP entry by running +# `claude mcp add` AS the user (writes THEIR ~/.claude.json, never reads another's; +# the CLI merges one key and REFUSES to clobber an existing one, so it's safe on a +# populated config), guarded by `claude mcp get`; (3) `enable --now` the system +# template instances (idempotent — does NOT restart an already-running server). +# Needs PLAYWRIGHT_PORT already in the per-user playwright env (written by the +# section-5c loop) + the token staged by setup-devvm.sh (section 8c).
+install_playwright() {
+  local user="$1" home port token_staged=/etc/t3-serve/chrome-service-token
+  home="$(getent passwd "$user" | cut -d: -f6)"
+  [[ -n "$home" && -d "$home" ]] || return 0
+  port="$(grep -oE 'PLAYWRIGHT_PORT=[0-9]+' "$ENVDIR/playwright-$user.env" 2>/dev/null | cut -d= -f2 || true)"
+  [[ -n "$port" ]] || { log "WARN: no PLAYWRIGHT_PORT for $user -> skip playwright"; return 0; }
+
+  # (1) chrome-service snapshot token, if-absent (0600, owned by the user). A missing ~/.config is pre-created user-owned first: `install -d` gives implicit PARENTS default (root) ownership.
+  if [[ ! -f "$home/.config/playwright/token" && -r "$token_staged" ]]; then
+    if [[ "$DRY_RUN" == 1 ]]; then echo "[dry-run] seed playwright token -> $user"; else
+      [[ -d "$home/.config" ]] || install -d -o "$user" -g "$user" -m 0700 "$home/.config"; install -d -o "$user" -g "$user" -m 0700 "$home/.config/playwright"
+      install -o "$user" -g "$user" -m 0600 "$token_staged" "$home/.config/playwright/token"
+      log "seeded playwright snapshot token -> $user"
+    fi
+  fi
+
+  # (2) wire user-scope ~/.claude.json (AS the user, login shell so the native
+  # ~/.local/bin/claude is on PATH; clobber-proof + if-absent via `mcp get`)
+  if [[ "$DRY_RUN" == 1 ]]; then
+    echo "[dry-run] wire playwright MCP (:$port) if-absent -> $user"
+  elif runuser -u "$user" -- bash -lc 'command -v claude >/dev/null 2>&1'; then
+    if ! runuser -u "$user" -- bash -lc 'claude mcp get playwright >/dev/null 2>&1'; then
+      runuser -u "$user" -- bash -lc "claude mcp add --scope user --transport http playwright 'http://localhost:$port/mcp' >/dev/null 2>&1" \
+        && log "wired playwright MCP (user scope, :$port) -> $user" \
+        || log "WARN: claude mcp add playwright failed for $user (retries next run)"
+    fi
+  else
+    log "WARN: claude not found for $user -> playwright MCP not wired (retries next run)"
+  fi
+
+  # (3) enable the system template instances. `enable --now` is idempotent and
+  # does NOT restart a running unit, so a live user is undisturbed.
+  run systemctl enable --now "playwright-mcp@$user.service" >/dev/null 2>&1 || true
+  run systemctl enable --now "playwright-snapshot-refresh@$user.timer" >/dev/null 2>&1 || true
+}
+
 [[ $EUID -eq 0 ]] || { echo "t3-provision-users: must run as root" >&2; exit 1; }
 for bin in python3 jq; do command -v "$bin" >/dev/null || { echo "missing $bin" >&2; exit 1; }; done
 [[ -f "$ROSTER" && -f "$ENGINE" ]] || { echo "roster/engine not under $WORKSTATION_DIR" >&2; exit 1; }
 install -d -m 0755 "$ENVDIR"
 
 # 1) current sticky ports from existing .env files -> {os_user: port}
-ports_file="$(mktemp)"; trap 'rm -f "$ports_file" "${desired_file:-}"' EXIT
+ports_file="$(mktemp)"; pw_ports_file="$(mktemp)"
+trap 'rm -f "$ports_file" "$pw_ports_file" "${desired_file:-}"' EXIT
 { echo "{}"; for f in "$ENVDIR"/*.env; do
     [[ -e "$f" ]] || continue
-    u="$(basename "$f" .env)"; p="$(grep -oE 'T3_PORT=[0-9]+' "$f" | cut -d= -f2)"
+    case "$(basename "$f")" in playwright-*) continue;; esac # not a t3-serve env (handled below)
+    # `|| true`: grep returns non-zero on no-match, which would abort under `set -e -o pipefail`.
+    u="$(basename "$f" .env)"; p="$(grep -oE 'T3_PORT=[0-9]+' "$f" | cut -d= -f2 || true)"
     [[ -n "$p" ]] && jq -n --arg u "$u" --argjson p "$p" '{($u): $p}'
   done; } | jq -s 'add' > "$ports_file"
+# sticky PLAYWRIGHT ports from playwright-<user>.env (skipped by the loop above).
+# Seeds roster_engine so the live per-user assignments stick across reconciles.
+{ echo "{}"; for f in "$ENVDIR"/playwright-*.env; do
+    [[ -e "$f" ]] || continue
+    u="$(basename "$f" .env)"; u="${u#playwright-}"
+    p="$(grep -oE 'PLAYWRIGHT_PORT=[0-9]+' "$f" | cut -d= -f2 || true)"
+    if [[ -n "$p" ]]; then jq -n --arg u "$u" --argjson p "$p" '{($u): $p}'; fi  # if-form, not `&&`: an empty $p on the LAST file must not make the loop (and, via pipefail, the pipeline) exit 1
+  done; } | jq -s 'add' > "$pw_ports_file"
 
 # 2) tier validation vs live k8s_users (best-effort; aborts only on a real conflict)
 if command -v vault >/dev/null; then
@@ -336,7 +394,7 @@ fi
 
 # 3) derive desired state
 desired_file="$(mktemp)"
-python3 "$ENGINE" derive --roster "$ROSTER" --ports-json "$ports_file" > "$desired_file"
+python3 "$ENGINE" derive --roster "$ROSTER" --ports-json "$ports_file" --playwright-ports-json "$pw_ports_file" > "$desired_file"
 jq -e . "$desired_file" >/dev/null || { echo "[t3-provision] derive produced invalid JSON" >&2; exit 1; }
 
 # 3b) machine-wide Claude managed config (repo -> /etc; per-user codex mirrors in the loop below)
@@ -396,6 +454,16 @@ while IFS=$'\t' read -r os_user port; do
   id "$os_user" >/dev/null 2>&1 && run systemctl enable --now "t3-serve@$os_user.service" >/dev/null 2>&1 || true
 done < <(jq -r '.ports | to_entries[] | [.key, .value] | @tsv' "$desired_file")
 
+# 5c) per-user playwright-mcp (ALL tiers incl. admin): write the sticky
+# PLAYWRIGHT_PORT to the per-user playwright env, then seed token + wire
+# ~/.claude.json + enable the system template instances. if-absent /
+# idempotent — never disturbs a live user's running server or existing config.
+while IFS=$'\t' read -r os_user pw_port; do
+  id "$os_user" >/dev/null 2>&1 || continue
+  env_set "$ENVDIR/playwright-$os_user.env" PLAYWRIGHT_PORT "$pw_port"
+  install_playwright "$os_user"
+done < <(jq -r '.playwright_ports | to_entries[] | [.key, .value] | @tsv' "$desired_file")
+
 # 5b) machine-wide (once, not per-user): keep the t3 pinned-version ENFORCER enabled (it
 # re-asserts T3_PIN daily; a no-op when already correct).
NOT --now: with Persistent=true # a `--now` enable fires the missed daily job IMMEDIATELY, which on 2026-06-09 pulled a diff --git a/scripts/workstation/playwright/playwright-mcp@.service b/scripts/workstation/playwright/playwright-mcp@.service new file mode 100644 index 00000000..8bc4dd7f --- /dev/null +++ b/scripts/workstation/playwright/playwright-mcp@.service @@ -0,0 +1,35 @@ +[Unit] +# Per-user isolated playwright-mcp HTTP server — the browser MCP each user's +# Claude Code sessions connect to (user-scope `.claude.json` entry "playwright" +# -> http://localhost:<port>/mcp). System-level TEMPLATE unit (one +# committed file, one instance per OS user: playwright-mcp@<user>.service), so +# it is reproducible from git and root-manageable WITHOUT systemd --user / linger. +# Installed to /etc/systemd/system by setup-devvm.sh; enabled per-user by +# t3-provision-users.sh. Supersedes the hand-made ~/.config/systemd/user units. +Description=Per-user isolated playwright-mcp HTTP server (%i) +After=network-online.target playwright-snapshot-refresh@%i.service +Wants=network-online.target playwright-snapshot-refresh@%i.service + +[Service] +Type=simple +User=%i +# PLAYWRIGHT_PORT is written per-user by t3-provision-users.sh from roster_engine +# (PLAYWRIGHT_BASE_PORT, sticky allocation). Required (no `-`): a missing port +# file should fail loudly rather than start npx with an empty --port. +EnvironmentFile=/etc/t3-serve/playwright-%i.env +Restart=on-failure +RestartSec=5 +# --isolated: each MCP HTTP connection (= each Claude Code session) gets a fresh +# ephemeral BrowserContext, so a single user's concurrent sessions never share +# tabs. --storage-state seeds each context from the hourly cookie snapshot +# harvested from in-cluster chrome-service (warm logged-in state). +# Version PINNED (see the T3_PIN rationale in setup-devvm.sh): @latest re-resolves +# on every restart, so an upstream breaking release would silently roll the +# whole fleet. Bump deliberately in git.
%h is NOT used (it resolves to /root +# in a system unit even with User=); the home path is spelled out as /home/%i. +ExecStart=/usr/bin/npx -y @playwright/mcp@0.0.76 --port ${PLAYWRIGHT_PORT} --host localhost --headless --browser chrome --isolated --storage-state /home/%i/.cache/playwright-shared-storage-state.json +StandardOutput=journal +StandardError=journal + +[Install] +WantedBy=multi-user.target diff --git a/scripts/workstation/playwright/playwright-snapshot-refresh b/scripts/workstation/playwright/playwright-snapshot-refresh new file mode 100755 index 00000000..5a816a0d --- /dev/null +++ b/scripts/workstation/playwright/playwright-snapshot-refresh @@ -0,0 +1,57 @@ +#!/usr/bin/env bash +# Refresh the local cookie+localStorage snapshot served by chrome-service. +# +# Run per-user by the hourly playwright-snapshot-refresh@.timer systemd +# unit (as that user, so $HOME resolves to the user's home). Per-session Claude +# Code MCP contexts (`@playwright/mcp --isolated --storage-state=…`) read this +# file on each connection — fresh state is visible to NEW sessions, existing +# ones keep what they were seeded with. +# +# Token: cached at ~/.config/playwright/token. Seeded per-user (if-absent) by +# t3-provision-users.sh from the root-staged /etc/t3-serve/chrome-service-token +# (which setup-devvm.sh writes from Vault `secret/chrome-service` +# api_bearer_token). Rotate by re-staging + re-copying; the snapshot endpoint +# reloads the token via Reloader, local caches must be refreshed. +set -euo pipefail + +URL="${PLAYWRIGHT_SNAPSHOT_URL:-https://chrome.viktorbarzin.me/api/snapshot}" +TOKEN_FILE="${PLAYWRIGHT_SNAPSHOT_TOKEN:-$HOME/.config/playwright/token}" +DEST="${PLAYWRIGHT_SNAPSHOT_PATH:-$HOME/.cache/playwright-shared-storage-state.json}" + +if [ ! 
-r "$TOKEN_FILE" ]; then + echo "ERROR: token file $TOKEN_FILE missing or unreadable" >&2 + exit 1 +fi + +mkdir -p "$(dirname "$DEST")" +TMP="$DEST.new.$$" +trap 'rm -f "$TMP"' EXIT + +TOKEN="$(cat "$TOKEN_FILE")" + +HTTP_CODE=$(curl -sS \ + -H "Authorization: Bearer $TOKEN" \ + -o "$TMP" \ + -w '%{http_code}' \ + --max-time 30 \ + "$URL") + +if [ "$HTTP_CODE" != "200" ]; then + echo "ERROR: HTTP $HTTP_CODE from $URL" >&2 + cat "$TMP" >&2 + exit 1 +fi + +# Sanity: response must be valid JSON with at least the cookies/origins keys. +python3 - "$TMP" <<'PY' || { echo "ERROR: response is not a valid storageState JSON" >&2; exit 1; } +import json, sys +with open(sys.argv[1]) as f: + data = json.load(f) +if "cookies" not in data or "origins" not in data: + raise SystemExit("missing required keys") +PY + +mv -f "$TMP" "$DEST" +trap - EXIT +chmod 600 "$DEST" +echo "snapshot refreshed: $DEST ($(stat -c %s "$DEST") bytes)" diff --git a/scripts/workstation/playwright/playwright-snapshot-refresh@.service b/scripts/workstation/playwright/playwright-snapshot-refresh@.service new file mode 100644 index 00000000..2b8d425b --- /dev/null +++ b/scripts/workstation/playwright/playwright-snapshot-refresh@.service @@ -0,0 +1,22 @@ +[Unit] +# Per-user oneshot that pulls the warm cookie+localStorage snapshot from +# in-cluster chrome-service into ~/.cache/playwright-shared-storage-state.json, +# which playwright-mcp@%i seeds every new session from. System-level TEMPLATE +# (one instance per user); runs the shared /usr/local/bin script as the user. +Description=Refresh %i's playwright storage-state snapshot from chrome-service +After=network-online.target +Wants=network-online.target + +[Service] +Type=oneshot +User=%i +# Runs as %i, so the script's $HOME-relative paths (token, cache dest) resolve to +# the user's home. $HOME/$USER are set by systemd because User= is set. 
+ExecStart=/usr/local/bin/playwright-snapshot-refresh +StandardOutput=journal +StandardError=journal +# Don't hang if chrome-service is unreachable — the timer retries next hour. +TimeoutStartSec=60 + +[Install] +WantedBy=multi-user.target diff --git a/scripts/workstation/playwright/playwright-snapshot-refresh@.timer b/scripts/workstation/playwright/playwright-snapshot-refresh@.timer new file mode 100644 index 00000000..24b82e85 --- /dev/null +++ b/scripts/workstation/playwright/playwright-snapshot-refresh@.timer @@ -0,0 +1,16 @@ +[Unit] +Description=Hourly refresh of %i's playwright storage-state snapshot from chrome-service +After=network-online.target + +[Timer] +# 5 minutes after the in-cluster snapshot-harvester CronJob (runs at :23 every +# hour) so the file we pull is the freshest one. Also once shortly after boot so +# a freshly-booted box doesn't wait until the next :28 to populate the cache. +OnCalendar=*-*-* *:28:00 +OnBootSec=2min +Persistent=true +RandomizedDelaySec=30 +Unit=playwright-snapshot-refresh@%i.service + +[Install] +WantedBy=timers.target diff --git a/scripts/workstation/roster_engine.py b/scripts/workstation/roster_engine.py index 6e1b8545..2c984556 100644 --- a/scripts/workstation/roster_engine.py +++ b/scripts/workstation/roster_engine.py @@ -21,6 +21,12 @@ from typing import Iterable import yaml BASE_PORT = 3773 +# Per-user playwright-mcp HTTP port (the browser MCP each user's Claude sessions +# connect to). Distinct range from T3_PORT, allocated for EVERY roster user incl. +# the admin (wizard is listed). Sticky from existing, so the live in-session +# assignments (wizard 8931, emo 8932, ancamilea 8933) are preserved across +# reconciles once seeded; a fresh box allocates from 8931 in sorted order. 
+PLAYWRIGHT_BASE_PORT = 8931 VALID_TIERS = ("admin", "power-user", "namespace-owner") # single - ~/code IS the locked infra clone (the original non-admin layout) # workspace - ~/code is a plain directory of per-project clones; the locked @@ -82,6 +88,7 @@ class DesiredState: ttyd_user_map: str dispatch: dict[str, dict] ports: dict[str, int] + playwright_ports: dict[str, int] = field(default_factory=dict) @dataclass(frozen=True) @@ -203,13 +210,18 @@ def has_blocking_errors(issues: list[ValidationIssue]) -> bool: # -------------------------------------------------------------------------- -def _allocate_ports(roster: Roster, existing_ports: dict[str, int]) -> dict[str, int]: +def _allocate_ports( + roster: Roster, existing_ports: dict[str, int], base: int = BASE_PORT +) -> dict[str, int]: + """Sticky port allocation: keep every roster user's existing port, then assign + each new user the next free port from `base`. Used for both T3_PORT (base 3773) + and the per-user playwright-mcp port (base 8931).""" ports = {u: existing_ports[u] for u in roster.users if u in existing_ports} used = set(ports.values()) for os_user in sorted(roster.users): if os_user in ports: continue - candidate = BASE_PORT + candidate = base while candidate in used: candidate += 1 ports[os_user] = candidate @@ -224,9 +236,14 @@ _TTYD_MAP_HEADER = ( def derive_desired_state( - roster: Roster, existing_ports: dict[str, int] + roster: Roster, + existing_ports: dict[str, int], + existing_playwright_ports: dict[str, int] | None = None, ) -> DesiredState: ports = _allocate_ports(roster, existing_ports) + playwright_ports = _allocate_ports( + roster, existing_playwright_ports or {}, base=PLAYWRIGHT_BASE_PORT + ) ordered = sorted(roster.users.values(), key=lambda u: ports[u.os_user]) ttyd_lines = [f"{u.authentik_user}={u.os_user}" for u in ordered] ttyd_user_map = _TTYD_MAP_HEADER + "\n".join(ttyd_lines) + "\n" @@ -246,7 +263,7 @@ def derive_desired_state( ) for u in roster.users.values() } - return 
DesiredState(accounts, ttyd_user_map, dispatch, ports) + return DesiredState(accounts, ttyd_user_map, dispatch, ports, playwright_ports) def groups_to_add(desired: Iterable[str], current: Iterable[str]) -> list[str]: @@ -303,6 +320,7 @@ def _desired_state_to_dict(ds: DesiredState) -> dict: "ttyd_user_map": ds.ttyd_user_map, "dispatch": ds.dispatch, "ports": ds.ports, + "playwright_ports": ds.playwright_ports, } @@ -318,7 +336,11 @@ def _main(argv: list[str]) -> int: pv.add_argument("--k8s-users-json", required=True, help="JSON map {k8s_user: tier}") pd = sub.add_parser("derive", help="emit desired state as JSON") pd.add_argument("--roster", required=True) - pd.add_argument("--ports-json", required=True, help="JSON map {os_user: port}") + pd.add_argument("--ports-json", required=True, help="JSON map {os_user: T3_PORT}") + pd.add_argument( + "--playwright-ports-json", + help="JSON map {os_user: PLAYWRIGHT_PORT} (optional; sticky allocation)", + ) args = parser.parse_args(argv) roster = load_roster_file(args.roster) @@ -329,7 +351,12 @@ def _main(argv: list[str]) -> int: print(f"{issue.severity.upper()}: {issue.message}", file=sys.stderr) return 1 if has_blocking_errors(issues) else 0 with open(args.ports_json, encoding="utf-8") as fh: - desired = derive_desired_state(roster, json.load(fh)) + existing_ports = json.load(fh) + existing_playwright_ports = {} + if args.playwright_ports_json: + with open(args.playwright_ports_json, encoding="utf-8") as fh: + existing_playwright_ports = json.load(fh) + desired = derive_desired_state(roster, existing_ports, existing_playwright_ports) json.dump(_desired_state_to_dict(desired), sys.stdout, indent=2, sort_keys=True) sys.stdout.write("\n") return 0 diff --git a/scripts/workstation/setup-devvm.sh b/scripts/workstation/setup-devvm.sh index b0275bbf..bead16e0 100755 --- a/scripts/workstation/setup-devvm.sh +++ b/scripts/workstation/setup-devvm.sh @@ -140,6 +140,16 @@ if command -v vault >/dev/null; then else log "WARN: 
secret/workstation codex_shared_auth_json absent -> shared Codex auth not staged" fi + # 8c) chrome-service snapshot bearer token -> root file the provisioner copies + # per-user (if-absent) to ~/.config/playwright/token, which the per-user + # playwright-snapshot-refresh reads. One token for all users (single shared + # warm profile, by design). 0600: the snapshot it fetches holds cookies. + if cs_tok="$(vault kv get -field=api_bearer_token secret/chrome-service 2>/dev/null)"; then + install -m 0600 /dev/stdin /etc/t3-serve/chrome-service-token <<<"$cs_tok" + log "staged /etc/t3-serve/chrome-service-token (playwright snapshot auth)" + else + log "WARN: secret/chrome-service api_bearer_token absent -> playwright snapshot refresh will 401" + fi fi # 9) service layer: install + enable the machine-wide systemd units (sources in @@ -177,6 +187,16 @@ for u in t3-serve@.service \ t3-dispatch.service; do install -m 0644 "$SCRIPTS/$u" "/etc/systemd/system/$u" done +# 9e) per-user playwright-mcp browser MCP: system-level TEMPLATE units (one +# instance per OS user) + the snapshot-refresh script. Reproducible-from-git +# replacement for the hand-made ~/.config/systemd/user/playwright-* units +# (no systemd --user / linger needed). Enabled per-user by the provisioner; +# PLAYWRIGHT_PORT (roster_engine) + the chrome-service token (8c) feed them. 
+install -m 0755 "$HERE/playwright/playwright-snapshot-refresh" /usr/local/bin/playwright-snapshot-refresh +for u in playwright-mcp@.service playwright-snapshot-refresh@.service playwright-snapshot-refresh@.timer; do + install -m 0644 "$HERE/playwright/$u" "/etc/systemd/system/$u" +done +log "playwright: template units + snapshot-refresh script installed (per-user enable in provisioner)" systemctl daemon-reload systemctl enable --now t3-dispatch.service \ t3-autoupdate.timer t3-backup-state.timer t3-provision-users.timer >/dev/null 2>&1 || \ diff --git a/scripts/workstation/test_roster_engine.py b/scripts/workstation/test_roster_engine.py index ac34969c..183096da 100644 --- a/scripts/workstation/test_roster_engine.py +++ b/scripts/workstation/test_roster_engine.py @@ -296,6 +296,53 @@ def test_derive_is_deterministic(): ) +# -------------------------------------------------------------------------- +# derive_desired_state: per-user playwright-mcp ports (reproducible browser MCP) +# -------------------------------------------------------------------------- + +# wizard (admin) IS a roster user, so playwright ports are allocated for every +# user incl. the admin, from PLAYWRIGHT_BASE_PORT=8931. The live in-session +# assignment is wizard 8931, emo 8932, ancamilea 8933. +LIVE_PLAYWRIGHT_PORTS = {"wizard": 8931, "emo": 8932, "ancamilea": 8933} + + +def test_derive_allocates_playwright_ports_for_all_users_incl_admin(): + ds = eng.derive_desired_state(_roster(THREE), {}) + # fresh box: sorted os_user order (ancamilea, emo, wizard) from 8931 + assert ds.playwright_ports == {"ancamilea": 8931, "emo": 8932, "wizard": 8933} + + +def test_derive_preserves_existing_sticky_playwright_ports(): + # Seeded with the live assignment -> preserved exactly (nobody's port moves). 
+ ds = eng.derive_desired_state( + _roster(THREE), {}, existing_playwright_ports=LIVE_PLAYWRIGHT_PORTS + ) + assert ds.playwright_ports == LIVE_PLAYWRIGHT_PORTS + + +def test_derive_allocates_next_free_playwright_port_for_new_user(): + # Existing users sticky; a brand-new user gets the next free port from 8931. + ds = eng.derive_desired_state( + _roster(THREE), {}, existing_playwright_ports={"wizard": 8931, "emo": 8932} + ) + assert ds.playwright_ports["wizard"] == 8931 + assert ds.playwright_ports["emo"] == 8932 + assert ds.playwright_ports["ancamilea"] == 8933 # next free, skipping 8931/8932 + + +def test_playwright_ports_are_disjoint_from_t3_ports(): + ds = eng.derive_desired_state(_roster(THREE), LIVE_PORTS, LIVE_PLAYWRIGHT_PORTS) + assert set(ds.ports.values()).isdisjoint(ds.playwright_ports.values()) + + +def test_desired_state_dict_includes_playwright_ports(): + # The JSON adapter is the contract the bash provisioner consumes via jq. + d = eng._desired_state_to_dict( + eng.derive_desired_state(_roster(THREE), {}, LIVE_PLAYWRIGHT_PORTS) + ) + assert d["playwright_ports"] == LIVE_PLAYWRIGHT_PORTS + + # -------------------------------------------------------------------------- # groups_to_add: the additive-only invariant (module #1) # -------------------------------------------------------------------------- From 63e714782c65d3d3ac529786363710f1082a6044 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Tue, 16 Jun 2026 22:11:27 +0000 Subject: [PATCH 6/6] immich: remove one-shot anca-elements-import Job + its PVC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All of Anca's photos are imported. The Job was declared as kubernetes_job_v1.anca_elements_import — meaning every `terragrunt apply` of the immich stack re-created it, despite the 2026-05-25 in-code comment saying "After successful completion: REMOVE this resource block + apply again." 
Nobody noticed for 22 days; the re-trigger today (2026-06-16) was the 6th IO-pressure incident — it scanned all 21,643 assets in pure read-scan mode for 51 min, saturated sdc, starved etcd, crash-looped kube-apiserver. Recovery actions taken before this commit: - Throttled nfsd 64→8 on PVE host to give apiserver headroom - `kubectl delete job -n immich anca-elements-import` + force-delete pod - Restored nfsd to 64; cluster healthy Code change here: - Remove `kubernetes_job_v1.anca_elements_import` block - Remove `module.nfs_anca_elements_host` (PVC `immich-anca-elements-host` — no live consumer; videos batch deferred per user, source dump remains on PVE at /srv/nfs/anca-elements, browseable via Nextcloud admin) - Update 2026-05-25 post-mortem: 6th-incident section + new lesson that one-shot Jobs do NOT belong in kubernetes_job_v1 (use a suspended CronJob or a runbook-captured `kubectl create job` ad-hoc invocation instead). --- ...026-05-25-immich-anca-elements-io-storm.md | 10 ++ stacks/immich/main.tf | 130 ------------------ 2 files changed, 10 insertions(+), 130 deletions(-) diff --git a/docs/post-mortems/2026-05-25-immich-anca-elements-io-storm.md b/docs/post-mortems/2026-05-25-immich-anca-elements-io-storm.md index f3ea2b8e..0d4d82c1 100644 --- a/docs/post-mortems/2026-05-25-immich-anca-elements-io-storm.md +++ b/docs/post-mortems/2026-05-25-immich-anca-elements-io-storm.md @@ -178,6 +178,16 @@ During the recovery, a second cascade was discovered that compounded the outage: **Still the real fix (from this PM, still TODO):** the P0 import-side cap, and especially the **IO-isolation** items — move k8s-master **etcd** + node OS disks off sdc onto SSD (generalize P3), and/or give the Immich library its own spindle (P1). Concurrency caps are a band-aid; sdc remains a single shared failure domain that every storm finds. Tracked in beads (see Follow-up Implementation). 
+## Update 2026-06-16 — 6th IO-pressure incident (same `anca-elements-import` Job re-triggered) + +**Same direct trigger as 2026-05-25.** The original `kubernetes_job_v1.anca_elements_import` resource block was never removed from `stacks/immich/main.tf` after the 2026-05-25 import completed — despite the in-code comment instructing "After successful completion: REMOVE this resource block + apply again." Every subsequent `terragrunt apply` of the immich stack re-created the Job. On 2026-06-16 ~20:50 UTC it ran again with the original `--concurrent-tasks 20`, scanning all 21,643 Immich assets in pure read-scan mode (`Uploaded 0`) for ~51 min. Result mirrored 2026-06-01: 62 of 64 nfsd threads in D-state on `folio_wait_bit_common`, sdc 80–82% util, **etcd starved → kube-apiserver crash-loop with `start-service-ip-repair-controllers failed: unable to perform initial IP and Port allocation check`**. Cluster unreachable; PVE host load peaked at 102 of 44 threads. The 2026-06-01 server-side job concurrency caps (`thumbnailGeneration=2, metadataExtraction=2, library=2`) held — the storm was on the import side, not the ML side. + +**Immediate recovery**: `nfsd` throttled `64 → 8` threads on the PVE host (gave apiserver enough headroom to come back), then `kubectl delete job -n immich anca-elements-import` + force-delete the pod. Storm cleared instantly: sdc 80% → 30% util, all nfsd threads idle, apiserver `/readyz: ok`. nfsd restored to 64. + +**Permanent fix (this commit)**: Removed `kubernetes_job_v1.anca_elements_import` AND the `module "nfs_anca_elements_host"` PVC from `stacks/immich/main.tf`. The photo batch is complete; per user, the videos batch is not on the near roadmap, so the PVC + the comment scaffold around it are gone too. The on-disk dump at `/srv/nfs/anca-elements` on the PVE host is **kept** (browseable via Nextcloud's admin-only "PVE NFS Pool" mount); decision on deletion deferred to user. 
A future import would re-add the PVC + a fresh Job (or, better, a one-shot manual `kubectl create job` invocation that does not live in Terraform — see Lessons below). + +**Updated lesson — one-shot Jobs do NOT belong in `kubernetes_job_v1`.** TF treats Jobs as long-lived resources and re-creates them on every apply if state drift is detected. A truly one-shot import either (a) becomes a `kubernetes_cron_job_v1` with `suspend = true` (Viktor can un-suspend → run → re-suspend) or (b) lives outside TF entirely as a `kubectl create job --from=...` ad-hoc invocation captured in `docs/runbooks/`. The "REMOVE this resource block + apply again" comment failed as a control because nobody noticed it for 22 days. + ## Related - 2026-05-09 IO post-mortem: `docs/post-mortems/2026-05-09-io-pressure-stale-nfs.md` diff --git a/stacks/immich/main.tf b/stacks/immich/main.tf index 8ee67b11..ccb3bd2f 100644 --- a/stacks/immich/main.tf +++ b/stacks/immich/main.tf @@ -124,19 +124,6 @@ module "nfs_ml_cache_host" { nfs_path = "/srv/nfs-ssd/immich/machine-learning" } -# Read-only source for one-shot bulk imports into individual users' accounts -# (currently: Anca's WD Elements dump, mirrored to /srv/nfs/anca-elements from -# her Synology). Consumed only by the import Job below — NOT mounted into the -# immich-server Deployment. PVC stays after the Job is removed so videos can -# follow in batch 2. -module "nfs_anca_elements_host" { - source = "../../modules/kubernetes/nfs_volume" - name = "immich-anca-elements-host" - namespace = kubernetes_namespace.immich.metadata[0].name - nfs_server = var.proxmox_host - nfs_path = "/srv/nfs/anca-elements" -} - resource "kubernetes_namespace" "immich" { metadata { name = "immich" @@ -1178,123 +1165,6 @@ resource "kubernetes_cron_job_v1" "postgresql-backup" { } } -# One-shot bulk import of Anca's Synology Elements photo archive into her -# Immich account. 
Reads /srv/nfs/anca-elements via the RO PVC above and posts -# assets to immich-server in-cluster (bypasses ingress + CrowdSec entirely). -# -# Auth: Anca's personal Immich API key. Add to Vault `secret/immich` under key -# `anca_api_key`, then force-refresh the existing `immich-secrets` ExternalSecret: -# kubectl annotate externalsecret immich-secrets -n immich \ -# force-sync=$(date +%s) --overwrite -# -# After successful completion: REMOVE this resource block + apply again. The -# PVC stays for a videos batch later. Filters target a photo-only subset of -# the dump (videos / installers / docs / courses banned); EXIF is preserved -# end-to-end since immich-go uploads originals byte-for-byte. -resource "kubernetes_job_v1" "anca_elements_import" { - metadata { - name = "anca-elements-import" - namespace = kubernetes_namespace.immich.metadata[0].name - labels = { - app = "anca-elements-import" - tier = local.tiers.gpu - } - } - - # Don't block `terragrunt apply` on the multi-hour upload — TF returns once - # the Job is created; monitor via `kubectl logs -n immich -f job/...`. 
- wait_for_completion = false - - spec { - backoff_limit = 20 - ttl_seconds_after_finished = 604800 - template { - metadata { - labels = { - app = "anca-elements-import" - } - } - spec { - restart_policy = "OnFailure" - container { - name = "immich-go" - image = "alpine:3.20" - command = [ - "/bin/sh", - "-c", - <<-EOT - set -eu - apk add --no-cache curl tar ca-certificates >/dev/null - - IMMICH_GO_VERSION="v0.31.0" - cd /tmp - echo "Downloading immich-go $${IMMICH_GO_VERSION}…" - curl -sL "https://github.com/simulot/immich-go/releases/download/$${IMMICH_GO_VERSION}/immich-go_Linux_x86_64.tar.gz" \ - | tar -xz - chmod +x ./immich-go - - echo "Starting upload from /data → http://immich-server.immich.svc.cluster.local:2283 …" - exec ./immich-go upload from-folder /data \ - --server http://immich-server.immich.svc.cluster.local:2283 \ - --api-key "$${IMMICH_API_KEY}" \ - --include-extensions .jpg,.jpeg,.png,.heic,.heif,.gif,.tif,.tiff,.webp,.nef,.cr2,.dng,.raw \ - --into-album "Poze (Elements)" \ - --ban-file "filme/" --ban-file "Music/" --ban-file "carti/" \ - --ban-file "cursuri/" --ban-file "Adobe.*/" \ - --ban-file "Fullstack Web Development*/" \ - --ban-file "Contracte and CV/" --ban-file "Cv/" \ - --ban-file "docum/" --ban-file "finance/" \ - --ban-file "download/" --ban-file "kit/" \ - --ban-file "csp/" --ban-file "KOREAN/" \ - --ban-file "System Volume Information/" \ - --pause-immich-jobs=false \ - --concurrent-tasks 20 \ - --client-timeout 1h \ - --no-ui \ - --on-errors continue - EOT - ] - env { - name = "IMMICH_API_KEY" - value_from { - secret_key_ref { - name = "immich-secrets" - key = "anca_api_key" - } - } - } - volume_mount { - name = "anca-elements" - mount_path = "/data" - read_only = true - } - resources { - requests = { - cpu = "500m" - memory = "1Gi" - } - limits = { - memory = "1Gi" - } - } - } - volume { - name = "anca-elements" - persistent_volume_claim { - claim_name = module.nfs_anca_elements_host.claim_name - read_only = true - } - } - } - } 
- } - lifecycle { - # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2 - ignore_changes = [spec[0].template[0].spec[0].dns_config] - } - depends_on = [kubernetes_manifest.external_secret] -} - # POWER TOOLS # resource "kubernetes_deployment" "powertools" {