From 5549fc3672bc4b86b5063590f5953db5c86c0eaa Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sat, 20 Jun 2026 20:10:40 +0000 Subject: [PATCH] Add per-user Claude auth renewal Each workstation user needs a continuously valid Claude token under their own Enterprise identity. Store only that user's OAuth state in an isolated Vault path, renew and verify it automatically, recover from Vault when possible, and alert when interactive SSO is required. --- docs/architecture/multi-tenancy.md | 4 +- ...026-06-07-multi-user-workstation-design.md | 2 +- .../runbooks/claude-auth-renew-workstation.md | 95 +++++++++++ scripts/claude-auth-sync@.service | 20 +++ scripts/claude-auth-sync@.timer | 12 ++ scripts/t3-provision-users.sh | 57 ++++--- scripts/test-claude-auth-sync.sh | 32 ++++ scripts/workstation/claude-auth-sync.sh | 153 ++++++++++++++++++ scripts/workstation/setup-devvm.sh | 22 ++- stacks/monitoring/modules/monitoring/loki.tf | 14 ++ stacks/vault/main.tf | 25 +++ 11 files changed, 408 insertions(+), 28 deletions(-) create mode 100644 docs/runbooks/claude-auth-renew-workstation.md create mode 100644 scripts/claude-auth-sync@.service create mode 100644 scripts/claude-auth-sync@.timer create mode 100755 scripts/test-claude-auth-sync.sh create mode 100755 scripts/workstation/claude-auth-sync.sh diff --git a/docs/architecture/multi-tenancy.md b/docs/architecture/multi-tenancy.md index 27d856ef..4f5136f9 100644 --- a/docs/architecture/multi-tenancy.md +++ b/docs/architecture/multi-tenancy.md @@ -547,6 +547,8 @@ Separate from the in-cluster namespace-owner model above, the **devvm** (`10.0.1 **Claude Code runtime — native, per-user (2026-06-15):** `claude` is the **native** install (`~/.local/bin/claude` → `~/.local/share/claude/versions/`, self-updating; `installMethod: native`) — NOT npm-global or npx. It is the runtime for both the ttyd launcher and each `t3-serve` instance. 
`setup-devvm.sh` installs node ONLY for the `t3` CLI (not claude); per-user native claude is provisioned by the reconcile's `install_user_claude_native` (covers terminal + t3, idempotent, skip-if-present) and self-bootstrapped by `start-claude.sh` on first launch — both via the official `https://claude.ai/install.sh`. The legacy machine-wide `npm install -g @anthropic-ai/claude-code` bootstrap and the launcher's `npx` fallback were removed; existing users had already auto-migrated to native, and the npm-global dir was empty. **PATH (`~/.local/bin`, where the native binary lives):** ensured three ways — `/etc/profile.d/10-local-bin.sh` for login shells (machine-wide, fresh-user-safe), `start-claude.sh` itself (the launcher runs in tmux's non-login env that skips the user's shell rc), and `t3-serve@.service` (`Environment=PATH=…:/home/%i/.local/bin`). +**Claude authentication — per-user, self-renewing, Vault-recoverable (2026-06-20):** every roster user logs in with their OWN Enterprise identity; shared `CLAUDE_CODE_OAUTH_TOKEN` injection was removed because environment auth outranks local login and collapses identity/audit/quota. Claude owns access-token refresh in `~/.claude/.credentials.json`. A system template timer (`claude-auth-sync@.timer`, every 6h) renews a dedicated 32-day periodic Vault token, validates Claude with real non-persistent Haiku inference (`auth status` can lie during a 401), backs up only `claudeAiOauth` to `secret/workstation/claude-users/<os_user>`, and performs one atomic Vault restore/retry on failure while preserving `mcpOAuth`. Vault policy `workstation-claude-<os_user>` isolates every path; the roster generates policies for present and future users. A hard refresh-token revocation still requires the affected person to complete SSO—there is no supported noninteractive bypass. Loki alert `WorkstationClaudeAuthInvalid` surfaces exhausted recovery. Runbook: `../runbooks/claude-auth-renew-workstation.md`. 
+ **Per-user browser MCP — playwright, reproducible from git (2026-06-16):** every user (incl. the admin) gets their OWN isolated `@playwright/mcp` server so their concurrent Claude sessions don't fight over tabs (`--isolated` → a fresh browser context per MCP connection), wired into Claude in **every directory** via a user-scope `~/.claude.json` entry (`playwright → http://localhost:/mcp`). Mechanism: **system-level template units** `playwright-mcp@.service` + `playwright-snapshot-refresh@.{service,timer}` (`User=%i`, sourced from `scripts/workstation/playwright/`, installed by `setup-devvm.sh` §9e — system manager, so NO systemd --user / linger). `roster_engine.py` allocates a sticky per-user `PLAYWRIGHT_PORT` (`PLAYWRIGHT_BASE_PORT=8931`); the reconcile's `install_playwright()` writes it, seeds the chrome-service snapshot token if-absent (staged from Vault `secret/chrome-service` to `/etc/t3-serve/chrome-service-token` by `setup-devvm.sh` §8c, since the hourly root reconcile has no Vault token), wires `~/.claude.json` by running `claude mcp add --scope user` AS the user (clobber-proof + if-absent, so it fixes existing/new/admin without rewriting a populated config), and `enable --now`s the instances (idempotent — never restarts a running server). The `@playwright/mcp` version is **pinned** in the unit (the `@latest`-silently-rolls-the-fleet footgun — see `T3_PIN`). Replaced the earlier hand-made `~/.config/systemd/user/playwright-*` units (one-time idle-gated migration; pre-migration emo/anca had servers running but never wired into their `.claude.json`). Cookie-warming pipeline + ops: `../runbooks/chrome-service-snapshot.md`. **Infra access:** non-admins get their own **writable, git-crypt-LOCKED** clone of the (public) infra repo — code/docs plaintext, secret files (`*.tfvars`, `secrets/**`) stay ciphertext. 
Its location depends on the per-user `code_layout` in `roster.yaml`: `single` (default) puts the clone AT `~/code`; `workspace` makes `~/code` a plain directory of per-project clones — the infra clone at `~/code/infra` plus each roster `repos` entry cloned from Forgejo `viktor/` **as the user** (their PAT authenticates, so private repos work; clone failures WARN and retry next hour). Flipping a user to `workspace` auto-migrates their existing `~/code` clone to `~/code/infra` (local branches/dirty state survive; running processes follow the moved inode). ancamilea = workspace + `tripit` since 2026-06-10. The provisioner clones infra anonymously from the public GitHub mirror; **contribute access is wired per-user on top** (see below). The apply boundary still holds (`scripts/tg apply` needs an admin Vault token + cluster RBAC), but **pushing `master` is NOT inert** — the Forgejo→Woodpecker webhook fires `.woodpecker/default.yml` (`event: push, branch: master`, `require_approval: forks` only), which terragrunt-applies changed stacks. `master` is **branch-protected on Forgejo** (force-push disabled for everyone — history is append-only; push + merge whitelists = `viktor` + explicitly granted users, deploy keys allowed). **Allow-then-audit (Viktor, 2026-06-10):** `ebarzin` (emo) is on the whitelist and pushes straight to `master` — no PR gate. The tracking burden moves to: (a) **commit messages that record what + why** (the agent instructions in AGENTS.md and the managed claudeMd require the body to paraphrase the user's request), (b) the **`notify-nonadmin-push` Slack audit step** in `.woodpecker/default.yml` — every master push by a non-admin author is posted to Slack (admin pushes are not), and (c) non-admins **never use `[ci skip]`** so every change fires the pipeline (and thus the audit feed). Users NOT on the whitelist fall back to `/` branches + PRs. 
**Clones stay fresh automatically** (2026-06-10): the hourly `t3-provision-users` reconcile runs `refresh_user_clone` over every managed clone — the infra clone and any workspace repos (fetch all remotes + fast-forward `master`, ONLY when on master with a clean tree and an upstream — dirty trees and local commits are left alone with a WARN) — and also `wire_forgejo_remote`, which idempotently adds the documented `forgejo` remote + `forgejo/master` upstream to infra clones that predate that contract. `start-claude.sh` does the same freshen at session launch (10s fetch cap per repo so an offline remote never stalls the session; workspace layouts freshen each repo under `~/code`). @@ -561,7 +563,7 @@ Separate from the in-cluster namespace-owner model above, the **devvm** (`10.0.1 **Web-terminal session persistence (2026-06-10):** the tmux-based web terminal's named sessions (each running one Claude conversation) survive devvm reboots — `tmux-persist-save.timer` (5-min) snapshots every terminal user's sessions (name, cwd, conversation uuid from argv or the cwd-slug transcript dir) to `/var/lib/tmux-persist/.tsv`, and `tmux-persist-restore.service` recreates missing sessions at boot with `claude --resume ` (per-session idempotent; also handles partial loss). The web terminal also exposes an **on-demand "Restore sessions" button** (terminal-lobby: `tmux-api` `POST /restore` → the validated root `tmux-restore-user` wrapper → `tmux-persist restore `, a single-user mode of the same script): the boot-only restore service never fires when an **OOM kills a user's tmux server *without* a reboot** (the common case under multi-user memory pressure), so the button covers that gap. 
This is a **tmux/terminal-surface** feature, deliberately outside the t3 namespace: the t3 chat surface persists its own threads (`~/.t3` state, plus the daily `t3-backup-state` dump), and Claude conversations themselves were always durable (`~/.claude/projects/`) — what this adds is the volatile tmux wiring. -**Status (2026-06-10):** built + verified on the live host — capacity (8 GiB swap), config inheritance, roster-driven provisioner, per-user locked clone, per-user OIDC kubeconfig + the `oidc-power-user-readonly` ClusterRole + emo's `k8s_users` entry (applied + impersonation-verified), the Authentik `T3 Users` edge gate, **the emo Phase-5 cutover (own clone + launcher repoint + `code-shared` removal, completed 2026-06-10) and emo's contribute access (`ebarzin` write collaborator + PAT + protected `master`)**, and **per-user `code_layout` with the ancamilea workspace cutover (infra → `~/code/infra`, `tripit` alongside, 2026-06-10)**. Per the live `/etc/skel` design, non-admin `~/.claude/{rules,skills}` symlinks into the admin base are **kept** (they ARE the shared-base delivery mechanism — the plan's step to remove them is obsolete). **Remaining (held / future):** the offboarding apply-side (Phase 7), the rest of per-user MCP/auth injection (`ha` + `claude_memory` + `.credentials.json` + beads Dolt cred — **per-user playwright browser MCP done 2026-06-16**, see above), and roster-reconciled `T3 Users` membership. See `../runbooks/offboard-user.md` for deprovisioning. 
+**Status (2026-06-20):** built + verified on the live host — capacity (8 GiB swap), config inheritance, roster-driven provisioner, per-user locked clone, per-user OIDC kubeconfig + the `oidc-power-user-readonly` ClusterRole + emo's `k8s_users` entry (applied + impersonation-verified), the Authentik `T3 Users` edge gate, **the emo Phase-5 cutover (own clone + launcher repoint + `code-shared` removal, completed 2026-06-10) and emo's contribute access (`ebarzin` write collaborator + PAT + protected `master`)**, **per-user `code_layout` with the ancamilea workspace cutover**, per-user playwright browser MCP, and per-user Claude OAuth renewal/Vault recovery. Per the live `/etc/skel` design, non-admin `~/.claude/{rules,skills}` symlinks into the admin base are **kept**. **Remaining (held / future):** the offboarding apply-side (Phase 7), per-user `ha`/`claude_memory`/beads credential injection, and roster-reconciled `T3 Users` membership. See `../runbooks/offboard-user.md` for deprovisioning. ## Related diff --git a/docs/plans/2026-06-07-multi-user-workstation-design.md b/docs/plans/2026-06-07-multi-user-workstation-design.md index 8e54fa95..4d80eae4 100644 --- a/docs/plans/2026-06-07-multi-user-workstation-design.md +++ b/docs/plans/2026-06-07-multi-user-workstation-design.md @@ -110,7 +110,7 @@ The Config base / machine-wide managed layer is **secret-free**. 
Everything carr | Auth / token | Lives in (per-user, `0600`) | New-user provisioning (from Vault) | |---|---|---| -| **Claude OAuth** | `~/.claude/.credentials.json` (or `CLAUDE_CODE_OAUTH_TOKEN`) | the shared Enterprise token (earlier decision) **or** own interactive login; emo keeps his own | +| **Claude OAuth** | `~/.claude/.credentials.json` + isolated Vault backup | own Enterprise SSO login; Claude refreshes locally and `claude-auth-sync@.timer` validates/backs up/recovers `claudeAiOauth` at `secret/workstation/claude-users/<os_user>`; shared token injection is forbidden | | **`claude_memory` MCP** | `~/.claude.json` mcpServers + `MEMORY_API_KEY` in `settings.json` env | **DEFERRED — not a risk now (Viktor, 2026-06-08).** Per-user memory isolation needs a service-side `_key_to_user` map edit + redeploy (claude-memory-mcp, GHA repo 78), not just a Vault write — NOT built now. For now a new user gets a simple key or omits memory; revisit if isolation becomes a concern. | | **`ha` MCP** (token-in-URL) | `~/.claude.json` | shared `ha_sofia_mcp_url` from Vault `secret/openclaw` (one HA instance; shared secret, per-user file) — only if HA-eligible | | **`playwright` MCP** | per-user systemd unit (own port) + localhost entry | existing per-user playwright pattern (id=4015); non-secret | diff --git a/docs/runbooks/claude-auth-renew-workstation.md b/docs/runbooks/claude-auth-renew-workstation.md new file mode 100644 index 00000000..f5ce6625 --- /dev/null +++ b/docs/runbooks/claude-auth-renew-workstation.md @@ -0,0 +1,95 @@ +# Workstation Claude authentication renewal + +## Scope + +Every roster user authenticates Claude Code with their own Enterprise identity. +Credentials are never shared between OS users. 
Claude refreshes its normal OAuth +access token; `claude-auth-sync@.timer` verifies that refresh using real +inference every six hours and backs up only the `claudeAiOauth` object to: + +```text +secret/workstation/claude-users/<os_user> +``` + +The user's unrelated `mcpOAuth` credentials never leave their home directory. +Each renewal service has a distinct 32-day periodic Vault token, mode `0600`, at +`~/.config/claude-auth-sync/vault-token`. Its policy can access only that user's +path. The service renews the Vault token on every run. + +## Normal lifecycle + +1. Add the user to `scripts/workstation/roster.yaml` and apply the Vault stack. +2. Run `scripts/workstation/setup-devvm.sh` as root with the admin Vault token. + Its foreground provisioner mints the isolated periodic token and enables the + user's timer. Routine hourly provisioning never needs an admin token. +3. The user completes one initial Enterprise login: + + ```bash + claude auth login --claudeai --sso --email <user@example.com> + ``` + +4. Start the first sync immediately instead of waiting for the timer: + + ```bash + systemctl start claude-auth-sync@<os_user>.service + systemctl status claude-auth-sync@<os_user>.service + ``` + +Success writes no secrets to the journal. The user's private log records `OK` in +`~/.local/state/claude-auth-sync/sync.log`; journald receives the same status with +`identifier=claude-auth-sync` for Loki alerting. + +## Automatic recovery + +`claude auth status` is not a sufficient health check: it can report logged in +while inference returns HTTP 401. The service therefore runs a minimal Haiku +inference with no session persistence. On failure it: + +1. reads the user's latest OAuth object from Vault; +2. atomically merges it into `.credentials.json`, preserving MCP OAuth state; +3. retries inference once; +4. stores the newly refreshed OAuth object back in Vault on success. 
+ +Vault KV version history remains available for audit, but the service deliberately +does not cycle through old refresh tokens: providers commonly invalidate rotated +refresh tokens, so replaying old versions can make recovery less deterministic. + +## Recovery requiring a person + +If both local state and the latest Vault copy fail, the refresh token was revoked, +invalidated, or the Enterprise session requires reauthorization. Run the login as +the affected OS user, then rerun the service: + +```bash +claude auth login --claudeai --sso --email <user@example.com> +systemctl start claude-auth-sync@$(id -un).service +``` + +If the scoped Vault token expired or drift protection rejected it, delete that user's stale `~/.config/claude-auth-sync/vault-token` (the provisioner mints a token only when that file is missing or empty), then rerun the root +provisioner with an admin Vault token after confirming the matching policy exists: + +```bash +export VAULT_ADDR=https://vault.viktorbarzin.me +export VAULT_TOKEN="$(cat /home/wizard/.vault-token)" +sudo --preserve-env=VAULT_ADDR,VAULT_TOKEN /usr/local/bin/t3-provision-users +``` + +Never copy another user's `.credentials.json` or scoped Vault token. Never restore +the old shared `CLAUDE_CODE_OAUTH_TOKEN`; environment credentials outrank per-user +login and would silently collapse all users onto one identity. + +## Verification + +```bash +systemctl list-timers 'claude-auth-sync@*' +systemctl status claude-auth-sync@<os_user>.service +journalctl -t claude-auth-sync --since today +``` + +Inspect Vault metadata, not secret values: + +```bash +vault kv metadata get secret/workstation/claude-users/<os_user> +``` + +Alert `WorkstationClaudeAuthInvalid` fires when any renewal agent logs `FAIL`. 
diff --git a/scripts/claude-auth-sync@.service b/scripts/claude-auth-sync@.service
new file mode 100644
index 00000000..3750f295
--- /dev/null
+++ b/scripts/claude-auth-sync@.service
@@ -0,0 +1,20 @@
+[Unit]
+Description=Validate and back up Claude OAuth credentials for %i
+Documentation=https://github.com/ViktorBarzin/infra/blob/master/docs/runbooks/claude-auth-renew-workstation.md
+Wants=network-online.target
+After=network-online.target
+
+[Service]
+Type=oneshot
+User=%i
+Group=%i
+Environment=HOME=/home/%i
+Environment=PATH=/usr/local/bin:/usr/bin:/bin:/home/%i/.local/bin
+ExecStart=/usr/local/bin/claude-auth-sync
+
+# Credential and Vault access are required; keep the remaining host surface narrow.
+NoNewPrivileges=true
+PrivateTmp=true
+ProtectSystem=strict
+ProtectHome=read-only
+ReadWritePaths=-/home/%i/.claude -/home/%i/.claude.json -/home/%i/.config/claude-auth-sync -/home/%i/.local/state/claude-auth-sync
diff --git a/scripts/claude-auth-sync@.timer b/scripts/claude-auth-sync@.timer
new file mode 100644
index 00000000..b25f2ecd
--- /dev/null
+++ b/scripts/claude-auth-sync@.timer
@@ -0,0 +1,12 @@
+[Unit]
+Description=Keep Claude OAuth credentials valid and recoverable for %i
+
+[Timer]
+OnBootSec=10m
+OnUnitActiveSec=6h
+# No Persistent=: it only affects OnCalendar= timers; OnBootSec= covers reboots.
+RandomizedDelaySec=10m
+Unit=claude-auth-sync@%i.service
+
+[Install]
+WantedBy=timers.target
diff --git a/scripts/t3-provision-users.sh b/scripts/t3-provision-users.sh
index ae1a7759..04e90158 100644
--- a/scripts/t3-provision-users.sh
+++ b/scripts/t3-provision-users.sh
@@ -251,23 +251,63 @@ env_set() {
   chmod 600 "$file"
 }
 
-# Share the admin's Claude subscription with a non-admin: inject CLAUDE_CODE_OAUTH_TOKEN
-# (the staged long-lived token) into their t3-serve env — ONLY if they have neither their
-# own ~/.claude/.credentials.json (own login) nor an existing token. Never clobbers. The
-# agent picks it up when its t3-serve@ instance (re)starts.
-install_user_claude_token() {
-  local user="$1" home envf tok
-  local token_file="${CLAUDE_TOKEN_FILE:-/etc/t3-serve/claude-oauth-token}"
+env_unset() {
+  local file="$1" key="$2"
+  [[ -f "$file" ]] || return 0
+  grep -q "^${key}=" "$file" || return 0
+  if [[ "$DRY_RUN" == 1 ]]; then echo "[dry-run] unset $key -> $file"; return 0; fi
+  sed -i "/^${key}=.*/d" "$file"
+  chmod 600 "$file"
+  log "removed legacy shared $key -> $(basename "$file")"
+}
+
+# Install one user's isolated Claude credential renewal flow. The scoped periodic
+# Vault token is minted only when this reconcile has admin Vault access (normal
+# onboarding/deployment); routine token renewal is performed by the user service.
+#
+# The unit runs with ProtectHome=read-only and optional ("-") ReadWritePaths, so it
+# cannot create its own directories: they are pre-created here, one level at a
+# time, because `install -d` leaves any missing PARENT directory owned by root.
+# A stored token that Vault rejects or that carries a foreign display name is
+# discarded and re-minted whenever admin access is available.
+install_claude_auth_sync() {
+  local user="$1" home cfg token_file token policy dir stored_name admin_ok=0
   home="$(getent passwd "$user" | cut -d: -f6)"
   [[ -z "$home" ]] && return 0
-  [[ -f "$home/.claude/.credentials.json" ]] && return 0 # has own login -> leave it
-  [[ -r "$token_file" ]] || return 0
-  envf="${ENVDIR:-/etc/t3-serve}/$user.env"
-  grep -q '^CLAUDE_CODE_OAUTH_TOKEN=' "$envf" 2>/dev/null && return 0 # already shared
-  if [[ "$DRY_RUN" == 1 ]]; then echo "[dry-run] share Claude token -> $envf"; return 0; fi
-  tok="$(cat "$token_file")"
-  env_set "$envf" CLAUDE_CODE_OAUTH_TOKEN "$tok"
-  log "shared Claude token -> $user (t3-serve env; restart needed to take effect)"
+  cfg="$home/.config/claude-auth-sync"
+  token_file="$cfg/vault-token"
+  policy="workstation-claude-$user"
+
+  for dir in "$home/.claude" "$home/.config" "$cfg" \
+    "$home/.local" "$home/.local/state" "$home/.local/state/claude-auth-sync"; do
+    [[ -d "$dir" ]] && continue
+    if [[ "$DRY_RUN" == 1 ]]; then echo "[dry-run] mkdir $dir (owner $user)"; continue; fi
+    install -d -o "$user" -g "$user" -m 0700 "$dir" || log "WARN: could not create $dir for $user"
+  done
+
+  # Admin Vault access exists only on foreground onboarding/deployment runs.
+  vault token lookup >/dev/null 2>&1 && admin_ok=1
+  if [[ "$admin_ok" == 1 && -s "$token_file" ]]; then
+    stored_name="$(VAULT_TOKEN="$(cat "$token_file")" vault token lookup -field=display_name 2>/dev/null)" || stored_name=""
+    if [[ "$stored_name" != "token-devvm-claude-auth-$user" ]]; then
+      log "stored Claude-auth Vault token for $user is expired or foreign -> re-minting"
+      [[ "$DRY_RUN" == 1 ]] || rm -f "$token_file"
+    fi
+  fi
+
+  if [[ ! -s "$token_file" ]]; then
+    if [[ "$DRY_RUN" == 1 ]]; then
+      echo "[dry-run] mint scoped Claude-auth Vault token -> $user"
+    elif [[ "$admin_ok" == 1 ]] && \
+      token="$(vault token create -orphan -period=768h -policy="$policy" \
+        -display-name="devvm-claude-auth-$user" -field=token 2>/dev/null)"; then
+      install -o "$user" -g "$user" -m 0600 /dev/stdin "$token_file" <<<"$token"
+      log "minted isolated Claude-auth Vault token -> $user"
+    else
+      log "WARN: scoped Claude-auth Vault token missing for $user (run provisioner with admin VAULT_TOKEN after vault stack apply)"
+    fi
+  fi
+  run systemctl enable --now "claude-auth-sync@$user.timer" >/dev/null 2>&1 || true
 }
 
 # Re-deploy the managed per-user Claude launcher to ~/start-claude.sh. /etc/skel only
@@ -421,7 +461,7 @@ while IFS=$'\t' read -r os_user tier shell groups_csv code_layout repos_csv; do
       log "add $os_user -> group $g"; run gpasswd -a "$os_user" "$g" >/dev/null
     done
   fi
-  if [[ "$tier" != admin ]]; then # non-admins: locked clone(s) (kept fresh) + kubeconfig + shared Claude token
+  if [[ "$tier" != admin ]]; then # non-admins: locked clone(s) (kept fresh) + kubeconfig
     if [[ "$code_layout" == workspace ]]; then
       ensure_workspace_layout "$os_user"
       install_locked_clone "$os_user" code/infra
@@ -440,17 +480,20 @@ while IFS=$'\t' read -r os_user tier shell groups_csv code_layout repos_csv; do
       refresh_user_clone "$os_user" code
     fi
     install_user_kubeconfig "$os_user"
-    install_user_claude_token "$os_user"
     deploy_user_launcher "$os_user" # keep ~/start-claude.sh current (skel only seeds new accounts)
   fi
   refresh_codex_mirror "$os_user" # all tiers — mirror of the managed claudeMd
   install_user_claude_native "$os_user" # all tiers — per-user native claude (terminal + t3); no npm/npx
+  install_claude_auth_sync "$os_user" # all tiers — own Claude identity + isolated Vault recovery
 done < <(jq -r '.accounts[] | [.os_user, .tier, .shell, (if (.groups|length)==0 then 
"-" else (.groups|join(",")) end), .code_layout, (if (.repos|length)==0 then "-" else (.repos|join(",")) end)] | @tsv' "$desired_file") # 5) per-user .env (sticky port) + enable t3-serve@ while IFS=$'\t' read -r os_user port; do envf="$ENVDIR/$os_user.env" - env_set "$envf" T3_PORT "$port" # update-or-append; preserves CLAUDE_CODE_OAUTH_TOKEN + env_set "$envf" T3_PORT "$port" + # Per-user Enterprise login is authoritative. A legacy shared setup-token has + # higher credential precedence and would silently defeat user isolation. + env_unset "$envf" CLAUDE_CODE_OAUTH_TOKEN id "$os_user" >/dev/null 2>&1 && run systemctl enable --now "t3-serve@$os_user.service" >/dev/null 2>&1 || true done < <(jq -r '.ports | to_entries[] | [.key, .value] | @tsv' "$desired_file") diff --git a/scripts/test-claude-auth-sync.sh b/scripts/test-claude-auth-sync.sh new file mode 100755 index 00000000..10f07746 --- /dev/null +++ b/scripts/test-claude-auth-sync.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +set -uo pipefail +DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=workstation/claude-auth-sync.sh +source "$DIR/workstation/claude-auth-sync.sh" + +pass=0 fail=0 +ok() { if "${@:2}"; then pass=$((pass+1)); else fail=$((fail+1)); echo "FAIL: $1"; fi; } +no() { if "${@:2}"; then fail=$((fail+1)); echo "FAIL: $1"; else pass=$((pass+1)); fi; } +eq() { if [[ "$2" == "$3" ]]; then pass=$((pass+1)); else fail=$((fail+1)); echo "FAIL: $1"; fi; } + +tmp="$(mktemp -d)"; trap 'rm -rf "$tmp"' EXIT +valid='{"mcpOAuth":{"server":{"accessToken":"mcp-secret"}},"claudeAiOauth":{"accessToken":"access","refreshToken":"refresh","expiresAt":123,"scopes":["user:inference"]}}' +printf '%s\n' "$valid" > "$tmp/credentials.json" + +oauth="$(cas_oauth_from_credentials "$tmp/credentials.json")" +eq "extract OAuth object" 'access' "$(jq -r .accessToken <<<"$oauth")" +printf '{"claudeAiOauth":{"accessToken":"access","expiresAt":123}}\n' > "$tmp/bad.json" +no "reject missing refresh token" 
cas_oauth_from_credentials "$tmp/bad.json"
+
+replacement='{"accessToken":"new-access","refreshToken":"new-refresh","expiresAt":456}'
+merged="$(cas_merge_oauth "$tmp/credentials.json" "$replacement")"
+eq "replace Claude access token" new-access "$(jq -r .claudeAiOauth.accessToken <<<"$merged")"
+eq "preserve MCP OAuth" mcp-secret "$(jq -r '.mcpOAuth.server.accessToken' <<<"$merged")"
+
+export CAS_USER=emo
+ok "accept own scoped Vault token" cas_vault_identity_ok token-devvm-claude-auth-emo default,workstation-claude-emo
+no "reject another user's token" cas_vault_identity_ok token-devvm-claude-auth-anca default,workstation-claude-anca
+no "reject wrong policy" cas_vault_identity_ok token-devvm-claude-auth-emo default,workstation-claude-anca
+
+printf '\n%d passed, %d failed\n' "$pass" "$fail"
+(( fail == 0 ))
diff --git a/scripts/workstation/claude-auth-sync.sh b/scripts/workstation/claude-auth-sync.sh
new file mode 100755
index 00000000..dc3d780d
--- /dev/null
+++ b/scripts/workstation/claude-auth-sync.sh
@@ -0,0 +1,186 @@
+#!/usr/bin/env bash
+# Keep one Workstation user's Claude subscription OAuth credentials recoverable.
+# Claude owns access/refresh-token rotation in ~/.claude/.credentials.json. This
+# helper validates auth with real inference, stores only the claudeAiOauth object
+# in the user's isolated Vault path, and attempts one restore on failure.
+#
+# Token material is never passed as a command-line argument: argv is readable by
+# every local user through /proc, so secrets travel over stdin or file descriptors.
+set -euo pipefail
+
+CAS_USER="${CLAUDE_AUTH_USER:-$(id -un)}"
+CAS_HOME="${HOME:?HOME must be set}"
+CAS_CREDENTIALS="${CLAUDE_CREDENTIALS_FILE:-$CAS_HOME/.claude/.credentials.json}"
+CAS_CONFIG_DIR="${CLAUDE_AUTH_CONFIG_DIR:-$CAS_HOME/.config/claude-auth-sync}"
+CAS_VAULT_TOKEN_FILE="${CLAUDE_AUTH_VAULT_TOKEN_FILE:-$CAS_CONFIG_DIR/vault-token}"
+CAS_VAULT_PATH="${CLAUDE_AUTH_VAULT_PATH:-secret/workstation/claude-users/$CAS_USER}"
+CAS_STATE_DIR="${CLAUDE_AUTH_STATE_DIR:-$CAS_HOME/.local/state/claude-auth-sync}"
+CAS_LOG="$CAS_STATE_DIR/sync.log"
+
+# Record one status line. The private log file is best-effort; the journal line
+# feeds the Loki alert, so it is emitted even when the state dir is unwritable.
+cas_log() {
+  { mkdir -p "$CAS_STATE_DIR" && printf '%s %s\n' "$(date -Is)" "$*" >> "$CAS_LOG"; } 2>/dev/null || true
+  logger -t claude-auth-sync -- "user=$CAS_USER $*" 2>/dev/null || true
+}
+
+# Print the Claude OAuth object, or fail without exposing any token material.
+cas_oauth_from_credentials() {
+  jq -ce '.claudeAiOauth
+    | select((.accessToken | type) == "string" and (.accessToken | length) > 0)
+    | select((.refreshToken | type) == "string" and (.refreshToken | length) > 0)
+    | select((.expiresAt | type) == "number")' "$1"
+}
+
+# Merge a recovered OAuth object while preserving unrelated credentials (MCP OAuth).
+# $1 = credentials file, $2 = OAuth JSON value; prints the merged document.
+# The value reaches jq through a file descriptor rather than --argjson (argv).
+cas_merge_oauth() {
+  local credentials="$1" oauth="$2"
+  jq -ce --slurpfile oauth <(printf '%s' "$oauth") \
+    '($oauth | if length == 1 then .[0] else error("expected exactly one OAuth JSON value") end) as $new
+     | .claudeAiOauth = $new' "$credentials"
+}
+
+# Succeed only when the token's display name and policy list both name CAS_USER.
+# $1 = display_name from token lookup, $2 = comma-separated policy names.
+cas_vault_identity_ok() {
+  local display_name="$1" policies_csv="$2"
+  [[ "$display_name" == "token-devvm-claude-auth-$CAS_USER" ]] || return 1
+  # Literal substring match: the user name must not be interpreted as a regex.
+  [[ ",$policies_csv," == *",workstation-claude-$CAS_USER,"* ]]
+}
+
+# Load this user's scoped Vault token, verify it is really theirs, and renew it.
+cas_prepare_vault() {
+  [[ -s "$CAS_VAULT_TOKEN_FILE" ]] || {
+    cas_log "FAIL missing scoped Vault token; admin must run workstation provisioning"
+    return 1
+  }
+  export VAULT_ADDR="${VAULT_ADDR:-https://vault.viktorbarzin.me}"
+  VAULT_TOKEN="$(<"$CAS_VAULT_TOKEN_FILE")"; export VAULT_TOKEN
+
+  local info display_name policies
+  info="$(vault token lookup -format=json 2>/dev/null)" || {
+    cas_log "FAIL scoped Vault token lookup failed"
+    return 1
+  }
+  display_name="$(jq -r '.data.display_name // ""' <<<"$info")"
+  policies="$(jq -r '((.data.policies // []) + (.data.identity_policies // [])) | join(",")' <<<"$info")"
+  cas_vault_identity_ok "$display_name" "$policies" || {
+    cas_log "FAIL scoped Vault token drift detected; refusing foreign token"
+    return 1
+  }
+  vault token renew -format=json >/dev/null 2>&1 || {
+    cas_log "FAIL scoped Vault token renewal failed"
+    return 1
+  }
+}
+
+# auth status is not authoritative: it reported loggedIn=true during a real 401
+# on 2026-06-20. A tiny, non-persistent inference is the feedback loop.
+cas_live_auth_ok() {
+  local out
+  out="$(timeout 60 claude -p 'Reply with exactly AUTH_OK and nothing else.' \
+    --model haiku --max-turns 1 --no-session-persistence --tools "" \
+    --disable-slash-commands --setting-sources "" 2>/dev/null)" || return 1
+  [[ "$out" == "AUTH_OK" ]]
+}
+
+# Validate the local OAuth object and store it at the user's Vault path.
+cas_backup() {
+  local oauth expires
+  oauth="$(cas_oauth_from_credentials "$CAS_CREDENTIALS")" || {
+    cas_log "FAIL local Claude OAuth credential is absent or malformed"
+    return 1
+  }
+  expires="$(jq -r '.expiresAt' <<<"$oauth")"
+  # "key=-" makes vault read the value from stdin, keeping the refresh token out of argv.
+  printf '%s' "$oauth" | vault kv put "$CAS_VAULT_PATH" \
+    claude_ai_oauth_json=- \
+    credential_expires_at_ms="$expires" \
+    backed_up_at="$(date -Is)" >/dev/null || {
+    cas_log "FAIL Vault credential backup failed"
+    return 1
+  }
+  cas_log "OK Claude auth valid; refreshed OAuth state backed up to Vault"
+}
+
+# Replace the local claudeAiOauth object with the latest Vault copy. The previous
+# credentials file is kept as <file>.pre-restore so that a restore triggered by a
+# transient (non-auth) inference failure can be undone by hand.
+cas_restore() {
+  local oauth base tmp
+  oauth="$(vault kv get -field=claude_ai_oauth_json "$CAS_VAULT_PATH" 2>/dev/null)" || {
+    cas_log "FAIL no recoverable Claude OAuth credential in Vault"
+    return 1
+  }
+  jq -e 'select((.accessToken | type) == "string" and (.accessToken | length) > 0)
+    | select((.refreshToken | type) == "string" and (.refreshToken | length) > 0)
+    | select((.expiresAt | type) == "number")' <<<"$oauth" >/dev/null || {
+    cas_log "FAIL Vault Claude OAuth credential is malformed"
+    return 1
+  }
+
+  mkdir -p "$(dirname "$CAS_CREDENTIALS")"
+  if jq -e 'type == "object"' "$CAS_CREDENTIALS" >/dev/null 2>&1; then
+    base="$CAS_CREDENTIALS"
+    cp -p "$CAS_CREDENTIALS" "$CAS_CREDENTIALS.pre-restore" || {
+      cas_log "FAIL could not preserve local Claude credentials before restore"
+      return 1
+    }
+  else
+    base="$(mktemp)"; printf '{}\n' > "$base"
+  fi
+  tmp="$(mktemp "${CAS_CREDENTIALS}.XXXXXX")"
+  if ! cas_merge_oauth "$base" "$oauth" > "$tmp"; then
+    rm -f "$tmp"; [[ "$base" == "$CAS_CREDENTIALS" ]] || rm -f "$base"
+    cas_log "FAIL could not merge Vault Claude OAuth credential"
+    return 1
+  fi
+  [[ "$base" == "$CAS_CREDENTIALS" ]] || rm -f "$base"
+  # Checked explicitly: this function runs on the left of `||`, where errexit is
+  # suspended, so an unchecked mv failure would otherwise still log RECOVERED.
+  chmod 0600 "$tmp" && mv "$tmp" "$CAS_CREDENTIALS" || {
+    rm -f "$tmp"
+    cas_log "FAIL could not install restored Claude OAuth credential"
+    return 1
+  }
+  cas_log "RECOVERED restored Claude OAuth state from Vault"
+}
+
+# Entry point: validate auth and back up; on failure try one Vault restore and retry.
+cas_main() {
+  local bin
+  umask 077
+  for bin in jq vault claude timeout flock; do
+    command -v "$bin" >/dev/null || { cas_log "FAIL missing dependency: $bin"; return 1; }
+  done
+  # Fail loudly instead of dying under errexit: with a read-only home the state
+  # directory cannot be created here and must have been provisioned beforehand.
+  mkdir -p "$CAS_STATE_DIR" 2>/dev/null && [[ -w "$CAS_STATE_DIR" ]] || {
+    cas_log "FAIL state directory unavailable; admin must run workstation provisioning"
+    return 1
+  }
+  exec 9>"$CAS_STATE_DIR/lock"
+  flock -n 9 || { cas_log "SKIP another sync is already running"; return 0; }
+
+  cas_prepare_vault || return 1
+  if cas_live_auth_ok; then
+    cas_backup
+    return
+  fi
+
+  cas_log "WARN live Claude auth failed; attempting one Vault restore"
+  cas_restore || return 1
+  if cas_live_auth_ok; then
+    cas_backup
+    return
+  fi
+  cas_log "FAIL Claude auth still invalid after Vault restore; interactive SSO login required"
+  return 1
+}
+
+if [[ "${BASH_SOURCE[0]}" == "$0" ]]; then
+  cas_main "$@"
+fi
diff --git a/scripts/workstation/setup-devvm.sh b/scripts/workstation/setup-devvm.sh
index 1807bb80..2bb2f64e 100755
--- a/scripts/workstation/setup-devvm.sh
+++ b/scripts/workstation/setup-devvm.sh
@@ -125,14 +125,10 @@ if command -v vault >/dev/null; then
   if [[ -z "${VAULT_TOKEN:-}" && -r /home/wizard/.vault-token ]]; then
     VAULT_TOKEN="$(cat /home/wizard/.vault-token)"; export VAULT_TOKEN
   fi
-  # 8a) Shared Claude subscription OAuth token (long-lived sk-ant-oat01) -> root file the
-  # provisioner injects into non-admins' t3-serve env (only those without their own login).
-  if claude_tok="$(vault kv get -field=claude_oauth_token secret/workstation 2>/dev/null)"; then
-    install -m 0600 /dev/stdin /etc/t3-serve/claude-oauth-token <<<"$claude_tok"
-    log "staged /etc/t3-serve/claude-oauth-token (shared Claude subscription)"
-  else
-    log "WARN: secret/workstation claude_oauth_token absent -> non-admins won't share Claude auth"
-  fi
+  # 8a) Claude auth is deliberately NOT shared. Each roster user signs in with their own Enterprise
+  #     identity; claude-auth-sync backs up only their OAuth object to an isolated Vault path. The
+  #     provisioner mints each user's scoped Vault token only while this admin VAULT_TOKEN is present.
+  #     NOTE(review): a /etc/t3-serve/claude-oauth-token staged by earlier runs is not removed here — confirm cleanup.
   # 8b) Shared Codex auth -> /opt/codex-shared/auth.json (the codex wrapper symlinks each
   # user's ~/.codex/auth.json here). Previously a manual host change that did NOT survive
   # a rebuild even though the Vault key existed — now reproducible from Vault.
@@ -166,6 +162,7 @@ SCRIPTS="$HERE/.."
 install -m 0755 "$SCRIPTS/t3-autoupdate.sh" /usr/local/bin/t3-autoupdate
 install -m 0755 "$SCRIPTS/t3-backup-state.sh" /usr/local/bin/t3-backup-state
 install -m 0755 "$SCRIPTS/t3-mint" /usr/local/bin/t3-mint
+install -m 0755 "$HERE/claude-auth-sync.sh" /usr/local/bin/claude-auth-sync  # sourced from $HERE, not $SCRIPTS like the installs above
 # 9b) t3-dispatch: unprivileged system account + compiled Go binary (build-if-absent)
 id -u t3-dispatch >/dev/null 2>&1 || useradd --system --no-create-home --shell /usr/sbin/nologin t3-dispatch
 if [[ ! -x /usr/local/bin/t3-dispatch ]]; then
@@ -197,12 +194,14 @@ fi
 # 9d) unit files + enablement. Timers self-heal; t3-dispatch is long-running.
 # t3-serve@ is a TEMPLATE (enabled per-user by the provisioner, not here).
 for u in t3-serve@.service \
+    claude-auth-sync@.service claude-auth-sync@.timer \
     t3-autoupdate.service t3-autoupdate.timer \
     t3-backup-state.service t3-backup-state.timer \
     t3-provision-users.service t3-provision-users.timer \
     t3-dispatch.service; do
   install -m 0644 "$SCRIPTS/$u" "/etc/systemd/system/$u"
 done
+log "claude auth: per-user sync script + template units installed"  # NOTE(review): units are only copied here; presumably enabled per user by the provisioner — confirm
 # 9e) per-user playwright-mcp browser MCP: system-level TEMPLATE units (one
 # instance per OS user) + the snapshot-refresh script. Reproducible-from-git
 # replacement for the hand-made ~/.config/systemd/user/playwright-* units
@@ -219,4 +218,11 @@ systemctl enable --now t3-dispatch.service \
   log "WARN: some units failed to enable (check: systemctl status t3-dispatch t3-*.timer)"
 log "service units installed + enabled (t3-dispatch + 3 timers; t3-serve@ per-user)"
 
+# Run one foreground reconcile while the admin Vault token borrowed in section 8
+# is still available. This is what mints new roster users' isolated periodic
+# Vault tokens; the hourly no-admin-token reconcile only maintains existing ones.
+if [[ -n "${VAULT_TOKEN:-}" ]]; then  # skipped entirely when no admin token is set
+  /usr/local/bin/t3-provision-users || log "WARN: foreground provisioner failed; scoped Claude-auth tokens may need a retry"  # best-effort: a failure is only logged
+fi
+
 log "OK (idempotent)"
diff --git a/stacks/monitoring/modules/monitoring/loki.tf b/stacks/monitoring/modules/monitoring/loki.tf
index d9d7f7a7..cfb160bb 100644
--- a/stacks/monitoring/modules/monitoring/loki.tf
+++ b/stacks/monitoring/modules/monitoring/loki.tf
@@ -274,6 +274,20 @@ resource "kubernetes_config_map" "loki_alert_rules" {
               runbook = "docs/runbooks/t3-version-bump.md"
             }
           },
+          {
+            # Per-user Claude refresh/backup/restore exhausted its automatic
+            # recovery path. This is actionable: that user needs interactive SSO,
+            # or the scoped Vault token/bootstrap needs repair.
+            alert  = "WorkstationClaudeAuthInvalid"
+            expr   = "sum by (unit) (count_over_time({job=\"devvm-journal\", identifier=\"claude-auth-sync\"} |~ \"FAIL\" [15m])) > 0" # any claude-auth-sync journal line matching FAIL in the last 15m, grouped by unit
+            for    = "0m" # no pending period
+            labels = { severity = "warning" }
+            annotations = {
+              summary     = "Per-user Claude authentication recovery failed on {{ $labels.unit }}"
+              description = "The Workstation renewal agent could not validate Claude auth, renew its scoped Vault token, or recover from the Vault backup. Follow the per-user SSO recovery runbook."
+              runbook     = "docs/runbooks/claude-auth-renew-workstation.md"
+            }
+          },
         ]
       },
       {
diff --git a/stacks/vault/main.tf b/stacks/vault/main.tf
index 1c26af51..4ee4b0c2 100644
--- a/stacks/vault/main.tf
+++ b/stacks/vault/main.tf
@@ -1016,6 +1016,11 @@ data "vault_kv_secret_v2" "platform" {
 locals {
   k8s_users = jsondecode(data.vault_kv_secret_v2.platform.data["k8s_users"])
 
+  # Workstation roster is the source of truth for Claude credential isolation.
+  # Each user's renewal agent receives a periodic Vault token carrying exactly
+  # one of these policies; no user can read another user's OAuth state.
+  workstation_users = yamldecode(file("${path.module}/../../scripts/workstation/roster.yaml")).users # read from the repo checkout at plan time, not from Vault
+
   # Flatten user -> namespace pairs for namespace-owners
   namespace_owner_namespaces = flatten([
     for name, user in local.k8s_users : [
@@ -1034,6 +1039,26 @@ locals {
   ]))
 }
 
+resource "vault_policy" "workstation_claude" {
+  for_each = local.workstation_users # NOTE(review): assumes roster "users" decodes to a map keyed by username (each.key is used as the name) — confirm
+
+  name   = "workstation-claude-${each.key}" # one policy per roster entry
+  policy = <<-EOT
+    path "secret/data/workstation/claude-users/${each.key}" {
+      capabilities = ["create", "read", "update"]
+    }
+    path "secret/metadata/workstation/claude-users/${each.key}" {
+      capabilities = ["read"]
+    }
+    path "auth/token/lookup-self" {
+      capabilities = ["read"]
+    }
+    path "auth/token/renew-self" {
+      capabilities = ["update"]
+    }
+  EOT
+}
+
 resource "kubernetes_namespace" "user_namespace" {
   for_each = nonsensitive(local.user_namespaces)