Add per-user Claude auth renewal

Each workstation user needs a continuously valid Claude token under their own Enterprise identity. Store only that user's OAuth state in an isolated Vault path, renew and verify it automatically, recover from Vault when possible, and alert when interactive SSO is required.
This commit is contained in:
Viktor Barzin 2026-06-20 20:10:40 +00:00
parent 834c5e6a2a
commit 5549fc3672
11 changed files with 408 additions and 28 deletions

View file

@ -0,0 +1,20 @@
# Template unit: the instance name (%i) is the OS user whose Claude OAuth
# credentials are validated and backed up by one run of claude-auth-sync.
[Unit]
Description=Validate and back up Claude OAuth credentials for %i
Documentation=https://github.com/ViktorBarzin/infra/blob/master/docs/runbooks/claude-auth-renew-workstation.md
# Order the run after the network is reported online.
Wants=network-online.target
After=network-online.target
[Service]
# One short-lived run per activation.
Type=oneshot
# Run as the instance user, not root.
# NOTE(review): Group=%i and HOME=/home/%i assume the primary group is named
# after the user and that the home directory lives under /home — confirm.
User=%i
Group=%i
Environment=HOME=/home/%i
Environment=PATH=/usr/local/bin:/usr/bin:/bin:/home/%i/.local/bin
ExecStart=/usr/local/bin/claude-auth-sync
# Credential and Vault access are required; keep the remaining host surface narrow.
NoNewPrivileges=true
PrivateTmp=true
ProtectSystem=strict
ProtectHome=read-only
# Only these paths stay writable under the otherwise read-only home. The leading
# "-" makes systemd ignore a path that does not exist instead of failing.
# NOTE(review): an ignored (missing) path is NOT made writable, so a first run
# that needs to create one of these directories would hit the read-only home —
# confirm that provisioning creates them before the timer first fires.
ReadWritePaths=-/home/%i/.claude -/home/%i/.claude.json -/home/%i/.config/claude-auth-sync -/home/%i/.local/state/claude-auth-sync

View file

@ -0,0 +1,12 @@
# Template timer: instance %i (an OS user name) periodically triggers
# claude-auth-sync@%i.service.
[Unit]
Description=Keep Claude OAuth credentials valid and recoverable for %i
[Timer]
# First run 10 minutes after boot, then 6 hours after each activation of the service.
OnBootSec=10m
OnUnitActiveSec=6h
# NOTE(review): per systemd.timer(5), Persistent= only has an effect on timers
# configured with OnCalendar=; with the monotonic triggers above it is likely a
# no-op — confirm whether a calendar trigger was intended.
Persistent=true
# Spread the runs of different instances by up to 10 minutes.
RandomizedDelaySec=10m
Unit=claude-auth-sync@%i.service
[Install]
WantedBy=timers.target

View file

@ -251,23 +251,41 @@ env_set() {
chmod 600 "$file"
}
# Share the admin's Claude subscription with a non-admin: inject CLAUDE_CODE_OAUTH_TOKEN
# (the staged long-lived token) into their t3-serve env — ONLY if they have neither their
# own ~/.claude/.credentials.json (own login) nor an existing token. Never clobbers. The
# agent picks it up when its t3-serve@ instance (re)starts.
install_user_claude_token() {
local user="$1" home envf tok
local token_file="${CLAUDE_TOKEN_FILE:-/etc/t3-serve/claude-oauth-token}"
# Remove every KEY=... line from an env file.
# Globals:   DRY_RUN (read) - when 1, only print what would be removed
# Arguments: $1 - env file path, $2 - variable name
# Returns:   0 when there is nothing to do (file missing or key absent) or in
#            dry-run mode; otherwise the status of the final log call.
env_unset() {
  local file="$1" key="$2"
  # Nothing to remove: no file, or the key is not set in it.
  if [[ ! -f "$file" ]]; then
    return 0
  fi
  if ! grep -q "^${key}=" "$file"; then
    return 0
  fi
  if [[ "$DRY_RUN" == 1 ]]; then
    echo "[dry-run] unset $key -> $file"
    return 0
  fi
  sed -i "/^${key}=.*/d" "$file"
  # Env files may hold secrets; keep them owner-only after the in-place edit.
  chmod 600 "$file"
  log "removed legacy shared $key -> $(basename "$file")"
}
# Install one user's isolated Claude credential renewal flow. The scoped periodic
# Vault token is minted only when this reconcile has admin Vault access (normal
# onboarding/deployment); routine token renewal is performed by the user service.
# Globals:   DRY_RUN (read); uses this script's log and run helpers
# Arguments: $1 - OS user name
# Returns:   0 always (a user without a passwd home is skipped; failures are
#            logged as warnings so one user cannot abort the reconcile).
install_claude_auth_sync() {
  local user="$1" home cfg state token_file token policy dir
  home="$(getent passwd "$user" | cut -d: -f6)"
  [[ -z "$home" ]] && return 0
  cfg="$home/.config/claude-auth-sync"
  state="$home/.local/state/claude-auth-sync"
  token_file="$cfg/vault-token"
  policy="workstation-claude-$user"

  # `install -d` gives missing PARENT directories default (root) ownership, so
  # create each level explicitly as the user. The state directory is created
  # here too so the sync service does not have to create it itself.
  if [[ "$DRY_RUN" != 1 ]]; then
    for dir in "$home/.config" "$home/.local" "$home/.local/state" "$state"; do
      [[ -d "$dir" ]] && continue
      install -d -o "$user" -g "$user" -m 0700 "$dir" ||
        log "WARN: could not create $dir for $user"
    done
  fi

  if [[ ! -s "$token_file" ]]; then
    if [[ "$DRY_RUN" == 1 ]]; then
      echo "[dry-run] mint scoped Claude-auth Vault token -> $user"
    elif vault token lookup >/dev/null 2>&1 &&
      token="$(vault token create -orphan -period=768h -policy="$policy" \
        -display-name="devvm-claude-auth-$user" -field=token 2>/dev/null)" &&
      [[ -n "$token" ]]; then
      # The token travels via stdin, never argv. Only report success once it
      # is actually stored with owner-only permissions.
      if install -d -o "$user" -g "$user" -m 0700 "$cfg" &&
        install -o "$user" -g "$user" -m 0600 /dev/stdin "$token_file" <<<"$token"; then
        log "minted isolated Claude-auth Vault token -> $user"
      else
        log "WARN: minted Claude-auth Vault token for $user but could not store it in $token_file"
      fi
    else
      log "WARN: scoped Claude-auth Vault token missing for $user (run provisioner with admin VAULT_TOKEN after vault stack apply)"
    fi
  fi
  run systemctl enable --now "claude-auth-sync@$user.timer" >/dev/null 2>&1 || true
}
# Re-deploy the managed per-user Claude launcher to ~/start-claude.sh. /etc/skel only
@ -421,7 +439,7 @@ while IFS=$'\t' read -r os_user tier shell groups_csv code_layout repos_csv; do
log "add $os_user -> group $g"; run gpasswd -a "$os_user" "$g" >/dev/null
done
fi
if [[ "$tier" != admin ]]; then # non-admins: locked clone(s) (kept fresh) + kubeconfig + shared Claude token
if [[ "$tier" != admin ]]; then # non-admins: locked clone(s) (kept fresh) + kubeconfig
if [[ "$code_layout" == workspace ]]; then
ensure_workspace_layout "$os_user"
install_locked_clone "$os_user" code/infra
@ -440,17 +458,20 @@ while IFS=$'\t' read -r os_user tier shell groups_csv code_layout repos_csv; do
refresh_user_clone "$os_user" code
fi
install_user_kubeconfig "$os_user"
install_user_claude_token "$os_user"
deploy_user_launcher "$os_user" # keep ~/start-claude.sh current (skel only seeds new accounts)
fi
refresh_codex_mirror "$os_user" # all tiers — mirror of the managed claudeMd
install_user_claude_native "$os_user" # all tiers — per-user native claude (terminal + t3); no npm/npx
install_claude_auth_sync "$os_user" # all tiers — own Claude identity + isolated Vault recovery
done < <(jq -r '.accounts[] | [.os_user, .tier, .shell, (if (.groups|length)==0 then "-" else (.groups|join(",")) end), .code_layout, (if (.repos|length)==0 then "-" else (.repos|join(",")) end)] | @tsv' "$desired_file")
# 5) per-user .env (sticky port) + enable t3-serve@
while IFS=$'\t' read -r os_user port; do
envf="$ENVDIR/$os_user.env"
env_set "$envf" T3_PORT "$port" # update-or-append; preserves CLAUDE_CODE_OAUTH_TOKEN
env_set "$envf" T3_PORT "$port"
# Per-user Enterprise login is authoritative. A legacy shared setup-token has
# higher credential precedence and would silently defeat user isolation.
env_unset "$envf" CLAUDE_CODE_OAUTH_TOKEN
id "$os_user" >/dev/null 2>&1 && run systemctl enable --now "t3-serve@$os_user.service" >/dev/null 2>&1 || true
done < <(jq -r '.ports | to_entries[] | [.key, .value] | @tsv' "$desired_file")

View file

@ -0,0 +1,32 @@
#!/usr/bin/env bash
# Unit tests for the pure helpers of workstation/claude-auth-sync.sh. The script
# is sourced, so only its function definitions are used; no Vault or Claude call
# is made by the functions exercised here.
set -uo pipefail
DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# NOTE: the sourced script runs its own top-level `set -euo pipefail`, so errexit
# is active in this shell from here on even though it is not requested above.
# shellcheck source=workstation/claude-auth-sync.sh
source "$DIR/workstation/claude-auth-sync.sh"
pass=0 fail=0
# Tiny assertion helpers; $1 is the test label.
# ok: the command in $2.. must succeed.
ok() { if "${@:2}"; then pass=$((pass+1)); else fail=$((fail+1)); echo "FAIL: $1"; fi; }
# no: the command in $2.. must fail.
no() { if "${@:2}"; then fail=$((fail+1)); echo "FAIL: $1"; else pass=$((pass+1)); fi; }
# eq: the strings $2 and $3 must be equal.
eq() { if [[ "$2" == "$3" ]]; then pass=$((pass+1)); else fail=$((fail+1)); echo "FAIL: $1"; fi; }
tmp="$(mktemp -d)"; trap 'rm -rf "$tmp"' EXIT
# Fixture: a credentials file with an unrelated MCP OAuth entry next to the
# Claude OAuth object.
valid='{"mcpOAuth":{"server":{"accessToken":"mcp-secret"}},"claudeAiOauth":{"accessToken":"access","refreshToken":"refresh","expiresAt":123,"scopes":["user:inference"]}}'
printf '%s\n' "$valid" > "$tmp/credentials.json"
# Extraction returns only the claudeAiOauth object.
oauth="$(cas_oauth_from_credentials "$tmp/credentials.json")"
eq "extract OAuth object" 'access' "$(jq -r .accessToken <<<"$oauth")"
# A credential without a refresh token must be rejected.
printf '{"claudeAiOauth":{"accessToken":"access","expiresAt":123}}\n' > "$tmp/bad.json"
no "reject missing refresh token" cas_oauth_from_credentials "$tmp/bad.json"
# Merging replaces claudeAiOauth and leaves the other top-level keys alone.
replacement='{"accessToken":"new-access","refreshToken":"new-refresh","expiresAt":456}'
merged="$(cas_merge_oauth "$tmp/credentials.json" "$replacement")"
eq "replace Claude access token" new-access "$(jq -r .claudeAiOauth.accessToken <<<"$merged")"
eq "preserve MCP OAuth" mcp-secret "$(jq -r '.mcpOAuth.server.accessToken' <<<"$merged")"
# Identity check: both the display name and the policy must name CAS_USER.
export CAS_USER=emo
ok "accept own scoped Vault token" cas_vault_identity_ok token-devvm-claude-auth-emo default,workstation-claude-emo
no "reject another user's token" cas_vault_identity_ok token-devvm-claude-auth-anca default,workstation-claude-anca
no "reject wrong policy" cas_vault_identity_ok token-devvm-claude-auth-emo default,workstation-claude-anca
printf '\n%d passed, %d failed\n' "$pass" "$fail"
# The exit status of the script is the test verdict.
(( fail == 0 ))

View file

@ -0,0 +1,153 @@
#!/usr/bin/env bash
# Keep one Workstation user's Claude subscription OAuth credentials recoverable.
# Claude owns access/refresh-token rotation in ~/.claude/.credentials.json. This
# helper validates auth with real inference, stores only the claudeAiOauth object
# in the user's isolated Vault path, and attempts one restore on failure.
set -euo pipefail
# Runtime configuration. Values with a CLAUDE_* default expansion can be
# overridden through that environment variable.
# User whose credentials are handled; defaults to the effective user name.
CAS_USER="${CLAUDE_AUTH_USER:-$(id -un)}"
# Home directory; the script aborts here when HOME is unset or empty.
CAS_HOME="${HOME:?HOME must be set}"
# Claude's credential file that is validated, backed up and restored.
CAS_CREDENTIALS="${CLAUDE_CREDENTIALS_FILE:-$CAS_HOME/.claude/.credentials.json}"
# Directory holding this helper's configuration, including the Vault token file.
CAS_CONFIG_DIR="${CLAUDE_AUTH_CONFIG_DIR:-$CAS_HOME/.config/claude-auth-sync}"
CAS_VAULT_TOKEN_FILE="${CLAUDE_AUTH_VAULT_TOKEN_FILE:-$CAS_CONFIG_DIR/vault-token}"
# Per-user Vault KV path used for backup and restore.
CAS_VAULT_PATH="${CLAUDE_AUTH_VAULT_PATH:-secret/workstation/claude-users/$CAS_USER}"
# Directory for the lock file and the log written by cas_log.
CAS_STATE_DIR="${CLAUDE_AUTH_STATE_DIR:-$CAS_HOME/.local/state/claude-auth-sync}"
CAS_LOG="$CAS_STATE_DIR/sync.log"
# Append a timestamped message to the sync log and mirror it to syslog.
# Globals:   CAS_STATE_DIR, CAS_LOG, CAS_USER (read)
# Arguments: message words, joined with spaces
# Returns:   0 once the log line is written (the syslog mirror is best-effort)
cas_log() {
  local message="$*" stamp
  mkdir -p "$CAS_STATE_DIR"
  stamp="$(date -Is)"
  printf '%s %s\n' "$stamp" "$message" >> "$CAS_LOG"
  # logger may be unavailable or fail; that must never fail the sync.
  logger -t claude-auth-sync -- "user=$CAS_USER $message" 2>/dev/null || true
}
# Print the compact claudeAiOauth object of a credentials file.
# Arguments: $1 - path to a Claude credentials JSON file
# Outputs:   the OAuth object on stdout (one line)
# Returns:   non-zero when the file cannot be parsed or the object lacks a
#            non-empty string accessToken/refreshToken or a numeric expiresAt.
cas_oauth_from_credentials() {
  local credentials_file="$1"
  jq -ce '
    def nonempty_string: type == "string" and length > 0;
    .claudeAiOauth
    | select(.accessToken | nonempty_string)
    | select(.refreshToken | nonempty_string)
    | select(.expiresAt | type == "number")
  ' "$credentials_file"
}
# Merge a recovered OAuth object while preserving unrelated credentials (MCP OAuth).
# Arguments: $1 - path to the base credentials JSON file
#            $2 - OAuth object as a JSON string
# Outputs:   the merged credentials document on stdout (compact JSON)
# Returns:   non-zero when either input is not valid JSON.
cas_merge_oauth() {
  local credentials="$1" oauth="$2"
  # Hand the secret to jq through its environment rather than --argjson: argv
  # is readable by every local user via /proc, the environment only by the owner.
  CAS_MERGE_OAUTH_JSON="$oauth" \
    jq -ce '.claudeAiOauth = (env.CAS_MERGE_OAUTH_JSON | fromjson)' "$credentials"
}
# Check that a Vault token belongs to CAS_USER.
# Globals:   CAS_USER (read)
# Arguments: $1 - token display name
#            $2 - comma-separated policy names attached to the token
# Returns:   0 when the display name is exactly token-devvm-claude-auth-$CAS_USER
#            and workstation-claude-$CAS_USER is one of the policies; 1 otherwise.
cas_vault_identity_ok() {
  local display_name="$1" policies_csv="$2"
  [[ "$display_name" == "token-devvm-claude-auth-$CAS_USER" ]] || return 1
  # Compare literally: the user name must not be interpreted as a pattern
  # (with a regex, user "a.b" would also accept policy "workstation-claude-axb").
  # The surrounding commas anchor the match to a whole list element.
  [[ ",$policies_csv," == *",workstation-claude-$CAS_USER,"* ]]
}
# Load the user's scoped Vault token, verify it is really this user's token and
# renew it.
# Globals:   CAS_VAULT_TOKEN_FILE (read); VAULT_ADDR, VAULT_TOKEN (exported)
# Returns:   0 when the token is present, passes the identity check and was
#            renewed; 1 (after logging the reason) otherwise.
cas_prepare_vault() {
  local lookup_json token_name token_policies
  if [[ ! -s "$CAS_VAULT_TOKEN_FILE" ]]; then
    cas_log "FAIL missing scoped Vault token; admin must run workstation provisioning"
    return 1
  fi
  export VAULT_ADDR="${VAULT_ADDR:-https://vault.viktorbarzin.me}"
  VAULT_TOKEN="$(<"$CAS_VAULT_TOKEN_FILE")"
  export VAULT_TOKEN
  if ! lookup_json="$(vault token lookup -format=json 2>/dev/null)"; then
    cas_log "FAIL scoped Vault token lookup failed"
    return 1
  fi
  token_name="$(jq -r '.data.display_name // ""' <<<"$lookup_json")"
  # Directly attached and identity-derived policies both count.
  token_policies="$(jq -r '((.data.policies // []) + (.data.identity_policies // [])) | join(",")' <<<"$lookup_json")"
  if ! cas_vault_identity_ok "$token_name" "$token_policies"; then
    cas_log "FAIL scoped Vault token drift detected; refusing foreign token"
    return 1
  fi
  if ! vault token renew -format=json >/dev/null 2>&1; then
    cas_log "FAIL scoped Vault token renewal failed"
    return 1
  fi
}
# auth status is not authoritative: it reported loggedIn=true during a real 401
# on 2026-06-20. A tiny, non-persistent inference is the feedback loop.
# Returns: 0 only when the claude CLI exits successfully within 60 seconds and
#          its stdout is exactly AUTH_OK; 1 otherwise.
cas_live_auth_ok() {
  local reply
  local -a probe=(
    claude -p 'Reply with exactly AUTH_OK and nothing else.'
    --model haiku --max-turns 1 --no-session-persistence --tools ""
    --disable-slash-commands --setting-sources ""
  )
  if ! reply="$(timeout 60 "${probe[@]}" 2>/dev/null)"; then
    return 1
  fi
  [[ "$reply" == "AUTH_OK" ]]
}
# Back up the local Claude OAuth object to the user's Vault path.
# Globals:   CAS_CREDENTIALS, CAS_VAULT_PATH (read); relies on VAULT_ADDR and
#            VAULT_TOKEN already being exported.
# Returns:   0 after a successful write; 1 (after logging) when the local
#            credential is missing/malformed or the Vault write fails.
cas_backup() {
  local oauth expires
  oauth="$(cas_oauth_from_credentials "$CAS_CREDENTIALS")" || {
    cas_log "FAIL local Claude OAuth credential is absent or malformed"
    return 1
  }
  expires="$(jq -r '.expiresAt' <<<"$oauth")"
  # Feed the secret through stdin ("key=-"): command-line arguments are visible
  # to every local user via /proc, which would defeat per-user isolation.
  # printf is a shell builtin, so the value never appears on any argv.
  printf '%s' "$oauth" | vault kv put "$CAS_VAULT_PATH" \
    claude_ai_oauth_json=- \
    credential_expires_at_ms="$expires" \
    backed_up_at="$(date -Is)" >/dev/null || {
    cas_log "FAIL Vault credential backup failed"
    return 1
  }
  cas_log "OK Claude auth valid; refreshed OAuth state backed up to Vault"
}
# Replace the local Claude OAuth object with the copy stored in Vault, keeping
# every other key of the credentials file.
# Globals:   CAS_CREDENTIALS, CAS_VAULT_PATH (read)
# Returns:   0 only after the merged file has been installed with mode 0600;
#            1 (after logging the reason) otherwise. Temporary files are removed
#            on every path.
cas_restore() {
  local oauth base tmp="" base_is_temp=0 failure=""
  oauth="$(vault kv get -field=claude_ai_oauth_json "$CAS_VAULT_PATH" 2>/dev/null)" || {
    cas_log "FAIL no recoverable Claude OAuth credential in Vault"
    return 1
  }
  jq -e 'select((.accessToken | type) == "string" and (.accessToken | length) > 0)
    | select((.refreshToken | type) == "string" and (.refreshToken | length) > 0)
    | select((.expiresAt | type) == "number")' <<<"$oauth" >/dev/null || {
    cas_log "FAIL Vault Claude OAuth credential is malformed"
    return 1
  }
  # Every step below is checked explicitly: callers run this function as
  # `cas_restore || ...`, a context in which `set -e` does not apply.
  mkdir -p "$(dirname "$CAS_CREDENTIALS")" || {
    cas_log "FAIL could not create Claude credentials directory"
    return 1
  }
  if jq -e 'type == "object"' "$CAS_CREDENTIALS" >/dev/null 2>&1; then
    base="$CAS_CREDENTIALS"
  else
    # No usable local file: merge into an empty object instead.
    base="$(mktemp)" || {
      cas_log "FAIL could not stage restored Claude OAuth credential"
      return 1
    }
    base_is_temp=1
    printf '{}\n' > "$base" || failure="could not stage restored Claude OAuth credential"
  fi
  if [[ -z "$failure" ]]; then
    # Build the result next to the target so the final mv is a same-directory rename.
    if ! tmp="$(mktemp "${CAS_CREDENTIALS}.XXXXXX")"; then
      tmp=""
      failure="could not stage restored Claude OAuth credential"
    elif ! cas_merge_oauth "$base" "$oauth" > "$tmp"; then
      failure="could not merge Vault Claude OAuth credential"
    elif ! chmod 0600 "$tmp" || ! mv -f -- "$tmp" "$CAS_CREDENTIALS"; then
      failure="could not install restored Claude OAuth credential"
    fi
  fi
  if (( base_is_temp )); then
    rm -f -- "$base"
  fi
  if [[ -n "$failure" ]]; then
    # Never leave a half-written file with token material behind.
    [[ -z "$tmp" ]] || rm -f -- "$tmp"
    cas_log "FAIL $failure"
    return 1
  fi
  cas_log "RECOVERED restored Claude OAuth state from Vault"
}
# Entry point: verify dependencies, take the per-user lock, prepare Vault, then
# validate Claude auth with a live request. A valid credential is backed up; an
# invalid one triggers a single restore from Vault followed by one re-check.
# Globals:   CAS_STATE_DIR (read); file descriptor 9 holds the lock
# Returns:   0 when auth is valid and backed up, or when another sync already
#            holds the lock; non-zero otherwise.
# NOTE(review): any probe failure (including a transient network error or a
# timeout) leads to a restore, which replaces the local OAuth object with the
# last backed-up one — confirm that is acceptable when the local refresh token
# may be newer than the backup.
cas_main() {
  local bin  # keep the loop variable out of the global scope
  umask 077
  for bin in jq vault claude timeout flock; do
    command -v "$bin" >/dev/null || { cas_log "FAIL missing dependency: $bin"; return 1; }
  done
  # cas_log itself needs this directory, so report a failure on stderr instead.
  mkdir -p "$CAS_STATE_DIR" || {
    printf 'claude-auth-sync: cannot create state directory %s\n' "$CAS_STATE_DIR" >&2
    return 1
  }
  exec 9>"$CAS_STATE_DIR/lock"
  flock -n 9 || { cas_log "SKIP another sync is already running"; return 0; }
  cas_prepare_vault || return 1
  if cas_live_auth_ok; then
    cas_backup
    return
  fi
  cas_log "WARN live Claude auth failed; attempting one Vault restore"
  cas_restore || return 1
  if cas_live_auth_ok; then
    cas_backup
    return
  fi
  cas_log "FAIL Claude auth still invalid after Vault restore; interactive SSO login required"
  return 1
}
# Run the sync only when this file is executed directly; when it is sourced,
# only the functions and configuration above are defined.
if [[ "${BASH_SOURCE[0]}" == "$0" ]]; then
cas_main "$@"
fi

View file

@ -125,14 +125,10 @@ if command -v vault >/dev/null; then
if [[ -z "${VAULT_TOKEN:-}" && -r /home/wizard/.vault-token ]]; then
VAULT_TOKEN="$(cat /home/wizard/.vault-token)"; export VAULT_TOKEN
fi
# 8a) Shared Claude subscription OAuth token (long-lived sk-ant-oat01) -> root file the
# provisioner injects into non-admins' t3-serve env (only those without their own login).
if claude_tok="$(vault kv get -field=claude_oauth_token secret/workstation 2>/dev/null)"; then
install -m 0600 /dev/stdin /etc/t3-serve/claude-oauth-token <<<"$claude_tok"
log "staged /etc/t3-serve/claude-oauth-token (shared Claude subscription)"
else
log "WARN: secret/workstation claude_oauth_token absent -> non-admins won't share Claude auth"
fi
# 8a) Claude auth is deliberately NOT shared. Each roster user signs in with their own
# Enterprise identity; claude-auth-sync backs up only their OAuth object to an
# isolated Vault path. The provisioner mints its scoped Vault token when this admin
# VAULT_TOKEN is present.
# 8b) Shared Codex auth -> /opt/codex-shared/auth.json (the codex wrapper symlinks each
# user's ~/.codex/auth.json here). Previously a manual host change that did NOT survive
# a rebuild even though the Vault key existed — now reproducible from Vault.
@ -166,6 +162,7 @@ SCRIPTS="$HERE/.."
install -m 0755 "$SCRIPTS/t3-autoupdate.sh" /usr/local/bin/t3-autoupdate
install -m 0755 "$SCRIPTS/t3-backup-state.sh" /usr/local/bin/t3-backup-state
install -m 0755 "$SCRIPTS/t3-mint" /usr/local/bin/t3-mint
install -m 0755 "$HERE/claude-auth-sync.sh" /usr/local/bin/claude-auth-sync
# 9b) t3-dispatch: unprivileged system account + compiled Go binary (build-if-absent)
id -u t3-dispatch >/dev/null 2>&1 || useradd --system --no-create-home --shell /usr/sbin/nologin t3-dispatch
if [[ ! -x /usr/local/bin/t3-dispatch ]]; then
@ -197,12 +194,14 @@ fi
# 9d) unit files + enablement. Timers self-heal; t3-dispatch is long-running.
# t3-serve@ is a TEMPLATE (enabled per-user by the provisioner, not here).
for u in t3-serve@.service \
claude-auth-sync@.service claude-auth-sync@.timer \
t3-autoupdate.service t3-autoupdate.timer \
t3-backup-state.service t3-backup-state.timer \
t3-provision-users.service t3-provision-users.timer \
t3-dispatch.service; do
install -m 0644 "$SCRIPTS/$u" "/etc/systemd/system/$u"
done
log "claude auth: per-user sync script + template units installed"
# 9e) per-user playwright-mcp browser MCP: system-level TEMPLATE units (one
# instance per OS user) + the snapshot-refresh script. Reproducible-from-git
# replacement for the hand-made ~/.config/systemd/user/playwright-* units
@ -219,4 +218,11 @@ systemctl enable --now t3-dispatch.service \
log "WARN: some units failed to enable (check: systemctl status t3-dispatch t3-*.timer)"
log "service units installed + enabled (t3-dispatch + 3 timers; t3-serve@ per-user)"
# Run one foreground reconcile while the admin Vault token borrowed in section 8
# is still available. This is what mints new roster users' isolated periodic
# Vault tokens; the hourly no-admin-token reconcile only maintains existing ones.
if [[ -n "${VAULT_TOKEN:-}" ]]; then
/usr/local/bin/t3-provision-users || log "WARN: foreground provisioner failed; scoped Claude-auth tokens may need a retry"
fi
log "OK (idempotent)"