diff --git a/docs/architecture/multi-tenancy.md b/docs/architecture/multi-tenancy.md index e95fcb8f..c6e9fb58 100644 --- a/docs/architecture/multi-tenancy.md +++ b/docs/architecture/multi-tenancy.md @@ -553,6 +553,8 @@ Separate from the in-cluster namespace-owner model above, the **devvm** (`10.0.1 5. (Optional — Viktor's call per user) Grant direct master push: add their login to the `master` branch-protection push + merge whitelists (`PATCH /api/v1/repos/viktor/infra/branch_protections/master`). Done for `ebarzin` 2026-06-10. 6. Verify: branch push succeeds; a `master` push succeeds for whitelisted users and is rejected with `Not allowed to push to protected branch` otherwise. +**Session persistence (2026-06-10):** named tmux sessions (each running one Claude conversation) survive devvm reboots — `t3-tmux-save.timer` (5-min) snapshots every roster user's sessions (name, cwd, conversation uuid from argv or the cwd-slug transcript dir) to `/var/lib/t3-tmux-state/<user>.tsv`, and `t3-tmux-restore.service` recreates missing sessions at boot with `claude --resume <uuid>` (per-session idempotent; also handles partial loss). Conversations themselves were always durable (`~/.claude/projects/`); this persists the session wiring. + **Status (2026-06-10):** built + verified on the live host — capacity (8 GiB swap), config inheritance, roster-driven provisioner, per-user locked clone, per-user OIDC kubeconfig + the `oidc-power-user-readonly` ClusterRole + emo's `k8s_users` entry (applied + impersonation-verified), the Authentik `T3 Users` edge gate, **the emo Phase-5 cutover (own clone + launcher repoint + `code-shared` removal, completed 2026-06-10) and emo's contribute access (`ebarzin` write collaborator + PAT + protected `master`)**. Per the live `/etc/skel` design, non-admin `~/.claude/{rules,skills}` symlinks into the admin base are **kept** (they ARE the shared-base delivery mechanism — the plan's step to remove them is obsolete). 
**Remaining (held / future):** the offboarding apply-side (Phase 7), per-user MCP/auth injection, and roster-reconciled `T3 Users` membership. See `../runbooks/offboard-user.md` for deprovisioning. ## Related diff --git a/scripts/t3-provision-users.sh b/scripts/t3-provision-users.sh index b252cc90..83c5d679 100644 --- a/scripts/t3-provision-users.sh +++ b/scripts/t3-provision-users.sh @@ -256,6 +256,11 @@ done < <(jq -r '.ports | to_entries[] | [.key, .value] | @tsv' "$desired_file") # breaking nightly mid-day and took out auth for everyone. `enable` (no --now) just arms # the 04:00 schedule; fresh boxes get t3 from setup-devvm.sh's pinned install, not here. run systemctl enable t3-autoupdate.timer >/dev/null 2>&1 || true +# tmux session persistence: periodic snapshot + boot-time restore (reboot +# survival for users' named claude sessions). Safe to --now: save is a +# read-only snapshot; restore is per-session idempotent. +run systemctl enable --now t3-tmux-save.timer >/dev/null 2>&1 || true +run systemctl enable t3-tmux-restore.service >/dev/null 2>&1 || true # 6) regenerate /etc/ttyd-user-map + dispatch.json from the desired state (SSoT: # a roster entry removed here DISAPPEARS, which is what the offboarding cut relies on) diff --git a/scripts/t3-tmux-restore.service b/scripts/t3-tmux-restore.service new file mode 100644 index 00000000..5d5c925b --- /dev/null +++ b/scripts/t3-tmux-restore.service @@ -0,0 +1,12 @@ +[Unit] +Description=Restore workstation tmux sessions (resume saved claude conversations) after boot +After=network.target local-fs.target +# Must run before the save timer's first post-boot run so an empty post-boot +# state can never be snapshotted over the manifest being restored from. NOTE: OnCalendar=*:0/5 in t3-tmux-save.timer can elapse sooner than its OnBootSec=10min. 
+ +[Service] +Type=oneshot +ExecStart=/usr/local/bin/t3-tmux-sessions restore + +[Install] +WantedBy=multi-user.target diff --git a/scripts/t3-tmux-save.service b/scripts/t3-tmux-save.service new file mode 100644 index 00000000..df4adbfb --- /dev/null +++ b/scripts/t3-tmux-save.service @@ -0,0 +1,6 @@ +[Unit] +Description=Snapshot workstation tmux sessions (name -> claude conversation) for reboot survival + +[Service] +Type=oneshot +ExecStart=/usr/local/bin/t3-tmux-sessions save diff --git a/scripts/t3-tmux-save.timer b/scripts/t3-tmux-save.timer new file mode 100644 index 00000000..b230aee2 --- /dev/null +++ b/scripts/t3-tmux-save.timer @@ -0,0 +1,10 @@ +[Unit] +Description=Periodic workstation tmux session snapshot + +[Timer] +OnBootSec=10min +OnCalendar=*:0/5 +Persistent=false + +[Install] +WantedBy=timers.target diff --git a/scripts/t3-tmux-sessions.sh b/scripts/t3-tmux-sessions.sh new file mode 100644 index 00000000..bc223745 --- /dev/null +++ b/scripts/t3-tmux-sessions.sh @@ -0,0 +1,110 @@ +#!/usr/bin/env bash +# Persist workstation tmux sessions across devvm reboots. +# +# save — snapshot every roster user's live tmux sessions to +# /var/lib/t3-tmux-state/<user>.tsv (name, cwd, claude session +# uuid). The uuid comes from the claude process's argv (--resume <uuid>) or, +# failing that, the newest ~/.claude/projects/<cwd-slug>/<uuid>.jsonl, so it is +# correct regardless of how the session was launched (fresh via +# start-claude.sh or an explicit --resume). Runs every 5 min via +# t3-tmux-save.timer. A user with no tmux server keeps their last +# manifest (so a post-reboot save can't wipe it before restore). +# restore — recreate manifest sessions that don't currently exist, resuming +# each saved conversation (claude --resume <uuid>). Per-session +# idempotent: existing names are left alone, so it is safe both at +# boot (t3-tmux-restore.service) and after a partial loss. 
#
# v1 limitation: only one window/pane per session is captured (this matches the
# workstation usage pattern — one named claude conversation per tmux session).
set -euo pipefail

STATE_DIR=/var/lib/t3-tmux-state
MAP=/etc/ttyd-user-map
MODE="${1:-}"

# Print a tagged log line on stdout.
log() {
  printf '[t3-tmux-sessions] %s\n' "$*"
}

# List the distinct users named on the right-hand side of the `key=user` lines
# in $MAP, sorted. Returns non-zero when the map cannot be read.
users() {
  [[ -r "$MAP" ]] || return 1
  cut -d= -f2 "$MAP" | sort -u
}

# Run tmux as user $1; every remaining argument is handed to tmux unchanged.
tmux_as() {
  local account="$1"
  shift
  runuser -u "$account" -- tmux "$@"
}

# Breadth-first walk of the process tree rooted at pid $1. Prints the first pid
# whose comm is exactly `claude` and returns 0; returns 1 when the tree holds
# no such process.
claude_pid_under() {
  local -a queue=("$1") children
  local cursor=0 current
  while ((cursor < ${#queue[@]})); do
    current="${queue[cursor]}"
    cursor=$((cursor + 1))
    if [[ "$(ps -o comm= -p "$current" 2>/dev/null)" == claude ]]; then
      echo "$current"
      return 0
    fi
    # pgrep exits non-zero for a childless pid; that simply enqueues nothing.
    read -ra children <<<"$(pgrep -P "$current" 2>/dev/null | tr '\n' ' ')" || true
    if ((${#children[@]})); then
      queue+=("${children[@]}")
    fi
  done
  return 1
}

# Conversation uuid of a claude process ($1 pid, $2 user, $3 cwd). Two sources
# (claude does NOT hold its transcript fd open, so fd-sniffing doesn't work):
#   1. argv `--resume <uuid>` — covers every session this script's restore (or
#      a manual recovery) created, making the save/restore loop self-sustaining;
#   2. newest <uuid>.jsonl in the user's cwd-slug project dir created at/after
#      the process start — covers fresh launcher-started sessions.
# Always returns 0; empty output means "no conversation" (restored as a shell).
uuid_of_claude() {
  # $1 = claude pid, $2 = owning user, $3 = pane cwd. Prints the conversation
  # uuid (or nothing) and always returns 0, so callers need no error handling.
  local pid="$1" user="$2" cwd="$3"
  local uuid slug home dir age start newest
  local -r uuid_re='^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$'

  # Source 1: an explicit `--resume <uuid>` in argv. The read is grouped so the
  # shell's own "No such file" message is silenced too when the process has
  # already exited (a trailing 2>/dev/null is applied too late for that).
  uuid="$({ tr '\0' '\n' < "/proc/$pid/cmdline"; } 2>/dev/null \
    | grep -A1 -x -- '--resume' | tail -1 \
    | grep -oE "$uuid_re" || true)"
  if [[ -n "$uuid" ]]; then
    echo "$uuid"
    return 0
  fi

  # Source 2: newest transcript in the cwd-slug project dir modified since the
  # process started (5 s of slack). Two claude processes sharing one cwd
  # resolve to the same transcript — accepted limitation.
  slug="${cwd//\//-}"
  slug="${slug//./-}"
  home="$(getent passwd "$user" | cut -d: -f6)" || return 0
  dir="$home/.claude/projects/$slug"
  [[ -d "$dir" ]] || return 0
  age="$(ps -o etimes= -p "$pid" 2>/dev/null | tr -d ' ')" || age=""
  [[ "$age" =~ ^[0-9]+$ ]] || age=0
  start=$(( $(date +%s) - age - 5 ))
  # `|| true`: head may close the pipe early, which pipefail would report.
  newest="$(find "$dir" -maxdepth 1 -name '*.jsonl' -newermt "@$start" -printf '%T@ %f\n' 2>/dev/null \
    | sort -rn | head -1 | awk '{print $2}' || true)"
  if [[ -n "$newest" ]]; then
    echo "${newest%.jsonl}"
  fi
  return 0
}

# Print $1 wrapped in single quotes, safe to embed in an sh(1) command string.
sh_quote() {
  local rest="$1" out="" sq="'"
  while [[ "$rest" == *"$sq"* ]]; do
    out+="${rest%%"$sq"*}'\\''"
    rest="${rest#*"$sq"}"
  done
  printf "'%s'" "$out$rest"
}

# Print one `name<TAB>cwd<TAB>uuid` row per live tmux session of user $1.
snapshot_rows() {
  local u="$1" sess pane_pid pane_cwd cpid uuid
  while IFS=$'\t' read -r sess pane_pid pane_cwd; do
    [[ -n "$sess" ]] || continue
    uuid=""
    if cpid="$(claude_pid_under "$pane_pid")"; then
      uuid="$(uuid_of_claude "$cpid" "$u" "$pane_cwd")"
    fi
    # Tab is IFS whitespace, so an empty middle field would collapse on read
    # and shift the uuid into the cwd column; "-" keeps the columns aligned.
    printf '%s\t%s\t%s\n' "$sess" "${pane_cwd:--}" "$uuid"
  done < <(tmux_as "$u" list-panes -a -F $'#{session_name}\t#{pane_pid}\t#{pane_current_path}' 2>/dev/null \
    | sort -u -t$'\t' -k1,1)
}

# Snapshot every roster user's sessions to $STATE_DIR/<user>.tsv (mode 0600).
# A user with no tmux socket, or whose snapshot comes back empty (stale socket,
# tmux failure), keeps the previous manifest so restore still has data.
save() {
  install -d -m 0755 "$STATE_DIR"
  local u uid tmp
  # shellcheck disable=SC2046 — roster user names contain no whitespace
  for u in $(users); do
    uid="$(id -u "$u" 2>/dev/null)" || continue
    [[ -S "/tmp/tmux-$uid/default" ]] || continue # no server -> keep last manifest
    # Temp file lives in STATE_DIR so the final rename is atomic.
    tmp="$(mktemp "$STATE_DIR/.$u.XXXXXX")" || {
      log "WARN: cannot create temp manifest for $u"
      continue
    }
    if ! snapshot_rows "$u" > "$tmp" || [[ ! -s "$tmp" ]]; then
      rm -f -- "$tmp"
      log "no live sessions for $u; keeping last manifest"
      continue
    fi
    chmod 0600 "$tmp"
    if ! mv -f -- "$tmp" "$STATE_DIR/$u.tsv"; then
      rm -f -- "$tmp"
      log "WARN: failed to write manifest for $u"
      continue
    fi
    log "saved $(wc -l < "$STATE_DIR/$u.tsv") session(s) for $u"
  done
}

# Recreate every manifest session that is not currently live. Rows with a uuid
# resume that claude conversation and fall back to a login shell when claude
# exits; rows without one get a plain login shell. Existing sessions are left
# untouched, so repeated runs are safe.
restore() {
  local u f home sess cwd uuid cmd
  # shellcheck disable=SC2046 — roster user names contain no whitespace
  for u in $(users); do
    f="$STATE_DIR/$u.tsv"
    [[ -s "$f" ]] || continue
    home="$(getent passwd "$u" | cut -d: -f6)" || home=""
    while IFS=$'\t' read -r sess cwd uuid; do
      [[ -n "$sess" ]] || continue
      if tmux_as "$u" has-session -t "=$sess" 2>/dev/null; then
        continue # already live
      fi
      # Missing, placeholder ("-") or vanished directories fall back to $HOME.
      [[ "$cwd" == /* && -d "$cwd" ]] || cwd="$home"
      if [[ -n "$uuid" ]]; then
        # tmux hands this string to a shell, so both values are single-quoted.
        cmd="claude --dangerously-skip-permissions --resume $(sh_quote "$uuid") --name $(sh_quote "$sess"); echo; echo ' claude exited — shell preserved'; exec bash -l"
      else
        cmd="exec bash -l"
      fi
      if tmux_as "$u" new-session -d -s "$sess" -c "$cwd" "$cmd"; then
        log "restored $u:$sess${uuid:+ (resume ${uuid:0:8})}"
      else
        log "WARN: failed to restore $u:$sess"
      fi
    done < "$f"
  done
}

case "$MODE" in
  save) save ;;
  restore) restore ;;
  *)
    echo "usage: t3-tmux-sessions save|restore" >&2
    exit 1
    ;;
esac