From 2c1865eabb31f527ac78e3e4665c4fe1e5c25234 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Mon, 8 Jun 2026 14:18:12 +0000 Subject: [PATCH] workstation: roster-driven provisioner (SSoT reconcile, additive-only) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit t3-provision-users.sh now consumes roster_engine.py: derives accounts + per-tier groups + sticky ports + /etc/ttyd-user-map + dispatch.json from roster.yaml and applies them. ADDITIVE-ONLY for existing users (never strips a group, replaces a home, or re-locks an account) so the hourly timer is always safe. Best-effort tier validation vs live k8s_users: warns on a net-new absent user (emo), aborts only on a real tier conflict, skips when root has no Vault token. DRY_RUN mode for safe testing. Verified on the live host: reproduces dispatch.json content exactly, emo/anca groups + all t3-serve instances unchanged, idempotent, shellcheck-clean; deployed to /usr/local/bin (hourly timer target). Engine: validate_tiers now returns ValidationIssue(severity) — error=conflict (abort) vs warn=absent (grant pending) — + has_blocking_errors(); 28 pytest cases. setup-devvm.sh redeploys the provisioner for reproducibility. Co-Authored-By: Claude Opus 4.8 --- scripts/t3-provision-users.sh | 121 +++++++++++++++------- scripts/workstation/roster_engine.py | 55 +++++++--- scripts/workstation/setup-devvm.sh | 5 + scripts/workstation/test_roster_engine.py | 42 +++++--- 4 files changed, 159 insertions(+), 64 deletions(-) diff --git a/scripts/t3-provision-users.sh b/scripts/t3-provision-users.sh index fb51ab4e..c34d3b08 100644 --- a/scripts/t3-provision-users.sh +++ b/scripts/t3-provision-users.sh @@ -1,44 +1,93 @@ #!/usr/bin/env bash -# Reconcile per-user t3 instances from /etc/ttyd-user-map. -# Each "authentik_user=os_user" line -> an enabled t3-serve@ on a -# sticky port, plus /etc/t3-serve/dispatch.json (authentik_user -> {os_user,port}) -# consumed by t3-dispatch. +# Reconcile per-user t3 Workstation instances from roster.yaml (the single source +# of truth). roster_engine.py derives the desired state (accounts, per-tier groups, +# sticky ports, /etc/ttyd-user-map, dispatch.json); this script APPLIES it. +# +# ADDITIVE-ONLY for existing users: never removes a group, never replaces a home, +# never re-locks/re-chmods an existing account — so a routine (hourly) reconcile is +# always safe for live users. Destructive offboarding (userdel) is a SEPARATE, gated +# path, never here. Runs hourly as root via t3-provision-users.timer; root has no +# Vault token, so tier validation is best-effort (skipped when k8s_users is unreachable). +# +# DRY_RUN=1 prints actions without mutating. WORKSTATION_DIR overrides the roster/engine location. set -euo pipefail -MAP=/etc/ttyd-user-map + +WORKSTATION_DIR="${WORKSTATION_DIR:-/home/wizard/code/infra/scripts/workstation}" +ENGINE="$WORKSTATION_DIR/roster_engine.py" +ROSTER="$WORKSTATION_DIR/roster.yaml" ENVDIR=/etc/t3-serve -BASE_PORT=3773 +MAP=/etc/ttyd-user-map +DRY_RUN="${DRY_RUN:-0}" + +log() { echo "[t3-provision] $*"; } +run() { if [[ "$DRY_RUN" == 1 ]]; then echo "[dry-run] $*"; else "$@"; fi; } + +[[ $EUID -eq 0 ]] || { echo "t3-provision-users: must run as root" >&2; exit 1; } +for bin in python3 jq; do command -v "$bin" >/dev/null || { echo "missing $bin" >&2; exit 1; }; done +[[ -f "$ROSTER" && -f "$ENGINE" ]] || { echo "roster/engine not under $WORKSTATION_DIR" >&2; exit 1; } install -d -m 0755 "$ENVDIR" -port_of() { grep -oE 'T3_PORT=[0-9]+' "$1" | cut -d= -f2; } +# 1) current sticky ports from existing .env files -> {os_user: port} +ports_file="$(mktemp)"; trap 'rm -f "$ports_file" "${desired_file:-}"' EXIT +{ echo "{}"; for f in "$ENVDIR"/*.env; do + [[ -e "$f" ]] || continue + u="$(basename "$f" .env)"; p="$(grep -oE 'T3_PORT=[0-9]+' "$f" | cut -d= -f2)" + [[ -n "$p" ]] && jq -n --arg u "$u" --argjson p "$p" '{($u): $p}' + done; } | jq -s 'add' > "$ports_file" -next_port() { # lowest free port >= BASE_PORT not already assigned - local used p - used=$(grep -hoE 'T3_PORT=[0-9]+' "$ENVDIR"/*.env 2>/dev/null | cut -d= -f2 | sort -n) - p=$BASE_PORT - while echo "$used" | grep -qx "$p"; do p=$((p+1)); done - echo "$p" -} - -declare -A DISPATCH -while IFS='=' read -r ak os; do - [[ -z "${ak// }" || "$ak" =~ ^[[:space:]]*# ]] && continue - ak=$(echo "$ak" | xargs); os=$(echo "$os" | xargs) - [[ -z "$ak" || -z "$os" ]] && continue - if ! id "$os" >/dev/null 2>&1; then - logger -t t3-provision "skip $ak: no OS user $os"; continue +# 2) tier validation vs live k8s_users (best-effort; aborts only on a real conflict) +if command -v vault >/dev/null; then + export VAULT_ADDR="${VAULT_ADDR:-https://vault.viktorbarzin.me}" + if k8s_raw="$(vault kv get -field=k8s_users secret/platform 2>/dev/null)"; then + k8s_file="$(mktemp)"; echo "$k8s_raw" | jq -c 'map_values(.role)' > "$k8s_file" + if ! python3 "$ENGINE" validate --roster "$ROSTER" --k8s-users-json "$k8s_file"; then + rm -f "$k8s_file"; echo "[t3-provision] ABORT: roster tier conflicts with k8s_users" >&2; exit 1 + fi + rm -f "$k8s_file" + else + log "WARN: k8s_users unreachable (no Vault token?) -> skipping tier validation" fi - envf="$ENVDIR/$os.env" - [[ -f "$envf" ]] || echo "T3_PORT=$(next_port)" > "$envf" - port=$(port_of "$envf") - systemctl enable --now "t3-serve@$os.service" >/dev/null 2>&1 || true - DISPATCH[$ak]="{\"os_user\":\"$os\",\"port\":$port}" -done < "$MAP" +fi -tmp=$(mktemp) -{ printf '{'; first=1 - for ak in "${!DISPATCH[@]}"; do - [[ $first -eq 0 ]] && printf ','; first=0 - printf '"%s":%s' "$ak" "${DISPATCH[$ak]}" - done; printf '}\n'; } > "$tmp" -install -m 0644 "$tmp" "$ENVDIR/dispatch.json"; rm -f "$tmp" -logger -t t3-provision "reconcile complete: $(wc -c < "$ENVDIR/dispatch.json") bytes" +# 3) derive desired state +desired_file="$(mktemp)" +python3 "$ENGINE" derive --roster "$ROSTER" --ports-json "$ports_file" > "$desired_file" +jq -e . "$desired_file" >/dev/null || { echo "[t3-provision] derive produced invalid JSON" >&2; exit 1; } + +# 4) per-account: create-if-absent + ADDITIVE tier groups (never strip) +while IFS=$'\t' read -r os_user shell groups_csv; do + if ! id "$os_user" >/dev/null 2>&1; then + log "create account: $os_user (shell $shell)" + run useradd -m -s "$shell" "$os_user" + run passwd -l "$os_user" # SSO/t3 only — no local password + run chmod 700 "/home/$os_user" + fi + [[ -z "$groups_csv" ]] && continue + current="$(id -nG "$os_user" 2>/dev/null | tr ' ' '\n')" + IFS=',' read -ra want <<< "$groups_csv" + for g in "${want[@]}"; do + grep -qx "$g" <<< "$current" && continue # already a member -> skip + getent group "$g" >/dev/null 2>&1 || continue # group must exist + log "add $os_user -> group $g"; run gpasswd -a "$os_user" "$g" >/dev/null + done +done < <(jq -r '.accounts[] | [.os_user, .shell, (.groups|join(","))] | @tsv' "$desired_file") + +# 5) per-user .env (sticky port) + enable t3-serve@ +while IFS=$'\t' read -r os_user port; do + envf="$ENVDIR/$os_user.env" + if [[ ! -f "$envf" ]] || ! grep -qx "T3_PORT=$port" "$envf"; then + run bash -c "printf 'T3_PORT=%s\n' '$port' > '$envf'" + fi + id "$os_user" >/dev/null 2>&1 && run systemctl enable --now "t3-serve@$os_user.service" >/dev/null 2>&1 || true +done < <(jq -r '.ports | to_entries[] | [.key, .value] | @tsv' "$desired_file") + +# 6) regenerate /etc/ttyd-user-map + dispatch.json from the desired state (SSoT: +# a roster entry removed here DISAPPEARS, which is what the offboarding cut relies on) +if [[ "$DRY_RUN" == 1 ]]; then + log "[dry-run] would regenerate $MAP + $ENVDIR/dispatch.json" +else + jq -r '.ttyd_user_map' "$desired_file" > "$MAP.tmp" && install -m 0644 "$MAP.tmp" "$MAP" && rm -f "$MAP.tmp" + jq -c '.dispatch' "$desired_file" > "$ENVDIR/dispatch.json.tmp" && install -m 0644 "$ENVDIR/dispatch.json.tmp" "$ENVDIR/dispatch.json" && rm -f "$ENVDIR/dispatch.json.tmp" +fi + +log "reconcile complete ($([[ "$DRY_RUN" == 1 ]] && echo DRY-RUN || echo applied))" diff --git a/scripts/workstation/roster_engine.py b/scripts/workstation/roster_engine.py index ceaa5388..d9e7dd71 100644 --- a/scripts/workstation/roster_engine.py +++ b/scripts/workstation/roster_engine.py @@ -117,26 +117,49 @@ def load_roster_file(path: str) -> Roster: # -------------------------------------------------------------------------- -def validate_tiers(roster: Roster, k8s_user_tiers: dict[str, str]) -> list[str]: - """Return one error string per roster user whose tier disagrees with the - live `k8s_users` map. Admins are exempt (cluster-admin is granted out of - band). An empty list means the roster is consistent with the cluster.""" - errors = [] +@dataclass(frozen=True) +class ValidationIssue: + os_user: str + severity: str # "error" = tier conflict (abort) | "warn" = absent (grant pending) + message: str + + +def validate_tiers( + roster: Roster, k8s_user_tiers: dict[str, str] +) -> list[ValidationIssue]: + """Compare each roster user's tier against the live `k8s_users` map. A real + conflict (roster tier != cluster tier) is an "error" (abort). A net-new user + not yet in `k8s_users` is a "warn" (onboarding proceeds; the kubectl grant is + pending). Admins are exempt (cluster-admin is granted out of band). An empty + list means the roster is consistent with the cluster.""" + issues = [] for user in roster.users.values(): if user.tier == "admin": continue actual = k8s_user_tiers.get(user.k8s_user) if actual is None: - errors.append( - f"{user.os_user}: tier {user.tier} but k8s_user {user.k8s_user!r} " - f"absent from k8s_users (add the entry first)" + issues.append( + ValidationIssue( + user.os_user, + "warn", + f"{user.os_user}: tier {user.tier} but k8s_user {user.k8s_user!r} " + f"absent from k8s_users (kubectl grant pending — add the entry)", + ) ) elif actual != user.tier: - errors.append( - f"{user.os_user}: roster tier {user.tier} != k8s_users tier " - f"{actual} for {user.k8s_user!r}" + issues.append( + ValidationIssue( + user.os_user, + "error", + f"{user.os_user}: roster tier {user.tier} != k8s_users tier " + f"{actual} for {user.k8s_user!r}", + ) ) - return errors + return issues + + +def has_blocking_errors(issues: list[ValidationIssue]) -> bool: + return any(issue.severity == "error" for issue in issues) # -------------------------------------------------------------------------- @@ -261,10 +284,10 @@ def _main(argv: list[str]) -> int: roster = load_roster_file(args.roster) if args.cmd == "validate": with open(args.k8s_users_json, encoding="utf-8") as fh: - errors = validate_tiers(roster, json.load(fh)) - for err in errors: - print(err, file=sys.stderr) - return 1 if errors else 0 + issues = validate_tiers(roster, json.load(fh)) + for issue in issues: + print(f"{issue.severity.upper()}: {issue.message}", file=sys.stderr) + return 1 if has_blocking_errors(issues) else 0 with open(args.ports_json, encoding="utf-8") as fh: desired = derive_desired_state(roster, json.load(fh)) json.dump(_desired_state_to_dict(desired), sys.stdout, indent=2, sort_keys=True) diff --git a/scripts/workstation/setup-devvm.sh b/scripts/workstation/setup-devvm.sh index 2ab909c8..ae7c6096 100755 --- a/scripts/workstation/setup-devvm.sh +++ b/scripts/workstation/setup-devvm.sh @@ -63,4 +63,9 @@ for d in skills rules agents commands; do done log "skel: launcher + tmux + inheritance symlinks (base=$CONFIG_BASE)" +# 6) deploy the roster-driven provisioner to /usr/local/bin (run hourly by +# t3-provision-users.timer). Re-deployed here so its logic is reproducible. +install -m 0755 "$HERE/../t3-provision-users.sh" /usr/local/bin/t3-provision-users +log "t3-provision-users -> /usr/local/bin/ (roster-driven)" + log "OK (idempotent)" diff --git a/scripts/workstation/test_roster_engine.py b/scripts/workstation/test_roster_engine.py index 444371db..fe19c90e 100644 --- a/scripts/workstation/test_roster_engine.py +++ b/scripts/workstation/test_roster_engine.py @@ -99,24 +99,26 @@ def test_validate_ok_when_tiers_match(): assert eng.validate_tiers(r, {"anca": "namespace-owner"}) == [] -def test_validate_flags_tier_mismatch(): +def test_validate_flags_tier_mismatch_as_error(): + # roster says power-user, cluster says namespace-owner -> a real conflict -> ERROR (abort). r = _roster( "users: {ancamilea: {authentik_user: a, k8s_user: anca, tier: power-user}}" ) - errs = eng.validate_tiers(r, {"anca": "namespace-owner"}) - assert len(errs) == 1 - assert ( - "anca" in errs[0] and "power-user" in errs[0] and "namespace-owner" in errs[0] - ) + issues = eng.validate_tiers(r, {"anca": "namespace-owner"}) + assert len(issues) == 1 + assert issues[0].severity == "error" + assert issues[0].os_user == "ancamilea" + assert "power-user" in issues[0].message and "namespace-owner" in issues[0].message -def test_validate_flags_netnew_user_absent_from_k8s_users(): - # emo is power-user in the roster but has no k8s_users entry yet -> the OIDC - # RBAC binding can't exist, so this must fail loud (add the entry first). +def test_validate_flags_netnew_absent_as_warn(): + # emo is power-user in the roster but has no k8s_users entry yet. Onboarding the + # workstation should still proceed; the kubectl grant is pending -> WARN, not error. r = _roster("users: {emo: {authentik_user: e, k8s_user: emo, tier: power-user}}") - errs = eng.validate_tiers(r, {}) - assert len(errs) == 1 - assert "emo" in errs[0] and "k8s_users" in errs[0] + issues = eng.validate_tiers(r, {}) + assert len(issues) == 1 + assert issues[0].severity == "warn" + assert "emo" in issues[0].message and "k8s_users" in issues[0].message def test_validate_skips_admin_tier(): @@ -127,6 +129,22 @@ def test_validate_skips_admin_tier(): assert eng.validate_tiers(r, {}) == [] +def test_has_blocking_errors_distinguishes_mismatch_from_absent(): + mismatch = _roster( + "users: {ancamilea: {authentik_user: a, k8s_user: anca, tier: power-user}}" + ) + absent = _roster( + "users: {emo: {authentik_user: e, k8s_user: emo, tier: power-user}}" + ) + assert ( + eng.has_blocking_errors( + eng.validate_tiers(mismatch, {"anca": "namespace-owner"}) + ) + is True + ) + assert eng.has_blocking_errors(eng.validate_tiers(absent, {})) is False + + # -------------------------------------------------------------------------- # derive_desired_state: accounts, sticky ports, ttyd map, dispatch (module #1) # --------------------------------------------------------------------------