diff --git a/cli/cmd_vault.go b/cli/cmd_vault.go index 6d35ba76..b64dbd2d 100644 --- a/cli/cmd_vault.go +++ b/cli/cmd_vault.go @@ -4,6 +4,7 @@ import ( "bufio" "encoding/base64" "encoding/json" + "errors" "fmt" "os" "os/exec" @@ -79,7 +80,33 @@ func realRunner(name string, argv, envv []string) (string, error) { out, err := cmd.Output() // Trim only the trailing newline the tool appends — NOT all whitespace, so a // fetched secret with significant leading/trailing spaces is preserved. - return strings.TrimRight(string(out), "\r\n"), err + return strings.TrimRight(string(out), "\r\n"), augmentErr(err, exitStderr(err)) +} + +// exitStderr returns the stderr captured by cmd.Output() on a failed exec (it +// stows it on *exec.ExitError), or nil. The tools we shell out to (vault, bw) +// write the actionable message there — "connection refused", "permission +// denied" — which the caller would otherwise never see behind a bare +// "exit status N". +func exitStderr(err error) []byte { + var ee *exec.ExitError + if errors.As(err, &ee) { + return ee.Stderr + } + return nil +} + +// augmentErr appends captured stderr to an error so failures are diagnosable +// (not just "exit status 2"). Returns nil when err is nil, and err unchanged +// when there's no stderr; preserves the wrapped error for errors.Is/As. 
+func augmentErr(err error, stderr []byte) error { + if err == nil { + return nil + } + if s := strings.TrimSpace(string(stderr)); s != "" { + return fmt.Errorf("%w: %s", err, s) + } + return err } // realRunnerStdin runs a command feeding `stdin` to it, for secret values that @@ -92,7 +119,7 @@ func realRunnerStdin(name string, argv, envv []string, stdin string) (string, er } cmd.Stdin = strings.NewReader(stdin) out, err := cmd.Output() - return strings.TrimRight(string(out), "\r\n"), err + return strings.TrimRight(string(out), "\r\n"), augmentErr(err, exitStderr(err)) } func vwCredsPath(user string) string { return vwUserPathPrefix + user } @@ -135,26 +162,58 @@ func scopedTokenPath(home string) string { } // vaultTokenSource decides which Vault token the `vault` child processes should -// use. Precedence: an explicit $VAULT_TOKEN, then a native ~/.vault-token (what -// admins carry), then the per-user scoped token claude-auth-sync maintains at -// scopedTokenPath(HOME) (policy workstation-claude-<user>, which grants exactly -// the create/read/update this tool needs on the user's own path). Returns the -// token to export — "" when nothing must be exported because the vault CLI reads -// the ambient credential natively — plus a source tag for tests/logging. +// use. Precedence: an explicit $VAULT_TOKEN (deliberate override), then the +// per-user scoped token claude-auth-sync maintains at scopedTokenPath(HOME) +// (policy workstation-claude-<user>, which grants exactly the create/read/update +// this tool needs on the user's own path), then a native ~/.vault-token. +// +// The scoped token MUST beat ~/.vault-token: this tool only ever touches the +// caller's own secret/workstation/claude-users/<user> path, and a power-user who +// ran `vault login -method=oidc` carries a read-only ~/.vault-token whose +// capability on that path is `deny` — letting it win shadows the scoped token +// and every op fails 403/deny (emo, 2026-06-28). 
~/.vault-token is only the +// right credential when there is no scoped token (admins). Returns the token to +// export — "" when the vault CLI should read the ambient/native credential — +// plus a source tag for tests/logging. func vaultTokenSource(envToken string, haveVaultTokenFile bool, scopedToken string) (token, source string) { switch { case envToken != "": return "", "env" + case strings.TrimSpace(scopedToken) != "": + return strings.TrimSpace(scopedToken), "scoped" case haveVaultTokenFile: return "", "file" default: - if t := strings.TrimSpace(scopedToken); t != "" { - return t, "scoped" - } return "", "none" } } +// vaultAddrDefault is the cluster Vault the workstation talks to. The bw server +// is likewise hardcoded (openSession), so a sane default here is consistent. +const vaultAddrDefault = "https://vault.viktorbarzin.me" + +// vaultAddrToSet returns the VAULT_ADDR to export when the caller's environment +// doesn't already set one, else "". homelab vault is invoked by AFK agent +// sessions — frequently non-login shells (tmux panes, agent subprocesses) that +// never sourced /etc/environment — so, like claude-auth-sync, the CLI must NOT +// depend on an ambient VAULT_ADDR; otherwise every `vault` child falls back to +// the 127.0.0.1:8200 default and fails "connection refused" (exit 2). +func vaultAddrToSet(envAddr string) string { + if strings.TrimSpace(envAddr) == "" { + return vaultAddrDefault + } + return "" +} + +// ensureVaultAddr exports the default VAULT_ADDR when none is set, so the vault +// child processes reach the cluster Vault regardless of the caller's shell. An +// explicit VAULT_ADDR (admins, CI) is left untouched. +func ensureVaultAddr() { + if a := vaultAddrToSet(os.Getenv("VAULT_ADDR")); a != "" { + os.Setenv("VAULT_ADDR", a) + } +} + // fileNonEmpty reports whether path exists and has content. 
func fileNonEmpty(path string) bool { fi, err := os.Stat(path) @@ -167,6 +226,10 @@ func fileNonEmpty(path string) bool { -// is idempotent and safe for admins, whose explicit $VAULT_TOKEN / ~/.vault-token -// take precedence and are left untouched. +// is idempotent and safe for admins: an explicit $VAULT_TOKEN always wins, and +// ~/.vault-token is still used whenever no scoped token exists. func ensureVaultToken() { + // Every vault verb funnels through here, so this is the one place that also + // guarantees VAULT_ADDR is set (see vaultAddrToSet for why it can't be + // assumed from the caller's shell). + ensureVaultAddr() home := os.Getenv("HOME") scoped, _ := os.ReadFile(scopedTokenPath(home)) tok, src := vaultTokenSource(os.Getenv("VAULT_TOKEN"), home != "" && fileNonEmpty(home+"/.vault-token"), string(scoped)) diff --git a/cli/cmd_vault_test.go b/cli/cmd_vault_test.go index 4f583b95..602cfbbc 100644 --- a/cli/cmd_vault_test.go +++ b/cli/cmd_vault_test.go @@ -2,6 +2,7 @@ package main import ( "encoding/base64" + "errors" "fmt" "os" "reflect" @@ -269,6 +270,29 @@ func TestEnsureVaultTokenKeepsExplicitEnv(t *testing.T) { } } +func TestEnsureVaultTokenPrefersScopedOverFile(t *testing.T) { + // Regression: a power-user's read-only OIDC ~/.vault-token must NOT shadow the + // purpose-built scoped token (emo's setup hit 403 because it did, 2026-06-28). 
+ dir := t.TempDir() + cfg := dir + "/.config/claude-auth-sync" + if err := os.MkdirAll(cfg, 0o700); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(cfg+"/vault-token", []byte("SCOPED-TOK"), 0o600); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(dir+"/.vault-token", []byte("STALE-OIDC-TOK"), 0o600); err != nil { + t.Fatal(err) + } + t.Setenv("HOME", dir) + t.Setenv("VAULT_TOKEN", "") + + ensureVaultToken() + if got := os.Getenv("VAULT_TOKEN"); got != "SCOPED-TOK" { + t.Fatalf("VAULT_TOKEN = %q, want the scoped token to win over a stale ~/.vault-token", got) + } +} + func TestScopedTokenPath(t *testing.T) { if got := scopedTokenPath("/home/emo"); got != "/home/emo/.config/claude-auth-sync/vault-token" { t.Fatalf("scopedTokenPath = %q", got) @@ -276,9 +300,10 @@ func TestScopedTokenPath(t *testing.T) { } func TestVaultTokenSource(t *testing.T) { - // Precedence: explicit $VAULT_TOKEN > ~/.vault-token (vault CLI native) > - // the claude-auth-sync per-user scoped token. This is what lets a non-admin - // workstation user (no ambient token) reach their own Vault path. + // Precedence: explicit $VAULT_TOKEN > the claude-auth-sync per-user scoped + // token > a native ~/.vault-token. Scoped beats the file so a power-user's + // read-only OIDC ~/.vault-token can't shadow the scoped token on the user's + // own path (emo, 2026-06-28). 
cases := []struct { name string env string @@ -287,10 +312,11 @@ func TestVaultTokenSource(t *testing.T) { wantTok, wantSrc string }{ {"explicit env wins", "abc", true, "S", "", "env"}, - {"vault-token file used natively", "", true, "S", "", "file"}, - {"scoped fallback for non-admin", "", false, "S-TOK", "S-TOK", "scoped"}, + {"scoped beats a stale ~/.vault-token", "", true, "S-TOK", "S-TOK", "scoped"}, + {"scoped used when no file", "", false, "S-TOK", "S-TOK", "scoped"}, + {"native ~/.vault-token only when no scoped", "", true, "", "", "file"}, {"scoped value is trimmed", "", false, " S-TOK\n", "S-TOK", "scoped"}, - {"whitespace-only scoped is no token", "", false, " \n", "", "none"}, + {"whitespace-only scoped falls back to file", "", true, " \n", "", "file"}, {"nothing configured", "", false, "", "", "none"}, } for _, c := range cases { @@ -302,6 +328,66 @@ func TestVaultTokenSource(t *testing.T) { } } +func TestVaultAddrToSet(t *testing.T) { + // homelab vault is invoked by AFK agent sessions (non-login shells that + // never sourced /etc/environment), so the CLI must self-default VAULT_ADDR + // rather than rely on the ambient env — else every `vault` child hits the + // 127.0.0.1:8200 default and fails "connection refused" (exit 2). 
+ cases := []struct { + name, env, want string + }{ + {"unset -> default", "", vaultAddrDefault}, + {"whitespace-only -> default", " \n", vaultAddrDefault}, + {"explicit kept (empty = leave alone)", "https://vault.example.com", ""}, + } + for _, c := range cases { + if got := vaultAddrToSet(c.env); got != c.want { + t.Errorf("%s: vaultAddrToSet(%q) = %q, want %q", c.name, c.env, got, c.want) + } + } +} + +func TestEnsureVaultTokenSetsDefaultAddr(t *testing.T) { + dir := t.TempDir() // no scoped token, no ~/.vault-token + t.Setenv("HOME", dir) + t.Setenv("VAULT_TOKEN", "") + t.Setenv("VAULT_ADDR", "") // emo's non-login-shell situation + + ensureVaultToken() + if got := os.Getenv("VAULT_ADDR"); got != vaultAddrDefault { + t.Fatalf("VAULT_ADDR = %q, want default %q to be exported", got, vaultAddrDefault) + } +} + +func TestEnsureVaultTokenKeepsExplicitAddr(t *testing.T) { + dir := t.TempDir() + t.Setenv("HOME", dir) + t.Setenv("VAULT_TOKEN", "") + t.Setenv("VAULT_ADDR", "https://vault.example.com") + + ensureVaultToken() + if got := os.Getenv("VAULT_ADDR"); got != "https://vault.example.com" { + t.Fatalf("VAULT_ADDR = %q, must not override an explicit addr", got) + } +} + +func TestAugmentErrSurfacesStderr(t *testing.T) { + if got := augmentErr(nil, []byte("ignored")); got != nil { + t.Fatalf("augmentErr(nil, …) = %v, want nil", got) + } + base := errors.New("exit status 2") + got := augmentErr(base, []byte(" dial tcp 127.0.0.1:8200: connect: connection refused\n")) + if got == nil || !strings.Contains(got.Error(), "connection refused") || !strings.Contains(got.Error(), "exit status 2") { + t.Fatalf("augmentErr did not surface stderr: %v", got) + } + if !errors.Is(got, base) { + t.Fatal("augmentErr lost the wrapped error (errors.Is failed)") + } + if got := augmentErr(base, []byte(" ")); got != base { + t.Fatalf("augmentErr with blank stderr = %v, want the original error unchanged", got) + } +} + func TestKvWriteVerb(t *testing.T) { // merge=true → read-modify-write 
patch (needs only read+update, NOT the // `patch` capability the scoped workstation policy lacks). diff --git a/docs/runbooks/homelab-vault-onboarding.md b/docs/runbooks/homelab-vault-onboarding.md index 61d323ab..29499874 100644 --- a/docs/runbooks/homelab-vault-onboarding.md +++ b/docs/runbooks/homelab-vault-onboarding.md @@ -23,18 +23,28 @@ homelab vault lock lock / log out the local bw session `homelab vault` runs `vault` as the calling user. It resolves a Vault token in this order (`ensureVaultToken`, `cli/cmd_vault.go`): -1. an explicit `$VAULT_TOKEN`, then -2. a native `~/.vault-token` (what admins carry), then -3. the per-user **scoped token** that `claude-auth-sync` maintains at - `~/.config/claude-auth-sync/vault-token` (policy `workstation-claude-<user>`). +1. an explicit `$VAULT_TOKEN` (a deliberate override), then +2. the per-user **scoped token** that `claude-auth-sync` maintains at + `~/.config/claude-auth-sync/vault-token` (policy `workstation-claude-<user>`), then +3. a native `~/.vault-token` (admins who carry one; non-admins usually don't). + +**The scoped token deliberately beats `~/.vault-token`.** This tool only touches +your own `secret/workstation/claude-users/<user>` path, and a power-user who ran +`vault login -method=oidc` carries a read-only `~/.vault-token` (capability +`deny` on that path); letting it win would shadow the scoped token and fail every +op with `403 permission denied` (this is exactly what bit emo, 2026-06-28). The +CLI also **self-defaults `VAULT_ADDR`** to `https://vault.viktorbarzin.me` when +unset, so it works from non-login shells (tmux panes, AFK agent subprocesses) +that never sourced `/etc/environment` — otherwise every `vault` child hits the +`127.0.0.1:8200` default and fails `connection refused` (exit 2). 
That scoped policy grants exactly `create`/`read`/`update` on the user's own `secret/workstation/claude-users/<user>` path — no `patch` capability — so the tool writes with `vault kv patch -method=rw` (read-modify-write), falling back to `kv put` only when the path does not exist yet. This preserves the `claude_ai_oauth_json` key that [claude-auth-sync](claude-auth-renew-workstation.md) -co-locates there. (Both bugs that previously made this admin-only were fixed -2026-06-27.) +co-locates there. (The admin-only bugs were fixed 2026-06-27; the +`VAULT_ADDR`/token-precedence bugs above were fixed 2026-06-28.) ## Prerequisites (per user) @@ -119,3 +129,20 @@ VAULT_TOKEN="$(sudo cat /home/<user>/.config/claude-auth-sync/vault-token)" \ sudo -u <user> -i bw --version # /usr/bin/bw resolves for the user sudo -u <user> -i homelab vault status ``` + +## Troubleshooting + +**`homelab vault setup` (or any verb) fails with `exit status 2`** — older +binaries swallowed the underlying `vault` error; the message now includes it. +Two historical causes (both fixed in-CLI 2026-06-28, kept here for diagnosis): + +- `... connection refused` to `127.0.0.1:8200` → `VAULT_ADDR` wasn't set in the + caller's shell. The CLI now self-defaults it, but if you see this on an old + binary: `export VAULT_ADDR=https://vault.viktorbarzin.me`. +- `403 permission denied` on `PUT .../secret/data/workstation/claude-users/<user>` + → a stale read-only `~/.vault-token` (e.g. from `vault login -method=oidc`, + policy `default`, capability `deny` on that path) was shadowing the scoped + token. The CLI now prefers the scoped token; on an old binary, `rm + ~/.vault-token` (and `unset VAULT_TOKEN` if a stale one is exported) and retry. Confirm with + `VAULT_TOKEN="$(sudo cat /home/<user>/.config/claude-auth-sync/vault-token)" vault token capabilities secret/data/workstation/claude-users/<user>` + → must be `create, read, update`.