vault-token-renew: self-heal the periodic token on admin-capable clobber
Viktor asked for 'vault login -method=oidc' to work seamlessly: the OIDC login the docs prescribe kept clobbering ~/.vault-token with a 7-day token, and detect-only DRIFT failures went unnoticed for weeks (weekly-expiry loop, twice in June).

On drift the renewer now re-mints the periodic token with the clobbering token's own authority (Vault's 403 is the judge — no policy guessing), sanity-checks it, replaces the file atomically, and revokes stale token-devvm-wizard leftovers. Weak/read-only clobbers still fail loudly on purpose.

Design: docs/plans/2026-07-03-vault-token-self-heal-design.md

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
parent
8631709ca2
commit
4a7b6db806
1 changed files with 78 additions and 9 deletions
|
|
@ -67,6 +67,72 @@ vtr_is_stale_periodic() {
|
||||||
[ "$acc" != "$2" ]
|
[ "$acc" != "$2" ]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# vtr_heal <foreign-dn> <log-file>
#
# Re-mint our periodic admin token using the foreign token's own authority and
# install it as ~/.vault-token.
#
# Arguments:
#   $1  display name of the foreign (clobbering) token; used in log lines only
#   $2  log file; exactly one outcome line (DRIFT / FAIL / HEALED) is appended
# Returns:
#   0  ~/.vault-token was replaced by a freshly minted token that passed
#      vtr_drift_ok
#   1  the mint was denied or a later step failed; ~/.vault-token was not
#      replaced (caller exits non-zero; the unit goes failed)
#
# Self-heal added 2026-07-03 (docs/plans/2026-07-03-vault-token-self-heal-design.md):
# an OIDC login — which the infra docs prescribe before applies — clobbers
# ~/.vault-token with a 7-day token, and detect-only drift left that unnoticed
# for weeks (the weekly-expiry loop). We ATTEMPT the re-mint with the
# clobbering token itself and let Vault's authz decide — a read-only clobber
# (the 2026-06-05 woodpecker incident) is denied the mint and stays a loud
# failure, because it signals a misbehaving flow that someone should look at.
#
# NOTE: the caller runs this as `vtr_heal ... || exit 1`. Bash ignores
# `set -e` inside a function invoked in a `||` list, so every step whose
# failure matters is checked explicitly here instead of relying on errexit.
vtr_heal() {
  local foreign_dn="$1" log="$2"
  local errf new_token new_info new_dn new_pols new_acc tmp
  local sweep="accessor sweep skipped (list denied)" accessors a a_info revoked=0

  if ! errf=$(mktemp); then
    printf '%s FAIL: heal could not create a temp file for vault stderr\n' \
      "$(date -Is)" >>"$log"
    return 1
  fi

  # No VAULT_TOKEN override on this call: the CLI is expected to authenticate
  # with whatever currently sits in ~/.vault-token, i.e. the foreign token.
  if ! new_token=$(vault token create -orphan -period=768h \
    -policy=vault-admin -policy=sops-admin -display-name=devvm-wizard \
    -field=token 2>"$errf") || [ -z "$new_token" ]; then
    printf '%s DRIFT: ~/.vault-token is dn=%q — heal denied, foreign token lacks create authority (%s); investigate what wrote it. Manual re-mint: vault login -method=oidc && vault token create -orphan -period=768h -policy=vault-admin -policy=sops-admin -display-name=devvm-wizard -field=token > ~/.vault-token && chmod 600 ~/.vault-token\n' \
      "$(date -Is)" "$foreign_dn" "$(tr '\n' ' ' <"$errf")" >>"$log"
    rm -f "$errf"
    return 1
  fi
  rm -f "$errf"

  # Sanity: the minted token must itself pass the drift guard before it may
  # replace ~/.vault-token.
  if ! new_info=$(VAULT_TOKEN="$new_token" vault token lookup -format=json 2>&1); then
    printf '%s FAIL: heal minted a token but its lookup failed: %s\n' \
      "$(date -Is)" "$new_info" >>"$log"
    _vtr_discard_minted "$new_token"
    return 1
  fi
  new_dn=$(vtr_display_name "$new_info")
  new_pols=$(vtr_policies_csv "$new_info")
  if ! vtr_drift_ok "$new_dn" "$new_pols"; then
    printf '%s FAIL: heal minted an unexpected token (dn=%q policies=%q) — not writing it\n' \
      "$(date -Is)" "$new_dn" "$new_pols" >>"$log"
    _vtr_discard_minted "$new_token"
    return 1
  fi

  # Atomic replace: mktemp files are 0600 from birth; same-filesystem mv.
  # Each step is checked so a failed write can never be reported as HEALED.
  if ! tmp=$(mktemp "$HOME/.vault-token.XXXXXX"); then
    printf '%s FAIL: heal minted a token but could not create a temp file next to ~/.vault-token — existing file left in place\n' \
      "$(date -Is)" >>"$log"
    _vtr_discard_minted "$new_token"
    return 1
  fi
  if ! printf '%s' "$new_token" >"$tmp" || ! mv "$tmp" "$HOME/.vault-token"; then
    rm -f "$tmp"
    printf '%s FAIL: heal minted a token but could not write ~/.vault-token — existing file left in place\n' \
      "$(date -Is)" >>"$log"
    _vtr_discard_minted "$new_token"
    return 1
  fi

  # Anti-sprawl: revoke previous token-devvm-wizard tokens — each heal would
  # otherwise strand the prior periodic ADMIN token server-side for up to 32d.
  # The clobbering foreign token is deliberately NOT revoked: it may still back
  # the user's live login session, and it ages out on its own (7d for OIDC).
  # The sweep is best-effort: a denied list or a failed per-accessor call is
  # skipped silently and never fails the heal.
  new_acc=$(vtr_accessor "$new_info")
  if [ -n "$new_acc" ] && accessors=$(VAULT_TOKEN="$new_token" vault list -format=json auth/token/accessors 2>/dev/null); then
    while IFS= read -r a; do
      [ -n "$a" ] || continue
      a_info=$(VAULT_TOKEN="$new_token" vault token lookup -format=json -accessor "$a" 2>/dev/null) || continue
      if vtr_is_stale_periodic "$a_info" "$new_acc"; then
        VAULT_TOKEN="$new_token" vault token revoke -accessor "$a" >/dev/null 2>&1 && revoked=$((revoked + 1))
      fi
    done < <(printf '%s' "$accessors" | jq -r '.[]')
    sweep="revoked $revoked stale periodic token(s)"
  fi

  printf '%s HEALED: re-minted periodic token from foreign dn=%q (%s)\n' \
    "$(date -Is)" "$foreign_dn" "$sweep" >>"$log"
  # The token file has been replaced at this point; report success explicitly
  # rather than inheriting the exit status of the log write.
  return 0
}

# _vtr_discard_minted <token> -> best-effort self-revoke of a token vtr_heal
# minted but will not install, so a failed heal does not strand a periodic
# admin token server-side. Failure to revoke is deliberately ignored: the
# caller is already on a failure path and has logged the real cause.
_vtr_discard_minted() {
  VAULT_TOKEN="$1" vault token revoke -self >/dev/null 2>&1 || true
}
|
||||||
|
|
||||||
vtr_main() {
|
vtr_main() {
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
export PATH="/usr/local/bin:/usr/bin:/bin:${PATH:-}"
|
export PATH="/usr/local/bin:/usr/bin:/bin:${PATH:-}"
|
||||||
|
|
@ -83,16 +149,19 @@ vtr_main() {
|
||||||
dn=$(vtr_display_name "$info")
|
dn=$(vtr_display_name "$info")
|
||||||
pols=$(vtr_policies_csv "$info")
|
pols=$(vtr_policies_csv "$info")
|
||||||
|
|
||||||
# Drift guard (added 2026-06-07): the renewer must NOT keep a FOREIGN token alive.
|
# Drift guard (2026-06-07) + self-heal (2026-07-03): the renewer must not
|
||||||
# On 2026-06-05 a stray `vault login -method=kubernetes` overwrote ~/.vault-token
|
# keep a FOREIGN token alive (on 2026-06-05 a stray kubernetes login was
|
||||||
# with a read-only woodpecker token, and this script then silently renewed THAT
|
# silently renewed for two days, masking lost write access). But detect-only
|
||||||
# for two days — masking the loss of write access. So before renewing, confirm
|
# drift proved worse in practice: an OIDC login — which the infra docs
|
||||||
# the token is our periodic admin token; if it has drifted, fail loudly (systemd
|
# prescribe before applies — clobbers this file too, and the resulting DRIFT
|
||||||
# marks the unit failed) instead of keeping someone else's token alive.
|
# failures went unnoticed for weeks while access degraded to a 7-day token
|
||||||
|
# (the weekly-expiry loop). On drift we now ATTEMPT to heal (see vtr_heal):
|
||||||
|
# re-mint the periodic token with the clobbering token's own authority.
|
||||||
|
# Vault's authz keeps the old guarantee — a token that couldn't legitimately
|
||||||
|
# hold vault-admin is denied the mint, and we still fail loud.
|
||||||
if ! vtr_drift_ok "$dn" "$pols"; then
|
if ! vtr_drift_ok "$dn" "$pols"; then
|
||||||
printf '%s DRIFT: ~/.vault-token is dn=%q policies=%q (expected dn=%q with %q). Refusing to renew a foreign token. Re-mint: vault login -method=oidc && vault token create -orphan -period=768h -policy=vault-admin -policy=sops-admin -display-name=devvm-wizard -field=token > ~/.vault-token && chmod 600 ~/.vault-token\n' \
|
vtr_heal "$dn" "$log" || exit 1
|
||||||
"$(date -Is)" "$dn" "$pols" "$EXPECTED_DN" "$REQUIRED_POLICY" >>"$log"
|
exit 0
|
||||||
exit 1
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# `vault token renew` with no argument renews the calling token (renew-self).
|
# `vault token renew` with no argument renews the calling token (renew-self).
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue