diff --git a/scripts/vault-token-renew.sh b/scripts/vault-token-renew.sh
index 60502ac5..42e78603 100644
--- a/scripts/vault-token-renew.sh
+++ b/scripts/vault-token-renew.sh
@@ -67,6 +67,95 @@ vtr_is_stale_periodic() {
   [ "$acc" != "$2" ]
 }
 
+# vtr_heal FOREIGN_DN LOG -> 0 if ~/.vault-token was re-minted back to
+# our periodic admin token using the foreign token's own authority, 1 if the
+# heal was denied or failed (caller exits non-zero; the unit goes failed).
+#   FOREIGN_DN  display name of the token currently in ~/.vault-token (logged)
+#   LOG         path of the log file that status lines are appended to
+#
+# Self-heal added 2026-07-03 (docs/plans/2026-07-03-vault-token-self-heal-design.md):
+# an OIDC login — which the infra docs prescribe before applies — clobbers
+# ~/.vault-token with a 7-day token, and detect-only drift left that unnoticed
+# for weeks (the weekly-expiry loop). We ATTEMPT the re-mint with the
+# clobbering token itself and let Vault's authz decide — a read-only clobber
+# (the 2026-06-05 woodpecker incident) is denied the mint and stays a loud
+# failure, because it signals a misbehaving flow that someone should look at.
+#
+# The caller runs this as `vtr_heal ... || exit 1`, which switches errexit off
+# for the whole function body, so every step that matters is checked by hand.
+# A token that was minted but cannot be installed is revoked (best effort) so
+# repeated failed heals do not pile up live admin tokens server-side.
+vtr_heal() {
+  local foreign_dn="$1" log="$2"
+  local errf new_token new_info new_dn new_pols new_acc tmp=""
+  if ! errf=$(mktemp); then
+    printf '%s FAIL: heal could not create a temp file for vault stderr\n' \
+      "$(date -Is)" >>"$log"
+    return 1
+  fi
+  if ! new_token=$(vault token create -orphan -period=768h \
+    -policy=vault-admin -policy=sops-admin -display-name=devvm-wizard \
+    -field=token 2>"$errf") || [ -z "$new_token" ]; then
+    printf '%s DRIFT: ~/.vault-token is dn=%q — heal denied, foreign token lacks create authority (%s); investigate what wrote it. Manual re-mint: vault login -method=oidc && vault token create -orphan -period=768h -policy=vault-admin -policy=sops-admin -display-name=devvm-wizard -field=token > ~/.vault-token && chmod 600 ~/.vault-token\n' \
+      "$(date -Is)" "$foreign_dn" "$(tr '\n' ' ' <"$errf")" >>"$log"
+    rm -f "$errf"
+    return 1
+  fi
+
+  # Sanity: the minted token must itself pass the drift guard before it may
+  # replace ~/.vault-token. stderr goes to $errf, not 2>&1: a CLI warning mixed
+  # into the JSON would otherwise break the parsing helpers below.
+  if ! new_info=$(VAULT_TOKEN="$new_token" vault token lookup -format=json 2>"$errf"); then
+    printf '%s FAIL: heal minted a token but its lookup failed: %s\n' \
+      "$(date -Is)" "$(tr '\n' ' ' <"$errf")" >>"$log"
+    rm -f "$errf"
+    VAULT_TOKEN="$new_token" vault token revoke -self >/dev/null 2>&1 || true
+    return 1
+  fi
+  rm -f "$errf"
+  new_dn=$(vtr_display_name "$new_info")
+  new_pols=$(vtr_policies_csv "$new_info")
+  if ! vtr_drift_ok "$new_dn" "$new_pols"; then
+    printf '%s FAIL: heal minted an unexpected token (dn=%q policies=%q) — not writing it\n' \
+      "$(date -Is)" "$new_dn" "$new_pols" >>"$log"
+    VAULT_TOKEN="$new_token" vault token revoke -self >/dev/null 2>&1 || true
+    return 1
+  fi
+
+  # Atomic replace: mktemp files are 0600 from birth; same-filesystem mv. Any
+  # failed step aborts before ~/.vault-token is touched, so a full disk cannot
+  # swap the existing token for an empty or truncated file.
+  if ! tmp=$(mktemp "$HOME/.vault-token.XXXXXX") ||
+    ! printf '%s' "$new_token" >"$tmp" ||
+    ! mv "$tmp" "$HOME/.vault-token"; then
+    [ -z "$tmp" ] || rm -f "$tmp"
+    printf '%s FAIL: heal minted a valid token but could not write ~/.vault-token — existing file left in place\n' \
+      "$(date -Is)" >>"$log"
+    VAULT_TOKEN="$new_token" vault token revoke -self >/dev/null 2>&1 || true
+    return 1
+  fi
+
+  # Anti-sprawl: revoke previous token-devvm-wizard tokens — each heal would
+  # otherwise strand the prior periodic ADMIN token server-side for up to 32d.
+  # The clobbering foreign token is deliberately NOT revoked: it may still back
+  # the user's live login session, and it ages out on its own (7d for OIDC).
+  local sweep="accessor sweep skipped (list denied)" accessors a a_info revoked=0
+  new_acc=$(vtr_accessor "$new_info")
+  if [ -n "$new_acc" ] && accessors=$(VAULT_TOKEN="$new_token" vault list -format=json auth/token/accessors 2>/dev/null); then
+    while IFS= read -r a; do
+      [ -n "$a" ] || continue
+      a_info=$(VAULT_TOKEN="$new_token" vault token lookup -format=json -accessor "$a" 2>/dev/null) || continue
+      if vtr_is_stale_periodic "$a_info" "$new_acc"; then
+        VAULT_TOKEN="$new_token" vault token revoke -accessor "$a" >/dev/null 2>&1 && revoked=$((revoked + 1))
+      fi
+    done < <(printf '%s' "$accessors" | jq -r '.[]')
+    sweep="revoked $revoked stale periodic token(s)"
+  fi
+
+  printf '%s HEALED: re-minted periodic token from foreign dn=%q (%s)\n' \
+    "$(date -Is)" "$foreign_dn" "$sweep" >>"$log"
+}
+
 vtr_main() {
   set -euo pipefail
   export PATH="/usr/local/bin:/usr/bin:/bin:${PATH:-}"
@@ -83,16 +172,19 @@ vtr_main() {
   dn=$(vtr_display_name "$info")
   pols=$(vtr_policies_csv "$info")
 
-  # Drift guard (added 2026-06-07): the renewer must NOT keep a FOREIGN token alive.
-  # On 2026-06-05 a stray `vault login -method=kubernetes` overwrote ~/.vault-token
-  # with a read-only woodpecker token, and this script then silently renewed THAT
-  # for two days — masking the loss of write access. So before renewing, confirm
-  # the token is our periodic admin token; if it has drifted, fail loudly (systemd
-  # marks the unit failed) instead of keeping someone else's token alive.
+  # Drift guard (2026-06-07) + self-heal (2026-07-03): the renewer must not
+  # keep a FOREIGN token alive (on 2026-06-05 a stray kubernetes login was
+  # silently renewed for two days, masking lost write access). But detect-only
+  # drift proved worse in practice: an OIDC login — which the infra docs
+  # prescribe before applies — clobbers this file too, and the resulting DRIFT
+  # failures went unnoticed for weeks while access degraded to a 7-day token
+  # (the weekly-expiry loop). On drift we now ATTEMPT to heal (see vtr_heal):
+  # re-mint the periodic token with the clobbering token's own authority.
+  # Vault's authz keeps the old guarantee — a token that couldn't legitimately
+  # hold vault-admin is denied the mint, and we still fail loud.
   if ! vtr_drift_ok "$dn" "$pols"; then
-    printf '%s DRIFT: ~/.vault-token is dn=%q policies=%q (expected dn=%q with %q). Refusing to renew a foreign token. Re-mint: vault login -method=oidc && vault token create -orphan -period=768h -policy=vault-admin -policy=sops-admin -display-name=devvm-wizard -field=token > ~/.vault-token && chmod 600 ~/.vault-token\n' \
-      "$(date -Is)" "$dn" "$pols" "$EXPECTED_DN" "$REQUIRED_POLICY" >>"$log"
-    exit 1
+    vtr_heal "$dn" "$log" || exit 1
+    exit 0
   fi
 
   # `vault token renew` with no argument renews the calling token (renew-self).