From 077ac97df511f34ca7f78aec1a019c238a45aa5b Mon Sep 17 00:00:00 2001
From: Viktor Barzin <vbarzin@gmail.com>
Date: Fri, 19 Jun 2026 06:04:30 +0000
Subject: [PATCH] k8s-version-upgrade: auto-restore apiserver OIDC after
 control-plane bumps
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

kubeadm upgrade apply regenerates the apiserver static-pod manifest and drops
the --authentication-config flag, silently breaking SSO (kubectl/kubelogin + the
k8s dashboard) until someone manually re-applies the rbac stack. That manual step
was required after every control-plane upgrade — the one thing keeping autonomous
patch upgrades from being truly hands-off (it bit us this cycle: an earlier master
bump left SSO broken until we noticed).

Automate it: the rbac stack now publishes its existing OIDC restore script (the
same one its null_resource runs) to a kube-system/apiserver-oidc-restore
ConfigMap, and the upgrade chain's phase_master re-runs it on master right after
the kubeadm upgrade — while tigera-operator is still quiesced so the flag-add
apiserver restart can't crashloop it. The script is idempotent and health-gates
/livez with auto-rollback; the step is non-fatal (a failure won't abort the
upgrade — it only leaves SSO down until the rbac null_resource is re-applied
with -replace). phase_master already self-skips
when master is at target, so this only fires when master was actually upgraded.

The chain SA gets a name-scoped get on that one ConfigMap. Runbook updated: the
manual restore is now a documented fallback (command corrected — it needs
-replace, since the null_resource trigger hash never changes).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 docs/runbooks/k8s-version-upgrade.md          | 35 +++++++++++++------
 stacks/k8s-version-upgrade/main.tf            |  9 +++++
 .../scripts/upgrade-step.sh                   | 27 ++++++++++++++
 stacks/rbac/modules/rbac/apiserver-oidc.tf    | 19 ++++++++++
 4 files changed, 80 insertions(+), 10 deletions(-)

diff --git a/docs/runbooks/k8s-version-upgrade.md b/docs/runbooks/k8s-version-upgrade.md
index bba6be0f..7533da44 100644
--- a/docs/runbooks/k8s-version-upgrade.md
+++ b/docs/runbooks/k8s-version-upgrade.md
@@ -150,27 +150,42 @@ Exposed in K8s via ExternalSecret `k8s-upgrade-creds` in the `k8s-upgrade` names
 
 ## Common Operations
 
-### Post-upgrade: restore apiserver OIDC (REQUIRED after any control-plane bump)
+### Post-upgrade: apiserver OIDC restore (AUTOMATED by the chain since 2026-06-19)
 
 `kubeadm upgrade apply` **regenerates `/etc/kubernetes/manifests/kube-apiserver.yaml`
 and drops the `--authentication-config` flag**, silently disabling apiserver
 OIDC (kubectl/kubelogin CLI **and** the web dashboard SSO break — tokens get
-401). This is not auto-detected (the `rbac` stack's `null_resource` trigger is a
-content hash that doesn't change). After any control-plane upgrade, re-apply:
+401). This used to require a manual re-apply after **every** control-plane bump.
+
+**Now automated:** the `rbac` stack publishes its OIDC restore script to the
+`kube-system/apiserver-oidc-restore` ConfigMap, and the version-upgrade chain's
+`phase_master` re-runs it on master immediately after `kubeadm upgrade apply`
+(while tigera-operator is still quiesced, so the flag-add apiserver restart can't
+crashloop the operator). It's idempotent, health-gates `/livez` with
+auto-rollback, and is **non-fatal** — a failure only leaves SSO down until the
+`-replace` re-apply below is run (the version upgrade itself already succeeded). So a chain-driven
+control-plane bump no longer breaks SSO. The master phase self-skips when master
+is already at target, so this only runs when master was actually upgraded.
+
+**Manual fallback** — only for an out-of-band/manual `kubeadm` upgrade, or if the
+chain logged an OIDC `WARN:` (e.g. `WARN: --authentication-config absent after re-apply`):
 
 ```bash
 cd stacks/rbac
 TF_VAR_ssh_private_key="$(cat ~/.ssh/id_ed25519)" \
   VAULT_ADDR=https://vault.viktorbarzin.me ../../scripts/tg apply \
-  --non-interactive -target=module.rbac.null_resource.apiserver_oidc_config
+  --non-interactive -target=module.rbac.null_resource.apiserver_oidc_config \
+  -replace=module.rbac.null_resource.apiserver_oidc_config
 ```
 
-(`ssh_private_key` must be a key authorized for `wizard@<master>`; it is not yet
-wired from Vault.) The provisioner re-writes `/etc/kubernetes/pki/auth-config.yaml`
-(both `kubernetes` + `k8s-dashboard` issuers), re-adds the flag, and
-health-gates `/livez` with auto-rollback. Verify: `curl -sk
-https://localhost:6443/livez` on the master = `ok`, and the apiserver manifest
-contains `--authentication-config`. See `docs/plans/2026-06-04-k8s-dashboard-sso-design.md`.
+(`-replace` is **required** — the `null_resource` trigger is a content hash that
+doesn't change, so a plain `-target` apply is a no-op. `ssh_private_key` must be a
+key authorized for `wizard@<master>`.) The provisioner re-writes
+`/etc/kubernetes/pki/auth-config.yaml` (both `kubernetes` + `k8s-dashboard`
+issuers), re-adds the flag, and health-gates `/livez` with auto-rollback. Verify:
+`curl -sk https://localhost:6443/livez` on the master = `ok`, and the apiserver
+manifest contains `--authentication-config`. See
+`docs/plans/2026-06-04-k8s-dashboard-sso-design.md`.
 
 ### Verify the pipeline is healthy
 ```bash
diff --git a/stacks/k8s-version-upgrade/main.tf b/stacks/k8s-version-upgrade/main.tf
index 1a77510e..738b5431 100644
--- a/stacks/k8s-version-upgrade/main.tf
+++ b/stacks/k8s-version-upgrade/main.tf
@@ -221,6 +221,15 @@ resource "kubernetes_cluster_role" "k8s_upgrade_job" {
     resource_names = [local.namespace]
     verbs          = ["get", "patch", "update"]
   }
+  # Read the apiserver-OIDC restore script (published by the rbac stack to
+  # kube-system) so phase_master can re-apply --authentication-config after a
+  # kubeadm control-plane upgrade drops it. Name-scoped get only (a ClusterRole rule, so the name is matched in any namespace).
+  rule {
+    api_groups     = [""]
+    resources      = ["configmaps"]
+    resource_names = ["apiserver-oidc-restore"]
+    verbs          = ["get"]
+  }
 }
 
 resource "kubernetes_cluster_role_binding" "k8s_upgrade_job" {
diff --git a/stacks/k8s-version-upgrade/scripts/upgrade-step.sh b/stacks/k8s-version-upgrade/scripts/upgrade-step.sh
index 1e73be4d..95bfb9c7 100644
--- a/stacks/k8s-version-upgrade/scripts/upgrade-step.sh
+++ b/stacks/k8s-version-upgrade/scripts/upgrade-step.sh
@@ -521,6 +521,33 @@ phase_master() {
   alerts=$(halt_on_alert_query "RecentNodeReboot|IngressTTFBCritical")
   [ -n "$alerts" ] && { slack "ABORT master — alerts firing post-upgrade: $alerts"; exit 1; }
 
+  # Re-apply apiserver OIDC. `kubeadm upgrade apply` regenerates the apiserver
+  # static-pod manifest and DROPS --authentication-config, silently breaking SSO
+  # (kubectl/kubelogin + the dashboard) until re-applied — historically a manual
+  # `tg apply` of the rbac stack after every control-plane bump. Automate it here
+  # while tigera-operator is STILL quiesced, so the flag-add apiserver restart
+  # cannot crashloop the operator. Single source of truth: the rbac stack
+  # publishes the exact script its own null_resource runs to a kube-system
+  # ConfigMap; it is idempotent and health-gates /livez with auto-rollback, and a
+  # failure here is NON-FATAL (the version upgrade already succeeded — only SSO
+  # stays down until the rbac null_resource is re-applied with -replace).
+  local oidc_restore
+  oidc_restore=$($KUBECTL -n kube-system get configmap apiserver-oidc-restore \
+    -o jsonpath='{.data.restore\.sh}' 2>/dev/null || true)
+  if [ -n "$oidc_restore" ]; then
+    slack "Re-applying apiserver OIDC after master upgrade"
+    printf '%s' "$oidc_restore" | ssh "${SSH_OPTS[@]}" "$(ssh_target k8s-master)" 'bash -s' \
+      || slack "WARN: apiserver OIDC re-apply exited non-zero — verify SSO"
+    if ssh "${SSH_OPTS[@]}" "$(ssh_target k8s-master)" \
+         'sudo grep -q -- "--authentication-config=" /etc/kubernetes/manifests/kube-apiserver.yaml'; then
+      slack "apiserver OIDC restored (--authentication-config present)"
+    else
+      slack "WARN: --authentication-config absent after re-apply — SSO down; run the rbac apiserver_oidc_config apply"
+    fi
+  else
+    slack "WARN: apiserver-oidc-restore ConfigMap missing — skipping OIDC re-apply (apply the rbac stack)"
+  fi
+
   # Restore tigera-operator (happy path) + clear the safety-net EXIT trap.
   echo "Restoring tigera-operator"
   $KUBECTL -n tigera-operator scale deploy tigera-operator --replicas=1 2>&1 || true
diff --git a/stacks/rbac/modules/rbac/apiserver-oidc.tf b/stacks/rbac/modules/rbac/apiserver-oidc.tf
index f88bf924..5165a7d1 100644
--- a/stacks/rbac/modules/rbac/apiserver-oidc.tf
+++ b/stacks/rbac/modules/rbac/apiserver-oidc.tf
@@ -158,3 +158,22 @@ resource "null_resource" "apiserver_oidc_config" {
     auth_config = sha256(local.apiserver_auth_config_yaml)
   }
 }
+
+# Publish the restore script to a ConfigMap so the k8s-version-upgrade chain can
+# re-apply apiserver OIDC on master immediately after a `kubeadm upgrade` (which
+# regenerates the apiserver manifest and drops --authentication-config → breaks
+# SSO). This is the SAME script the null_resource above runs over SSH, so the
+# rbac stack stays the single source of truth — the chain just re-runs it
+# post-upgrade (phase_master in
+# stacks/k8s-version-upgrade/scripts/upgrade-step.sh) instead of waiting for a
+# manual `tg apply`. Content is config (issuer URLs + claim mappings), not
+# secrets, so a ConfigMap is appropriate.
+resource "kubernetes_config_map_v1" "apiserver_oidc_restore" {
+  metadata {
+    name      = "apiserver-oidc-restore" # looked up by this exact name by upgrade-step.sh and the chain's RBAC rule
+    namespace = "kube-system"
+  }
+  data = {
+    "restore.sh" = local.apiserver_auth_remote_script # key is read by upgrade-step.sh via jsonpath {.data.restore\.sh}
+  }
+}