From d1777d61192fcb8b30eb291d1650d9180324b23c Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sun, 10 May 2026 18:16:54 +0000 Subject: [PATCH] kured(sentinel-gate): fix auth + write-perm so safety checks actually run MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Test 3 validation surfaced two latent bugs in the sentinel-gate DaemonSet that have been masked since 2026-04-18 (while unattended-upgrades was off, nothing wrote /var/run/reboot-required, so the gate never had to fire): 1. automount_service_account_token=false on both the SA and the pod spec → kubectl in the script falls back to localhost:8080 on every call. Each check (`kubectl get nodes`, `kubectl get pods -n calico-system`, transition-time read) errors to stderr and emits empty stdout. `wc -l` reports 0 → checks "pass" with no real data. 2. bitnami/kubectl:latest runs as uid=1001 by default. The hostPath /var/run is root:root 0755 → final `touch /host/var-run/gated-reboot-required` fails with EACCES. Fail-safe by accident — but if anything had ever loosened those perms, the broken checks above would have green-lit the gate with no real validation. Fix: enable token mount on the SA + pod, set security_context.run_as_user=0 on the pod spec (so the kubectl container runs as root). Verified post-fix: kubectl returns all 5 nodes, touch succeeds, sentinel-gate now reports the correct `BLOCKED: A node transitioned Ready within the last 24 hours (soak window)` when triggered with k8s-node1's recent reboot within the cool-down period. 
Co-Authored-By: Claude Opus 4.7 --- stacks/kured/main.tf | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/stacks/kured/main.tf b/stacks/kured/main.tf index 896f23d3..7292bb00 100644 --- a/stacks/kured/main.tf +++ b/stacks/kured/main.tf @@ -103,7 +103,12 @@ resource "kubernetes_service_account" "kured_sentinel_gate" { name = "kured-sentinel-gate" namespace = kubernetes_namespace.kured.metadata[0].name } - automount_service_account_token = false + # Token IS mounted — the script uses kubectl to read nodes + pods state for + # the safety checks. Without an authenticated token, kubectl falls back to + # localhost:8080 (no proxy in distroless-ish image), every check silently + # no-ops on parse-empty stdout, and the gate appears to PASS when it + # shouldn't. Mount the token. (Found 2026-05-10 during Test 3 validation.) + automount_service_account_token = true } resource "kubernetes_cluster_role" "kured_sentinel_gate" { @@ -161,8 +166,17 @@ resource "kubernetes_daemon_set_v1" "kured_sentinel_gate" { } spec { service_account_name = kubernetes_service_account.kured_sentinel_gate.metadata[0].name - automount_service_account_token = false + automount_service_account_token = true enable_service_links = false + # bitnami/kubectl:latest runs as uid=1001 by default. The hostPath + # /var/run is root:root 0755 → final `touch + # /host/var-run/gated-reboot-required` fails with EACCES, so the gate + # never opens. Run as root inside the container (the hostPath mount + # already gives privileged-equivalent access; this just lets us write + # to /var/run). Found 2026-05-10 during Test 3 validation. + security_context { + run_as_user = 0 + } toleration { effect = "NoSchedule" key = "node-role.kubernetes.io/control-plane"