fix: restore tree dropped by 6d224861; land stem95su gdrive-sync (10m) [ci skip]

6d224861 came from a --no-checkout worktree whose empty index made the commit drop every file except two. This restores 05b50d2b's full tree and correctly adds stacks/stem95su/gdrive-sync.tf + the service-catalog stem95su entry. Forward-only (parent=6d224861, no force-push); [ci skip] since the live infra was never applied from the broken commit. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-09 08:45:33 +00:00 · 2026-06-09 08:45:33 +00:00 · fd0f4a0365
commit fd0f4a0365
parent 6d224861c4
1166 changed files with 358546 additions and 0 deletions
--- a/stacks/kured/main.tf
+++ b/stacks/kured/main.tf
@ -0,0 +1,347 @@
+# kured — Kubernetes Reboot Daemon
+#
+# Auto-reboots nodes when /var/run/reboot-required exists on the host (set by
+# unattended-upgrades). The reboot process is gated by a custom sentinel file
+# (kured-sentinel-gate DaemonSet below) and by Prometheus alerts so reboots
+# only happen when:
+#   - all nodes Ready
+#   - all calico-node pods Running
+#   - no node has transitioned Ready in the last 24 hours (24h soak)
+#   - no Prometheus alert is firing (excluding self-referential ignore-list)
+#
+# History:
+#   - 2026-03 post-mortem (memory 390): 26h cluster outage triggered by kured
+#     rebooting nodes while containerd's overlayfs snapshotter was corrupted.
+#     Remediation included the sentinel gate and a tight reboot window
+#     (Mon-Fri 02:00-06:00 London).
+#   - 2026-04-18: adopted into Terraform (Wave 5a). Previously helm-installed
+#     manually + kubectl-applied sentinel gate.
+#   - 2026-05-10: re-enabled unattended-upgrades (cloud_init.yaml flipped from
+#     remove → install). Sentinel cool-down stretched 30m → 24h. Added Helm
+#     values prometheusUrl + alertFilterRegexp so any non-ignored firing alert
+#     halts the rollout. New "Upgrade Gates" alert group in monitoring stack
+#     (KubeAPIServerDown, KubeStateMetricsDown, PrometheusRuleEvaluationFailing,
+#     PVCStuckPending, RecentNodeReboot, MysqlStandaloneDown,
+#     ClusterPodReadyRatioDropped, NodeMemoryPressure, NodeDiskPressure,
+#     KubeQuotaAlmostFull) provides explicit cluster-health gating.
+
+resource "kubernetes_namespace" "kured" {
+  metadata {
+    name = "kured"
+    labels = {
+      "istio-injection"  = "disabled"
+      tier               = local.tiers.cluster
+      "keel.sh/enrolled" = "true"
+    }
+  }
+  lifecycle {
+    # KYVERNO_LIFECYCLE_V1: goldilocks-vpa-auto-mode ClusterPolicy stamps this label on every namespace
+    ignore_changes = [metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"]]
+  }
+}
+
+# -----------------------------------------------------------------------------
+# kured Helm release
+# -----------------------------------------------------------------------------
+
+resource "helm_release" "kured" {
+  namespace        = kubernetes_namespace.kured.metadata[0].name
+  create_namespace = false
+  name             = "kured"
+  chart            = "kured"
+  repository       = "https://kubereboot.github.io/charts/"
+  version          = "5.11.0"
+
+  values = [yamlencode({
+    configuration = {
+      period    = "1h0m0s"
+      timeZone  = "Europe/London"
+      startTime = "02:00"
+      endTime   = "06:00"
+      # All 7 days — operator decision 2026-05-16. The Mon–Fri restriction
+      # was a 2026-03-16-era guardrail (overlapping with weekend on-call
+      # response). Today the rest of the safety net (halt-on-alert,
+      # sentinel-gate Check 4 = 24h soak, single-concurrency, the
+      # K8sUpgradeStalled alert) is strong enough to operate any day; the
+      # weekday-only schedule was just slowing the backlog down.
+      rebootDays = ["mo", "tu", "we", "th", "fr", "sa", "su"]
+      # IMPORTANT: must match where kured-sentinel-gate writes (below):
+      # `touch /host/var-run/gated-reboot-required` → host
+      # `/var/run/gated-reboot-required`. The kured chart derives the host
+      # path from `dirname(rebootSentinel)`, so this single setting controls
+      # BOTH the in-pod mountPath AND the host hostPath. Previously
+      # `/sentinel/gated-reboot-required` — that pointed the chart's hostPath
+      # at `/sentinel/` (empty, auto-created by hostPath:Directory) while the
+      # gate kept writing to `/var/run/`. kured never saw the open gate so
+      # nodes stopped rebooting on 2026-05-10 when unattended-upgrades was
+      # re-enabled. Fixed 2026-05-16.
+      rebootSentinel = "/var/run/gated-reboot-required"
+      notifyUrl      = data.vault_kv_secret_v2.secrets.data["slack_kured_webhook"]
+      concurrency    = 1
+      rebootDelay    = "30s"
+      # Fail closed instead of looping forever. Default is 0 (unlimited) — if
+      # a future PDB or finalizer stalls drain, kured retries indefinitely and
+      # the node stays cordoned silently. 30m gives CNPG / shared-store
+      # Anubis / any other stateful workload plenty of time to settle, but
+      # caps the silent-failure window. After timeout kured logs the abort
+      # and waits for the next period; node stays Schedulable so the cluster
+      # doesn't lose capacity. Fixed 2026-05-16.
+      drainTimeout = "30m"
+      # Halt rolling reboots when ANY firing Prometheus alert is not in the
+      # ignore-list. The ignore-list excludes self-referential / always-firing
+      # alerts that would otherwise deadlock kured. alertFilterMatchOnly stays
+      # false (default) so the regex marks alerts to IGNORE — every other
+      # firing alert blocks. See "Upgrade Gates" group in monitoring stack.
+      prometheusUrl        = "http://prometheus-server.monitoring.svc.cluster.local:80"
+      alertFilterRegexp    = "^(Watchdog|RebootRequired|KuredNodeWasNotDrained|InfoInhibitor)$"
+      alertFiringOnly      = true
+      alertFilterMatchOnly = false
+    }
+    reboot_days  = "mon,tue,wed,thu,fri,sat,sun"
+    window_end   = "06:00"
+    window_start = "22:00"
+    service = {
+      annotations = {
+        "prometheus.io/scrape" = "true"
+        "prometheus.io/port"   = "8080"
+        "prometheus.io/path"   = "/metrics"
+      }
+    }
+  })]
+}
+
+data "vault_kv_secret_v2" "secrets" {
+  mount = "secret"
+  name  = "kured"
+}
+
+# -----------------------------------------------------------------------------
+# kured-sentinel-gate
+#
+# Runs a DaemonSet that creates /var/run/gated-reboot-required ONLY when all
+# safety preconditions are met (see script). kured's rebootSentinel points at
+# this file, so reboots are effectively blocked unless every check passes.
+# -----------------------------------------------------------------------------
+
+resource "kubernetes_service_account" "kured_sentinel_gate" {
+  metadata {
+    name      = "kured-sentinel-gate"
+    namespace = kubernetes_namespace.kured.metadata[0].name
+  }
+  # Token IS mounted — the script uses kubectl to read nodes + pods state for
+  # the safety checks. Without an authenticated token, kubectl falls back to
+  # localhost:8080 (no proxy in distroless-ish image), every check silently
+  # no-ops on parse-empty stdout, and the gate appears to PASS when it
+  # shouldn't. Mount the token. (Found 2026-05-10 during Test 3 validation.)
+  automount_service_account_token = true
+}
+
+resource "kubernetes_cluster_role" "kured_sentinel_gate" {
+  metadata {
+    name = "kured-sentinel-gate"
+  }
+  rule {
+    api_groups = [""]
+    resources  = ["nodes"]
+    verbs      = ["list"]
+  }
+  rule {
+    api_groups = [""]
+    resources  = ["pods"]
+    verbs      = ["list"]
+  }
+}
+
+resource "kubernetes_cluster_role_binding" "kured_sentinel_gate" {
+  metadata {
+    name = "kured-sentinel-gate"
+  }
+  role_ref {
+    api_group = "rbac.authorization.k8s.io"
+    kind      = "ClusterRole"
+    name      = kubernetes_cluster_role.kured_sentinel_gate.metadata[0].name
+  }
+  subject {
+    kind      = "ServiceAccount"
+    name      = kubernetes_service_account.kured_sentinel_gate.metadata[0].name
+    namespace = kubernetes_namespace.kured.metadata[0].name
+  }
+}
+
+resource "kubernetes_daemon_set_v1" "kured_sentinel_gate" {
+  metadata {
+    name      = "kured-sentinel-gate"
+    namespace = kubernetes_namespace.kured.metadata[0].name
+    labels = {
+      app  = "kured-sentinel-gate"
+      tier = local.tiers.cluster
+    }
+  }
+  spec {
+    selector {
+      match_labels = {
+        app = "kured-sentinel-gate"
+      }
+    }
+    template {
+      metadata {
+        labels = {
+          app = "kured-sentinel-gate"
+        }
+      }
+      spec {
+        service_account_name            = kubernetes_service_account.kured_sentinel_gate.metadata[0].name
+        automount_service_account_token = true
+        enable_service_links            = false
+        # bitnami/kubectl:latest runs as uid=1001 by default. The hostPath
+        # /var/run is root:root 0755 → final `touch
+        # /host/var-run/gated-reboot-required` fails with EACCES, so the gate
+        # never opens. Run as root inside the container (the hostPath mount
+        # already gives privileged-equivalent access; this just lets us write
+        # to /var/run). Found 2026-05-10 during Test 3 validation.
+        security_context {
+          run_as_user = 0
+        }
+        toleration {
+          effect   = "NoSchedule"
+          key      = "node-role.kubernetes.io/control-plane"
+          operator = "Equal"
+        }
+        toleration {
+          effect   = "NoSchedule"
+          key      = "node-role.kubernetes.io/master"
+          operator = "Equal"
+        }
+        container {
+          name              = "gate"
+          image             = "bitnami/kubectl:latest"
+          image_pull_policy = "Always"
+          command = [
+            "/bin/bash",
+            "-c",
+            <<-EOT
+              # Self-restart guard (added 2026-05-31): a node stuck in
+              # pending-reboot keeps THIS pod on the kubectl-heavy hot path
+              # every cycle. The long-lived bash loop slowly leaks (repeated
+              # kubectl forks + the Check-4 process substitution) until the
+              # cgroup OOM-kills child processes — PID 1 bash survives, so the
+              # pod never restarts, it just racks up silent oom_events
+              # (149 in 7d / accelerating on k8s-master, 2026-05-30..31).
+              # Exit 0 every MAX_ITER cycles (~6h at 300s) so kubelet restarts
+              # the pod fresh and memory can never accumulate.
+              ITER=0
+              MAX_ITER=72
+              while true; do
+                ITER=$((ITER + 1))
+                if [ "$ITER" -gt "$MAX_ITER" ]; then
+                  echo "  Iteration cap ($MAX_ITER) reached — exit 0 for a clean restart (leak guard)"
+                  exit 0
+                fi
+                echo "[$(date)] Checking reboot gate conditions... (iter $ITER/$MAX_ITER)"
+
+                # Check 1: Does the host need a reboot?
+                if [ ! -f /host/var-run/reboot-required ]; then
+                  echo "  No reboot required on this host"
+                  rm -f /host/var-run/gated-reboot-required
+                  sleep 300
+                  continue
+                fi
+                echo "  Host has /var/run/reboot-required"
+
+                # Check 2: Are ALL nodes Ready?
+                NOT_READY=$(kubectl get nodes --no-headers | grep -v ' Ready' | wc -l | tr -d ' ')
+                if [ "$NOT_READY" -gt 0 ]; then
+                  echo "  BLOCKED: $NOT_READY node(s) not Ready"
+                  rm -f /host/var-run/gated-reboot-required
+                  sleep 300
+                  continue
+                fi
+                echo "  All nodes Ready"
+
+                # Check 3: Are ALL calico-node pods Running?
+                CALICO_NOT_RUNNING=$(kubectl get pods -n calico-system -l k8s-app=calico-node --no-headers 2>/dev/null | grep -v Running | wc -l | tr -d ' ')
+                if [ "$CALICO_NOT_RUNNING" -gt 0 ]; then
+                  echo "  BLOCKED: $CALICO_NOT_RUNNING calico-node pod(s) not Running"
+                  rm -f /host/var-run/gated-reboot-required
+                  sleep 300
+                  continue
+                fi
+                echo "  All calico-node pods Running"
+
+                # Check 4: No node rebooted in last 24 hours (soak window).
+                # Stretched from 30m to 24h on 2026-05-10 so the de-facto canary
+                # node has a full day of observation before the next node drains.
+                RECENT_REBOOT=0
+                while IFS= read -r transition_time; do
+                  if [ -n "$transition_time" ]; then
+                    transition_epoch=$(date -d "$transition_time" +%s 2>/dev/null || date -j -f "%Y-%m-%dT%H:%M:%SZ" "$transition_time" +%s 2>/dev/null)
+                    now_epoch=$(date +%s)
+                    diff=$(( now_epoch - transition_epoch ))
+                    if [ "$diff" -lt 86400 ]; then
+                      RECENT_REBOOT=1
+                      break
+                    fi
+                  fi
+                done < <(kubectl get nodes -o jsonpath='{range .items[*]}{range .status.conditions[?(@.type=="Ready")]}{.lastTransitionTime}{"\n"}{end}{end}')
+
+                if [ "$RECENT_REBOOT" -eq 1 ]; then
+                  echo "  BLOCKED: A node transitioned Ready within the last 24 hours (soak window)"
+                  rm -f /host/var-run/gated-reboot-required
+                  sleep 300
+                  continue
+                fi
+                echo "  No recent node reboots (24h soak window clear)"
+
+                # All checks passed — create gated sentinel
+                echo "  ALL CHECKS PASSED — creating /var/run/gated-reboot-required"
+                touch /host/var-run/gated-reboot-required
+                sleep 300
+              done
+            EOT
+          ]
+          resources {
+            requests = {
+              cpu    = "10m"
+              memory = "32Mi"
+            }
+            # 64Mi was too tight for the kubectl-heavy hot path (each kubectl
+            # fork is a ~30-50Mi Go binary). Raised to 256Mi 2026-05-31 after
+            # the k8s-master gate pod OOM-killed child kubectls 149x/7d while
+            # master sat in pending-reboot. The self-restart guard (loop above)
+            # is the primary leak fix; this just gives comfortable headroom.
+            limits = {
+              memory = "256Mi"
+            }
+          }
+          volume_mount {
+            name       = "var-run"
+            mount_path = "/host/var-run"
+          }
+        }
+        volume {
+          name = "var-run"
+          host_path {
+            path = "/var/run"
+            type = "Directory"
+          }
+        }
+      }
+    }
+  }
+  lifecycle {
+    ignore_changes = [
+      spec[0].template[0].spec[0].dns_config, # KYVERNO_LIFECYCLE_V1
+      metadata[0].annotations["keel.sh/policy"],
+      metadata[0].annotations["keel.sh/trigger"],
+      metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2
+      metadata[0].annotations["keel.sh/match-tag"],
+      spec[0].template[0].spec[0].container[0].image, # KEEL_IGNORE_IMAGE — Keel manages tag updates
+      metadata[0].annotations["kubernetes.io/change-cause"],
+      metadata[0].annotations["deployment.kubernetes.io/revision"],
+      spec[0].template[0].metadata[0].annotations["keel.sh/update-time"], # KEEL_LIFECYCLE_V1
+    ]
+  }
+}
+
+# CI retrigger 2026-05-16T13:42:57+00:00 — bulk enrollment apply (pipeline #689 killed)
+# CI retrigger v2 2026-05-16T13:46:35+00:00
--- a/stacks/kured/secrets
+++ b/stacks/kured/secrets
@ -0,0 +1 @@
+../../secrets
--- a/stacks/kured/terragrunt.hcl
+++ b/stacks/kured/terragrunt.hcl
@ -0,0 +1,8 @@
+include "root" {
+  path = find_in_parent_folders()
+}
+
+dependency "platform" {
+  config_path  = "../platform"
+  skip_outputs = true
+}