From 5c0ea96a9168dd60066b2b9c0ff735283f38e7d1 Mon Sep 17 00:00:00 2001
From: Viktor Barzin <vbarzin@gmail.com>
Date: Sun, 10 May 2026 17:07:32 +0000
Subject: [PATCH] infra: re-enable unattended-upgrades with kured
 prometheus-gating
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reverses the March 2026 outage mitigation that disabled unattended-
upgrades cluster-wide. Now re-enables it on the k8s template VM with:

  - Allowed-Origins limited to the release, -security, -updates and ESM
    pockets
  - Package-Blacklist for containerd/runc/cri-tools/CNI plugins/calico/
    docker-ce, plus apt-mark hold on kubelet/kubeadm/kubectl and the
    container runtime
  - Automatic-Reboot disabled — kured drives the actual reboots
  - Compatible with the existing kured + sentinel-gate flow

kured side:
  - rebootDelay 30s, concurrency 1
  - Sentinel cool-down stretched 30m → 24h (aligns with the 24h soak
    window from the post-mortem)
  - prometheusUrl + alertFilterRegexp wired so any firing non-ignored
    alert halts the rollout. Ignore-list excludes self-referential
    alerts (Watchdog/RebootRequired/KuredNodeWasNotDrained/
    InfoInhibitor) that would otherwise deadlock kured.

Prometheus side (already partly landed in 6c4e0966 — the "Upgrade
Gates" rule group):
  - Refine `KubeQuotaAlmostFull` to include the resourcequota label in
    both the on-clause and the summary, so multi-quota namespaces
    (authentik, beads-server, frigate) report the quota name correctly.

grafana.tf: terraform fmt whitespace only.

Together with the post-mortem 2026-03-22 (memory id=390) the loop is
closed: unattended-upgrades runs again, kernel-class updates can land,
but only when cluster health is green and the reboot window is open.
---
 modules/create-template-vm/cloud_init.yaml    | 41 +++++++++++++++++--
 stacks/kured/main.tf                          | 35 +++++++++++++---
 .../monitoring/modules/monitoring/grafana.tf  |  6 +--
 .../monitoring/prometheus_chart_values.tpl    |  4 +-
 4 files changed, 71 insertions(+), 15 deletions(-)

diff --git a/modules/create-template-vm/cloud_init.yaml b/modules/create-template-vm/cloud_init.yaml
index 8b696cad..1dc683f6 100644
--- a/modules/create-template-vm/cloud_init.yaml
+++ b/modules/create-template-vm/cloud_init.yaml
@@ -67,11 +67,44 @@ runcmd:
   - sed -i 's/#Compress=yes/Compress=yes/' /etc/systemd/journald.conf
   - systemctl restart systemd-journald
   %{if is_k8s_template}
-  # Disable unattended-upgrades to prevent unexpected kernel updates that can break containerd/kubelet
-  # (Root cause of 26h cluster outage: unattended-upgrades → kernel update → containerd failure)
-  - systemctl disable --now unattended-upgrades || true
-  - apt-get remove -y unattended-upgrades || true
+  # Re-enabled 2026-05-10: unattended-upgrades is back on, but with a tight
+  # Allowed-Origins list, a Package-Blacklist for k8s/containerd/runc/calico,
+  # and Automatic-Reboot disabled (kured + sentinel-gate handles reboots in a
+  # 24h-soaked rolling window, gated by Prometheus alerts).
+  # Original outage (March 2026) was kernel update → containerd overlayfs corruption.
+  # Mitigations: 24h cool-down between node reboots, Prometheus halt-on-alert,
+  # apt-mark hold on k8s components, Package-Blacklist for runtime components.
+  - apt-get install -y unattended-upgrades update-notifier-common
+  - |
+    cat > /etc/apt/apt.conf.d/52unattended-upgrades-k8s <<'EOF'
+    Unattended-Upgrade::Allowed-Origins {
+        "$${distro_id}:$${distro_codename}";
+        "$${distro_id}:$${distro_codename}-security";
+        "$${distro_id}:$${distro_codename}-updates";
+        "$${distro_id}ESMApps:$${distro_codename}-apps-security";
+        "$${distro_id}ESM:$${distro_codename}-infra-security";
+    };
+    Unattended-Upgrade::Package-Blacklist {
+        "^containerd(\.io)?$";
+        "^runc$";
+        "^cri-tools$";
+        "^kubernetes-cni$";
+        "^calico-.*";
+        "^cni-plugins-.*";
+        "^docker-ce$";
+    };
+    Unattended-Upgrade::DevRelease "false";
+    Unattended-Upgrade::Automatic-Reboot "false";
+    EOF
+  - |
+    cat > /etc/apt/apt.conf.d/20auto-upgrades <<'EOF'
+    APT::Periodic::Update-Package-Lists "1";
+    APT::Periodic::Unattended-Upgrade "1";
+    EOF
+  - systemctl unmask unattended-upgrades 2>/dev/null || true
+  - systemctl enable --now unattended-upgrades
   - apt-mark hold kubelet kubeadm kubectl
+  - apt-mark hold containerd containerd.io runc 2>/dev/null || true
   - systemctl stop kubelet
   - containerd config default | sudo tee /etc/containerd/config.toml
   - ${containerd_config_update_command}
diff --git a/stacks/kured/main.tf b/stacks/kured/main.tf
index 183974ea..896f23d3 100644
--- a/stacks/kured/main.tf
+++ b/stacks/kured/main.tf
@@ -2,10 +2,12 @@
 #
 # Auto-reboots nodes when /var/run/reboot-required exists on the host (set by
 # unattended-upgrades). The reboot process is gated by a custom sentinel file
-# (kured-sentinel-gate DaemonSet below) so reboots only happen when:
+# (kured-sentinel-gate DaemonSet below) and by Prometheus alerts so reboots
+# only happen when:
 #   - all nodes Ready
 #   - all calico-node pods Running
-#   - no node has transitioned Ready in the last 30 minutes (cool-down)
+#   - no node has transitioned Ready in the last 24 hours (24h soak)
+#   - no Prometheus alert is firing (excluding self-referential ignore-list)
 #
 # History:
 #   - 2026-03 post-mortem (memory 390): 26h cluster outage triggered by kured
@@ -14,6 +16,14 @@
 #     (Mon-Fri 02:00-06:00 London).
 #   - 2026-04-18: adopted into Terraform (Wave 5a). Previously helm-installed
 #     manually + kubectl-applied sentinel gate.
+#   - 2026-05-10: re-enabled unattended-upgrades (cloud_init.yaml flipped from
+#     remove → install). Sentinel cool-down stretched 30m → 24h. Added Helm
+#     values prometheusUrl + alertFilterRegexp so any non-ignored firing alert
+#     halts the rollout. New "Upgrade Gates" alert group in monitoring stack
+#     (KubeAPIServerDown, KubeStateMetricsDown, PrometheusRuleEvaluationFailing,
+#     PVCStuckPending, RecentNodeReboot, MysqlStandaloneDown,
+#     ClusterPodReadyRatioDropped, NodeMemoryPressure, NodeDiskPressure,
+#     KubeQuotaAlmostFull) provides explicit cluster-health gating.
 
 resource "kubernetes_namespace" "kured" {
   metadata {
@@ -50,6 +60,17 @@ resource "helm_release" "kured" {
       rebootDays     = ["mo", "tu", "we", "th", "fr"]
       rebootSentinel = "/sentinel/gated-reboot-required"
       notifyUrl      = data.vault_kv_secret_v2.secrets.data["slack_kured_webhook"]
+      concurrency    = 1
+      rebootDelay    = "30s"
+      # Halt rolling reboots when ANY firing Prometheus alert is not in the
+      # ignore-list. The ignore-list excludes self-referential / always-firing
+      # alerts that would otherwise deadlock kured. alertFilterMatchOnly stays
+      # false (default) so the regex marks alerts to IGNORE — every other
+      # firing alert blocks. See "Upgrade Gates" group in monitoring stack.
+      prometheusUrl        = "http://prometheus-server.monitoring.svc.cluster.local:80"
+      alertFilterRegexp    = "^(Watchdog|RebootRequired|KuredNodeWasNotDrained|InfoInhibitor)$"
+      alertFiringOnly      = true
+      alertFilterMatchOnly = false
     }
     reboot_days  = "mon,tue,wed,thu,fri"
     window_end   = "06:00"
@@ -192,14 +213,16 @@ resource "kubernetes_daemon_set_v1" "kured_sentinel_gate" {
                 fi
                 echo "  All calico-node pods Running"
 
-                # Check 4: No node rebooted in last 30 minutes (cool-down)
+                # Check 4: No node rebooted in last 24 hours (soak window).
+                # Stretched from 30m to 24h on 2026-05-10 so the de-facto canary
+                # node has a full day of observation before the next node drains.
                 RECENT_REBOOT=0
                 while IFS= read -r transition_time; do
                   if [ -n "$transition_time" ]; then
                     transition_epoch=$(date -d "$transition_time" +%s 2>/dev/null || date -j -f "%Y-%m-%dT%H:%M:%SZ" "$transition_time" +%s 2>/dev/null)
                     now_epoch=$(date +%s)
                     diff=$(( now_epoch - transition_epoch ))
-                    if [ "$diff" -lt 1800 ]; then
+                    if [ "$diff" -lt 86400 ]; then
                       RECENT_REBOOT=1
                       break
                     fi
@@ -207,12 +230,12 @@ resource "kubernetes_daemon_set_v1" "kured_sentinel_gate" {
                 done < <(kubectl get nodes -o jsonpath='{range .items[*]}{range .status.conditions[?(@.type=="Ready")]}{.lastTransitionTime}{"\n"}{end}{end}')
 
                 if [ "$RECENT_REBOOT" -eq 1 ]; then
-                  echo "  BLOCKED: A node transitioned Ready within the last 30 minutes (cool-down)"
+                  echo "  BLOCKED: A node transitioned Ready within the last 24 hours (soak window)"
                   rm -f /host/var-run/gated-reboot-required
                   sleep 300
                   continue
                 fi
-                echo "  No recent node reboots (30m cool-down clear)"
+                echo "  No recent node reboots (24h soak window clear)"
 
                 # All checks passed — create gated sentinel
                 echo "  ALL CHECKS PASSED — creating /var/run/gated-reboot-required"
diff --git a/stacks/monitoring/modules/monitoring/grafana.tf b/stacks/monitoring/modules/monitoring/grafana.tf
index e7abd8c6..cb1bcd93 100644
--- a/stacks/monitoring/modules/monitoring/grafana.tf
+++ b/stacks/monitoring/modules/monitoring/grafana.tf
@@ -179,13 +179,13 @@ resource "null_resource" "grafana_admin_only_folder_acl" {
   # Re-runs on tg apply (cheap, idempotent API call). Catches drift if anyone
   # edits permissions via the UI or the folder is rebuilt.
   triggers = {
-    folder    = each.value
-    always    = timestamp()
+    folder = each.value
+    always = timestamp()
   }
 
   provisioner "local-exec" {
     interpreter = ["/bin/bash", "-c"]
-    command = <<-EOT
+    command     = <<-EOT
       set -euo pipefail
       FOLDER='${each.value}'
       KUBECONFIG_FLAG='--kubeconfig ${var.kube_config_path}'
diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
index 3ec9d0d5..1e15e98c 100755
--- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
+++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
@@ -1866,12 +1866,12 @@ serverFiles:
           - alert: KubeQuotaAlmostFull
             expr: |
               kube_resourcequota{type="used"}
-              / on(namespace, resource) kube_resourcequota{type="hard"} > 0.95
+              / on(namespace, resource, resourcequota) kube_resourcequota{type="hard"} > 0.95
             for: 15m
             labels:
               severity: warning
             annotations:
-              summary: "ResourceQuota {{ $labels.namespace }}/{{ $labels.resource }} at {{ $value | printf \"%.1f\" }} — workloads may fail to reschedule"
+              summary: "ResourceQuota {{ $labels.namespace }}/{{ $labels.resourcequota }} {{ $labels.resource }} at {{ $value | humanizePercentage }} — workloads may fail to reschedule"
       - name: "Traefik Ingress"
         rules:
           - alert: TraefikDown