From 5c0ea96a9168dd60066b2b9c0ff735283f38e7d1 Mon Sep 17 00:00:00 2001
From: Viktor Barzin
Date: Sun, 10 May 2026 17:07:32 +0000
Subject: [PATCH] infra: re-enable unattended-upgrades with kured prometheus-gating
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reverses the March 2026 outage mitigation that disabled
unattended-upgrades cluster-wide. Now re-enables it on the k8s template VM
with:
- Allowed-Origins limited to the release, -security and -updates pockets
  (plus the ESM apps/infra security origins)
- Package-Blacklist for containerd/runc/cri-tools/kubernetes-cni/calico/
  cni-plugins/docker-ce, plus apt-mark hold on the cluster-critical
  components (kubelet/kubeadm/kubectl, containerd, runc)
- Automatic-Reboot disabled — kured drives the actual reboots
- Compatible with the existing kured + sentinel-gate flow

kured side:
- rebootDelay 30s, concurrency 1
- Sentinel cool-down stretched 30m → 24h (aligns with the 24h soak window
  from the post-mortem)
- prometheusUrl + alertFilterRegexp wired so any firing non-ignored alert
  halts the rollout. Ignore-list excludes self-referential alerts
  (Watchdog/RebootRequired/KuredNodeWasNotDrained/InfoInhibitor) that would
  otherwise deadlock kured.

Prometheus side (already partly landed in 6c4e0966 — the "Upgrade Gates"
rule group):
- Refine `KubeQuotaAlmostFull` to include the resourcequota label in both
  the on-clause and the summary, so multi-quota namespaces (authentik,
  beads-server, frigate) report the quota name correctly.

grafana.tf: terraform fmt whitespace only.

Together with the post-mortem 2026-03-22 (memory id=390) the loop is closed:
unattended-upgrades runs again, kernel-class updates can land, but only when
cluster health is green and the reboot window is open.
---
Reviewer notes (this section is ignored by `git am`):
* Package-Blacklist anchors use a single `$`. cloud_init.yaml is rendered as
  a Terraform template, where only `$${` is an escape sequence; a bare `$$`
  is emitted literally into the apt configuration.
* The KubeQuotaAlmostFull summary renders the used/hard ratio with
  humanizePercentage instead of `%.1f`, which printed "1.0" for nearly every
  firing value.
* Leading whitespace and `=` alignment inside the hunks were reconstructed
  from a flattened copy of this patch (the grafana.tf hunk is whitespace-only,
  so its `-` side is a best guess). If context does not match byte-for-byte,
  apply with `git apply --ignore-whitespace`.

 modules/create-template-vm/cloud_init.yaml    | 41 +++++++++++++++++--
 stacks/kured/main.tf                          | 35 +++++++++++++---
 .../monitoring/modules/monitoring/grafana.tf  |  6 +--
 .../monitoring/prometheus_chart_values.tpl    |  4 +-
 4 files changed, 71 insertions(+), 15 deletions(-)

diff --git a/modules/create-template-vm/cloud_init.yaml b/modules/create-template-vm/cloud_init.yaml
index 8b696cad..1dc683f6 100644
--- a/modules/create-template-vm/cloud_init.yaml
+++ b/modules/create-template-vm/cloud_init.yaml
@@ -67,11 +67,44 @@ runcmd:
   - sed -i 's/#Compress=yes/Compress=yes/' /etc/systemd/journald.conf
   - systemctl restart systemd-journald
   %{if is_k8s_template}
-  # Disable unattended-upgrades to prevent unexpected kernel updates that can break containerd/kubelet
-  # (Root cause of 26h cluster outage: unattended-upgrades → kernel update → containerd failure)
-  - systemctl disable --now unattended-upgrades || true
-  - apt-get remove -y unattended-upgrades || true
+  # Re-enabled 2026-05-10: unattended-upgrades is back on, but with a tight
+  # Allowed-Origins list, a Package-Blacklist for k8s/containerd/runc/calico,
+  # and Automatic-Reboot disabled (kured + sentinel-gate handles reboots in a
+  # 24h-soaked rolling window, gated by Prometheus alerts).
+  # Original outage (March 2026) was kernel update → containerd overlayfs corruption.
+  # Mitigations: 24h cool-down between node reboots, Prometheus halt-on-alert,
+  # apt-mark hold on k8s components, Package-Blacklist for runtime components.
+  - apt-get install -y unattended-upgrades update-notifier-common
+  - |
+    cat > /etc/apt/apt.conf.d/52unattended-upgrades-k8s <<'EOF'
+    Unattended-Upgrade::Allowed-Origins {
+      "$${distro_id}:$${distro_codename}";
+      "$${distro_id}:$${distro_codename}-security";
+      "$${distro_id}:$${distro_codename}-updates";
+      "$${distro_id}ESMApps:$${distro_codename}-apps-security";
+      "$${distro_id}ESM:$${distro_codename}-infra-security";
+    };
+    Unattended-Upgrade::Package-Blacklist {
+      "^containerd(\.io)?$";
+      "^runc$";
+      "^cri-tools$";
+      "^kubernetes-cni$";
+      "^calico-.*";
+      "^cni-plugins-.*";
+      "^docker-ce$";
+    };
+    Unattended-Upgrade::DevRelease "false";
+    Unattended-Upgrade::Automatic-Reboot "false";
+    EOF
+  - |
+    cat > /etc/apt/apt.conf.d/20auto-upgrades <<'EOF'
+    APT::Periodic::Update-Package-Lists "1";
+    APT::Periodic::Unattended-Upgrade "1";
+    EOF
+  - systemctl unmask unattended-upgrades 2>/dev/null || true
+  - systemctl enable --now unattended-upgrades
   - apt-mark hold kubelet kubeadm kubectl
+  - apt-mark hold containerd containerd.io runc 2>/dev/null || true
   - systemctl stop kubelet
   - containerd config default | sudo tee /etc/containerd/config.toml
   - ${containerd_config_update_command}
diff --git a/stacks/kured/main.tf b/stacks/kured/main.tf
index 183974ea..896f23d3 100644
--- a/stacks/kured/main.tf
+++ b/stacks/kured/main.tf
@@ -2,10 +2,12 @@
 #
 # Auto-reboots nodes when /var/run/reboot-required exists on the host (set by
 # unattended-upgrades). The reboot process is gated by a custom sentinel file
-# (kured-sentinel-gate DaemonSet below) so reboots only happen when:
+# (kured-sentinel-gate DaemonSet below) and by Prometheus alerts so reboots
+# only happen when:
 #   - all nodes Ready
 #   - all calico-node pods Running
-#   - no node has transitioned Ready in the last 30 minutes (cool-down)
+#   - no node has transitioned Ready in the last 24 hours (24h soak)
+#   - no Prometheus alert is firing (excluding self-referential ignore-list)
 #
 # History:
 #   - 2026-03 post-mortem (memory 390): 26h cluster outage triggered by kured
@@ -14,6 +16,14 @@
 #     (Mon-Fri 02:00-06:00 London).
 #   - 2026-04-18: adopted into Terraform (Wave 5a). Previously helm-installed
 #     manually + kubectl-applied sentinel gate.
+#   - 2026-05-10: re-enabled unattended-upgrades (cloud_init.yaml flipped from
+#     remove → install). Sentinel cool-down stretched 30m → 24h. Added Helm
+#     values prometheusUrl + alertFilterRegexp so any non-ignored firing alert
+#     halts the rollout. New "Upgrade Gates" alert group in monitoring stack
+#     (KubeAPIServerDown, KubeStateMetricsDown, PrometheusRuleEvaluationFailing,
+#     PVCStuckPending, RecentNodeReboot, MysqlStandaloneDown,
+#     ClusterPodReadyRatioDropped, NodeMemoryPressure, NodeDiskPressure,
+#     KubeQuotaAlmostFull) provides explicit cluster-health gating.
 
 resource "kubernetes_namespace" "kured" {
   metadata {
@@ -50,6 +60,17 @@ resource "helm_release" "kured" {
         rebootDays     = ["mo", "tu", "we", "th", "fr"]
         rebootSentinel = "/sentinel/gated-reboot-required"
         notifyUrl      = data.vault_kv_secret_v2.secrets.data["slack_kured_webhook"]
+        concurrency    = 1
+        rebootDelay    = "30s"
+        # Halt rolling reboots when ANY firing Prometheus alert is not in the
+        # ignore-list. The ignore-list excludes self-referential / always-firing
+        # alerts that would otherwise deadlock kured. alertFilterMatchOnly stays
+        # false (default) so the regex marks alerts to IGNORE — every other
+        # firing alert blocks. See "Upgrade Gates" group in monitoring stack.
+        prometheusUrl        = "http://prometheus-server.monitoring.svc.cluster.local:80"
+        alertFilterRegexp    = "^(Watchdog|RebootRequired|KuredNodeWasNotDrained|InfoInhibitor)$"
+        alertFiringOnly      = true
+        alertFilterMatchOnly = false
       }
       reboot_days = "mon,tue,wed,thu,fri"
       window_end  = "06:00"
@@ -192,14 +213,16 @@ resource "kubernetes_daemon_set_v1" "kured_sentinel_gate" {
               fi
               echo " All calico-node pods Running"
 
-              # Check 4: No node rebooted in last 30 minutes (cool-down)
+              # Check 4: No node rebooted in last 24 hours (soak window).
+              # Stretched from 30m to 24h on 2026-05-10 so the de-facto canary
+              # node has a full day of observation before the next node drains.
               RECENT_REBOOT=0
               while IFS= read -r transition_time; do
                 if [ -n "$transition_time" ]; then
                   transition_epoch=$(date -d "$transition_time" +%s 2>/dev/null || date -j -f "%Y-%m-%dT%H:%M:%SZ" "$transition_time" +%s 2>/dev/null)
                   now_epoch=$(date +%s)
                   diff=$(( now_epoch - transition_epoch ))
-                  if [ "$diff" -lt 1800 ]; then
+                  if [ "$diff" -lt 86400 ]; then
                     RECENT_REBOOT=1
                     break
                   fi
@@ -207,12 +230,12 @@ resource "kubernetes_daemon_set_v1" "kured_sentinel_gate" {
               done < <(kubectl get nodes -o jsonpath='{range .items[*]}{range .status.conditions[?(@.type=="Ready")]}{.lastTransitionTime}{"\n"}{end}{end}')
 
               if [ "$RECENT_REBOOT" -eq 1 ]; then
-                echo " BLOCKED: A node transitioned Ready within the last 30 minutes (cool-down)"
+                echo " BLOCKED: A node transitioned Ready within the last 24 hours (soak window)"
                 rm -f /host/var-run/gated-reboot-required
                 sleep 300
                 continue
               fi
-              echo " No recent node reboots (30m cool-down clear)"
+              echo " No recent node reboots (24h soak window clear)"
 
               # All checks passed — create gated sentinel
               echo " ALL CHECKS PASSED — creating /var/run/gated-reboot-required"
diff --git a/stacks/monitoring/modules/monitoring/grafana.tf b/stacks/monitoring/modules/monitoring/grafana.tf
index e7abd8c6..cb1bcd93 100644
--- a/stacks/monitoring/modules/monitoring/grafana.tf
+++ b/stacks/monitoring/modules/monitoring/grafana.tf
@@ -179,13 +179,13 @@ resource "null_resource" "grafana_admin_only_folder_acl" {
   # Re-runs on tg apply (cheap, idempotent API call). Catches drift if anyone
   # edits permissions via the UI or the folder is rebuilt.
   triggers = {
-    folder  = each.value
-    always  = timestamp()
+    folder = each.value
+    always = timestamp()
   }
 
   provisioner "local-exec" {
     interpreter = ["/bin/bash", "-c"]
-    command = <<-EOT
+    command     = <<-EOT
       set -euo pipefail
       FOLDER='${each.value}'
       KUBECONFIG_FLAG='--kubeconfig ${var.kube_config_path}'
diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
index 3ec9d0d5..1e15e98c 100755
--- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
+++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
@@ -1866,12 +1866,12 @@ serverFiles:
           - alert: KubeQuotaAlmostFull
             expr: |
               kube_resourcequota{type="used"}
-                / on(namespace, resource) kube_resourcequota{type="hard"} > 0.95
+                / on(namespace, resource, resourcequota) kube_resourcequota{type="hard"} > 0.95
             for: 15m
             labels:
               severity: warning
             annotations:
-              summary: "ResourceQuota {{ $labels.namespace }}/{{ $labels.resource }} at {{ $value | printf \"%.1f\" }} — workloads may fail to reschedule"
+              summary: "ResourceQuota {{ $labels.namespace }}/{{ $labels.resourcequota }} {{ $labels.resource }} at {{ $value | humanizePercentage }} — workloads may fail to reschedule"
       - name: "Traefik Ingress"
         rules:
           - alert: TraefikDown