infra: re-enable unattended-upgrades with kured prometheus-gating
Reverses the March 2026 outage mitigation that disabled unattended-
upgrades cluster-wide. Now re-enables it on the k8s template VM with:
- Allowed-Origins limited to the release, -security, -updates and ESM pockets
- Package-Blacklist for containerd/runc/cri-tools/kubernetes-cni/calico/
cni-plugins/docker-ce, plus apt-mark hold on kubelet/kubeadm/kubectl
and containerd/runc (the cluster-critical components)
- Automatic-Reboot disabled — kured drives the actual reboots
- Compatible with the existing kured + sentinel-gate flow
kured side:
- rebootDelay 30s, concurrency 1
- Sentinel cool-down stretched 30m → 24h (aligns with the 24h soak
window from the post-mortem)
- prometheusUrl + alertFilterRegexp wired so any firing non-ignored
alert halts the rollout. The ignore-list exempts self-referential
alerts (Watchdog/RebootRequired/KuredNodeWasNotDrained/
InfoInhibitor) that would otherwise deadlock kured.
Prometheus side (already partly landed in 6c4e0966 — the "Upgrade
Gates" rule group):
- Refine `KubeQuotaAlmostFull` to include the resourcequota label in
both the on-clause and the summary, so multi-quota namespaces
(authentik, beads-server, frigate) report the quota name correctly.
grafana.tf: terraform fmt whitespace only.
Together with the 2026-03-22 post-mortem (memory id=390) the loop is
closed: unattended-upgrades runs again, kernel-class updates can land,
but only when cluster health is green and the reboot window is open.
This commit is contained in:
parent
fe75fad467
commit
5c0ea96a91
4 changed files with 71 additions and 15 deletions
|
|
@ -67,11 +67,44 @@ runcmd:
|
||||||
- sed -i 's/#Compress=yes/Compress=yes/' /etc/systemd/journald.conf
|
- sed -i 's/#Compress=yes/Compress=yes/' /etc/systemd/journald.conf
|
||||||
- systemctl restart systemd-journald
|
- systemctl restart systemd-journald
|
||||||
%{if is_k8s_template}
|
%{if is_k8s_template}
|
||||||
# Disable unattended-upgrades to prevent unexpected kernel updates that can break containerd/kubelet
|
# Re-enabled 2026-05-10: unattended-upgrades is back on, but with a tight
|
||||||
# (Root cause of 26h cluster outage: unattended-upgrades → kernel update → containerd failure)
|
# Allowed-Origins list, a Package-Blacklist for k8s/containerd/runc/calico,
|
||||||
- systemctl disable --now unattended-upgrades || true
|
# and Automatic-Reboot disabled (kured + sentinel-gate handles reboots in a
|
||||||
- apt-get remove -y unattended-upgrades || true
|
# 24h-soaked rolling window, gated by Prometheus alerts).
|
||||||
|
# Original outage (March 2026) was kernel update → containerd overlayfs corruption.
|
||||||
|
# Mitigations: 24h cool-down between node reboots, Prometheus halt-on-alert,
|
||||||
|
# apt-mark hold on k8s components, Package-Blacklist for runtime components.
|
||||||
|
- apt-get install -y unattended-upgrades update-notifier-common
|
||||||
|
- |
|
||||||
|
cat > /etc/apt/apt.conf.d/52unattended-upgrades-k8s <<'EOF'
|
||||||
|
Unattended-Upgrade::Allowed-Origins {
|
||||||
|
"$${distro_id}:$${distro_codename}";
|
||||||
|
"$${distro_id}:$${distro_codename}-security";
|
||||||
|
"$${distro_id}:$${distro_codename}-updates";
|
||||||
|
"$${distro_id}ESMApps:$${distro_codename}-apps-security";
|
||||||
|
"$${distro_id}ESM:$${distro_codename}-infra-security";
|
||||||
|
};
|
||||||
|
Unattended-Upgrade::Package-Blacklist {
|
||||||
|
"^containerd(\.io)?$$";
|
||||||
|
"^runc$$";
|
||||||
|
"^cri-tools$$";
|
||||||
|
"^kubernetes-cni$$";
|
||||||
|
"^calico-.*";
|
||||||
|
"^cni-plugins-.*";
|
||||||
|
"^docker-ce$$";
|
||||||
|
};
|
||||||
|
Unattended-Upgrade::DevRelease "false";
|
||||||
|
Unattended-Upgrade::Automatic-Reboot "false";
|
||||||
|
EOF
|
||||||
|
- |
|
||||||
|
cat > /etc/apt/apt.conf.d/20auto-upgrades <<'EOF'
|
||||||
|
APT::Periodic::Update-Package-Lists "1";
|
||||||
|
APT::Periodic::Unattended-Upgrade "1";
|
||||||
|
EOF
|
||||||
|
- systemctl unmask unattended-upgrades 2>/dev/null || true
|
||||||
|
- systemctl enable --now unattended-upgrades
|
||||||
- apt-mark hold kubelet kubeadm kubectl
|
- apt-mark hold kubelet kubeadm kubectl
|
||||||
|
- apt-mark hold containerd containerd.io runc 2>/dev/null || true
|
||||||
- systemctl stop kubelet
|
- systemctl stop kubelet
|
||||||
- containerd config default | sudo tee /etc/containerd/config.toml
|
- containerd config default | sudo tee /etc/containerd/config.toml
|
||||||
- ${containerd_config_update_command}
|
- ${containerd_config_update_command}
|
||||||
|
|
|
||||||
|
|
@ -2,10 +2,12 @@
|
||||||
#
|
#
|
||||||
# Auto-reboots nodes when /var/run/reboot-required exists on the host (set by
|
# Auto-reboots nodes when /var/run/reboot-required exists on the host (set by
|
||||||
# unattended-upgrades). The reboot process is gated by a custom sentinel file
|
# unattended-upgrades). The reboot process is gated by a custom sentinel file
|
||||||
# (kured-sentinel-gate DaemonSet below) so reboots only happen when:
|
# (kured-sentinel-gate DaemonSet below) and by Prometheus alerts so reboots
|
||||||
|
# only happen when:
|
||||||
# - all nodes Ready
|
# - all nodes Ready
|
||||||
# - all calico-node pods Running
|
# - all calico-node pods Running
|
||||||
# - no node has transitioned Ready in the last 30 minutes (cool-down)
|
# - no node has transitioned Ready in the last 24 hours (24h soak)
|
||||||
|
# - no Prometheus alert is firing (excluding self-referential ignore-list)
|
||||||
#
|
#
|
||||||
# History:
|
# History:
|
||||||
# - 2026-03 post-mortem (memory 390): 26h cluster outage triggered by kured
|
# - 2026-03 post-mortem (memory 390): 26h cluster outage triggered by kured
|
||||||
|
|
@ -14,6 +16,14 @@
|
||||||
# (Mon-Fri 02:00-06:00 London).
|
# (Mon-Fri 02:00-06:00 London).
|
||||||
# - 2026-04-18: adopted into Terraform (Wave 5a). Previously helm-installed
|
# - 2026-04-18: adopted into Terraform (Wave 5a). Previously helm-installed
|
||||||
# manually + kubectl-applied sentinel gate.
|
# manually + kubectl-applied sentinel gate.
|
||||||
|
# - 2026-05-10: re-enabled unattended-upgrades (cloud_init.yaml flipped from
|
||||||
|
# remove → install). Sentinel cool-down stretched 30m → 24h. Added Helm
|
||||||
|
# values prometheusUrl + alertFilterRegexp so any non-ignored firing alert
|
||||||
|
# halts the rollout. New "Upgrade Gates" alert group in monitoring stack
|
||||||
|
# (KubeAPIServerDown, KubeStateMetricsDown, PrometheusRuleEvaluationFailing,
|
||||||
|
# PVCStuckPending, RecentNodeReboot, MysqlStandaloneDown,
|
||||||
|
# ClusterPodReadyRatioDropped, NodeMemoryPressure, NodeDiskPressure,
|
||||||
|
# KubeQuotaAlmostFull) provides explicit cluster-health gating.
|
||||||
|
|
||||||
resource "kubernetes_namespace" "kured" {
|
resource "kubernetes_namespace" "kured" {
|
||||||
metadata {
|
metadata {
|
||||||
|
|
@ -50,6 +60,17 @@ resource "helm_release" "kured" {
|
||||||
rebootDays = ["mo", "tu", "we", "th", "fr"]
|
rebootDays = ["mo", "tu", "we", "th", "fr"]
|
||||||
rebootSentinel = "/sentinel/gated-reboot-required"
|
rebootSentinel = "/sentinel/gated-reboot-required"
|
||||||
notifyUrl = data.vault_kv_secret_v2.secrets.data["slack_kured_webhook"]
|
notifyUrl = data.vault_kv_secret_v2.secrets.data["slack_kured_webhook"]
|
||||||
|
concurrency = 1
|
||||||
|
rebootDelay = "30s"
|
||||||
|
# Halt rolling reboots when ANY firing Prometheus alert is not in the
|
||||||
|
# ignore-list. The ignore-list excludes self-referential / always-firing
|
||||||
|
# alerts that would otherwise deadlock kured. alertFilterMatchOnly stays
|
||||||
|
# false (default) so the regex marks alerts to IGNORE — every other
|
||||||
|
# firing alert blocks. See "Upgrade Gates" group in monitoring stack.
|
||||||
|
prometheusUrl = "http://prometheus-server.monitoring.svc.cluster.local:80"
|
||||||
|
alertFilterRegexp = "^(Watchdog|RebootRequired|KuredNodeWasNotDrained|InfoInhibitor)$"
|
||||||
|
alertFiringOnly = true
|
||||||
|
alertFilterMatchOnly = false
|
||||||
}
|
}
|
||||||
reboot_days = "mon,tue,wed,thu,fri"
|
reboot_days = "mon,tue,wed,thu,fri"
|
||||||
window_end = "06:00"
|
window_end = "06:00"
|
||||||
|
|
@ -192,14 +213,16 @@ resource "kubernetes_daemon_set_v1" "kured_sentinel_gate" {
|
||||||
fi
|
fi
|
||||||
echo " All calico-node pods Running"
|
echo " All calico-node pods Running"
|
||||||
|
|
||||||
# Check 4: No node rebooted in last 30 minutes (cool-down)
|
# Check 4: No node rebooted in last 24 hours (soak window).
|
||||||
|
# Stretched from 30m to 24h on 2026-05-10 so the de-facto canary
|
||||||
|
# node has a full day of observation before the next node drains.
|
||||||
RECENT_REBOOT=0
|
RECENT_REBOOT=0
|
||||||
while IFS= read -r transition_time; do
|
while IFS= read -r transition_time; do
|
||||||
if [ -n "$transition_time" ]; then
|
if [ -n "$transition_time" ]; then
|
||||||
transition_epoch=$(date -d "$transition_time" +%s 2>/dev/null || date -j -f "%Y-%m-%dT%H:%M:%SZ" "$transition_time" +%s 2>/dev/null)
|
transition_epoch=$(date -d "$transition_time" +%s 2>/dev/null || date -j -f "%Y-%m-%dT%H:%M:%SZ" "$transition_time" +%s 2>/dev/null)
|
||||||
now_epoch=$(date +%s)
|
now_epoch=$(date +%s)
|
||||||
diff=$(( now_epoch - transition_epoch ))
|
diff=$(( now_epoch - transition_epoch ))
|
||||||
if [ "$diff" -lt 1800 ]; then
|
if [ "$diff" -lt 86400 ]; then
|
||||||
RECENT_REBOOT=1
|
RECENT_REBOOT=1
|
||||||
break
|
break
|
||||||
fi
|
fi
|
||||||
|
|
@ -207,12 +230,12 @@ resource "kubernetes_daemon_set_v1" "kured_sentinel_gate" {
|
||||||
done < <(kubectl get nodes -o jsonpath='{range .items[*]}{range .status.conditions[?(@.type=="Ready")]}{.lastTransitionTime}{"\n"}{end}{end}')
|
done < <(kubectl get nodes -o jsonpath='{range .items[*]}{range .status.conditions[?(@.type=="Ready")]}{.lastTransitionTime}{"\n"}{end}{end}')
|
||||||
|
|
||||||
if [ "$RECENT_REBOOT" -eq 1 ]; then
|
if [ "$RECENT_REBOOT" -eq 1 ]; then
|
||||||
echo " BLOCKED: A node transitioned Ready within the last 30 minutes (cool-down)"
|
echo " BLOCKED: A node transitioned Ready within the last 24 hours (soak window)"
|
||||||
rm -f /host/var-run/gated-reboot-required
|
rm -f /host/var-run/gated-reboot-required
|
||||||
sleep 300
|
sleep 300
|
||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
echo " No recent node reboots (30m cool-down clear)"
|
echo " No recent node reboots (24h soak window clear)"
|
||||||
|
|
||||||
# All checks passed — create gated sentinel
|
# All checks passed — create gated sentinel
|
||||||
echo " ALL CHECKS PASSED — creating /var/run/gated-reboot-required"
|
echo " ALL CHECKS PASSED — creating /var/run/gated-reboot-required"
|
||||||
|
|
|
||||||
|
|
@ -179,13 +179,13 @@ resource "null_resource" "grafana_admin_only_folder_acl" {
|
||||||
# Re-runs on tg apply (cheap, idempotent API call). Catches drift if anyone
|
# Re-runs on tg apply (cheap, idempotent API call). Catches drift if anyone
|
||||||
# edits permissions via the UI or the folder is rebuilt.
|
# edits permissions via the UI or the folder is rebuilt.
|
||||||
triggers = {
|
triggers = {
|
||||||
folder = each.value
|
folder = each.value
|
||||||
always = timestamp()
|
always = timestamp()
|
||||||
}
|
}
|
||||||
|
|
||||||
provisioner "local-exec" {
|
provisioner "local-exec" {
|
||||||
interpreter = ["/bin/bash", "-c"]
|
interpreter = ["/bin/bash", "-c"]
|
||||||
command = <<-EOT
|
command = <<-EOT
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
FOLDER='${each.value}'
|
FOLDER='${each.value}'
|
||||||
KUBECONFIG_FLAG='--kubeconfig ${var.kube_config_path}'
|
KUBECONFIG_FLAG='--kubeconfig ${var.kube_config_path}'
|
||||||
|
|
|
||||||
|
|
@ -1866,12 +1866,12 @@ serverFiles:
|
||||||
- alert: KubeQuotaAlmostFull
|
- alert: KubeQuotaAlmostFull
|
||||||
expr: |
|
expr: |
|
||||||
kube_resourcequota{type="used"}
|
kube_resourcequota{type="used"}
|
||||||
/ on(namespace, resource) kube_resourcequota{type="hard"} > 0.95
|
/ on(namespace, resource, resourcequota) kube_resourcequota{type="hard"} > 0.95
|
||||||
for: 15m
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "ResourceQuota {{ $labels.namespace }}/{{ $labels.resource }} at {{ $value | printf \"%.1f\" }} — workloads may fail to reschedule"
|
summary: "ResourceQuota {{ $labels.namespace }}/{{ $labels.resourcequota }} {{ $labels.resource }} at {{ $value | printf \"%.1f\" }} — workloads may fail to reschedule"
|
||||||
- name: "Traefik Ingress"
|
- name: "Traefik Ingress"
|
||||||
rules:
|
rules:
|
||||||
- alert: TraefikDown
|
- alert: TraefikDown
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue