infra: re-enable unattended-upgrades with kured prometheus-gating
Reverses the March 2026 outage mitigation that disabled unattended-
upgrades cluster-wide. Now re-enables it on the k8s template VM with:
- Allowed-Origins limited to security/updates pockets
- Package-Blacklist for k8s/containerd/runc/calico-node (apt-mark
hold on the cluster-critical components)
- Automatic-Reboot disabled — kured drives the actual reboots
- Compatible with the existing kured + sentinel-gate flow
kured side:
- rebootDelay 30s, concurrency 1
- Sentinel cool-down stretched 30m → 24h (aligns with the 24h soak
window from the post-mortem)
- prometheusUrl + alertFilterRegexp wired so any firing non-ignored
alert halts the rollout. The ignore-list covers the self-referential
alerts (Watchdog/RebootRequired/KuredNodeWasNotDrained/
InfoInhibitor), which would otherwise deadlock kured.
Prometheus side (already partly landed in 6c4e0966 — the "Upgrade
Gates" rule group):
- Refine `KubeQuotaAlmostFull` to include the resourcequota label in
both the on-clause and the summary, so multi-quota namespaces
(authentik, beads-server, frigate) report the quota name correctly.
grafana.tf: terraform fmt whitespace only.
Together with the 2026-03-22 post-mortem (memory id=390), this closes
the loop: unattended-upgrades runs again and kernel-class updates can
land, but only when cluster health is green and the reboot window is open.
This commit is contained in:
parent
fe75fad467
commit
5c0ea96a91
4 changed files with 71 additions and 15 deletions
|
|
@ -179,13 +179,13 @@ resource "null_resource" "grafana_admin_only_folder_acl" {
|
|||
# Re-runs on tg apply (cheap, idempotent API call). Catches drift if anyone
|
||||
# edits permissions via the UI or the folder is rebuilt.
|
||||
triggers = {
|
||||
folder = each.value
|
||||
always = timestamp()
|
||||
folder = each.value
|
||||
always = timestamp()
|
||||
}
|
||||
|
||||
provisioner "local-exec" {
|
||||
interpreter = ["/bin/bash", "-c"]
|
||||
command = <<-EOT
|
||||
command = <<-EOT
|
||||
set -euo pipefail
|
||||
FOLDER='${each.value}'
|
||||
KUBECONFIG_FLAG='--kubeconfig ${var.kube_config_path}'
|
||||
|
|
|
|||
|
|
@ -1866,12 +1866,12 @@ serverFiles:
|
|||
- alert: KubeQuotaAlmostFull
|
||||
expr: |
|
||||
kube_resourcequota{type="used"}
|
||||
/ on(namespace, resource) kube_resourcequota{type="hard"} > 0.95
|
||||
/ on(namespace, resource, resourcequota) kube_resourcequota{type="hard"} > 0.95
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "ResourceQuota {{ $labels.namespace }}/{{ $labels.resource }} at {{ $value | printf \"%.1f\" }} — workloads may fail to reschedule"
|
||||
summary: "ResourceQuota {{ $labels.namespace }}/{{ $labels.resourcequota }} {{ $labels.resource }} at {{ $value | printf \"%.1f\" }} — workloads may fail to reschedule"
|
||||
- name: "Traefik Ingress"
|
||||
rules:
|
||||
- alert: TraefikDown
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue