k8s-version-upgrade: kill-switch + ignore RecentNodeReboot + shorter quiet window
Three changes from today's autonomous-pipeline validation session:
1. **Kill-switch ConfigMap** — the chain checks for a `k8s-upgrade-killswitch`
ConfigMap in the `k8s-upgrade` namespace at the top of every phase and at the
start of version-check. If it exists, the chain halts (exit 0) and posts a
Slack message. Single-command emergency stop:
kubectl -n k8s-upgrade create configmap k8s-upgrade-killswitch \
--from-literal=reason="storm response"
Resume: kubectl -n k8s-upgrade delete cm k8s-upgrade-killswitch
Added a Role rule granting get/list/watch on `configmaps`, restricted via `resource_names` to `k8s-upgrade-killswitch`.
2. **Ignore RecentNodeReboot in halt_on_alert_query everywhere** — the
chain itself causes reboots. The pre-drain master check, post-upgrade
worker check, postflight check, and preflight halt-on-alert all now
pass `RecentNodeReboot` as the extra-ignore. Previously only worker
phase's post-upgrade gate did this. The master phase failed silently this
morning on the pre-drain check after my own master reboot.
3. **Preflight quiet-baseline 3600s → 600s** — the 1h cooldown after any
Ready transition meant the chain refused to run for an hour after
every kured reboot. 10 min is enough for kubelet/control-plane to
settle; the 24h-between-cluster-reboots invariant lives in
kured-sentinel-gate, not here.
Validated by kicking off a full chain run: preflight passed in 5s and the
master phase is now draining. Today's storm post-mortem (snapshot CoW
amplification + tigera-operator crashloop feedback loop) drove the
kill-switch design.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
944cf51f6b
commit
fc0510aa67
2 changed files with 63 additions and 15 deletions
|
|
@ -216,6 +216,7 @@ resource "kubernetes_cluster_role_binding" "k8s_upgrade_job" {
|
||||||
}
|
}
|
||||||
|
|
||||||
# Namespaced: read the credentials Secret in k8s-upgrade (SSH key + Slack URL)
|
# Namespaced: read the credentials Secret in k8s-upgrade (SSH key + Slack URL)
|
||||||
|
# + read the kill-switch ConfigMap (one-touch emergency-stop for the chain).
|
||||||
resource "kubernetes_role" "k8s_upgrade_job_ns" {
|
resource "kubernetes_role" "k8s_upgrade_job_ns" {
|
||||||
metadata {
|
metadata {
|
||||||
name = "k8s-upgrade-job-ns"
|
name = "k8s-upgrade-job-ns"
|
||||||
|
|
@ -227,6 +228,14 @@ resource "kubernetes_role" "k8s_upgrade_job_ns" {
|
||||||
resource_names = ["k8s-upgrade-creds"]
|
resource_names = ["k8s-upgrade-creds"]
|
||||||
verbs = ["get"]
|
verbs = ["get"]
|
||||||
}
|
}
|
||||||
|
# Kill-switch ConfigMap. Existence halts the chain (any phase) — see the
|
||||||
|
# "Kill-switch" block at the top of scripts/upgrade-step.sh.
|
||||||
|
rule {
|
||||||
|
api_groups = [""]
|
||||||
|
resources = ["configmaps"]
|
||||||
|
resource_names = ["k8s-upgrade-killswitch"]
|
||||||
|
verbs = ["get", "list", "watch"]
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
resource "kubernetes_role_binding" "k8s_upgrade_job_ns" {
|
resource "kubernetes_role_binding" "k8s_upgrade_job_ns" {
|
||||||
|
|
@ -340,6 +349,16 @@ resource "kubernetes_cron_job_v1" "k8s_version_check" {
|
||||||
"$SLACK" || true
|
"$SLACK" || true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Kill-switch — see scripts/upgrade-step.sh for full docs.
|
||||||
|
# ConfigMap existence halts the chain (any phase).
|
||||||
|
if /usr/local/bin/kubectl -n k8s-upgrade get configmap k8s-upgrade-killswitch >/dev/null 2>&1; then
|
||||||
|
reason=$(/usr/local/bin/kubectl -n k8s-upgrade get configmap k8s-upgrade-killswitch \
|
||||||
|
-o jsonpath='{.data.reason}' 2>/dev/null || echo "(no reason set)")
|
||||||
|
slack "version-check HALTED by kill-switch: $reason"
|
||||||
|
echo "HALTED. Resume: kubectl -n k8s-upgrade delete cm k8s-upgrade-killswitch"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
# 1. Detect running version
|
# 1. Detect running version
|
||||||
RUNNING=$(/usr/local/bin/kubectl get nodes \
|
RUNNING=$(/usr/local/bin/kubectl get nodes \
|
||||||
-o jsonpath='{.items[0].status.nodeInfo.kubeletVersion}' | tr -d v)
|
-o jsonpath='{.items[0].status.nodeInfo.kubeletVersion}' | tr -d v)
|
||||||
|
|
|
||||||
|
|
@ -67,6 +67,26 @@ slack() {
|
||||||
"$SLACK_URL" >/dev/null || echo "warn: slack post failed"
|
"$SLACK_URL" >/dev/null || echo "warn: slack post failed"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Kill-switch — checked before every phase. If the ConfigMap
|
||||||
|
# `k8s-upgrade-killswitch` exists in the `k8s-upgrade` namespace, the chain
|
||||||
|
# halts immediately (exit 0, not 1 — this is an intentional pause, not a
|
||||||
|
# failure). Resume via `kubectl -n k8s-upgrade delete cm k8s-upgrade-killswitch`.
|
||||||
|
# Designed for "stop the storm" scenarios: emergency-press the brake from
|
||||||
|
# any kubectl session in <5 seconds, no script changes needed.
|
||||||
|
#
|
||||||
|
# Create: kubectl -n k8s-upgrade create configmap k8s-upgrade-killswitch \
|
||||||
|
# --from-literal=reason="why you stopped it"
|
||||||
|
# Inspect: kubectl -n k8s-upgrade get cm k8s-upgrade-killswitch -o yaml
|
||||||
|
# Resume: kubectl -n k8s-upgrade delete cm k8s-upgrade-killswitch
|
||||||
|
if $KUBECTL -n "$NS" get configmap k8s-upgrade-killswitch >/dev/null 2>&1; then
|
||||||
|
reason=$($KUBECTL -n "$NS" get configmap k8s-upgrade-killswitch \
|
||||||
|
-o jsonpath='{.data.reason}' 2>/dev/null || echo "(no reason set)")
|
||||||
|
slack "HALTED by kill-switch (phase=$PHASE target_node=${TARGET_NODE:-none}): $reason"
|
||||||
|
echo "HALTED by k8s-upgrade-killswitch ConfigMap. Reason: $reason"
|
||||||
|
echo "Resume: kubectl -n $NS delete cm k8s-upgrade-killswitch"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
push() {
|
push() {
|
||||||
printf '# TYPE %s gauge\n%s %s\n' "$1" "$1" "$2" \
|
printf '# TYPE %s gauge\n%s %s\n' "$1" "$1" "$2" \
|
||||||
| curl -sS --data-binary @- "$PG" || echo "warn: pushgateway push failed"
|
| curl -sS --data-binary @- "$PG" || echo "warn: pushgateway push failed"
|
||||||
|
|
@ -230,28 +250,33 @@ phase_preflight() {
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# 2. Halt-on-alert
|
# 2. Halt-on-alert. RecentNodeReboot is fully redundant with check 3
|
||||||
|
# (inline quiet-baseline) below — both surface "a node rebooted recently".
|
||||||
|
# Including it here meant the chain refused to start for 1h after EVERY
|
||||||
|
# kured reboot of any node (kured fires whenever /var/run/reboot-required
|
||||||
|
# is set, often daily). Now skipped — check 3 is the single source of truth
|
||||||
|
# for "is the cluster quiet enough to upgrade".
|
||||||
local alerts
|
local alerts
|
||||||
alerts=$(halt_on_alert_query)
|
alerts=$(halt_on_alert_query RecentNodeReboot)
|
||||||
if [ -n "$alerts" ]; then
|
if [ -n "$alerts" ]; then
|
||||||
slack "ABORT preflight — firing alerts:\n$alerts"
|
slack "ABORT preflight — firing alerts:\n$alerts"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# 3. Quiet-baseline check — fail if any node had a Ready transition in the
|
# 3. Quiet-baseline check — fail if any node had a Ready transition in the
|
||||||
# last hour. Threshold matches the RecentNodeReboot alert (3600s) — the
|
# last 10 min. Tightened from 3600s → 600s on 2026-05-21 after diagnosing
|
||||||
# 24h-between-cluster-reboots protection lives in kured-sentinel-gate
|
# that the previous 1h window meant the chain couldn't run after any
|
||||||
# Check 4, not here. Tightened from 86400 → 3600 on 2026-05-17; with the
|
# reboot for an hour. 10min is sufficient for kubelet/control-plane to
|
||||||
# alert clearing in 1h, this duplicate gate was the actual blocker for
|
# stabilise; the kured-sentinel-gate DaemonSet enforces the broader
|
||||||
# the chain after a session of manual reboots.
|
# 24h-between-cluster-reboots invariant.
|
||||||
local recent=0
|
local recent=0
|
||||||
while IFS= read -r ts; do
|
while IFS= read -r ts; do
|
||||||
[ -z "$ts" ] && continue
|
[ -z "$ts" ] && continue
|
||||||
local diff=$(( $(date +%s) - $(date -d "$ts" +%s) ))
|
local diff=$(( $(date +%s) - $(date -d "$ts" +%s) ))
|
||||||
if [ "$diff" -lt 3600 ]; then recent=1; break; fi
|
if [ "$diff" -lt 600 ]; then recent=1; break; fi
|
||||||
done < <($KUBECTL get nodes -o jsonpath='{range .items[*]}{range .status.conditions[?(@.type=="Ready")]}{.lastTransitionTime}{"\n"}{end}{end}')
|
done < <($KUBECTL get nodes -o jsonpath='{range .items[*]}{range .status.conditions[?(@.type=="Ready")]}{.lastTransitionTime}{"\n"}{end}{end}')
|
||||||
if [ "$recent" -eq 1 ]; then
|
if [ "$recent" -eq 1 ]; then
|
||||||
slack "ABORT preflight — node transitioned Ready <1h ago (settle window)"
|
slack "ABORT preflight — node transitioned Ready <10min ago (settle window)"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|
@ -334,9 +359,11 @@ phase_preflight() {
|
||||||
phase_master() {
|
phase_master() {
|
||||||
slack "Draining k8s-master"
|
slack "Draining k8s-master"
|
||||||
|
|
||||||
# Re-check halt-on-alert before drain
|
# Re-check halt-on-alert before drain. Always ignore RecentNodeReboot —
|
||||||
|
# the chain itself causes node reboots, so this alert firing is expected
|
||||||
|
# mid-chain (e.g. master was already upgraded+rebooted before this phase).
|
||||||
local alerts
|
local alerts
|
||||||
alerts=$(halt_on_alert_query)
|
alerts=$(halt_on_alert_query RecentNodeReboot)
|
||||||
[ -n "$alerts" ] && { slack "ABORT master — alerts firing pre-drain: $alerts"; exit 1; }
|
[ -n "$alerts" ] && { slack "ABORT master — alerts firing pre-drain: $alerts"; exit 1; }
|
||||||
|
|
||||||
drain_node k8s-master
|
drain_node k8s-master
|
||||||
|
|
@ -370,10 +397,11 @@ phase_worker() {
|
||||||
[ -z "$TARGET_NODE" ] && { echo "ERROR: worker phase requires TARGET_NODE"; exit 2; }
|
[ -z "$TARGET_NODE" ] && { echo "ERROR: worker phase requires TARGET_NODE"; exit 2; }
|
||||||
slack "Draining $TARGET_NODE"
|
slack "Draining $TARGET_NODE"
|
||||||
|
|
||||||
# Halt-on-alert wait (up to 30 min)
|
# Halt-on-alert wait (up to 30 min). Ignore RecentNodeReboot — the chain
|
||||||
|
# just rebooted a node, that's the cause and is expected.
|
||||||
local attempt alerts
|
local attempt alerts
|
||||||
for attempt in $(seq 1 30); do
|
for attempt in $(seq 1 30); do
|
||||||
alerts=$(halt_on_alert_query)
|
alerts=$(halt_on_alert_query RecentNodeReboot)
|
||||||
[ -z "$alerts" ] && break
|
[ -z "$alerts" ] && break
|
||||||
echo "Waiting for alerts to clear (attempt $attempt/30): $alerts"
|
echo "Waiting for alerts to clear (attempt $attempt/30): $alerts"
|
||||||
sleep 60
|
sleep 60
|
||||||
|
|
@ -427,9 +455,10 @@ phase_postflight() {
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# No alerts firing
|
# No alerts firing. Ignore RecentNodeReboot — by definition we just
|
||||||
|
# rebooted every node; this alert clears naturally in <1h.
|
||||||
local alerts
|
local alerts
|
||||||
alerts=$(halt_on_alert_query)
|
alerts=$(halt_on_alert_query RecentNodeReboot)
|
||||||
[ -n "$alerts" ] && slack "Postflight WARN — alerts still firing (cluster on target, please check):\n$alerts"
|
[ -n "$alerts" ] && slack "Postflight WARN — alerts still firing (cluster on target, please check):\n$alerts"
|
||||||
|
|
||||||
# Pod-ready ratio
|
# Pod-ready ratio
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue