k8s-version-upgrade: ignore IngressTTFBCritical in halt-on-alert check

The Synology DSM (port 5001) ingress chronically trips IngressTTFBCritical
because of NAS-side latency that is unrelated to k8s upgrades. The chain
was halting indefinitely waiting for it to clear. Add it alongside
RecentNodeReboot to the per-call ignore regex so the chain can proceed
autonomously without manual silences.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Viktor Barzin 2026-05-23 20:17:31 +00:00
parent 447bfef507
commit 467460cccd

View file

@@ -107,7 +107,7 @@ halt_on_alert_query() {
# mid-chain (apiserver down, etcd down, node not ready, etc.). # mid-chain (apiserver down, etcd down, node not ready, etc.).
# #
# `extra_ignore` is now mostly historical — kept for backwards compat with # `extra_ignore` is now mostly historical — kept for backwards compat with
# `halt_on_alert_query RecentNodeReboot`-style calls. With severity-based # `halt_on_alert_query "RecentNodeReboot|IngressTTFBCritical"`-style calls. With severity-based
# filtering, RecentNodeReboot (severity=info) is filtered automatically. # filtering, RecentNodeReboot (severity=info) is filtered automatically.
# We still build the regex for any critical alert the caller wants to # We still build the regex for any critical alert the caller wants to
# explicitly ignore (e.g. a known-broken thing we're aware of). # explicitly ignore (e.g. a known-broken thing we're aware of).
@@ -280,7 +280,7 @@ phase_preflight() {
# is set, often daily). Now skipped — check 3 is the single source of truth # is set, often daily). Now skipped — check 3 is the single source of truth
# for "is the cluster quiet enough to upgrade". # for "is the cluster quiet enough to upgrade".
local alerts local alerts
alerts=$(halt_on_alert_query RecentNodeReboot) alerts=$(halt_on_alert_query "RecentNodeReboot|IngressTTFBCritical")
if [ -n "$alerts" ]; then if [ -n "$alerts" ]; then
slack "ABORT preflight — firing alerts:\n$alerts" slack "ABORT preflight — firing alerts:\n$alerts"
exit 1 exit 1
@@ -398,7 +398,7 @@ phase_master() {
# the chain itself causes node reboots, so this alert firing is expected # the chain itself causes node reboots, so this alert firing is expected
# mid-chain (e.g. master was already upgraded+rebooted before this phase). # mid-chain (e.g. master was already upgraded+rebooted before this phase).
local alerts local alerts
alerts=$(halt_on_alert_query RecentNodeReboot) alerts=$(halt_on_alert_query "RecentNodeReboot|IngressTTFBCritical")
[ -n "$alerts" ] && { slack "ABORT master — alerts firing pre-drain: $alerts"; exit 1; } [ -n "$alerts" ] && { slack "ABORT master — alerts firing pre-drain: $alerts"; exit 1; }
# Quiesce noisy operators that crashloop when apiserver briefly disappears # Quiesce noisy operators that crashloop when apiserver briefly disappears
@@ -441,7 +441,7 @@ phase_master() {
exit 1 exit 1
fi fi
alerts=$(halt_on_alert_query RecentNodeReboot) alerts=$(halt_on_alert_query "RecentNodeReboot|IngressTTFBCritical")
[ -n "$alerts" ] && { slack "ABORT master — alerts firing post-upgrade: $alerts"; exit 1; } [ -n "$alerts" ] && { slack "ABORT master — alerts firing post-upgrade: $alerts"; exit 1; }
# Restore tigera-operator (quiesced before drain). It reconciles in seconds. # Restore tigera-operator (quiesced before drain). It reconciles in seconds.
@@ -471,7 +471,7 @@ phase_worker() {
# just rebooted a node, that's the cause and is expected. # just rebooted a node, that's the cause and is expected.
local attempt alerts local attempt alerts
for attempt in $(seq 1 30); do for attempt in $(seq 1 30); do
alerts=$(halt_on_alert_query RecentNodeReboot) alerts=$(halt_on_alert_query "RecentNodeReboot|IngressTTFBCritical")
[ -z "$alerts" ] && break [ -z "$alerts" ] && break
echo "Waiting for alerts to clear (attempt $attempt/30): $alerts" echo "Waiting for alerts to clear (attempt $attempt/30): $alerts"
sleep 60 sleep 60
@@ -502,7 +502,7 @@ phase_worker() {
# 10-min soak with halt-on-alert (RecentNodeReboot ignored — we know we restarted it) # 10-min soak with halt-on-alert (RecentNodeReboot ignored — we know we restarted it)
echo "Soaking $TARGET_NODE for 10 min..." echo "Soaking $TARGET_NODE for 10 min..."
for i in $(seq 1 10); do for i in $(seq 1 10); do
alerts=$(halt_on_alert_query RecentNodeReboot) alerts=$(halt_on_alert_query "RecentNodeReboot|IngressTTFBCritical")
[ -n "$alerts" ] && { slack "ABORT $TARGET_NODE mid-soak — alerts: $alerts"; exit 1; } [ -n "$alerts" ] && { slack "ABORT $TARGET_NODE mid-soak — alerts: $alerts"; exit 1; }
sleep 60 sleep 60
done done
@@ -528,7 +528,7 @@ phase_postflight() {
# No alerts firing. Ignore RecentNodeReboot — by definition we just # No alerts firing. Ignore RecentNodeReboot — by definition we just
# rebooted every node; this alert clears naturally in <1h. # rebooted every node; this alert clears naturally in <1h.
local alerts local alerts
alerts=$(halt_on_alert_query RecentNodeReboot) alerts=$(halt_on_alert_query "RecentNodeReboot|IngressTTFBCritical")
[ -n "$alerts" ] && slack "Postflight WARN — alerts still firing (cluster on target, please check):\n$alerts" [ -n "$alerts" ] && slack "Postflight WARN — alerts still firing (cluster on target, please check):\n$alerts"
# Pod-ready ratio # Pod-ready ratio