k8s-version-upgrade: ignore IngressTTFBCritical in halt-on-alert check
The Synology DSM (port 5001) ingress chronically trips IngressTTFBCritical because of NAS-side latency that is unrelated to k8s upgrades. The chain was halting indefinitely waiting for it to clear. Add it alongside RecentNodeReboot to the per-call ignore regex so the chain can proceed autonomously without manual silences. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
447bfef507
commit
467460cccd
1 changed files with 7 additions and 7 deletions
|
|
@ -107,7 +107,7 @@ halt_on_alert_query() {
|
|||
# mid-chain (apiserver down, etcd down, node not ready, etc.).
|
||||
#
|
||||
# `extra_ignore` is now mostly historical — kept for backwards compat with
|
||||
# `halt_on_alert_query RecentNodeReboot`-style calls. With severity-based
|
||||
# `halt_on_alert_query "RecentNodeReboot|IngressTTFBCritical"`-style calls. With severity-based
|
||||
# filtering, RecentNodeReboot (severity=info) is filtered automatically.
|
||||
# We still build the regex for any critical alert the caller wants to
|
||||
# explicitly ignore (e.g. a known-broken thing we're aware of).
|
||||
|
|
@ -280,7 +280,7 @@ phase_preflight() {
|
|||
# is set, often daily). Now skipped — check 3 is the single source of truth
|
||||
# for "is the cluster quiet enough to upgrade".
|
||||
local alerts
|
||||
alerts=$(halt_on_alert_query RecentNodeReboot)
|
||||
alerts=$(halt_on_alert_query "RecentNodeReboot|IngressTTFBCritical")
|
||||
if [ -n "$alerts" ]; then
|
||||
slack "ABORT preflight — firing alerts:\n$alerts"
|
||||
exit 1
|
||||
|
|
@ -398,7 +398,7 @@ phase_master() {
|
|||
# the chain itself causes node reboots, so this alert firing is expected
|
||||
# mid-chain (e.g. master was already upgraded+rebooted before this phase).
|
||||
local alerts
|
||||
alerts=$(halt_on_alert_query RecentNodeReboot)
|
||||
alerts=$(halt_on_alert_query "RecentNodeReboot|IngressTTFBCritical")
|
||||
[ -n "$alerts" ] && { slack "ABORT master — alerts firing pre-drain: $alerts"; exit 1; }
|
||||
|
||||
# Quiesce noisy operators that crashloop when apiserver briefly disappears
|
||||
|
|
@ -441,7 +441,7 @@ phase_master() {
|
|||
exit 1
|
||||
fi
|
||||
|
||||
alerts=$(halt_on_alert_query RecentNodeReboot)
|
||||
alerts=$(halt_on_alert_query "RecentNodeReboot|IngressTTFBCritical")
|
||||
[ -n "$alerts" ] && { slack "ABORT master — alerts firing post-upgrade: $alerts"; exit 1; }
|
||||
|
||||
# Restore tigera-operator (quiesced before drain). It reconciles in seconds.
|
||||
|
|
@ -471,7 +471,7 @@ phase_worker() {
|
|||
# just rebooted a node, that's the cause and is expected.
|
||||
local attempt alerts
|
||||
for attempt in $(seq 1 30); do
|
||||
alerts=$(halt_on_alert_query RecentNodeReboot)
|
||||
alerts=$(halt_on_alert_query "RecentNodeReboot|IngressTTFBCritical")
|
||||
[ -z "$alerts" ] && break
|
||||
echo "Waiting for alerts to clear (attempt $attempt/30): $alerts"
|
||||
sleep 60
|
||||
|
|
@ -502,7 +502,7 @@ phase_worker() {
|
|||
# 10-min soak with halt-on-alert (RecentNodeReboot ignored — we know we restarted it)
|
||||
echo "Soaking $TARGET_NODE for 10 min..."
|
||||
for i in $(seq 1 10); do
|
||||
alerts=$(halt_on_alert_query RecentNodeReboot)
|
||||
alerts=$(halt_on_alert_query "RecentNodeReboot|IngressTTFBCritical")
|
||||
[ -n "$alerts" ] && { slack "ABORT $TARGET_NODE mid-soak — alerts: $alerts"; exit 1; }
|
||||
sleep 60
|
||||
done
|
||||
|
|
@ -528,7 +528,7 @@ phase_postflight() {
|
|||
# No alerts firing. Ignore RecentNodeReboot — by definition we just
|
||||
# rebooted every node; this alert clears naturally in <1h.
|
||||
local alerts
|
||||
alerts=$(halt_on_alert_query RecentNodeReboot)
|
||||
alerts=$(halt_on_alert_query "RecentNodeReboot|IngressTTFBCritical")
|
||||
[ -n "$alerts" ] && slack "Postflight WARN — alerts still firing (cluster on target, please check):\n$alerts"
|
||||
|
||||
# Pod-ready ratio
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue