Keel 1.2.0 registers a Slack socket-mode bot whenever SLACK_BOT_TOKEN is
set, then fails because we don't supply an `xapp-` app-level token:
bot.slack.Configure(): SLACK_APP_TOKEN must have the prefix "xapp-".
bot.Run(): can not get configuration for bot [slack]
We don't want the interactive bot — opt-out auto-update + no approval flow
(see stacks/keel/main.tf comment). The Slack NOTIFICATION sender works
independently and continues posting rollout messages to #general fine.
But /upgrade-state's broad `grep level=error` was counting these as real
errors → ⚠ on the Apps row every run. Add a small skip-pattern list so the
two recurring benign lines drop out; any new genuine Keel error still
shows. The filter matches `bot.Run(): can not get configuration for bot [slack]`
and `SLACK_APP_TOKEN must have the (previf|prefix)` (the misspelled `previf`
variant of Keel's log message is kept as an alternation).
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
601 lines
24 KiB
Bash
Executable file
601 lines
24 KiB
Bash
Executable file
#!/usr/bin/env bash
|
|
#
|
|
# upgrade_state.sh — survey the three autonomous-upgrade pipelines.
|
|
#
|
|
# Companion to cluster_healthcheck.sh, surfaced via the /upgrade-state skill.
|
|
# Read-only by design — no --fix.
|
|
#
|
|
# The three pipelines:
|
|
# 1. Apps — Keel polls registries hourly and rolls Deployments tagged
|
|
# keel.sh/policy. Metrics on container :9300/metrics.
|
|
# 2. OS — unattended-upgrades patches in-release per node; kured
|
|
# reboots within a daily 02:00-06:00 London window.
|
|
# 3. K8s — k8s-version-check CronJob (daily 12:00 UTC) detects new
|
|
# kubeadm patch/minor releases; Job-chain drains+upgrades
|
|
# node-by-node. Pushgateway holds k8s_upgrade_* gauges.
|
|
#
|
|
# Exit codes: 0 healthy, 1 attention warranted, 2 something stalled.
|
|
|
|
set -e -u -o pipefail

# --- Colors ---
# ANSI escape sequences stored as literal backslash text; they only become
# real escapes when expanded inside a printf format string or by `echo -e`.
BOLD='\033[1m'
NC='\033[0m'
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
BLUE='\033[0;34m'

# --- Globals ---
JSON=false       # flipped to true by --json
KUBECTL=""       # full kubectl command prefix, assembled by parse_args
HIGHEST_EXIT=0   # 0 healthy, 1 attention, 2 stalled
NOW_EPOCH=$(date -u +%s)

# Kubeconfig: $KUBECONFIG if set, else ~/.kube/config; when the resolved path
# is not an existing file, fall back to the infra checkout's config.
KUBECONFIG_PATH="${KUBECONFIG:-${HOME}/.kube/config}"
if [[ ! -f "$KUBECONFIG_PATH" ]]; then
  KUBECONFIG_PATH="/home/wizard/code/infra/config"
fi

# Cluster nodes as name:ip pairs (split on ':' by collect_os).
NODES=(
  k8s-master:10.0.20.100
  k8s-node1:10.0.20.101
  k8s-node2:10.0.20.102
  k8s-node3:10.0.20.103
  k8s-node4:10.0.20.104
)

# Options passed to every ssh invocation made through ssh_node.
SSH_OPTS=(
  -o BatchMode=yes
  -o ConnectTimeout=5
  -o StrictHostKeyChecking=no
)

# Results — collectors fill these.
# Apps (Keel)
APPS_STATUS_ICON=""
APPS_STATUS_TEXT=""
APPS_LAST_CHECK=""
APPS_NEXT=""
APPS_NOTES=""
APPS_ENROLLED=0
APPS_PENDING=0
APPS_UPDATES_LINE=""
APPS_ERROR_LINE=""

# OS (apt + kured)
OS_STATUS_ICON=""
OS_STATUS_TEXT=""
OS_LAST_CHECK=""
OS_NEXT=""
OS_NOTES=""
OS_DISTRO_SUMMARY=""
OS_KERNEL_SUMMARY=""
OS_PENDING_REBOOT_NODES=""
OS_HELD_DETAIL=""
OS_LAST_UU=""
OS_LAST_KURED=""

# K8s (kubeadm/kubelet/kubectl)
K8S_STATUS_ICON=""
K8S_STATUS_TEXT=""
K8S_LAST_CHECK=""
K8S_NEXT=""
K8S_NOTES=""
K8S_RUNNING=""
K8S_PATCH=""
K8S_MINOR=""
K8S_LAST_DETECT_LINE=""
K8S_IN_FLIGHT="no"
K8S_LAST_CHAIN=""
|
|
|
|
# --- Helpers ---
|
|
# Print the arguments via `echo -e` (backslash escapes are interpreted);
# a silent no-op returning 0 when JSON output was requested.
log() {
  case "$JSON" in
    true) return 0 ;;
  esac
  echo -e "$*"
}
|
|
|
|
# Ratchet the script-wide exit status upward.
# Arguments: $1 - candidate exit code (integer)
# Globals:   HIGHEST_EXIT is overwritten only when $1 is strictly greater.
# Returns:   always 0, so callers are safe under `set -e`.
raise_exit() {
  local candidate="$1"
  [[ "$candidate" -gt "$HIGHEST_EXIT" ]] && HIGHEST_EXIT="$candidate"
  return 0
}
|
|
|
|
# Write the command-line help text to stdout. $0 is expanded so the synopsis
# shows the name the script was invoked with.
usage() {
  local -a help_lines=(
    "Usage: $0 [--json] [--kubeconfig <path>]"
    ""
    "Read-only audit of the three autonomous-upgrade pipelines (apps, OS, k8s)."
    ""
    "--json machine-readable JSON"
    "--kubeconfig PATH override kubeconfig"
    ""
    "Exit codes: 0 healthy, 1 attention warranted, 2 something stalled."
  )
  printf '%s\n' "${help_lines[@]}"
}
|
|
|
|
# Parse command-line flags and assemble the kubectl command prefix.
# Arguments: the script's "$@"
# Globals:   JSON (set to true by --json)
#            KUBECONFIG_PATH (overridden by --kubeconfig PATH)
#            KUBECTL (always rewritten: "kubectl --kubeconfig <path>")
# Exits:     0 after printing help (-h/--help); 1 on an unknown option or
#            when --kubeconfig is given without a path.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --json)
        JSON=true
        shift
        ;;
      --kubeconfig)
        # Without this guard a trailing `--kubeconfig` dies with a cryptic
        # "unbound variable" under `set -u` (and `shift 2` cannot succeed).
        if [[ $# -lt 2 || -z "${2:-}" ]]; then
          echo "Option --kubeconfig requires a path argument" >&2
          exit 1
        fi
        KUBECONFIG_PATH="$2"
        shift 2
        ;;
      -h|--help)
        usage
        exit 0
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done
  # Kept as a plain string because callers expand $KUBECTL unquoted.
  KUBECTL="kubectl --kubeconfig $KUBECONFIG_PATH"
}
|
|
|
|
# Prometheus query — Prometheus + reload + backup share a network namespace,
# so reaching localhost:9090 works from any of the three sidecars.
#
# Arguments: $1 - PromQL expression (raw, unencoded)
# Outputs:   whatever the instant-query endpoint returns, on stdout; nothing
#            when the exec or the HTTP request fails (best-effort, always
#            returns 0).
prom_q() {
  local q="$1" encoded

  # PromQL contains spaces, braces, quotes and '=' — none of which may appear
  # raw in a URL query string. Percent-encode everything outside the RFC 3986
  # unreserved set so the request stays well-formed even with an HTTP client
  # that does not escape for us.
  encoded=$(
    LC_ALL=C   # walk the string byte-by-byte so multi-byte UTF-8 encodes correctly
    for (( i = 0; i < ${#q}; i++ )); do
      ch="${q:i:1}"
      case "$ch" in
        [a-zA-Z0-9.~_-]) printf '%s' "$ch" ;;
        *) printf '%%%02X' "'$ch" ;;
      esac
    done
  )

  $KUBECTL -n monitoring exec deploy/prometheus-server -c prometheus-server -- \
    wget -qO- "http://localhost:9090/api/v1/query?query=${encoded}" 2>/dev/null || true
}
|
|
|
|
# Fetch the Pushgateway's /metrics page by running wget inside the
# prometheus-server container. Best-effort: a failed exec or request yields
# no output and the function still returns 0.
pg_metrics() {
  local url="http://prometheus-prometheus-pushgateway:9091/metrics"
  local -a fetch=(wget -qO- "$url")
  if ! $KUBECTL -n monitoring exec deploy/prometheus-server -c prometheus-server \
      -- "${fetch[@]}" 2>/dev/null; then
    return 0
  fi
}
|
|
|
|
# Run a command on a node over SSH as user `wizard`, using the shared
# SSH_OPTS. stderr is discarded and failures are swallowed (always returns
# 0), so an unreachable node simply produces empty output.
# Arguments: $1 - node IP/host; remaining args - remote command
ssh_node() {
  local target="wizard@$1"
  shift
  if ! ssh "${SSH_OPTS[@]}" "$target" "$@" 2>/dev/null; then
    return 0
  fi
}
|
|
|
|
# Format an age in seconds as a coarse "<n><unit> ago" string, using the
# largest unit that fits (s, m, h, d) with integer division. No trailing
# newline is printed.
# Arguments: $1 - age in whole seconds
human_age() {
  local secs="$1"
  local amount="$secs" unit="s"
  if [[ "$secs" -ge 86400 ]]; then
    amount=$((secs / 86400)); unit="d"
  elif [[ "$secs" -ge 3600 ]]; then
    amount=$((secs / 3600)); unit="h"
  elif [[ "$secs" -ge 60 ]]; then
    amount=$((secs / 60)); unit="m"
  fi
  printf '%d%s ago' "$amount" "$unit"
}
|
|
|
|
# Coerce a Pushgateway sample value to integer epoch seconds. The values
# arrive as floats, sometimes in scientific notation, so the conversion is
# delegated to python3 (float -> int truncation).
# Arguments: $1 - raw metric value (may be empty or missing)
# Outputs:   the integer on stdout; 0 when the input is empty, exactly "0",
#            or cannot be converted.
to_epoch_int() {
  local raw="${1:-}" converted
  case "$raw" in
    ""|0)
      echo 0
      return
      ;;
  esac
  if converted=$(python3 -c "import sys; v=sys.argv[1]; print(int(float(v)))" "$raw" 2>/dev/null); then
    printf '%s\n' "$converted"
  else
    echo 0
  fi
}
|
|
|
|
# --- 1. Apps (Keel) ---
|
|
# Survey the Keel (apps) pipeline and fill the APPS_* result globals.
# Globals read:    KUBECTL
# Globals written: APPS_ENROLLED, APPS_PENDING, APPS_LAST_CHECK,
#                  APPS_UPDATES_LINE, APPS_ERROR_LINE, APPS_STATUS_ICON,
#                  APPS_STATUS_TEXT, APPS_NOTES, APPS_NEXT
# Side effects:    raise_exit 2 when no Keel pod is Running; raise_exit 1
#                  when approvals are pending or non-benign errors were logged.
collect_apps() {
  local pending tracked enrolled updates_24h error_count all_errors

  # Shared Python filter for the scalar PromQL lookups below: prints the first
  # sample of an instant-query response as an integer, or 0 when the result
  # set is empty or the payload cannot be parsed.
  local scalar_py='
import json, sys
try:
    r = json.load(sys.stdin)["data"]["result"]
    print(int(float(r[0]["value"][1])) if r else 0)
except Exception:
    print(0)
'

  # Enrolled: count Deployments with keel.sh/policy != never (Keel itself
  # is policy=never). The Kyverno auto-injection labels namespaces
  # keel.sh/enrolled=true, but the annotation is what Keel watches.
  enrolled=$($KUBECTL get deploy -A -o json 2>/dev/null | python3 -c '
import json, sys
data = json.load(sys.stdin)
n = sum(1 for d in data["items"]
        if (d["metadata"].get("annotations") or {}).get("keel.sh/policy", "never") != "never")
print(n)
' 2>/dev/null || echo 0)
  APPS_ENROLLED="$enrolled"

  # Pending approvals (sum across Keel pods).
  pending=$(prom_q 'sum(pending_approvals)' | python3 -c "$scalar_py" 2>/dev/null || echo 0)
  APPS_PENDING="$pending"

  # Tracked images — proxy for "is the scrape live?".
  tracked=$(prom_q 'count(count by (image) (registries_scanned_total))' \
    | python3 -c "$scalar_py" 2>/dev/null || echo 0)

  # Last scrape age — `up{job="kubernetes-pods", app="keel"}` is 1 if the
  # most recent scrape succeeded. We surface the wallclock age via a tiny
  # `time() - timestamp(up{...})` query.
  APPS_LAST_CHECK=$(prom_q 'time()-timestamp(up{job="kubernetes-pods",app="keel"})' | python3 -c '
import json, sys
try:
    r = json.load(sys.stdin)["data"]["result"]
    if not r:
        print("scrape not live")
    else:
        secs = int(float(r[0]["value"][1]))
        if secs < 60:
            print(f"{secs}s ago")
        elif secs < 3600:
            print(f"{secs//60}m ago")
        else:
            print(f"{secs//3600}h ago")
except Exception:
    print("?")
' 2>/dev/null || echo "?")

  # Recent updates: count lines in Keel logs that report a successful
  # rollout. Keel logs an "update completed" message per rollout.
  local log_24h
  log_24h=$($KUBECTL -n keel logs deploy/keel --since=24h --tail=2000 2>/dev/null || true)
  # grep -c exits 1 on zero matches (while still printing 0), hence `|| true`.
  updates_24h=$(echo "$log_24h" \
    | grep -cE 'update completed|successfully updated|deployment updated' 2>/dev/null || true)
  updates_24h="${updates_24h:-0}"
  APPS_UPDATES_LINE="$updates_24h in last 24h (tracked images: $tracked)"

  # Known-benign Keel error patterns to suppress. Each is a real error
  # line Keel emits, but the surrounding behaviour is fine, so flagging
  # them in /upgrade-state is just noise.
  #   - `bot.Run(): can not get configuration for bot [slack]` — Keel
  #     1.2.0 registers a Slack socket-mode bot whenever SLACK_BOT_TOKEN
  #     is set, then fails because we don't supply an `xapp-` app-level
  #     token. We don't want the interactive bot (no approvals; opt-out
  #     auto-update). The Slack NOTIFICATION sender works independently
  #     of the bot, so rollout messages still post to #general.
  local benign_re='bot\.Run\(\): can not get configuration for bot \[slack\]|SLACK_APP_TOKEN must have the (previf|prefix)'

  # Keep every non-benign error line and count them BEFORE trimming, so the
  # reported number is the real total within the fetched log window rather
  # than a value capped by `tail`, and so "no errors" counts as 0 (piping an
  # empty string through `wc -l` reports 1).
  all_errors=$(echo "$log_24h" \
    | grep -iE '"level":"(error|fatal)"|level=error' \
    | grep -vE "$benign_re" || true)
  if [[ -z "$all_errors" ]]; then
    error_count=0
    APPS_ERROR_LINE="(none in last 24h)"
  else
    error_count=$(printf '%s\n' "$all_errors" | wc -l | tr -d ' ')
    APPS_ERROR_LINE="$error_count error(s); newest: $(printf '%s\n' "$all_errors" | tail -1 | cut -c1-120)"
  fi

  # Keel pod state.
  local pod_status
  pod_status=$($KUBECTL -n keel get pods -l app=keel -o jsonpath='{.items[*].status.phase}' 2>/dev/null || true)

  if [[ "$pod_status" != *"Running"* ]]; then
    APPS_STATUS_ICON="✗"; APPS_STATUS_TEXT="down"
    APPS_NOTES="Keel pod not Running ($pod_status)"
    raise_exit 2
  elif [[ "$pending" -gt 0 || "$error_count" -gt 0 ]]; then
    APPS_STATUS_ICON="⚠"; APPS_STATUS_TEXT="attn"
    APPS_NOTES="$enrolled enrolled; $pending pending; $error_count recent error(s)"
    raise_exit 1
  else
    APPS_STATUS_ICON="✓"; APPS_STATUS_TEXT="healthy"
    APPS_NOTES="$enrolled enrolled, 0 pending, 0 errors"
  fi

  APPS_NEXT="rolling, hourly poll"
}
|
|
|
|
# --- 2. OS (apt + kured) ---
|
|
# Survey the OS patching pipeline (apt/unattended-upgrades + kured) and fill
# the OS_* result globals.
# Globals read:    KUBECTL, NODES (name:ip entries), NOW_EPOCH
# Globals written: OS_DISTRO_SUMMARY, OS_KERNEL_SUMMARY,
#                  OS_PENDING_REBOOT_NODES, OS_HELD_DETAIL, OS_LAST_UU,
#                  OS_LAST_CHECK, OS_LAST_KURED, OS_NEXT, OS_STATUS_ICON,
#                  OS_STATUS_TEXT, OS_NOTES
# Side effects:    SSHes to every node in parallel (via ssh_node); raise_exit 2
#                  when a kured pod is not Running, raise_exit 1 when there
#                  are held packages with upgrades or pending reboots.
collect_os() {
  local distros kernels distro_uniq kernel_uniq
  # `|| true`: a failing kubectl must degrade to empty output, not abort the
  # whole script under `set -e` with stderr already discarded.
  distros=$($KUBECTL get nodes -o jsonpath='{range .items[*]}{.status.nodeInfo.osImage}{"\n"}{end}' 2>/dev/null || true)
  kernels=$($KUBECTL get nodes -o jsonpath='{range .items[*]}{.status.nodeInfo.kernelVersion}{"\n"}{end}' 2>/dev/null || true)
  distro_uniq=$(echo "$distros" | sort -u | tr '\n' ',' | sed 's/,$//; s/,/, /g')
  kernel_uniq=$(echo "$kernels" | sort -u | tr '\n' ',' | sed 's/,$//; s/,/, /g')
  OS_DISTRO_SUMMARY="$distro_uniq"
  OS_KERNEL_SUMMARY="$kernel_uniq"

  # SSH fan-out — parallel background subshells, write per-node results to tmp files.
  local tmpdir; tmpdir=$(mktemp -d)
  # Clean up when this function returns, then uninstall the trap so it does
  # not linger after $tmpdir has gone out of scope.
  trap 'rm -rf -- "$tmpdir"; trap - RETURN' RETURN
  local entry name ip
  for entry in "${NODES[@]}"; do
    name="${entry%%:*}"; ip="${entry##*:}"
    (
      local reboot held upgradable uu_log
      reboot=$(ssh_node "$ip" 'test -f /var/run/reboot-required && echo yes || echo no')
      held=$(ssh_node "$ip" 'apt-mark showhold 2>/dev/null')
      upgradable=$(ssh_node "$ip" 'apt list --upgradable 2>/dev/null | tail -n +2')
      uu_log=$(ssh_node "$ip" 'tail -1 /var/log/unattended-upgrades/unattended-upgrades.log 2>/dev/null')
      {
        printf 'reboot=%s\n' "$reboot"
        printf 'held<<<EOF\n%s\nEOF\n' "$held"
        printf 'upgradable<<<EOF\n%s\nEOF\n' "$upgradable"
        printf 'uu_log=%s\n' "$uu_log"
      } > "$tmpdir/$name"
    ) &
  done
  wait

  # Aggregate.
  local pending_reboots=() held_with_bumps_lines=() newest_uu_ts=0 newest_uu_iso=""
  local reboot held upgradable uu_log uu_ts line pkg from to bump epoch
  for entry in "${NODES[@]}"; do
    name="${entry%%:*}"
    [[ -f "$tmpdir/$name" ]] || continue
    reboot=$(awk -F= '/^reboot=/{print $2}' "$tmpdir/$name")
    held=$(awk '/^held<<<EOF$/,/^EOF$/' "$tmpdir/$name" | sed '1d;$d')
    upgradable=$(awk '/^upgradable<<<EOF$/,/^EOF$/' "$tmpdir/$name" | sed '1d;$d')
    uu_log=$(awk -F= '/^uu_log=/{sub(/^uu_log=/,""); print}' "$tmpdir/$name")

    [[ "$reboot" == "yes" ]] && pending_reboots+=("$name")

    # Held + upgradable, excluding k8s components (managed by k8s pipeline).
    while IFS= read -r line; do
      [[ -z "$line" ]] && continue
      pkg=$(echo "$line" | awk -F/ '{print $1}')
      # Skip k8s and kernel/linux-image — the chain handles those.
      case "$pkg" in
        kubeadm|kubectl|kubelet) continue ;;
        linux-image-*|linux-headers-*|linux-modules-*|linux-generic|linux-headers-generic|linux-image-generic) continue ;;
      esac
      # Only flag if the package is held. -F: package names contain regex
      # metacharacters ('.', '+'), so match them literally as whole lines.
      if grep -qxF -- "$pkg" <<<"$held"; then
        to=$(echo "$line" | awk '{print $2}')
        # Stop the capture at ']' as well as at a space so the closing
        # bracket of "[upgradable from: X]" never ends up in the version.
        from=$(echo "$line" | sed -n 's/.*from: \([^] ]*\).*/\1/p')
        # ${x%-*} drops the text after the last '-' of each version.
        bump="$pkg ${from%-*}→${to%-*}"
        held_with_bumps_lines+=("$name: $bump")
      fi
    done <<<"$upgradable"

    # Newest uu timestamp (ISO at start of log line). -n + p: lines that do
    # not start with a timestamp yield nothing instead of passing through.
    uu_ts=$(echo "$uu_log" | sed -nE 's/^([0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}).*/\1/p')
    if [[ -n "$uu_ts" ]]; then
      # NOTE(review): the log timestamp is parsed as UTC here — confirm the
      # nodes actually log in UTC.
      epoch=$(date -u -d "$uu_ts" +%s 2>/dev/null || echo 0)
      if [[ "$epoch" -gt "$newest_uu_ts" ]]; then
        newest_uu_ts="$epoch"; newest_uu_iso="$uu_ts"
      fi
    fi
  done

  OS_PENDING_REBOOT_NODES="${pending_reboots[*]:-}"
  if [[ ${#held_with_bumps_lines[@]} -gt 0 ]]; then
    # Join with a real "; " separator. (`paste -d '; '` would cycle through
    # ';' and ' ' as alternating single-character delimiters.)
    OS_HELD_DETAIL=$(printf '%s\n' "${held_with_bumps_lines[@]}" | sort -u \
      | awk 'NR > 1 { printf "; " } { printf "%s", $0 }')
  fi

  if [[ "$newest_uu_ts" -gt 0 ]]; then
    local age=$((NOW_EPOCH - newest_uu_ts))
    OS_LAST_UU="$newest_uu_iso UTC ($(human_age "$age"))"
    OS_LAST_CHECK="$(human_age "$age") (uu daily)"
  else
    OS_LAST_UU="(no uu log accessible)"
    OS_LAST_CHECK="?"
  fi

  # Last kured reboot — newest Ready transition across all nodes returned by
  # `kubectl get nodes`. `Ready -> True` is what kured causes when the node
  # returns; we surface the most recent timestamp and the node it belongs to.
  local kured_raw kured_iso kured_node kured_ep kured_age
  kured_raw=$($KUBECTL get nodes -o json 2>/dev/null | python3 -c '
import json, sys
from datetime import datetime, timezone
data = json.load(sys.stdin)
best = (0, "", "")
for n in data["items"]:
    name = n["metadata"]["name"]
    for c in n["status"].get("conditions", []):
        if c["type"] == "Ready":
            dt = datetime.strptime(c["lastTransitionTime"], "%Y-%m-%dT%H:%M:%SZ")
            # The trailing Z means UTC; a naive datetime would be converted
            # using the local timezone and skew the age by the UTC offset.
            ep = int(dt.replace(tzinfo=timezone.utc).timestamp())
            if ep > best[0]:
                best = (ep, name, c["lastTransitionTime"])
print(f"{best[0]}|{best[1]}|{best[2]}")
' 2>/dev/null || echo "0||")
  kured_ep="${kured_raw%%|*}"
  kured_node=$(echo "$kured_raw" | cut -d'|' -f2)
  kured_iso=$(echo "$kured_raw" | cut -d'|' -f3)
  if [[ "$kured_ep" -gt 0 ]]; then
    kured_age=$((NOW_EPOCH - kured_ep))
    OS_LAST_KURED="$kured_iso ($kured_node, $(human_age "$kured_age"))"
  else
    OS_LAST_KURED="?"
  fi

  OS_NEXT="daily 02:00-06:00 London"

  # Kured pod health. Empty output (kubectl failed or no pods found) counts
  # as one non-Running line, so it is reported as "kured down".
  local kured_pods kured_unhealthy
  kured_pods=$($KUBECTL -n kured get pods -l app.kubernetes.io/name=kured -o jsonpath='{range .items[*]}{.status.phase}{"\n"}{end}' 2>/dev/null || true)
  kured_unhealthy=$(echo "$kured_pods" | grep -cv '^Running$' 2>/dev/null || true)

  local notes=()
  [[ -n "$OS_HELD_DETAIL" ]] && notes+=("held with bumps: $OS_HELD_DETAIL")
  [[ -n "$OS_PENDING_REBOOT_NODES" ]] && notes+=("pending reboot: $OS_PENDING_REBOOT_NODES")

  if [[ "$kured_unhealthy" -gt 0 ]]; then
    OS_STATUS_ICON="✗"; OS_STATUS_TEXT="kured down"
    OS_NOTES="kured pods not all Running"
    raise_exit 2
  elif [[ ${#notes[@]} -gt 0 ]]; then
    OS_STATUS_ICON="⚠"; OS_STATUS_TEXT="attn"
    OS_NOTES="${notes[*]}"
    raise_exit 1
  else
    OS_STATUS_ICON="✓"; OS_STATUS_TEXT="healthy"
    OS_NOTES="distros uniform; no held bumps; no pending reboots"
  fi
}
|
|
|
|
# --- 3. K8s (kubeadm/kubelet/kubectl) ---
|
|
# Survey the Kubernetes version-upgrade pipeline and fill the K8S_* result
# globals.
# Globals read:    KUBECTL, NOW_EPOCH
# Globals written: K8S_RUNNING, K8S_PATCH, K8S_MINOR, K8S_LAST_CHECK,
#                  K8S_LAST_DETECT_LINE, K8S_IN_FLIGHT, K8S_LAST_CHAIN,
#                  K8S_NEXT, K8S_STATUS_ICON, K8S_STATUS_TEXT, K8S_NOTES
# Side effects:    raise_exit 2 when the chain looks stalled or detection is
#                  stale; raise_exit 1 when an upgrade is running or available.
collect_k8s() {
  local kver_list kver_uniq metrics target_patch target_minor last_run in_flight started

  # `|| true`: a failing kubectl degrades to "unknown" instead of aborting the
  # script under `set -e`.
  kver_list=$($KUBECTL get nodes -o jsonpath='{range .items[*]}{.status.nodeInfo.kubeletVersion}{"\n"}{end}' 2>/dev/null || true)
  kver_uniq=$(echo "$kver_list" | sort -u)
  local n_uniq n_nodes
  n_uniq=$(echo "$kver_uniq" | wc -l | tr -d ' ')
  n_nodes=$(echo "$kver_list" | wc -l | tr -d ' ')
  if [[ -z "$kver_list" ]]; then
    # `echo "" | wc -l` is 1, which would otherwise render " across 1/1 nodes".
    K8S_RUNNING="unknown (no node versions returned)"
  elif [[ "$n_uniq" -eq 1 ]]; then
    K8S_RUNNING="$kver_uniq across $n_nodes/$n_nodes nodes"
  else
    # Join with a real ", " separator. (`paste -d ', '` would cycle through
    # ',' and ' ' as alternating single-character delimiters.)
    K8S_RUNNING="mixed: $(echo "$kver_uniq" | awk 'NR > 1 { printf ", " } { printf "%s", $0 }')"
  fi
  local running_ver; running_ver=$(echo "$kver_uniq" | head -1)

  metrics=$(pg_metrics)
  # All five may legitimately be absent (cluster never ran the upgrade
  # chain, kind="minor" not detected, etc.) — `|| true` keeps pipefail
  # from killing the script on no-match.
  target_patch=$(echo "$metrics" | { grep -E '^k8s_upgrade_available\{[^}]*kind="patch"' || true; } | sed -n 's/.*target="\([^"]*\)".*/\1/p' | head -1)
  target_minor=$(echo "$metrics" | { grep -E '^k8s_upgrade_available\{[^}]*kind="minor"' || true; } | sed -n 's/.*target="\([^"]*\)".*/\1/p' | head -1)
  # Pushgateway emits these with `{instance="",job="..."}` labels — the
  # `awk '$1 ~ /^name(\{|$)/'` form matches both bare and labelled metrics.
  last_run=$(echo "$metrics" | awk '$1 ~ /^k8s_version_check_last_run_timestamp(\{|$)/{print $2}' | head -1 || true)
  in_flight=$(echo "$metrics" | awk '$1 ~ /^k8s_upgrade_in_flight(\{|$)/{print $2}' | head -1 || true)
  started=$(echo "$metrics" | awk '$1 ~ /^k8s_upgrade_started_timestamp(\{|$)/{print $2}' | head -1 || true)
  in_flight="${in_flight:-0}"

  # Pushgateway timestamps come back in scientific notation
  # (e.g. 1.779052159e+09) — convert to plain integer seconds.
  local last_run_int started_int
  last_run_int=$(to_epoch_int "$last_run")
  started_int=$(to_epoch_int "$started")

  local age since
  if [[ "$last_run_int" -gt 0 ]]; then
    age=$((NOW_EPOCH - last_run_int))
    K8S_LAST_CHECK="$(human_age "$age") (daily cron)"
    if [[ -n "$target_patch" ]]; then
      K8S_LAST_DETECT_LINE="last run $(human_age "$age"): available v$target_patch (patch)"
    elif [[ -n "$target_minor" ]]; then
      K8S_LAST_DETECT_LINE="last run $(human_age "$age"): available v$target_minor (minor)"
    else
      K8S_LAST_DETECT_LINE="last run $(human_age "$age"): no upgrade available"
    fi
  else
    K8S_LAST_CHECK="(metric missing)"
    K8S_LAST_DETECT_LINE="(no k8s_version_check_last_run_timestamp in Pushgateway)"
  fi
  K8S_PATCH="${target_patch:-none}"
  K8S_MINOR="${target_minor:-none}"

  # In-flight / last chain.
  if [[ "$in_flight" == "1" ]]; then
    K8S_IN_FLIGHT="yes"
    since=0
    [[ "$started_int" -gt 0 ]] && since=$((NOW_EPOCH - started_int))
    K8S_LAST_CHAIN="in-flight (started $(human_age "$since"))"
  else
    K8S_IN_FLIGHT="no"
    if [[ "$started_int" -gt 0 ]]; then
      age=$((NOW_EPOCH - started_int))
      K8S_LAST_CHAIN="$(human_age "$age")"
    else
      K8S_LAST_CHAIN="never (or zeroed)"
    fi
  fi

  K8S_NEXT="$(next_daily_noon_utc)"

  # Status logic.
  local stalled=0
  if [[ "$in_flight" == "1" && "$started_int" -gt 0 ]]; then
    # K8sUpgradeStalled fires after 5400s (90m) per monitoring stack.
    since=$((NOW_EPOCH - started_int))
    [[ "$since" -gt 5400 ]] && stalled=1
  fi
  # A missing last-run metric is treated as infinitely old, i.e. stale.
  local last_run_age=999999999
  [[ "$last_run_int" -gt 0 ]] && last_run_age=$((NOW_EPOCH - last_run_int))

  if [[ "$stalled" == "1" ]]; then
    K8S_STATUS_ICON="✗"; K8S_STATUS_TEXT="stalled"
    K8S_NOTES="K8sUpgradeStalled would fire — chain in-flight >90m"
    raise_exit 2
  elif [[ "$last_run_age" -gt $((9*86400)) ]]; then
    K8S_STATUS_ICON="✗"; K8S_STATUS_TEXT="detection stale"
    K8S_NOTES="last detection >9d ago"
    raise_exit 2
  elif [[ "$in_flight" == "1" ]]; then
    K8S_STATUS_ICON="…"; K8S_STATUS_TEXT="in-flight"
    K8S_NOTES="upgrade chain running"
    raise_exit 1
  elif [[ -n "$target_patch" ]]; then
    K8S_STATUS_ICON="→"; K8S_STATUS_TEXT="$target_patch"
    K8S_NOTES="running $running_ver → v$target_patch (patch) available"
    raise_exit 1
  elif [[ -n "$target_minor" ]]; then
    K8S_STATUS_ICON="→"; K8S_STATUS_TEXT="$target_minor"
    K8S_NOTES="running $running_ver → v$target_minor (minor) available"
    raise_exit 1
  else
    K8S_STATUS_ICON="✓"; K8S_STATUS_TEXT="current"
    K8S_NOTES="running $running_ver, nothing newer"
  fi
}
|
|
|
|
# Next daily 12:00 UTC — pure bash date math, no croniter. Schedule was
# weekly Sunday until 2026-05-18; now `0 12 * * *` in the
# k8s-version-upgrade stack. If we're still before today's 12:00 UTC,
# the next run is today; otherwise it's tomorrow.
#
# Outputs: e.g. "Mon 2026-05-18 12:00 UTC" on stdout.
next_daily_noon_utc() {
  local hr days_ahead
  hr=$(date -u +%H)
  # %H is zero-padded, and arithmetic comparison reads a leading 0 as octal:
  # "08" and "09" are invalid octal and made the old `[[ "$hr" -lt 12 ]]`
  # test error out and fall through to "tomorrow". Force base 10.
  hr=$((10#$hr))
  if (( hr < 12 )); then
    days_ahead=0
  else
    days_ahead=1
  fi
  date -u -d "+$days_ahead days" +"%a %Y-%m-%d 12:00 UTC"
}
|
|
|
|
# --- Renderers ---
|
|
# Human-readable report: a summary table followed by one detail section per
# pipeline. Alignment is delegated to `column -t`, which spares us computing
# display widths by hand (the status icons are multi-byte UTF-8 and ANSI
# escapes don't play nice with `printf %-Xs`). Trade-off: no in-cell colour,
# but the icon character already carries the signal.
render_table() {
  printf '\n'
  printf "${BOLD}Upgrade state — %s${NC}\n" "$(date -u +'%Y-%m-%d %H:%M UTC')"
  printf '\n'

  # One '|'-separated row per layer; column re-aligns the cells.
  column -t -s '|' -o ' | ' <<EOF
Layer|Status|Last check|Next upgrade|Notes
-----|------|----------|------------|-----
Apps|$APPS_STATUS_ICON $APPS_STATUS_TEXT|$APPS_LAST_CHECK|$APPS_NEXT|$APPS_NOTES
OS |$OS_STATUS_ICON $OS_STATUS_TEXT|$OS_LAST_CHECK|$OS_NEXT|$OS_NOTES
K8s |$K8S_STATUS_ICON $K8S_STATUS_TEXT|$K8S_LAST_CHECK|$K8S_NEXT|$K8S_NOTES
EOF

  printf '\n'
  printf "${BOLD}--- Apps (Keel) ---${NC}\n"
  printf '%s\n' \
    "Enrolled deployments: $APPS_ENROLLED" \
    "Recent rollouts: $APPS_UPDATES_LINE" \
    "Pending approvals: $APPS_PENDING" \
    "Last Keel error: $APPS_ERROR_LINE"

  printf '\n'
  printf "${BOLD}--- OS (apt + kured) ---${NC}\n"
  printf '%s\n' \
    "Ubuntu per node: $OS_DISTRO_SUMMARY" \
    "Kernel per node: $OS_KERNEL_SUMMARY" \
    "Pending reboot: ${OS_PENDING_REBOOT_NODES:-none}" \
    "Held packages with upstream bumps: ${OS_HELD_DETAIL:-none (excluding k8s components)}" \
    "Last uu run (newest across nodes): $OS_LAST_UU" \
    "Last kured reboot (newest Ready transition): $OS_LAST_KURED" \
    "Next kured window: $OS_NEXT"

  printf '\n'
  printf "${BOLD}--- K8s (kubeadm/kubelet/kubectl) ---${NC}\n"
  printf '%s\n' \
    "Running: $K8S_RUNNING" \
    "Latest patch (apt): ${K8S_PATCH}" \
    "Next minor available: ${K8S_MINOR}" \
    "Detection: $K8S_LAST_DETECT_LINE" \
    "In-flight: $K8S_IN_FLIGHT | Last chain start: $K8S_LAST_CHAIN" \
    "Next detection: $K8S_NEXT" \
    ""
}
|
|
|
|
# Machine-readable report: print every collected result as one JSON document
# on stdout. Values reach Python through the environment rather than being
# spliced into the program text, so quotes and backslashes in error lines
# need no escaping.
render_json() {
  local -a result_vars=(
    APPS_STATUS_ICON APPS_STATUS_TEXT APPS_LAST_CHECK APPS_NEXT APPS_NOTES
    APPS_ENROLLED APPS_PENDING APPS_UPDATES_LINE APPS_ERROR_LINE
    OS_STATUS_ICON OS_STATUS_TEXT OS_LAST_CHECK OS_NEXT OS_NOTES
    OS_DISTRO_SUMMARY OS_KERNEL_SUMMARY OS_PENDING_REBOOT_NODES OS_HELD_DETAIL
    OS_LAST_UU OS_LAST_KURED
    K8S_STATUS_ICON K8S_STATUS_TEXT K8S_LAST_CHECK K8S_NEXT K8S_NOTES
    K8S_RUNNING K8S_PATCH K8S_MINOR
    K8S_LAST_DETECT_LINE K8S_IN_FLIGHT K8S_LAST_CHAIN
    HIGHEST_EXIT
  )
  (
    # Subshell: the exports are visible to python3 only and never leak into
    # the rest of the script.
    local var_name
    for var_name in "${result_vars[@]}"; do
      export "$var_name=${!var_name}"
    done
    python3 -c '
import json, os
from datetime import datetime, timezone


def text(name):
    return os.environ.get(name, "")


def count(name):
    return int(text(name) or 0)


# (section, ((json_key, env_var, reader), ...)) in output order.
SECTIONS = (
    ("apps", (
        ("status", "APPS_STATUS_ICON", text),
        ("status_text", "APPS_STATUS_TEXT", text),
        ("last_check", "APPS_LAST_CHECK", text),
        ("next_upgrade", "APPS_NEXT", text),
        ("notes", "APPS_NOTES", text),
        ("enrolled", "APPS_ENROLLED", count),
        ("pending_approvals", "APPS_PENDING", count),
        ("updates_line", "APPS_UPDATES_LINE", text),
        ("errors_line", "APPS_ERROR_LINE", text),
    )),
    ("os", (
        ("status", "OS_STATUS_ICON", text),
        ("status_text", "OS_STATUS_TEXT", text),
        ("last_check", "OS_LAST_CHECK", text),
        ("next_upgrade", "OS_NEXT", text),
        ("notes", "OS_NOTES", text),
        ("distros", "OS_DISTRO_SUMMARY", text),
        ("kernels", "OS_KERNEL_SUMMARY", text),
        ("pending_reboot_nodes", "OS_PENDING_REBOOT_NODES", text),
        ("held_with_bumps", "OS_HELD_DETAIL", text),
        ("last_uu_run", "OS_LAST_UU", text),
        ("last_kured_reboot", "OS_LAST_KURED", text),
    )),
    ("k8s", (
        ("status", "K8S_STATUS_ICON", text),
        ("status_text", "K8S_STATUS_TEXT", text),
        ("last_check", "K8S_LAST_CHECK", text),
        ("next_upgrade", "K8S_NEXT", text),
        ("notes", "K8S_NOTES", text),
        ("running", "K8S_RUNNING", text),
        ("patch_target", "K8S_PATCH", text),
        ("minor_target", "K8S_MINOR", text),
        ("last_detection_line", "K8S_LAST_DETECT_LINE", text),
        ("in_flight", "K8S_IN_FLIGHT", text),
        ("last_chain", "K8S_LAST_CHAIN", text),
    )),
)

report = {
    "as_of_utc": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
    "highest_exit": int(text("HIGHEST_EXIT")),
}
for section, fields in SECTIONS:
    report[section] = {key: read(var) for key, var, read in fields}
print(json.dumps(report, indent=2))
'
  )
}
|
|
|
|
# Orchestrate a run: parse flags, run the three collectors in order (apps,
# OS, k8s), render in the requested format, then exit with the highest
# severity any collector raised.
main() {
  parse_args "$@"

  local collector
  for collector in collect_apps collect_os collect_k8s; do
    "$collector"
  done

  case "$JSON" in
    true) render_json ;;
    *) render_table ;;
  esac

  exit "$HIGHEST_EXIT"
}
|
|
|
|
# Script entry point: hand every command-line argument to main.
main "$@"
|