k8s-version-upgrade: automated kubeadm/kubelet/kubectl upgrade pipeline
Adds a weekly detection CronJob (Sun 12:00 UTC) that probes apt-cache madison
on master for new patches + HEAD pkgs.k8s.io for next-minor availability,
then POSTs to claude-agent-service to dispatch the k8s-version-upgrade agent.
The agent (.claude/agents/k8s-version-upgrade.md) orchestrates:
pre-flight (5 nodes Ready + halt-on-alert + 24h-quiet + plan target match)
-> etcd snapshot save
-> optional master containerd skew fix
-> apt repo URL rewrite (minor bumps only)
-> drain/upgrade/uncordon master via ssh < update_k8s.sh
-> sequential workers k8s-node4 -> 3 -> 2 -> 1 with 10-min soak each
-> post-flight verification
Two new Upgrade Gates alerts catch failure modes:
- K8sVersionSkew (kubelet/apiserver gitVersion mismatch >30m)
- EtcdPreUpgradeSnapshotMissing (in_flight without snapshot_taken >10m)
update_k8s.sh refactored to take --role / --release args; the agent shells
it into each node via SSH pipe. update_node.sh annotated as OS-major path.
Operator-facing docs: docs/runbooks/k8s-version-upgrade.md and a new section
in docs/architecture/automated-upgrades.md.
Secrets: secret/k8s-upgrade/{ssh_key,ssh_key_pub,slack_webhook} (ed25519
keypair distributed to all 5 nodes via authorized_keys; slack_webhook
reuses kured webhook URL on initial deploy).
This commit is contained in:
parent
09f83b4e83
commit
e75bcaf394
8 changed files with 1379 additions and 34 deletions
456
stacks/k8s-version-upgrade/main.tf
Normal file
456
stacks/k8s-version-upgrade/main.tf
Normal file
|
|
@ -0,0 +1,456 @@
|
|||
# k8s-version-upgrade — Automated K8s component (kubeadm/kubelet/kubectl) upgrade
|
||||
#
|
||||
# Detects new patch/minor versions via a weekly CronJob, then dispatches the
|
||||
# `k8s-version-upgrade` agent (infra/.claude/agents/k8s-version-upgrade.md)
|
||||
# through claude-agent-service for the actual rolling upgrade.
|
||||
#
|
||||
# Reuse points:
|
||||
# - claude-agent-service.claude-agent.svc:8080 — agent job runner
|
||||
# - Vault secret/k8s-upgrade/* — operator populates ssh_key + slack_webhook
|
||||
# - Prometheus + Pushgateway + Upgrade Gates alert group (in monitoring stack)
|
||||
# - update_k8s.sh — library script the agent shells into nodes with
|
||||
#
|
||||
# Notes:
|
||||
# - Schedule is Sun 12:00 UTC — well outside the kured Mon-Fri 02:00-06:00
|
||||
# London window so OS reboots and K8s version rollouts can't overlap.
|
||||
# - Patch detection uses `apt-cache madison kubeadm` on master via SSH.
|
||||
# Minor detection probes the next-minor apt repo URL with HEAD.
|
||||
|
||||
variable "schedule" {
  description = "Cron expression for the weekly detection CronJob. The default fires on Sundays at 12:00 (UTC per the stack notes)."
  type        = string
  default     = "0 12 * * 0" # Sunday 12:00 UTC
}
|
||||
|
||||
# Toggle to suspend the detection CronJob without dropping the stack.
|
||||
variable "enabled" {
  description = "When false, the detection CronJob is created with suspend = true, so nothing runs but the stack stays deployed."
  type        = bool
  default     = true
}
|
||||
|
||||
# Mirrors `local.image_tag` in stacks/claude-agent-service/main.tf — keep in
|
||||
# sync when the claude-agent-service image is rebuilt. Reused here because the
|
||||
# detection CronJob only needs kubectl, ssh-client, curl, jq — all of which
|
||||
# the claude-agent-service image already ships.
|
||||
variable "claude_agent_service_image_tag" {
  description = "Tag of the claude-agent-service image that the detection CronJob container runs."
  type        = string
  default     = "2fd7670d"
}
|
||||
|
||||
# If true, the CronJob runs the detection sequence but does NOT POST to
|
||||
# claude-agent-service. Used for Test 1 to confirm detection works without
|
||||
# firing a real upgrade.
|
||||
variable "detection_dry_run" {
  description = "When true, the CronJob runs detection and notifies Slack but does not POST to claude-agent-service (exported to the container as DRY_RUN)."
  type        = bool
  default     = false
}
|
||||
|
||||
locals {
  # Namespace that holds the detection CronJob and its credentials Secret.
  namespace = "k8s-upgrade"

  # Image reference for the detection container, pinned by var.claude_agent_service_image_tag.
  ca_image = "forgejo.viktorbarzin.me/viktor/claude-agent-service:${var.claude_agent_service_image_tag}"

  # Labels applied to the CronJob, its Job template and its Pod template.
  labels = { app = "k8s-version-check" }
}
|
||||
|
||||
# --- Namespace ---
|
||||
|
||||
resource "kubernetes_namespace" "k8s_upgrade" {
  metadata {
    name   = local.namespace
    labels = { tier = local.tiers.cluster }
  }

  lifecycle {
    # KYVERNO_LIFECYCLE_V1: the goldilocks-vpa-auto-mode ClusterPolicy stamps this
    # label on every namespace, so it is excluded from drift detection.
    ignore_changes = [
      metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"],
    ]
  }
}
|
||||
|
||||
# --- ExternalSecret: ssh_key + slack_webhook + agent-service bearer ---
|
||||
#
|
||||
# Operator populates Vault `secret/k8s-upgrade/` with:
|
||||
# - ssh_key (PEM-encoded ed25519 private key)
|
||||
# - ssh_key_pub (the matching public key — distributed to nodes' authorized_keys)
|
||||
# - slack_webhook (Slack incoming-webhook URL; intended to be a separate channel from kured for clean alerting — the initial deploy reuses the kured webhook URL)
|
||||
#
|
||||
# The claude-agent-service bearer token comes from secret/claude-agent-service
|
||||
# (reused — no parallel token needed).
|
||||
|
||||
resource "kubernetes_manifest" "external_secret" {
  manifest = {
    apiVersion = "external-secrets.io/v1beta1"
    kind       = "ExternalSecret"

    metadata = {
      name      = "k8s-upgrade-creds"
      namespace = kubernetes_namespace.k8s_upgrade.metadata[0].name
    }

    spec = {
      refreshInterval = "15m"
      secretStoreRef  = { name = "vault-kv", kind = "ClusterSecretStore" }
      target          = { name = "k8s-upgrade-creds" }

      # One entry per key of the resulting Secret. The first two come from the
      # `k8s-upgrade` Vault entry; the bearer token comes from `claude-agent-service`.
      data = [
        {
          secretKey = "ssh_key"
          remoteRef = { key = "k8s-upgrade", property = "ssh_key" }
        },
        {
          secretKey = "slack_webhook"
          remoteRef = { key = "k8s-upgrade", property = "slack_webhook" }
        },
        {
          secretKey = "api_bearer_token"
          remoteRef = { key = "claude-agent-service", property = "api_bearer_token" }
        },
      ]
    }
  }
}
|
||||
|
||||
# --- ServiceAccount + RBAC for the detection CronJob ---
|
||||
|
||||
resource "kubernetes_service_account" "k8s_version_check" {
  # Identity the detection CronJob pod runs as.
  metadata {
    namespace = kubernetes_namespace.k8s_upgrade.metadata[0].name
    name      = "k8s-version-check"
  }
}
|
||||
|
||||
# Cluster-wide read on nodes (for kubeletVersion comparison)
|
||||
resource "kubernetes_cluster_role" "k8s_version_check" {
  metadata {
    name = "k8s-version-check"
  }

  # Read-only access to Node objects (core API group).
  rule {
    verbs      = ["get", "list"]
    api_groups = [""]
    resources  = ["nodes"]
  }
}
|
||||
|
||||
resource "kubernetes_cluster_role_binding" "k8s_version_check" {
  metadata {
    name = "k8s-version-check"
  }

  # Grants the k8s-version-check ClusterRole to the detection ServiceAccount.
  subject {
    kind      = "ServiceAccount"
    namespace = kubernetes_namespace.k8s_upgrade.metadata[0].name
    name      = kubernetes_service_account.k8s_version_check.metadata[0].name
  }

  role_ref {
    api_group = "rbac.authorization.k8s.io"
    kind      = "ClusterRole"
    name      = kubernetes_cluster_role.k8s_version_check.metadata[0].name
  }
}
|
||||
|
||||
# Namespace-scoped: detection CronJob reads its own creds Secret.
|
||||
resource "kubernetes_role" "k8s_version_check_secrets" {
  metadata {
    namespace = kubernetes_namespace.k8s_upgrade.metadata[0].name
    name      = "k8s-version-check-secrets"
  }

  # `get` only, and only on the single named Secret.
  rule {
    verbs          = ["get"]
    api_groups     = [""]
    resources      = ["secrets"]
    resource_names = ["k8s-upgrade-creds"]
  }
}
|
||||
|
||||
resource "kubernetes_role_binding" "k8s_version_check_secrets" {
  metadata {
    namespace = kubernetes_namespace.k8s_upgrade.metadata[0].name
    name      = "k8s-version-check-secrets"
  }

  # Binds the secrets-reader Role to the detection ServiceAccount.
  subject {
    kind      = "ServiceAccount"
    namespace = kubernetes_namespace.k8s_upgrade.metadata[0].name
    name      = kubernetes_service_account.k8s_version_check.metadata[0].name
  }

  role_ref {
    api_group = "rbac.authorization.k8s.io"
    kind      = "Role"
    name      = kubernetes_role.k8s_version_check_secrets.metadata[0].name
  }
}
|
||||
|
||||
# --- Cross-namespace RBAC: claude-agent SA reads k8s-upgrade-creds + annotates ns ---
|
||||
#
|
||||
# The k8s-version-upgrade agent runs inside the claude-agent-service pod (SA
|
||||
# `claude-agent` in `claude-agent` ns). It needs:
|
||||
# - GET on this namespace's k8s-upgrade-creds Secret (to fetch ssh_key + slack)
|
||||
# - PATCH on the k8s-upgrade Namespace annotations (in-flight marker)
|
||||
|
||||
resource "kubernetes_role" "claude_agent_reads_creds" {
  metadata {
    namespace = kubernetes_namespace.k8s_upgrade.metadata[0].name
    name      = "claude-agent-reads-creds"
  }

  # `get` only, and only on the k8s-upgrade-creds Secret in this namespace.
  rule {
    verbs          = ["get"]
    api_groups     = [""]
    resources      = ["secrets"]
    resource_names = ["k8s-upgrade-creds"]
  }
}
|
||||
|
||||
resource "kubernetes_role_binding" "claude_agent_reads_creds" {
  metadata {
    namespace = kubernetes_namespace.k8s_upgrade.metadata[0].name
    name      = "claude-agent-reads-creds"
  }

  # Subject lives in another namespace and is referenced by name only;
  # it is not managed by this stack.
  subject {
    kind      = "ServiceAccount"
    namespace = "claude-agent"
    name      = "claude-agent"
  }

  role_ref {
    api_group = "rbac.authorization.k8s.io"
    kind      = "Role"
    name      = kubernetes_role.claude_agent_reads_creds.metadata[0].name
  }
}
|
||||
|
||||
# The claude-agent ClusterRole already grants `get,list,watch` on namespaces
|
||||
# but NOT patch — so we need to extend it here for the annotation write.
|
||||
# Bound via a separate ClusterRoleBinding so we don't fork the upstream stack.
|
||||
resource "kubernetes_cluster_role" "claude_agent_annotates_ns" {
  metadata {
    name = "claude-agent-annotates-k8s-upgrade-ns"
  }

  # Write access to exactly one Namespace object. Namespaces are cluster-scoped,
  # so this has to be a ClusterRole; resource_names narrows it to this stack's
  # namespace.
  rule {
    api_groups = [""]
    resources  = ["namespaces"]
    # Reference the namespace resource rather than repeating the literal, so the
    # rule cannot drift from local.namespace if the namespace is ever renamed.
    resource_names = [kubernetes_namespace.k8s_upgrade.metadata[0].name]
    verbs          = ["patch", "update"]
  }
}
|
||||
|
||||
resource "kubernetes_cluster_role_binding" "claude_agent_annotates_ns" {
  metadata {
    name = "claude-agent-annotates-k8s-upgrade-ns"
  }

  # Subject is referenced by name only; it is not managed by this stack.
  subject {
    kind      = "ServiceAccount"
    namespace = "claude-agent"
    name      = "claude-agent"
  }

  role_ref {
    api_group = "rbac.authorization.k8s.io"
    kind      = "ClusterRole"
    name      = kubernetes_cluster_role.claude_agent_annotates_ns.metadata[0].name
  }
}
|
||||
|
||||
# --- Detection CronJob ---
|
||||
#
|
||||
# Weekly: compares running cluster version against latest available patch
|
||||
# (apt-cache madison kubeadm on master) and latest available minor (HEAD on
|
||||
# next-minor pkgs.k8s.io repo). When a target is detected, POSTs to
|
||||
# claude-agent-service to kick the upgrade agent.
|
||||
|
||||
resource "kubernetes_cron_job_v1" "k8s_version_check" {
  metadata {
    name      = "k8s-version-check"
    namespace = kubernetes_namespace.k8s_upgrade.metadata[0].name
    labels    = local.labels
  }

  spec {
    schedule                      = var.schedule
    concurrency_policy            = "Forbid"
    successful_jobs_history_limit = 3
    failed_jobs_history_limit     = 3
    starting_deadline_seconds     = 600
    suspend                       = !var.enabled

    job_template {
      metadata {
        labels = local.labels
      }

      spec {
        # No retries: a failed detection run is reported and left for the next schedule.
        backoff_limit              = 0
        ttl_seconds_after_finished = 86400

        template {
          metadata {
            labels = local.labels
          }

          spec {
            service_account_name = kubernetes_service_account.k8s_version_check.metadata[0].name
            restart_policy       = "Never"

            image_pull_secrets {
              name = "registry-credentials"
            }

            container {
              name  = "version-check"
              image = local.ca_image

              # NOTE: this is a Terraform heredoc. `%%{` is the escape for a literal
              # `%{`, and the bash `$${...}` form is avoided so nothing is
              # interpolated by Terraform.
              command = ["/bin/bash", "-c", <<-EOT
                set -euo pipefail
                # Files created below (notably the SSH private key) must never be
                # group/world readable, not even before the chmod.
                umask 077
                echo "==> k8s-version-check ($(date -u +%FT%TZ))"

                # 1. Load credentials from the k8s-upgrade-creds Secret.
                read_cred() {
                  /usr/local/bin/kubectl get secret k8s-upgrade-creds \
                    -o "jsonpath={.data.$1}" | base64 -d
                }
                read_cred ssh_key > /tmp/k8s-upgrade-ssh-key
                chmod 400 /tmp/k8s-upgrade-ssh-key
                SLACK=$(read_cred slack_webhook)
                AGENT_TOKEN=$(read_cred api_bearer_token)

                # BatchMode + ConnectTimeout make SSH fail fast instead of hanging on
                # a prompt or an unreachable master; a hung pod would block later
                # runs because concurrency_policy is Forbid.
                SSH="ssh -i /tmp/k8s-upgrade-ssh-key \
                  -o BatchMode=yes \
                  -o ConnectTimeout=15 \
                  -o StrictHostKeyChecking=accept-new \
                  -o UserKnownHostsFile=/tmp/known_hosts"

                # Best-effort Slack notification; never fails the run.
                slack() {
                  curl -sS --max-time 10 -X POST -H 'Content-Type: application/json' \
                    --data "$(jq -nc --arg t "[k8s-version-check] $1" '{text: $t}')" \
                    "$SLACK" || true
                }

                # version_gt A B: succeeds only when A sorts strictly after B.
                version_gt() {
                  [ "$1" != "$2" ] && \
                    [ "$(printf '%s\n%s\n' "$1" "$2" | sort -V | tail -1)" = "$1" ]
                }

                # 2. Detect running version. Use the LOWEST kubelet version across
                #    all nodes rather than whichever node the API lists first, so a
                #    half-upgraded cluster is still seen as needing an upgrade.
                RUNNING=$(/usr/local/bin/kubectl get nodes \
                  -o jsonpath='{range .items[*]}{.status.nodeInfo.kubeletVersion}{"\n"}{end}' \
                  | sed 's/^v//' | sort -V | sed -n '1p')
                if ! [[ "$RUNNING" =~ ^[0-9]+\.[0-9]+\.[0-9]+ ]]; then
                  slack "ERROR: could not determine running kubelet version (got '$RUNNING')"
                  echo "could not determine running kubelet version (got '$RUNNING')" >&2
                  exit 1
                fi
                RUNNING_MINOR=$(echo "$RUNNING" | awk -F. '{print $1"."$2}')
                echo "Running version: v$RUNNING (minor $RUNNING_MINOR)"

                # 3. Detect highest available patch within the running minor track.
                #    apt-cache madison reads master's local apt index; it is only as
                #    fresh as the last `apt-get update` on that host.
                LATEST_PATCH=$($SSH wizard@k8s-master \
                  "apt-cache madison kubeadm 2>/dev/null \
                    | awk '{print \$3}' \
                    | sed 's/-.*//' \
                    | grep '^$RUNNING_MINOR\\.' \
                    | sort -V | tail -1" || echo "")
                [ -n "$LATEST_PATCH" ] || \
                  echo "warn: no patch candidates from apt-cache on master (SSH failure or empty apt index)" >&2
                echo "Latest patch (apt): v$LATEST_PATCH"

                # 4. Detect next available minor by probing the apt repo URL.
                #    -L is required: pkgs.k8s.io answers with a redirect, so without
                #    it the status code is never 200.
                NEXT_MINOR_NUM=$(( $(echo "$RUNNING_MINOR" | cut -d. -f2) + 1 ))
                NEXT_MINOR="1.$NEXT_MINOR_NUM"
                NEXT_MINOR_AVAILABLE="no"
                if curl -sIL --max-time 30 -o /dev/null -w '%%{http_code}' \
                     "https://pkgs.k8s.io/core:/stable:/v$NEXT_MINOR/deb/Release" \
                     | grep -q '^200$'; then
                  NEXT_MINOR_AVAILABLE="yes"
                fi
                echo "Next minor v$NEXT_MINOR available: $NEXT_MINOR_AVAILABLE"

                # 5. Decide what to do. A patch is only a candidate when it is
                #    strictly newer than the running version (a stale apt index must
                #    never cause a downgrade dispatch).
                TARGET=""
                KIND=""
                if [ -n "$LATEST_PATCH" ] && version_gt "$LATEST_PATCH" "$RUNNING"; then
                  TARGET="$LATEST_PATCH"
                  KIND="patch"
                elif [ "$NEXT_MINOR_AVAILABLE" = "yes" ]; then
                  # Probe the minor track to get its latest kubeadm patch. The
                  # Packages index lists every package in the repo, so filter to the
                  # kubeadm stanzas instead of taking the highest Version of any
                  # package.
                  NEXT_MINOR_PATCH=$($SSH wizard@k8s-master \
                    "curl -sfL --max-time 30 'https://pkgs.k8s.io/core:/stable:/v$NEXT_MINOR/deb/Packages' \
                      | awk '\$1 == \"Package:\" { pkg = \$2 } \$1 == \"Version:\" && pkg == \"kubeadm\" { print \$2 }' \
                      | sed 's/-.*//' \
                      | sort -V | tail -1" || echo "")
                  if [ -n "$NEXT_MINOR_PATCH" ]; then
                    TARGET="$NEXT_MINOR_PATCH"
                    KIND="minor"
                  fi
                fi

                # 6. Push the discovery metric to Pushgateway (best-effort).
                PG='http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/k8s-version-check'
                {
                  echo "# TYPE k8s_upgrade_available gauge"
                  if [ -n "$TARGET" ]; then
                    echo "k8s_upgrade_available{kind=\"$KIND\",running=\"$RUNNING\",target=\"$TARGET\"} 1"
                  else
                    echo "k8s_upgrade_available{kind=\"none\",running=\"$RUNNING\",target=\"$RUNNING\"} 0"
                  fi
                  echo "# TYPE k8s_version_check_last_run_timestamp gauge"
                  echo "k8s_version_check_last_run_timestamp $(date +%s)"
                } | curl -sS --max-time 10 --data-binary @- "$PG" || echo "warn: pushgateway push failed"

                # 7. Decide whether to dispatch
                if [ -z "$TARGET" ]; then
                  echo "No upgrade needed (running=$RUNNING, latest_patch=$LATEST_PATCH, next_minor_available=$NEXT_MINOR_AVAILABLE)"
                  exit 0
                fi

                slack "K8s upgrade available: v$RUNNING → v$TARGET ($KIND)"

                if [ "$DRY_RUN" = "true" ]; then
                  echo "DRY_RUN=true — not POSTing to claude-agent-service"
                  slack "DRY_RUN — skipping agent dispatch"
                  exit 0
                fi

                # 8. POST to claude-agent-service
                PAYLOAD=$(jq -nc \
                  --arg target "$TARGET" \
                  --arg kind "$KIND" \
                  '{
                    prompt: ("Run the k8s-version-upgrade agent. Inputs: " + ({target_version: $target, kind: $kind, dry_run: false, stages: "all"} | tostring)),
                    agent: ".claude/agents/k8s-version-upgrade",
                    max_budget_usd: 30
                  }')

                echo "Dispatching agent: $PAYLOAD"
                # `|| true`: on a transport failure curl still prints the -w trailer
                # (http_code 000), so the error branch below reports it to Slack
                # instead of `set -e` killing the script silently.
                RESP=$(curl -sS --connect-timeout 10 -w '\n%%{http_code}' -X POST \
                  -H "Authorization: Bearer $AGENT_TOKEN" \
                  -H 'Content-Type: application/json' \
                  -d "$PAYLOAD" \
                  http://claude-agent-service.claude-agent.svc.cluster.local:8080/execute) || true
                CODE=$(printf '%s' "$RESP" | tail -n1)
                BODY=$(printf '%s' "$RESP" | sed '$d')

                if [ "$CODE" = "200" ] || [ "$CODE" = "202" ]; then
                  # A non-JSON body must not turn a successful dispatch into a failed Job.
                  JOB_ID=$(printf '%s' "$BODY" | jq -r '.job_id // .id // "unknown"' 2>/dev/null || echo "unknown")
                  slack "Agent dispatched: job=$JOB_ID (target=v$TARGET kind=$KIND)"
                  echo "OK — job=$JOB_ID"
                else
                  slack "ERROR dispatching agent: HTTP $CODE — $BODY"
                  echo "dispatch failed: HTTP $CODE — $BODY" >&2
                  exit 1
                fi
              EOT
              ]

              env {
                name  = "DRY_RUN"
                value = tostring(var.detection_dry_run)
              }

              # ssh and other tools need a writable home directory.
              env {
                name  = "HOME"
                value = "/tmp"
              }

              resources {
                requests = {
                  cpu    = "50m"
                  memory = "128Mi"
                }
                limits = {
                  memory = "256Mi"
                }
              }
            }
          }
        }
      }
    }
  }

  lifecycle {
    # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
    ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config]
  }
}
|
||||
23
stacks/k8s-version-upgrade/terragrunt.hcl
Normal file
23
stacks/k8s-version-upgrade/terragrunt.hcl
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
# Pull in the shared root configuration found by walking up the directory tree.
include "root" {
  path = find_in_parent_folders()
}
|
||||
|
||||
# ExternalSecret hits ESO which needs to be alive when the manifest applies.
|
||||
dependency "external_secrets" {
  # Ordering-only dependency: no outputs are consumed from this stack.
  skip_outputs = true
  config_path  = "../external-secrets"
}
|
||||
|
||||
# Upgrade Gates rules (incl. K8sVersionSkew + EtcdPreUpgradeSnapshotMissing)
|
||||
# live in the monitoring stack — make the relationship visible so reapplies
|
||||
# don't race the alerts being available.
|
||||
dependency "monitoring" {
  # Ordering-only dependency: no outputs are consumed from this stack.
  skip_outputs = true
  config_path  = "../monitoring"
}
|
||||
|
||||
# Note: stacks/claude-agent-service has no terragrunt.hcl yet (manual apply
|
||||
# pattern) — its ServiceAccount + Namespace are referenced by name from this
|
||||
# stack's RoleBindings, which is fine because RoleBindings allow forward
|
||||
# references. Apply order: claude-agent-service first (or already deployed),
|
||||
# then this stack.
|
||||
|
|
@ -1890,14 +1890,13 @@ serverFiles:
|
|||
annotations:
|
||||
summary: "Kubelet/apiserver gitVersion skew detected — possible half-done k8s upgrade. Inspect: kubectl get nodes -o jsonpath='{.items[*].status.nodeInfo.kubeletVersion}'"
|
||||
# EtcdPreUpgradeSnapshotMissing: the k8s-version-upgrade agent pushes
|
||||
# k8s_upgrade_in_flight=1 when it starts, and k8s_upgrade_snapshot_taken=1
|
||||
# after the etcdctl snapshot is verified. If we see in_flight=1 with no
|
||||
# corresponding snapshot_taken=1 after 10 min, the agent has skipped or
|
||||
# failed the snapshot — that's a critical safety hole.
|
||||
# `k8s_upgrade_in_flight=1` + `k8s_upgrade_snapshot_taken=0` at Stage 0,
|
||||
# then sets snapshot_taken=1 in Stage 2 after etcdctl confirms the
|
||||
# snapshot file size. Anywhere in_flight=1 with snapshot_taken=0
|
||||
# lasting >10m means the agent skipped or failed Stage 2 — a critical
|
||||
# safety hole (no recovery point if master upgrade hangs).
|
||||
- alert: EtcdPreUpgradeSnapshotMissing
|
||||
expr: |
|
||||
k8s_upgrade_in_flight == 1
|
||||
unless on() k8s_upgrade_snapshot_taken == 1
|
||||
expr: k8s_upgrade_in_flight == 1 and k8s_upgrade_snapshot_taken == 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue