infra/stacks/k8s-version-upgrade/main.tf
Viktor Barzin 01bc16d592 k8s-version-upgrade: decompose into Job chain to fix self-preemption
The agent-based v1 ran inside claude-agent-service (replicas=1, no
nodeSelector) and self-evicted when it tried to drain its host (k8s-node4
on 2026-05-11). Cluster ended half-upgraded (master v1.34.7, workers
v1.34.2) until manual recovery.

Rewrite the pipeline as a chain of nodeSelector-pinned Jobs:

  preflight (k8s-node1)
    → master   (k8s-node1)   drains k8s-master
    → worker × 3 (k8s-node1) drains k8s-node{4,3,2}
    → worker   (k8s-master + control-plane toleration) drains k8s-node1
    → postflight (no pinning)

Each Job runs scripts/upgrade-step.sh (case-on-$PHASE) and ends by
envsubst-ing job-template.yaml into the next Job. Deterministic names
(k8s-upgrade-<phase>-<target_version>[-<node>]) make `kubectl apply`
idempotent — a failed Job can be re-created without duplicating
downstream.

Also lands `predrain_unstick`: deletes pods on the target node whose PDB
has 0 disruptionsAllowed. Without this, drain loops indefinitely on
single-replica deployments (e.g. every Anubis instance — discovered the
hard way during 2026-05-11 manual recovery of k8s-node3).

Adds K8sUpgradeStalled alert (in_flight + started_timestamp > 90 min).
Deprecates the agent prompt (renamed to *.deprecated.md with a header
pointer to the new code).

Apply order: k8s-version-upgrade first (consumes new SA + ConfigMaps),
then monitoring (loads the new alert). Both applied 2026-05-11.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-11 23:54:22 +00:00

460 lines
16 KiB
HCL

# k8s-version-upgrade — Automated K8s component (kubeadm/kubelet/kubectl) upgrade
#
# Architecture: detection CronJob → chain of small Jobs, one per phase. Each
# Job's pod runs on a node that is NOT its drain target — eliminates the
# self-preemption bug that killed the agent-based v1 (2026-05-11 incident).
#
# Chain (Job 0 → Job 6):
# preflight (pinned: k8s-node1)
# master (pinned: k8s-node1; drains k8s-master)
# worker (pinned: k8s-node1; drains k8s-node4 → 3 → 2)
# worker (pinned: k8s-master + control-plane toleration; drains k8s-node1 last)
# postflight (no pinning)
#
# Each phase Job's container runs scripts/upgrade-step.sh which:
# - dispatches on $PHASE
# - spawns the next Job via envsubst on job-template.yaml
# - uses deterministic naming (k8s-upgrade-${phase}-${target_version}[-${node}])
# so re-running on failure reconciles to a single Job per run.
#
# Reuse points:
# - claude-agent-service image (kubectl + ssh + jq + curl + envsubst)
# - Vault secret/k8s-upgrade/* (ssh_key, slack_webhook)
# - Prometheus + Pushgateway + Upgrade Gates alerts
# - default/backup-etcd CronJob (snapshot trigger)
# - infra/scripts/update_k8s.sh (per-node upgrade body)
# Cron expression for the detection CronJob (consumed as spec.schedule).
variable "schedule" {
  description = "Cron schedule for the k8s-version-check detection CronJob."
  default     = "0 12 * * 0" # Sunday 12:00 UTC — outside kured window
  type        = string
}
# Master switch: the detection CronJob is created with suspend = !enabled.
variable "enabled" {
  description = "When false, the detection CronJob is created suspended."
  default     = true
  type        = bool
}
# Tag of the claude-agent-service image used by the upgrade Jobs.
# Mirrors `local.image_tag` in stacks/claude-agent-service/main.tf — bump both
# in lockstep whenever claude-agent-service is rebuilt. The image ships
# kubectl, ssh-client, curl, jq and envsubst, i.e. everything the Jobs need.
variable "image_tag" {
  description = "Tag of the claude-agent-service image the upgrade Jobs run."
  default     = "2fd7670d"
  type        = string
}
# Dry-run switch, passed to the detection container as DRY_RUN.
# When true, detection still runs but does NOT spawn the preflight Job.
variable "detection_dry_run" {
  description = "When true, detection runs but does not spawn the preflight Job."
  default     = false
  type        = bool
}
locals {
  # Namespace that holds every resource of this stack.
  namespace = "k8s-upgrade"

  # Common labels for the ConfigMaps and the detection CronJob.
  labels = {
    app = "k8s-version-upgrade"
  }

  # Fully-qualified image reference, tag supplied by var.image_tag.
  image = "forgejo.viktorbarzin.me/viktor/claude-agent-service:${var.image_tag}"
}
# --- Namespace ---
resource "kubernetes_namespace" "k8s_upgrade" {
  metadata {
    name = local.namespace

    labels = {
      tier = local.tiers.cluster
    }
  }

  lifecycle {
    # KYVERNO_LIFECYCLE_V1: goldilocks-vpa-auto-mode ClusterPolicy stamps this label
    # out-of-band, so Terraform must not try to remove it.
    ignore_changes = [
      metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"],
    ]
  }
}
# --- ExternalSecret: SSH key + Slack webhook ---
#
# Operator populates Vault `secret/k8s-upgrade/` with:
# - ssh_key (ed25519 PRIVATE key, used to SSH wizard@<node> from Jobs)
# - ssh_key_pub (matching public key, deployed to nodes' authorized_keys)
# - slack_webhook (incoming-webhook URL)
#
# No claude-agent bearer needed — the chain no longer POSTs to that service.
resource "kubernetes_manifest" "external_secret" {
  manifest = {
    apiVersion = "external-secrets.io/v1beta1"
    kind       = "ExternalSecret"

    metadata = {
      name      = "k8s-upgrade-creds"
      namespace = kubernetes_namespace.k8s_upgrade.metadata[0].name
    }

    spec = {
      refreshInterval = "15m"

      secretStoreRef = {
        kind = "ClusterSecretStore"
        name = "vault-kv"
      }

      # Name of the Secret that gets materialised in the namespace.
      target = {
        name = "k8s-upgrade-creds"
      }

      # One entry per synced property of the `k8s-upgrade` remote key; each
      # Secret key carries the same name as the property it mirrors.
      data = [
        for property in ["ssh_key", "slack_webhook"] : {
          secretKey = property
          remoteRef = {
            key      = "k8s-upgrade"
            property = property
          }
        }
      ]
    }
  }
}
# --- Unified ServiceAccount + RBAC ---
#
# One SA serves BOTH the detection CronJob and every phase Job:
# - detection CronJob: needs nodes:get/list + secrets:get + jobs:create
# (to spawn Job 0 = preflight)
# - phase Jobs: same + pods/eviction:create + pods:delete + namespaces:patch
#
# Cluster-scoped because the chain spans the whole cluster (drain works on
# any node, and the preflight Job creates a Job in `default` ns from
# `cronjob/backup-etcd`).
# Identity used by the detection CronJob's pod; bound to the ClusterRole and
# the namespaced Role declared in this file.
resource "kubernetes_service_account" "k8s_upgrade_job" {
  metadata {
    namespace = kubernetes_namespace.k8s_upgrade.metadata[0].name
    name      = "k8s-upgrade-job"
  }
}
# Cluster-wide permissions for the k8s-upgrade ServiceAccount. Each rule is
# annotated with the operation it exists for.
resource "kubernetes_cluster_role" "k8s_upgrade_job" {
  metadata {
    name = "k8s-upgrade-job"
  }

  # Nodes: get/list for version comparison + readiness checks.
  # patch/update are presumably for cordon/uncordon during drain — confirm
  # against scripts/upgrade-step.sh.
  rule {
    api_groups = [""]
    resources  = ["nodes"]
    verbs      = ["get", "list", "patch", "update"]
  }

  # Drain — evict pods
  rule {
    api_groups = [""]
    resources  = ["pods/eviction"]
    verbs      = ["create"]
  }

  # Drain fallback — direct delete (predrain_unstick bypasses PDBs)
  rule {
    api_groups = [""]
    resources  = ["pods"]
    verbs      = ["get", "list", "delete"]
  }

  # Drain — `kubectl drain` GETs the owning DaemonSet of every
  # DaemonSet-managed pod on the node (even with --ignore-daemonsets) to decide
  # whether the pod may be skipped. Without this rule drain aborts with a
  # "daemonsets.apps ... is forbidden" error on any node running a DaemonSet.
  # NOTE(review): assumes the phase Jobs call `kubectl drain` with this SA.
  rule {
    api_groups = ["apps"]
    resources  = ["daemonsets"]
    verbs      = ["get"]
  }

  # Read PDBs to find drain-blocking pods
  rule {
    api_groups = ["policy"]
    resources  = ["poddisruptionbudgets"]
    verbs      = ["get", "list"]
  }

  # Chain dispatch — create the next Job; reconcile via apply on retry.
  # Granted cluster-wide because the etcd-snapshot Job is created in the
  # `default` ns from cronjob/backup-etcd.
  rule {
    api_groups = ["batch"]
    resources  = ["jobs"]
    verbs      = ["create", "get", "list", "delete", "patch", "watch"]
  }

  # Pull CronJob spec for `kubectl create job --from=cronjob/backup-etcd`
  rule {
    api_groups = ["batch"]
    resources  = ["cronjobs"]
    verbs      = ["get", "list"]
  }

  # Annotate the k8s-upgrade namespace (in-flight marker + snapshot path).
  # Restricted by resource_names to that single namespace object.
  rule {
    api_groups     = [""]
    resources      = ["namespaces"]
    resource_names = [local.namespace]
    verbs          = ["get", "patch", "update"]
  }
}
# Grants the cluster-wide k8s-upgrade-job ClusterRole to the upgrade
# ServiceAccount.
resource "kubernetes_cluster_role_binding" "k8s_upgrade_job" {
  metadata {
    name = "k8s-upgrade-job"
  }

  subject {
    kind      = "ServiceAccount"
    namespace = kubernetes_namespace.k8s_upgrade.metadata[0].name
    name      = kubernetes_service_account.k8s_upgrade_job.metadata[0].name
  }

  role_ref {
    api_group = "rbac.authorization.k8s.io"
    kind      = "ClusterRole"
    name      = kubernetes_cluster_role.k8s_upgrade_job.metadata[0].name
  }
}
# Namespaced Role: `get` on exactly one Secret — the credentials Secret in
# k8s-upgrade (SSH key + Slack URL). No other Secret is readable through it.
resource "kubernetes_role" "k8s_upgrade_job_ns" {
  metadata {
    namespace = kubernetes_namespace.k8s_upgrade.metadata[0].name
    name      = "k8s-upgrade-job-ns"
  }

  rule {
    api_groups     = [""]
    resources      = ["secrets"]
    verbs          = ["get"]
    resource_names = ["k8s-upgrade-creds"]
  }
}
# Binds the namespaced credentials-read Role to the upgrade ServiceAccount.
resource "kubernetes_role_binding" "k8s_upgrade_job_ns" {
  metadata {
    namespace = kubernetes_namespace.k8s_upgrade.metadata[0].name
    name      = "k8s-upgrade-job-ns"
  }

  subject {
    kind      = "ServiceAccount"
    namespace = kubernetes_namespace.k8s_upgrade.metadata[0].name
    name      = kubernetes_service_account.k8s_upgrade_job.metadata[0].name
  }

  role_ref {
    api_group = "rbac.authorization.k8s.io"
    kind      = "Role"
    name      = kubernetes_role.k8s_upgrade_job_ns.metadata[0].name
  }
}
# --- ConfigMaps: scripts + Job template ---
# Ships the two shell scripts into the cluster, read from disk at plan time:
# the stack-local upgrade-step.sh and the repo-level update_k8s.sh.
resource "kubernetes_config_map" "k8s_upgrade_scripts" {
  metadata {
    namespace = kubernetes_namespace.k8s_upgrade.metadata[0].name
    name      = "k8s-upgrade-scripts"
    labels    = local.labels
  }

  data = {
    "update_k8s.sh"   = file("${path.module}/../../scripts/update_k8s.sh")
    "upgrade-step.sh" = file("${path.module}/scripts/upgrade-step.sh")
  }
}
# Ships job-template.yaml; the detection CronJob mounts this ConfigMap at
# /template and renders the preflight Job from it with envsubst.
resource "kubernetes_config_map" "k8s_upgrade_job_template" {
  metadata {
    namespace = kubernetes_namespace.k8s_upgrade.metadata[0].name
    name      = "k8s-upgrade-job-template"
    labels    = local.labels
  }

  data = {
    "job-template.yaml" = file("${path.module}/job-template.yaml")
  }
}
# --- Detection CronJob ---
#
# Probes for available patch/minor targets weekly. When one is found, renders
# Job 0 (preflight) from the same job-template the chain uses. The CronJob no
# longer POSTs to claude-agent-service; the whole pipeline now runs inside the
# cluster via Job-chaining.
resource "kubernetes_cron_job_v1" "k8s_version_check" {
  metadata {
    name      = "k8s-version-check"
    namespace = kubernetes_namespace.k8s_upgrade.metadata[0].name
    labels    = local.labels
  }

  spec {
    schedule                      = var.schedule
    concurrency_policy            = "Forbid"
    successful_jobs_history_limit = 3
    failed_jobs_history_limit     = 3
    starting_deadline_seconds     = 600
    suspend                       = !var.enabled

    job_template {
      metadata {
        labels = local.labels
      }

      spec {
        # Detection is a one-shot probe: never retry, keep the Job for a day.
        backoff_limit              = 0
        ttl_seconds_after_finished = 86400

        template {
          metadata {
            labels = local.labels
          }

          spec {
            service_account_name = kubernetes_service_account.k8s_upgrade_job.metadata[0].name
            restart_policy       = "Never"

            image_pull_secrets {
              name = "registry-credentials"
            }

            volume {
              name = "creds"
              secret {
                secret_name = "k8s-upgrade-creds"
                # 0444 — non-root container needs read; SSH key gets re-installed
                # with mode 0400 in the inline command before any ssh call.
                default_mode = "0444"
              }
            }

            volume {
              name = "template"
              config_map {
                name = kubernetes_config_map.k8s_upgrade_job_template.metadata[0].name
              }
            }

            container {
              name  = "version-check"
              image = local.image

              # Inline detection script. HCL escapes used below:
              #   $${...} -> literal shell ${...},  %%{...} -> literal curl %{...}
              command = ["/bin/bash", "-c", <<-EOT
                set -euo pipefail
                echo "==> k8s-version-check ($(date -u +%FT%TZ))"
                SLACK=$(cat /secrets/k8s-upgrade/slack_webhook)
                install -m 0400 /secrets/k8s-upgrade/ssh_key /tmp/ssh_key
                SSH="ssh -i /tmp/ssh_key -o StrictHostKeyChecking=accept-new -o UserKnownHostsFile=/tmp/known_hosts -o ConnectTimeout=10"
                # Best-effort Slack notification; never fails the script.
                slack() {
                  curl -sS -X POST -H 'Content-Type: application/json' \
                    --data "$(jq -nc --arg t "[k8s-version-check] $1" '{text: $t}')" \
                    "$SLACK" || true
                }
                # 1. Detect running version.
                #    Take the LOWEST kubelet version across all nodes instead of
                #    whichever node the API happens to list first: on a
                #    half-upgraded cluster the first node may already be on the
                #    new version, which would hide the lagging nodes.
                RUNNING=$(/usr/local/bin/kubectl get nodes \
                  -o jsonpath='{range .items[*]}{.status.nodeInfo.kubeletVersion}{"\n"}{end}' \
                  | tr -d v | grep . | sort -V | sed -n 1p)
                RUNNING_MINOR=$(echo "$RUNNING" | awk -F. '{print $1"."$2}')
                echo "Running version: v$RUNNING (minor $RUNNING_MINOR)"
                # 2. Latest patch within current minor (refresh master's apt cache).
                #    An SSH failure is tolerated (empty result) but logged.
                LATEST_PATCH=$($SSH wizard@k8s-master \
                  "sudo apt-get update -qq -o Dir::Etc::sourcelist='sources.list.d/kubernetes.list' -o Dir::Etc::sourceparts='-' -o APT::Get::List-Cleanup='0' >/dev/null 2>&1 ; \
                  apt-cache madison kubeadm 2>/dev/null \
                  | awk '{print \$3}' \
                  | sed 's/-.*//' \
                  | grep '^$RUNNING_MINOR\\.' \
                  | sort -V | tail -1" \
                  || { echo "warn: could not query apt on k8s-master" >&2; echo ""; })
                echo "Latest patch: v$LATEST_PATCH"
                # 3. Next-minor probe.
                #    -L is required: pkgs.k8s.io answers with a redirect to its
                #    CDN, so without it the status is never 200.
                NEXT_MINOR_NUM=$(( $(echo "$RUNNING_MINOR" | cut -d. -f2) + 1 ))
                NEXT_MINOR="1.$NEXT_MINOR_NUM"
                NEXT_MINOR_AVAILABLE="no"
                if curl -sILo /dev/null -w '%%{http_code}' \
                  "https://pkgs.k8s.io/core:/stable:/v$NEXT_MINOR/deb/Release" \
                  | grep -q '^200$'; then
                  NEXT_MINOR_AVAILABLE="yes"
                fi
                echo "Next minor v$NEXT_MINOR available: $NEXT_MINOR_AVAILABLE"
                # 4. Choose target: a newer patch wins over a new minor.
                TARGET=""
                KIND=""
                if [ -n "$LATEST_PATCH" ] && [ "$LATEST_PATCH" != "$RUNNING" ]; then
                  TARGET="$LATEST_PATCH"
                  KIND="patch"
                elif [ "$NEXT_MINOR_AVAILABLE" = "yes" ]; then
                  # Only kubeadm stanzas are considered: the Packages index also
                  # lists other packages (cri-tools, kubernetes-cni, ...) whose
                  # versions must not be mistaken for a Kubernetes release.
                  NEXT_MINOR_PATCH=$($SSH wizard@k8s-master \
                    "curl -sfL 'https://pkgs.k8s.io/core:/stable:/v$NEXT_MINOR/deb/Packages' \
                    | awk '/^Package: /{p=(\$2==\"kubeadm\")} p && /^Version: /{print \$2}' \
                    | sed 's/-.*//' \
                    | sort -V | tail -1" \
                    || { echo "warn: could not fetch v$NEXT_MINOR package index via k8s-master" >&2; echo ""; })
                  if [ -n "$NEXT_MINOR_PATCH" ]; then
                    TARGET="$NEXT_MINOR_PATCH"
                    KIND="minor"
                  fi
                fi
                # 5. Pushgateway discovery metric
                PG='http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/k8s-version-check'
                {
                  echo "# TYPE k8s_upgrade_available gauge"
                  if [ -n "$TARGET" ]; then
                    echo "k8s_upgrade_available{kind=\"$KIND\",running=\"$RUNNING\",target=\"$TARGET\"} 1"
                  else
                    echo "k8s_upgrade_available{kind=\"none\",running=\"$RUNNING\",target=\"$RUNNING\"} 0"
                  fi
                  echo "# TYPE k8s_version_check_last_run_timestamp gauge"
                  echo "k8s_version_check_last_run_timestamp $(date +%s)"
                } | curl -sS --data-binary @- "$PG" || echo "warn: pushgateway push failed"
                # 6. Decide whether to spawn Job 0
                if [ -z "$TARGET" ]; then
                  echo "No upgrade needed"
                  exit 0
                fi
                slack "K8s upgrade available: v$RUNNING → v$TARGET ($KIND)"
                if [ "$DRY_RUN" = "true" ]; then
                  slack "DRY_RUN — not spawning preflight Job"
                  exit 0
                fi
                # 7. Spawn Job 0 (preflight) via envsubst on the job-template
                #    Idempotency: deterministic name reconciles via `apply`.
                JOB_NAME="k8s-upgrade-preflight-$${TARGET//./-}"
                if /usr/local/bin/kubectl -n k8s-upgrade get job "$JOB_NAME" >/dev/null 2>&1; then
                  slack "Preflight Job $JOB_NAME already exists (rerunning detection mid-flight?)"
                  exit 0
                fi
                # NOTE(review): the leading whitespace inside SCHEDULING_BLOCK has to
                # line up with where job-template.yaml substitutes it — confirm
                # against the template before applying.
                export JOB_NAME PHASE_NEXT=preflight TARGET_NODE_NEXT="" \
                  TARGET_VERSION="$TARGET" TARGET_VERSION_LABEL="$${TARGET//./-}" \
                  KIND="$KIND" IMAGE="$${IMAGE}" \
                  SCHEDULING_BLOCK=$' nodeSelector:\n kubernetes.io/hostname: k8s-node1'
                envsubst < /template/job-template.yaml \
                  | /usr/local/bin/kubectl apply -f -
                slack "Spawned $JOB_NAME (target=v$TARGET kind=$KIND)"
              EOT
              ]

              # Read by step 6 of the script.
              env {
                name  = "DRY_RUN"
                value = tostring(var.detection_dry_run)
              }
              # Re-exported in step 7 so envsubst can place it in the template.
              env {
                name  = "IMAGE"
                value = local.image
              }
              env {
                name  = "HOME"
                value = "/tmp"
              }

              volume_mount {
                name       = "creds"
                mount_path = "/secrets/k8s-upgrade"
                read_only  = true
              }
              volume_mount {
                name       = "template"
                mount_path = "/template"
                read_only  = true
              }

              resources {
                requests = {
                  cpu    = "50m"
                  memory = "128Mi"
                }
                limits = {
                  memory = "256Mi"
                }
              }
            }
          }
        }
      }
    }
  }

  lifecycle {
    # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
    ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config]
  }
}