From ae6dde45c2e37060a60e6414d2ba5af8cb6fee5f Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sun, 10 May 2026 19:16:12 +0000 Subject: [PATCH] k8s-version-upgrade: trigger etcd snapshot via existing backup-etcd Job; broaden agent RBAC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stage 2 now reuses the existing default/backup-etcd CronJob (NFS-backed PV pointing at 192.168.1.127:/srv/nfs/etcd-backup) instead of trying to ssh into master and run etcdctl against a non-existent /mnt/main mount. The agent triggers a one-shot Job from cronjob/backup-etcd, waits up to 10 min, then parses the backup-manage container log for "Backup done" line + byte count. Test 2 (dry-run) surfaced 5 real cluster blockers — agent loop works end-to-end at the planning level. Expanded the claude-agent ServiceAccount's privileges via a sibling ClusterRole (claude-agent-upgrade-ops): - patch namespaces/k8s-upgrade (in-flight annotation) - create batch/jobs (trigger etcd snapshot Job) - patch nodes (cordon/uncordon) - create pods/eviction (drain) - delete pods (drain fallback) --- .claude/agents/k8s-version-upgrade.md | 40 ++++++++++++++--------- stacks/k8s-version-upgrade/main.tf | 47 ++++++++++++++++++++++----- 2 files changed, 64 insertions(+), 23 deletions(-) diff --git a/.claude/agents/k8s-version-upgrade.md b/.claude/agents/k8s-version-upgrade.md index 8d5c4fc8..39d5f306 100644 --- a/.claude/agents/k8s-version-upgrade.md +++ b/.claude/agents/k8s-version-upgrade.md @@ -40,7 +40,7 @@ Parse the prompt's first JSON block to extract these. 
If anything is missing, ab - **Working dir**: `/workspace/infra` (`WORKSPACE_DIR` env var) - **Kubeconfig**: `/workspace/infra/config` (use `kubectl --kubeconfig $WORKSPACE_DIR/config ...` in every kubectl call) - **Prometheus**: `http://prometheus-server.monitoring.svc.cluster.local:80` (in-cluster, no auth) -- **Etcd snapshot dir**: `/mnt/main/etcd-backup/` (NFS, exists, writeable from master) +- **Etcd snapshot**: triggered as a one-shot Job from the existing `default/backup-etcd` CronJob (defined in `stacks/infra-maintenance/`). The Job runs on `k8s-master` with hostNetwork (so etcdctl reaches etcd at 127.0.0.1:2379), mounts the PV-backed NFS export `192.168.1.127:/srv/nfs/etcd-backup`, and writes `etcd-snapshot-<timestamp>.db` there. Do NOT shell into master with etcdctl directly — the cert paths + NFS mount are already wired into the CronJob. - **Library script**: `/workspace/infra/scripts/update_k8s.sh` — pipe via SSH to each node, do NOT modify on the fly. Invoke as `ssh ... 'bash -s' < update_k8s.sh --role --release `. ### Credentials — fetched at startup @@ -198,34 +198,44 @@ Slack: `Pre-flight clean. Proceeding to etcd snapshot.` ## Stage 2: Etcd snapshot (`stages` includes `snapshot`) -Always run — patch OR minor. +Always run — patch OR minor. Triggers a one-shot Job from the existing `default/backup-etcd` CronJob and waits for it to complete. 
```bash -TARGET_PATH="/mnt/main/etcd-backup/k8s-upgrade-pre-${target_version}-$(date +%s).db" +JOB_NAME="pre-upgrade-etcd-${target_version}-$(date +%s)" if [ "$dry_run" = "false" ]; then - $SSH \ - wizard@k8s-master "sudo /usr/bin/env ETCDCTL_API=3 etcdctl snapshot save '$TARGET_PATH' \ - --endpoints=https://127.0.0.1:2379 \ - --cacert=/etc/kubernetes/pki/etcd/ca.crt \ - --cert=/etc/kubernetes/pki/etcd/server.crt \ - --key=/etc/kubernetes/pki/etcd/server.key" + $KUBECTL -n default create job --from=cronjob/backup-etcd "$JOB_NAME" + + # Wait up to 10 min for snapshot Job to complete + $KUBECTL -n default wait --for=condition=complete --timeout=600s "job/$JOB_NAME" || { + slack "ABORT Stage 2 — etcd snapshot Job did not complete in 10 min" + $KUBECTL -n default describe "job/$JOB_NAME" | tail -30 + exit 1 + } + + # Parse the Job's pod log for "Backup done: <file> (<N> bytes)"; keep only the last match so SIZE is a single integer + LOG=$($KUBECTL -n default logs "job/$JOB_NAME" -c backup-manage --tail=20) + echo "$LOG" + SNAPSHOT_LINE=$(echo "$LOG" | grep -E '^Backup done:' | tail -n 1) + SIZE=$(echo "$SNAPSHOT_LINE" | grep -oE '\([0-9]+ bytes\)' | grep -oE '[0-9]+') + SNAPSHOT_FILE=$(echo "$SNAPSHOT_LINE" | awk '{print $3}') - # Verify size > 0 - SIZE=$($SSH \ - wizard@k8s-master "sudo stat -c %s '$TARGET_PATH'") if [ -z "$SIZE" ] || [ "$SIZE" -lt 1024 ]; then - slack "ABORT — etcd snapshot empty or missing ($SIZE bytes at $TARGET_PATH)" + slack "ABORT Stage 2 — etcd snapshot empty or missing (size='$SIZE' line='$SNAPSHOT_LINE')" exit 1 fi - kubectl --kubeconfig $WORKSPACE_DIR/config annotate ns k8s-upgrade \ + TARGET_PATH="nfs://192.168.1.127:/srv/nfs/etcd-backup/$SNAPSHOT_FILE" + $KUBECTL annotate ns k8s-upgrade \ viktorbarzin.me/k8s-upgrade-snapshot-path="$TARGET_PATH" --overwrite push_metric k8s_upgrade_snapshot_taken 1 +else + TARGET_PATH="WOULD: trigger default/backup-etcd Job, wait, verify size" + SIZE="dry-run" fi -slack "Etcd snapshot saved at $TARGET_PATH ($SIZE bytes)" +slack "Etcd snapshot saved at $TARGET_PATH (size=$SIZE)" ``` 
## Stage 3: Master containerd skew fix (`stages` includes `containerd`) diff --git a/stacks/k8s-version-upgrade/main.tf b/stacks/k8s-version-upgrade/main.tf index 1dce957e..321136b1 100644 --- a/stacks/k8s-version-upgrade/main.tf +++ b/stacks/k8s-version-upgrade/main.tf @@ -226,29 +226,60 @@ resource "kubernetes_role_binding" "claude_agent_reads_creds" { } } -# The claude-agent ClusterRole already grants `get,list,watch` on namespaces -# but NOT patch — so we need to extend it here for the annotation write. -# Bound via a separate ClusterRoleBinding so we don't fork the upstream stack. -resource "kubernetes_cluster_role" "claude_agent_annotates_ns" { +# The base claude-agent ClusterRole grants get/list/watch on most resources +# but not the mutating verbs the upgrade agent needs. Rather than fork the +# upstream stack, we add a sibling ClusterRole here scoped to exactly the +# verbs+resources required: +# - patch/update on namespace k8s-upgrade (in-flight annotation) +# - create/delete on batch/jobs (trigger etcd snapshot Job from cronjob/backup-etcd) +# - patch/update on nodes (cordon/uncordon — drain needs this) +# - create on pods/eviction, delete on pods (drain evicts pods; delete is the fallback) +resource "kubernetes_cluster_role" "claude_agent_upgrade_ops" { metadata { - name = "claude-agent-annotates-k8s-upgrade-ns" + name = "claude-agent-upgrade-ops" } + # Annotate the k8s-upgrade namespace rule { api_groups = [""] resources = ["namespaces"] resource_names = ["k8s-upgrade"] verbs = ["patch", "update"] } + # Trigger etcd snapshot Jobs (from cronjob/backup-etcd in default ns). + # Cluster-scoped because we may also create test Jobs in k8s-upgrade ns. 
+ rule { + api_groups = ["batch"] + resources = ["jobs"] + verbs = ["create", "delete"] + } + # Cordon / uncordon nodes + rule { + api_groups = [""] + resources = ["nodes"] + verbs = ["patch", "update"] + } + # Drain (evict pods) + rule { + api_groups = [""] + resources = ["pods/eviction"] + verbs = ["create"] + } + # Delete pods stuck during drain (sometimes evict isn't enough) + rule { + api_groups = [""] + resources = ["pods"] + verbs = ["delete"] + } } -resource "kubernetes_cluster_role_binding" "claude_agent_annotates_ns" { +resource "kubernetes_cluster_role_binding" "claude_agent_upgrade_ops" { metadata { - name = "claude-agent-annotates-k8s-upgrade-ns" + name = "claude-agent-upgrade-ops" } role_ref { api_group = "rbac.authorization.k8s.io" kind = "ClusterRole" - name = kubernetes_cluster_role.claude_agent_annotates_ns.metadata[0].name + name = kubernetes_cluster_role.claude_agent_upgrade_ops.metadata[0].name } subject { kind = "ServiceAccount"