k8s-version-upgrade: trigger etcd snapshot via existing backup-etcd Job; broaden agent RBAC

Stage 2 now reuses the existing default/backup-etcd CronJob (NFS-backed
PV pointing at 192.168.1.127:/srv/nfs/etcd-backup) instead of trying to
ssh into master and run etcdctl against a non-existent /mnt/main mount.
The agent triggers a one-shot Job from cronjob/backup-etcd, waits up to
10 min, then parses the backup-manage container log for the "Backup done"
line and the byte count.

Test 2 (dry-run) surfaced 5 real cluster blockers — agent loop works
end-to-end at the planning level.

Expanded the claude-agent ServiceAccount's privileges via a sibling
ClusterRole (claude-agent-upgrade-ops):
  - patch/update namespaces/k8s-upgrade (in-flight annotation)
  - create/delete batch/jobs (trigger etcd snapshot Job)
  - patch/update nodes (cordon/uncordon)
  - create pods/eviction (drain)
  - delete pods (drain fallback)
This commit is contained in:
Viktor Barzin 2026-05-10 19:16:12 +00:00
parent a58d777059
commit 988bfde45c
2 changed files with 64 additions and 23 deletions

View file

@ -226,29 +226,60 @@ resource "kubernetes_role_binding" "claude_agent_reads_creds" {
}
}
# The claude-agent ClusterRole already grants `get,list,watch` on namespaces
# but NOT patch so we need to extend it here for the annotation write.
# Bound via a separate ClusterRoleBinding so we don't fork the upstream stack.
resource "kubernetes_cluster_role" "claude_agent_annotates_ns" {
# The base claude-agent ClusterRole grants get/list/watch on most resources
# but not the mutating verbs the upgrade agent needs. Rather than fork the
# upstream stack, we add a sibling ClusterRole here scoped to the
# verbs+resources required:
# - patch/update on namespace k8s-upgrade (in-flight annotation)
# - create/delete on batch/jobs, in any namespace (trigger etcd snapshot Job
#   from cronjob/backup-etcd)
# - patch/update on nodes (cordon/uncordon; drain needs this)
# - create on pods/eviction (drain evicts pods)
# - delete on pods (drain fallback when eviction alone isn't enough)
resource "kubernetes_cluster_role" "claude_agent_upgrade_ops" {
metadata {
name = "claude-agent-annotates-k8s-upgrade-ns"
name = "claude-agent-upgrade-ops"
}
# Annotate the k8s-upgrade namespace
rule {
api_groups = [""]
resources = ["namespaces"]
resource_names = ["k8s-upgrade"]
verbs = ["patch", "update"]
}
# Trigger etcd snapshot Jobs (from cronjob/backup-etcd in default ns).
# Cluster-scoped because we may also create test Jobs in k8s-upgrade ns.
rule {
api_groups = ["batch"]
resources = ["jobs"]
verbs = ["create", "delete"]
}
# Cordon / uncordon nodes
rule {
api_groups = [""]
resources = ["nodes"]
verbs = ["patch", "update"]
}
# Drain (evict pods)
rule {
api_groups = [""]
resources = ["pods/eviction"]
verbs = ["create"]
}
# Delete pods stuck during drain (sometimes evict isn't enough)
rule {
api_groups = [""]
resources = ["pods"]
verbs = ["delete"]
}
}
resource "kubernetes_cluster_role_binding" "claude_agent_annotates_ns" {
resource "kubernetes_cluster_role_binding" "claude_agent_upgrade_ops" {
metadata {
name = "claude-agent-annotates-k8s-upgrade-ns"
name = "claude-agent-upgrade-ops"
}
role_ref {
api_group = "rbac.authorization.k8s.io"
kind = "ClusterRole"
name = kubernetes_cluster_role.claude_agent_annotates_ns.metadata[0].name
name = kubernetes_cluster_role.claude_agent_upgrade_ops.metadata[0].name
}
subject {
kind = "ServiceAccount"