diff --git a/stacks/proxmox-csi/ghost-reconcile.tf b/stacks/proxmox-csi/ghost-reconcile.tf new file mode 100644 index 00000000..73d17cec --- /dev/null +++ b/stacks/proxmox-csi/ghost-reconcile.tf @@ -0,0 +1,241 @@ +# Ghost-disk auto-reconcile (beads code-dfjn — the prevention half). +# +# proxmox-csi hot-plugs each PVC as a virtio-scsi disk via the Proxmox API. +# A failed detach (query-pci QMP timeout on a disk-heavy VM) leaves a "ghost": +# a scsiN entry in the VM config with NO matching k8s VolumeAttachment. Ghosts +# are invisible to the NodeVolumeLimits scheduler (it counts VAs, not real scsi +# disks), so the node gets oversubscribed until query-pci wedges — the doom loop. +# +# This CronJob closes the loop: every 15 min it compares each worker VM's real +# scsi disks (Proxmox API) against k8s VolumeAttachments, and safely detaches any +# ghost (`PUT .../config delete=scsiN` — frees the LUN slot, retains the LV, same +# as `qm set --delete scsiN`). Detection mirrors cluster-health check #47. +# +# SAFETY: only acts on `vm-9999-pvc-*` scsi entries with NO VolumeAttachment for +# that PV on that node; re-confirms after a 60s sleep (so an in-flight attach is +# never caught); caps detaches per run. Uses the scoped CSI API token (VM.Config.Disk), +# NOT root SSH. Detach is non-destructive to data (the LV is retained). + +locals { + ghost_reconcile_ns = "proxmox-csi" +} + +resource "kubernetes_secret" "ghost_reconcile_pve" { + metadata { + name = "csi-ghost-reconcile-pve" + namespace = local.ghost_reconcile_ns + } + data = { + token_id = data.vault_kv_secret_v2.secrets.data["proxmox_csi_token_id"] + token_secret = data.vault_kv_secret_v2.secrets.data["proxmox_csi_token_secret"] + } + depends_on = [module.proxmox-csi] +} + +resource "kubernetes_service_account" "ghost_reconcile" { + metadata { + name = "csi-ghost-reconcile" + namespace = local.ghost_reconcile_ns + } + depends_on = [module.proxmox-csi] +} + +resource "kubernetes_cluster_role" "ghost_reconcile" { + metadata { name = "csi-ghost-reconcile" } + rule { + api_groups = ["storage.k8s.io"] + resources = ["volumeattachments"] + verbs = ["get", "list"] + } +} + +resource "kubernetes_cluster_role_binding" "ghost_reconcile" { + metadata { name = "csi-ghost-reconcile" } + role_ref { + api_group = "rbac.authorization.k8s.io" + kind = "ClusterRole" + name = kubernetes_cluster_role.ghost_reconcile.metadata[0].name + } + subject { + kind = "ServiceAccount" + name = kubernetes_service_account.ghost_reconcile.metadata[0].name + namespace = local.ghost_reconcile_ns + } +} + +resource "kubernetes_config_map" "ghost_reconcile_script" { + metadata { + name = "csi-ghost-reconcile-script" + namespace = local.ghost_reconcile_ns + } + data = { + "reconcile.py" = <<-PY + import json, os, ssl, sys, time, urllib.request, urllib.parse + + DRY = os.environ.get("DRY_RUN", "false") == "true" + CAP = int(os.environ.get("MAX_DETACH", "5")) + PVE = os.environ["PVE_URL"].rstrip("/") + PVE_TOK = os.environ["PVE_TOKEN_ID"] + "=" + os.environ["PVE_TOKEN_SECRET"] + PG = os.environ.get("PUSHGATEWAY", "") + NODES = {201:"k8s-node1",202:"k8s-node2",203:"k8s-node3",204:"k8s-node4",205:"k8s-node5",206:"k8s-node6"} + + _ktok = open("/var/run/secrets/kubernetes.io/serviceaccount/token").read().strip() + _kctx = ssl.create_default_context(cafile="/var/run/secrets/kubernetes.io/serviceaccount/ca.crt") + _kctx.check_hostname = False # reach the API by injected ClusterIP (cluster DNS may not resolve in this pod); CA chain still verified + _kapi = "https://%s:%s" % (os.environ["KUBERNETES_SERVICE_HOST"], os.environ.get("KUBERNETES_SERVICE_PORT", "443")) + _pctx = ssl.create_default_context(); _pctx.check_hostname = False; _pctx.verify_mode = ssl.CERT_NONE + + def k8s(path): + r = urllib.request.Request(_kapi + path, headers={"Authorization": "Bearer " + _ktok}) + return json.load(urllib.request.urlopen(r, context=_kctx, timeout=20)) + + def pve(path, method="GET", data=None): + body = urllib.parse.urlencode(data).encode() if data else None + r = urllib.request.Request(PVE + path, data=body, method=method, headers={"Authorization": "PVEAPIToken=" + PVE_TOK}) + return json.load(urllib.request.urlopen(r, context=_pctx, timeout=20)) + + def attached(): + out = set() + for v in k8s("/apis/storage.k8s.io/v1/volumeattachments").get("items", []): + sp = v.get("spec", {}) + if sp.get("attacher") != "csi.proxmox.sinextra.dev": continue + if not v.get("status", {}).get("attached"): continue + pv = (sp.get("source", {}) or {}).get("persistentVolumeName", "") + if pv: out.add((sp.get("nodeName", ""), pv.replace("pvc-", ""))) + return out + + def find_ghosts(att): + g = [] + for vmid, node in NODES.items(): + cfg = pve("/nodes/pve/qemu/%d/config" % vmid).get("data", {}) + for key, val in cfg.items(): + if not (key.startswith("scsi") and key[4:].isdigit()): continue + s = str(val) + if "vm-9999-pvc-" not in s: continue + uuid = s.split("vm-9999-pvc-")[1].split(",")[0] + if (node, uuid) not in att: + g.append((vmid, node, key, uuid)) + return g + + att = attached() + ghosts = find_ghosts(att) + print("[reconcile] attached_VAs=%d ghosts=%d dry=%s" % (len(att), len(ghosts), DRY), flush=True) + for vmid, node, scsi, uuid in ghosts: + print(" ghost: VM%d/%s %s -> pvc-%s" % (vmid, node, scsi, uuid), flush=True) + + detached = 0 + if ghosts: + time.sleep(60) # re-confirm: never act on an in-flight attach + att2 = attached() + confirmed = [x for x in ghosts if (x[1], x[3]) not in att2] + print("[reconcile] confirmed after 60s recheck: %d" % len(confirmed), flush=True) + for vmid, node, scsi, uuid in confirmed: + if detached >= CAP: + print("[reconcile] hit per-run cap %d, stopping" % CAP, flush=True); break + if DRY: + print(" DRY would detach VM%d %s (pvc-%s)" % (vmid, scsi, uuid), flush=True); continue + try: + pve("/nodes/pve/qemu/%d/config" % vmid, method="PUT", data={"delete": scsi}) + print(" DETACHED VM%d %s (pvc-%s)" % (vmid, scsi, uuid), flush=True); detached += 1 + except Exception as e: + print(" FAILED detach VM%d %s: %s" % (vmid, scsi, e), flush=True) + else: + print("[reconcile] no ghosts — all nodes reconciled", flush=True) + + if PG: + try: + body = "csi_ghosts_detected %d\ncsi_ghosts_detached %d\ncsi_ghost_reconcile_last_run %d\n" % (len(ghosts), detached, int(time.time())) + urllib.request.urlopen(urllib.request.Request(PG.rstrip("/") + "/metrics/job/csi-ghost-reconcile", data=body.encode(), method="PUT"), timeout=10) + except Exception as e: + print("[reconcile] metric push failed: %s" % e, flush=True) + PY + } + depends_on = [module.proxmox-csi] +} + +resource "kubernetes_cron_job_v1" "ghost_reconcile" { + metadata { + name = "csi-ghost-reconcile" + namespace = local.ghost_reconcile_ns + } + spec { + schedule = "*/15 * * * *" + concurrency_policy = "Forbid" + successful_jobs_history_limit = 3 + failed_jobs_history_limit = 3 + job_template { + metadata {} + spec { + backoff_limit = 1 + active_deadline_seconds = 300 + ttl_seconds_after_finished = 600 + template { + metadata {} + spec { + service_account_name = kubernetes_service_account.ghost_reconcile.metadata[0].name + restart_policy = "Never" + container { + name = "reconcile" + image = "python:3.13-alpine" + image_pull_policy = "IfNotPresent" + command = ["python3", "/script/reconcile.py"] + env { + name = "PVE_URL" + value = "https://192.168.1.127:8006/api2/json" + } + env { + name = "DRY_RUN" + value = "false" + } + env { + name = "MAX_DETACH" + value = "5" + } + env { + name = "PUSHGATEWAY" + value = "http://prometheus-prometheus-pushgateway.monitoring.svc.cluster.local:9091" + } + env { + name = "PVE_TOKEN_ID" + value_from { + secret_key_ref { + name = kubernetes_secret.ghost_reconcile_pve.metadata[0].name + key = "token_id" + } + } + } + env { + name = "PVE_TOKEN_SECRET" + value_from { + secret_key_ref { + name = kubernetes_secret.ghost_reconcile_pve.metadata[0].name + key = "token_secret" + } + } + } + volume_mount { + name = "script" + mount_path = "/script" + } + resources { + requests = { cpu = "10m", memory = "64Mi" } + limits = { memory = "128Mi" } + } + } + volume { + name = "script" + config_map { + name = kubernetes_config_map.ghost_reconcile_script.metadata[0].name + default_mode = "0555" + } + } + } + } + } + } + } + lifecycle { + ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config] # KYVERNO_LIFECYCLE_V1 + } + depends_on = [module.proxmox-csi] +}