proxmox-csi: auto-reconcile CronJob to detach ghost disks (code-dfjn prevention)
All checks were successful
ci/woodpecker/push/default Pipeline was successful
ci/woodpecker/push/build-cli Pipeline was successful

Closes the ghost-disk doom loop by construction (failed detach -> orphan scsiN
with no VolumeAttachment -> invisible oversubscription -> query-pci wedge).
Every 15min csi-ghost-reconcile compares each worker VM's real scsi disks
(Proxmox API) vs k8s VolumeAttachments and safely detaches ghosts (PUT
.../config delete=scsiN -> frees the LUN slot, retains the LV).

- detection mirrors cluster-health check #47
- SAFETY: only vm-9999-pvc scsi with no matching VA; 60s re-confirm; per-run cap 5
- scoped CSI API token (VM.Config.Disk), not root SSH; k8s API via injected ClusterIP
- verified live: read 66 VAs, 0 ghosts, no false positives
- pushes csi_ghosts_detected/detached to Pushgateway

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Viktor Barzin 2026-06-05 21:25:36 +00:00
parent e311cbe103
commit 355ca3ee91

View file

@ -0,0 +1,241 @@
# Ghost-disk auto-reconcile (beads code-dfjn the prevention half).
#
# proxmox-csi hot-plugs each PVC as a virtio-scsi disk via the Proxmox API.
# A failed detach (query-pci QMP timeout on a disk-heavy VM) leaves a "ghost":
# a scsiN entry in the VM config with NO matching k8s VolumeAttachment. Ghosts
# are invisible to the NodeVolumeLimits scheduler (it counts VAs, not real scsi
# disks), so the node gets oversubscribed until query-pci wedges the doom loop.
#
# This CronJob closes the loop: every 15 min it compares each worker VM's real
# scsi disks (Proxmox API) against k8s VolumeAttachments, and safely detaches any
# ghost (`PUT .../config delete=scsiN` frees the LUN slot, retains the LV, same
# as `qm set --delete scsiN`). Detection mirrors cluster-health check #47.
#
# SAFETY: only acts on `vm-9999-pvc-*` scsi entries with NO VolumeAttachment for
# that PV on that node; re-confirms after a 60s sleep (so an in-flight attach is
# never caught); caps detaches per run. Uses the scoped CSI API token (VM.Config.Disk),
# NOT root SSH. Detach is non-destructive to data (the LV is retained).
locals {
ghost_reconcile_ns = "proxmox-csi"
}
resource "kubernetes_secret" "ghost_reconcile_pve" {
metadata {
name = "csi-ghost-reconcile-pve"
namespace = local.ghost_reconcile_ns
}
data = {
token_id = data.vault_kv_secret_v2.secrets.data["proxmox_csi_token_id"]
token_secret = data.vault_kv_secret_v2.secrets.data["proxmox_csi_token_secret"]
}
depends_on = [module.proxmox-csi]
}
resource "kubernetes_service_account" "ghost_reconcile" {
metadata {
name = "csi-ghost-reconcile"
namespace = local.ghost_reconcile_ns
}
depends_on = [module.proxmox-csi]
}
resource "kubernetes_cluster_role" "ghost_reconcile" {
metadata { name = "csi-ghost-reconcile" }
rule {
api_groups = ["storage.k8s.io"]
resources = ["volumeattachments"]
verbs = ["get", "list"]
}
}
resource "kubernetes_cluster_role_binding" "ghost_reconcile" {
metadata { name = "csi-ghost-reconcile" }
role_ref {
api_group = "rbac.authorization.k8s.io"
kind = "ClusterRole"
name = kubernetes_cluster_role.ghost_reconcile.metadata[0].name
}
subject {
kind = "ServiceAccount"
name = kubernetes_service_account.ghost_reconcile.metadata[0].name
namespace = local.ghost_reconcile_ns
}
}
resource "kubernetes_config_map" "ghost_reconcile_script" {
metadata {
name = "csi-ghost-reconcile-script"
namespace = local.ghost_reconcile_ns
}
data = {
"reconcile.py" = <<-PY
import json, os, ssl, sys, time, urllib.request, urllib.parse
DRY = os.environ.get("DRY_RUN", "false") == "true"
CAP = int(os.environ.get("MAX_DETACH", "5"))
PVE = os.environ["PVE_URL"].rstrip("/")
PVE_TOK = os.environ["PVE_TOKEN_ID"] + "=" + os.environ["PVE_TOKEN_SECRET"]
PG = os.environ.get("PUSHGATEWAY", "")
NODES = {201:"k8s-node1",202:"k8s-node2",203:"k8s-node3",204:"k8s-node4",205:"k8s-node5",206:"k8s-node6"}
_ktok = open("/var/run/secrets/kubernetes.io/serviceaccount/token").read().strip()
_kctx = ssl.create_default_context(cafile="/var/run/secrets/kubernetes.io/serviceaccount/ca.crt")
_kctx.check_hostname = False # reach the API by injected ClusterIP (cluster DNS may not resolve in this pod); CA chain still verified
_kapi = "https://%s:%s" % (os.environ["KUBERNETES_SERVICE_HOST"], os.environ.get("KUBERNETES_SERVICE_PORT", "443"))
_pctx = ssl.create_default_context(); _pctx.check_hostname = False; _pctx.verify_mode = ssl.CERT_NONE
def k8s(path):
r = urllib.request.Request(_kapi + path, headers={"Authorization": "Bearer " + _ktok})
return json.load(urllib.request.urlopen(r, context=_kctx, timeout=20))
def pve(path, method="GET", data=None):
body = urllib.parse.urlencode(data).encode() if data else None
r = urllib.request.Request(PVE + path, data=body, method=method, headers={"Authorization": "PVEAPIToken=" + PVE_TOK})
return json.load(urllib.request.urlopen(r, context=_pctx, timeout=20))
def attached():
out = set()
for v in k8s("/apis/storage.k8s.io/v1/volumeattachments").get("items", []):
sp = v.get("spec", {})
if sp.get("attacher") != "csi.proxmox.sinextra.dev": continue
if not v.get("status", {}).get("attached"): continue
pv = (sp.get("source", {}) or {}).get("persistentVolumeName", "")
if pv: out.add((sp.get("nodeName", ""), pv.replace("pvc-", "")))
return out
def find_ghosts(att):
g = []
for vmid, node in NODES.items():
cfg = pve("/nodes/pve/qemu/%d/config" % vmid).get("data", {})
for key, val in cfg.items():
if not (key.startswith("scsi") and key[4:].isdigit()): continue
s = str(val)
if "vm-9999-pvc-" not in s: continue
uuid = s.split("vm-9999-pvc-")[1].split(",")[0]
if (node, uuid) not in att:
g.append((vmid, node, key, uuid))
return g
att = attached()
ghosts = find_ghosts(att)
print("[reconcile] attached_VAs=%d ghosts=%d dry=%s" % (len(att), len(ghosts), DRY), flush=True)
for vmid, node, scsi, uuid in ghosts:
print(" ghost: VM%d/%s %s -> pvc-%s" % (vmid, node, scsi, uuid), flush=True)
detached = 0
if ghosts:
time.sleep(60) # re-confirm: never act on an in-flight attach
att2 = attached()
confirmed = [x for x in ghosts if (x[1], x[3]) not in att2]
print("[reconcile] confirmed after 60s recheck: %d" % len(confirmed), flush=True)
for vmid, node, scsi, uuid in confirmed:
if detached >= CAP:
print("[reconcile] hit per-run cap %d, stopping" % CAP, flush=True); break
if DRY:
print(" DRY would detach VM%d %s (pvc-%s)" % (vmid, scsi, uuid), flush=True); continue
try:
pve("/nodes/pve/qemu/%d/config" % vmid, method="PUT", data={"delete": scsi})
print(" DETACHED VM%d %s (pvc-%s)" % (vmid, scsi, uuid), flush=True); detached += 1
except Exception as e:
print(" FAILED detach VM%d %s: %s" % (vmid, scsi, e), flush=True)
else:
print("[reconcile] no ghosts — all nodes reconciled", flush=True)
if PG:
try:
body = "csi_ghosts_detected %d\ncsi_ghosts_detached %d\ncsi_ghost_reconcile_last_run %d\n" % (len(ghosts), detached, int(time.time()))
urllib.request.urlopen(urllib.request.Request(PG.rstrip("/") + "/metrics/job/csi-ghost-reconcile", data=body.encode(), method="PUT"), timeout=10)
except Exception as e:
print("[reconcile] metric push failed: %s" % e, flush=True)
PY
}
depends_on = [module.proxmox-csi]
}
resource "kubernetes_cron_job_v1" "ghost_reconcile" {
metadata {
name = "csi-ghost-reconcile"
namespace = local.ghost_reconcile_ns
}
spec {
schedule = "*/15 * * * *"
concurrency_policy = "Forbid"
successful_jobs_history_limit = 3
failed_jobs_history_limit = 3
job_template {
metadata {}
spec {
backoff_limit = 1
active_deadline_seconds = 300
ttl_seconds_after_finished = 600
template {
metadata {}
spec {
service_account_name = kubernetes_service_account.ghost_reconcile.metadata[0].name
restart_policy = "Never"
container {
name = "reconcile"
image = "python:3.13-alpine"
image_pull_policy = "IfNotPresent"
command = ["python3", "/script/reconcile.py"]
env {
name = "PVE_URL"
value = "https://192.168.1.127:8006/api2/json"
}
env {
name = "DRY_RUN"
value = "false"
}
env {
name = "MAX_DETACH"
value = "5"
}
env {
name = "PUSHGATEWAY"
value = "http://prometheus-prometheus-pushgateway.monitoring.svc.cluster.local:9091"
}
env {
name = "PVE_TOKEN_ID"
value_from {
secret_key_ref {
name = kubernetes_secret.ghost_reconcile_pve.metadata[0].name
key = "token_id"
}
}
}
env {
name = "PVE_TOKEN_SECRET"
value_from {
secret_key_ref {
name = kubernetes_secret.ghost_reconcile_pve.metadata[0].name
key = "token_secret"
}
}
}
volume_mount {
name = "script"
mount_path = "/script"
}
resources {
requests = { cpu = "10m", memory = "64Mi" }
limits = { memory = "128Mi" }
}
}
volume {
name = "script"
config_map {
name = kubernetes_config_map.ghost_reconcile_script.metadata[0].name
default_mode = "0555"
}
}
}
}
}
}
}
lifecycle {
ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config] # KYVERNO_LIFECYCLE_V1
}
depends_on = [module.proxmox-csi]
}