proxmox-csi: auto-reconcile CronJob to detach ghost disks (code-dfjn prevention)
Closes the ghost-disk doom loop by construction (failed detach -> orphan scsiN with no VolumeAttachment -> invisible oversubscription -> query-pci wedge). Every 15min csi-ghost-reconcile compares each worker VM's real scsi disks (Proxmox API) vs k8s VolumeAttachments and safely detaches ghosts (PUT .../config delete=scsiN -> frees the LUN slot, retains the LV). - detection mirrors cluster-health check #47 - SAFETY: only vm-9999-pvc scsi with no matching VA; 60s re-confirm; per-run cap 5 - scoped CSI API token (VM.Config.Disk), not root SSH; k8s API via injected ClusterIP - verified live: read 66 VAs, 0 ghosts, no false positives - pushes csi_ghosts_detected/detached to Pushgateway Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
e311cbe103
commit
355ca3ee91
1 changed files with 241 additions and 0 deletions
241
stacks/proxmox-csi/ghost-reconcile.tf
Normal file
241
stacks/proxmox-csi/ghost-reconcile.tf
Normal file
|
|
@ -0,0 +1,241 @@
|
|||
# Ghost-disk auto-reconcile (beads code-dfjn — the prevention half).
|
||||
#
|
||||
# proxmox-csi hot-plugs each PVC as a virtio-scsi disk via the Proxmox API.
|
||||
# A failed detach (query-pci QMP timeout on a disk-heavy VM) leaves a "ghost":
|
||||
# a scsiN entry in the VM config with NO matching k8s VolumeAttachment. Ghosts
|
||||
# are invisible to the NodeVolumeLimits scheduler (it counts VAs, not real scsi
|
||||
# disks), so the node gets oversubscribed until query-pci wedges — the doom loop.
|
||||
#
|
||||
# This CronJob closes the loop: every 15 min it compares each worker VM's real
|
||||
# scsi disks (Proxmox API) against k8s VolumeAttachments, and safely detaches any
|
||||
# ghost (`PUT .../config delete=scsiN` — frees the LUN slot, retains the LV, same
|
||||
# as `qm set --delete scsiN`). Detection mirrors cluster-health check #47.
|
||||
#
|
||||
# SAFETY: only acts on `vm-9999-pvc-*` scsi entries with NO VolumeAttachment for
|
||||
# that PV on that node; re-confirms after a 60s sleep (so an in-flight attach is
|
||||
# never caught); caps detaches per run. Uses the scoped CSI API token (VM.Config.Disk),
|
||||
# NOT root SSH. Detach is non-destructive to data (the LV is retained).
|
||||
|
||||
# Namespace that every ghost-reconcile object in this file is created in.
locals {
  ghost_reconcile_ns = "proxmox-csi"
}
|
||||
|
||||
# Proxmox API token for the reconcile job. Both halves are read from Vault and
# surfaced to the CronJob container as PVE_TOKEN_ID / PVE_TOKEN_SECRET.
resource "kubernetes_secret" "ghost_reconcile_pve" {
  depends_on = [module.proxmox-csi]

  metadata {
    namespace = local.ghost_reconcile_ns
    name      = "csi-ghost-reconcile-pve"
  }

  data = {
    token_id     = data.vault_kv_secret_v2.secrets.data["proxmox_csi_token_id"]
    token_secret = data.vault_kv_secret_v2.secrets.data["proxmox_csi_token_secret"]
  }
}
|
||||
|
||||
# Identity the reconcile pod runs as; its mounted token is what the script uses
# to list VolumeAttachments.
resource "kubernetes_service_account" "ghost_reconcile" {
  depends_on = [module.proxmox-csi]

  metadata {
    namespace = local.ghost_reconcile_ns
    name      = "csi-ghost-reconcile"
  }
}
|
||||
|
||||
# Read-only access to VolumeAttachments (cluster-scoped, hence a ClusterRole).
# The job never writes to the Kubernetes API.
resource "kubernetes_cluster_role" "ghost_reconcile" {
  metadata {
    name = "csi-ghost-reconcile"
  }

  rule {
    api_groups = ["storage.k8s.io"]
    resources  = ["volumeattachments"]
    verbs      = ["get", "list"]
  }
}
|
||||
|
||||
# Grants the read-only ClusterRole above to the reconcile ServiceAccount.
resource "kubernetes_cluster_role_binding" "ghost_reconcile" {
  metadata {
    name = "csi-ghost-reconcile"
  }

  subject {
    kind      = "ServiceAccount"
    namespace = local.ghost_reconcile_ns
    name      = kubernetes_service_account.ghost_reconcile.metadata[0].name
  }

  role_ref {
    api_group = "rbac.authorization.k8s.io"
    kind      = "ClusterRole"
    name      = kubernetes_cluster_role.ghost_reconcile.metadata[0].name
  }
}
|
||||
|
||||
resource "kubernetes_config_map" "ghost_reconcile_script" {
|
||||
metadata {
|
||||
name = "csi-ghost-reconcile-script"
|
||||
namespace = local.ghost_reconcile_ns
|
||||
}
|
||||
data = {
|
||||
"reconcile.py" = <<-PY
|
||||
import json, os, ssl, sys, time, urllib.request, urllib.parse
|
||||
|
||||
# ---- Runtime configuration (all from the pod environment) -------------------

# DRY_RUN is parsed leniently ("true", "True", "1", "yes", "on") so that a
# differently-cased value cannot silently turn a dry run into live detaches.
DRY = os.environ.get("DRY_RUN", "false").strip().lower() in ("1", "true", "yes", "on")
# Upper bound on the number of disks detached in a single run.
CAP = int(os.environ.get("MAX_DETACH", "5"))
# Base URL of the Proxmox API (trailing slash removed so paths can be appended).
PVE = os.environ["PVE_URL"].rstrip("/")
# Value that follows "PVEAPIToken=" in the Authorization header: "<id>=<secret>".
PVE_TOK = os.environ["PVE_TOKEN_ID"] + "=" + os.environ["PVE_TOKEN_SECRET"]
# Pushgateway base URL; empty disables the metric push.
PG = os.environ.get("PUSHGATEWAY", "")
# Proxmox VMID -> Kubernetes node name of the worker running in that VM.
NODES = {
    201: "k8s-node1",
    202: "k8s-node2",
    203: "k8s-node3",
    204: "k8s-node4",
    205: "k8s-node5",
    206: "k8s-node6",
}

# ---- Kubernetes API access (in-cluster ServiceAccount credentials) ----------

with open("/var/run/secrets/kubernetes.io/serviceaccount/token", encoding="utf-8") as _token_file:
    _ktok = _token_file.read().strip()
_kctx = ssl.create_default_context(cafile="/var/run/secrets/kubernetes.io/serviceaccount/ca.crt")
_kctx.check_hostname = False  # reach the API by injected ClusterIP (cluster DNS may not resolve in this pod); CA chain still verified
_kapi = "https://%s:%s" % (os.environ["KUBERNETES_SERVICE_HOST"], os.environ.get("KUBERNETES_SERVICE_PORT", "443"))

# ---- Proxmox API access ------------------------------------------------------

# NOTE(review): certificate verification is fully disabled for Proxmox, so the
# API token is sent over TLS that is not authenticated. If the PVE CA can be
# mounted into the pod, load it here instead of using CERT_NONE.
_pctx = ssl.create_default_context()
_pctx.check_hostname = False
_pctx.verify_mode = ssl.CERT_NONE
|
||||
|
||||
def k8s(path):
    """GET ``path`` from the Kubernetes API and return the decoded JSON body.

    Authenticates with the pod's ServiceAccount bearer token and verifies the
    server against the in-cluster CA bundle. Any urllib error (HTTP status,
    timeout after 20s, TLS failure) propagates to the caller.
    """
    req = urllib.request.Request(_kapi + path, headers={"Authorization": "Bearer " + _ktok})
    # Context manager closes the connection even if JSON decoding fails.
    with urllib.request.urlopen(req, context=_kctx, timeout=20) as resp:
        return json.load(resp)
|
||||
|
||||
def pve(path, method="GET", data=None):
    """Call the Proxmox API at ``PVE + path`` and return the decoded JSON body.

    ``data`` (a mapping), when non-empty, is sent form-urlencoded as the
    request body. Authenticates with the ``PVEAPIToken`` header. Any urllib
    error (HTTP status, timeout after 20s) propagates to the caller.
    """
    body = urllib.parse.urlencode(data).encode() if data else None
    req = urllib.request.Request(
        PVE + path,
        data=body,
        method=method,
        headers={"Authorization": "PVEAPIToken=" + PVE_TOK},
    )
    # Context manager closes the connection even if JSON decoding fails.
    with urllib.request.urlopen(req, context=_pctx, timeout=20) as resp:
        return json.load(resp)
|
||||
|
||||
def attached():
    """Return the set of ``(node_name, volume_id)`` pairs Kubernetes considers attached.

    Only VolumeAttachments handled by the Proxmox CSI attacher and reporting
    ``status.attached`` are included. ``volume_id`` is the PV name with its
    leading ``pvc-`` removed, which is the form compared against the
    ``vm-9999-pvc-<id>`` disk names found in the VM configs.
    """
    out = set()
    for va in k8s("/apis/storage.k8s.io/v1/volumeattachments").get("items") or []:
        # "or {}" guards against keys that are present but JSON null.
        spec = va.get("spec") or {}
        if spec.get("attacher") != "csi.proxmox.sinextra.dev":
            continue
        # NOTE(review): a VolumeAttachment that exists but is not yet attached
        # is ignored here, so a disk plugged by an in-flight attach looks like
        # a ghost on this pass; the caller's delayed re-check is what guards
        # that case. Confirm this is the intended definition of "no VA".
        if not (va.get("status") or {}).get("attached"):
            continue
        pv = (spec.get("source") or {}).get("persistentVolumeName", "")
        if pv:
            # Strip only the leading "pvc-"; str.replace would also remove any
            # later occurrence inside the name.
            out.add((spec.get("nodeName", ""), pv.removeprefix("pvc-")))
    return out
|
||||
|
||||
def find_ghosts(att):
    """List CSI disks present in a worker VM config but absent from ``att``.

    ``att`` is a set of ``(node_name, volume_id)`` pairs. For every VM in
    ``NODES`` the Proxmox config is fetched and each ``scsi<N>`` entry whose
    value names a ``vm-9999-pvc-<id>`` volume is checked against ``att``.

    Returns a list of ``(vmid, node_name, scsi_key, volume_id)`` tuples.
    """
    marker = "vm-9999-pvc-"
    found = []
    for vmid, node in NODES.items():
        config = pve("/nodes/pve/qemu/%d/config" % vmid).get("data", {})
        for slot, raw in config.items():
            # Only real disk slots (scsi0, scsi1, ...); keys such as "scsihw"
            # have a non-numeric suffix and are skipped.
            is_disk_slot = slot.startswith("scsi") and slot[4:].isdigit()
            if not is_disk_slot:
                continue
            text = str(raw)
            if marker not in text:
                continue
            # The volume id runs from the marker up to the first option comma.
            volume_id = text.split(marker)[1].split(",")[0]
            if (node, volume_id) in att:
                continue
            found.append((vmid, node, slot, volume_id))
    return found
|
||||
|
||||
def _detach_ghost(vmid, scsi, uuid):
    """Delete ``scsi`` from VM ``vmid`` only if it still holds ghost volume ``uuid``.

    The VM config is re-read immediately before the delete because the slot
    may have been freed and reused by a live volume since detection. The
    config digest from that read is sent with the PUT so Proxmox rejects the
    change if the config was modified in between.

    Returns True when the delete was issued, False when the slot no longer
    holds the expected volume. API errors propagate to the caller.
    """
    path = "/nodes/pve/qemu/%d/config" % vmid
    cfg = pve(path).get("data") or {}
    # Trailing comma on both sides makes this an exact volume-id match.
    if ("vm-9999-pvc-%s," % uuid) not in (str(cfg.get(scsi, "")) + ","):
        return False
    req = {"delete": scsi}
    if cfg.get("digest"):
        req["digest"] = cfg["digest"]
    pve(path, method="PUT", data=req)
    return True


# Pass 1: detect and report candidates.
att = attached()
ghosts = find_ghosts(att)
print("[reconcile] attached_VAs=%d ghosts=%d dry=%s" % (len(att), len(ghosts), DRY), flush=True)
for vmid, node, scsi, uuid in ghosts:
    print(" ghost: VM%d/%s %s -> pvc-%s" % (vmid, node, scsi, uuid), flush=True)

detached = 0
failed = 0
if ghosts:
    time.sleep(60)  # re-confirm: never act on an in-flight attach
    att2 = attached()
    confirmed = [x for x in ghosts if (x[1], x[3]) not in att2]
    print("[reconcile] confirmed after 60s recheck: %d" % len(confirmed), flush=True)
    for vmid, node, scsi, uuid in confirmed:
        if detached >= CAP:
            print("[reconcile] hit per-run cap %d, stopping" % CAP, flush=True)
            break
        if DRY:
            print(" DRY would detach VM%d %s (pvc-%s)" % (vmid, scsi, uuid), flush=True)
            continue
        # Best effort per disk: one failure must not block the remaining ones.
        try:
            if _detach_ghost(vmid, scsi, uuid):
                print(" DETACHED VM%d %s (pvc-%s)" % (vmid, scsi, uuid), flush=True)
                detached += 1
            else:
                print(" SKIP VM%d %s: slot no longer holds pvc-%s" % (vmid, scsi, uuid), flush=True)
        except Exception as e:
            failed += 1
            print(" FAILED detach VM%d %s: %s" % (vmid, scsi, e), flush=True)
else:
    print("[reconcile] no ghosts — all nodes reconciled", flush=True)

# Metric push is best effort; a Pushgateway outage must not fail the job.
if PG:
    try:
        body = (
            "csi_ghosts_detected %d\n"
            "csi_ghosts_detached %d\n"
            "csi_ghosts_detach_failed %d\n"
            "csi_ghost_reconcile_last_run %d\n"
        ) % (len(ghosts), detached, failed, int(time.time()))
        push = urllib.request.Request(
            PG.rstrip("/") + "/metrics/job/csi-ghost-reconcile",
            data=body.encode(),
            method="PUT",
        )
        with urllib.request.urlopen(push, timeout=10):
            pass
    except Exception as e:
        print("[reconcile] metric push failed: %s" % e, flush=True)
|
||||
PY
|
||||
}
|
||||
depends_on = [module.proxmox-csi]
|
||||
}
|
||||
|
||||
# Runs reconcile.py from the ConfigMap every 15 minutes in a stock Python image.
resource "kubernetes_cron_job_v1" "ghost_reconcile" {
  depends_on = [module.proxmox-csi]

  metadata {
    namespace = local.ghost_reconcile_ns
    name      = "csi-ghost-reconcile"
  }

  spec {
    schedule           = "*/15 * * * *"
    concurrency_policy = "Forbid" # a new run is skipped while the previous one is still active

    successful_jobs_history_limit = 3
    failed_jobs_history_limit     = 3

    job_template {
      metadata {}

      spec {
        backoff_limit              = 1
        active_deadline_seconds    = 300 # must stay above the script's 60s re-confirm sleep plus its API calls
        ttl_seconds_after_finished = 600

        template {
          metadata {}

          spec {
            restart_policy       = "Never"
            service_account_name = kubernetes_service_account.ghost_reconcile.metadata[0].name

            # The script is mounted from the ConfigMap, readable and executable.
            volume {
              name = "script"
              config_map {
                name         = kubernetes_config_map.ghost_reconcile_script.metadata[0].name
                default_mode = "0555"
              }
            }

            container {
              name              = "reconcile"
              image             = "python:3.13-alpine"
              image_pull_policy = "IfNotPresent"
              command           = ["python3", "/script/reconcile.py"]

              # Plain settings read by the script.
              env {
                name  = "PVE_URL"
                value = "https://192.168.1.127:8006/api2/json"
              }
              env {
                name  = "DRY_RUN"
                value = "false"
              }
              env {
                name  = "MAX_DETACH"
                value = "5"
              }
              env {
                name  = "PUSHGATEWAY"
                value = "http://prometheus-prometheus-pushgateway.monitoring.svc.cluster.local:9091"
              }

              # Proxmox API token, injected from the Secret.
              env {
                name = "PVE_TOKEN_ID"
                value_from {
                  secret_key_ref {
                    name = kubernetes_secret.ghost_reconcile_pve.metadata[0].name
                    key  = "token_id"
                  }
                }
              }
              env {
                name = "PVE_TOKEN_SECRET"
                value_from {
                  secret_key_ref {
                    name = kubernetes_secret.ghost_reconcile_pve.metadata[0].name
                    key  = "token_secret"
                  }
                }
              }

              volume_mount {
                name       = "script"
                mount_path = "/script"
              }

              resources {
                requests = {
                  cpu    = "10m"
                  memory = "64Mi"
                }
                limits = {
                  memory = "128Mi"
                }
              }
            }
          }
        }
      }
    }
  }

  lifecycle {
    ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config] # KYVERNO_LIFECYCLE_V1
  }
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue