From 935fb07df7be670b3003ce2c0adcf0e9ebf0592e Mon Sep 17 00:00:00 2001
From: Viktor Barzin
Date: Sun, 31 May 2026 15:19:28 +0000
Subject: [PATCH] hermes-agent: gate PVC on parked flag (clears PVCStuckPending)

The data_proxmox PVC is WaitForFirstConsumer; with the Deployment parked
at replicas=0 it had no consumer pod and sat Pending forever, falsely
tripping PVCStuckPending (which halts kured reboots).

Introduce local.hermes_parked to drive both replicas and the PVC count,
so a parked service has no PVC at all. Empty/never-bound PVC removed;
recreated automatically when un-parked.

Co-Authored-By: Claude Opus 4.7
---
Reviewer note (not part of the commit message): claim_name is now a string
literal that must stay identical to the PVC's metadata.name, and within
these hunks the Deployment no longer references the PVC resource, so
Terraform orders the two only if a depends_on exists elsewhere in the
file -- TODO confirm before un-parking.

 stacks/hermes-agent/main.tf | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/stacks/hermes-agent/main.tf b/stacks/hermes-agent/main.tf
index b4345982..dced38c4 100644
--- a/stacks/hermes-agent/main.tf
+++ b/stacks/hermes-agent/main.tf
@@ -3,6 +3,15 @@ variable "tls_secret_name" {
   sensitive = true
 }
 
+locals {
+  # Parked since 2026-04-22 (PVC /opt/data perms bug). While parked we run zero
+  # replicas AND skip the PVC entirely: a WaitForFirstConsumer PVC with no
+  # consumer pod sits Pending forever and falsely trips PVCStuckPending, which
+  # halts kured node reboots. Flip to false to bring Hermes back — that
+  # recreates the PVC and scales the Deployment to 1 in a single apply.
+  hermes_parked = true
+}
+
 # --- Namespace ---
 
 resource "kubernetes_namespace" "hermes_agent" {
@@ -57,6 +66,7 @@ resource "kubernetes_manifest" "external_secret" {
 # --- Storage ---
 
 resource "kubernetes_persistent_volume_claim" "data_proxmox" {
+  count            = local.hermes_parked ? 0 : 1
   wait_until_bound = false
   metadata {
     name = "hermes-agent-data-proxmox"
@@ -228,8 +238,10 @@ resource "kubernetes_deployment" "hermes_agent" {
     strategy {
       type = "Recreate"
     }
-    # Disabled 2026-04-22 — main container fails with "mkdir: cannot create directory '/opt/data': Permission denied" (fsGroup/runAsUser mismatch vs init container). Re-enable after fixing PVC permissions.
-    replicas = 0
+    # Parked 2026-04-22 — main container fails "mkdir: cannot create directory
+    # '/opt/data': Permission denied" (fsGroup/runAsUser mismatch vs init
+    # container). Fix PVC perms before un-parking (set local.hermes_parked = false).
+    replicas = local.hermes_parked ? 0 : 1
     selector {
       match_labels = {
         app = "hermes-agent"
@@ -362,7 +374,9 @@ resource "kubernetes_deployment" "hermes_agent" {
         volume {
           name = "data"
           persistent_volume_claim {
-            claim_name = kubernetes_persistent_volume_claim.data_proxmox.metadata[0].name
+            # Static name — the PVC resource is count-gated by local.hermes_parked,
+            # so we can't reference the (possibly zero-instance) resource here.
+            claim_name = "hermes-agent-data-proxmox"
           }
         }
         volume {