hermes-agent: gate PVC on parked flag (clears PVCStuckPending)

The data_proxmox PVC is WaitForFirstConsumer; with the Deployment parked at
replicas=0 it had no consumer pod and sat Pending forever, falsely tripping
PVCStuckPending (which halts kured reboots). Introduce local.hermes_parked to
drive both replicas and the PVC count, so a parked service has no PVC at all.
Empty/never-bound PVC removed; recreated automatically when un-parked.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Viktor Barzin 2026-05-31 15:19:28 +00:00
parent 7b6a0e70af
commit 935fb07df7

View file

@ -3,6 +3,15 @@ variable "tls_secret_name" {
sensitive = true
}
locals {
# Single switch for parking Hermes: it drives both the Deployment's replica
# count and the `count` of the data PVC.
#
# Parked since 2026-04-22 (PVC /opt/data perms bug). While parked we run zero
# replicas AND skip the PVC entirely: a WaitForFirstConsumer PVC with no
# consumer pod sits Pending forever and falsely trips PVCStuckPending, which
# halts kured node reboots. Flip to false to bring Hermes back; that
# recreates the PVC and scales the Deployment to 1 in a single apply.
hermes_parked = true
}
# --- Namespace ---
resource "kubernetes_namespace" "hermes_agent" {
@ -57,6 +66,7 @@ resource "kubernetes_manifest" "external_secret" {
# --- Storage ---
resource "kubernetes_persistent_volume_claim" "data_proxmox" {
count = local.hermes_parked ? 0 : 1
wait_until_bound = false
metadata {
name = "hermes-agent-data-proxmox"
@ -228,8 +238,10 @@ resource "kubernetes_deployment" "hermes_agent" {
strategy {
type = "Recreate"
}
# Disabled 2026-04-22 main container fails with "mkdir: cannot create directory '/opt/data': Permission denied" (fsGroup/runAsUser mismatch vs init container). Re-enable after fixing PVC permissions.
replicas = 0
# Parked 2026-04-22 — main container fails "mkdir: cannot create directory
# '/opt/data': Permission denied" (fsGroup/runAsUser mismatch vs init
# container). Fix PVC perms before un-parking (set local.hermes_parked = false).
replicas = local.hermes_parked ? 0 : 1
selector {
match_labels = {
app = "hermes-agent"
@ -362,7 +374,9 @@ resource "kubernetes_deployment" "hermes_agent" {
volume {
name = "data"
persistent_volume_claim {
claim_name = kubernetes_persistent_volume_claim.data_proxmox.metadata[0].name
# Static name — the PVC resource is count-gated by local.hermes_parked,
# so we can't reference the (possibly zero-instance) resource here.
claim_name = "hermes-agent-data-proxmox"
}
}
volume {