hermes-agent: gate PVC on parked flag (clears PVCStuckPending)
The data_proxmox PVC uses WaitForFirstConsumer volume binding; with the Deployment parked at replicas=0 it had no consumer pod, so it sat Pending forever and falsely tripped PVCStuckPending (which halts kured reboots). Introduce local.hermes_parked to drive both the Deployment's replicas and the PVC count, so a parked service has no PVC at all. The existing empty, never-bound PVC is removed; it is recreated automatically when the service is un-parked.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
7b6a0e70af
commit
935fb07df7
1 changed file with 17 additions and 3 deletions
|
|
@ -3,6 +3,15 @@ variable "tls_secret_name" {
|
||||||
sensitive = true
|
sensitive = true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
locals {
|
||||||
|
# Parked since 2026-04-22 (PVC /opt/data perms bug). While parked we run zero
|
||||||
|
# replicas AND skip the PVC entirely: a WaitForFirstConsumer PVC with no
|
||||||
|
# consumer pod sits Pending forever and falsely trips PVCStuckPending, which
|
||||||
|
# halts kured node reboots. Once the /opt/data perms bug is fixed, flip to false to bring Hermes back — that
|
||||||
|
# recreates the PVC and scales the Deployment to 1 in a single apply.
|
||||||
|
hermes_parked = true
|
||||||
|
}
|
||||||
|
|
||||||
# --- Namespace ---
|
# --- Namespace ---
|
||||||
|
|
||||||
resource "kubernetes_namespace" "hermes_agent" {
|
resource "kubernetes_namespace" "hermes_agent" {
|
||||||
|
|
@ -57,6 +66,7 @@ resource "kubernetes_manifest" "external_secret" {
|
||||||
# --- Storage ---
|
# --- Storage ---
|
||||||
|
|
||||||
resource "kubernetes_persistent_volume_claim" "data_proxmox" {
|
resource "kubernetes_persistent_volume_claim" "data_proxmox" {
|
||||||
|
count = local.hermes_parked ? 0 : 1
|
||||||
wait_until_bound = false
|
wait_until_bound = false
|
||||||
metadata {
|
metadata {
|
||||||
name = "hermes-agent-data-proxmox"
|
name = "hermes-agent-data-proxmox"
|
||||||
|
|
@ -228,8 +238,10 @@ resource "kubernetes_deployment" "hermes_agent" {
|
||||||
strategy {
|
strategy {
|
||||||
type = "Recreate"
|
type = "Recreate"
|
||||||
}
|
}
|
||||||
# Disabled 2026-04-22 — main container fails with "mkdir: cannot create directory '/opt/data': Permission denied" (fsGroup/runAsUser mismatch vs init container). Re-enable after fixing PVC permissions.
|
# Parked 2026-04-22 — main container fails "mkdir: cannot create directory
|
||||||
replicas = 0
|
# '/opt/data': Permission denied" (fsGroup/runAsUser mismatch vs init
|
||||||
|
# container). Fix PVC perms before un-parking (set local.hermes_parked = false).
|
||||||
|
replicas = local.hermes_parked ? 0 : 1
|
||||||
selector {
|
selector {
|
||||||
match_labels = {
|
match_labels = {
|
||||||
app = "hermes-agent"
|
app = "hermes-agent"
|
||||||
|
|
@ -362,7 +374,9 @@ resource "kubernetes_deployment" "hermes_agent" {
|
||||||
volume {
|
volume {
|
||||||
name = "data"
|
name = "data"
|
||||||
persistent_volume_claim {
|
persistent_volume_claim {
|
||||||
claim_name = kubernetes_persistent_volume_claim.data_proxmox.metadata[0].name
|
# Static name — the PVC resource is count-gated by local.hermes_parked,
|
||||||
|
# so we can't reference the (possibly zero-instance) resource here. Keep this string in sync with the PVC's metadata.name.
|
||||||
|
claim_name = "hermes-agent-data-proxmox"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
volume {
|
volume {
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue