From 3ef860b2be15d3e573d476c5625fc543d157c609 Mon Sep 17 00:00:00 2001
From: Viktor Barzin <vbarzin@gmail.com>
Date: Sat, 16 May 2026 12:06:30 +0000
Subject: [PATCH] kured + cnpg: drain-safe defaults ahead of Monday reboot wave
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three defensive moves to make the kured rolling-reboot cycle survive
edge cases without operator intervention:

kured (stacks/kured/main.tf):
  - Set `configuration.drainTimeout = "30m"`. The default is 0
    (unlimited): if a future PDB or finalizer stalls the drain, kured
    retries forever and the node stays cordoned silently. 30m caps that
    silent-failure window — after the timeout kured logs the abort, the
    node returns to Schedulable so cluster capacity isn't lost, and
    kured waits for the next period before trying again. Lets us fail
    closed instead of fail-silent.

CNPG pg-cluster (stacks/dbaas/modules/dbaas/main.tf):
  - Bump instances 2 → 3 (1 primary + 2 replicas). With 2 instances the
    failover during a primary-node drain depended on the lone replica
    being caught up; a WAL backlog would stall the drain until the
    replica was current. With 3 instances CNPG has two promotion
    candidates, so a single lagging replica no longer blocks the
    switchover, and the PDB's `minAvailable=1` on the primary selector
    stays satisfied throughout. Cost: one additional 20Gi PVC on
    proxmox-lvm-encrypted (about 35Gi after autoresize) and +3Gi of pod
    memory limit.
  - Updated `triggers.instances` to match, so that the null_resource's
    local-exec actually re-runs (kubectl apply with the new spec). The
    YAML is the source of truth, but the trigger change is what tells
    Terraform to re-run the provisioner.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 stacks/dbaas/modules/dbaas/main.tf | 10 ++++++++--
 stacks/kured/main.tf               |  8 ++++++++
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/stacks/dbaas/modules/dbaas/main.tf b/stacks/dbaas/modules/dbaas/main.tf
index bc173513..e1ce4b36 100644
--- a/stacks/dbaas/modules/dbaas/main.tf
+++ b/stacks/dbaas/modules/dbaas/main.tf
@@ -1050,7 +1050,7 @@ module "ingress" {
 # Ensure the CNPG cluster manifest exists (idempotent kubectl apply)
 resource "null_resource" "pg_cluster" {
   triggers = {
-    instances     = "2"
+    instances     = "3"
     image         = "ghcr.io/cloudnative-pg/postgis:16"
     storage_size  = "20Gi"
     storage_class = "proxmox-lvm-encrypted"
@@ -1067,7 +1067,13 @@ resource "null_resource" "pg_cluster" {
         name: pg-cluster
         namespace: dbaas
       spec:
-        instances: 2
+        # 3 instances (1 primary + 2 replicas) so a single-node drain (e.g.
+        # kured's weekly OS-reboot wave) still leaves a promotion candidate
+        # immediately available for switchover. Previously 2; CNPG would
+        # still fail over with 2, but only if the lone replica was caught up
+        # — during a long WAL backlog the failover would stall the drain.
+        # Bumped 2026-05-16 ahead of Monday's first post-fix kured cycle.
+        instances: 3
         imageName: ghcr.io/cloudnative-pg/postgis:16
         postgresql:
           parameters:
diff --git a/stacks/kured/main.tf b/stacks/kured/main.tf
index 514e07da..9b834468 100644
--- a/stacks/kured/main.tf
+++ b/stacks/kured/main.tf
@@ -72,6 +72,14 @@ resource "helm_release" "kured" {
       notifyUrl      = data.vault_kv_secret_v2.secrets.data["slack_kured_webhook"]
       concurrency    = 1
       rebootDelay    = "30s"
+      # Fail closed instead of looping forever. Default is 0 (unlimited) — if
+      # a future PDB or finalizer stalls drain, kured retries indefinitely and
+      # the node stays cordoned silently. 30m gives CNPG / shared-store
+      # Anubis / any other stateful workload plenty of time to settle, but
+      # caps the silent-failure window. After timeout kured logs the abort
+      # and waits for the next period; the node returns to Schedulable so the
+      # cluster doesn't lose capacity. Fixed 2026-05-16.
+      drainTimeout = "30m"
       # Halt rolling reboots when ANY firing Prometheus alert is not in the
       # ignore-list. The ignore-list excludes self-referential / always-firing
       # alerts that would otherwise deadlock kured. alertFilterMatchOnly stays