diff --git a/stacks/dbaas/modules/dbaas/main.tf b/stacks/dbaas/modules/dbaas/main.tf index bc173513..e1ce4b36 100644 --- a/stacks/dbaas/modules/dbaas/main.tf +++ b/stacks/dbaas/modules/dbaas/main.tf @@ -1050,7 +1050,7 @@ module "ingress" { # Ensure the CNPG cluster manifest exists (idempotent kubectl apply) resource "null_resource" "pg_cluster" { triggers = { - instances = "2" + instances = "3" image = "ghcr.io/cloudnative-pg/postgis:16" storage_size = "20Gi" storage_class = "proxmox-lvm-encrypted" @@ -1067,7 +1067,13 @@ resource "null_resource" "pg_cluster" { name: pg-cluster namespace: dbaas spec: - instances: 2 + # 3 instances (1 primary + 2 replicas) so a single-node drain (e.g. + # kured's weekly OS-reboot wave) still leaves a primary candidate + # immediately available for switchover. Previously 2; CNPG would + # still fail over with 2 but only if the lone replica was caught up + # — during a long WAL backlog the failover would stall the drain. + # Bumped 2026-05-16 ahead of Monday's first post-fix kured cycle. + instances: 3 imageName: ghcr.io/cloudnative-pg/postgis:16 postgresql: parameters: diff --git a/stacks/kured/main.tf b/stacks/kured/main.tf index 514e07da..9b834468 100644 --- a/stacks/kured/main.tf +++ b/stacks/kured/main.tf @@ -72,6 +72,14 @@ resource "helm_release" "kured" { notifyUrl = data.vault_kv_secret_v2.secrets.data["slack_kured_webhook"] concurrency = 1 rebootDelay = "30s" + # Fail closed instead of looping forever. Default is 0 (unlimited) — if + # a future PDB or finalizer stalls drain, kured retries indefinitely and + # the node stays cordoned silently. 30m gives CNPG / shared-store + # Anubis / any other stateful workload plenty of time to settle, but + # caps the silent-failure window. After timeout kured logs the abort, + # uncordons the node and waits for the next period, so the cluster + # doesn't lose capacity. Fixed 2026-05-16. 
+ drainTimeout = "30m" # Halt rolling reboots when ANY firing Prometheus alert is not in the # ignore-list. The ignore-list excludes self-referential / always-firing # alerts that would otherwise deadlock kured. alertFilterMatchOnly stays