From 12b4f6f81a947cf2691b83d5e1aea38018ec52e5 Mon Sep 17 00:00:00 2001
From: Viktor Barzin <vbarzin@gmail.com>
Date: Tue, 26 May 2026 09:00:37 +0000
Subject: [PATCH] dbaas: require pod anti-affinity on pg-cluster (one PG per
 node)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Default CNPG affinity was `preferred` (soft). During the 2026-05-26
node4 outage, all 3 pg-cluster pods drifted onto k8s-node1. Losing
that node would have taken the whole PG cluster down (no quorum), and
the 9.2 GiB pg-cluster footprint was the dominant reason frigate
couldn't fit on the GPU node.

With 3 instances + 4 worker nodes, `required` is safe under 1-node
drain (3 distinct nodes always available, even excluding the drained
one).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 stacks/dbaas/modules/dbaas/main.tf | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/stacks/dbaas/modules/dbaas/main.tf b/stacks/dbaas/modules/dbaas/main.tf
index e1f39f76..94236930 100644
--- a/stacks/dbaas/modules/dbaas/main.tf
+++ b/stacks/dbaas/modules/dbaas/main.tf
@@ -1088,6 +1088,7 @@ resource "null_resource" "pg_cluster" {
     storage_class = "proxmox-lvm-encrypted"
     memory_limit  = "3Gi"
     pg_params     = "v3-shared1024-walcomp-workmem16-max200"
+    affinity      = "required-hostname-v1"
   }
 
   provisioner "local-exec" {
@@ -1106,6 +1107,15 @@ resource "null_resource" "pg_cluster" {
         # — during a long WAL backlog the failover would stall the drain.
         # Bumped 2026-05-16 ahead of Monday's first post-fix kured cycle.
         instances: 3
+        # Hard anti-affinity: force one PG instance per node. Default is
+        # `preferred` which let all 3 pods collapse onto k8s-node1 during
+        # the 2026-05-26 node4 outage — losing node1 would have killed the
+        # whole cluster (no quorum). With 3 instances + 4 worker nodes,
+        # `required` is safe under 1-node drain.
+        affinity:
+          enablePodAntiAffinity: true
+          podAntiAffinityType: required
+          topologyKey: kubernetes.io/hostname
         imageName: ghcr.io/cloudnative-pg/postgis:16
         postgresql:
           parameters: