From d7953322ddb91b442616334aad87b01d5afeb7be Mon Sep 17 00:00:00 2001
From: Viktor Barzin <vbarzin@gmail.com>
Date: Wed, 11 Mar 2026 11:43:00 +0000
Subject: [PATCH] fix cluster health: pin actualbudget, spread MySQL, scale
 grampsweb, fix GPU toleration

- Pin actualbudget/actual-server from edge to 26.3.0 (all 3 instances) to
  prevent recurring migration breakage from rolling nightly builds
- Add podAntiAffinity to MySQL InnoDB Cluster to spread replicas across nodes,
  relieving memory pressure on k8s-node4; also add k8s-node1 to the existing
  hostname NotIn list (alongside k8s-node2)
- Scale grampsweb to 0 replicas (unused, consuming 1.7Gi memory)
- Add GPU toleration Kyverno policy to Terraform using patchesJson6902 instead
  of patchStrategicMerge to fix toleration array being overwritten (caused
  caretta DaemonSet pod to be unable to schedule on k8s-master)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 stacks/actualbudget/main.tf                   | 10 ++--
 stacks/grampsweb/main.tf                      |  4 +-
 stacks/platform/modules/dbaas/main.tf         | 14 ++++-
 .../modules/kyverno/resource-governance.tf    | 52 +++++++++++++++++++
 4 files changed, 71 insertions(+), 9 deletions(-)

diff --git a/stacks/actualbudget/main.tf b/stacks/actualbudget/main.tf
index 6883b31a..b257e549 100644
--- a/stacks/actualbudget/main.tf
+++ b/stacks/actualbudget/main.tf
@@ -1,9 +1,9 @@
 variable "tls_secret_name" {
-  type = string
+  type      = string
   sensitive = true
 }
 variable "actualbudget_credentials" {
-  type = map(any)
+  type      = map(any)
   sensitive = true
 }
 variable "nfs_server" { type = string }
@@ -37,7 +37,7 @@ module "tls_secret" {
 module "viktor" {
   source                     = "./factory"
   name                       = "viktor"
-  tag                        = "edge"
+  tag                        = "26.3.0"
   tls_secret_name            = var.tls_secret_name
   nfs_server                 = var.nfs_server
   depends_on                 = [kubernetes_namespace.actualbudget]
@@ -58,7 +58,7 @@ module "viktor" {
 module "anca" {
   source                     = "./factory"
   name                       = "anca"
-  tag                        = "edge"
+  tag                        = "26.3.0"
   tls_secret_name            = var.tls_secret_name
   nfs_server                 = var.nfs_server
   depends_on                 = [kubernetes_namespace.actualbudget]
@@ -79,7 +79,7 @@ module "anca" {
 module "emo" {
   source                     = "./factory"
   name                       = "emo"
-  tag                        = "edge"
+  tag                        = "26.3.0"
   tls_secret_name            = var.tls_secret_name
   nfs_server                 = var.nfs_server
   depends_on                 = [kubernetes_namespace.actualbudget]
diff --git a/stacks/grampsweb/main.tf b/stacks/grampsweb/main.tf
index 7c5c4909..b49a1045 100644
--- a/stacks/grampsweb/main.tf
+++ b/stacks/grampsweb/main.tf
@@ -1,5 +1,5 @@
 variable "tls_secret_name" {
-  type = string
+  type      = string
   sensitive = true
 }
 variable "mailserver_accounts" { type = map(any) }
@@ -116,7 +116,7 @@ resource "kubernetes_deployment" "grampsweb" {
     }
   }
   spec {
-    replicas = 1
+    replicas = 0
     selector {
       match_labels = {
         app = "grampsweb"
diff --git a/stacks/platform/modules/dbaas/main.tf b/stacks/platform/modules/dbaas/main.tf
index 9a149e16..d11ece6c 100644
--- a/stacks/platform/modules/dbaas/main.tf
+++ b/stacks/platform/modules/dbaas/main.tf
@@ -13,7 +13,7 @@ variable "prod" {
 }
 variable "nfs_server" { type = string }
 variable "kube_config_path" {
-  type = string
+  type      = string
   sensitive = true
 }
 
@@ -193,11 +193,21 @@ resource "helm_release" "mysql_cluster" {
               matchExpressions = [{
                 key      = "kubernetes.io/hostname"
                 operator = "NotIn"
-                values   = ["k8s-node2"]
+                values   = ["k8s-node1", "k8s-node2"]
               }]
             }]
           }
         }
+        podAntiAffinity = {
+          requiredDuringSchedulingIgnoredDuringExecution = [{
+            labelSelector = {
+              matchLabels = {
+                "component" = "mysqld"
+              }
+            }
+            topologyKey = "kubernetes.io/hostname"
+          }]
+        }
       }
       containers = [{
         name = "mysql"
diff --git a/stacks/platform/modules/kyverno/resource-governance.tf b/stacks/platform/modules/kyverno/resource-governance.tf
index e019fdb8..3ac9800e 100644
--- a/stacks/platform/modules/kyverno/resource-governance.tf
+++ b/stacks/platform/modules/kyverno/resource-governance.tf
@@ -802,6 +802,58 @@ resource "kubernetes_manifest" "mutate_priority_from_tier" {
   }
 }
 
+# --- GPU toleration for critical tiers ---
+# Allows pods in tier-0-core and tier-1-cluster namespaces to overflow onto the
+# GPU node during N-1 failures. Uses patchesJson6902 (not patchStrategicMerge)
+# to APPEND the toleration without replacing existing tolerations.
+resource "kubernetes_manifest" "mutate_gpu_toleration" { # Kyverno ClusterPolicy containing one mutate rule per tier
+  manifest = {
+    apiVersion = "kyverno.io/v1"
+    kind       = "ClusterPolicy"
+    metadata = {
+      name = "gpu-toleration-critical-tiers"
+      annotations = {
+        "policies.kyverno.io/title"       = "GPU Toleration for Critical Tiers"
+        "policies.kyverno.io/description" = "Adds nvidia.com/gpu toleration to pods in tier-0-core and tier-1-cluster namespaces so they can overflow onto the GPU node during N-1 failures."
+      }
+    }
+    spec = {
+      rules = [for tier in ["0-core", "1-cluster"] : { # generates one rule object per listed tier value
+        name = "add-gpu-toleration-tier-${split("-", tier)[0]}" # keeps only the part before the first "-": ...-tier-0 / ...-tier-1
+        match = {
+          any = [
+            {
+              resources = {
+                kinds      = ["Pod"]
+                operations = ["CREATE"] # rule matches Pod creation only
+                namespaceSelector = {
+                  matchLabels = {
+                    tier = tier # namespace must carry the label tier=<current loop value>
+                  }
+                }
+              }
+            }
+          ]
+        }
+        skipBackgroundRequests = true # NOTE(review): presumably ignores admission requests issued by Kyverno's own background controller - confirm
+        mutate = {
+          patchesJson6902 = yamlencode([ # the patch list is serialized to a YAML string
+            {
+              op   = "add" # NOTE(review): assumes Kyverno creates /spec/tolerations when the Pod has none - confirm
+              path = "/spec/tolerations/-" # trailing "-" is the JSON Pointer token for "append to the end of the array"
+              value = {
+                key      = "nvidia.com/gpu"
+                operator = "Exists" # no value is given: tolerates the key whatever the taint's value
+                effect   = "NoSchedule"
+              }
+            }
+          ])
+        }
+      }]
+    }
+  }
+}
+
 # --- ndots:2 injection ---
 # Kubernetes defaults to ndots:5, which causes 4 wasted NxDomain queries per
 # external DNS lookup (search domain expansion). This policy injects ndots:2