fix cluster health: pin actualbudget, spread MySQL, scale grampsweb, fix GPU toleration

- Pin actualbudget/actual-server from edge to 26.3.0 (all 3 instances) to
  prevent recurring migration breakage from rolling nightly builds
- Add podAntiAffinity to MySQL InnoDB Cluster to spread replicas across
  nodes, relieving memory pressure on k8s-node4; also extend the existing
  hostname NotIn exclusion to k8s-node1 (in addition to k8s-node2)
- Scale grampsweb to 0 replicas (unused, consuming 1.7Gi memory)
- Add GPU toleration Kyverno policy to Terraform using patchesJson6902
  instead of patchStrategicMerge to fix toleration array being overwritten
  (caused caretta DaemonSet pod to be unable to schedule on k8s-master)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-11 11:43:00 +00:00 · 2026-03-11 11:43:00 +00:00 · d7953322dd
commit d7953322dd
parent 6bdcd88d25
4 changed files with 71 additions and 9 deletions
--- a/stacks/actualbudget/main.tf
+++ b/stacks/actualbudget/main.tf
@ -1,9 +1,9 @@
 variable "tls_secret_name" {
-  type = string
+  type      = string
  sensitive = true
 }
 variable "actualbudget_credentials" {
-  type = map(any)
+  type      = map(any)
  sensitive = true
 }
 variable "nfs_server" { type = string }
@ -37,7 +37,7 @@ module "tls_secret" {
 module "viktor" {
  source                     = "./factory"
  name                       = "viktor"
-  tag                        = "edge"
+  tag                        = "26.3.0"
  tls_secret_name            = var.tls_secret_name
  nfs_server                 = var.nfs_server
  depends_on                 = [kubernetes_namespace.actualbudget]
@ -58,7 +58,7 @@ module "viktor" {
 module "anca" {
  source                     = "./factory"
  name                       = "anca"
-  tag                        = "edge"
+  tag                        = "26.3.0"
  tls_secret_name            = var.tls_secret_name
  nfs_server                 = var.nfs_server
  depends_on                 = [kubernetes_namespace.actualbudget]
@ -79,7 +79,7 @@ module "anca" {
 module "emo" {
  source                     = "./factory"
  name                       = "emo"
-  tag                        = "edge"
+  tag                        = "26.3.0"
  tls_secret_name            = var.tls_secret_name
  nfs_server                 = var.nfs_server
  depends_on                 = [kubernetes_namespace.actualbudget]
--- a/stacks/grampsweb/main.tf
+++ b/stacks/grampsweb/main.tf
@ -1,5 +1,5 @@
 variable "tls_secret_name" {
-  type = string
+  type      = string
  sensitive = true
 }
 variable "mailserver_accounts" { type = map(any) }
@ -116,7 +116,7 @@ resource "kubernetes_deployment" "grampsweb" {
    }
  }
  spec {
-    replicas = 1
+    replicas = 0
    selector {
      match_labels = {
        app = "grampsweb"
--- a/stacks/platform/modules/dbaas/main.tf
+++ b/stacks/platform/modules/dbaas/main.tf
@ -13,7 +13,7 @@ variable "prod" {
 }
 variable "nfs_server" { type = string }
 variable "kube_config_path" {
-  type = string
+  type      = string
  sensitive = true
 }

@ -193,11 +193,21 @@ resource "helm_release" "mysql_cluster" {
              matchExpressions = [{
                key      = "kubernetes.io/hostname"
                operator = "NotIn"
-                values   = ["k8s-node2"]
+                values   = ["k8s-node1", "k8s-node2"]
              }]
            }]
          }
        }
+        podAntiAffinity = {
+          requiredDuringSchedulingIgnoredDuringExecution = [{
+            labelSelector = {
+              matchLabels = {
+                "component" = "mysqld"
+              }
+            }
+            topologyKey = "kubernetes.io/hostname"
+          }]
+        }
      }
      containers = [{
        name = "mysql"
--- a/stacks/platform/modules/kyverno/resource-governance.tf
+++ b/stacks/platform/modules/kyverno/resource-governance.tf
@ -802,6 +802,58 @@ resource "kubernetes_manifest" "mutate_priority_from_tier" {
  }
 }

+# --- GPU toleration for critical tiers ---
+# On Pod CREATE in namespaces labelled tier=0-core / tier=1-cluster, lets pods
+# overflow onto the GPU node during N-1 failures. Uses patchesJson6902 (not
+# patchStrategicMerge) to APPEND the toleration, keeping existing tolerations.
+resource "kubernetes_manifest" "mutate_gpu_toleration" {
+  manifest = {
+    apiVersion = "kyverno.io/v1"
+    kind       = "ClusterPolicy"
+    metadata = {
+      name = "gpu-toleration-critical-tiers"
+      annotations = {
+        "policies.kyverno.io/title"       = "GPU Toleration for Critical Tiers"
+        "policies.kyverno.io/description" = "Adds nvidia.com/gpu toleration to pods in tier-0-core and tier-1-cluster namespaces so they can overflow onto the GPU node during N-1 failures."
+      }
+    }
+    spec = {
+      rules = [for tier in ["0-core", "1-cluster"] : { # one generated rule per tier label value
+        name = "add-gpu-toleration-tier-${split("-", tier)[0]}" # "0-core" -> "add-gpu-toleration-tier-0"
+        match = {
+          any = [
+            {
+              resources = {
+                kinds      = ["Pod"]
+                operations = ["CREATE"] # mutate only when the pod is created
+                namespaceSelector = {
+                  matchLabels = {
+                    tier = tier # namespace label "tier" must equal this rule's tier value
+                  }
+                }
+              }
+            }
+          ]
+        }
+        skipBackgroundRequests = true
+        mutate = {
+          patchesJson6902 = yamlencode([ # the patch list is handed to Kyverno as a YAML string
+            {
+              op   = "add"
+              path = "/spec/tolerations/-" # trailing "-" appends to the end of the array
+              value = {
+                key      = "nvidia.com/gpu"
+                operator = "Exists" # no value given: matches the taint key regardless of its value
+                effect   = "NoSchedule"
+              }
+            }
+          ])
+        }
+      }]
+    }
+  }
+}
+
 # --- ndots:2 injection ---
 # Kubernetes defaults to ndots:5, which causes 4 wasted NxDomain queries per
 # external DNS lookup (search domain expansion). This policy injects ndots:2