fix cluster health: pin actualbudget, spread MySQL, scale grampsweb, fix GPU toleration

- Pin actualbudget/actual-server from edge to 26.3.0 (all 3 instances) to prevent recurring migration breakage from rolling nightly builds
- Add podAntiAffinity to MySQL InnoDB Cluster to spread replicas across nodes, relieving memory pressure on k8s-node4 (k8s-node1 is also added to the node-affinity exclusion list alongside k8s-node2)
- Scale grampsweb to 0 replicas (unused, consuming 1.7Gi memory)
- Add GPU toleration Kyverno policy to Terraform using patchesJson6902 instead of patchStrategicMerge, so that the existing tolerations array is appended to rather than overwritten (the overwrite left the caretta DaemonSet pod unable to schedule on k8s-master)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-11 11:43:00 +00:00 · 2026-03-11 11:43:00 +00:00 · ccbbd4bc19
commit ccbbd4bc19
parent 8565d90d23
4 changed files with 71 additions and 9 deletions
--- a/stacks/actualbudget/main.tf
+++ b/stacks/actualbudget/main.tf
@ -1,9 +1,9 @@
 variable "tls_secret_name" {
-  type = string
+  type      = string
  sensitive = true
 }
 variable "actualbudget_credentials" {
-  type = map(any)
+  type      = map(any)
  sensitive = true
 }
 variable "nfs_server" { type = string }
@ -37,7 +37,7 @@ module "tls_secret" {
 module "viktor" {
  source                     = "./factory"
  name                       = "viktor"
-  tag                        = "edge"
+  tag                        = "26.3.0"
  tls_secret_name            = var.tls_secret_name
  nfs_server                 = var.nfs_server
  depends_on                 = [kubernetes_namespace.actualbudget]
@ -58,7 +58,7 @@ module "viktor" {
 module "anca" {
  source                     = "./factory"
  name                       = "anca"
-  tag                        = "edge"
+  tag                        = "26.3.0"
  tls_secret_name            = var.tls_secret_name
  nfs_server                 = var.nfs_server
  depends_on                 = [kubernetes_namespace.actualbudget]
@ -79,7 +79,7 @@ module "anca" {
 module "emo" {
  source                     = "./factory"
  name                       = "emo"
-  tag                        = "edge"
+  tag                        = "26.3.0"
  tls_secret_name            = var.tls_secret_name
  nfs_server                 = var.nfs_server
  depends_on                 = [kubernetes_namespace.actualbudget]
--- a/stacks/grampsweb/main.tf
+++ b/stacks/grampsweb/main.tf
@ -1,5 +1,5 @@
 variable "tls_secret_name" {
-  type = string
+  type      = string
  sensitive = true
 }
 variable "mailserver_accounts" { type = map(any) }
@ -116,7 +116,7 @@ resource "kubernetes_deployment" "grampsweb" {
    }
  }
  spec {
-    replicas = 1
+    replicas = 0
    selector {
      match_labels = {
        app = "grampsweb"
--- a/stacks/platform/modules/dbaas/main.tf
+++ b/stacks/platform/modules/dbaas/main.tf
@ -13,7 +13,7 @@ variable "prod" {
 }
 variable "nfs_server" { type = string }
 variable "kube_config_path" {
-  type = string
+  type      = string
  sensitive = true
 }
@ -193,11 +193,21 @@ resource "helm_release" "mysql_cluster" {
              matchExpressions = [{
                key      = "kubernetes.io/hostname"
                operator = "NotIn"
-                values   = ["k8s-node2"]
+                values   = ["k8s-node1", "k8s-node2"]
              }]
            }]
          }
        }
        podAntiAffinity = {
          requiredDuringSchedulingIgnoredDuringExecution = [{
            labelSelector = {
              matchLabels = {
                "component" = "mysqld"
              }
            }
            topologyKey = "kubernetes.io/hostname"
          }]
        }
      }
      containers = [{
        name = "mysql"
--- a/stacks/platform/modules/kyverno/resource-governance.tf
+++ b/stacks/platform/modules/kyverno/resource-governance.tf
@ -802,6 +802,58 @@ resource "kubernetes_manifest" "mutate_priority_from_tier" {
  }
 }
 # --- GPU toleration for critical tiers ---
 # Allows pods in tier-0-core and tier-1-cluster namespaces to overflow onto the
 # GPU node during N-1 failures. Uses patchesJson6902 (not patchStrategicMerge)
 # to APPEND the toleration without replacing existing tolerations.
 resource "kubernetes_manifest" "mutate_gpu_toleration" {
  # Kyverno ClusterPolicy, declared as a raw manifest, that mutates Pods at
  # creation time in the selected tiers by adding an nvidia.com/gpu toleration.
  manifest = {
    apiVersion = "kyverno.io/v1"
    kind       = "ClusterPolicy"
    metadata = {
      name = "gpu-toleration-critical-tiers"
      annotations = {
        "policies.kyverno.io/title"       = "GPU Toleration for Critical Tiers"
        "policies.kyverno.io/description" = "Adds nvidia.com/gpu toleration to pods in tier-0-core and tier-1-cluster namespaces so they can overflow onto the GPU node during N-1 failures."
      }
    }
    spec = {
      # One rule is generated per tier value. The rule name keeps only the part
      # before the first "-": "0-core" -> add-gpu-toleration-tier-0,
      # "1-cluster" -> add-gpu-toleration-tier-1.
      rules = [for tier in ["0-core", "1-cluster"] : {
        name = "add-gpu-toleration-tier-${split("-", tier)[0]}"
        match = {
          any = [
            {
              resources = {
                # Only Pod CREATE requests are matched; updates to existing
                # Pods are not mutated by this rule.
                kinds      = ["Pod"]
                operations = ["CREATE"]
                # Namespaces are selected by label, not by name.
                # NOTE(review): the label values matched here are "0-core" and
                # "1-cluster", while the description above refers to
                # "tier-0-core" / "tier-1-cluster" namespaces. Confirm the
                # namespaces really carry tier=0-core / tier=1-cluster.
                namespaceSelector = {
                  matchLabels = {
                    tier = tier
                  }
                }
              }
            }
          ]
        }
        # NOTE(review): presumably keeps the rule from being applied to requests
        # issued by Kyverno's own background controller. Confirm against the
        # Kyverno version deployed in this cluster.
        skipBackgroundRequests = true
        mutate = {
          # The patch list is rendered to a YAML string with yamlencode before
          # being placed in the manifest.
          # NOTE(review): there is no precondition checking for an existing
          # nvidia.com/gpu toleration, so a Pod that already declares one would
          # receive a second, identical entry. Confirm that is acceptable.
          patchesJson6902 = yamlencode([
            {
              # A trailing "-" path segment is the JSON Patch (RFC 6902)
              # "append to array" position, so tolerations already on the Pod
              # are left in place.
              op   = "add"
              path = "/spec/tolerations/-"
              # operator "Exists" with no value: tolerates the nvidia.com/gpu
              # taint whatever its value, for the NoSchedule effect only.
              value = {
                key      = "nvidia.com/gpu"
                operator = "Exists"
                effect   = "NoSchedule"
              }
            }
          ])
        }
      }]
    }
  }
 }
 # --- ndots:2 injection ---
 # Kubernetes defaults to ndots:5, which causes 4 wasted NxDomain queries per
 # external DNS lookup (search domain expansion). This policy injects ndots:2