From d7953322ddb91b442616334aad87b01d5afeb7be Mon Sep 17 00:00:00 2001
From: Viktor Barzin
Date: Wed, 11 Mar 2026 11:43:00 +0000
Subject: [PATCH] fix cluster health: pin actualbudget, spread MySQL, scale grampsweb, fix GPU toleration

- Pin actualbudget/actual-server from edge to 26.3.0 (all 3 instances)
  to prevent recurring migration breakage from rolling nightly builds
- Add podAntiAffinity to MySQL InnoDB Cluster to spread replicas across
  nodes, relieving memory pressure on k8s-node4; also add k8s-node1 to
  the existing hostname NotIn exclusion list (previously only k8s-node2)
- Scale grampsweb to 0 replicas (unused, consuming 1.7Gi memory)
- Add GPU toleration Kyverno policy to Terraform using patchesJson6902
  instead of patchStrategicMerge to fix toleration array being overwritten
  (caused caretta DaemonSet pod to be unable to schedule on k8s-master)

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 stacks/actualbudget/main.tf                   | 10 ++--
 stacks/grampsweb/main.tf                      |  4 +-
 stacks/platform/modules/dbaas/main.tf         | 14 ++++-
 .../modules/kyverno/resource-governance.tf    | 52 +++++++++++++++++++
 4 files changed, 71 insertions(+), 9 deletions(-)

diff --git a/stacks/actualbudget/main.tf b/stacks/actualbudget/main.tf
index 6883b31a..b257e549 100644
--- a/stacks/actualbudget/main.tf
+++ b/stacks/actualbudget/main.tf
@@ -1,9 +1,9 @@
 variable "tls_secret_name" {
-  type = string
+  type      = string
   sensitive = true
 }
 variable "actualbudget_credentials" {
-  type = map(any)
+  type      = map(any)
   sensitive = true
 }
 variable "nfs_server" { type = string }
@@ -37,7 +37,7 @@ module "tls_secret" {
 module "viktor" {
   source          = "./factory"
   name            = "viktor"
-  tag             = "edge"
+  tag             = "26.3.0" # pinned release instead of the rolling "edge" tag
   tls_secret_name = var.tls_secret_name
   nfs_server      = var.nfs_server
   depends_on      = [kubernetes_namespace.actualbudget]
@@ -58,7 +58,7 @@ module "viktor" {
 module "anca" {
   source          = "./factory"
   name            = "anca"
-  tag             = "edge"
+  tag             = "26.3.0" # pinned release instead of the rolling "edge" tag
   tls_secret_name = var.tls_secret_name
   nfs_server      = var.nfs_server
   depends_on      = [kubernetes_namespace.actualbudget]
@@ -79,7 +79,7 @@ module "anca" {
 module "emo" {
   source          = "./factory"
   name            = "emo"
-  tag             = "edge"
+  tag             = "26.3.0" # pinned release instead of the rolling "edge" tag
   tls_secret_name = var.tls_secret_name
   nfs_server      = var.nfs_server
   depends_on      = [kubernetes_namespace.actualbudget]
diff --git a/stacks/grampsweb/main.tf b/stacks/grampsweb/main.tf
index 7c5c4909..b49a1045 100644
--- a/stacks/grampsweb/main.tf
+++ b/stacks/grampsweb/main.tf
@@ -1,5 +1,5 @@
 variable "tls_secret_name" {
-  type = string
+  type      = string
   sensitive = true
 }
 variable "mailserver_accounts" { type = map(any) }
@@ -116,7 +116,7 @@ resource "kubernetes_deployment" "grampsweb" {
     }
   }
   spec {
-    replicas = 1
+    replicas = 0 # scaled to zero: the deployment is kept but runs no pods
     selector {
       match_labels = {
         app = "grampsweb"
diff --git a/stacks/platform/modules/dbaas/main.tf b/stacks/platform/modules/dbaas/main.tf
index 9a149e16..d11ece6c 100644
--- a/stacks/platform/modules/dbaas/main.tf
+++ b/stacks/platform/modules/dbaas/main.tf
@@ -13,7 +13,7 @@ variable "prod" {
 }
 variable "nfs_server" { type = string }
 variable "kube_config_path" {
-  type = string
+  type      = string
   sensitive = true
 }

@@ -193,11 +193,21 @@ resource "helm_release" "mysql_cluster" {
                 matchExpressions = [{
                   key      = "kubernetes.io/hostname"
                   operator = "NotIn"
-                  values   = ["k8s-node2"]
+                  values   = ["k8s-node1", "k8s-node2"] # hostnames excluded by this match expression
                 }]
               }]
             }
           }
+          podAntiAffinity = {
+            requiredDuringSchedulingIgnoredDuringExecution = [{ # hard rule: enforced at scheduling time only
+              labelSelector = {
+                matchLabels = {
+                  "component" = "mysqld" # presumably the label carried by the MySQL server pods - verify
+                }
+              }
+              topologyKey = "kubernetes.io/hostname" # no two matching pods on the same node
+            }]
+          }
         }
         containers = [{
           name = "mysql"
diff --git a/stacks/platform/modules/kyverno/resource-governance.tf b/stacks/platform/modules/kyverno/resource-governance.tf
index e019fdb8..3ac9800e 100644
--- a/stacks/platform/modules/kyverno/resource-governance.tf
+++ b/stacks/platform/modules/kyverno/resource-governance.tf
@@ -802,6 +802,58 @@ resource "kubernetes_manifest" "mutate_priority_from_tier" {
   }
 }

+# --- GPU toleration for critical tiers ---
+# Pods created in namespaces labelled tier=0-core or tier=1-cluster receive an
+# nvidia.com/gpu NoSchedule toleration so they can overflow onto the GPU node
+# during N-1 failures. patchesJson6902 appends to any existing tolerations.
+resource "kubernetes_manifest" "mutate_gpu_toleration" {
+  manifest = {
+    apiVersion = "kyverno.io/v1"
+    kind       = "ClusterPolicy"
+    metadata = {
+      name = "gpu-toleration-critical-tiers"
+      annotations = {
+        "policies.kyverno.io/title"       = "GPU Toleration for Critical Tiers"
+        "policies.kyverno.io/description" = "Adds nvidia.com/gpu toleration to pods in tier-0-core and tier-1-cluster namespaces so they can overflow onto the GPU node during N-1 failures."
+      }
+    }
+    spec = {
+      rules = [for tier in ["0-core", "1-cluster"] : { # one rule per tier label value
+        name = "add-gpu-toleration-tier-${split("-", tier)[0]}" # yields ...-tier-0 and ...-tier-1
+        match = {
+          any = [
+            {
+              resources = {
+                kinds      = ["Pod"]
+                operations = ["CREATE"] # mutate only when the pod is created
+                namespaceSelector = {
+                  matchLabels = {
+                    tier = tier # label key "tier" = current loop value
+                  }
+                }
+              }
+            }
+          ]
+        }
+        skipBackgroundRequests = true
+        mutate = {
+          patchesJson6902 = yamlencode([ # the patch list is passed to Kyverno as a YAML string
+            {
+              op    = "add"
+              path  = "/spec/tolerations/-" # trailing "-" appends to the end of the array
+              value = {
+                key      = "nvidia.com/gpu"
+                operator = "Exists" # tolerate the taint key regardless of its value
+                effect   = "NoSchedule"
+              }
+            }
+          ])
+        }
+      }]
+    }
+  }
+}
+
 # --- ndots:2 injection ---
 # Kubernetes defaults to ndots:5, which causes 4 wasted NxDomain queries per
 # external DNS lookup (search domain expansion). This policy injects ndots:2