fix cluster health: pin actualbudget, spread MySQL, scale grampsweb, fix GPU toleration

- Pin actualbudget/actual-server from edge to 26.3.0 (all 3 instances) to
  prevent recurring migration breakage from rolling nightly builds
- Add podAntiAffinity to MySQL InnoDB Cluster to spread replicas across nodes,
  relieving memory pressure on k8s-node4; also exclude k8s-node1 (in addition
  to k8s-node2) from the allowed hostnames
- Scale grampsweb to 0 replicas (unused, consuming 1.7Gi memory)
- Add the GPU toleration Kyverno policy to Terraform, using patchesJson6902
  instead of patchStrategicMerge so the toleration is appended rather than
  overwriting the pod's toleration array (the overwrite left the caretta
  DaemonSet pod unable to schedule on k8s-master)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Viktor Barzin 2026-03-11 11:43:00 +00:00
parent 6bdcd88d25
commit d7953322dd
4 changed files with 71 additions and 9 deletions

View file

@ -1,9 +1,9 @@
variable "tls_secret_name" {
type = string
type = string
sensitive = true
}
variable "actualbudget_credentials" {
type = map(any)
type = map(any)
sensitive = true
}
variable "nfs_server" { type = string }
@ -37,7 +37,7 @@ module "tls_secret" {
module "viktor" {
source = "./factory"
name = "viktor"
tag = "edge"
tag = "26.3.0"
tls_secret_name = var.tls_secret_name
nfs_server = var.nfs_server
depends_on = [kubernetes_namespace.actualbudget]
@ -58,7 +58,7 @@ module "viktor" {
module "anca" {
source = "./factory"
name = "anca"
tag = "edge"
tag = "26.3.0"
tls_secret_name = var.tls_secret_name
nfs_server = var.nfs_server
depends_on = [kubernetes_namespace.actualbudget]
@ -79,7 +79,7 @@ module "anca" {
module "emo" {
source = "./factory"
name = "emo"
tag = "edge"
tag = "26.3.0"
tls_secret_name = var.tls_secret_name
nfs_server = var.nfs_server
depends_on = [kubernetes_namespace.actualbudget]

View file

@ -1,5 +1,5 @@
variable "tls_secret_name" {
type = string
type = string
sensitive = true
}
variable "mailserver_accounts" { type = map(any) }
@ -116,7 +116,7 @@ resource "kubernetes_deployment" "grampsweb" {
}
}
spec {
replicas = 1
replicas = 0
selector {
match_labels = {
app = "grampsweb"

View file

@ -13,7 +13,7 @@ variable "prod" {
}
variable "nfs_server" { type = string }
variable "kube_config_path" {
type = string
type = string
sensitive = true
}
@ -193,11 +193,21 @@ resource "helm_release" "mysql_cluster" {
matchExpressions = [{
key = "kubernetes.io/hostname"
operator = "NotIn"
values = ["k8s-node2"]
values = ["k8s-node1", "k8s-node2"]
}]
}]
}
}
podAntiAffinity = {
requiredDuringSchedulingIgnoredDuringExecution = [{
labelSelector = {
matchLabels = {
"component" = "mysqld"
}
}
topologyKey = "kubernetes.io/hostname"
}]
}
}
containers = [{
name = "mysql"

View file

@ -802,6 +802,58 @@ resource "kubernetes_manifest" "mutate_priority_from_tier" {
}
}
# --- GPU toleration for critical tiers ---
# Kyverno ClusterPolicy that mutates Pods on CREATE in namespaces labelled
# tier=0-core or tier=1-cluster, adding an nvidia.com/gpu toleration so those
# pods may overflow onto the GPU node during N-1 failures. One rule is
# generated per tier label.
#
# The mutation is an RFC 6902 JSON patch (patchesJson6902), not a
# patchStrategicMerge: the "add" op on "/spec/tolerations/-" APPENDS an entry
# instead of replacing the tolerations the pod already carries.
resource "kubernetes_manifest" "mutate_gpu_toleration" {
  manifest = {
    apiVersion = "kyverno.io/v1"
    kind       = "ClusterPolicy"

    metadata = {
      name = "gpu-toleration-critical-tiers"
      annotations = {
        "policies.kyverno.io/title"       = "GPU Toleration for Critical Tiers"
        "policies.kyverno.io/description" = "Adds nvidia.com/gpu toleration to pods in tier-0-core and tier-1-cluster namespaces so they can overflow onto the GPU node during N-1 failures."
      }
    }

    spec = {
      rules = [
        for tier_label in ["0-core", "1-cluster"] : {
          # Only the numeric prefix of the tier label goes into the rule name
          # (add-gpu-toleration-tier-0, add-gpu-toleration-tier-1).
          name = "add-gpu-toleration-tier-${element(split("-", tier_label), 0)}"

          # Select Pod CREATE requests in namespaces carrying this tier label.
          match = {
            any = [
              {
                resources = {
                  kinds      = ["Pod"]
                  operations = ["CREATE"]
                  namespaceSelector = {
                    matchLabels = {
                      tier = tier_label
                    }
                  }
                }
              },
            ]
          }

          skipBackgroundRequests = true

          mutate = {
            # patchesJson6902 takes the patch list as a string, hence yamlencode.
            patchesJson6902 = yamlencode([
              {
                op   = "add"
                path = "/spec/tolerations/-" # trailing "-" = append to the array
                value = {
                  key      = "nvidia.com/gpu"
                  operator = "Exists"
                  effect   = "NoSchedule"
                }
              },
            ])
          }
        }
      ]
    }
  }
}
# --- ndots:2 injection ---
# Kubernetes defaults to ndots:5, which causes 4 wasted NxDomain queries per
# external DNS lookup (search domain expansion). This policy injects ndots:2