fix cluster health: pin actualbudget, spread MySQL, scale grampsweb, fix GPU toleration
- Pin actualbudget/actual-server from edge to 26.3.0 (all 3 instances) to prevent recurring migration breakage from rolling nightly builds
- Add podAntiAffinity to MySQL InnoDB Cluster to spread replicas across nodes, relieving memory pressure on k8s-node4
- Scale grampsweb to 0 replicas (unused, consuming 1.7Gi memory)
- Add GPU toleration Kyverno policy to Terraform using patchesJson6902 instead of patchStrategicMerge to fix toleration array being overwritten (caused caretta DaemonSet pod to be unable to schedule on k8s-master)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
6bdcd88d25
commit
d7953322dd
4 changed files with 71 additions and 9 deletions
|
|
@ -1,9 +1,9 @@
|
|||
variable "tls_secret_name" {
|
||||
type = string
|
||||
type = string
|
||||
sensitive = true
|
||||
}
|
||||
variable "actualbudget_credentials" {
|
||||
type = map(any)
|
||||
type = map(any)
|
||||
sensitive = true
|
||||
}
|
||||
variable "nfs_server" { type = string }
|
||||
|
|
@ -37,7 +37,7 @@ module "tls_secret" {
|
|||
module "viktor" {
|
||||
source = "./factory"
|
||||
name = "viktor"
|
||||
tag = "edge"
|
||||
tag = "26.3.0"
|
||||
tls_secret_name = var.tls_secret_name
|
||||
nfs_server = var.nfs_server
|
||||
depends_on = [kubernetes_namespace.actualbudget]
|
||||
|
|
@ -58,7 +58,7 @@ module "viktor" {
|
|||
module "anca" {
|
||||
source = "./factory"
|
||||
name = "anca"
|
||||
tag = "edge"
|
||||
tag = "26.3.0"
|
||||
tls_secret_name = var.tls_secret_name
|
||||
nfs_server = var.nfs_server
|
||||
depends_on = [kubernetes_namespace.actualbudget]
|
||||
|
|
@ -79,7 +79,7 @@ module "anca" {
|
|||
module "emo" {
|
||||
source = "./factory"
|
||||
name = "emo"
|
||||
tag = "edge"
|
||||
tag = "26.3.0"
|
||||
tls_secret_name = var.tls_secret_name
|
||||
nfs_server = var.nfs_server
|
||||
depends_on = [kubernetes_namespace.actualbudget]
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
variable "tls_secret_name" {
|
||||
type = string
|
||||
type = string
|
||||
sensitive = true
|
||||
}
|
||||
variable "mailserver_accounts" { type = map(any) }
|
||||
|
|
@ -116,7 +116,7 @@ resource "kubernetes_deployment" "grampsweb" {
|
|||
}
|
||||
}
|
||||
spec {
|
||||
replicas = 1
|
||||
replicas = 0
|
||||
selector {
|
||||
match_labels = {
|
||||
app = "grampsweb"
|
||||
|
|
|
|||
|
|
@ -13,7 +13,7 @@ variable "prod" {
|
|||
}
|
||||
variable "nfs_server" { type = string }
|
||||
variable "kube_config_path" {
|
||||
type = string
|
||||
type = string
|
||||
sensitive = true
|
||||
}
|
||||
|
||||
|
|
@ -193,11 +193,21 @@ resource "helm_release" "mysql_cluster" {
|
|||
matchExpressions = [{
|
||||
key = "kubernetes.io/hostname"
|
||||
operator = "NotIn"
|
||||
values = ["k8s-node2"]
|
||||
values = ["k8s-node1", "k8s-node2"]
|
||||
}]
|
||||
}]
|
||||
}
|
||||
}
|
||||
podAntiAffinity = {
|
||||
requiredDuringSchedulingIgnoredDuringExecution = [{
|
||||
labelSelector = {
|
||||
matchLabels = {
|
||||
"component" = "mysqld"
|
||||
}
|
||||
}
|
||||
topologyKey = "kubernetes.io/hostname"
|
||||
}]
|
||||
}
|
||||
}
|
||||
containers = [{
|
||||
name = "mysql"
|
||||
|
|
|
|||
|
|
@ -802,6 +802,58 @@ resource "kubernetes_manifest" "mutate_priority_from_tier" {
|
|||
}
|
||||
}
|
||||
|
||||
# --- GPU toleration for critical tiers ---
|
||||
# Allows pods in namespaces labeled tier=0-core or tier=1-cluster to overflow onto the
|
||||
# GPU node during N-1 failures. Uses patchesJson6902 (not patchStrategicMerge)
|
||||
# to APPEND the toleration without replacing existing tolerations.
|
||||
resource "kubernetes_manifest" "mutate_gpu_toleration" {
|
||||
manifest = {
|
||||
apiVersion = "kyverno.io/v1"
|
||||
kind = "ClusterPolicy"
|
||||
metadata = {
|
||||
name = "gpu-toleration-critical-tiers"
|
||||
annotations = {
|
||||
"policies.kyverno.io/title" = "GPU Toleration for Critical Tiers"
|
||||
"policies.kyverno.io/description" = "Adds nvidia.com/gpu toleration to pods in tier-0-core and tier-1-cluster namespaces so they can overflow onto the GPU node during N-1 failures."
|
||||
}
|
||||
}
|
||||
spec = {
|
||||
rules = [for tier in ["0-core", "1-cluster"] : {
|
||||
name = "add-gpu-toleration-tier-${split("-", tier)[0]}"
|
||||
match = {
|
||||
any = [
|
||||
{
|
||||
resources = {
|
||||
kinds = ["Pod"]
|
||||
operations = ["CREATE"]
|
||||
namespaceSelector = {
|
||||
matchLabels = {
|
||||
tier = tier
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
skipBackgroundRequests = true
|
||||
mutate = {
|
||||
patchesJson6902 = yamlencode([
|
||||
{
|
||||
op = "add"
|
||||
path = "/spec/tolerations/-"
|
||||
value = {
|
||||
key = "nvidia.com/gpu"
|
||||
operator = "Exists"
|
||||
effect = "NoSchedule"
|
||||
}
|
||||
}
|
||||
])
|
||||
}
|
||||
}]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# --- ndots:2 injection ---
|
||||
# Kubernetes defaults to ndots:5, which causes 4 wasted NxDomain queries per
|
||||
# external DNS lookup (search domain expansion). This policy injects ndots:2
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue