fix cluster health: pin actualbudget, spread MySQL, scale grampsweb, fix GPU toleration
- Pin actualbudget/actual-server from edge to 26.3.0 (all 3 instances) to prevent recurring migration breakage from rolling nightly builds
- Add podAntiAffinity to the MySQL InnoDB Cluster to spread replicas across nodes, relieving memory pressure on k8s-node4
- Scale grampsweb to 0 replicas (unused, consuming 1.7Gi of memory)
- Add the GPU toleration Kyverno policy to Terraform using patchesJson6902 instead of patchStrategicMerge, so the toleration is appended rather than overwriting the pod's toleration array (the overwrite left the caretta DaemonSet pod unable to schedule on k8s-master)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
8565d90d23
commit
ccbbd4bc19
4 changed files with 71 additions and 9 deletions
|
|
@ -1,9 +1,9 @@
|
||||||
variable "tls_secret_name" {
|
variable "tls_secret_name" {
|
||||||
type = string
|
type = string
|
||||||
sensitive = true
|
sensitive = true
|
||||||
}
|
}
|
||||||
variable "actualbudget_credentials" {
|
variable "actualbudget_credentials" {
|
||||||
type = map(any)
|
type = map(any)
|
||||||
sensitive = true
|
sensitive = true
|
||||||
}
|
}
|
||||||
variable "nfs_server" { type = string }
|
variable "nfs_server" { type = string }
|
||||||
|
|
@ -37,7 +37,7 @@ module "tls_secret" {
|
||||||
module "viktor" {
|
module "viktor" {
|
||||||
source = "./factory"
|
source = "./factory"
|
||||||
name = "viktor"
|
name = "viktor"
|
||||||
tag = "edge"
|
tag = "26.3.0"
|
||||||
tls_secret_name = var.tls_secret_name
|
tls_secret_name = var.tls_secret_name
|
||||||
nfs_server = var.nfs_server
|
nfs_server = var.nfs_server
|
||||||
depends_on = [kubernetes_namespace.actualbudget]
|
depends_on = [kubernetes_namespace.actualbudget]
|
||||||
|
|
@ -58,7 +58,7 @@ module "viktor" {
|
||||||
module "anca" {
|
module "anca" {
|
||||||
source = "./factory"
|
source = "./factory"
|
||||||
name = "anca"
|
name = "anca"
|
||||||
tag = "edge"
|
tag = "26.3.0"
|
||||||
tls_secret_name = var.tls_secret_name
|
tls_secret_name = var.tls_secret_name
|
||||||
nfs_server = var.nfs_server
|
nfs_server = var.nfs_server
|
||||||
depends_on = [kubernetes_namespace.actualbudget]
|
depends_on = [kubernetes_namespace.actualbudget]
|
||||||
|
|
@ -79,7 +79,7 @@ module "anca" {
|
||||||
module "emo" {
|
module "emo" {
|
||||||
source = "./factory"
|
source = "./factory"
|
||||||
name = "emo"
|
name = "emo"
|
||||||
tag = "edge"
|
tag = "26.3.0"
|
||||||
tls_secret_name = var.tls_secret_name
|
tls_secret_name = var.tls_secret_name
|
||||||
nfs_server = var.nfs_server
|
nfs_server = var.nfs_server
|
||||||
depends_on = [kubernetes_namespace.actualbudget]
|
depends_on = [kubernetes_namespace.actualbudget]
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
variable "tls_secret_name" {
|
variable "tls_secret_name" {
|
||||||
type = string
|
type = string
|
||||||
sensitive = true
|
sensitive = true
|
||||||
}
|
}
|
||||||
variable "mailserver_accounts" { type = map(any) }
|
variable "mailserver_accounts" { type = map(any) }
|
||||||
|
|
@ -116,7 +116,7 @@ resource "kubernetes_deployment" "grampsweb" {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
spec {
|
spec {
|
||||||
replicas = 1
|
replicas = 0
|
||||||
selector {
|
selector {
|
||||||
match_labels = {
|
match_labels = {
|
||||||
app = "grampsweb"
|
app = "grampsweb"
|
||||||
|
|
|
||||||
|
|
@ -13,7 +13,7 @@ variable "prod" {
|
||||||
}
|
}
|
||||||
variable "nfs_server" { type = string }
|
variable "nfs_server" { type = string }
|
||||||
variable "kube_config_path" {
|
variable "kube_config_path" {
|
||||||
type = string
|
type = string
|
||||||
sensitive = true
|
sensitive = true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -193,11 +193,21 @@ resource "helm_release" "mysql_cluster" {
|
||||||
matchExpressions = [{
|
matchExpressions = [{
|
||||||
key = "kubernetes.io/hostname"
|
key = "kubernetes.io/hostname"
|
||||||
operator = "NotIn"
|
operator = "NotIn"
|
||||||
values = ["k8s-node2"]
|
values = ["k8s-node1", "k8s-node2"]
|
||||||
}]
|
}]
|
||||||
}]
|
}]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
podAntiAffinity = {
|
||||||
|
requiredDuringSchedulingIgnoredDuringExecution = [{
|
||||||
|
labelSelector = {
|
||||||
|
matchLabels = {
|
||||||
|
"component" = "mysqld"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
topologyKey = "kubernetes.io/hostname"
|
||||||
|
}]
|
||||||
|
}
|
||||||
}
|
}
|
||||||
containers = [{
|
containers = [{
|
||||||
name = "mysql"
|
name = "mysql"
|
||||||
|
|
|
||||||
|
|
@ -802,6 +802,58 @@ resource "kubernetes_manifest" "mutate_priority_from_tier" {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# --- GPU toleration for critical tiers ---
|
||||||
|
# Allows pods in tier-0-core and tier-1-cluster namespaces to overflow onto the
|
||||||
|
# GPU node during N-1 failures. Uses patchesJson6902 (not patchStrategicMerge)
|
||||||
|
# to APPEND the toleration without replacing existing tolerations.
|
||||||
|
resource "kubernetes_manifest" "mutate_gpu_toleration" {
|
||||||
|
manifest = {
|
||||||
|
apiVersion = "kyverno.io/v1"
|
||||||
|
kind = "ClusterPolicy"
|
||||||
|
metadata = {
|
||||||
|
name = "gpu-toleration-critical-tiers"
|
||||||
|
annotations = {
|
||||||
|
"policies.kyverno.io/title" = "GPU Toleration for Critical Tiers"
|
||||||
|
"policies.kyverno.io/description" = "Adds nvidia.com/gpu toleration to pods in tier-0-core and tier-1-cluster namespaces so they can overflow onto the GPU node during N-1 failures."
|
||||||
|
}
|
||||||
|
}
|
||||||
|
spec = {
|
||||||
|
rules = [for tier in ["0-core", "1-cluster"] : {
|
||||||
|
name = "add-gpu-toleration-tier-${split("-", tier)[0]}"
|
||||||
|
match = {
|
||||||
|
any = [
|
||||||
|
{
|
||||||
|
resources = {
|
||||||
|
kinds = ["Pod"]
|
||||||
|
operations = ["CREATE"]
|
||||||
|
namespaceSelector = {
|
||||||
|
matchLabels = {
|
||||||
|
tier = tier
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
skipBackgroundRequests = true
|
||||||
|
mutate = {
|
||||||
|
patchesJson6902 = yamlencode([
|
||||||
|
{
|
||||||
|
op = "add"
|
||||||
|
path = "/spec/tolerations/-"
|
||||||
|
value = {
|
||||||
|
key = "nvidia.com/gpu"
|
||||||
|
operator = "Exists"
|
||||||
|
effect = "NoSchedule"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
])
|
||||||
|
}
|
||||||
|
}]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
# --- ndots:2 injection ---
|
# --- ndots:2 injection ---
|
||||||
# Kubernetes defaults to ndots:5, which causes 4 wasted NxDomain queries per
|
# Kubernetes defaults to ndots:5, which causes 4 wasted NxDomain queries per
|
||||||
# external DNS lookup (search domain expansion). This policy injects ndots:2
|
# external DNS lookup (search domain expansion). This policy injects ndots:2
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue