Fix immich Terraform drift from Kyverno ndots injection; right-size NVIDIA GPU operator memory

- immich: add lifecycle ignore_changes for dns_config on all 3 deployments
  to prevent perpetual plan drift from Kyverno ndots:2 mutation policy
- nvidia dcgm-exporter: memory 768Mi → 2560Mi (VPA upper bound was 2091Mi, so it was under-provisioned)
- nvidia cuda-validator: memory 1024Mi → 256Mi (one-shot job, vastly over-provisioned)
This commit is contained in:
Viktor Barzin 2026-03-15 15:36:19 +00:00
parent a6d281dbc6
commit 5beb481dc4
2 changed files with 36 additions and 0 deletions

View file

@ -114,6 +114,12 @@ resource "kubernetes_deployment" "immich_server" {
}
}
# Kyverno's ndots:2 mutation policy injects dns_config into the pod spec.
# Ignore that attribute so `terraform plan` does not report perpetual drift.
lifecycle {
  ignore_changes = [spec[0].template[0].spec[0].dns_config]
}
spec {
replicas = 1
progress_deadline_seconds = 600
@ -326,6 +332,13 @@ resource "kubernetes_deployment" "immich-postgres" {
tier = local.tiers.gpu
}
}
# Kyverno's ndots:2 mutation policy injects dns_config into the pod spec.
# Ignore that attribute so `terraform plan` does not report perpetual drift.
lifecycle {
  ignore_changes = [spec[0].template[0].spec[0].dns_config]
}
spec {
replicas = 1
selector {
@ -436,6 +449,13 @@ resource "kubernetes_deployment" "immich-machine-learning" {
tier = local.tiers.gpu
}
}
# Kyverno's ndots:2 mutation policy injects dns_config into the pod spec.
# Ignore that attribute so `terraform plan` does not report perpetual drift.
lifecycle {
  ignore_changes = [spec[0].template[0].spec[0].dns_config]
}
spec {
replicas = 1
selector {

View file

@ -18,6 +18,22 @@ driver:
config:
name: time-slicing-config
# DCGM Exporter - memory increased from 768Mi to 2560Mi; the VPA upper bound
# was 2091Mi, so the previous value was under-provisioned.
# Request and limit are set to the same value.
dcgmExporter:
  resources:
    requests:
      memory: "2560Mi"
    limits:
      memory: "2560Mi"
# CUDA Validator - memory reduced from 1024Mi to 256Mi; it runs as a one-shot
# job and the previous value was vastly over-provisioned.
# Request and limit are set to the same value.
validator:
  resources:
    requests:
      memory: "256Mi"
    limits:
      memory: "256Mi"
# Tolerate GPU node taint for all GPU operator components
daemonsets:
tolerations: