From 5beb481dc420d716c8659f2ddf80b1fd81fabb28 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sun, 15 Mar 2026 15:36:19 +0000 Subject: [PATCH] fix immich TF drift from Kyverno ndots injection, right-size nvidia GPU operator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - immich: add lifecycle ignore_changes for dns_config on all 3 deployments to prevent perpetual plan drift from Kyverno ndots:2 mutation policy - nvidia dcgm-exporter: 768Mi → 2560Mi (VPA upper 2091Mi, was under-provisioned) - nvidia cuda-validator: 1024Mi → 256Mi (one-shot job, vastly over-provisioned) --- stacks/immich/main.tf | 20 ++++++++++++++++++++ stacks/platform/modules/nvidia/values.yaml | 16 ++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/stacks/immich/main.tf b/stacks/immich/main.tf index 8752abe4..cb84ec15 100644 --- a/stacks/immich/main.tf +++ b/stacks/immich/main.tf @@ -114,6 +114,12 @@ resource "kubernetes_deployment" "immich_server" { } } + lifecycle { + ignore_changes = [ + spec[0].template[0].spec[0].dns_config, + ] + } + spec { replicas = 1 progress_deadline_seconds = 600 @@ -326,6 +332,13 @@ resource "kubernetes_deployment" "immich-postgres" { tier = local.tiers.gpu } } + + lifecycle { + ignore_changes = [ + spec[0].template[0].spec[0].dns_config, + ] + } + spec { replicas = 1 selector { @@ -436,6 +449,13 @@ resource "kubernetes_deployment" "immich-machine-learning" { tier = local.tiers.gpu } } + + lifecycle { + ignore_changes = [ + spec[0].template[0].spec[0].dns_config, + ] + } + spec { replicas = 1 selector { diff --git a/stacks/platform/modules/nvidia/values.yaml b/stacks/platform/modules/nvidia/values.yaml index 3f9d4639..44b6dcc2 100644 --- a/stacks/platform/modules/nvidia/values.yaml +++ b/stacks/platform/modules/nvidia/values.yaml @@ -18,6 +18,22 @@ driver: config: name: time-slicing-config +# DCGM Exporter - increased from 768Mi to 2560Mi based on VPA upper bound of 2091Mi +dcgmExporter: + resources: + requests: + memory: "2560Mi" + limits: + memory: "2560Mi" + +# CUDA Validator - reduced from 1024Mi to 256Mi (one-shot job) +validator: + resources: + requests: + memory: "256Mi" + limits: + memory: "256Mi" + # Tolerate GPU node taint for all GPU operator components daemonsets: tolerations: