diff --git a/stacks/immich/main.tf b/stacks/immich/main.tf
index 8752abe4..cb84ec15 100644
--- a/stacks/immich/main.tf
+++ b/stacks/immich/main.tf
@@ -114,6 +114,12 @@ resource "kubernetes_deployment" "immich_server" {
     }
   }
 
+  lifecycle {
+    ignore_changes = [
+      spec[0].template[0].spec[0].dns_config,
+    ]
+  }
+
   spec {
     replicas = 1
     progress_deadline_seconds = 600
@@ -326,6 +332,13 @@ resource "kubernetes_deployment" "immich-postgres" {
       tier = local.tiers.gpu
     }
   }
+
+  lifecycle {
+    ignore_changes = [
+      spec[0].template[0].spec[0].dns_config,
+    ]
+  }
+
   spec {
     replicas = 1
     selector {
@@ -436,6 +449,13 @@ resource "kubernetes_deployment" "immich-machine-learning" {
       tier = local.tiers.gpu
     }
   }
+
+  lifecycle {
+    ignore_changes = [
+      spec[0].template[0].spec[0].dns_config,
+    ]
+  }
+
   spec {
     replicas = 1
     selector {
diff --git a/stacks/platform/modules/nvidia/values.yaml b/stacks/platform/modules/nvidia/values.yaml
index 3f9d4639..44b6dcc2 100644
--- a/stacks/platform/modules/nvidia/values.yaml
+++ b/stacks/platform/modules/nvidia/values.yaml
@@ -18,6 +18,22 @@ driver:
   config:
     name: time-slicing-config
 
+# DCGM Exporter: memory request and limit raised from 768Mi to 2560Mi (VPA upper bound was 2091Mi)
+dcgmExporter:
+  resources:
+    requests:
+      memory: "2560Mi"
+    limits:
+      memory: "2560Mi"
+
+# CUDA Validator: memory request and limit lowered from 1024Mi to 256Mi (one-shot job)
+validator:
+  resources:
+    requests:
+      memory: "256Mi"
+    limits:
+      memory: "256Mi"
+
 # Tolerate GPU node taint for all GPU operator components
 daemonsets:
   tolerations: