From 12a51c4ffa8a406bddcca3ba6d5baeff08fbe602 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Tue, 17 Mar 2026 22:35:54 +0000 Subject: [PATCH] right-size memory requests to unblock GPU workloads and fix dbaas quota [ci skip] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - nvidia: custom LimitRange (128Mi default, was 1Gi from Kyverno) to stop inflating GPU operator init containers; saves ~2.5Gi on GPU node - nvidia: dcgm-exporter 1536Mi → 768Mi (actual usage 489Mi) - monitoring: prometheus server 4Gi → 3Gi (actual usage 2.6Gi) - onlyoffice: 2304Mi → 1536Mi (actual usage 1.3Gi) - immich: set explicit 64Mi resources on frame (was getting 1Gi LimitRange default) - dbaas: quota limits.memory 20Gi → 24Gi to fit 3rd MySQL replica Root cause: Kyverno tier-2-gpu LimitRange injected 1Gi on every NVIDIA init container (no explicit resources), wasting ~2.5Gi of scheduling overhead on the GPU node. Combined with over-requesting, frigate and immich-ml couldn't schedule. 
--- stacks/dbaas/modules/dbaas/main.tf | 4 +-- stacks/immich/frame.tf | 9 ++++++ .../monitoring/prometheus_chart_values.tpl | 4 +-- stacks/nvidia/modules/nvidia/main.tf | 30 +++++++++++++++++-- stacks/nvidia/modules/nvidia/values.yaml | 6 ++-- stacks/onlyoffice/main.tf | 4 +-- 6 files changed, 46 insertions(+), 11 deletions(-) diff --git a/stacks/dbaas/modules/dbaas/main.tf b/stacks/dbaas/modules/dbaas/main.tf index 1c94052e..233913f2 100644 --- a/stacks/dbaas/modules/dbaas/main.tf +++ b/stacks/dbaas/modules/dbaas/main.tf @@ -35,8 +35,8 @@ resource "kubernetes_resource_quota" "dbaas" { spec { hard = { "requests.cpu" = "8" - "requests.memory" = "20Gi" - "limits.memory" = "20Gi" + "requests.memory" = "24Gi" + "limits.memory" = "24Gi" pods = "30" } } diff --git a/stacks/immich/frame.tf b/stacks/immich/frame.tf index a6fa0098..f0c06487 100644 --- a/stacks/immich/frame.tf +++ b/stacks/immich/frame.tf @@ -66,6 +66,15 @@ resource "kubernetes_deployment" "immich-frame" { container { image = "ghcr.io/immichframe/immichframe:latest" name = "immich-frame" + resources { + requests = { + cpu = "10m" + memory = "64Mi" + } + limits = { + memory = "128Mi" + } + } port { container_port = 8080 protocol = "TCP" diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl index c4d5070b..8dcda717 100755 --- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl +++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl @@ -161,9 +161,9 @@ server: resources: requests: cpu: 100m - memory: 4Gi + memory: 3Gi limits: - memory: 4Gi + memory: 3Gi livenessProbeInitialDelay: 300 readinessProbeInitialDelay: 60 strategy: diff --git a/stacks/nvidia/modules/nvidia/main.tf b/stacks/nvidia/modules/nvidia/main.tf index f23226a2..eb7cae52 100644 --- a/stacks/nvidia/modules/nvidia/main.tf +++ b/stacks/nvidia/modules/nvidia/main.tf @@ -12,8 +12,34 @@ resource "kubernetes_namespace" "nvidia" { name 
= "nvidia" labels = { "istio-injection" : "disabled" - tier = var.tier - "resource-governance/custom-quota" = "true" + tier = var.tier + "resource-governance/custom-quota" = "true" + "resource-governance/custom-limitrange" = "true" + } + } +} + +# Custom LimitRange — overrides Kyverno tier-2-gpu default (1Gi per container) +# which was inflating NVIDIA operator init container requests by ~2.5Gi total. +# Init containers do quick validation checks and need minimal memory. +resource "kubernetes_limit_range" "nvidia_defaults" { + metadata { + name = "tier-defaults" + namespace = kubernetes_namespace.nvidia.metadata[0].name + } + spec { + limit { + type = "Container" + default = { + memory = "128Mi" + } + default_request = { + cpu = "50m" + memory = "128Mi" + } + max = { + memory = "16Gi" + } } } } diff --git a/stacks/nvidia/modules/nvidia/values.yaml b/stacks/nvidia/modules/nvidia/values.yaml index 71c2eac7..03548a55 100644 --- a/stacks/nvidia/modules/nvidia/values.yaml +++ b/stacks/nvidia/modules/nvidia/values.yaml @@ -18,13 +18,13 @@ driver: config: name: time-slicing-config -# DCGM Exporter - reduced from 2560Mi to 1536Mi based on VPA upper bound of 1459Mi (1.05x margin) +# DCGM Exporter - reduced to 768Mi (actual usage ~489Mi, 1.5x margin) dcgmExporter: resources: requests: - memory: "1536Mi" + memory: "768Mi" limits: - memory: "1536Mi" + memory: "768Mi" # CUDA Validator - reduced from 1024Mi to 256Mi (one-shot job) validator: diff --git a/stacks/onlyoffice/main.tf b/stacks/onlyoffice/main.tf index 3643b454..86fd863b 100644 --- a/stacks/onlyoffice/main.tf +++ b/stacks/onlyoffice/main.tf @@ -134,10 +134,10 @@ resource "kubernetes_deployment" "onlyoffice-document-server" { resources { requests = { cpu = "100m" - memory = "2304Mi" + memory = "1536Mi" } limits = { - memory = "2304Mi" + memory = "1536Mi" } } port {