From 457d29dd3d77291ddcc62ef0f7bae39feb07df0a Mon Sep 17 00:00:00 2001
From: Viktor Barzin
Date: Thu, 12 Mar 2026 07:04:34 +0000
Subject: [PATCH] fix nvidia quota: use custom quota (32 CPU) instead of Kyverno-generated (16 CPU)

The GPU operator needs ~19 CPU limits across 16 pods (NFD, device plugin,
driver, validators, exporters). The Kyverno auto-generated quota of 16 CPU
was insufficient, blocking NFD worker and GC pods from scheduling.

- Add custom-quota label to nvidia namespace to exempt from Kyverno generation
- Add explicit ResourceQuota with limits.cpu=32, limits.memory=48Gi
- Fix: nvidia namespace tier label was missing after CI re-apply, causing
  Kyverno to use fallback LimitRange instead of tier-2-gpu specific one

Co-Authored-By: Claude Opus 4.6 (1M context)
---
Reviewer notes (commentary only; not part of the commit message or the diff):

  * The single hunk below makes two changes to main.tf:
      1. adds the label "resource-governance/custom-quota" = "true" to the
         existing labels map of kubernetes_namespace.nvidia;
      2. adds a new resource kubernetes_resource_quota.nvidia_quota, named
         "tier-quota", whose namespace is read from
         kubernetes_namespace.nvidia.metadata[0].name.
  * Hard limits set by the new quota: limits.cpu=32, limits.memory=48Gi,
    requests.cpu=8, requests.memory=8Gi, pods=40. The commit message only
    mentions the two "limits.*" values.
  * "tier = var.tier" appears only as an unchanged context line, so the
    tier-label fix described in the last bullet of the message is not a
    change made by this diff -- TODO confirm where that fix was applied.
  * NOTE(review): leading indentation of the hunk lines was reconstructed
    assuming 2-space nesting; verify the context lines against the target
    file before applying.

 stacks/platform/modules/nvidia/main.tf | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/stacks/platform/modules/nvidia/main.tf b/stacks/platform/modules/nvidia/main.tf
index 49151a3d..eba209c3 100644
--- a/stacks/platform/modules/nvidia/main.tf
+++ b/stacks/platform/modules/nvidia/main.tf
@@ -13,6 +13,23 @@ resource "kubernetes_namespace" "nvidia" {
     labels = {
       "istio-injection" : "disabled"
       tier = var.tier
+      "resource-governance/custom-quota" = "true"
+    }
+  }
+}
+
+resource "kubernetes_resource_quota" "nvidia_quota" {
+  metadata {
+    name = "tier-quota"
+    namespace = kubernetes_namespace.nvidia.metadata[0].name
+  }
+  spec {
+    hard = {
+      "limits.cpu" = "32"
+      "limits.memory" = "48Gi"
+      "requests.cpu" = "8"
+      "requests.memory" = "8Gi"
+      pods = "40"
     }
   }
 }