fix nvidia quota: use custom quota (32 CPU) instead of Kyverno-generated (16 CPU)

The GPU operator needs ~19 CPU of limits across 16 pods (NFD, device plugin, driver, validators, exporters). The Kyverno auto-generated quota of 16 CPU was insufficient, blocking the NFD worker and GC pods from scheduling.

- Add custom-quota label to the nvidia namespace to exempt it from Kyverno quota generation
- Add explicit ResourceQuota with limits.cpu=32, limits.memory=48Gi
- Fix: the nvidia namespace tier label was missing after CI re-apply, causing Kyverno to use the fallback LimitRange instead of the tier-2-gpu specific one

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-12 07:04:34 +00:00 · 2026-03-12 07:04:34 +00:00 · 457d29dd3d
commit 457d29dd3d
parent 8c920bd496
1 changed files with 17 additions and 0 deletions
--- a/stacks/platform/modules/nvidia/main.tf
+++ b/stacks/platform/modules/nvidia/main.tf
@@ -13,6 +13,23 @@ resource "kubernetes_namespace" "nvidia" {
    labels = {
      "istio-injection" : "disabled"
      tier                               = var.tier
+      "resource-governance/custom-quota" = "true"
+    }
+  }
+}
+
+resource "kubernetes_resource_quota" "nvidia_quota" {
+  metadata {
+    name      = "tier-quota"
+    namespace = kubernetes_namespace.nvidia.metadata[0].name
+  }
+  spec {
+    hard = {
+      "limits.cpu"      = "32"
+      "limits.memory"   = "48Gi"
+      "requests.cpu"    = "8"
+      "requests.memory" = "8Gi"
+      pods              = "40"
    }
  }
 }