cluster recovery: fix resource limits and node1 memory

- nvidia quota: requests.memory 8Gi → 12Gi (unblock cuda-validator)
- calibre: startup probe initial_delay 60→120s, timeout 1→5s, wait_for_rollout=false (DOCKER_MODS install takes 10+ min)
- immich ML: memory 2Gi → 4Gi (OOMKilled loading CLIP models)

Also done outside TF (not in this commit):
- node1 VM: 16 GiB → 24 GiB RAM (Proxmox)
- tigera-operator: kubectl patch 128→256Mi
- nvidia-driver-daemonset: kubectl patch 1→4Gi memory
- kyverno reports-controller: kubectl patch 128→256Mi
- CNPG operator: kubectl rollout restart
2026-03-15 01:44:28 +00:00 · 2026-03-15 01:44:28 +00:00 · 43b49f7f6c
commit 43b49f7f6c
parent a3c198e10e
3 changed files with 6 additions and 4 deletions
--- a/stacks/calibre/main.tf
+++ b/stacks/calibre/main.tf
@@ -138,6 +138,7 @@ module "nfs_stacks_config" {
 # }

 resource "kubernetes_deployment" "calibre-web-automated" {
+  wait_for_rollout = false # DOCKER_MODS install takes 10+ min on every container start
  metadata {
    name      = "calibre-web-automated"
    namespace = kubernetes_namespace.calibre.metadata[0].name
@@ -205,7 +206,8 @@ resource "kubernetes_deployment" "calibre-web-automated" {
              path = "/"
              port = 8083
            }
-            initial_delay_seconds = 60
+            initial_delay_seconds = 120
+            timeout_seconds       = 5
            period_seconds        = 15
            failure_threshold     = 56
          }
--- a/stacks/immich/main.tf
+++ b/stacks/immich/main.tf
@@ -513,10 +513,10 @@ resource "kubernetes_deployment" "immich-machine-learning" {
          resources {
            requests = {
              cpu    = "100m"
-              memory = "2Gi"
+              memory = "4Gi"
            }
            limits = {
-              memory = "2Gi"
+              memory = "4Gi"
              "nvidia.com/gpu" = "1"
            }
          }
--- a/stacks/platform/modules/nvidia/main.tf
+++ b/stacks/platform/modules/nvidia/main.tf
@@ -27,7 +27,7 @@ resource "kubernetes_resource_quota" "nvidia_quota" {
    hard = {
      "limits.memory"   = "48Gi"
      "requests.cpu"    = "8"
-      "requests.memory" = "8Gi"
+      "requests.memory" = "12Gi"
      pods              = "40"
    }
  }