From 43b49f7f6c2ae3a9af06abfe9eb13efba49b832b Mon Sep 17 00:00:00 2001
From: Viktor Barzin
Date: Sun, 15 Mar 2026 01:44:28 +0000
Subject: [PATCH] cluster recovery: fix resource limits and node1 memory
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- nvidia quota: requests.memory 8Gi → 12Gi (unblock cuda-validator)
- calibre: startup probe initial_delay 60→120s, timeout 1→5s,
  wait_for_rollout=false (DOCKER_MODS install takes 10+ min)
- immich ML: memory 2Gi → 4Gi (OOMKilled loading CLIP models)

Also done outside TF (not in this commit):
- node1 VM: 16 GiB → 24 GiB RAM (Proxmox)
- tigera-operator: kubectl patch 128→256Mi
- nvidia-driver-daemonset: kubectl patch 1→4Gi memory
- kyverno reports-controller: kubectl patch 128→256Mi
- CNPG operator: kubectl rollout restart
---
 stacks/calibre/main.tf                 | 4 +++-
 stacks/immich/main.tf                  | 4 ++--
 stacks/platform/modules/nvidia/main.tf | 2 +-
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/stacks/calibre/main.tf b/stacks/calibre/main.tf
index 89de4259..64a94d64 100644
--- a/stacks/calibre/main.tf
+++ b/stacks/calibre/main.tf
@@ -138,6 +138,7 @@ module "nfs_stacks_config" {
 # }
 
 resource "kubernetes_deployment" "calibre-web-automated" {
+  wait_for_rollout = false # DOCKER_MODS install takes 10+ min on every container start
   metadata {
     name = "calibre-web-automated"
     namespace = kubernetes_namespace.calibre.metadata[0].name
@@ -205,7 +206,8 @@ resource "kubernetes_deployment" "calibre-web-automated" {
               path = "/"
               port = 8083
             }
-            initial_delay_seconds = 60
+            initial_delay_seconds = 120
+            timeout_seconds = 5
             period_seconds = 15
             failure_threshold = 56
           }
diff --git a/stacks/immich/main.tf b/stacks/immich/main.tf
index 28345fab..46f09603 100644
--- a/stacks/immich/main.tf
+++ b/stacks/immich/main.tf
@@ -513,10 +513,10 @@ resource "kubernetes_deployment" "immich-machine-learning" {
           resources {
             requests = {
               cpu = "100m"
-              memory = "2Gi"
+              memory = "4Gi"
             }
             limits = {
-              memory = "2Gi"
+              memory = "4Gi"
               "nvidia.com/gpu" = "1"
             }
           }
diff --git a/stacks/platform/modules/nvidia/main.tf b/stacks/platform/modules/nvidia/main.tf
index 0acc37ba..dd09d549 100644
--- a/stacks/platform/modules/nvidia/main.tf
+++ b/stacks/platform/modules/nvidia/main.tf
@@ -27,7 +27,7 @@ resource "kubernetes_resource_quota" "nvidia_quota" {
     hard = {
       "limits.memory" = "48Gi"
       "requests.cpu" = "8"
-      "requests.memory" = "8Gi"
+      "requests.memory" = "12Gi"
       pods = "40"
     }
   }