cluster recovery: fix resource limits and node1 memory
- nvidia quota: requests.memory 8Gi → 12Gi (unblock cuda-validator)
- calibre: startup probe initial_delay 60→120s, timeout 1→5s, wait_for_rollout=false (DOCKER_MODS install takes 10+ min)
- immich ML: memory 2Gi → 4Gi (OOMKilled loading CLIP models)

Also done outside Terraform (not in this commit):
- node1 VM: 16 GiB → 24 GiB RAM (Proxmox)
- tigera-operator: kubectl patch 128→256Mi
- nvidia-driver-daemonset: kubectl patch 1→4Gi memory
- kyverno reports-controller: kubectl patch 128→256Mi
- CNPG operator: kubectl rollout restart
This commit is contained in:
parent
a3c198e10e
commit
43b49f7f6c
3 changed files with 6 additions and 4 deletions
|
|
@@ -138,6 +138,7 @@ module "nfs_stacks_config" {
|
|||
# }
|
||||
|
||||
resource "kubernetes_deployment" "calibre-web-automated" {
|
||||
wait_for_rollout = false # DOCKER_MODS install takes 10+ min on every container start
|
||||
metadata {
|
||||
name = "calibre-web-automated"
|
||||
namespace = kubernetes_namespace.calibre.metadata[0].name
|
||||
|
|
@@ -205,7 +206,8 @@ resource "kubernetes_deployment" "calibre-web-automated" {
|
|||
path = "/"
|
||||
port = 8083
|
||||
}
|
||||
initial_delay_seconds = 60
|
||||
initial_delay_seconds = 120
|
||||
timeout_seconds = 5
|
||||
period_seconds = 15
|
||||
failure_threshold = 56
|
||||
}
|
||||
|
|
|
|||
|
|
@@ -513,10 +513,10 @@ resource "kubernetes_deployment" "immich-machine-learning" {
|
|||
resources {
|
||||
requests = {
|
||||
cpu = "100m"
|
||||
memory = "2Gi"
|
||||
memory = "4Gi"
|
||||
}
|
||||
limits = {
|
||||
memory = "2Gi"
|
||||
memory = "4Gi"
|
||||
"nvidia.com/gpu" = "1"
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@@ -27,7 +27,7 @@ resource "kubernetes_resource_quota" "nvidia_quota" {
|
|||
hard = {
|
||||
"limits.memory" = "48Gi"
|
||||
"requests.cpu" = "8"
|
||||
"requests.memory" = "8Gi"
|
||||
"requests.memory" = "12Gi"
|
||||
pods = "40"
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue