immich: bump server to 8Gi + override tier-2-gpu quota to 20Gi

Eliminates the OOM-on-face-detection-burst class of incidents (2026-04-26).
VPA upper for immich-server is 2.98Gi steady-state; the prior 4Gi limit was
1.34x upper and still got SIGKILL'd when face-detection bursts pushed
transient RSS past 4Gi. 8Gi gives 2.7x VPA upper headroom.

The kyverno tier-2-gpu default quota is 12Gi requests.memory which can't fit
8Gi (server) + 3.5Gi (ML) + 3Gi (PG) + backup CronJobs simultaneously. Opts
the namespace into the kyverno custom-quota exclude rule and overrides with
20Gi (~4.5Gi headroom) — same pattern as woodpecker/nvidia.
This commit is contained in:
Viktor Barzin 2026-04-26 20:02:28 +00:00
parent d093aed7f6
commit 6ad5292128

View file

@ -121,7 +121,10 @@ resource "kubernetes_namespace" "immich" {
metadata {
name = "immich"
labels = {
tier = local.tiers.gpu
# Opts immich out of kyverno's `quota-tier-2-gpu` generation rule
# so this stack can own the tier-quota with a higher memory cap.
"resource-governance/custom-quota" = "true"
tier = local.tiers.gpu
}
}
lifecycle {
@ -130,6 +133,25 @@ resource "kubernetes_namespace" "immich" {
}
}
# Override the kyverno-generated tier-2-gpu quota (12Gi requests.memory).
# immich-server needs 8Gi to absorb face-detection bursts without being OOM-killed
# (incident 2026-04-26). With immich-machine-learning (3.5Gi), immich-postgresql
# (3Gi) and the backup CronJobs, requests total ~15.5Gi; 20Gi gives ~4.5Gi headroom.
resource "kubernetes_resource_quota" "immich" {
  # ResourceQuota named "tier-quota", placed in the namespace created by
  # kubernetes_namespace.immich.
  metadata {
    namespace = kubernetes_namespace.immich.metadata[0].name
    name      = "tier-quota"
  }

  spec {
    # Hard limits for the namespace: pod count, summed CPU/memory requests,
    # and summed memory limits.
    hard = {
      "pods"            = "40"
      "requests.cpu"    = "8"
      "requests.memory" = "20Gi"
      "limits.memory"   = "32Gi"
    }
  }
}
resource "kubernetes_manifest" "external_secret" {
manifest = {
apiVersion = "external-secrets.io/v1beta1"
@ -311,10 +333,10 @@ resource "kubernetes_deployment" "immich_server" {
resources {
requests = {
cpu = "100m"
memory = "4096Mi"
memory = "8Gi"
}
limits = {
memory = "4096Mi"
memory = "8Gi"
}
}
}