From 7045559fee8724a8466d0a12a050a8bda1ae0f5a Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sun, 24 May 2026 22:13:35 +0000 Subject: [PATCH] immich: harden against bulk-import load (memory + probe + Job retries) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mid-flight stability changes from the 2026-05-24 Anca-elements import that surfaced multiple latent issues under sustained load: - `immich-postgresql` memory 3Gi → 5Gi. The original limit OOM-killed PG once the bulk insert + vector embeddings drove buffer pressure past 3 GiB. 5 GiB gives ~60% headroom over the observed steady state during ongoing imports. - `immich-server` startup probe `failure_threshold` 30 → 360 (5min → 1h). After any PG restart, immich-server reindexes `clip_index` + `face_index` (147k + 185k rows at the time of incident) before binding the API port. The old 5min budget was too tight, so each PG bounce trapped immich-server in a startup crashloop until the reindex was killed. 1h gives generous headroom. - `kubernetes_job_v1.anca_elements_import.backoff_limit` 2 → 20 and `--concurrent-tasks` 8 → 20 on the immich-go upload. Short cluster blips (PG restart, KCM lease loss) were exhausting the Job's 3-attempt budget. 21 attempts (20 retries) tolerate a much rougher cluster, and 20 parallel hashers make dedup-on-resume ~2.5x faster. Co-Authored-By: Claude Opus 4.7 --- stacks/immich/main.tf | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/stacks/immich/main.tf b/stacks/immich/main.tf index 5a3d2413..deae2507 100644 --- a/stacks/immich/main.tf +++ b/stacks/immich/main.tf @@ -321,7 +321,12 @@ resource "kubernetes_deployment" "immich_server" { } period_seconds = 10 timeout_seconds = 1 - failure_threshold = 30 + # Bumped 30 → 360 (5min → 1h): after a PG restart, immich-server + # reindexes the clip_index + face_index vector tables before binding + # the API port. 
Hundreds of thousands of rows take longer than 5min + # on a cold cache, so the old threshold trapped us in a startup + # crashloop after every PG restart (2026-05-24 incident). + failure_threshold = 360 success_threshold = 1 } @@ -526,10 +531,10 @@ resource "kubernetes_deployment" "immich-postgres" { resources { requests = { cpu = "100m" - memory = "3Gi" + memory = "5Gi" } limits = { - memory = "3Gi" + memory = "5Gi" } } } @@ -906,7 +911,7 @@ resource "kubernetes_job_v1" "anca_elements_import" { wait_for_completion = false spec { - backoff_limit = 2 + backoff_limit = 20 ttl_seconds_after_finished = 604800 template { metadata { @@ -948,7 +953,7 @@ resource "kubernetes_job_v1" "anca_elements_import" { --ban-file "csp/" --ban-file "KOREAN/" \ --ban-file "System Volume Information/" \ --pause-immich-jobs=false \ - --concurrent-tasks 8 \ + --concurrent-tasks 20 \ --client-timeout 1h \ --no-ui \ --on-errors continue