From 9599beadc95ccdc9f2b5ea398243a39510bf6aca Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sat, 27 Jun 2026 16:33:43 +0000 Subject: [PATCH] paperless-ngx: 2 task workers + 2 threads/worker + 4Gi limit for the Emo bulk import MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Emo's ~13.7k-doc import is OCR-bound on a single celery worker (~10s/doc = multi-day). Bump PAPERLESS_TASK_WORKERS=2 + PAPERLESS_THREADS_PER_WORKER=2 for ~2x throughput, and the memory limit 2Gi->4Gi to fit two concurrent OCR jobs. Kept deliberately modest: archive writes hit the shared sdc HDD that etcd also lives on (IO-storm risk, code-oflt) — watch etcd apply latency and revert workers to 1 if it degrades. Revert to defaults once the import is done. Co-Authored-By: Claude Opus 4.8 --- stacks/paperless-ngx/main.tf | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/stacks/paperless-ngx/main.tf b/stacks/paperless-ngx/main.tf index 7ca314ec..72a07bc3 100644 --- a/stacks/paperless-ngx/main.tf +++ b/stacks/paperless-ngx/main.tf @@ -217,6 +217,20 @@ resource "kubernetes_deployment" "paperless-ngx" { name = "PAPERLESS_TIKA_GOTENBERG_ENDPOINT" value = "http://gotenberg.paperless-ngx.svc.cluster.local:3000" } + # Processing concurrency, tuned for the bulk Emo import (~13.7k docs). + # 2 workers = 2 docs in parallel (≈2x throughput); kept modest because + # archive writes land on the shared sdc HDD that etcd also uses (IO + # storm risk, code-oflt). 2 threads/worker speeds per-doc OCR using the + # node's spare CPU. Watch etcd apply latency; dial workers back to 1 if + # it degrades. Revert both to defaults once the import is done. 
+ env { + name = "PAPERLESS_TASK_WORKERS" + value = "2" + } + env { + name = "PAPERLESS_THREADS_PER_WORKER" + value = "2" + } volume_mount { name = "data" mount_path = "/usr/src/paperless/data" @@ -228,7 +242,7 @@ resource "kubernetes_deployment" "paperless-ngx" { memory = "2Gi" } limits = { - memory = "2Gi" + memory = "4Gi" } }