From 2cb37d51d42c768546730756f7bdb8b34b2c1ca6 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sat, 27 Jun 2026 18:45:25 +0000 Subject: [PATCH] =?UTF-8?q?paperless-ngx:=20scale=20Gotenberg=20x3=20+=20T?= =?UTF-8?q?ika=20x2,=204=20workers,=20skip-archive=20=E2=80=94=20speed=20t?= =?UTF-8?q?he=20Emo=20import?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bottleneck found: single Gotenberg 503s under concurrent workers (office docs failing + slow). Cluster is otherwise idle (sdc 0.5% util, etcd ~1/min), so: - Gotenberg 1->3 + Tika 1->2 (Service load-balances; fixes the 503s, parallel office conversion). - paperless TASK_WORKERS 2->4, THREADS_PER_WORKER 2->1, mem limit 4->8Gi (avoid OOM with 4 concurrent OCR). Requests kept low to stay within tier-quota (requests.memory 3840/4096Mi). - PAPERLESS_OCR_SKIP_ARCHIVE_FILE=with_text: skip redundant archive for born-digital/office docs (big IO saver for the work-doc set). Guard + etcd watch stay in place; revert to defaults after the import. Co-Authored-By: Claude Opus 4.8 --- stacks/paperless-ngx/main.tf | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/stacks/paperless-ngx/main.tf b/stacks/paperless-ngx/main.tf index 72a07bc3..ce4c857e 100644 --- a/stacks/paperless-ngx/main.tf +++ b/stacks/paperless-ngx/main.tf @@ -225,11 +225,18 @@ resource "kubernetes_deployment" "paperless-ngx" { # it degrades. Revert both to defaults once the import is done. env { name = "PAPERLESS_TASK_WORKERS" - value = "2" + value = "4" } env { name = "PAPERLESS_THREADS_PER_WORKER" - value = "2" + value = "1" + } + # Skip the redundant OCR'd archive PDF for inputs that already carry a + # text layer (born-digital PDFs + office->PDF via Gotenberg). Big + # speed/IO saver for emo's work-doc set; scanned docs still OCR+archive. 
+ env { + name = "PAPERLESS_OCR_SKIP_ARCHIVE_FILE" + value = "with_text" } volume_mount { name = "data" @@ -242,7 +249,7 @@ resource "kubernetes_deployment" "paperless-ngx" { memory = "2Gi" } limits = { - memory = "4Gi" + memory = "8Gi" } } @@ -299,7 +306,9 @@ resource "kubernetes_service" "paperless-ngx" { # --- Tika + Gotenberg: Office/email -> text/PDF conversion for paperless --- # Apache Tika extracts text+metadata; Gotenberg renders Office formats to PDF. # Paperless routes Office/email docs through these (PAPERLESS_TIKA_* above). -# Stateless (no PVC), pinned images, single replica — bulk import is serial. +# Stateless (no PVC), pinned images. 3 replicas during the bulk import: a +# single LibreOffice instance 503s under concurrent paperless workers; the +# Service load-balances office conversions across the replicas. resource "kubernetes_deployment" "gotenberg" { metadata { name = "gotenberg" @@ -310,7 +319,7 @@ resource "kubernetes_deployment" "gotenberg" { } } spec { - replicas = 1 + replicas = 3 selector { match_labels = { app = "gotenberg" @@ -395,7 +404,7 @@ resource "kubernetes_deployment" "tika" { } } spec { - replicas = 1 + replicas = 2 selector { match_labels = { app = "tika"