From 2cb37d51d42c768546730756f7bdb8b34b2c1ca6 Mon Sep 17 00:00:00 2001
From: Viktor Barzin <vbarzin@gmail.com>
Date: Sat, 27 Jun 2026 18:45:25 +0000
Subject: [PATCH] =?UTF-8?q?paperless-ngx:=20scale=20Gotenberg=20x3=20+=20T?=
 =?UTF-8?q?ika=20x2,=204=20workers,=20skip-archive=20=E2=80=94=20speed=20t?=
 =?UTF-8?q?he=20Emo=20import?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bottleneck found: a single Gotenberg replica returns 503s under concurrent
workers (office docs fail or convert slowly). The cluster is otherwise idle
(sdc 0.5% util, etcd ~1/min), so:
- Gotenberg 1->3 + Tika 1->2 (Service load-balances; fixes the 503s, parallel
  office conversion).
- paperless TASK_WORKERS 2->4, THREADS_PER_WORKER 2->1, memory limit 4->8Gi
  (avoids OOM with 4 concurrent OCR jobs). Memory request left unchanged to
  stay within the tier quota (requests.memory 3840/4096Mi).
- PAPERLESS_OCR_SKIP_ARCHIVE_FILE=with_text: skip generating the redundant
  archive PDF for born-digital/office docs that already carry a text layer
  (big IO saver for the work-doc set).
Guard + etcd watch stay in place; revert all of the above to defaults after
the import.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 stacks/paperless-ngx/main.tf | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/stacks/paperless-ngx/main.tf b/stacks/paperless-ngx/main.tf
index 72a07bc3..ce4c857e 100644
--- a/stacks/paperless-ngx/main.tf
+++ b/stacks/paperless-ngx/main.tf
@@ -225,11 +225,18 @@ resource "kubernetes_deployment" "paperless-ngx" {
           # it degrades. Revert both to defaults once the import is done.
           env {
             name  = "PAPERLESS_TASK_WORKERS"
-            value = "2"
+            value = "4"
           }
           env {
             name  = "PAPERLESS_THREADS_PER_WORKER"
-            value = "2"
+            value = "1"
+          }
+          # Skip the redundant OCR'd archive PDF for inputs that already carry a
+          # text layer (born-digital PDFs + office->PDF via Gotenberg); scanned
+          # docs still OCR+archive. Big IO saver for emo's import; revert after.
+          env {
+            name  = "PAPERLESS_OCR_SKIP_ARCHIVE_FILE"
+            value = "with_text"
           }
           volume_mount {
             name       = "data"
@@ -242,7 +249,7 @@ resource "kubernetes_deployment" "paperless-ngx" {
               memory = "2Gi"
             }
             limits = {
-              memory = "4Gi"
+              memory = "8Gi"
             }
           }
 
@@ -299,7 +306,9 @@ resource "kubernetes_service" "paperless-ngx" {
 # --- Tika + Gotenberg: Office/email -> text/PDF conversion for paperless ---
 # Apache Tika extracts text+metadata; Gotenberg renders Office formats to PDF.
 # Paperless routes Office/email docs through these (PAPERLESS_TIKA_* above).
-# Stateless (no PVC), pinned images, single replica — bulk import is serial.
+# Stateless (no PVC), pinned images. Bulk import only: Gotenberg x3, Tika x2.
+# A single Gotenberg (LibreOffice) 503s under concurrent paperless workers;
+# the Service load-balances conversions across replicas. Revert after import.
 resource "kubernetes_deployment" "gotenberg" {
   metadata {
     name      = "gotenberg"
@@ -310,7 +319,7 @@ resource "kubernetes_deployment" "gotenberg" {
     }
   }
   spec {
-    replicas = 1
+    replicas = 3
     selector {
       match_labels = {
         app = "gotenberg"
@@ -395,7 +404,7 @@ resource "kubernetes_deployment" "tika" {
     }
   }
   spec {
-    replicas = 1
+    replicas = 2
     selector {
       match_labels = {
         app = "tika"