From 21d20dccf81bf70ae5c8caf5efd7b24b58e55e9d Mon Sep 17 00:00:00 2001
From: Viktor Barzin <vbarzin@gmail.com>
Date: Sat, 27 Jun 2026 19:35:10 +0000
Subject: [PATCH] paperless-ngx: bulk-import via PVC consume dir (restart-safe)
 + 6 workers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Emo's ~13.7k-document import was going through the API upload path, which
stages each file on the pod's EPHEMERAL scratch before queuing it. Any
paperless pod or redis restart therefore destroyed all in-flight work
(the "File not found" failures we hit) and required manual re-uploads.

Move bulk ingest to paperless's consume directory, placed on the encrypted
PVC, with PAPERLESS_CONSUMER_POLLING=60 so the whole folder is re-scanned
every 60s (and on startup) with a file-stability check. Files now live
on durable storage and survive any restart — the folder is the queue and
self-heals, so we can copy everything in fast and let it process over
time with zero retry/integrity risk. PAPERLESS_CONSUMER_RECURSIVE preserves
the source tree (avoids basename collisions); owner+tag come from a
consumption workflow.

Bump PAPERLESS_TASK_WORKERS 4->6 to speed up the OCR/convert-bound processing
(node6 has the core headroom for one pod) and raise the memory limit
8Gi->10Gi for the extra workers. Revert the workers/threads/mem settings and
the consume envs to their defaults once the import ends.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 stacks/paperless-ngx/main.tf | 42 +++++++++++++++++++++++++++++-------
 1 file changed, 34 insertions(+), 8 deletions(-)

diff --git a/stacks/paperless-ngx/main.tf b/stacks/paperless-ngx/main.tf
index ce4c857e..45cd3f50 100644
--- a/stacks/paperless-ngx/main.tf
+++ b/stacks/paperless-ngx/main.tf
@@ -217,15 +217,17 @@ resource "kubernetes_deployment" "paperless-ngx" {
             name  = "PAPERLESS_TIKA_GOTENBERG_ENDPOINT"
             value = "http://gotenberg.paperless-ngx.svc.cluster.local:3000"
           }
-          # Processing concurrency, tuned for the bulk Emo import (~13.7k docs).
-          # 2 workers = 2 docs in parallel (≈2x throughput); kept modest because
-          # archive writes land on the shared sdc HDD that etcd also uses (IO
-          # storm risk, code-oflt). 2 threads/worker speeds per-doc OCR using the
-          # node's spare CPU. Watch etcd apply latency; dial workers back to 1 if
-          # it degrades. Revert both to defaults once the import is done.
+          # Processing concurrency, tuned for the bulk Emo import (~13.7k docs,
+          # mostly scanned/office => OCR/convert-bound, ~3-4 docs/min/worker).
+          # 6 workers = 6 docs in parallel; paperless is a single pod (RWO PVC)
+          # pinned to one ~8-core node, so 6 leaves headroom for system + the
+          # node's co-tenants (8 would saturate it). OCR temp stays on ephemeral
+          # scratch (fast); the consume QUEUE is on the PVC (below) so a restart
+          # never loses queued work. Watch etcd apply latency; dial back if it
+          # degrades. Revert workers/threads/mem to defaults once import is done.
           env {
             name  = "PAPERLESS_TASK_WORKERS"
-            value = "4"
+            value = "6"
           }
           env {
             name  = "PAPERLESS_THREADS_PER_WORKER"
@@ -238,6 +240,30 @@ resource "kubernetes_deployment" "paperless-ngx" {
             name  = "PAPERLESS_OCR_SKIP_ARCHIVE_FILE"
             value = "with_text"
           }
+          # Bulk-import ingest path = the CONSUME DIRECTORY on the PVC (not the
+          # API). post_document writes each upload to ephemeral scratch then
+          # queues it in redis -> a pod or redis restart loses in-flight work
+          # ("File not found"). The consume dir instead lives on the encrypted
+          # PVC, and POLLING re-scans the whole dir every 60s (watchdog snapshot
+          # resets on startup, so files dropped while paperless was down are
+          # picked up too) with a size+mtime stability check (won't grab a
+          # half-copied file). Net: restart-safe, self-healing bulk ingest — the
+          # folder IS the durable queue. RECURSIVE walks subdirs (source tree is
+          # copied in with structure, avoiding basename collisions). Owner+tag
+          # are applied by a consumption workflow scoped to the import subdir.
+          # Revert (remove these three env blocks) once the import is done.
+          env {
+            name  = "PAPERLESS_CONSUMPTION_DIR"
+            value = "/usr/src/paperless/data/consume"
+          }
+          env {
+            name  = "PAPERLESS_CONSUMER_RECURSIVE"
+            value = "true"
+          }
+          env {
+            name  = "PAPERLESS_CONSUMER_POLLING"
+            value = "60"
+          }
           volume_mount {
             name       = "data"
             mount_path = "/usr/src/paperless/data"
@@ -249,7 +275,7 @@ resource "kubernetes_deployment" "paperless-ngx" {
               memory = "2Gi"
             }
             limits = {
-              memory = "8Gi"
+              memory = "10Gi"
             }
           }