paperless-ngx: add Bulgarian OCR (bul+eng) + raise data PVC ceiling to 30Gi

Preparing Paperless for Emo's document import from the NAS. His archive is Bulgarian (Cyrillic) + English, but OCR was English-only (tesseract had no 'bul' pack and PAPERLESS_OCR_LANGUAGE was unset, so it defaulted to eng), meaning scanned BG documents would OCR to garbage and be unsearchable. Add bul to the install list (PAPERLESS_OCR_LANGUAGES) and set PAPERLESS_OCR_LANGUAGE=bul+eng. Also raise the data PVC autoresize ceiling from 5Gi to 30Gi: everything (originals + archive via PAPERLESS_MEDIA_ROOT=../data) lives on the single encrypted PVC, so each imported document is stored as both an original and an archive copy, and the ~2.7GB in-scope import would blow past the 5Gi cap mid-ingest. The topolvm autoresizer grows the volume on demand up to the ceiling; 30Gi gives ample headroom. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-27 08:17:13 +00:00 · 2026-06-27 08:17:13 +00:00 · 7988a690ed
commit 7988a690ed
parent 82a7b2585b
1 changed file with 15 additions and 1 deletion
--- a/stacks/paperless-ngx/main.tf
+++ b/stacks/paperless-ngx/main.tf
@@ -77,7 +77,7 @@ resource "kubernetes_persistent_volume_claim" "data_encrypted" {
    annotations = {
      "resize.topolvm.io/threshold"     = "10%"
      "resize.topolvm.io/increase"      = "100%"
-      "resize.topolvm.io/storage_limit" = "5Gi"
+      "resize.topolvm.io/storage_limit" = "30Gi"
    }
  }
  spec {
@@ -186,6 +186,20 @@ resource "kubernetes_deployment" "paperless-ngx" {
            name  = "PAPERLESS_OCR_USER_ARGS"
            value = "{\"invalidate_digital_signatures\": true}"
          }
+          # OCR languages applied to every document. bul+eng covers the
+          # Bulgarian (Cyrillic) + English set being imported (e.g. emo's
+          # archive). Join codes with "+", not a space; tesseract uses all.
+          env {
+            name  = "PAPERLESS_OCR_LANGUAGE"
+            value = "bul+eng"
+          }
+          # Tesseract language packs installed at container start (space-
+          # separated). The image already ships eng (+deu/fra/ita/spa), so
+          # only bul is new: apt-installed so OCR_LANGUAGE=bul+eng resolves.
+          env {
+            name  = "PAPERLESS_OCR_LANGUAGES"
+            value = "bul eng"
+          }
          volume_mount {
            name       = "data"
            mount_path = "/usr/src/paperless/data"