paperless-ngx: deploy Tika + Gotenberg for Office ingest + raise PVC ceiling to 80Gi
All checks were successful
ci/woodpecker/push/default Pipeline was successful

Emo's import scope now includes his work-PC document set (C/Documents,
Project Management, Service & MRO, etc. on the NAS), which is ~4.9k Office
files (.doc/.docx/.xls/.xlsx/.ppt/.pptx) on top of Emo shared. Paperless
can only archive/OCR/index those if it can convert them, so add the standard
Apache Tika (text+metadata) + Gotenberg (-> PDF) sidecar deployments + their
services in the paperless-ngx namespace and point PAPERLESS_TIKA_* at them.
Pinned images (gotenberg 8.25, tika 3.3.1.0), single replica, no PVC.

Total in-scope document set across all NAS locations is now ~13,700 PDF+Office
files / ~13.7GB source (~30GB once OCR'd + archived), so raise the data PVC
autoresize ceiling 30Gi -> 80Gi for comfortable headroom. The topolvm
autoresizer grows on demand up to the ceiling.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Viktor Barzin 2026-06-27 12:02:04 +00:00
parent 041aedc486
commit e8b72019b5

View file

@ -20,7 +20,7 @@ resource "kubernetes_namespace" "paperless-ngx" {
metadata {
name = "paperless-ngx"
labels = {
tier = local.tiers.edge
tier = local.tiers.edge
"keel.sh/enrolled" = "true"
}
# labels = {
@ -77,7 +77,7 @@ resource "kubernetes_persistent_volume_claim" "data_encrypted" {
annotations = {
"resize.topolvm.io/threshold" = "10%"
"resize.topolvm.io/increase" = "100%"
"resize.topolvm.io/storage_limit" = "30Gi"
"resize.topolvm.io/storage_limit" = "80Gi"
}
}
spec {
@ -200,6 +200,23 @@ resource "kubernetes_deployment" "paperless-ngx" {
name = "PAPERLESS_OCR_LANGUAGES"
value = "bul eng"
}
# Office/email documents (.doc/.docx/.xls/.xlsx/.ppt/.pptx/.odt/.eml/
# .msg) are converted via Apache Tika (text+metadata) + Gotenberg
# (-> PDF) so paperless can archive/OCR/index them. Needed for Emo's
# work-PC document set (~4.9k Office files). The endpoints below resolve to
# the tika / gotenberg Services defined later in this stack.
env {
name = "PAPERLESS_TIKA_ENABLED"
value = "1"
}
env {
name = "PAPERLESS_TIKA_ENDPOINT"
value = "http://tika.paperless-ngx.svc.cluster.local:9998"
}
env {
name = "PAPERLESS_TIKA_GOTENBERG_ENDPOINT"
value = "http://gotenberg.paperless-ngx.svc.cluster.local:3000"
}
volume_mount {
name = "data"
mount_path = "/usr/src/paperless/data"
@ -265,6 +282,173 @@ resource "kubernetes_service" "paperless-ngx" {
}
}
# --- Tika + Gotenberg: Office/email -> text/PDF conversion for paperless ---
# Apache Tika extracts text+metadata; Gotenberg renders Office formats to PDF.
# Paperless routes Office/email docs through these (PAPERLESS_TIKA_* above).
# Stateless (no PVC), pinned images, single replica (bulk import is serial).
# Gotenberg Deployment: one replica, no volumes, HTTP on container port 3000.
# Paperless is pointed at it via PAPERLESS_TIKA_GOTENBERG_ENDPOINT.
resource "kubernetes_deployment" "gotenberg" {
  metadata {
    namespace = kubernetes_namespace.paperless-ngx.metadata[0].name
    name      = "gotenberg"
    labels = {
      tier = local.tiers.edge
      app  = "gotenberg"
    }
  }

  spec {
    replicas = 1

    # Must match the pod template labels below.
    selector {
      match_labels = { app = "gotenberg" }
    }

    template {
      metadata {
        labels = { app = "gotenberg" }
      }

      spec {
        container {
          name  = "gotenberg"
          image = "docker.io/gotenberg/gotenberg:8.25"

          # A docker-compose `command:` maps to k8s `args`: it replaces the
          # image CMD but leaves the tini ENTRYPOINT in place, so the binary
          # name has to be repeated as the first element. The two flags are
          # the hardening options recommended by Paperless.
          args = [
            "gotenberg",
            "--chromium-disable-javascript=true",
            "--chromium-allow-list=file:///tmp/.*",
          ]

          port {
            container_port = 3000
          }

          # Pod only receives traffic once GET /health on :3000 succeeds.
          readiness_probe {
            initial_delay_seconds = 5
            period_seconds        = 15
            http_get {
              port = 3000
              path = "/health"
            }
          }

          # Memory is capped; no CPU limit is set.
          resources {
            limits = {
              memory = "1536Mi"
            }
            requests = {
              memory = "256Mi"
              cpu    = "50m"
            }
          }
        }
      }
    }
  }

  # Terraform does not diff dns_config on the pod spec. NOTE(review): the
  # marker suggests it is mutated out-of-band by a Kyverno policy -- confirm.
  lifecycle {
    ignore_changes = [
      spec[0].template[0].spec[0].dns_config, # KYVERNO_LIFECYCLE_V1
    ]
  }
}
# Service in front of the gotenberg pods: selects app=gotenberg and forwards
# TCP 3000 -> 3000. No `type` is set, so the provider/cluster default applies.
resource "kubernetes_service" "gotenberg" {
  metadata {
    namespace = kubernetes_namespace.paperless-ngx.metadata[0].name
    name      = "gotenberg"
    labels    = { app = "gotenberg" }
  }

  spec {
    selector = { app = "gotenberg" }

    port {
      name        = "http"
      protocol    = "TCP"
      port        = 3000
      target_port = 3000
    }
  }
}
# Apache Tika Deployment: one replica, no volumes, HTTP on container port 9998.
# Paperless is pointed at it via PAPERLESS_TIKA_ENDPOINT.
resource "kubernetes_deployment" "tika" {
  metadata {
    namespace = kubernetes_namespace.paperless-ngx.metadata[0].name
    name      = "tika"
    labels = {
      tier = local.tiers.edge
      app  = "tika"
    }
  }

  spec {
    replicas = 1

    # Must match the pod template labels below.
    selector {
      match_labels = { app = "tika" }
    }

    template {
      metadata {
        labels = { app = "tika" }
      }

      spec {
        container {
          name  = "tika"
          image = "docker.io/apache/tika:3.3.1.0"

          port {
            container_port = 9998
          }

          # Pod only receives traffic once GET /tika on :9998 succeeds.
          readiness_probe {
            initial_delay_seconds = 10
            period_seconds        = 15
            http_get {
              port = 9998
              path = "/tika"
            }
          }

          # Memory is capped; no CPU limit is set.
          resources {
            limits = {
              memory = "1Gi"
            }
            requests = {
              memory = "512Mi"
              cpu    = "50m"
            }
          }
        }
      }
    }
  }

  # Terraform does not diff dns_config on the pod spec. NOTE(review): the
  # marker suggests it is mutated out-of-band by a Kyverno policy -- confirm.
  lifecycle {
    ignore_changes = [
      spec[0].template[0].spec[0].dns_config, # KYVERNO_LIFECYCLE_V1
    ]
  }
}
# Service in front of the tika pods: selects app=tika and forwards
# TCP 9998 -> 9998. No `type` is set, so the provider/cluster default applies.
resource "kubernetes_service" "tika" {
  metadata {
    namespace = kubernetes_namespace.paperless-ngx.metadata[0].name
    name      = "tika"
    labels    = { app = "tika" }
  }

  spec {
    selector = { app = "tika" }

    port {
      name        = "http"
      protocol    = "TCP"
      port        = 9998
      target_port = 9998
    }
  }
}
module "ingress" {
source = "../../modules/kubernetes/ingress_factory"
# Paperless has a mobile app (`Paperless`) that uses /api/* with token