infra/stacks/paperless-ngx/main.tf
Viktor Barzin 21d20dccf8
Some checks failed
ci/woodpecker/push/default Pipeline was canceled
paperless-ngx: bulk-import via PVC consume dir (restart-safe) + 6 workers
Emo's ~13.7k-document import was going through the API upload path, which
stages each file on the pod's EPHEMERAL scratch before queuing it. Any
paperless pod or redis restart therefore destroyed all in-flight work
(the "File not found" failures we hit) and required manual re-uploads.

Move bulk ingest to paperless's consume directory placed on the encrypted
PVC, with PAPERLESS_CONSUMER_POLLING so the whole folder is re-scanned
periodically (and on startup) with a file-stability check. Files now live
on durable storage and survive any restart — the folder is the queue and
self-heals, so we can copy everything in fast and let it process over
time with zero retry/integrity risk. RECURSIVE preserves the source tree
(avoids basename collisions); owner+tag come from a consumption workflow.

Bump TASK_WORKERS 4->6 to speed the OCR/convert-bound processing (node6
has the core headroom for one pod) and mem limit 8->10Gi for the extra
workers. Revert workers/mem/consume envs to defaults once the import ends.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-27 19:35:10 +00:00

534 lines
16 KiB
HCL

# Name of the TLS secret; forwarded unchanged to the setup_tls_secret and
# ingress_factory modules in this file.
variable "tls_secret_name" {
  description = "Name of the TLS secret passed to the setup_tls_secret and ingress_factory modules."
  type        = string
  sensitive   = true
}
# NOTE(review): nfs_server is not referenced by any resource visible in this
# file. It is kept because callers may still pass it; confirm before removing.
variable "nfs_server" {
  description = "NFS server (presumably its address/hostname). Not referenced by any resource in this file; retained for caller compatibility."
  type        = string
}

# Interpolated into PAPERLESS_REDIS as "redis://<redis_host>".
variable "redis_host" {
  description = "Redis host used to build the PAPERLESS_REDIS connection URL (redis://<redis_host>)."
  type        = string
}

# Passed straight through as PAPERLESS_DBHOST (PAPERLESS_DBENGINE is mariadb).
variable "mysql_host" {
  description = "Database host handed to Paperless as PAPERLESS_DBHOST."
  type        = string
}
# Reads the "paperless-ngx" entry from the Vault KV v2 mount "secret".
# Only the "homepage_credentials" key is consumed in this file (see locals).
data "vault_kv_secret_v2" "secrets" {
  name  = "paperless-ngx"
  mount = "secret"
}
locals {
  # The "homepage_credentials" key of the Vault entry holds a JSON document;
  # decode it once here so fields can be looked up by name where needed.
  homepage_credentials = jsondecode(
    data.vault_kv_secret_v2.secrets.data["homepage_credentials"]
  )
}
# Namespace that every other resource in this file is created in.
resource "kubernetes_namespace" "paperless-ngx" {
  metadata {
    name = "paperless-ngx"

    labels = {
      "keel.sh/enrolled" = "true"
      tier               = local.tiers.edge
    }

    # Commented-out alternative label set (Istio injection), kept for reference:
    # labels = {
    #   "istio-injection" : "enabled"
    # }
  }

  lifecycle {
    # KYVERNO_LIFECYCLE_V1: goldilocks-vpa-auto-mode ClusterPolicy stamps this label on every namespace
    ignore_changes = [
      metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"],
    ]
  }
}
# ExternalSecret that materialises the Vault entry "paperless-ngx" (through the
# "vault-kv" ClusterSecretStore) as the Kubernetes Secret
# "paperless-ngx-secrets", refreshed every 15 minutes. The paperless deployment
# reads its DB password (key "db_password") from that Secret.
resource "kubernetes_manifest" "external_secret" {
  manifest = {
    apiVersion = "external-secrets.io/v1"
    kind       = "ExternalSecret"

    metadata = {
      name = "paperless-ngx-secrets"
      # Reference the namespace resource instead of repeating the literal so a
      # rename cannot leave this object behind in the old namespace. This
      # matches how every other resource in this file picks its namespace.
      namespace = kubernetes_namespace.paperless-ngx.metadata[0].name
    }

    spec = {
      refreshInterval = "15m"

      secretStoreRef = {
        kind = "ClusterSecretStore"
        name = "vault-kv"
      }

      target = {
        name = "paperless-ngx-secrets"
      }

      # `extract` with only a key pulls the properties of that Vault entry
      # into the target Secret.
      dataFrom = [
        {
          extract = {
            key = "paperless-ngx"
          }
        },
      ]
    }
  }

  field_manager {
    force_conflicts = true
  }

  # Kept explicit even though the namespace reference above already implies
  # the ordering: the namespace must exist before this manifest is applied.
  depends_on = [kubernetes_namespace.paperless-ngx]
}
# Invokes the shared setup_tls_secret module for this namespace with the
# caller-supplied secret name.
module "tls_secret" {
  source = "../../modules/kubernetes/setup_tls_secret"

  tls_secret_name = var.tls_secret_name
  namespace       = kubernetes_namespace.paperless-ngx.metadata[0].name
}
# Encrypted data volume for Paperless. It is requested at 1Gi and then grown
# by the autoresizer according to the resize.topolvm.io annotations below
# (up to storage_limit).
resource "kubernetes_persistent_volume_claim" "data_encrypted" {
  # Do not block `apply` waiting for the claim to become Bound.
  wait_until_bound = false

  metadata {
    name      = "paperless-ngx-data-encrypted"
    namespace = kubernetes_namespace.paperless-ngx.metadata[0].name

    annotations = {
      "resize.topolvm.io/storage_limit" = "80Gi"
      "resize.topolvm.io/threshold"     = "10%"
      "resize.topolvm.io/increase"      = "100%"
    }
  }

  spec {
    storage_class_name = "proxmox-lvm-encrypted"
    access_modes       = ["ReadWriteOnce"]

    resources {
      requests = {
        storage = "1Gi"
      }
    }
  }

  lifecycle {
    # The autoresizer raises requests.storage (up to storage_limit) and a PVC
    # can never shrink. If Terraform tracked this field, each apply would try
    # to restore the value above, Kubernetes would reject the shrink, and the
    # PVC would be left in Terminating-but-in-use limbo.
    ignore_changes = [
      spec[0].resources[0].requests,
    ]
  }
}
# The Paperless-ngx application. Single replica with the Recreate strategy;
# its state lives on the ReadWriteOnce encrypted PVC mounted at
# /usr/src/paperless/data.
resource "kubernetes_deployment" "paperless-ngx" {
  metadata {
    name      = "paperless-ngx"
    namespace = kubernetes_namespace.paperless-ngx.metadata[0].name

    labels = {
      app  = "paperless-ngx"
      tier = local.tiers.edge
    }

    annotations = {
      "reloader.stakater.com/search" = "true"
    }
  }

  spec {
    replicas = 1

    strategy {
      type = "Recreate"
    }

    selector {
      match_labels = {
        app = "paperless-ngx"
      }
    }

    template {
      metadata {
        labels = {
          app = "paperless-ngx"
        }

        annotations = {
          "diun.enable"       = "true"
          "diun.include_tags" = "^\\d+(?:\\.\\d+)?(?:\\.\\d+)?$"
          # NOTE(review): these hosts are literals, whereas the container env
          # takes its DB/Redis hosts from var.mysql_host / var.redis_host.
          # Confirm they always point at the same services.
          "dependency.kyverno.io/wait-for" = "mysql.dbaas:3306,redis-master.redis:6379"
        }
      }

      spec {
        container {
          name = "paperless-ngx"
          # The tag is only the initial value: Keel manages tag updates and
          # the image field is listed under ignore_changes at the bottom.
          image = "ghcr.io/paperless-ngx/paperless-ngx:2.20.14"

          port {
            container_port = 8000
          }

          # Memory limit is sized for the bulk-import worker count configured
          # below; no CPU limit is set.
          # NOTE(review): the CPU request is only 50m although 6 OCR-bound
          # workers are configured. Confirm the scheduler still lands the pod
          # on a node with real headroom.
          resources {
            requests = {
              cpu    = "50m"
              memory = "2Gi"
            }
            limits = {
              memory = "10Gi"
            }
          }

          # Everything persistent (including the consume dir configured
          # below) lives under this mount.
          volume_mount {
            name       = "data"
            mount_path = "/usr/src/paperless/data"
          }

          # ---- Redis ----------------------------------------------------
          env {
            name = "PAPERLESS_REDIS"
            # If redis gets stuck, try deleting the lock files in the log dir.
            value = "redis://${var.redis_host}"
          }
          env {
            name  = "PAPERLESS_REDIS_PREFIX"
            value = "paperless-ngx"
          }

          # ---- Database -------------------------------------------------
          env {
            name  = "PAPERLESS_DBENGINE"
            value = "mariadb"
          }
          env {
            name  = "PAPERLESS_DBHOST"
            value = var.mysql_host
          }
          env {
            name  = "PAPERLESS_DBNAME"
            value = "paperless-ngx"
          }
          env {
            name  = "PAPERLESS_DBUSER"
            value = "paperless-ngx"
          }
          # Password comes from the Secret produced by the ExternalSecret
          # defined in this file.
          env {
            name = "PAPERLESS_DBPASS"
            value_from {
              secret_key_ref {
                name = "paperless-ngx-secrets"
                key  = "db_password"
              }
            }
          }

          # ---- Web / general --------------------------------------------
          env {
            name  = "PAPERLESS_CSRF_TRUSTED_ORIGINS"
            value = "https://paperless-ngx.viktorbarzin.me,https://pdf.viktorbarzin.me"
          }
          env {
            name  = "PAPERLESS_DEBUG"
            value = "false"
          }
          # NOTE(review): relative path. It presumably resolves to
          # /usr/src/paperless/data (the PVC mount); confirm against the
          # container's working directory.
          env {
            name  = "PAPERLESS_MEDIA_ROOT"
            value = "../data"
          }

          # ---- OCR ------------------------------------------------------
          env {
            name  = "PAPERLESS_OCR_USER_ARGS"
            value = "{\"invalidate_digital_signatures\": true}"
          }
          # Languages OCR is run with. bul+eng matches the Bulgarian
          # (Cyrillic) + English document set being imported (e.g. emo's
          # archive). Multiple languages are joined with "+", not a space;
          # tesseract then tries all of them.
          env {
            name  = "PAPERLESS_OCR_LANGUAGE"
            value = "bul+eng"
          }
          # Language data packages installed at container start (space-
          # separated). The image ships eng (+deu/fra/ita/spa); bul has to be
          # apt-installed through this setting so OCR_LANGUAGE=bul+eng
          # resolves.
          env {
            name  = "PAPERLESS_OCR_LANGUAGES"
            value = "bul eng"
          }

          # ---- Tika / Gotenberg -----------------------------------------
          # Office/email documents (.doc/.docx/.xls/.xlsx/.ppt/.pptx/.odt/
          # .eml/.msg) go through Apache Tika (text+metadata) and Gotenberg
          # (-> PDF) so paperless can archive/OCR/index them. Needed for
          # emo's work-PC document set (~4.9k Office files). The endpoints
          # are the tika / gotenberg Services defined later in this file.
          env {
            name  = "PAPERLESS_TIKA_ENABLED"
            value = "1"
          }
          env {
            name  = "PAPERLESS_TIKA_ENDPOINT"
            value = "http://tika.paperless-ngx.svc.cluster.local:9998"
          }
          env {
            name  = "PAPERLESS_TIKA_GOTENBERG_ENDPOINT"
            value = "http://gotenberg.paperless-ngx.svc.cluster.local:3000"
          }

          # ---- Processing concurrency (bulk-import tuning) --------------
          # Tuned for the bulk Emo import (~13.7k docs, mostly scanned/office
          # => OCR/convert-bound, ~3-4 docs/min/worker). 6 workers = 6 docs
          # in parallel. Paperless is a single pod (RWO PVC) pinned to one
          # ~8-core node, so 6 leaves headroom for the system and the node's
          # co-tenants (8 would saturate it). OCR temp files stay on
          # ephemeral scratch (fast); the consume QUEUE sits on the PVC (see
          # the consume-dir settings further down) so a restart never loses
          # queued work. Watch etcd apply latency and dial back if it
          # degrades. Revert workers/threads/mem to defaults once the import
          # is done.
          env {
            name  = "PAPERLESS_TASK_WORKERS"
            value = "6"
          }
          env {
            name  = "PAPERLESS_THREADS_PER_WORKER"
            value = "1"
          }
          # Do not produce the redundant OCR'd archive PDF for inputs that
          # already carry a text layer (born-digital PDFs + office->PDF via
          # Gotenberg). Big speed/IO saver for emo's work-doc set; scanned
          # docs are still OCR'd and archived.
          env {
            name  = "PAPERLESS_OCR_SKIP_ARCHIVE_FILE"
            value = "with_text"
          }

          # ---- Bulk-import ingest via the consume directory -------------
          # Ingest path for the bulk import is the CONSUME DIRECTORY on the
          # PVC, not the API. post_document writes each upload to ephemeral
          # scratch and then queues it in redis, so a pod or redis restart
          # loses in-flight work ("File not found"). The consume dir lives
          # on the encrypted PVC instead, and POLLING re-scans the whole dir
          # every 60s (the watchdog snapshot resets on startup, so files
          # dropped while paperless was down are picked up too) with a
          # size+mtime stability check (a half-copied file is not grabbed).
          # Net effect: restart-safe, self-healing bulk ingest; the folder
          # IS the durable queue. RECURSIVE walks subdirectories (the source
          # tree is copied in with its structure, avoiding basename
          # collisions). Owner+tag are applied by a consumption workflow
          # scoped to the import subdir.
          # Revert (remove these three env blocks) once the import is done.
          env {
            name  = "PAPERLESS_CONSUMPTION_DIR"
            value = "/usr/src/paperless/data/consume"
          }
          env {
            name  = "PAPERLESS_CONSUMER_RECURSIVE"
            value = "true"
          }
          env {
            name  = "PAPERLESS_CONSUMER_POLLING"
            value = "60"
          }
        }

        volume {
          name = "data"
          persistent_volume_claim {
            claim_name = kubernetes_persistent_volume_claim.data_encrypted.metadata[0].name
          }
        }
      }
    }
  }

  lifecycle {
    # Fields that in-cluster controllers (Kyverno, Keel, the deployment
    # controller) write to; ignoring them keeps Terraform from fighting them.
    ignore_changes = [
      spec[0].template[0].spec[0].dns_config, # KYVERNO_LIFECYCLE_V1
      metadata[0].annotations["keel.sh/policy"],
      metadata[0].annotations["keel.sh/trigger"],
      metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2
      metadata[0].annotations["keel.sh/match-tag"],
      spec[0].template[0].spec[0].container[0].image, # KEEL_IGNORE_IMAGE — Keel manages tag updates
      metadata[0].annotations["kubernetes.io/change-cause"],
      metadata[0].annotations["deployment.kubernetes.io/revision"],
      spec[0].template[0].metadata[0].annotations["keel.sh/update-time"], # KEEL_LIFECYCLE_V1
    ]
  }
}
# Service in front of the paperless pod: port 80 -> container port 8000.
resource "kubernetes_service" "paperless-ngx" {
  metadata {
    name      = "paperless-ngx"
    namespace = kubernetes_namespace.paperless-ngx.metadata[0].name

    labels = {
      "app" = "paperless-ngx"
    }
  }

  spec {
    selector = {
      app = "paperless-ngx"
    }

    port {
      name        = "http"
      protocol    = "TCP"
      port        = 80
      target_port = 8000
    }
  }
}
# --- Tika + Gotenberg: Office/email -> text/PDF conversion for paperless ---
# Apache Tika extracts text+metadata; Gotenberg renders Office formats to PDF.
# Paperless routes Office/email docs through these (PAPERLESS_TIKA_* above).
# Both are stateless (no PVC) with pinned images. Gotenberg runs 3 replicas
# during the bulk import: a single LibreOffice instance 503s under concurrent
# paperless workers, so its Service load-balances office conversions across
# the replicas. Tika runs 2 replicas.
# Gotenberg: renders Office formats to PDF for paperless. Stateless, three
# replicas, listening on port 3000.
resource "kubernetes_deployment" "gotenberg" {
  metadata {
    name      = "gotenberg"
    namespace = kubernetes_namespace.paperless-ngx.metadata[0].name

    labels = {
      app  = "gotenberg"
      tier = local.tiers.edge
    }
  }

  spec {
    replicas = 3

    selector {
      match_labels = {
        app = "gotenberg"
      }
    }

    template {
      metadata {
        labels = {
          app = "gotenberg"
        }
      }

      spec {
        container {
          name  = "gotenberg"
          image = "docker.io/gotenberg/gotenberg:8.25"

          # docker-compose `command:` maps to k8s `args`: it overrides CMD
          # but keeps the image's tini ENTRYPOINT. The flags are the
          # hardening options recommended by Paperless.
          args = [
            "gotenberg",
            "--chromium-disable-javascript=true",
            "--chromium-allow-list=file:///tmp/.*",
          ]

          port {
            container_port = 3000
          }

          resources {
            requests = {
              cpu    = "50m"
              memory = "256Mi"
            }
            limits = {
              memory = "1536Mi"
            }
          }

          # A replica only receives traffic from the Service once /health
          # answers successfully.
          readiness_probe {
            initial_delay_seconds = 5
            period_seconds        = 15

            http_get {
              path = "/health"
              port = 3000
            }
          }
        }
      }
    }
  }

  lifecycle {
    ignore_changes = [
      spec[0].template[0].spec[0].dns_config, # KYVERNO_LIFECYCLE_V1
    ]
  }
}
# Service for the gotenberg replicas (port 3000 -> 3000). This is the target
# of PAPERLESS_TIKA_GOTENBERG_ENDPOINT.
resource "kubernetes_service" "gotenberg" {
  metadata {
    name      = "gotenberg"
    namespace = kubernetes_namespace.paperless-ngx.metadata[0].name

    labels = {
      app = "gotenberg"
    }
  }

  spec {
    selector = {
      app = "gotenberg"
    }

    port {
      name        = "http"
      protocol    = "TCP"
      port        = 3000
      target_port = 3000
    }
  }
}
# Apache Tika: extracts text and metadata from Office/email documents for
# paperless. Stateless, two replicas, listening on port 9998.
resource "kubernetes_deployment" "tika" {
  metadata {
    name      = "tika"
    namespace = kubernetes_namespace.paperless-ngx.metadata[0].name

    labels = {
      app  = "tika"
      tier = local.tiers.edge
    }
  }

  spec {
    replicas = 2

    selector {
      match_labels = {
        app = "tika"
      }
    }

    template {
      metadata {
        labels = {
          app = "tika"
        }
      }

      spec {
        container {
          name  = "tika"
          image = "docker.io/apache/tika:3.3.1.0"

          port {
            container_port = 9998
          }

          resources {
            requests = {
              cpu    = "50m"
              memory = "512Mi"
            }
            limits = {
              memory = "1Gi"
            }
          }

          # A replica only receives traffic from the Service once GET /tika
          # answers successfully.
          readiness_probe {
            initial_delay_seconds = 10
            period_seconds        = 15

            http_get {
              path = "/tika"
              port = 9998
            }
          }
        }
      }
    }
  }

  lifecycle {
    ignore_changes = [
      spec[0].template[0].spec[0].dns_config, # KYVERNO_LIFECYCLE_V1
    ]
  }
}
# Service for the tika replicas (port 9998 -> 9998). This is the target of
# PAPERLESS_TIKA_ENDPOINT.
resource "kubernetes_service" "tika" {
  metadata {
    name      = "tika"
    namespace = kubernetes_namespace.paperless-ngx.metadata[0].name

    labels = {
      app = "tika"
    }
  }

  spec {
    selector = {
      app = "tika"
    }

    port {
      name        = "http"
      protocol    = "TCP"
      port        = 9998
      target_port = 9998
    }
  }
}
# Ingress for Paperless, built by the shared ingress_factory module: host
# "pdf" is routed to the paperless-ngx Service on port 80.
module "ingress" {
  source = "../../modules/kubernetes/ingress_factory"

  name            = "paperless-ngx"
  namespace       = kubernetes_namespace.paperless-ngx.metadata[0].name
  host            = "pdf"
  dns_type        = "proxied"
  service_name    = "paperless-ngx"
  port            = 80
  tls_secret_name = var.tls_secret_name

  # auth = "app": authentication is handled by Paperless itself. The
  # `Paperless` mobile app uses /api/* with token auth and cannot follow
  # Authentik 302s; the web UI is gated by Paperless's own login.
  auth = "app"

  # Homepage dashboard discovery + widget configuration.
  # NOTE(review): the widget username/password presumably end up as plain-text
  # annotations on the Ingress object, readable by anyone who can read it.
  # Confirm this exposure is acceptable.
  extra_annotations = {
    "gethomepage.dev/enabled"     = "true"
    "gethomepage.dev/description" = "Document library"
    "gethomepage.dev/group"       = "Productivity"
    "gethomepage.dev/icon"        = "paperless-ngx.png"
    "gethomepage.dev/name"        = "Paperless-ngx"
    "gethomepage.dev/widget.type" = "paperlessngx"
    "gethomepage.dev/widget.url"  = "http://paperless-ngx.paperless-ngx.svc.cluster.local"
    # "gethomepage.dev/widget.token" = var.homepage_token
    "gethomepage.dev/widget.username" = local.homepage_credentials["paperless-ngx"]["username"]
    "gethomepage.dev/widget.password" = local.homepage_credentials["paperless-ngx"]["password"]
    "gethomepage.dev/widget.fields"   = "[\"total\"]"
    "gethomepage.dev/pod-selector"    = ""
    # gethomepage.dev/weight: 10 # optional
    # gethomepage.dev/instance: "public" # optional
  }
}
# CI retrigger 2026-05-16T13:42:57+00:00 — bulk enrollment apply (pipeline #689 killed)
# CI retrigger v2 2026-05-16T13:46:35+00:00