2026-03-07 14:30:36 +00:00
|
|
|
# Name of the TLS secret. It is forwarded unchanged to the tls_secret module and
# to both ingress_factory modules in this stack. Flagged sensitive so Terraform
# redacts the value in plan/apply output.
variable "tls_secret_name" {
  sensitive = true
  type      = string
}
|
[ci skip] Infrastructure hardening: security, monitoring, reliability, maintainability
Phase 1 - Critical Security:
- Netbox: move hardcoded DB/superuser passwords to variables
- MeshCentral: disable public registration, add Authentik auth
- Traefik: disable insecure API dashboard (api.insecure=false)
- Traefik: configure forwarded headers with Cloudflare trusted IPs
Phase 2 - Security Hardening:
- Add security headers middleware (HSTS, X-Frame-Options, nosniff, etc.)
- Add Kyverno pod security policies in audit mode (privileged, host
namespaces, SYS_ADMIN, trusted registries)
- Tighten rate limiting (avg=10, burst=50)
- Add Authentik protection to grampsweb
Phase 3 - Monitoring & Alerting:
- Add critical service alerts (PostgreSQL, MySQL, Redis, Headscale,
Authentik, Loki)
- Increase Loki retention from 7 to 30 days (720h)
- Add predictive PV filling alert (predict_linear)
- Re-enable Hackmd and Privatebin down alerts
Phase 4 - Reliability:
- Add resource requests/limits to Redis, DBaaS, Technitium, Headscale,
Vaultwarden, Uptime Kuma
- Increase Alloy DaemonSet memory to 512Mi/1Gi
Phase 6 - Maintainability:
- Extract duplicated tiers locals to terragrunt.hcl generate block
(removed from 67 stacks)
- Replace hardcoded NFS IP 10.0.10.15 with var.nfs_server (114
instances across 63 files)
- Replace hardcoded Redis/PostgreSQL/MySQL/Ollama/mail host references
with variables across ~35 stacks
- Migrate xray raw ingress resources to ingress_factory modules
2026-02-23 22:05:28 +00:00
|
|
|
# NFS server host, passed as `nfs_server` to every nfs_volume module in this stack.
variable "nfs_server" {
  type = string
}
|
2026-02-22 13:56:34 +00:00
|
|
|
|
2026-02-22 15:13:55 +00:00
|
|
|
# Namespace that every other resource and module in this stack is placed in.
resource "kubernetes_namespace" "calibre" {
  metadata {
    name = "calibre"

    # `local.tiers` is not declared in this file; it is expected to be supplied
    # from outside this view.
    labels = {
      tier = local.tiers.edge
    }

    # Istio sidecar-injection label, currently commented out:
    # labels = {
    #   "istio-injection" : "enabled"
    # }
  }
}
|
|
|
|
|
|
migrate 16 plan-time stacks: vault data source → ESO + kubernetes_secret
Replaced data "vault_kv_secret_v2" with:
1. ExternalSecret (ESO syncs Vault KV → K8s Secret)
2. data "kubernetes_secret" (reads ESO-created secret at plan time)
This removes the Vault provider dependency at plan time for these
stacks — they now only need K8s API access, not a Vault token.
Stacks: actualbudget, affine, audiobookshelf, calibre, changedetection,
coturn, freedify, freshrss, grampsweb, navidrome, novelapp, ollama,
owntracks, real-estate-crawler, servarr, ytdlp
2026-03-15 22:06:39 +00:00
|
|
|
# ExternalSecret custom resource: requests that the `calibre-secrets` Secret be
# populated from the `vault-kv` ClusterSecretStore, refreshed every 15 minutes.
resource "kubernetes_manifest" "external_secret" {
  manifest = {
    apiVersion = "external-secrets.io/v1beta1"
    kind       = "ExternalSecret"

    metadata = {
      name = "calibre-secrets"
      # Reference the namespace resource instead of repeating the literal
      # "calibre": same value, but it stays in sync with the namespace
      # definition and matches how every other block in this stack names it.
      namespace = kubernetes_namespace.calibre.metadata[0].name
    }

    spec = {
      refreshInterval = "15m"

      secretStoreRef = {
        name = "vault-kv"
        kind = "ClusterSecretStore"
      }

      # Name of the Kubernetes Secret the operator writes to.
      target = {
        name = "calibre-secrets"
      }

      # Extract everything stored under the `calibre` key of the store.
      dataFrom = [{
        extract = {
          key = "calibre"
        }
      }]
    }
  }

  # Kept explicit for readability; the namespace reference above already
  # creates the same ordering.
  depends_on = [kubernetes_namespace.calibre]
}
|
|
|
|
|
|
|
|
|
|
# Reads the `calibre-secrets` Secret (the target named by
# kubernetes_manifest.external_secret) so its keys can be used in this stack.
data "kubernetes_secret" "eso_secrets" {
  # Order the read after the ExternalSecret manifest has been applied.
  depends_on = [kubernetes_manifest.external_secret]

  metadata {
    namespace = kubernetes_namespace.calibre.metadata[0].name
    name      = "calibre-secrets"
  }
}
|
|
|
|
|
|
|
|
|
|
locals {
  # JSON document stored under the `homepage_credentials` key of the synced
  # secret, decoded into a Terraform value. The ingress module indexes it as
  # ["calibre-web"]["username"] and ["calibre-web"]["password"].
  homepage_credentials = jsondecode(
    data.kubernetes_secret.eso_secrets.data["homepage_credentials"]
  )
}
|
|
|
|
|
|
2026-02-22 15:13:55 +00:00
|
|
|
# Invokes the shared setup_tls_secret module for the calibre namespace.
module "tls_secret" {
  source = "../../modules/kubernetes/setup_tls_secret"

  tls_secret_name = var.tls_secret_name
  namespace       = kubernetes_namespace.calibre.metadata[0].name
}
|
|
|
|
|
|
[ci skip] migrate 29 services from inline NFS to CSI-backed PV/PVC
Batch migration of all single-volume and simple multi-volume stacks.
All services verified healthy after migration. Uses nfs-truenas
StorageClass with soft,timeo=30,retrans=3 mount options to eliminate
stale NFS mount hangs.
Services: atuin, audiobookshelf, calibre, changedetection, diun,
excalidraw, forgejo, freshrss, grampsweb, hackmd, health,
isponsorblocktv, matrix, meshcentral, n8n, navidrome, ntfy, ollama,
onlyoffice, owntracks, paperless-ngx, poison-fountain, send,
stirling-pdf, tandoor, wealthfolio, whisper, woodpecker, ytdlp
2026-03-02 00:15:39 +00:00
|
|
|
# NFS-backed volume for the Calibre library; its `claim_name` output is mounted
# at /calibre-library by the calibre-web-automated deployment.
module "nfs_library" {
  source = "../../modules/kubernetes/nfs_volume"

  namespace  = kubernetes_namespace.calibre.metadata[0].name
  name       = "calibre-library"
  nfs_server = var.nfs_server
  nfs_path   = "/mnt/main/calibre-web-automated/calibre-library"
}
|
|
|
|
|
|
|
|
|
|
# NFS-backed volume for application config; its `claim_name` output is mounted
# at /config by the calibre-web-automated deployment.
module "nfs_config" {
  source = "../../modules/kubernetes/nfs_volume"

  namespace  = kubernetes_namespace.calibre.metadata[0].name
  name       = "calibre-config"
  nfs_server = var.nfs_server
  nfs_path   = "/mnt/main/calibre-web-automated/config"
}
|
|
|
|
|
|
|
|
|
|
# NFS-backed volume for the book ingest directory. Its `claim_name` output is
# mounted by both deployments in this stack (calibre-web-automated and
# annas-archive-stacks).
module "nfs_ingest" {
  source = "../../modules/kubernetes/nfs_volume"

  namespace  = kubernetes_namespace.calibre.metadata[0].name
  name       = "calibre-ingest"
  nfs_server = var.nfs_server
  nfs_path   = "/mnt/main/calibre-web-automated/cwa-book-ingest"
}
|
|
|
|
|
|
|
|
|
|
# NFS-backed volume for the Stacks config; its `claim_name` output is mounted at
# /opt/stacks/config by the annas-archive-stacks deployment.
module "nfs_stacks_config" {
  source = "../../modules/kubernetes/nfs_volume"

  namespace  = kubernetes_namespace.calibre.metadata[0].name
  name       = "calibre-stacks-config"
  nfs_server = var.nfs_server
  nfs_path   = "/mnt/main/calibre-web-automated/stacks"
}
|
|
|
|
|
|
2026-02-22 15:13:55 +00:00
|
|
|
# resource "kubernetes_deployment" "calibre" {
|
|
|
|
|
# metadata {
|
|
|
|
|
# name = "calibre"
|
|
|
|
|
# namespace = kubernetes_namespace.calibre.metadata[0].name
|
|
|
|
|
# labels = {
|
|
|
|
|
# app = "calibre"
|
|
|
|
|
# }
|
|
|
|
|
# annotations = {
|
|
|
|
|
# "reloader.stakater.com/search" = "true"
|
|
|
|
|
# }
|
|
|
|
|
# }
|
|
|
|
|
# spec {
|
|
|
|
|
# replicas = 1
|
|
|
|
|
# strategy {
|
|
|
|
|
# type = "Recreate"
|
|
|
|
|
# }
|
|
|
|
|
# selector {
|
|
|
|
|
# match_labels = {
|
|
|
|
|
# app = "calibre"
|
|
|
|
|
# }
|
|
|
|
|
# }
|
|
|
|
|
# template {
|
|
|
|
|
# metadata {
|
|
|
|
|
# annotations = {
|
|
|
|
|
# # "diun.enable" = "true"
|
|
|
|
|
# "diun.enable" = "false"
|
|
|
|
|
# "diun.include_tags" = "^\\d+(?:\\.\\d+)?(?:\\.\\d+)?$"
|
|
|
|
|
# }
|
|
|
|
|
# labels = {
|
|
|
|
|
# app = "calibre"
|
|
|
|
|
# }
|
|
|
|
|
# }
|
|
|
|
|
# spec {
|
|
|
|
|
# container {
|
|
|
|
|
# image = "lscr.io/linuxserver/calibre-web:latest"
|
|
|
|
|
# name = "calibre"
|
|
|
|
|
# env {
|
|
|
|
|
# name = "PUID"
|
|
|
|
|
# value = 1000
|
|
|
|
|
# }
|
|
|
|
|
# env {
|
|
|
|
|
# name = "PGID"
|
|
|
|
|
# value = 1000
|
|
|
|
|
# }
|
|
|
|
|
# env {
|
|
|
|
|
# name = "DOCKER_MODS"
|
|
|
|
|
# value = "linuxserver/mods:universal-calibre"
|
|
|
|
|
# }
|
|
|
|
|
|
|
|
|
|
# port {
|
|
|
|
|
# container_port = 8083
|
|
|
|
|
# }
|
|
|
|
|
# volume_mount {
|
|
|
|
|
# name = "data"
|
|
|
|
|
# mount_path = "/config"
|
|
|
|
|
# }
|
|
|
|
|
# volume_mount {
|
|
|
|
|
# name = "data"
|
|
|
|
|
# mount_path = "/books"
|
|
|
|
|
# }
|
|
|
|
|
# }
|
|
|
|
|
# volume {
|
|
|
|
|
# name = "data"
|
|
|
|
|
# nfs {
|
|
|
|
|
# path = "/mnt/main/calibre"
|
[ci skip] Infrastructure hardening: security, monitoring, reliability, maintainability
Phase 1 - Critical Security:
- Netbox: move hardcoded DB/superuser passwords to variables
- MeshCentral: disable public registration, add Authentik auth
- Traefik: disable insecure API dashboard (api.insecure=false)
- Traefik: configure forwarded headers with Cloudflare trusted IPs
Phase 2 - Security Hardening:
- Add security headers middleware (HSTS, X-Frame-Options, nosniff, etc.)
- Add Kyverno pod security policies in audit mode (privileged, host
namespaces, SYS_ADMIN, trusted registries)
- Tighten rate limiting (avg=10, burst=50)
- Add Authentik protection to grampsweb
Phase 3 - Monitoring & Alerting:
- Add critical service alerts (PostgreSQL, MySQL, Redis, Headscale,
Authentik, Loki)
- Increase Loki retention from 7 to 30 days (720h)
- Add predictive PV filling alert (predict_linear)
- Re-enable Hackmd and Privatebin down alerts
Phase 4 - Reliability:
- Add resource requests/limits to Redis, DBaaS, Technitium, Headscale,
Vaultwarden, Uptime Kuma
- Increase Alloy DaemonSet memory to 512Mi/1Gi
Phase 6 - Maintainability:
- Extract duplicated tiers locals to terragrunt.hcl generate block
(removed from 67 stacks)
- Replace hardcoded NFS IP 10.0.10.15 with var.nfs_server (114
instances across 63 files)
- Replace hardcoded Redis/PostgreSQL/MySQL/Ollama/mail host references
with variables across ~35 stacks
- Migrate xray raw ingress resources to ingress_factory modules
2026-02-23 22:05:28 +00:00
|
|
|
# server = var.nfs_server
|
2026-02-22 15:13:55 +00:00
|
|
|
# }
|
|
|
|
|
# }
|
|
|
|
|
# }
|
|
|
|
|
# }
|
|
|
|
|
# }
|
|
|
|
|
# }
|
|
|
|
|
|
|
|
|
|
# Calibre-Web-Automated deployment: one replica, recreated (not rolled) on
# change, listening on 8083 and mounting the config, library and ingest volumes
# provided by the nfs_* modules.
resource "kubernetes_deployment" "calibre-web-automated" {
  wait_for_rollout = true

  metadata {
    name      = "calibre-web-automated"
    namespace = kubernetes_namespace.calibre.metadata[0].name

    labels = {
      app  = "calibre-web-automated"
      tier = local.tiers.edge
    }

    annotations = {
      "reloader.stakater.com/search" = "true"
    }
  }

  spec {
    replicas = 1

    strategy {
      type = "Recreate"
    }

    selector {
      match_labels = {
        app = "calibre-web-automated"
      }
    }

    template {
      metadata {
        annotations = {
          # "diun.enable" = "true"
          "diun.enable"       = "false"
          "diun.include_tags" = "^\\d+(?:\\.\\d+)?(?:\\.\\d+)?$"
        }

        labels = {
          app = "calibre-web-automated"
        }
      }

      spec {
        container {
          image = "viktorbarzin/calibre-web-automated:latest"
          name  = "calibre-web-automated"

          # Env values are strings in the pod spec; PUID/PGID are quoted to
          # match the other entries (the rendered value is unchanged).
          env {
            name  = "PUID"
            value = "1000"
          }
          env {
            name  = "PGID"
            value = "1000"
          }
          env {
            name  = "NO_CHOWN"
            value = "true"
          }
          env {
            # If your library is on a network share (e.g., NFS/SMB), disable WAL to reduce locking issues
            name  = "NETWORK_SHARE_MODE"
            value = "true"
          }
          env {
            name  = "CALIBRE_PORT"
            value = "8083"
          }

          port {
            container_port = 8083
          }

          # Startup gate: first check after 10s, then every 5s, tolerating up
          # to 24 failures before the container is considered failed.
          startup_probe {
            http_get {
              path = "/"
              port = 8083
            }
            initial_delay_seconds = 10
            timeout_seconds       = 5
            period_seconds        = 5
            failure_threshold     = 24
          }

          # Liveness: checked every 30s; three consecutive failures trigger a
          # restart.
          liveness_probe {
            http_get {
              path = "/"
              port = 8083
            }
            timeout_seconds   = 5
            period_seconds    = 30
            failure_threshold = 3
          }

          resources {
            requests = {
              cpu    = "50m"
              memory = "512Mi"
            }
            limits = {
              memory = "1Gi"
            }
          }

          volume_mount {
            name       = "config"
            mount_path = "/config"
          }
          volume_mount {
            name       = "library"
            mount_path = "/calibre-library"
          }
          volume_mount {
            name       = "ingest"
            mount_path = "/cwa-book-ingest"
          }
        }

        volume {
          name = "library"
          persistent_volume_claim {
            claim_name = module.nfs_library.claim_name
          }
        }
        volume {
          name = "config"
          persistent_volume_claim {
            claim_name = module.nfs_config.claim_name
          }
        }
        volume {
          name = "ingest"
          persistent_volume_claim {
            claim_name = module.nfs_ingest.claim_name
          }
        }
      }
    }
  }
}
|
|
|
|
|
# Service named `calibre`: exposes port 80 and forwards to container port 8083
# on pods labelled app=calibre-web-automated.
resource "kubernetes_service" "calibre" {
  metadata {
    namespace = kubernetes_namespace.calibre.metadata[0].name
    name      = "calibre"

    labels = {
      "app" = "calibre"
    }
  }

  spec {
    selector = {
      # app = "calibre"
      app = "calibre-web-automated"
    }

    port {
      name        = "http"
      protocol    = "TCP"
      port        = 80
      target_port = 8083
    }
  }
}
|
|
|
|
|
|
|
|
|
|
# Ingress for the `calibre` service, built by the shared ingress_factory module.
# The extra annotations register the service with gethomepage.dev, including a
# calibreweb widget pointed at the in-cluster service URL.
module "ingress" {
  source = "../../modules/kubernetes/ingress_factory"

  namespace       = kubernetes_namespace.calibre.metadata[0].name
  name            = "calibre"
  tls_secret_name = var.tls_secret_name

  extra_annotations = {
    "gethomepage.dev/enabled"     = "true"
    "gethomepage.dev/description" = "Book library"
    "gethomepage.dev/group"       = "Media & Entertainment"
    "gethomepage.dev/icon"        = "calibre-web.png"
    "gethomepage.dev/name"        = "Calibre"
    "gethomepage.dev/widget.type" = "calibreweb"
    "gethomepage.dev/widget.url"  = "http://calibre.calibre.svc.cluster.local"
    # NOTE(review): the widget credentials are passed as annotation values;
    # presumably ingress_factory writes them onto the Ingress object as-is,
    # where anyone who can read Ingresses can see them — confirm this is
    # acceptable.
    "gethomepage.dev/widget.username" = local.homepage_credentials["calibre-web"]["username"]
    "gethomepage.dev/widget.password" = local.homepage_credentials["calibre-web"]["password"]
    "gethomepage.dev/pod-selector"    = ""
    # gethomepage.dev/weight: 10 # optional
    # gethomepage.dev/instance: "public" # optional
  }

  rybbit_site_id                 = "17a5c7fbb077"
  custom_content_security_policy = "script-src 'self' 'unsafe-inline' 'unsafe-eval' https://rybbit.viktorbarzin.me"
}
|
|
|
|
|
|
|
|
|
|
# Stacks - Anna's Archive Download Manager
|
|
|
|
|
|
|
|
|
|
# Stacks deployment: one replica listening on 7788, mounting its own config
# volume plus the ingest volume that calibre-web-automated also mounts.
resource "kubernetes_deployment" "annas-archive-stacks" {
  metadata {
    namespace = kubernetes_namespace.calibre.metadata[0].name
    name      = "annas-archive-stacks"

    labels = {
      app  = "annas-archive-stacks"
      tier = local.tiers.edge
    }
  }

  spec {
    replicas = 1

    selector {
      match_labels = {
        app = "annas-archive-stacks"
      }
    }

    template {
      metadata {
        labels = {
          app = "annas-archive-stacks"
        }
      }

      spec {
        container {
          name  = "annas-archive-stacks"
          image = "zelest/stacks:latest"

          # Memory request equals the limit; no CPU limit is set.
          resources {
            requests = {
              cpu    = "10m"
              memory = "384Mi"
            }
            limits = {
              memory = "384Mi"
            }
          }

          port {
            container_port = 7788
          }

          volume_mount {
            name       = "config"
            mount_path = "/opt/stacks/config"
          }
          volume_mount {
            name       = "ingest"
            mount_path = "/opt/stacks/download" # this must be the same as CWA ingest dir to auto ingest
          }
        }

        volume {
          name = "config"
          persistent_volume_claim {
            claim_name = module.nfs_stacks_config.claim_name
          }
        }
        volume {
          name = "ingest"
          persistent_volume_claim {
            claim_name = module.nfs_ingest.claim_name
          }
        }
      }
    }
  }
}
|
|
|
|
|
|
|
|
|
|
# Service for the Stacks deployment: exposes port 80 and forwards to container
# port 7788 on pods labelled app=annas-archive-stacks.
resource "kubernetes_service" "annas-archive-stacks" {
  metadata {
    name      = "annas-archive-stacks"
    namespace = kubernetes_namespace.calibre.metadata[0].name

    labels = {
      "app" = "annas-archive-stacks"
    }
  }

  spec {
    selector = {
      app = "annas-archive-stacks"
    }

    port {
      name = "http"
      # Numeric literal (was the string "80") to match the `calibre` service;
      # the resulting port is the same.
      port        = 80
      target_port = 7788
    }
  }
}
|
|
|
|
|
|
|
|
|
|
# Ingress named `stacks` routing to the annas-archive-stacks service, built by
# the shared ingress_factory module with `protected = true`. It is hidden from
# the gethomepage.dev dashboard.
module "stacks-ingress" {
  source = "../../modules/kubernetes/ingress_factory"

  name            = "stacks"
  namespace       = kubernetes_namespace.calibre.metadata[0].name
  service_name    = "annas-archive-stacks"
  tls_secret_name = var.tls_secret_name

  protected      = true
  rybbit_site_id = "ce5f8aed6bbb"

  extra_annotations = {
    "gethomepage.dev/enabled" = "false"
  }
}
|