2021-04-17 19:19:04 +01:00
# DB as a service. Installs MySQL operator
# Name of the TLS secret; forwarded to module.tls_secret and module.ingress.
variable "tls_secret_name" {
  type = string
}
2026-01-10 16:28:12 +00:00
# Value of the "tier" label applied to the dbaas namespace and the phpMyAdmin
# deployment.
variable "tier" {
  type = string
}
2021-05-05 19:17:56 +01:00
# MySQL root password. Used for the InnoDB Cluster root credentials and stored
# in the "cluster-secret" Kubernetes secret.
variable "dbaas_root_password" {
  type = string
  # Keep the password out of plan/apply output, matching kube_config_path.
  sensitive = true
}
2021-05-03 14:59:17 +01:00
# Name of the Kubernetes Service that fronts MySQL; also used as phpMyAdmin's
# PMA_HOST.
variable "cluster_master_service" {
  type    = string
  default = "mysql"
}
2023-11-24 17:38:49 +00:00
# NOTE(review): presumably the PostgreSQL superuser password; its consumers are
# not visible in this part of the file.
variable "postgresql_root_password" {
  type      = string
  sensitive = true
}
# NOTE(review): presumably the pgAdmin login password; its consumers are not
# visible in this part of the file.
variable "pgadmin_password" {
  type      = string
  sensitive = true
}
2021-05-05 19:17:56 +01:00
# Boolean environment switch, off by default.
# NOTE(review): no reference to var.prod is visible in this part of the file.
variable "prod" {
  type    = bool
  default = false
}
[ci skip] Infrastructure hardening: security, monitoring, reliability, maintainability
Phase 1 - Critical Security:
- Netbox: move hardcoded DB/superuser passwords to variables
- MeshCentral: disable public registration, add Authentik auth
- Traefik: disable insecure API dashboard (api.insecure=false)
- Traefik: configure forwarded headers with Cloudflare trusted IPs
Phase 2 - Security Hardening:
- Add security headers middleware (HSTS, X-Frame-Options, nosniff, etc.)
- Add Kyverno pod security policies in audit mode (privileged, host
namespaces, SYS_ADMIN, trusted registries)
- Tighten rate limiting (avg=10, burst=50)
- Add Authentik protection to grampsweb
Phase 3 - Monitoring & Alerting:
- Add critical service alerts (PostgreSQL, MySQL, Redis, Headscale,
Authentik, Loki)
- Increase Loki retention from 7 to 30 days (720h)
- Add predictive PV filling alert (predict_linear)
- Re-enable Hackmd and Privatebin down alerts
Phase 4 - Reliability:
- Add resource requests/limits to Redis, DBaaS, Technitium, Headscale,
Vaultwarden, Uptime Kuma
- Increase Alloy DaemonSet memory to 512Mi/1Gi
Phase 6 - Maintainability:
- Extract duplicated tiers locals to terragrunt.hcl generate block
(removed from 67 stacks)
- Replace hardcoded NFS IP 10.0.10.15 with var.nfs_server (114
instances across 63 files)
- Replace hardcoded Redis/PostgreSQL/MySQL/Ollama/mail host references
with variables across ~35 stacks
- Migrate xray raw ingress resources to ingress_factory modules
2026-02-23 22:05:28 +00:00
# NFS server address handed to the nfs_volume modules that back the backup and
# pgAdmin volumes.
variable "nfs_server" {
  type = string
}
2026-03-07 14:30:36 +00:00
# Path to a kubeconfig file; marked sensitive so it is redacted in plan output.
# NOTE(review): its consumer is not visible in this part of the file.
variable "kube_config_path" {
  type      = string
  sensitive = true
}
2021-04-17 19:19:04 +01:00
# Namespace that holds every DBaaS workload declared in this file.
resource "kubernetes_namespace" "dbaas" {
  metadata {
    name = "dbaas"

    labels = {
      tier = var.tier
      # NOTE(review): presumably opts this namespace out of a default quota in
      # favour of kubernetes_resource_quota.dbaas; confirm against the
      # resource-governance policy.
      "resource-governance/custom-quota" = "true"
    }
  }
}
# Aggregate CPU, memory and pod-count ceilings for the dbaas namespace.
resource "kubernetes_resource_quota" "dbaas" {
  metadata {
    name      = "dbaas-quota"
    namespace = kubernetes_namespace.dbaas.metadata[0].name
  }

  spec {
    hard = {
      "requests.cpu"    = "8"
      "requests.memory" = "12Gi"
      "limits.cpu"      = "32"
      "limits.memory"   = "64Gi"
      pods              = "30"
    }
  }
}
# Instantiates the shared setup_tls_secret module for the dbaas namespace.
module "tls_secret" {
  source          = "../../../../modules/kubernetes/setup_tls_secret"
  namespace       = kubernetes_namespace.dbaas.metadata[0].name
  tls_secret_name = var.tls_secret_name
}
2023-04-24 02:59:24 +01:00
2026-03-01 03:00:21 +00:00
#### MYSQL — InnoDB Cluster via MySQL Operator
#
# 3 MySQL servers with Group Replication + 1 MySQL Router for auto-failover.
# Operator installed in mysql-operator namespace (toleration for control-plane).
# Init containers are slow (~20 min each) due to mysqlsh plugin loading.
2026-02-28 22:53:33 +00:00
2026-03-01 03:00:21 +00:00
# MySQL Operator, installed into its own "mysql-operator" namespace (created by
# Helm). The InnoDB Cluster release below depends on it.
resource "helm_release" "mysql_operator" {
  name             = "mysql-operator"
  namespace        = "mysql-operator"
  create_namespace = true

  repository = "https://mysql.github.io/mysql-operator/"
  chart      = "mysql-operator"
  version    = "2.2.7"

  # Seconds Helm waits for the release before giving up.
  timeout = 300
}
2026-03-01 17:16:03 +00:00
# The mysql-sidecar ClusterRole created by the Helm chart is missing
# namespace and CRD list/watch permissions needed by the kopf framework
# in the sidecar container. Without these, the sidecar enters degraded
# mode and never completes InnoDB cluster join operations.
# Supplementary ClusterRole granting list/watch on namespaces and CRDs.
# It is bound to the cluster's service account by the ClusterRoleBinding of the
# same name.
resource "kubernetes_cluster_role" "mysql_sidecar_extra" {
  metadata {
    name = "mysql-sidecar-extra"
  }

  # Core API group: namespaces.
  rule {
    api_groups = [""]
    resources  = ["namespaces"]
    verbs      = ["list", "watch"]
  }

  # CustomResourceDefinitions.
  rule {
    api_groups = ["apiextensions.k8s.io"]
    resources  = ["customresourcedefinitions"]
    verbs      = ["list", "watch"]
  }
}
# Binds the mysql-sidecar-extra ClusterRole to the "mysql-cluster-sa" service
# account in the dbaas namespace.
resource "kubernetes_cluster_role_binding" "mysql_sidecar_extra" {
  metadata {
    name = "mysql-sidecar-extra"
  }

  role_ref {
    api_group = "rbac.authorization.k8s.io"
    kind      = "ClusterRole"
    name      = kubernetes_cluster_role.mysql_sidecar_extra.metadata[0].name
  }

  subject {
    kind      = "ServiceAccount"
    name      = "mysql-cluster-sa"
    namespace = kubernetes_namespace.dbaas.metadata[0].name
  }
}
2026-03-01 03:00:21 +00:00
# InnoDB Cluster (three mysqld servers plus one router) rendered from the
# mysql-innodbcluster chart into the dbaas namespace. Requires the operator
# release to exist first.
resource "helm_release" "mysql_cluster" {
  name             = "mysql-cluster"
  namespace        = kubernetes_namespace.dbaas.metadata[0].name
  create_namespace = false

  repository = "https://mysql.github.io/mysql-operator/"
  chart      = "mysql-innodbcluster"
  version    = "2.2.7"
  timeout    = 900

  values = [yamlencode({
    serverInstances = 3
    routerInstances = 1
    serverVersion   = "8.4.4"

    # Root account; host "%" allows connections from any client address.
    credentials = {
      root = {
        user     = "root"
        password = var.dbaas_root_password
        host     = "%"
      }
    }

    tls = {
      useSelfSigned = true
    }

    # Per-server data directory volume on the iSCSI-backed storage class.
    datadirVolumeClaimTemplate = {
      storageClassName = "iscsi-truenas"
      resources = {
        requests = {
          storage = "30Gi"
        }
      }
    }

    serverConfig = {
      "my.cnf" = <<-EOT
        [mysqld]
        skip-name-resolve
      EOT
    }

    resources = {
      requests = {
        cpu    = "250m"
        memory = "1Gi"
      }
      limits = {
        cpu    = "2"
        memory = "3Gi"
      }
    }

    podSpec = {
      # Hard scheduling rule: never place a server pod on k8s-node2.
      affinity = {
        nodeAffinity = {
          requiredDuringSchedulingIgnoredDuringExecution = {
            nodeSelectorTerms = [{
              matchExpressions = [{
                key      = "kubernetes.io/hostname"
                operator = "NotIn"
                values   = ["k8s-node2"]
              }]
            }]
          }
        }
      }

      # Same figures as the top-level `resources` above, applied to the
      # "mysql" container.
      containers = [{
        name = "mysql"
        resources = {
          requests = {
            memory = "1Gi"
            cpu    = "250m"
          }
          limits = {
            memory = "3Gi"
            cpu    = "2"
          }
        }
      }]

      # Explicit requests/limits for each init container.
      initContainers = [
        {
          name = "fixdatadir"
          resources = {
            requests = { memory = "64Mi", cpu = "25m" }
            limits   = { memory = "256Mi", cpu = "500m" }
          }
        },
        {
          name = "initconf"
          resources = {
            requests = { memory = "256Mi", cpu = "50m" }
            limits   = { memory = "1Gi", cpu = "1" }
          }
        },
        {
          name = "initmysql"
          resources = {
            requests = { memory = "512Mi", cpu = "250m" }
            limits   = { memory = "2Gi", cpu = "2" }
          }
        }
      ]
    }
  })]

  depends_on = [helm_release.mysql_operator]
}
2026-03-01 12:16:28 +00:00
# Compatibility service: <cluster_master_service>.dbaas selects the InnoDB
# Cluster mysqld pod labelled cluster-role=PRIMARY directly, bypassing MySQL
# Router. Not-ready addresses are published so that clients keep an endpoint
# during partial cluster failures instead of suffering a total outage.
2026-03-01 03:00:21 +00:00
# Service named after var.cluster_master_service that exposes port 3306 of the
# mysqld pod currently labelled as PRIMARY in the "mysql-cluster" InnoDB
# Cluster.
resource "kubernetes_service" "mysql" {
  metadata {
    name      = var.cluster_master_service
    namespace = kubernetes_namespace.dbaas.metadata[0].name
  }

  spec {
    # Bypass the InnoDB Cluster readiness gate during partial failures.
    publish_not_ready_addresses = true

    selector = {
      "component"                     = "mysqld"
      "mysql.oracle.com/cluster"      = "mysql-cluster"
      "mysql.oracle.com/cluster-role" = "PRIMARY"
    }

    port {
      port        = 3306
      target_port = 3306
    }
  }

  depends_on = [helm_release.mysql_cluster]
}
[ci skip] complete NFS CSI migration: complex stacks + platform modules
Migrate remaining multi-volume stacks and all platform modules from
inline NFS volumes to CSI-backed PV/PVC with nfs-truenas StorageClass
(soft,timeo=30,retrans=3 mount options).
Complex stacks: openclaw (4 vols), immich (8 vols), frigate (2 vols),
nextcloud (2 vols + old PV replaced), rybbit (1 vol)
Remaining stacks: affine, ebook2audiobook, f1-stream, osm_routing,
real-estate-crawler
Platform modules: monitoring (prometheus, loki, alertmanager PVs
converted from native NFS to CSI), redis, dbaas, technitium,
headscale, vaultwarden, uptime-kuma, mailserver, infra-maintenance
2026-03-02 01:24:07 +00:00
# NFS-backed volume for MySQL dumps; its claim is mounted by the mysql-backup
# CronJob.
module "nfs_mysql_backup" {
  source     = "../../../../modules/kubernetes/nfs_volume"
  name       = "dbaas-mysql-backup"
  namespace  = kubernetes_namespace.dbaas.metadata[0].name
  nfs_server = var.nfs_server
  nfs_path   = "/mnt/main/mysql-backup"
}
# NFS-backed volume rooted at the pgAdmin directory on the NFS server.
# NOTE(review): the consumer of this claim is not visible in this part of the
# file.
module "nfs_pgadmin" {
  source     = "../../../../modules/kubernetes/nfs_volume"
  name       = "dbaas-pgadmin"
  namespace  = kubernetes_namespace.dbaas.metadata[0].name
  nfs_server = var.nfs_server
  nfs_path   = "/mnt/main/postgresql/pgadmin"
}
# NFS-backed volume rooted at the PostgreSQL backup directory on the NFS
# server.
# NOTE(review): the consumer of this claim is not visible in this part of the
# file.
module "nfs_postgresql_backup" {
  source     = "../../../../modules/kubernetes/nfs_volume"
  name       = "dbaas-postgresql-backup"
  namespace  = kubernetes_namespace.dbaas.metadata[0].name
  nfs_server = var.nfs_server
  nfs_path   = "/mnt/main/postgresql-backup"
}
2024-01-06 16:23:39 +00:00
# Nightly logical backup of every MySQL database.
#
# Runs mysqldump against the MySQL service, writes a timestamped dump to the
# NFS-backed /backup volume, then deletes dumps older than 14 days.
#
# The root password is injected through the MYSQL_PWD environment variable
# from the "cluster-secret" secret. It is deliberately NOT interpolated into
# the command string: that would store the clear-text password in the CronJob
# spec and echo it to the pod log through `set -x`.
# NOTE(review): MYSQL_PWD is flagged as deprecated in the MySQL manual; switch
# to a client option file if a future image stops honouring it.
resource "kubernetes_cron_job_v1" "mysql-backup" {
  metadata {
    name      = "mysql-backup"
    namespace = kubernetes_namespace.dbaas.metadata[0].name
  }

  spec {
    concurrency_policy            = "Replace"
    failed_jobs_history_limit     = 5
    schedule                      = "0 0 * * *"
    # schedule                    = "* * * * *"
    starting_deadline_seconds     = 10
    successful_jobs_history_limit = 10

    job_template {
      metadata {}

      spec {
        backoff_limit              = 3
        ttl_seconds_after_finished = 10

        template {
          metadata {}

          spec {
            container {
              name = "mysql-backup"
              # NOTE(review): untagged image; consider pinning it to the
              # cluster's serverVersion so mysqldump matches the server.
              image = "mysql"

              # Read by mysqldump as the connection password.
              env {
                name = "MYSQL_PWD"
                value_from {
                  secret_key_ref {
                    name = kubernetes_secret.cluster-password.metadata[0].name
                    key  = "ROOT_PASSWORD"
                  }
                }
              }

              command = ["/bin/bash", "-c", <<-EOT
                set -euxo pipefail
                export now=$(date +"%Y_%m_%d_%H_%M")
                mysqldump --all-databases -u root --host ${var.cluster_master_service}.${kubernetes_namespace.dbaas.metadata[0].name}.svc.cluster.local > /backup/dump_$now.sql
                # Retention: delete dumps older than 14 days
                cd /backup
                find . -name "dump_*.sql" -type f -mtime +14 -delete
                echo Done
              EOT
              ]

              # To restore (from outside of the cluster):
              # run kubectl port-forward to pod e.g.:
              # > kb port-forward mysql-647cfd4969-46rmw --address 0.0.0.0 3307:3306
              # run mysql import (and specify non-localhost address to avoid using unix socket): (password is in tfvars)
              # > mysql -u root -p --host 10.0.10.10 --port 3307 < /mnt/nfs/2024_01_06_13_54.sql

              volume_mount {
                name       = "mysql-backup"
                mount_path = "/backup"
              }
            }

            volume {
              name = "mysql-backup"
              persistent_volume_claim {
                claim_name = module.nfs_mysql_backup.claim_name
              }
            }
          }
        }
      }
    }
  }
}
2023-04-24 02:59:24 +01:00
# resource "kubernetes_persistent_volume" "mysql" {
# metadata {
# name = "mysql-pv"
# }
# spec {
# capacity = {
# "storage" = "10Gi"
# }
# access_modes = ["ReadWriteOnce"]
# persistent_volume_source {
# iscsi {
# target_portal = "iscsi.viktorbarzin.lan:3260"
# iqn = "iqn.2020-12.lan.viktorbarzin:storage:dbaas:mysql"
# lun = 0
# fs_type = "ext4"
# }
# }
# }
# }
2022-12-27 17:56:39 +02:00
# resource "helm_release" "mysql" {
2025-12-29 10:23:42 +00:00
# namespace = kubernetes_namespace.dbaas.metadata[0].name
2022-12-27 17:56:39 +02:00
# create_namespace = false
# name = "mysql"
# repository = "https://presslabs.github.io/charts"
# chart = "mysql-operator"
# # version = "v0.5.0-rc.3"
# values = [templatefile("${path.module}/mysql_chart_values.yaml", { secretName = var.tls_secret_name })]
# atomic = true
# depends_on = [kubernetes_namespace.dbaas]
# }
2023-04-23 23:48:24 +01:00
# # resource "helm_release" "mysql" {
2025-12-29 10:23:42 +00:00
# # namespace = kubernetes_namespace.dbaas.metadata[0].name
2023-04-23 23:48:24 +01:00
# # create_namespace = false
# # name = "mysql-operator"
# # repository = "https://mysql.github.io/mysql-operator/"
# # chart = "mysql-operator"
# # atomic = true
# # depends_on = [kubernetes_namespace.dbaas]
# # }
# # resource "helm_release" "innodb-cluster" {
2025-12-29 10:23:42 +00:00
# # namespace = kubernetes_namespace.dbaas.metadata[0].name
2023-04-23 23:48:24 +01:00
# # create_namespace = false
# # name = var.cluster_master_service
# # repository = "https://mysql.github.io/mysql-operator/"
# # chart = "mysql-innodbcluster"
# # atomic = true
# # depends_on = [kubernetes_namespace.dbaas]
# # values = [templatefile("${path.module}/chart_values.tpl", { root_password = var.dbaas_root_password })]
# # }
# resource "kubernetes_persistent_volume" "mysql-operator" {
# metadata {
# name = "mysql-operator-pv"
# }
# spec {
# capacity = {
# "storage" = "1Gi"
# }
# access_modes = ["ReadWriteOnce"]
# persistent_volume_source {
# iscsi {
# target_portal = "iscsi.viktorbarzin.lan:3260"
# iqn = "iqn.2020-12.lan.viktorbarzin:storage:dbaas:operator"
# lun = 0
# fs_type = "ext4"
# }
# }
# }
# }
2021-04-17 19:19:04 +01:00
2023-11-24 12:55:55 +00:00
# Opaque secret "cluster-secret" holding the MySQL root password under the
# ROOT_PASSWORD key; read by the phpMyAdmin deployment.
resource "kubernetes_secret" "cluster-password" {
  metadata {
    name      = "cluster-secret"
    namespace = kubernetes_namespace.dbaas.metadata[0].name

    annotations = {
      # NOTE(review): presumably pairs with the "reloader.stakater.com/search"
      # annotation on the phpMyAdmin deployment so that it restarts when the
      # secret changes; confirm against the Reloader docs.
      "reloader.stakater.com/match" = "true"
    }
  }

  type = "Opaque"

  data = {
    "ROOT_PASSWORD" = var.dbaas_root_password
  }
}
2021-05-03 14:59:17 +01:00
2023-11-15 18:12:11 +00:00
# resource "kubernetes_ingress_v1" "dbaas" {
# metadata {
# name = "orchestrator-ingress"
2025-12-29 10:23:42 +00:00
# namespace = kubernetes_namespace.dbaas.metadata[0].name
2023-11-15 18:12:11 +00:00
# annotations = {
# "kubernetes.io/ingress.class" = "nginx"
# "nginx.ingress.kubernetes.io/auth-tls-verify-client" = "on"
# "nginx.ingress.kubernetes.io/auth-tls-secret" = "default/ca-secret"
# }
# }
2021-05-03 14:59:17 +01:00
2023-11-15 18:12:11 +00:00
# spec {
# tls {
# hosts = ["db.viktorbarzin.me"]
# secret_name = var.tls_secret_name
# }
# rule {
# host = "db.viktorbarzin.me"
# http {
# path {
# path = "/"
# backend {
# service {
# name = "mysql-mysql-operator"
# port {
# number = 80
# }
# }
# }
# }
# }
# }
# }
# }
2021-05-03 14:59:17 +01:00
# PHPMyAdmin instance
# Single-replica phpMyAdmin web UI pointed at the MySQL service on port 3306.
resource "kubernetes_deployment" "phpmyadmin" {
  metadata {
    name      = "phpmyadmin"
    namespace = kubernetes_namespace.dbaas.metadata[0].name

    labels = {
      "app" = "phpmyadmin"
      tier  = var.tier
    }

    annotations = {
      "reloader.stakater.com/search" = "true"
    }
  }

  spec {
    replicas = "1"

    selector {
      match_labels = {
        "app" = "phpmyadmin"
      }
    }

    template {
      metadata {
        labels = {
          "app" = "phpmyadmin"
        }
      }

      spec {
        container {
          name  = "phpmyadmin"
          image = "phpmyadmin/phpmyadmin:5.2.3"

          port {
            container_port = 80
          }

          # Database endpoint phpMyAdmin connects to.
          env {
            name  = "PMA_HOST"
            value = var.cluster_master_service
          }
          env {
            name  = "PMA_PORT"
            value = "3306"
          }

          # Reference the secret resource rather than repeating its name, so
          # Terraform creates the secret before this deployment.
          env {
            name = "MYSQL_ROOT_PASSWORD"
            value_from {
              secret_key_ref {
                name = kubernetes_secret.cluster-password.metadata[0].name
                key  = "ROOT_PASSWORD"
              }
            }
          }

          env {
            name  = "UPLOAD_LIMIT"
            value = "300M"
          }

          resources {
            requests = {
              cpu    = "15m"
              memory = "32Mi"
            }
            limits = {
              cpu    = "250m"
              memory = "256Mi"
            }
          }
        }

        # Lower ndots from the Kubernetes default to 2.
        dns_config {
          option {
            name  = "ndots"
            value = "2"
          }
        }
      }
    }
  }
}
# Service "pma" exposing port 80 of the pods labelled app=phpmyadmin.
resource "kubernetes_service" "phpmyadmin" {
  metadata {
    name      = "pma"
    namespace = kubernetes_namespace.dbaas.metadata[0].name
  }

  spec {
    selector = {
      "app" = "phpmyadmin"
    }

    port {
      name = "web"
      port = 80
    }
  }
}
2025-01-14 22:53:04 +00:00
# Ingress for phpMyAdmin, built by the shared ingress_factory module.
# `name` matches the "pma" service defined in this file.
# NOTE(review): `protected = true` presumably places the route behind the
# cluster's authentication layer; confirm in the ingress_factory module.
module "ingress" {
  source          = "../../../../modules/kubernetes/ingress_factory"
  namespace       = kubernetes_namespace.dbaas.metadata[0].name
  name            = "pma"
  tls_secret_name = var.tls_secret_name
  protected       = true

  extra_annotations = {}

  rybbit_site_id                 = "942c76b8bd4d"
  custom_content_security_policy = "script-src 'self' 'unsafe-inline' 'unsafe-eval' 'wasm-unsafe-eval' https://rybbit.viktorbarzin.me"
}
2021-05-03 14:59:17 +01:00
2021-05-08 13:43:09 +01:00
# resource "kubectl_manifest" "mysql-cluster" {
# yaml_body = <<-YAML
# apiVersion: mysql.presslabs.org/v1alpha1
# kind: MysqlCluster
# metadata:
# name: mysql-cluster
2025-12-29 10:23:42 +00:00
# namespace = kubernetes_namespace.dbaas.metadata[0].name
2021-05-08 13:43:09 +01:00
# spec:
# mysqlVersion: "5.7"
# replicas: 1
# secretName: cluster-secret
# mysqlConf:
# # read_only: 0 # mysql forms a single transaction for each sql statement, autocommit for each statement
# # automatic_sp_privileges: "ON" # automatically grants the EXECUTE and ALTER ROUTINE privileges to the creator of a stored routine
# # auto_generate_certs: "ON" # Auto Generation of Certificate
# # auto_increment_increment: 1 # Auto Incrementing value from +1
# # auto_increment_offset: 1 # Auto Increment Offset
# # binlog-format: "STATEMENT" # contains various options such ROW(SLOW,SAFE) STATEMENT(FAST,UNSAFE), MIXED(combination of both)
# # wait_timeout: 31536000 # 28800 number of seconds the server waits for activity on a non-interactive connection before closing it, You might encounter MySQL server has gone away error, you then tweak this value acccordingly
# # interactive_timeout: 28800 # The number of seconds the server waits for activity on an interactive connection before closing it.
# # max_allowed_packet: "512M" # Maximum size of MYSQL Network protocol packet that the server can create or read 4MB, 8MB, 16MB, 32MB
# # max-binlog-size: 1073741824 # binary logs contains the events that describe database changes, this parameter describe size for the bin_log file.
# # log_output: "TABLE" # Format in which the logout will be dumped
# # master-info-repository: "TABLE" # Format in which the master info will be dumped
# # relay_log_info_repository: "TABLE" # Format in which the relay info will be dumped
# volumeSpec:
# persistentVolumeClaim:
# accessModes:
# - ReadWriteOnce
# resources:
# requests:
# storage: 10Gi
# YAML
# depends_on = [helm_release.mysql]
# # manifest = {
# # apiVersion = "mysql.presslabs.org/v1alpha1"
# # kind = "MysqlCluster"
# # metadata = {
# # name = "mysql-cluster"
2025-12-29 10:23:42 +00:00
# # namespace = kubernetes_namespace.dbaas.metadata[0].name
2021-05-08 13:43:09 +01:00
# # }
# # spec = {
# # mysqlVersion = "5.7"
# # replicas = 1
# # secretName = "cluster-secret"
# # mysqlConf = {
# # read_only = 0
# # }
# # volumeSpec = {
# # persistentVolumeClaim = {
# # resources = {
# # requests = {
# # storage = "10Gi"
# # }
# # }
# # }
# # }
# # }
# # }
# }
2021-04-17 19:19:04 +01:00
2021-05-05 19:17:56 +01:00
# For some unknown reason not all CRDs are installed. Add them manually.
2021-05-08 13:43:09 +01:00
# resource "kubectl_manifest" "mysql-user" {
# yaml_body = <<-EOF
# apiVersion: apiextensions.k8s.io/v1
# kind: CustomResourceDefinition
# metadata:
# annotations:
# controller-gen.kubebuilder.io/version: v0.5.0
# helm.sh/hook: crd-install
# name: mysqlusers.mysql.presslabs.org
# labels:
# app: mysql-operator
# spec:
# group: mysql.presslabs.org
# names:
# kind: MysqlUser
# listKind: MysqlUserList
# plural: mysqlusers
# singular: mysqluser
2025-12-29 10:23:42 +00:00
# scope:namespace = kubernetes_namespace.dbaas.metadata[0].name
2021-05-08 13:43:09 +01:00
# versions:
# - additionalPrinterColumns:
# - description: The user status
# jsonPath: .status.conditions[?(@.type == 'Ready')].status
# name: Ready
# type: string
# - jsonPath: .spec.clusterRef.name
# name: Cluster
# type: string
# - jsonPath: .spec.user
# name: UserName
# type: string
# - jsonPath: .metadata.creationTimestamp
# name: Age
# type: date
# name: v1alpha1
# schema:
# openAPIV3Schema:
# description: MysqlUser is the Schema for the MySQL User API
# properties:
# apiVersion:
# description: 'APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources'
# type: string
# kind:
# description: 'Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds'
# type: string
# metadata:
# type: object
# spec:
# description: MysqlUserSpec defines the desired state of MysqlUserSpec
# properties:
# allowedHosts:
# description: AllowedHosts is the allowed host to connect from.
# items:
# type: string
# type: array
# clusterRef:
# description: ClusterRef represents a reference to the MySQL cluster. This field should be immutable.
# properties:
# name:
# description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?'
# type: string
2025-12-29 10:23:42 +00:00
# namespace = kubernetes_namespace.dbaas.metadata[0].name
# description:namespace = kubernetes_namespace.dbaas.metadata[0].name
2021-05-08 13:43:09 +01:00
# type: string
# type: object
# password:
# description: Password is the password for the user.
# properties:
# key:
# description: The key of the secret to select from. Must be a valid secret key.
# type: string
# name:
# description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?'
# type: string
# optional:
# description: Specify whether the Secret or its key must be defined
# type: boolean
# required:
# - key
# type: object
# permissions:
# description: Permissions is the list of roles that user has in the specified database.
# items:
# description: MysqlPermission defines a MySQL schema permission
# properties:
# permissions:
# description: Permissions represents the permissions granted on the schema/tables
# items:
# type: string
# type: array
# schema:
# description: Schema represents the schema to which the permission applies
# type: string
# tables:
# description: Tables represents the tables inside the schema to which the permission applies
# items:
# type: string
# type: array
# required:
# - permissions
# - schema
# - tables
# type: object
# type: array
# resourceLimits:
# additionalProperties:
# anyOf:
# - type: integer
# - type: string
# pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
# x-kubernetes-int-or-string: true
# description: 'ResourceLimits allow settings limit per mysql user as defined here: https://dev.mysql.com/doc/refman/5.7/en/user-resources.html'
# type: object
# user:
# description: User is the name of the user that will be created with will access the specified database. This field should be immutable.
# type: string
# required:
# - allowedHosts
# - clusterRef
# - password
# - user
# type: object
# status:
# description: MysqlUserStatus defines the observed state of MysqlUser
# properties:
# allowedHosts:
# description: AllowedHosts contains the list of hosts that the user is allowed to connect from.
# items:
# type: string
# type: array
# conditions:
# description: Conditions represents the MysqlUser resource conditions list.
# items:
# description: MySQLUserCondition defines the condition struct for a MysqlUser resource
# properties:
# lastTransitionTime:
# description: Last time the condition transitioned from one status to another.
# format: date-time
# type: string
# lastUpdateTime:
# description: The last time this condition was updated.
# format: date-time
# type: string
# message:
# description: A human readable message indicating details about the transition.
# type: string
# reason:
# description: The reason for the condition's last transition.
# type: string
# status:
# description: Status of the condition, one of True, False, Unknown.
# type: string
# type:
# description: Type of MysqlUser condition.
# type: string
# required:
# - lastTransitionTime
# - message
# - reason
# - status
# - type
# type: object
# type: array
# type: object
# type: object
# served: true
# storage: true
# subresources:
# status: {}
# EOF
# }
2023-11-24 17:38:49 +00:00
2026-02-28 19:23:36 +00:00
#### POSTGRESQL — CloudNativePG Cluster
#
# Migrated from single NFS-backed pod to CNPG on local-path storage.
# CNPG Cluster is managed via kubectl apply (not kubernetes_manifest)
# because the CNPG webhook mutates the spec on apply, causing
# Terraform provider "inconsistent result" errors.
#
# Rollback: apply old deployment yaml, revert service selector to app=postgresql.
# Ensure the CNPG cluster manifest exists (idempotent kubectl apply)
# Applies the CloudNativePG `Cluster` manifest for pg-cluster with kubectl.
#
# Every tunable of the manifest lives in `triggers` and is read back inside the
# provisioner through `self.triggers`, so each value is written exactly once.
# Changing any trigger value replaces this resource, which re-runs the
# (idempotent) `kubectl apply`. Previously the values were duplicated between
# `triggers` and the manifest, and the CPU/memory requests were not tracked at
# all, so editing them never re-applied the manifest.
resource "null_resource" "pg_cluster" {
  triggers = {
    instances      = "2"
    image          = "ghcr.io/cloudnative-pg/postgis:16"
    storage_size   = "20Gi"
    storage_class  = "iscsi-truenas"
    cpu_request    = "250m"
    memory_request = "512Mi"
    cpu_limit      = "2"
    memory_limit   = "4Gi"
  }

  provisioner "local-exec" {
    # The inner heredoc is quoted ('EOF') so the shell leaves `$user` in
    # search_path untouched; Terraform only interpolates the `$${...}` forms.
    command = <<-EOT
      kubectl --kubeconfig ${var.kube_config_path} apply -f - <<'EOF'
      apiVersion: postgresql.cnpg.io/v1
      kind: Cluster
      metadata:
        name: pg-cluster
        namespace: dbaas
      spec:
        instances: ${self.triggers.instances}
        imageName: ${self.triggers.image}
        postgresql:
          parameters:
            search_path: '"$user", public'
          enableAlterSystem: true
        enableSuperuserAccess: true
        storage:
          size: ${self.triggers.storage_size}
          storageClass: ${self.triggers.storage_class}
        resources:
          requests:
            cpu: "${self.triggers.cpu_request}"
            memory: "${self.triggers.memory_request}"
          limits:
            cpu: "${self.triggers.cpu_limit}"
            memory: "${self.triggers.memory_limit}"
      EOF
    EOT
  }
}
2026-02-28 19:23:36 +00:00
# Service that maintains the original postgresql.dbaas endpoint,
# now pointing at the CNPG primary pod instead of the old deployment.
2023-11-24 17:38:49 +00:00
resource "kubernetes_service" "postgresql" {
  metadata {
    namespace = kubernetes_namespace.dbaas.metadata[0].name
    name      = "postgresql"
  }

  spec {
    # Expose PostgreSQL on its standard port, forwarded unchanged to the pod.
    port {
      name        = "postgresql"
      port        = 5432
      target_port = 5432
    }

    # Route only to pods of pg-cluster whose instance role label is "primary".
    selector = {
      "cnpg.io/cluster"      = "pg-cluster"
      "cnpg.io/instanceRole" = "primary"
    }
  }
}
2026-02-28 19:23:36 +00:00
# Old PostgreSQL deployment — kept commented for rollback reference
# resource "kubernetes_deployment" "postgres" {
# metadata {
# name = "postgresql"
# namespace = kubernetes_namespace.dbaas.metadata[0].name
# labels = { tier = var.tier }
# }
# spec {
# replicas = 0 # scaled to 0 during CNPG migration
# selector { match_labels = { app = "postgresql" } }
# strategy { type = "Recreate" }
# template {
# metadata { labels = { app = "postgresql" } }
# spec {
# container {
# image = "viktorbarzin/postgres:16-master"
# name = "postgresql"
# env { name = "POSTGRES_PASSWORD"; value = var.postgresql_root_password }
# env { name = "POSTGRES_USER"; value = "root" }
# port { container_port = 5432; protocol = "TCP"; name = "postgresql" }
# volume_mount { name = "postgresql-persistent-storage"; mount_path = "/var/lib/postgresql/data" }
# }
# volume {
# name = "postgresql-persistent-storage"
# nfs { path = "/mnt/main/postgresql/data"; server = var.nfs_server }
# }
# }
# }
# }
# }
2023-11-24 17:38:49 +00:00
#### PGADMIN
# pgAdmin web UI for the dbaas namespace, with its state directory backed by
# the PVC exposed by module.nfs_pgadmin.
resource "kubernetes_deployment" "pgadmin" {
  metadata {
    name      = "pgadmin"
    namespace = kubernetes_namespace.dbaas.metadata[0].name

    labels = {
      tier = var.tier
    }

    annotations = {
      "reloader.stakater.com/search" = "true"
    }
  }

  spec {
    selector {
      match_labels = {
        app = "pgadmin"
      }
    }

    template {
      metadata {
        labels = {
          app = "pgadmin"
        }
      }

      spec {
        container {
          name = "pgadmin"
          # NOTE(review): image is untagged, so the registry default tag is pulled.
          image = "dpage/pgadmin4"

          # Bootstrap login for the pgAdmin UI.
          env {
            name  = "PGADMIN_DEFAULT_EMAIL"
            value = "me@viktorbarzin.me"
          }
          env {
            name = "PGADMIN_DEFAULT_PASSWORD"
            # Changed at startup
            value = var.pgadmin_password
          }

          port {
            name           = "web"
            container_port = 80
          }

          resources {
            requests = {
              cpu    = "25m"
              memory = "128Mi"
            }
            limits = {
              cpu    = "500m"
              memory = "512Mi"
            }
          }

          volume_mount {
            name       = "pgadmin"
            mount_path = "/var/lib/pgadmin/"
          }
        }

        volume {
          name = "pgadmin"
          # config_map {
          #   name = "pgadmin-config"
          # }
          persistent_volume_claim {
            claim_name = module.nfs_pgadmin.claim_name
          }
        }

        # Resolver option ndots=2 for this pod.
        dns_config {
          option {
            name  = "ndots"
            value = "2"
          }
        }
      }
    }
  }
}
# ClusterIP-style service in front of the pgadmin pods (selected by app=pgadmin).
resource "kubernetes_service" "pgadmin" {
  metadata {
    namespace = kubernetes_namespace.dbaas.metadata[0].name
    name      = "pgadmin"
  }

  spec {
    port {
      name = "pgadmin"
      port = 80
    }

    selector = {
      "app" = "pgadmin"
    }
  }
}
2025-01-14 22:53:04 +00:00
# Ingress for pgAdmin, built by the shared ingress_factory module.
module "ingress-pgadmin" {
  source = "../../../../modules/kubernetes/ingress_factory"

  name            = "pgadmin"
  namespace       = kubernetes_namespace.dbaas.metadata[0].name
  tls_secret_name = var.tls_secret_name

  # Passed through to ingress_factory; see that module for their exact effect.
  protected      = true
  rybbit_site_id = "7cef78e30485"
}
2024-11-23 12:40:03 +00:00
2025-01-14 22:53:04 +00:00
2024-11-23 12:40:03 +00:00
# Nightly logical backup of the whole PostgreSQL cluster.
#
# Runs `pg_dumpall` against the postgresql.dbaas service as user `postgres`,
# writes dump_<timestamp>.sql into the backup PVC and deletes dumps older than
# seven days.
resource "kubernetes_cron_job_v1" "postgresql-backup" {
  metadata {
    name      = "postgresql-backup"
    namespace = kubernetes_namespace.dbaas.metadata[0].name
  }

  spec {
    concurrency_policy        = "Replace"
    failed_jobs_history_limit = 5
    schedule                  = "0 0 * * *"
    # schedule = "* * * * *"
    starting_deadline_seconds     = 10
    successful_jobs_history_limit = 10

    job_template {
      metadata {}
      spec {
        backoff_limit              = 3
        ttl_seconds_after_finished = 10

        template {
          metadata {}
          spec {
            container {
              name  = "postgresql-backup"
              image = "postgres:16.4-bullseye"

              # Superuser password from the pg-cluster-superuser secret.
              # pg_dumpall picks PGPASSWORD up from the environment on its own.
              env {
                name = "PGPASSWORD"
                value_from {
                  secret_key_ref {
                    name = "pg-cluster-superuser"
                    key  = "password"
                  }
                }
              }

              command = ["/bin/bash", "-c", <<-EOT
                set -euxo pipefail
                export now=$(date +"%Y_%m_%d_%H_%M")
                target=/backup/dump_$now.sql
                # Dump under a temporary name and rename on success, so a failed
                # run never leaves a truncated file that looks like a valid backup.
                trap 'rm -f "$target.partial"' EXIT
                # PGPASSWORD is deliberately NOT repeated on this command line:
                # with `set -x` the assignment would be traced into the pod logs.
                pg_dumpall -h postgresql.dbaas -U postgres > "$target.partial"
                mv "$target.partial" "$target"
                # Rotate - delete dumps older than the retention window
                cd /backup
                find . -name "dump_*.sql" -type f -mtime +7 -delete # 7 day retention of backups
                echo Done
              EOT
              ]

              volume_mount {
                name       = "postgresql-backup"
                mount_path = "/backup"
              }
            }

            volume {
              name = "postgresql-backup"
              persistent_volume_claim {
                claim_name = module.nfs_postgresql_backup.claim_name
              }
            }
          }
        }
      }
    }
  }
}