diff --git a/stacks/authentik/modules/authentik/pgbouncer.tf b/stacks/authentik/modules/authentik/pgbouncer.tf index 427d643d..bea34a4d 100644 --- a/stacks/authentik/modules/authentik/pgbouncer.tf +++ b/stacks/authentik/modules/authentik/pgbouncer.tf @@ -74,6 +74,36 @@ resource "kubernetes_deployment" "pgbouncer" { container_port = 6432 } + resources { + requests = { + cpu = "50m" + memory = "128Mi" + } + limits = { + memory = "512Mi" + } + } + + readiness_probe { + tcp_socket { + port = 6432 + } + initial_delay_seconds = 5 + period_seconds = 10 + timeout_seconds = 3 + failure_threshold = 3 + } + + liveness_probe { + tcp_socket { + port = 6432 + } + initial_delay_seconds = 30 + period_seconds = 30 + timeout_seconds = 5 + failure_threshold = 3 + } + volume_mount { name = "config" mount_path = "/etc/pgbouncer/pgbouncer.ini" @@ -121,6 +151,25 @@ resource "kubernetes_deployment" "pgbouncer" { } } +# --- 3b️⃣ PodDisruptionBudget --- +# Protects auth against simultaneous node drains. With 3 replicas and +# minAvailable=2, a single drain rolls cleanly; a simultaneous two-node +# drain is blocked. A PDB cannot prevent involuntary node outages. +resource "kubernetes_pod_disruption_budget_v1" "pgbouncer" { + metadata { + name = "pgbouncer" + namespace = "authentik" + } + spec { + min_available = 2 + selector { + match_labels = { + app = "pgbouncer" + } + } + } +} + # --- 4️⃣ Service --- resource "kubernetes_service" "pgbouncer" { metadata { diff --git a/stacks/authentik/modules/authentik/values.yaml b/stacks/authentik/modules/authentik/values.yaml index 2827a5a2..e8c7d5ea 100644 --- a/stacks/authentik/modules/authentik/values.yaml +++ b/stacks/authentik/modules/authentik/values.yaml @@ -14,9 +14,29 @@ authentik: port: 6432 user: authentik password: "" + # Persistent client-side connections (safe with PgBouncer session mode; + # must be < pgbouncer server_idle_timeout=600s). Cuts Django connection + # setup overhead off the ~70 sequential ORM ops per flow stage. 
+ conn_max_age: 60 + conn_health_checks: true + cache: + # Cache flow plans for 30m and policy evaluations for 15m. Authentik 2026.2 + # moved cache storage from Redis to Postgres, so a TTL hit is still a + # SELECT — but a single indexed lookup beats re-evaluating PolicyBindings. + timeout_flows: 1800 + timeout_policies: 900 + web: + # Gunicorn: 3 workers × 4 threads per server pod (default 2×4). + # Pairs with the server memory bump to 2Gi (each worker preloads Django ~500Mi). + workers: 3 + threads: 4 + worker: + # Celery-equivalent worker threads per pod (default 2, renamed from + # AUTHENTIK_WORKER__CONCURRENCY in 2025.8). + threads: 4 server: - replicas: 2 + replicas: 3 strategy: type: RollingUpdate rollingUpdate: @@ -27,7 +47,7 @@ server: cpu: 100m memory: 1.5Gi limits: - memory: 1.5Gi + memory: 2Gi topologySpreadConstraints: - maxSkew: 1 topologyKey: kubernetes.io/hostname @@ -44,12 +64,12 @@ server: diun.include_tags: "^202[0-9].[0-9]+.*$" # no need to annotate the worker as it uses the same image pdb: enabled: true - minAvailable: 1 + minAvailable: 2 global: addPrometheusAnnotations: true worker: - replicas: 2 + replicas: 3 strategy: type: RollingUpdate rollingUpdate: @@ -60,7 +80,7 @@ worker: cpu: 100m memory: 1.5Gi limits: - memory: 1.5Gi + memory: 2Gi topologySpreadConstraints: - maxSkew: 1 topologyKey: kubernetes.io/hostname