authentik: fix episodic blank-screen + 30s-hang login (reliability R2)

The login screen would sometimes hang/blank for everyone for ~30s at a time. Root-caused: the readiness probe (/-/health/ready/) queries the DB, and on a transient PG/pgbouncer blip it 503s; with the chart-default ~30s tolerance all 3 goauthentik-server pods dropped out of the Service at once, so Traefik had no healthy backend -> 502/503/504. Compounded by a silent drift: the repo set the rollout strategy under `strategy:`, but the chart reads `deploymentStrategy:` — so live ran the chart-default 25%/25% and dropped a pod out of rotation on every roll. (Redis was removed upstream in authentik 2026.2, so sessions+cache are on PostgreSQL and request-serving is coupled to PG — verified there is no external-cache option to put back, so a SHORT transient is now survived but a total CNPG outage still takes authentik down.) Reliability package (R2, approved): - readinessProbe.failureThreshold 3->8 (~80s) — absorbs a full CNPG failover reconnect without dropping the whole fleet from the Service. - rename server+worker `strategy:` -> `deploymentStrategy:` (the real chart key); the server is set to maxSurge:1/maxUnavailable:0 so a roll never dips below 3 ready, while the worker keeps maxSurge:0/maxUnavailable:1 (it serves no user traffic). - gunicorn AUTHENTIK_WEB__MAX_REQUESTS 1000->10000 / JITTER 50->1000 so the 9 workers' recycles don't cluster on a DB blip. - / and /static ingresses switch to the dedicated authentik-rate-limit (100/1000) from the previous commit (skip_default_rate_limit) — fixes the cold-load 429 blank screen. Liveness intentionally left shallow, not DB-coupled (LiveView always returns 200, so it can't kill a DB-blocked pod). CONN_MAX_AGE intentionally NOT set (pins the pgbouncer pool, reverted 2026-06-10). Docs: .claude/CLAUDE.md + authentication.md (also corrected a stale "60s persistent DB connections" note). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-28 09:17:05 +00:00 · 2026-06-28 09:17:05 +00:00 · 385dfff0e7
commit 385dfff0e7
parent b84b0021c2
4 changed files with 77 additions and 16 deletions
--- a/stacks/authentik/modules/authentik/main.tf
+++ b/stacks/authentik/modules/authentik/main.tf
@ -82,6 +82,11 @@ module "ingress" {
  service_name     = "goauthentik-server"
  tls_secret_name  = var.tls_secret_name
  anti_ai_scraping = false
+  # Swap the shared 10/50 default limiter for a dedicated 100/1000 carve-out:
+  # the login SPA + flow-executor API burst on a cold load otherwise 429s into
+  # a blank screen (see traefik middleware "authentik-rate-limit").
+  skip_default_rate_limit = true
+  extra_middlewares       = ["traefik-authentik-rate-limit@kubernetescrd"]
  extra_annotations = {
    "gethomepage.dev/enabled"      = "true"
    "gethomepage.dev/name"         = "Authentik"
@ -140,14 +145,21 @@ module "ingress-static" {
  # Same-host path carve-out of the public authentik UI ingress above, only
  # adding the cache-headers middleware for the static asset prefix.
  # auth = "none": versioned static assets of the (already public) Authentik login UI.
-  auth              = "none"
-  namespace         = kubernetes_namespace.authentik.metadata[0].name
-  name              = "authentik-static"
-  host              = "authentik"
-  service_name      = "goauthentik-server"
-  ingress_path      = ["/static"]
-  tls_secret_name   = var.tls_secret_name
-  anti_ai_scraping  = false
-  homepage_enabled  = false
-  extra_middlewares = ["authentik-static-cache-headers@kubernetescrd"]
+  auth             = "none"
+  namespace        = kubernetes_namespace.authentik.metadata[0].name
+  name             = "authentik-static"
+  host             = "authentik"
+  service_name     = "goauthentik-server"
+  ingress_path     = ["/static"]
+  tls_secret_name  = var.tls_secret_name
+  anti_ai_scraping = false
+  homepage_enabled = false
+  # /static serves ALL the SPA JS/CSS chunks; the default 10/50 limiter 429s the
+  # cold-load fan-out → blank screen. Dedicated 100/1000 carve-out (note the two
+  # namespaces: cache-headers is in ns authentik, rate-limit is in ns traefik).
+  skip_default_rate_limit = true
+  extra_middlewares = [
+    "authentik-static-cache-headers@kubernetescrd",
+    "traefik-authentik-rate-limit@kubernetescrd",
+  ]
 }
--- a/stacks/authentik/modules/authentik/values.yaml
+++ b/stacks/authentik/modules/authentik/values.yaml
@ -39,6 +39,16 @@ server:
      value: "3"
    - name: AUTHENTIK_WEB__THREADS
      value: "4"
+    # Gunicorn worker recycle hardening (defaults max_requests=1000/jitter=50).
+    # A worker recycle that coincides with a transient PG/pgbouncer blip stalls
+    # in-flight requests (sessions+cache are on PostgreSQL since Redis was removed
+    # in 2026.2), and with 9 workers recycling on a tight 50-jitter window the
+    # recycles cluster — feeding the episodic all-pods-NotReady 502/504 cascade.
+    # 10x rarer recycles + 20x wider jitter (1000) decorrelate them from DB blips.
+    - name: AUTHENTIK_WEB__MAX_REQUESTS
+      value: "10000"
+    - name: AUTHENTIK_WEB__MAX_REQUESTS_JITTER
+      value: "1000"
    # Cache flow plans for 30m and policy evaluations for 15m (defaults 300s).
    # Authentik 2026.2 stores cache in Postgres, so a TTL hit is still a
    # SELECT — but a single indexed lookup beats re-planning the flow
@ -87,11 +97,28 @@ server:
  livenessProbe:
    failureThreshold: 6
    timeoutSeconds: 5
-  strategy:
+  # Readiness widened from the chart default (3x10s/3s ~= 30s) to ~80s. The
+  # readiness probe (/-/health/ready/) queries the DB, so a sub-~60s PG/pgbouncer
+  # transient otherwise returns 503 and drops ALL 3 server pods from the Service
+  # at once -> Traefik has no healthy backend -> 502/504 (the episodic blank
+  # screen + 30s hang). 80s absorbs a full CNPG failover reconnect; liveness
+  # still reaps a truly hung pod. Partial override: Helm deep-merges these keys
+  # into the chart probe, so httpGet /-/health/ready/ stays (as for livenessProbe).
+  readinessProbe:
+    failureThreshold: 8
+    periodSeconds: 10
+    timeoutSeconds: 5
+  # RollingUpdate strategy. The chart key is `deploymentStrategy`, NOT `strategy`
+  # (authentik.server reads .Values.server.deploymentStrategy) — the old
+  # `strategy:` key was silently ignored, so live ran the chart default 25%/25%
+  # and every rolling event dropped a server pod out of rotation, amplifying the
+  # NotReady cascade. maxSurge:1 + maxUnavailable:0 keeps all 3 ready throughout
+  # a roll (ResourceQuota headroom fits the surge pod; PDB minAvailable:2 holds).
+  deploymentStrategy:
    type: RollingUpdate
    rollingUpdate:
-      maxSurge: 0
-      maxUnavailable: 1
+      maxSurge: 1
+      maxUnavailable: 0
  resources:
    requests:
      cpu: 100m
@ -166,7 +193,10 @@ worker:
        secretKeyRef:
          name: authentik-email
          key: AUTHENTIK_EMAIL__PASSWORD
-  strategy:
+  # Chart key is `deploymentStrategy`, not `strategy` (see server above). Workers
+  # serve no user traffic, so maxSurge:0/maxUnavailable:1 is fine — this is just
+  # the dead-key cleanup so the declared intent actually takes effect.
+  deploymentStrategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 0