authentik: fix episodic blank-screen + 30s-hang login (reliability R2)
The login screen would sometimes hang/blank for everyone for ~30s at a time.

Root cause: the readiness probe (/-/health/ready/) queries the DB, and on a transient PG/pgbouncer blip it returns 503; with the chart-default ~30s tolerance, all 3 goauthentik-server pods dropped out of the Service at once, so Traefik had no healthy backend -> 502/503/504. This was compounded by a silent drift: the repo set the rollout strategy under `strategy:`, but the chart reads `deploymentStrategy:` — so live ran the chart-default 25%/25% and dropped a pod out of rotation on every roll.

(Redis was removed upstream in authentik 2026.2, so sessions+cache are on PostgreSQL and request-serving is coupled to PG. Verified there is no external-cache option to put back, so a SHORT transient is now survived, but a total CNPG outage still takes authentik down.)

Reliability package (R2, approved):
- readinessProbe.failureThreshold 3->8 (~80s) — absorbs a full CNPG failover reconnect without dropping the whole fleet from the Service.
- Rename server+worker `strategy:` -> `deploymentStrategy:` (the real chart key). The server is set to maxSurge:1/maxUnavailable:0 so a roll never dips below 3 ready; the worker keeps maxSurge:0/maxUnavailable:1 because it serves no user traffic.
- gunicorn AUTHENTIK_WEB__MAX_REQUESTS 1000->10000 / JITTER 50->1000 so the 9 workers' recycles don't cluster on a DB blip.
- The / and /static ingresses switch to the dedicated authentik-rate-limit (100/1000) from the previous commit (skip_default_rate_limit) — fixes the cold-load 429 blank screen.

Liveness intentionally left DB-coupled-but-shallow (LiveView always returns 200, so it can't kill a DB-blocked pod). CONN_MAX_AGE intentionally NOT set (pins the pgbouncer pool, reverted 2026-06-10).

Docs: .claude/CLAUDE.md + authentication.md (also corrected a stale "60s persistent DB connections" note).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
b84b0021c2
commit
385dfff0e7
4 changed files with 77 additions and 16 deletions
|
|
@ -82,6 +82,11 @@ module "ingress" {
|
|||
service_name = "goauthentik-server"
|
||||
tls_secret_name = var.tls_secret_name
|
||||
anti_ai_scraping = false
|
||||
# Swap the shared 10/50 default limiter for a dedicated 100/1000 carve-out:
|
||||
# the login SPA + flow-executor API burst on a cold load otherwise 429s into
|
||||
# a blank screen (see traefik middleware "authentik-rate-limit").
|
||||
skip_default_rate_limit = true
|
||||
extra_middlewares = ["traefik-authentik-rate-limit@kubernetescrd"]
|
||||
extra_annotations = {
|
||||
"gethomepage.dev/enabled" = "true"
|
||||
"gethomepage.dev/name" = "Authentik"
|
||||
|
|
@ -140,14 +145,21 @@ module "ingress-static" {
|
|||
# Same-host path carve-out of the public authentik UI ingress above, only
|
||||
# adding the cache-headers middleware for the static asset prefix.
|
||||
# auth = "none": versioned static assets of the (already public) Authentik login UI.
|
||||
auth = "none"
|
||||
namespace = kubernetes_namespace.authentik.metadata[0].name
|
||||
name = "authentik-static"
|
||||
host = "authentik"
|
||||
service_name = "goauthentik-server"
|
||||
ingress_path = ["/static"]
|
||||
tls_secret_name = var.tls_secret_name
|
||||
anti_ai_scraping = false
|
||||
homepage_enabled = false
|
||||
extra_middlewares = ["authentik-static-cache-headers@kubernetescrd"]
|
||||
auth = "none"
|
||||
namespace = kubernetes_namespace.authentik.metadata[0].name
|
||||
name = "authentik-static"
|
||||
host = "authentik"
|
||||
service_name = "goauthentik-server"
|
||||
ingress_path = ["/static"]
|
||||
tls_secret_name = var.tls_secret_name
|
||||
anti_ai_scraping = false
|
||||
homepage_enabled = false
|
||||
# /static serves ALL the SPA JS/CSS chunks; the default 10/50 limiter 429s the
|
||||
# cold-load fan-out → blank screen. Dedicated 100/1000 carve-out (note the two
|
||||
# namespaces: cache-headers is in ns authentik, rate-limit is in ns traefik).
|
||||
skip_default_rate_limit = true
|
||||
extra_middlewares = [
|
||||
"authentik-static-cache-headers@kubernetescrd",
|
||||
"traefik-authentik-rate-limit@kubernetescrd",
|
||||
]
|
||||
}
|
||||
|
|
|
|||
|
|
@ -39,6 +39,16 @@ server:
|
|||
value: "3"
|
||||
- name: AUTHENTIK_WEB__THREADS
|
||||
value: "4"
|
||||
# Gunicorn worker recycle hardening (defaults max_requests=1000/jitter=50).
|
||||
# A worker recycle that coincides with a transient PG/pgbouncer blip stalls
|
||||
# in-flight requests (sessions+cache are on PostgreSQL since Redis was removed
|
||||
# in 2026.2), and with 9 workers recycling on a tight 50-jitter window the
|
||||
# recycles cluster — feeding the episodic all-pods-NotReady 502/504 cascade.
|
||||
# 10x rarer recycles + 20x wider jitter (1000) decorrelate them from DB blips.
|
||||
- name: AUTHENTIK_WEB__MAX_REQUESTS
|
||||
value: "10000"
|
||||
- name: AUTHENTIK_WEB__MAX_REQUESTS_JITTER
|
||||
value: "1000"
|
||||
# Cache flow plans for 30m and policy evaluations for 15m (defaults 300s).
|
||||
# Authentik 2026.2 stores cache in Postgres, so a TTL hit is still a
|
||||
# SELECT — but a single indexed lookup beats re-planning the flow
|
||||
|
|
@ -87,11 +97,28 @@ server:
|
|||
livenessProbe:
|
||||
failureThreshold: 6
|
||||
timeoutSeconds: 5
|
||||
strategy:
|
||||
# Readiness widened from the chart default (3x10s/3s ~= 30s) to ~80s. The
|
||||
# readiness probe (/-/health/ready/) queries the DB, so a sub-~60s PG/pgbouncer
|
||||
# transient otherwise returns 503 and drops ALL 3 server pods from the Service
|
||||
# at once -> Traefik has no healthy backend -> 502/504 (the episodic blank
|
||||
# screen + 30s hang). 80s absorbs a full CNPG failover reconnect; liveness
|
||||
# still reaps a truly hung pod. Partial override — the chart deep-merges the
|
||||
# httpGet path /-/health/ready/ (same as the livenessProbe override above).
|
||||
readinessProbe:
|
||||
failureThreshold: 8
|
||||
periodSeconds: 10
|
||||
timeoutSeconds: 5
|
||||
# RollingUpdate strategy. The chart key is `deploymentStrategy`, NOT `strategy`
|
||||
# (authentik.server reads .Values.server.deploymentStrategy) — the old
|
||||
# `strategy:` key was silently ignored, so live ran the chart default 25%/25%
|
||||
# and every rolling event dropped a server pod out of rotation, amplifying the
|
||||
# NotReady cascade. maxSurge:1 + maxUnavailable:0 keeps all 3 ready throughout
|
||||
# a roll (PDB minAvailable:2 + ResourceQuota headroom allow the transient pod).
|
||||
deploymentStrategy:
|
||||
type: RollingUpdate
|
||||
rollingUpdate:
|
||||
maxSurge: 0
|
||||
maxUnavailable: 1
|
||||
maxSurge: 1
|
||||
maxUnavailable: 0
|
||||
resources:
|
||||
requests:
|
||||
cpu: 100m
|
||||
|
|
@ -166,7 +193,10 @@ worker:
|
|||
secretKeyRef:
|
||||
name: authentik-email
|
||||
key: AUTHENTIK_EMAIL__PASSWORD
|
||||
strategy:
|
||||
# Chart key is `deploymentStrategy`, not `strategy` (see server above). Workers
|
||||
# serve no user traffic, so maxSurge:0/maxUnavailable:1 is fine — this is just
|
||||
# the dead-key cleanup so the declared intent actually takes effect.
|
||||
deploymentStrategy:
|
||||
type: RollingUpdate
|
||||
rollingUpdate:
|
||||
maxSurge: 0
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue