The first apply of the signin-speedup change triggered a ~50min authentik outage (and a shared CNPG primary failover): the helm chart pin (2026.2.2) silently DOWNGRADED the Keel-managed live image (2026.2.4) against an already-migrated DB, default liveness probes kill-looped pods queuing on authentik's migration advisory lock, and kills mid-migration left ghost idle-in-transaction sessions holding that lock.

Full analysis in docs/post-mortems/2026-06-10-authentik-downgrade-boot-storm.md.

Hardening (all root causes):
- values.yaml: pin global.image.tag to the Keel-managed live tag (2026.2.4) so helm applies can never downgrade under Keel again
- values.yaml: server livenessProbe 6x10s/5s (was chart-default 3x10s/3s)
- values.yaml: REMOVE AUTHENTIK_POSTGRESQL__CONN_MAX_AGE (session-mode pgbouncer pins persistent conns 1:1 -> pool saturation, 58s/s waits)
- pgbouncer.ini: idle_transaction_timeout=300 reaps ghost lock holders; pgbouncer.tf gets a config-checksum annotation so ini changes roll pods
- authentik_provider.tf: drop the completed import stanza (adoption rule)
- traefik: suppress pre-existing keel.sh annotation/tier-label drift on auth-proxy/bot-block/x402/error-pages deployments (KEEL_LIFECYCLE_V1 pattern) so applies stop stripping live Keel state

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
147 lines
5.5 KiB
YAML
147 lines
5.5 KiB
YAML
authentik:
  # CAUTION: existingSecret is set further down, so the chart does NOT render
  # these authentik.* values into an AUTHENTIK_* env Secret. The live env is
  # read from the orphaned `goauthentik` Secret (helm keep-policy, created by
  # chart 2025.10.3) instead, which leaves everything under authentik.*
  # effectively INERT. New or tuned settings MUST be injected through
  # server.env / worker.env — see .claude/reference/authentik-state.md.
  log_level: warning
  # log_level: trace
  secret_key: ""
  existingSecret:
    secretName: "goauthentik"
  # Error reporting ships anonymous usage data, stack traces on errors and
  # performance data to authentik.error-reporting.a7k.io. It is fully opt-in
  # and stays disabled here.
  error_reporting:
    enabled: false
  postgresql:
    # Targets the pgbouncer.authentik service (port 6432); the
    # postgresql.dbaas host is left commented out.
    # host: postgresql.dbaas
    host: pgbouncer.authentik
    port: 6432
    user: authentik
    password: ""

server:
  replicas: 3
  env:
    # Sessions that never complete a login (bots, healthcheckers, partial
    # flows) are anonymous Django sessions; expire them after 2h instead of
    # the default days=1. Once login completes,
    # UserLoginStage.session_duration takes over via
    # request.session.set_expiry.
    # Set through server.env (not authentik.sessions.*) because
    # authentik.existingSecret.secretName makes the chart skip rendering the
    # AUTHENTIK_* secret, so the values block never reaches the env.
    - name: AUTHENTIK_SESSIONS__UNAUTHENTICATED_AGE
      value: "hours=2"
    # Gunicorn sizing: 3 workers x 4 threads per server pod (defaults 2x4).
    # Goes hand in hand with the 2Gi server memory limit, since each worker
    # preloads Django (~500Mi).
    - name: AUTHENTIK_WEB__WORKERS
      value: "3"
    - name: AUTHENTIK_WEB__THREADS
      value: "4"
    # Flow plans are cached for 30m and policy evaluations for 15m (defaults
    # 300s). Authentik 2026.2 keeps its cache in Postgres, so a TTL hit still
    # costs a SELECT — but one indexed lookup beats re-planning the flow
    # (~70 sequential ORM ops per flow stage POST).
    - name: AUTHENTIK_CACHE__TIMEOUT_FLOWS
      value: "1800"
    - name: AUTHENTIK_CACHE__TIMEOUT_POLICIES
      value: "900"
    # Do NOT set AUTHENTIK_POSTGRESQL__CONN_MAX_AGE here. With PgBouncer in
    # session mode every persistent Django connection pins a server
    # connection 1:1, so the 3x(20+5) pool saturated during the 2026-06-10
    # rolling restart (58s pool waits, readiness flapping, and the shared
    # CNPG primary failed over mid-storm). The ~1-2ms/request
    # connection-setup saving is not worth that risk on the shared PG
    # substrate.
  # Liveness budget sized for slow boots (2026-06-10 incident): during a
  # rolling restart pods queue on authentik's DB migration lock, and the go
  # layer answers /-/health/live before the core is up. With the default
  # 3x10s budget kubelet kill-looped every booting pod and amplified the
  # contention. The startup probe still bounds total boot time (60x10s).
  livenessProbe:
    failureThreshold: 6
    timeoutSeconds: 5
  # Roll one pod at a time without surging above the replica count.
  # NOTE(review): upstream authentik charts appear to name this key
  # `deploymentStrategy`; confirm the pinned chart actually consumes
  # `strategy`, otherwise this stanza is silently ignored — TODO verify.
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 0
      maxUnavailable: 1
  resources:
    requests:
      cpu: 100m
      memory: 1.5Gi
    limits:
      memory: 2Gi
  # Prefer spreading server pods across nodes; ScheduleAnyway keeps this a
  # soft preference rather than a scheduling blocker.
  topologySpreadConstraints:
    - maxSkew: 1
      topologyKey: kubernetes.io/hostname
      whenUnsatisfiable: ScheduleAnyway
      labelSelector:
        matchLabels:
          app.kubernetes.io/component: server
  ingress:
    enabled: false
    # hosts:
    #   - authentik.viktorbarzin.me
  podAnnotations:
    # Kubernetes annotation values must be strings, so the flag is quoted
    # rather than left as a YAML boolean.
    diun.enable: "true"
    # The worker needs no diun annotations because it uses the same image.
    # Single-quoted so the backslash reaches the regex engine untouched; the
    # version-separator dot is escaped so it only matches a literal ".".
    diun.include_tags: '^202[0-9]\.[0-9]+.*$'
  pdb:
    enabled: true
    minAvailable: 2
global:
  addPrometheusAnnotations: true
  image:
    # The tag tracks whatever Keel currently has deployed. Keel
    # (diun-annotated, keel.sh/enrolled namespace) bumps the IMAGE between
    # chart releases, whereas helm defaults the tag to the chart appVersion —
    # so without this pin any helm upgrade silently DOWNGRADES the running
    # pods to the chart pin. On 2026-06-10 a values-only apply rolled live
    # 2026.2.4 back to 2026.2.2 against a 2026.2.4-migrated DB and caused a
    # boot storm; see
    # docs/post-mortems/2026-06-10-authentik-downgrade-boot-storm.md.
    # Keep this tag in sync with what Keel has deployed whenever this chart
    # is touched; clear it only when bumping the chart version itself.
    tag: "2026.2.4"

worker:
  # Two replicas are enough: workers only run background tasks (LDAP sync,
  # email, certificate renewal) and serve no user-facing traffic, so 2-of-3
  # isn't needed for availability. The drop saves ~100m sustained CPU.
  replicas: 2
  env:
    # Same unauthenticated_age cap as the server: both the server (Django
    # session middleware) and the worker (cleanup tasks) need to see it.
    - name: AUTHENTIK_SESSIONS__UNAUTHENTICATED_AGE
      value: "hours=2"
    # Dramatiq worker threads per pod (default 2).
    - name: AUTHENTIK_WORKER__THREADS
      value: "4"
    # Cache settings stay in lockstep with server.env. CONN_MAX_AGE is
    # deliberately absent: session-mode PgBouncer pins persistent
    # connections (see the server.env note).
    - name: AUTHENTIK_CACHE__TIMEOUT_FLOWS
      value: "1800"
    - name: AUTHENTIK_CACHE__TIMEOUT_POLICIES
      value: "900"
  # Roll one pod at a time without surging above the replica count.
  # NOTE(review): upstream authentik charts appear to name this key
  # `deploymentStrategy`; confirm the pinned chart actually consumes
  # `strategy`, otherwise this stanza is silently ignored — TODO verify.
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 0
      maxUnavailable: 1
  resources:
    requests:
      cpu: 100m
      memory: 1.5Gi
    limits:
      memory: 2Gi
  # Prefer spreading worker pods across nodes; ScheduleAnyway keeps this a
  # soft preference rather than a scheduling blocker.
  topologySpreadConstraints:
    - maxSkew: 1
      topologyKey: kubernetes.io/hostname
      whenUnsatisfiable: ScheduleAnyway
      labelSelector:
        matchLabels:
          app.kubernetes.io/component: worker
  pdb:
    enabled: true
    maxUnavailable: 1

# Top-level PostgreSQL toggle, switched off.
# NOTE(review): presumably this disables a database bundled with the chart,
# given that authentik.postgresql.host points at pgbouncer.authentik —
# confirm against the chart's values reference.
postgresql:
  enabled: false