infra/stacks/authentik/modules/authentik/values.yaml
Viktor Barzin 4e88298976 authentik: incident hardening after the signin-speedup rollout storm
The first apply of the signin-speedup change triggered a ~50min authentik
outage (and a shared CNPG primary failover): the helm chart pin (2026.2.2)
silently DOWNGRADED the Keel-managed live image (2026.2.4) against an
already-migrated DB, default liveness probes kill-looped pods queuing on
authentik's migration advisory lock, and kills mid-migration left ghost
idle-in-transaction sessions holding that lock. Full analysis in
docs/post-mortems/2026-06-10-authentik-downgrade-boot-storm.md.

Hardening (all root causes):
- values.yaml: pin global.image.tag to the Keel-managed live tag (2026.2.4)
  so helm applies can never downgrade under Keel again
- values.yaml: server livenessProbe 6x10s/5s (was chart-default 3x10s/3s)
- values.yaml: REMOVE AUTHENTIK_POSTGRESQL__CONN_MAX_AGE (session-mode
  pgbouncer pins persistent conns 1:1 -> pool saturation, 58s/s waits)
- pgbouncer.ini: idle_transaction_timeout=300 reaps ghost lock holders;
  pgbouncer.tf gets a config-checksum annotation so ini changes roll pods
- authentik_provider.tf: drop the completed import stanza (adoption rule)
- traefik: suppress pre-existing keel.sh annotation/tier-label drift on
  auth-proxy/bot-block/x402/error-pages deployments (KEEL_LIFECYCLE_V1
  pattern) so applies stop stripping live Keel state

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-11 00:26:52 +00:00

147 lines
5.5 KiB
YAML
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

authentik:
  # NOTE: because existingSecret is set below, the chart does NOT render the
  # authentik.* values into an AUTHENTIK_* env Secret — the live env comes
  # from the orphaned, helm-keep-policy `goauthentik` Secret created by chart
  # 2025.10.3. Anything under authentik.* here is effectively INERT. All new
  # or tuned config MUST go through server.env / worker.env instead (see
  # .claude/reference/authentik-state.md).
  log_level: warning
  # log_level: trace
  secret_key: ""
  existingSecret:
    secretName: "goauthentik"
  # Error reporting sends anonymous usage data, stack traces on errors and
  # performance data to authentik.error-reporting.a7k.io; it is fully opt-in
  # and stays disabled here.
  error_reporting:
    enabled: false
  postgresql:
    # host: postgresql.dbaas
    host: pgbouncer.authentik
    port: 6432
    user: authentik
    password: ""
server:
  replicas: 3
  env:
    # Anonymous Django sessions (no completed login: bots, healthcheckers,
    # partial flows) expire in 2h. Default is days=1. Once login completes,
    # UserLoginStage.session_duration takes over via
    # request.session.set_expiry. Injected via server.env (not
    # authentik.sessions.*) because we use
    # authentik.existingSecret.secretName, which makes the chart skip
    # rendering the AUTHENTIK_* secret — so the values block doesn't reach
    # env.
    - name: AUTHENTIK_SESSIONS__UNAUTHENTICATED_AGE
      value: "hours=2"
    # Gunicorn: 3 workers x 4 threads per server pod (defaults 2x4).
    # Pairs with the server memory limit of 2Gi (each worker preloads
    # Django ~500Mi).
    - name: AUTHENTIK_WEB__WORKERS
      value: "3"
    - name: AUTHENTIK_WEB__THREADS
      value: "4"
    # Cache flow plans for 30m and policy evaluations for 15m (defaults
    # 300s). Authentik 2026.2 stores cache in Postgres, so a TTL hit is
    # still a SELECT — but a single indexed lookup beats re-planning the
    # flow (~70 sequential ORM ops per flow stage POST).
    - name: AUTHENTIK_CACHE__TIMEOUT_FLOWS
      value: "1800"
    - name: AUTHENTIK_CACHE__TIMEOUT_POLICIES
      value: "900"
    # Do NOT set AUTHENTIK_POSTGRESQL__CONN_MAX_AGE here. With PgBouncer in
    # session mode every persistent Django connection pins a server
    # connection 1:1, so the 3x(20+5) pool saturated during the 2026-06-10
    # rolling restart (58s pool waits, readiness flapping, and the shared
    # CNPG primary failed over mid-storm). The ~1-2ms/request
    # connection-setup saving is not worth that risk on the shared PG
    # substrate.
  # Liveness budget sized for slow boots (2026-06-10 incident): during a
  # rolling restart pods queue on authentik's DB migration lock; the go layer
  # answers /-/health/live before the core is up, so with the default 3x10s
  # budget kubelet kill-looped every booting pod and amplified the
  # contention. Startup probe still bounds total boot time (60x10s).
  livenessProbe:
    failureThreshold: 6
    timeoutSeconds: 5
  # Roll one pod at a time without surging above the replica count.
  # NOTE(review): the upstream authentik chart appears to name this key
  # `deploymentStrategy` — confirm that `strategy` is actually consumed.
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 0
      maxUnavailable: 1
  resources:
    requests:
      cpu: 100m
      memory: 1.5Gi
    limits:
      memory: 2Gi
  # Prefer spreading server pods across nodes; ScheduleAnyway keeps this a
  # soft preference rather than a scheduling requirement.
  topologySpreadConstraints:
    - maxSkew: 1
      topologyKey: kubernetes.io/hostname
      whenUnsatisfiable: ScheduleAnyway
      labelSelector:
        matchLabels:
          app.kubernetes.io/component: server
  ingress:
    enabled: false
    # hosts:
    #   - authentik.viktorbarzin.me
  podAnnotations:
    # Annotation values must be strings, so the flag is quoted.
    diun.enable: "true"
    # No need to annotate the worker as it uses the same image.
    diun.include_tags: "^202[0-9].[0-9]+.*$"
  pdb:
    enabled: true
    minAvailable: 2
global:
  addPrometheusAnnotations: true
  image:
    # Pin to the Keel-managed live tag. Keel (diun-annotated,
    # keel.sh/enrolled namespace) bumps the IMAGE between chart releases,
    # while helm defaults the tag to the chart appVersion — so any helm
    # upgrade silently DOWNGRADES the running pods to the chart pin
    # (2026-06-10: a values-only apply rolled live 2026.2.4 back to 2026.2.2
    # against a 2026.2.4-migrated DB → boot storm, see
    # docs/post-mortems/2026-06-10-authentik-downgrade-boot-storm.md).
    # Keep this tag in sync with what Keel has deployed when touching this
    # chart; clear it only when bumping the chart version itself.
    tag: "2026.2.4"
worker:
  # 2 replicas: workers handle background tasks (LDAP sync, email,
  # certificate renewal) — no user-facing traffic, so 2-of-3 isn't
  # needed for availability. Dropping the third replica saves ~100m
  # sustained CPU.
  replicas: 2
  env:
    # Same unauthenticated_age cap as server — both the server (Django
    # session middleware) and worker (cleanup tasks) need to see the value.
    - name: AUTHENTIK_SESSIONS__UNAUTHENTICATED_AGE
      value: "hours=2"
    # Dramatiq worker threads per pod (default 2).
    - name: AUTHENTIK_WORKER__THREADS
      value: "4"
    # Keep cache settings in lockstep with server.env. (No CONN_MAX_AGE —
    # see the server.env note: session-mode PgBouncer pins persistent
    # conns.)
    - name: AUTHENTIK_CACHE__TIMEOUT_FLOWS
      value: "1800"
    - name: AUTHENTIK_CACHE__TIMEOUT_POLICIES
      value: "900"
  # Roll one pod at a time without surging above the replica count.
  # NOTE(review): the upstream authentik chart appears to name this key
  # `deploymentStrategy` — confirm that `strategy` is actually consumed.
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 0
      maxUnavailable: 1
  resources:
    requests:
      cpu: 100m
      memory: 1.5Gi
    limits:
      memory: 2Gi
  # Prefer spreading worker pods across nodes; ScheduleAnyway keeps this a
  # soft preference rather than a scheduling requirement.
  topologySpreadConstraints:
    - maxSkew: 1
      topologyKey: kubernetes.io/hostname
      whenUnsatisfiable: ScheduleAnyway
      labelSelector:
        matchLabels:
          app.kubernetes.io/component: worker
  pdb:
    enabled: true
    maxUnavailable: 1
# Disabled: authentik.postgresql.host in this file points at
# pgbouncer.authentik, i.e. an external database.
# NOTE(review): presumably this key toggles the chart's bundled PostgreSQL
# — confirm against the chart's values reference.
postgresql:
  enabled: false