authentik: incident hardening after the signin-speedup rollout storm
The first apply of the signin-speedup change triggered a ~50min authentik outage (and a shared CNPG primary failover): the helm chart pin (2026.2.2) silently DOWNGRADED the Keel-managed live image (2026.2.4) against an already-migrated DB, default liveness probes kill-looped pods queuing on authentik's migration advisory lock, and kills mid-migration left ghost idle-in-transaction sessions holding that lock. Full analysis in docs/post-mortems/2026-06-10-authentik-downgrade-boot-storm.md.

Hardening (all root causes):
- values.yaml: pin global.image.tag to the Keel-managed live tag (2026.2.4) so helm applies can never downgrade under Keel again
- values.yaml: server livenessProbe 6x10s/5s (was chart-default 3x10s/3s)
- values.yaml: REMOVE AUTHENTIK_POSTGRESQL__CONN_MAX_AGE (session-mode pgbouncer pins persistent conns 1:1 -> pool saturation, 58s pool waits)
- pgbouncer.ini: idle_transaction_timeout=300 reaps ghost lock holders; pgbouncer.tf gets a config-checksum annotation so ini changes roll pods
- authentik_provider.tf: drop the completed import stanza (adoption rule)
- traefik: suppress pre-existing keel.sh annotation/tier-label drift on auth-proxy/bot-block/x402/error-pages deployments (KEEL_LIFECYCLE_V1 pattern) so applies stop stripping live Keel state

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
parent
97ccdbecb8
commit
4e88298976
8 changed files with 156 additions and 23 deletions
|
|
@ -217,11 +217,6 @@ resource "authentik_stage_user_login" "default_login" {
|
|||
# screen and bypass the password field.
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
import {
|
||||
to = authentik_stage_identification.default_identification
|
||||
id = "32aca5ab-106e-43f4-a4cc-4513d80e57f3"
|
||||
}
|
||||
|
||||
data "authentik_stage" "default_authentication_password" {
|
||||
name = "default-authentication-password"
|
||||
}
|
||||
|
|
@ -243,8 +238,6 @@ resource "authentik_stage_identification" "default_identification" {
|
|||
passwordless_flow,
|
||||
pretend_user_exists,
|
||||
captcha_stage,
|
||||
webauthn_stage,
|
||||
enable_remember_me,
|
||||
]
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -12,3 +12,7 @@ default_pool_size = 20
|
|||
reserve_pool_size = 5
|
||||
reserve_pool_timeout = 5
|
||||
ignore_startup_parameters = extra_float_digits
|
||||
; Reap server connections stuck "idle in transaction" (e.g. an authentik pod
|
||||
; killed mid-migration leaves a ghost transaction holding the migration
|
||||
; advisory lock, serializing every subsequent pod boot — 2026-06-10 incident).
|
||||
idle_transaction_timeout = 300
|
||||
|
|
|
|||
|
|
@ -48,6 +48,11 @@ resource "kubernetes_deployment" "pgbouncer" {
|
|||
labels = {
|
||||
app = "pgbouncer"
|
||||
}
|
||||
annotations = {
|
||||
# pgbouncer reads its ini only at startup (subPath mount never
|
||||
# propagates updates anyway) — roll the pods on config change.
|
||||
"checksum/pgbouncer-config" = sha1(kubernetes_config_map.pgbouncer_config.data["pgbouncer.ini"])
|
||||
}
|
||||
}
|
||||
|
||||
spec {
|
||||
|
|
|
|||
|
|
@ -47,13 +47,20 @@ server:
|
|||
value: "1800"
|
||||
- name: AUTHENTIK_CACHE__TIMEOUT_POLICIES
|
||||
value: "900"
|
||||
# Persistent client-side DB connections (safe with PgBouncer session mode;
|
||||
# must stay < pgbouncer server_idle_timeout=600s). Cuts per-request Django
|
||||
# connection setup off the auth hot path.
|
||||
- name: AUTHENTIK_POSTGRESQL__CONN_MAX_AGE
|
||||
value: "60"
|
||||
- name: AUTHENTIK_POSTGRESQL__CONN_HEALTH_CHECKS
|
||||
value: "true"
|
||||
# Do NOT set AUTHENTIK_POSTGRESQL__CONN_MAX_AGE here. With PgBouncer in
|
||||
# session mode every persistent Django connection pins a server connection
|
||||
# 1:1, so the 3x(20+5) pool saturated during the 2026-06-10 rolling
|
||||
# restart (58s pool waits, readiness flapping, and the shared CNPG primary
|
||||
# failed over mid-storm). The ~1-2ms/request connection-setup saving is
|
||||
# not worth that risk on the shared PG substrate.
|
||||
# Liveness budget sized for slow boots (2026-06-10 incident): during a
|
||||
# rolling restart pods queue on authentik's DB migration lock; the Go layer
|
||||
# answers /-/health/live before the core is up, so with the default 3x10s
|
||||
# budget kubelet kill-looped every booting pod and amplified the contention.
|
||||
# Startup probe still bounds total boot time (60x10s).
|
||||
livenessProbe:
|
||||
failureThreshold: 6
|
||||
timeoutSeconds: 5
|
||||
strategy:
|
||||
type: RollingUpdate
|
||||
rollingUpdate:
|
||||
|
|
@ -84,6 +91,16 @@ server:
|
|||
minAvailable: 2
|
||||
global:
|
||||
addPrometheusAnnotations: true
|
||||
image:
|
||||
# Pin to the Keel-managed live tag. Keel (diun-annotated, keel.sh/enrolled
|
||||
# namespace) bumps the IMAGE between chart releases, while helm defaults
|
||||
# the tag to the chart appVersion — so any helm upgrade silently
|
||||
# DOWNGRADES the running pods to the chart pin (2026-06-10: a values-only
|
||||
# apply rolled live 2026.2.4 back to 2026.2.2 against a 2026.2.4-migrated
|
||||
# DB → boot storm, see docs/post-mortems/2026-06-10-authentik-downgrade-
|
||||
# boot-storm.md). Keep this tag in sync with what Keel has deployed when
|
||||
# touching this chart; clear it only when bumping the chart version itself.
|
||||
tag: "2026.2.4"
|
||||
|
||||
worker:
|
||||
# 2 replicas: workers handle background tasks (LDAP sync, email,
|
||||
|
|
@ -98,15 +115,12 @@ worker:
|
|||
# Dramatiq worker threads per pod (default 2).
|
||||
- name: AUTHENTIK_WORKER__THREADS
|
||||
value: "4"
|
||||
# Keep cache + DB-connection settings in lockstep with server.env.
|
||||
# Keep cache settings in lockstep with server.env. (No CONN_MAX_AGE —
|
||||
# see the server.env note: session-mode PgBouncer pins persistent conns.)
|
||||
- name: AUTHENTIK_CACHE__TIMEOUT_FLOWS
|
||||
value: "1800"
|
||||
- name: AUTHENTIK_CACHE__TIMEOUT_POLICIES
|
||||
value: "900"
|
||||
- name: AUTHENTIK_POSTGRESQL__CONN_MAX_AGE
|
||||
value: "60"
|
||||
- name: AUTHENTIK_POSTGRESQL__CONN_HEALTH_CHECKS
|
||||
value: "true"
|
||||
strategy:
|
||||
type: RollingUpdate
|
||||
rollingUpdate:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue