authentik: speed up first-time signin (single-screen login, live env tuning, asset caching, outpost+nginx hot path)

Viktor asked to review Authentik and the web tier and make first-time
signin to apps faster. Review found the slowness is screens and round
trips, not server time. Changes:

- values.yaml: the authentik.* Helm values (gunicorn workers, cache
  timeouts, conn_max_age) were silently INERT because existingSecret
  skips chart env rendering — pods ran defaults (2 workers, 300s
  caches, no persistent DB conns). Moved all tuning into
  server.env/worker.env, which actually reaches the pods.
- authentik_provider.tf: adopt the identification stage and pin
  password_stage so username+password render on ONE screen (the
  separate order-20 password binding is deleted via API — authentik
  requires that when embedding). Outpost log_level trace->info and
  1->2 replicas (it is on the hot path of every forward-auth request;
  PG-backed sessions make 2 replicas safe).
- authentik module: /static ingress carve-out with immutable
  Cache-Control (assets are version-fingerprinted but served with no
  max-age — internal split-horizon users got zero caching).
- traefik auth-proxy nginx: upstream keepalive 32 + HTTP/1.1 (was
  opening a fresh TCP connection to the outpost per subrequest) +
  config-checksum annotation so config changes roll the pods.
- docs: authentication.md + authentik-state.md updated; fixed stale
  'postgresql.dbaas has no endpoints' claim in CLAUDE.md/CONTEXT.md
  (it is a live CNPG primary-selector compatibility service).

Done via API in the same change (UI-managed objects): 6 OIDC providers
(Vault, Forgejo, Immich, Headscale, linkwarden, Cloudflare Access)
switched from explicit to implicit consent — all first-party, the
4-weekly consent screen only slowed first-time signin.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
Viktor Barzin 2026-06-10 21:58:10 +00:00
parent 93ba67c84a
commit 97ccdbecb8
8 changed files with 232 additions and 55 deletions

View file

@ -91,14 +91,21 @@ resource "authentik_outpost" "embedded" {
protocol_providers = [authentik_provider_proxy.catchall.id]
service_connection = "99e227a7-4562-4888-9660-4c27da678c50"
config = jsonencode({
log_level = "trace"
docker_labels = null
authentik_host = "https://authentik.viktorbarzin.me/"
docker_network = null
container_image = null
docker_map_ports = true
refresh_interval = "minutes=5"
kubernetes_replicas = 1
# info, not trace: the outpost sits on the hot path of every request to
# every auth="required" ingress — trace logging is per-request overhead
# with no operational value (request access lines are emitted at info).
log_level = "info"
docker_labels = null
authentik_host = "https://authentik.viktorbarzin.me/"
docker_network = null
container_image = null
docker_map_ports = true
refresh_interval = "minutes=5"
# 2 replicas: removes the single-pod hot path for all forward-auth
# subrequests. Safe since sessions moved to the shared Postgres backend
# (authentik_providers_proxy_proxysession, 2026-05-10) — no pod-local
# session state anymore.
kubernetes_replicas = 2
kubernetes_namespace = "authentik"
authentik_host_browser = ""
object_naming_template = "ak-outpost-%(name)s"
@ -198,3 +205,46 @@ resource "authentik_stage_user_login" "default_login" {
]
}
}
# -----------------------------------------------------------------------------
# Default Identification stage — adopted 2026-06-10 to embed the password
# field on the identification screen (single-screen login: one round trip and
# one screen instead of two). Per authentik docs, when an Identification stage
# carries a password stage the Password stage must NOT be bound separately —
# the redundant order-20 binding on default-authentication-flow (pk
# 0fc677db-a23f-4ee7-8648-da342e14573b) was deleted via the API in the same
# change. Social-login users are unaffected: source buttons stay on the same
# screen and bypass the password field.
# -----------------------------------------------------------------------------
# Look up the existing password stage by name; its id is what gets embedded
# into the identification stage.
data "authentik_stage" "default_authentication_password" {
  name = "default-authentication-password"
}

# Adopt the already-existing identification stage into Terraform state rather
# than creating a second one.
import {
  id = "32aca5ab-106e-43f4-a4cc-4513d80e57f3"
  to = authentik_stage_identification.default_identification
}
# Identification stage with the password stage attached, so the username and
# password fields are rendered together on a single screen.
resource "authentik_stage_identification" "default_identification" {
  name           = "default-authentication-identification"
  password_stage = data.authentik_stage.default_authentication_password.id

  lifecycle {
    # Terraform pins password_stage only. Every attribute listed below is left
    # to the authentik UI (same approach as
    # authentik_stage_user_login.default_login).
    ignore_changes = [
      captcha_stage,
      case_insensitive_matching,
      enable_remember_me,
      enrollment_flow,
      passwordless_flow,
      pretend_user_exists,
      recovery_flow,
      show_matched_user,
      show_source_labels,
      sources,
      user_fields,
      webauthn_stage,
    ]
  }
}

View file

@ -29,7 +29,7 @@ resource "kubernetes_namespace" "authentik" {
labels = {
tier = var.tier
"resource-governance/custom-quota" = "true"
"keel.sh/enrolled" = "true"
"keel.sh/enrolled" = "true"
}
}
lifecycle {
@ -111,3 +111,44 @@ module "ingress-outpost" {
anti_ai_scraping = false
exclude_crowdsec = true
}
# Immutable caching for the flow-executor static assets. Authentik serves
# /static/dist/* with version-fingerprinted filenames (e.g. poly-2026.2.4.js)
# but no max-age, so browsers re-validate the login JS bundle on every signin
# and split-horizon internal users (direct to Traefik, no Cloudflare) get no
# edge cache at all. Long-lived immutable caching is safe: every authentik
# upgrade changes the asset URLs.
# Traefik Middleware that sets a one-year, immutable Cache-Control header on
# responses of whatever route it is attached to.
resource "kubernetes_manifest" "static_cache_headers" {
  manifest = {
    apiVersion = "traefik.io/v1alpha1"
    kind       = "Middleware"

    metadata = {
      namespace = kubernetes_namespace.authentik.metadata[0].name
      name      = "static-cache-headers"
    }

    spec = {
      headers = {
        customResponseHeaders = {
          # 31536000 s = 365 days.
          "Cache-Control" = "public, max-age=31536000, immutable"
        }
      }
    }
  }
}
# Dedicated ingress for the /static prefix of the authentik host. It routes to
# the same backend service as the main UI ingress and differs only in
# attaching the static-cache-headers middleware.
module "ingress-static" {
  source = "../../../../modules/kubernetes/ingress_factory"

  # Same-host path carve-out of the public authentik UI ingress above, only
  # adding the cache-headers middleware for the static asset prefix.
  namespace    = kubernetes_namespace.authentik.metadata[0].name
  name         = "authentik-static"
  host         = "authentik"
  service_name = "goauthentik-server"
  ingress_path = ["/static"]

  # auth = "none": versioned static assets of the (already public) Authentik
  # login UI.
  auth = "none"

  tls_secret_name  = var.tls_secret_name
  anti_ai_scraping = false
  homepage_enabled = false

  # Traefik addresses a CRD middleware from an Ingress as
  # "<namespace>-<name>@kubernetescrd". Build that reference from the actual
  # resources instead of hard-coding it: the string cannot drift if the
  # namespace or Middleware is renamed, and Terraform gains a dependency edge
  # so the Middleware is created before the ingress that points at it.
  extra_middlewares = [
    "${kubernetes_namespace.authentik.metadata[0].name}-${kubernetes_manifest.static_cache_headers.manifest.metadata.name}@kubernetescrd",
  ]
}

View file

@ -1,4 +1,10 @@
authentik:
# NOTE: because we set existingSecret below, the chart does NOT render the
# authentik.* values into an AUTHENTIK_* env Secret — the live env comes
# from the orphaned, helm-keep-policy `goauthentik` Secret created by chart
# 2025.10.3. Anything under authentik.* here is effectively INERT. All new
# or tuned config MUST go through server.env / worker.env instead (see
# .claude/reference/authentik-state.md).
log_level: warning
# log_level: trace
secret_key: ""
@ -14,38 +20,40 @@ authentik:
port: 6432
user: authentik
password: ""
# Persistent client-side connections (safe with PgBouncer session mode;
# must be < pgbouncer server_idle_timeout=600s). Cuts Django connection
# setup overhead off the ~70 sequential ORM ops per flow stage.
conn_max_age: 60
conn_health_checks: true
cache:
# Cache flow plans for 30m and policy evaluations for 15m. Authentik 2026.2
# moved cache storage from Redis to Postgres, so a TTL hit is still a
# SELECT — but a single indexed lookup beats re-evaluating PolicyBindings.
timeout_flows: 1800
timeout_policies: 900
web:
# Gunicorn: 3 workers × 4 threads per server pod (default 2×4).
# Pairs with the server memory bump to 2Gi (each worker preloads Django ~500Mi).
workers: 3
threads: 4
worker:
# Celery-equivalent worker threads per pod (default 2, renamed from
# AUTHENTIK_WORKER__CONCURRENCY in 2025.8).
threads: 4
server:
replicas: 3
# Anonymous Django sessions (no completed login: bots, healthcheckers,
# partial flows) expire in 2h. Default is days=1. Once login completes,
# UserLoginStage.session_duration takes over via request.session.set_expiry.
# Injected via server.env (not authentik.sessions.*) because we use
# authentik.existingSecret.secretName, which makes the chart skip
# rendering the AUTHENTIK_* secret — so the values block doesn't reach env.
env:
# Anonymous Django sessions (no completed login: bots, healthcheckers,
# partial flows) expire in 2h. Default is days=1. Once login completes,
# UserLoginStage.session_duration takes over via request.session.set_expiry.
# Injected via server.env (not authentik.sessions.*) because we use
# authentik.existingSecret.secretName, which makes the chart skip
# rendering the AUTHENTIK_* secret — so the values block doesn't reach env.
- name: AUTHENTIK_SESSIONS__UNAUTHENTICATED_AGE
value: "hours=2"
# Gunicorn: 3 workers × 4 threads per server pod (defaults 2×4).
# Pairs with the server memory limit of 2Gi (each worker preloads
# Django ~500Mi).
- name: AUTHENTIK_WEB__WORKERS
value: "3"
- name: AUTHENTIK_WEB__THREADS
value: "4"
# Cache flow plans for 30m and policy evaluations for 15m (defaults 300s).
# Authentik 2026.2 stores cache in Postgres, so a TTL hit is still a
# SELECT — but a single indexed lookup beats re-planning the flow
# (~70 sequential ORM ops per flow stage POST).
- name: AUTHENTIK_CACHE__TIMEOUT_FLOWS
value: "1800"
- name: AUTHENTIK_CACHE__TIMEOUT_POLICIES
value: "900"
# Persistent client-side DB connections (safe with PgBouncer session mode;
# must stay < pgbouncer server_idle_timeout=600s). Cuts per-request Django
# connection setup off the auth hot path.
- name: AUTHENTIK_POSTGRESQL__CONN_MAX_AGE
value: "60"
- name: AUTHENTIK_POSTGRESQL__CONN_HEALTH_CHECKS
value: "true"
strategy:
type: RollingUpdate
rollingUpdate:
@ -82,11 +90,23 @@ worker:
# certificate renewal) — no user-facing traffic, so 2-of-3 isn't
# needed for availability. Drop saves ~100m sustained CPU.
replicas: 2
# Same unauthenticated_age cap as server — both the server (Django session
# middleware) and worker (cleanup tasks) need to see the value.
env:
# Same unauthenticated_age cap as server — both the server (Django session
# middleware) and worker (cleanup tasks) need to see the value.
- name: AUTHENTIK_SESSIONS__UNAUTHENTICATED_AGE
value: "hours=2"
# Dramatiq worker threads per pod (default 2).
- name: AUTHENTIK_WORKER__THREADS
value: "4"
# Keep cache + DB-connection settings in lockstep with server.env.
- name: AUTHENTIK_CACHE__TIMEOUT_FLOWS
value: "1800"
- name: AUTHENTIK_CACHE__TIMEOUT_POLICIES
value: "900"
- name: AUTHENTIK_POSTGRESQL__CONN_MAX_AGE
value: "60"
- name: AUTHENTIK_POSTGRESQL__CONN_HEALTH_CHECKS
value: "true"
strategy:
type: RollingUpdate
rollingUpdate:

View file

@ -720,6 +720,11 @@ resource "kubernetes_config_map" "auth_proxy_config" {
"default.conf" = <<-EOT
upstream authentik {
server ak-outpost-authentik-embedded-outpost.authentik.svc.cluster.local:9000;
# Reuse connections to the outpost. Without this every forward-auth
# subrequest (= every request to every auth="required" ingress) opens
# a fresh TCP connection. Requires HTTP/1.1 + cleared Connection
# header on the proxy_pass locations below.
keepalive 32;
}
server {
listen 9000;
@ -734,6 +739,8 @@ resource "kubernetes_config_map" "auth_proxy_config" {
location /outpost.goauthentik.io/auth/traefik {
proxy_pass http://authentik;
proxy_http_version 1.1;
proxy_set_header Connection "";
proxy_connect_timeout 3s;
proxy_read_timeout 5s;
proxy_send_timeout 5s;
@ -764,6 +771,8 @@ resource "kubernetes_config_map" "auth_proxy_config" {
location /outpost.goauthentik.io/ {
proxy_pass http://authentik;
proxy_http_version 1.1;
proxy_set_header Connection "";
proxy_connect_timeout 3s;
proxy_read_timeout 10s;
proxy_set_header Host $host;
@ -820,6 +829,11 @@ resource "kubernetes_deployment" "auth_proxy" {
labels = {
app = "auth-proxy"
}
annotations = {
# nginx only reads its config at startup — roll the pods whenever
# the ConfigMap content changes.
"checksum/auth-proxy-config" = sha1(kubernetes_config_map.auth_proxy_config.data["default.conf"])
}
}
spec {
topology_spread_constraint {