authentik: speed up first-time signin (single-screen login, live env tuning, asset caching, outpost+nginx hot path)
Viktor asked to review Authentik and the web tier and make first-time signin to apps faster. Review found the slowness is screens and round trips, not server time.

Changes:
- values.yaml: the authentik.* Helm values (gunicorn workers, cache timeouts, conn_max_age) were silently INERT because existingSecret skips chart env rendering — pods ran defaults (2 workers, 300s caches, no persistent DB conns). Moved all tuning into server.env/worker.env, which actually reaches the pods.
- authentik_provider.tf: adopt the identification stage and pin password_stage so username+password render on ONE screen (the separate order-20 password binding is deleted via API — authentik requires that when embedding). Outpost log_level trace->info and 1->2 replicas (it is on the hot path of every forward-auth request; PG-backed sessions make 2 replicas safe).
- authentik module: /static ingress carve-out with immutable Cache-Control (assets are version-fingerprinted but served with no max-age — internal split-horizon users got zero caching).
- traefik auth-proxy nginx: upstream keepalive 32 + HTTP/1.1 (was opening a fresh TCP connection to the outpost per subrequest) + config-checksum annotation so config changes roll the pods.
- docs: authentication.md + authentik-state.md updated; fixed stale 'postgresql.dbaas has no endpoints' claim in CLAUDE.md/CONTEXT.md (it is a live CNPG primary-selector compatibility service).

Done via API in the same change (UI-managed objects): 6 OIDC providers (Vault, Forgejo, Immich, Headscale, linkwarden, Cloudflare Access) switched from explicit to implicit consent — all first-party, the 4-weekly consent screen only slowed first-time signin.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
parent
93ba67c84a
commit
97ccdbecb8
8 changed files with 232 additions and 55 deletions
|
|
@ -29,7 +29,7 @@ resource "kubernetes_namespace" "authentik" {
|
|||
labels = {
|
||||
tier = var.tier
|
||||
"resource-governance/custom-quota" = "true"
|
||||
"keel.sh/enrolled" = "true"
|
||||
"keel.sh/enrolled" = "true"
|
||||
}
|
||||
}
|
||||
lifecycle {
|
||||
|
|
@ -111,3 +111,44 @@ module "ingress-outpost" {
|
|||
anti_ai_scraping = false
|
||||
exclude_crowdsec = true
|
||||
}
|
||||
|
||||
# Immutable caching for the flow-executor static assets. Authentik serves
|
||||
# /static/dist/* with version-fingerprinted filenames (e.g. poly-2026.2.4.js)
|
||||
# but no max-age, so browsers re-validate the login JS bundle on every signin
|
||||
# — and split-horizon internal users (direct to Traefik, no Cloudflare) get no
|
||||
# edge cache at all. Long-lived immutable caching is safe: every authentik
|
||||
# upgrade changes the asset URLs.
|
||||
# Traefik Middleware (traefik.io/v1alpha1) in the authentik namespace that
# sets a fixed Cache-Control header on every response it is attached to.
resource "kubernetes_manifest" "static_cache_headers" {
|
||||
manifest = {
|
||||
apiVersion = "traefik.io/v1alpha1"
|
||||
kind = "Middleware"
|
||||
metadata = {
|
||||
# Other resources reference this middleware by name, so renaming it
|
||||
# requires updating those references as well.
name = "static-cache-headers"
|
||||
namespace = kubernetes_namespace.authentik.metadata[0].name
|
||||
}
|
||||
spec = {
|
||||
headers = {
|
||||
customResponseHeaders = {
|
||||
# max-age=31536000 is 365 days in seconds; "immutable" tells the
|
||||
# browser not to revalidate the asset while it is still fresh.
"Cache-Control" = "public, max-age=31536000, immutable"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Second ingress for the "authentik" host, limited to the static asset path
# and routed to the goauthentik-server service with an extra middleware.
module "ingress-static" {
|
||||
source = "../../../../modules/kubernetes/ingress_factory"
|
||||
# Same-host path carve-out of the public authentik UI ingress above, only
|
||||
# adding the cache-headers middleware for the static asset prefix.
|
||||
# auth = "none": versioned static assets of the (already public) Authentik login UI.
|
||||
auth = "none"
|
||||
namespace = kubernetes_namespace.authentik.metadata[0].name
|
||||
name = "authentik-static"
|
||||
|
||||
host = "authentik"
|
||||
service_name = "goauthentik-server"
|
||||
# NOTE(review): the prefix is /static, so anything served under it that is
|
||||
# NOT version-fingerprinted would also be cached as immutable for a year —
|
||||
# confirm, or narrow the prefix to the fingerprinted directory.
ingress_path = ["/static"]
|
||||
tls_secret_name = var.tls_secret_name
|
||||
anti_ai_scraping = false
|
||||
homepage_enabled = false
|
||||
# Traefik Kubernetes-CRD middleware reference; presumably of the form
|
||||
# <namespace>-<name>@kubernetescrd. The "authentik-" prefix is hard-coded,
|
||||
# so this assumes the namespace resolves to "authentik" — TODO confirm.
extra_middlewares = ["authentik-static-cache-headers@kubernetescrd"]
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,4 +1,10 @@
|
|||
authentik:
|
||||
# NOTE: because we set existingSecret below, the chart does NOT render the
|
||||
# authentik.* values into an AUTHENTIK_* env Secret — the live env comes
|
||||
# from the orphaned, helm-keep-policy `goauthentik` Secret created by chart
|
||||
# 2025.10.3. Anything under authentik.* here is effectively INERT. All new
|
||||
# or tuned config MUST go through server.env / worker.env instead (see
|
||||
# .claude/reference/authentik-state.md).
|
||||
log_level: warning
|
||||
# log_level: trace
|
||||
secret_key: ""
|
||||
|
|
@ -14,38 +20,40 @@ authentik:
|
|||
port: 6432
|
||||
user: authentik
|
||||
password: ""
|
||||
# Persistent client-side connections (safe with PgBouncer session mode;
|
||||
# must be < pgbouncer server_idle_timeout=600s). Cuts Django connection
|
||||
# setup overhead off the ~70 sequential ORM ops per flow stage.
|
||||
conn_max_age: 60
|
||||
conn_health_checks: true
|
||||
cache:
|
||||
# Cache flow plans for 30m and policy evaluations for 15m. Authentik 2026.2
|
||||
# moved cache storage from Redis to Postgres, so a TTL hit is still a
|
||||
# SELECT — but a single indexed lookup beats re-evaluating PolicyBindings.
|
||||
timeout_flows: 1800
|
||||
timeout_policies: 900
|
||||
web:
|
||||
# Gunicorn: 3 workers × 4 threads per server pod (default 2×4).
|
||||
# Pairs with the server memory bump to 2Gi (each worker preloads Django ~500Mi).
|
||||
workers: 3
|
||||
threads: 4
|
||||
worker:
|
||||
# Celery-equivalent worker threads per pod (default 2, renamed from
|
||||
# AUTHENTIK_WORKER__CONCURRENCY in 2025.8).
|
||||
threads: 4
|
||||
|
||||
server:
|
||||
replicas: 3
|
||||
# Anonymous Django sessions (no completed login: bots, healthcheckers,
|
||||
# partial flows) expire in 2h. Default is days=1. Once login completes,
|
||||
# UserLoginStage.session_duration takes over via request.session.set_expiry.
|
||||
# Injected via server.env (not authentik.sessions.*) because we use
|
||||
# authentik.existingSecret.secretName, which makes the chart skip
|
||||
# rendering the AUTHENTIK_* secret — so the values block doesn't reach env.
|
||||
env:
|
||||
# Anonymous Django sessions (no completed login: bots, healthcheckers,
|
||||
# partial flows) expire in 2h. Default is days=1. Once login completes,
|
||||
# UserLoginStage.session_duration takes over via request.session.set_expiry.
|
||||
# Injected via server.env (not authentik.sessions.*) because we use
|
||||
# authentik.existingSecret.secretName, which makes the chart skip
|
||||
# rendering the AUTHENTIK_* secret — so the values block doesn't reach env.
|
||||
- name: AUTHENTIK_SESSIONS__UNAUTHENTICATED_AGE
|
||||
value: "hours=2"
|
||||
# Gunicorn: 3 workers × 4 threads per server pod (defaults 2×4).
|
||||
# Pairs with the server memory limit of 2Gi (each worker preloads
|
||||
# Django ~500Mi).
|
||||
- name: AUTHENTIK_WEB__WORKERS
|
||||
value: "3"
|
||||
- name: AUTHENTIK_WEB__THREADS
|
||||
value: "4"
|
||||
# Cache flow plans for 30m and policy evaluations for 15m (defaults 300s).
|
||||
# Authentik 2026.2 stores cache in Postgres, so a TTL hit is still a
|
||||
# SELECT — but a single indexed lookup beats re-planning the flow
|
||||
# (~70 sequential ORM ops per flow stage POST).
|
||||
- name: AUTHENTIK_CACHE__TIMEOUT_FLOWS
|
||||
value: "1800"
|
||||
- name: AUTHENTIK_CACHE__TIMEOUT_POLICIES
|
||||
value: "900"
|
||||
# Persistent client-side DB connections (safe with PgBouncer session mode;
|
||||
# must stay < pgbouncer server_idle_timeout=600s). Cuts per-request Django
|
||||
# connection setup off the auth hot path.
|
||||
- name: AUTHENTIK_POSTGRESQL__CONN_MAX_AGE
|
||||
value: "60"
|
||||
- name: AUTHENTIK_POSTGRESQL__CONN_HEALTH_CHECKS
|
||||
value: "true"
|
||||
strategy:
|
||||
type: RollingUpdate
|
||||
rollingUpdate:
|
||||
|
|
@ -82,11 +90,23 @@ worker:
|
|||
# certificate renewal) — no user-facing traffic, so 2-of-3 isn't
|
||||
# needed for availability. Drop saves ~100m sustained CPU.
|
||||
replicas: 2
|
||||
# Same unauthenticated_age cap as server — both the server (Django session
|
||||
# middleware) and worker (cleanup tasks) need to see the value.
|
||||
env:
|
||||
# Same unauthenticated_age cap as server — both the server (Django session
|
||||
# middleware) and worker (cleanup tasks) need to see the value.
|
||||
- name: AUTHENTIK_SESSIONS__UNAUTHENTICATED_AGE
|
||||
value: "hours=2"
|
||||
# Dramatiq worker threads per pod (default 2).
|
||||
- name: AUTHENTIK_WORKER__THREADS
|
||||
value: "4"
|
||||
# Keep cache + DB-connection settings in lockstep with server.env.
|
||||
- name: AUTHENTIK_CACHE__TIMEOUT_FLOWS
|
||||
value: "1800"
|
||||
- name: AUTHENTIK_CACHE__TIMEOUT_POLICIES
|
||||
value: "900"
|
||||
- name: AUTHENTIK_POSTGRESQL__CONN_MAX_AGE
|
||||
value: "60"
|
||||
- name: AUTHENTIK_POSTGRESQL__CONN_HEALTH_CHECKS
|
||||
value: "true"
|
||||
strategy:
|
||||
type: RollingUpdate
|
||||
rollingUpdate:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue