diff --git a/.gitignore b/.gitignore index 460d943f..3475f32a 100755 --- a/.gitignore +++ b/.gitignore @@ -69,6 +69,7 @@ cloudflare_provider.tf tiers.tf stacks/*/cloudflare_provider.tf stacks/*/tiers.tf +stacks/*/terragrunt_rendered.json # Kubernetes config (sensitive) config diff --git a/docs/post-mortems/2026-04-18-authentik-outpost-shm-full.md b/docs/post-mortems/2026-04-18-authentik-outpost-shm-full.md index 9b735a97..5cd22fd1 100644 --- a/docs/post-mortems/2026-04-18-authentik-outpost-shm-full.md +++ b/docs/post-mortems/2026-04-18-authentik-outpost-shm-full.md @@ -109,9 +109,9 @@ Contributing distractions: | Priority | Action | Type | Details | Status | |----------|--------|------|---------|--------| -| P1 | Prometheus alert on outpost `/dev/shm` usage > 80% | Alert | Metric: `container_fs_usage_bytes{container!="",namespace="authentik",pod=~"ak-outpost-.*"} / container_fs_limit_bytes > 0.8`. Firing threshold 15 min, severity warning. | TODO | -| P1 | Prometheus alert on sustained 400 rate on forward-auth middleware | Alert | `increase(traefik_service_requests_total{code="400",service=~".*-viktorbarzin-me@.*"}[15m]) > 100` — catches mass-failure patterns at the Traefik level before the outpost is silently broken. | TODO | +| P1 | Prometheus alerts on outpost `/dev/shm` fill (two memory thresholds) and pod restarts | Alert | Group `Authentik Outpost` added in `stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl`. `AuthentikOutpostMemoryHigh` (warning, working set > 1.5 GiB for 15m) + `AuthentikOutpostMemoryCritical` (critical, > 1.8 GiB for 5m) + `AuthentikOutpostRestarts` (warning, > 2 restarts in 30m). Applied 2026-04-18 13:16 UTC; loaded in Prometheus, state=inactive. | **DONE** | | P1 | Uptime-Kuma meta-monitor: "N+ external monitors down simultaneously" | Alert | Either a Prometheus rule over `uptime_kuma_monitor_status == 0` counts, or a dedicated external probe. Very strong signal of shared-infra failure. 
| TODO | +| P1 | Bump tmpfs `sizeLimit` from 512Mi → 2Gi | Config | Patched outpost `kubernetes_json_patches` via Authentik API; applied 2026-04-18 13:06 UTC. Gives ~8× growth headroom at current probe rate before needing reconsideration. | **DONE** | ### P2 — Codify the fix so it survives drift diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl index 6e6a6d04..5188c1ca 100755 --- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl +++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl @@ -1907,6 +1907,38 @@ serverFiles: severity: warning annotations: summary: "{{ $value | printf \"%.0f\" }} service(s) externally unreachable but internally healthy — check Cloudflare tunnel, DNS, or Traefik routing" + - name: "Authentik Outpost" + # Guards against the 2026-04-18 incident where /dev/shm filled with + # gorilla/sessions FileStore files (~44k files at ~1.5KB each) and the + # outpost returned HTTP 400 on every forward-auth request. + # See docs/post-mortems/2026-04-18-authentik-outpost-shm-full.md. + rules: + - alert: AuthentikOutpostMemoryHigh + # Working set includes /dev/shm tmpfs contents (session files). + # sizeLimit on the outpost emptyDir is 2Gi; warn at 75% to leave + # plenty of headroom for mitigation before ENOSPC. 
+ expr: container_memory_working_set_bytes{namespace="authentik", pod=~"ak-outpost-.*", container="proxy"} > 1.5 * 1024 * 1024 * 1024 + for: 15m + labels: + severity: warning + annotations: + summary: "Authentik outpost working set {{ $value | humanize1024 }}B — /dev/shm may be filling with session files (threshold 1.5 GiB of 2 GiB sizeLimit)" + - alert: AuthentikOutpostMemoryCritical + expr: container_memory_working_set_bytes{namespace="authentik", pod=~"ak-outpost-.*", container="proxy"} > 1.8 * 1024 * 1024 * 1024 + for: 5m + labels: + severity: critical + annotations: + summary: "Authentik outpost near /dev/shm fill ({{ $value | humanize1024 }}B) — imminent forward-auth failure. Restart pod: kubectl -n authentik delete pod -l goauthentik.io/outpost-name=authentik-embedded-outpost" + - alert: AuthentikOutpostRestarts + # Pod restarts on a stateless outpost usually mean OOM or crash. + # Normal is 0; we expect one manual rollout per incident/upgrade. + expr: increase(kube_pod_container_status_restarts_total{namespace="authentik", pod=~"ak-outpost-.*"}[30m]) > 2 + for: 5m + labels: + severity: warning + annotations: + summary: "Authentik outpost restarted {{ $value | printf \"%.0f\" }} times in 30m — check for OOM or crash loop" extraScrapeConfigs: | - job_name: 'proxmox-host'