diff --git a/.claude/reference/authentik-state.md b/.claude/reference/authentik-state.md index d9622033..30094776 100644 --- a/.claude/reference/authentik-state.md +++ b/.claude/reference/authentik-state.md @@ -138,3 +138,54 @@ Notes: - `ProxyProvider.remember_me_offset` stays UI-managed via `ignore_changes`. - The Authentik provider's resource schema does **not** expose the `Outpost.managed` field. We rely on TF's "write only fields it knows about" semantic: the server-set `goauthentik.io/outposts/embedded` value is preserved across applies because Terraform never writes `managed`. Don't change the resource provider schema expectations without verifying this assumption holds. - The `unauthenticated_age` env var is injected via `server.env` / `worker.env` (not `authentik.sessions.unauthenticated_age`) because we set `authentik.existingSecret.secretName: goauthentik`, which makes the chart skip rendering its own `AUTHENTIK_*` Secret. The `authentik.*` value block is therefore inert in this stack — anything new under `authentik.*` must use the `*.env` arrays instead. The same applies to the existing `authentik.cache.*`, `authentik.web.*`, `authentik.worker.*` blocks (currently inert; live values come from the orphaned, helm-keep-policy `goauthentik` Secret created by chart 2025.10.3 before `existingSecret` was introduced). + +## Upgrade Validation Checklist + +Run after **any** of these: +- Authentik chart version bump in `stacks/authentik/modules/authentik/main.tf` (the `version = "..."` line on `helm_release.authentik`). +- `goauthentik/authentik` Terraform provider version bump. +- Outpost pod recreation (kured reboot, eviction, manual `rollout restart`, scheduler move). + +The fragile surfaces are the `kubernetes_json_patches` and the `Outpost.managed` field — both rely on assumptions that can silently break across upgrades. The checklist exercises the same path the alerts watch, so it doubles as a smoke test for the alerts. + +```bash +# 1. 
Service routes to the outpost pod (NOT the server pods). +# Empty endpoints => auth-proxy fallback fires; expected: ONE pod IP, ports 9000/9300/9443. +kubectl -n authentik get endpoints ak-outpost-authentik-embedded-outpost + +# 2. Service selector still excludes the server pods. Expected: includes +# `app.kubernetes.io/name: authentik-outpost-proxy`. If it flips to +# `name: authentik`, the goauthentik upstream bug came back or our +# JSON patch was unset. +kubectl -n authentik get svc ak-outpost-authentik-embedded-outpost -o jsonpath='{.spec.selector}' + +# 3. Outpost mode + session backend. Expected log lines on startup: +# {"embedded":true,"event":"Outpost mode",...} +# {"event":"using PostgreSQL session backend",...} +# If embedded=false or `using filesystem session backend`, the postgres +# fix is broken — likely `Outpost.managed` got cleared, or the upstream +# schema started exposing `managed` and TF reset it. +kubectl -n authentik logs deploy/ak-outpost-authentik-embedded-outpost | grep -E '"Outpost mode"|session backend"' | head -3 + +# 4. /dev/shm is essentially empty (postgres backend = no filesystem use). +# A file count > a few dozen indicates filesystem fallback is firing. +kubectl -n authentik exec deploy/ak-outpost-authentik-embedded-outpost -- sh -c 'df -h /dev/shm; ls /dev/shm | wc -l' + +# 5. Postgres session table is growing with traffic. Expected: rows with +# `expires` ~28 days out (matches access_token_validity = weeks=4). +kubectl -n authentik exec deploy/goauthentik-server -- ak shell -c " +from django.db import connection; c = connection.cursor() +c.execute('SELECT COUNT(*), MAX(expires) FROM authentik_providers_proxy_proxysession') +print(c.fetchone())" + +# 6. Edge auth flow: should be 302 → authentik. NOT 401 with WWW-Authenticate. +curl -sS -o /dev/null -D - 'https://terminal.viktorbarzin.me/' -H 'User-Agent: Mozilla/5.0' \ + | grep -iE '^HTTP|^location|x-auth-fallback|www-authenticate' + +# 7. 
Terraform plan-to-zero on the whole authentik stack. +( cd stacks/authentik && /home/wizard/code/infra/scripts/tg plan ) | grep -E 'No changes|Plan:' +``` + +Steps 1, 3, and 6 cover the failure modes the Prometheus alerts trigger on (`AuthentikForwardAuthFallbackActive`, `AuthentikOutpostForwardAuth400Spike`). Steps 4 and 5 cover the silent-regression case (filesystem fallback) where the alerts don't fire but the system loses its postgres-backed session persistence on the next pod restart. + +If step 2 shows the controller restored `app.kubernetes.io/name=authentik`, watch the goauthentik/authentik issue tracker for fixes around `internal/outpost/controllers/k8s/service.py:52` — the upstream patch might let us drop our `kubernetes_json_patches.service` workaround. diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl index 27b53083..3ec9d0d5 100755 --- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl +++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl @@ -1789,6 +1789,89 @@ serverFiles: severity: critical annotations: summary: "Calico: only {{ $value | printf \"%.0f\" }} of desired calico-node pods ready — networking degraded" + # Upgrade Gates: any firing alert here halts kured rolling reboots via + # --prometheus-url + alertFilterRegexp ignore-list (see stacks/kured/main.tf). + # These are silent-failure detectors and cluster-health velocity signals + # that catch cascade-style failures (March 2026 26h outage class). 
+ - name: "Upgrade Gates" + rules: + - alert: KubeAPIServerDown + expr: up{job="kubernetes-apiservers"} == 0 + for: 2m + labels: + severity: critical + annotations: + summary: "Kubernetes apiserver {{ $labels.instance }} is down — control plane degraded, blocks kured" + - alert: KubeStateMetricsDown + expr: absent(kube_node_info) + for: 5m + labels: + severity: critical + annotations: + summary: "kube-state-metrics not responding — many alerts are SILENT until this is fixed" + - alert: PrometheusRuleEvaluationFailing + expr: increase(prometheus_rule_evaluation_failures_total[10m]) > 0 + for: 5m + labels: + severity: critical + annotations: + summary: "Prometheus rule evaluation failing — alerting itself is degraded ({{ $value | printf \"%.0f\" }} failures in 10m)" + - alert: PVCStuckPending + expr: kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1 + for: 10m + labels: + severity: warning + annotations: + summary: "PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} stuck Pending for 10m+" + - alert: RecentNodeReboot + expr: (time() - process_start_time_seconds{job="kubernetes-nodes"}) < 86400 + for: 0m + labels: + severity: info + annotations: + summary: "Node {{ $labels.node }} kubelet started {{ $value | humanizeDuration }} ago — 24h soak window halts further reboots" + - alert: MysqlStandaloneDown + expr: kube_statefulset_status_replicas_ready{statefulset="mysql-standalone"} < 1 + for: 2m + labels: + severity: critical + annotations: + summary: "mysql-standalone has 0 ready replicas — DB-dependent apps will fail" + - alert: ClusterPodReadyRatioDropped + expr: | + ( + sum(kube_pod_status_ready{condition="true"}) + / sum(kube_pod_status_phase{phase="Running"}) + ) < 0.9 + and on() (time() - process_start_time_seconds{job="prometheus"}) > 900 + for: 5m + labels: + severity: warning + annotations: + summary: "Cluster pod-ready ratio is {{ $value | printf \"%.2f\" }} (threshold: 0.9) — possible cascade" + - alert: NodeMemoryPressure + expr: 
 kube_node_status_condition{condition="MemoryPressure",status="true"} == 1 + for: 5m + labels: + severity: warning + annotations: + summary: "Node {{ $labels.node }} reports MemoryPressure=true — kubelet may evict pods" + - alert: NodeDiskPressure + expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1 + for: 5m + labels: + severity: warning + annotations: + summary: "Node {{ $labels.node }} reports DiskPressure=true — image GC may not keep up" + - alert: KubeQuotaAlmostFull + expr: | + kube_resourcequota{type="used"} + / on(namespace, resourcequota, resource) (kube_resourcequota{type="hard"} > 0) > 0.95 + for: 15m + labels: + severity: warning + annotations: + summary: "ResourceQuota {{ $labels.namespace }}/{{ $labels.resource }} at {{ $value | printf \"%.2f\" }} — workloads may fail to reschedule" - name: "Traefik Ingress" rules: - alert: TraefikDown @@ -2368,12 +2451,41 @@ serverFiles: # Sudden 400 spike from the outpost means forward-auth is broken # for all protected services. The /dev/shm ENOSPC class of failures # manifests as the outpost returning 400 on /outpost.goauthentik.io/auth/traefik. - expr: sum by (service) (increase(traefik_service_requests_total{code="400", service=~"authentik-authentik-outpost.*"}[5m])) > 10 + # Service label format is `authentik-ak-outpost-authentik-embedded-outpost-9000@kubernetes` + # — the original regex `authentik-authentik-outpost.*` never matched anything (fixed 2026-05-10). 
+ expr: sum by (service) (increase(traefik_service_requests_total{code="400", service=~"authentik-ak-outpost-.*"}[5m])) > 10 for: 2m labels: severity: critical annotations: summary: "Authentik outpost returning {{ $value | printf \"%.0f\" }} 400s in 5m on {{ $labels.service }} — forward-auth broken for all 43 protected services" + - alert: AuthentikForwardAuthFallbackActive + # Catches the auth-proxy "Emergency Access" Basic-Auth fallback firing + # at the edge — symptom of the outpost service having zero ready + # endpoints (selector mismatch, label drift, controller bug). The + # auth-proxy nginx returns 401 with `WWW-Authenticate: Basic` and + # `X-Auth-Fallback: true` in that case; Traefik proxies the 401 + # back through the websecure entrypoint. + # + # Why this rule and not `kube_endpoint_address_available == 0`: + # kube-state-metrics endpoint metrics are silently dropped by the + # Prometheus pipeline in this cluster (kube_endpoint_* series + # exist but never have current values). Detecting the failure + # signal at the edge is more reliable than instrumenting the + # broken middle. + # + # Baseline 401/s on websecure is ~0.02 (linkwarden API). Threshold + # of 5 leaves ~250x headroom; fallback firing on a busy site + # immediately pushes 401/s well above that. The expression has no service filter, so sustained 401s from any backend behind websecure count toward the threshold too. + # + # See `.claude/reference/authentik-state.md` for the upgrade + # validation checklist that exercises the same path. + expr: sum(rate(traefik_entrypoint_requests_total{code="401",entrypoint="websecure"}[5m])) > 5 + for: 5m + labels: + severity: critical + annotations: + summary: "websecure 401 rate {{ $value | printf \"%.1f\" }}/s for 5m — Authentik forward-auth Emergency Access fallback likely firing. Check `kubectl -n authentik get endpoints ak-outpost-authentik-embedded-outpost`." - alert: AuthentikServerReplicasMismatch # With 3 replicas + PDB minAvailable=2, a sustained drop to <3 # means a node is unschedulable, image pull failing, or quota hit.