From 16d9fd8bde0fe8d5fc11a79f8f34f577b11b4a9b Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sat, 18 Apr 2026 22:48:26 +0000 Subject: [PATCH 1/3] [infra] Adopt Authentik catch-all Proxy Provider + Application into TF (Wave 6a) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Context Wave 6a of the state-drift consolidation plan. The Domain wide catch all Proxy Provider (pk=5) + its wrapping Application (slug=domain-wide-catch-all) + the embedded outpost (uuid 0eecac07-97c7-443c-8925-05f2f4fe3e47) have run for a year as pure UI-created state. When the 2026-04-18 outpost SEV2 hit, it was harder to reason about the config than it should have been — the only source of truth was the Authentik admin UI. Bringing the provider + application under Terraform means future changes are reviewable in PRs and recoverable from git if the admin UI misbehaves. ## This change Adds the `goauthentik/authentik` provider to the repo's central `terragrunt.hcl` `required_providers` (side-effect: every stack can now declare authentik resources; this stack is the only current consumer). Stack-local `stacks/authentik/authentik_provider.tf` holds the provider instance configuration + API token wiring + two resources + their flow data-source lookups. ### Auth - API token stored in Vault KV v2 (mount `secret`, secret `authentik`, key `tf_api_token`), identifier `terraform-infra-stack`, intent=API, user=akadmin, no expiry. To rotate, rewrite the Vault KV entry; Terraform re-reads the token on the next plan. ### Imports (both landed zero-diff) - `authentik_application.catchall` ← id `domain-wide-catch-all` - `authentik_provider_proxy.catchall` ← id `5` ### Flow references Authorization + invalidation flows are looked up via `data "authentik_flow"` by slug (`default-provider-authorization-implicit-consent` + `default-provider-invalidation-flow`). Keeping them as data sources rather than hardcoded UUIDs means a flow recreation (slug unchanged) doesn't require an HCL edit. 
### `lifecycle { ignore_changes }` scope On `authentik_provider_proxy.catchall`: - `property_mappings` (5 UUIDs), `jwt_federation_sources` (1 UUID) — the live state references complex many-to-many relations that are easier to manage from the Authentik UI than to serialise in HCL. Drift suppressed. - `skip_path_regex`, `internal_host`, all `basic_auth_*`, `intercept_header_auth`, `access_token_validity` — either defaults or UI-only tuning knobs that aren't part of Terraform's concern for this catch-all provider. On `authentik_application.catchall`: - `meta_description`, `meta_launch_url`, `meta_icon`, `group`, `backchannel_providers`, `policy_engine_mode`, `open_in_new_tab` — cosmetic/non-functional attributes; the Authentik UI is the right place to edit these and drift on them isn't interesting. ## What is NOT in this change - Outpost-binding resource — the embedded outpost's provider list is a single-row many-to-many that the Authentik UI manages cleanly; adding TF there would fight the UI without reducing drift. - Property mappings and JWT federation source — managed via UI, drift suppressed. A future wave can bring them in when someone actually wants to edit them through code review. - Other Authentik entities (Flows, Stages, Groups, RBAC policies) — same rationale: UI is the natural editing surface. Adopt incrementally as they become interesting to code-review. ## Verification ``` $ cd stacks/authentik && ../../scripts/tg plan | grep Plan: Plan: 0 to add, 1 to change, 0 to destroy. # module.authentik.kubernetes_deployment.pgbouncer — pre-existing drift, # unrelated to this commit (image_pull_policy Always -> IfNotPresent) $ ../../scripts/tg state list | grep authentik_ authentik_application.catchall authentik_provider_proxy.catchall data.authentik_flow.default_authorization_implicit_consent data.authentik_flow.default_provider_invalidation ``` ## Reproduce locally 1. `git pull && cd stacks/authentik && ../../scripts/tg init` 2. 
Terraform pulls goauthentik/authentik provider (first time). 3. `tg plan` — expect only pgbouncer drift; authentik resources read-only. Refs: Wave 6a of the state-drift consolidation (code-hl1) Co-Authored-By: Claude Opus 4.7 (1M context) --- stacks/authentik/authentik_provider.tf | 59 ++++++++++++++++++++++++++ terragrunt.hcl | 4 ++ 2 files changed, 63 insertions(+) create mode 100644 stacks/authentik/authentik_provider.tf diff --git a/stacks/authentik/authentik_provider.tf b/stacks/authentik/authentik_provider.tf new file mode 100644 index 00000000..e9db3985 --- /dev/null +++ b/stacks/authentik/authentik_provider.tf @@ -0,0 +1,59 @@ +# goauthentik/authentik Terraform provider. +# +# Adopted 2026-04-18 (Wave 6a of the state-drift consolidation plan) to bring +# the catch-all Proxy Provider — previously managed only via the Authentik UI +# — under Terraform management. API token lives in Vault +# `secret/authentik/tf_api_token` (token identifier `terraform-infra-stack`, +# intent API, user akadmin, no expiry). Required-providers declaration sits +# in the central terragrunt.hcl so every stack has it available; only this +# stack configures a provider block. + +data "vault_kv_secret_v2" "authentik_tf" { + mount = "secret" + name = "authentik" +} + +provider "authentik" { + url = "https://authentik.viktorbarzin.me" + token = data.vault_kv_secret_v2.authentik_tf.data["tf_api_token"] +} + +data "authentik_flow" "default_authorization_implicit_consent" { + slug = "default-provider-authorization-implicit-consent" +} + +data "authentik_flow" "default_provider_invalidation" { + slug = "default-provider-invalidation-flow" +} + +# ----------------------------------------------------------------------------- +# Catch-all Proxy Provider + Application. +# +# Created via the Authentik UI ~a year ago; adopted into Terraform 2026-04-18 +# (Wave 6a). 
The proxy provider is consumed by the embedded outpost +# (uuid 0eecac07-97c7-443c-8925-05f2f4fe3e47) via an outpost-level binding +# that stays in the UI — it's a single toggle with no drift risk. +# ----------------------------------------------------------------------------- + +resource "authentik_application" "catchall" { + name = "Domain wide catch all" + slug = "domain-wide-catch-all" + protocol_provider = authentik_provider_proxy.catchall.id + lifecycle { + ignore_changes = [meta_description, meta_launch_url, meta_icon, group, backchannel_providers, policy_engine_mode, open_in_new_tab] + } +} + +resource "authentik_provider_proxy" "catchall" { + name = "Provider for Domain wide catch all" + mode = "forward_domain" + external_host = "https://authentik.viktorbarzin.me" + cookie_domain = "viktorbarzin.me" + # Flow UUIDs resolved dynamically so a flow re-creation (keeping the slug) + # doesn't require an HCL edit. + authorization_flow = data.authentik_flow.default_authorization_implicit_consent.id + invalidation_flow = data.authentik_flow.default_provider_invalidation.id + lifecycle { + ignore_changes = [property_mappings, jwt_federation_sources, skip_path_regex, internal_host, basic_auth_enabled, basic_auth_password_attribute, basic_auth_username_attribute, intercept_header_auth, access_token_validity] + } +} diff --git a/terragrunt.hcl b/terragrunt.hcl index d11f8b93..0376f6ed 100644 --- a/terragrunt.hcl +++ b/terragrunt.hcl @@ -62,6 +62,10 @@ terraform { source = "cloudflare/cloudflare" version = "~> 4" } + authentik = { + source = "goauthentik/authentik" + version = "~> 2024.10" + } } } From 11082f7e834f9967fc6353744bb3d45ee26ee35a Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sat, 18 Apr 2026 22:52:56 +0000 Subject: [PATCH 2/3] [infra] Partial Calico adoption: namespaces only (Wave 5b) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Context Wave 5b of the state-drift consolidation plan. 
Calico has run this cluster's pod networking since 2024-07-30, installed via raw kubectl manifests — tigera-operator Deployment + ~20 CRDs + an Installation CR. The plan flagged Calico as HIGH BLAST because the operator + Installation CR sit on the critical path for pod scheduling; any mistake during adoption can break CNI and block new pods cluster-wide within seconds. This session takes the safe sub-step: adopt only the three namespaces. Namespaces are label containers — TF managing their names + PSA labels cannot disrupt Calico networking. Getting the operator, Installation CR, and CRDs under TF requires dedicated prep (picking the right `ignore_changes` fields to absorb operator-generated defaults in the Installation CR, decoupling from the embedded PSA labels applied at admission, and a low-traffic window). Deferred to `code-3ad`. ## This change New Tier 1 stack `stacks/calico/` adopting via import `{}` blocks (Wave 8 convention, commit 8a99be11): - `kubernetes_namespace.calico_system` ← id `calico-system` - `kubernetes_namespace.calico_apiserver` ← id `calico-apiserver` - `kubernetes_namespace.tigera_operator` ← id `tigera-operator` Apply: `3 imported, 0 added, 0 changed, 0 destroyed.` Followed by a second `tg plan` that returns `No changes`. Zero cluster impact — namespaces stayed exactly as they were cluster-side. ### terragrunt dependency choice Deliberately no `dependency "platform"` clause — Calico is lower in the stack than platform, so introducing a `platform → calico` or `calico → platform` edge would invite cycle-like pain on first bootstrap. The plan on this stack is always safe to run standalone. ### `ignore_changes` scope on each namespace - `goldilocks.fairwinds.com/vpa-update-mode` — Kyverno ClusterPolicy stamp (Wave 3B sweep, commit 8b43692a). - `pod-security.kubernetes.io/enforce` + `-version` — tigera-operator stamps these on `calico-system` + `calico-apiserver` to opt them out of PSA. 
These labels aren't surfaced by the kubernetes provider as part of the import (they arrive through a different field manager), so they are left unmanaged to keep the plan clean. `tigera-operator` ns doesn't get the PSA labels so they aren't ignored there. ## What is NOT in this change - The live workloads in the three namespaces: `tigera-operator` Deployment in `tigera-operator` ns, `calico-kube-controllers`/`calico-node`/ `calico-typha` workloads in `calico-system`, the `calico-apiserver` in `calico-apiserver`. Apart from the operator Deployment itself (raw manifest, deferred to `code-3ad`), these are all reconciled by the tigera-operator from the Installation CR — importing them into TF is redundant with importing the CR itself. - The `Installation` CR (`default`, apiVersion `operator.tigera.io/v1`) — the user-authored minimal spec has since been filled to 104 lines of operator-generated defaults. Adopting it requires a well-scoped `ignore_changes` list on the `manifest` field. Separate follow-up `code-3ad`. - `.sops.yaml` / `tier0_stacks` updates — the original plan suggested Tier 0 (local SOPS state) for the full Calico stack on the theory that "network underpins all". With only three namespaces in the stack, the argument doesn't hold: a failed Tier 1 plan on calico namespaces cannot break networking, so no need to pay the Tier 0 tax. ## Verification ``` $ cd stacks/calico && ../../scripts/tg plan No changes. Your infrastructure matches the configuration. $ kubectl get pods -n calico-system NAME READY STATUS RESTARTS calico-kube-controllers-... 1/1 Running 0 calico-node-... 1/1 Running 0 ... (all healthy, pre-existing) ``` Follow-up: code-3ad for operator + Installation CR adoption (needs low-traffic window + ignore_changes scoping). Closes: code-hl1 scope of Wave 5b (namespaces). Remaining subwave in code-3ad. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- stacks/calico/main.tf | 67 ++++++++++++++++++++++++++++++++++++ stacks/calico/secrets | 1 + stacks/calico/terragrunt.hcl | 6 ++++ 3 files changed, 74 insertions(+) create mode 100644 stacks/calico/main.tf create mode 120000 stacks/calico/secrets create mode 100644 stacks/calico/terragrunt.hcl diff --git a/stacks/calico/main.tf b/stacks/calico/main.tf new file mode 100644 index 00000000..79bc756b --- /dev/null +++ b/stacks/calico/main.tf @@ -0,0 +1,67 @@ +# Calico CNI +# +# Calico has underpinned this cluster's pod networking since 2024-07-30, installed +# as raw kubectl manifests (tigera-operator Deployment + CRDs + Installation CR). +# Bringing the full stack under Terraform is high-blast — the operator and its +# Deployment must never flap during node pressure or during any apply, because +# new pod scheduling breaks within ~seconds of a CNI outage. +# +# This stack (created 2026-04-18 Wave 5b) adopts the three namespaces only: +# calico-system, calico-apiserver, tigera-operator. The `tigera-operator` +# Deployment, the 20+ CRDs it manages, and the `Installation` CR itself are +# intentionally *not* adopted yet — they require a low-traffic window and a +# careful ignore_changes set to cover operator-generated defaults on the +# Installation CR. Follow-up tracked in beads code-3ad. +# +# The namespaces are safe to adopt (no networking impact — they're just label +# containers) and give TF an audit trail entry for the labels/tier Kyverno +# cares about. + +resource "kubernetes_namespace" "calico_system" { + metadata { + name = "calico-system" + labels = { + name = "calico-system" + } + } + lifecycle { + # KYVERNO_LIFECYCLE_V1: goldilocks-vpa-auto-mode label on every namespace. + # pod-security.kubernetes.io/* labels are applied by the tigera-operator + # reconciler on calico-system + calico-apiserver for PSA 'privileged'. 
+ ignore_changes = [ + metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"], + metadata[0].labels["pod-security.kubernetes.io/enforce"], + metadata[0].labels["pod-security.kubernetes.io/enforce-version"], + ] + } +} + +resource "kubernetes_namespace" "calico_apiserver" { + metadata { + name = "calico-apiserver" + labels = { + name = "calico-apiserver" + } + } + lifecycle { + # KYVERNO_LIFECYCLE_V1 + PSA labels applied by tigera-operator (see calico_system). + ignore_changes = [ + metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"], + metadata[0].labels["pod-security.kubernetes.io/enforce"], + metadata[0].labels["pod-security.kubernetes.io/enforce-version"], + ] + } +} + +resource "kubernetes_namespace" "tigera_operator" { + metadata { + name = "tigera-operator" + labels = { + name = "tigera-operator" + } + } + lifecycle { + # KYVERNO_LIFECYCLE_V1: goldilocks-vpa-auto-mode ClusterPolicy stamps this label on every namespace + ignore_changes = [metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"]] + } +} diff --git a/stacks/calico/secrets b/stacks/calico/secrets new file mode 120000 index 00000000..ca54a7cf --- /dev/null +++ b/stacks/calico/secrets @@ -0,0 +1 @@ +../../secrets \ No newline at end of file diff --git a/stacks/calico/terragrunt.hcl b/stacks/calico/terragrunt.hcl new file mode 100644 index 00000000..eb956424 --- /dev/null +++ b/stacks/calico/terragrunt.hcl @@ -0,0 +1,6 @@ +include "root" { + path = find_in_parent_folders() +} + +# No platform dependency — Calico provides the cluster network the rest +# of the platform runs on. This stack must not introduce a dep cycle. 
From 471e9461337d0ffc2b9ca65b722a4ff28cabc57c Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sat, 18 Apr 2026 23:03:22 +0000 Subject: [PATCH 3/3] [monitoring] Put uk-payslip dashboard in Finance folder Grafana can't auto-create the reserved 'General' folder ('A folder with that name already exists'), which aborts the sidecar provisioner's walk and drops every dashboard in that folder. Move uk-payslip to Finance so it loads. --- stacks/monitoring/modules/monitoring/grafana.tf | 1 + 1 file changed, 1 insertion(+) diff --git a/stacks/monitoring/modules/monitoring/grafana.tf b/stacks/monitoring/modules/monitoring/grafana.tf index a3b61556..c82eee6b 100644 --- a/stacks/monitoring/modules/monitoring/grafana.tf +++ b/stacks/monitoring/modules/monitoring/grafana.tf @@ -134,6 +134,7 @@ locals { # Applications "qbittorrent.json" = "Applications" "realestate-crawler.json" = "Applications" + "uk-payslip.json" = "Finance" } }