From ccbcebb67085064d7819cd8a5070f9d2421d3b61 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Tue, 17 Mar 2026 23:15:25 +0000 Subject: [PATCH] feat(vault): automate SOPS onboarding for namespace-owners - Add Transit mount + per-stack Transit keys to vault stack TF - Auto-create sops-user-{user} policy scoping decrypt to owned stacks - Auto-create sops-{user} external group + alias for Authentik mapping - Add sops-admin policy to authentik-admins group - Attach sops-user policy to namespace-owner identity entities - Update add-user skill with SOPS onboarding steps and Authentik group - Adding a user to k8s_users + applying vault stack = full SOPS access [ci skip] --- .claude/skills/add-user/SKILL.md | 73 +++++++++++++++++++++-------- stacks/vault/main.tf | 79 ++++++++++++++++++++++++++++++-- 2 files changed, 127 insertions(+), 25 deletions(-) diff --git a/.claude/skills/add-user/SKILL.md b/.claude/skills/add-user/SKILL.md index 9b833021..212344c6 100644 --- a/.claude/skills/add-user/SKILL.md +++ b/.claude/skills/add-user/SKILL.md @@ -12,6 +12,8 @@ description: | Add a new namespace-owner to the cluster. No code changes needed — only Vault KV update + stack applies. +SOPS state encryption access is **automatically provisioned** by the vault stack — per-stack Transit keys, policies, identity groups, and group aliases are all created from the `k8s_users` map. No manual SOPS setup is required in Vault; the only manual step is adding the user to the `sops-USERNAME` group in Authentik (see Step 1). + ## Workflow ### Step 1: Collect Information @@ -31,9 +33,10 @@ Ask the user for ALL of the following before proceeding: Also confirm: - Has the user been added to the **`kubernetes-namespace-owners`** group in [Authentik](https://authentik.viktorbarzin.me)? (Manual step — admin must do this in the UI) +- Has the user been added to the **`sops-USERNAME`** group in Authentik? (Required for terraform state decrypt — the vault stack creates the Vault external group, but the Authentik group must exist and the user must be in it) - Does the user need VPN access?
If yes, also add to **`Headscale Users`** group in Authentik. -**Do NOT proceed until the Authentik group assignment is confirmed.** +**Do NOT proceed until the Authentik group assignments are confirmed.** ### Step 2: Update Vault KV @@ -76,27 +79,39 @@ vault kv get -field=k8s_users secret/platform | jq '.USERNAME' ### Step 3: Apply Stacks -Apply in order. Use the `scripts/tg` wrapper or `terragrunt` directly. +Apply in order. Use the `scripts/tg` wrapper. ```bash cd /Users/viktorbarzin/code/infra -# 1. Vault stack — creates namespace, Vault policy, identity entity, deployer role -cd stacks/vault && terragrunt apply --non-interactive +# 1. Vault stack — creates namespace, Vault policy, identity entity, deployer role, +# SOPS Transit key, SOPS policy, SOPS identity group + alias +cd stacks/vault && ../../scripts/tg apply --non-interactive cd ../.. -# 2. Platform stack — creates RBAC bindings, ResourceQuota, TLS secret, DNS records -cd stacks/platform && terragrunt apply --non-interactive +# 2. RBAC stack — creates RBAC bindings, ResourceQuota, TLS secret +cd stacks/rbac && ../../scripts/tg apply --non-interactive cd ../.. # 3. Woodpecker stack — adds user to Woodpecker admin list -cd stacks/woodpecker && terragrunt apply --non-interactive +cd stacks/woodpecker && ../../scripts/tg apply --non-interactive cd ../.. ``` -Use the `devops-engineer` agent for each apply to get background pod health monitoring. +### Step 4: Create Per-Stack Encrypted State -### Step 4: Verify +For each of the user's namespaces, ensure the Transit key is used for state encryption. New stacks created for the user will automatically use per-stack keys via `scripts/state-sync`. 
+ +If the user's stack already has state, re-encrypt it with the new per-stack key: +```bash +# Force re-encrypt: delete the old .enc so state-sync re-creates it with the per-stack Transit key. NOTE(review): assumes the decrypted terraform.tfstate is already present locally — confirm (e.g. run `scripts/state-sync decrypt NAMESPACE` first) before deleting. +rm state/stacks/NAMESPACE/terraform.tfstate.enc +scripts/state-sync encrypt NAMESPACE +git add state/stacks/NAMESPACE/terraform.tfstate.enc +git commit -m "state(NAMESPACE): re-encrypt with per-stack Transit key" +``` + +### Step 5: Verify ```bash # Namespace exists @@ -105,25 +120,38 @@ kubectl get namespace USERNAME_NAMESPACE # ResourceQuota applied kubectl describe resourcequota -n USERNAME_NAMESPACE -# Vault policy exists +# Vault policy exists (namespace-owner + SOPS) vault policy read namespace-owner-USERNAME +vault policy read sops-user-USERNAME -# Vault identity entity exists +# Vault identity entity exists (with both policies) vault read identity/entity/name/USERNAME +# SOPS group exists +vault read identity/group/name/sops-USERNAME + # K8s deployer role works vault write kubernetes/creds/NAMESPACE-deployer kubernetes_namespace=NAMESPACE +# SOPS Transit key exists +vault read transit/keys/sops-state-NAMESPACE + # DNS record (if domains specified) dig DOMAIN.viktorbarzin.me ``` -### Step 5: Notify User +### Step 6: Notify User Tell the user to share these onboarding instructions with the new user: - K8s Portal: `https://k8s-portal.viktorbarzin.me/onboarding?role=namespace-owner` - README: `https://github.com/ViktorBarzin/infra#new-user-onboarding` +The user can decrypt their stack's state with: +```bash +vault login -method=oidc # authenticates via Authentik SSO +scripts/state-sync decrypt NAMESPACE # decrypts only their stack +``` + ## What Gets Auto-Generated | Resource | Stack | Driven by | @@ -132,20 +160,25 @@ Tell the user to share these onboarding instructions with the new user: | Vault policy (`namespace-owner-{user}`) | vault | user key | | Vault identity entity + OIDC alias | vault | user email | | K8s deployer Role + Vault K8s role | vault | `namespaces` list | -| RBAC
RoleBinding (namespace admin) | platform | `namespaces` list | -| RBAC ClusterRoleBinding (cluster read-only) | platform | user role | -| ResourceQuota | platform | `quota` object | -| TLS secret in namespace | platform | `namespaces` list | -| Cloudflare DNS records | platform | `domains` list | +| **SOPS Transit key** (`sops-state-{ns}`) | vault | `namespaces` list | +| **SOPS Vault policy** (`sops-user-{user}`) | vault | user key + namespaces | +| **SOPS identity group** (`sops-{user}`) | vault | user key | +| **SOPS group alias** (maps Authentik group) | vault | user key | +| RBAC RoleBinding (namespace admin) | rbac | `namespaces` list | +| RBAC ClusterRoleBinding (cluster read-only) | rbac | user role | +| ResourceQuota | rbac | `quota` object | +| TLS secret in namespace | rbac | `namespaces` list | +| Cloudflare DNS records | cloudflared | `domains` list | | Woodpecker admin access | woodpecker | user key | ## Checklist - [ ] Authentik: user added to `kubernetes-namespace-owners` group +- [ ] Authentik: user added to `sops-USERNAME` group (for SOPS state decrypt) - [ ] Authentik: user added to `Headscale Users` group (if VPN needed) - [ ] Vault KV: `k8s_users` entry added to `secret/platform` -- [ ] Vault stack applied — namespace + policy + identity + deployer role created -- [ ] Platform stack applied — RBAC + quota + TLS + DNS created +- [ ] Vault stack applied — namespace + policy + identity + deployer role + SOPS Transit key + SOPS policy + SOPS group created +- [ ] RBAC stack applied — RBAC + quota + TLS created - [ ] Woodpecker stack applied — admin list updated -- [ ] Verification: namespace, quota, policy, deployer role all confirmed +- [ ] Verification: namespace, quota, policies (namespace-owner + sops-user), deployer role, Transit key all confirmed - [ ] User notified with onboarding link diff --git a/stacks/vault/main.tf b/stacks/vault/main.tf index f0a19819..77517420 100644 --- a/stacks/vault/main.tf +++ b/stacks/vault/main.tf @@ -184,10 
+184,19 @@ resource "vault_policy" "admin" { EOT } +resource "vault_policy" "sops_admin" { + name = "sops-admin" + policy = <<-EOT + path "transit/encrypt/sops-state-*" { capabilities = ["update"] } + path "transit/decrypt/sops-state-*" { capabilities = ["update"] } + path "transit/keys/sops-state-*" { capabilities = ["read"] } + EOT +} + resource "vault_identity_group" "admins" { name = "authentik-admins" type = "external" - policies = [vault_policy.admin.name] + policies = [vault_policy.admin.name, vault_policy.sops_admin.name] } resource "vault_identity_group_alias" "admins" { @@ -694,7 +703,7 @@ locals { } resource "kubernetes_namespace" "user_namespace" { - for_each = local.user_namespaces + for_each = nonsensitive(local.user_namespaces) metadata { name = each.value @@ -736,6 +745,66 @@ resource "vault_policy" "namespace_owner" { EOT } +# ============================================================================= +# Transit Secrets Engine — SOPS State Encryption +# ============================================================================= + +resource "vault_mount" "transit" { + path = "transit" + type = "transit" + depends_on = [helm_release.vault] +} + +# --- SOPS State Encryption — Per-Stack Transit Keys --- +# Namespace-owners get Transit keys for their stacks only. +# Admins get the wildcard sops-admin policy via the authentik-admins group.
+ +resource "vault_transit_secret_backend_key" "sops_user_stack" { + for_each = nonsensitive(local.user_namespaces) + + backend = vault_mount.transit.path + name = "sops-state-${each.value}" + depends_on = [vault_mount.transit] +} + +resource "vault_policy" "sops_user" { + for_each = nonsensitive({ + for name, user in local.k8s_users : name => user + if user.role == "namespace-owner" + }) + + name = "sops-user-${each.key}" + policy = join("\n", [ + for ns in each.value.namespaces : <<-EOT + path "transit/encrypt/sops-state-${ns}" { capabilities = ["update"] } + path "transit/decrypt/sops-state-${ns}" { capabilities = ["update"] } + path "transit/keys/sops-state-${ns}" { capabilities = ["read"] } + EOT + ]) +} + +resource "vault_identity_group" "sops_user" { + for_each = nonsensitive({ + for name, user in local.k8s_users : name => user + if user.role == "namespace-owner" + }) + + name = "sops-${each.key}" + type = "external" + policies = [vault_policy.sops_user[each.key].name] +} + +resource "vault_identity_group_alias" "sops_user" { + for_each = nonsensitive({ + for name, user in local.k8s_users : name => user + if user.role == "namespace-owner" + }) + + name = "sops-${each.key}" + mount_accessor = vault_jwt_auth_backend.oidc.accessor + canonical_id = vault_identity_group.sops_user[each.key].id +} + resource "vault_identity_entity" "namespace_owner" { for_each = nonsensitive({ for name, user in local.k8s_users : name => user @@ -743,7 +812,7 @@ resource "vault_identity_entity" "namespace_owner" { }) name = each.key - policies = [vault_policy.namespace_owner[each.key].name] + policies = [vault_policy.namespace_owner[each.key].name, vault_policy.sops_user[each.key].name] } resource "vault_identity_entity_alias" "namespace_owner" { @@ -758,7 +827,7 @@ resource "vault_identity_entity_alias" "namespace_owner" { } resource "kubernetes_role" "user_deployer" { - for_each = local.user_namespaces + for_each = nonsensitive(local.user_namespaces) metadata { name = 
"${each.value}-deployer" @@ -779,7 +848,7 @@ resource "kubernetes_role" "user_deployer" { } resource "vault_kubernetes_secret_backend_role" "user_deployer" { - for_each = local.user_namespaces + for_each = nonsensitive(local.user_namespaces) backend = vault_kubernetes_secret_backend.k8s.path name = "${each.value}-deployer"