diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md index 3451fe78..da5f0f51 100755 --- a/.claude/CLAUDE.md +++ b/.claude/CLAUDE.md @@ -66,7 +66,7 @@ Violations cause state drift, which causes future applies to break or silently r - **ESO (External Secrets Operator)**: `stacks/external-secrets/` — 43 ExternalSecrets + 9 DB-creds ExternalSecrets. API version `v1beta1`. Two ClusterSecretStores: `vault-kv` and `vault-database`. - **Plan-time pattern**: Former plan-time stacks use `data "kubernetes_secret"` to read ESO-created K8s Secrets at plan time (no Vault dependency). First-apply gotcha: must `terragrunt apply -target=kubernetes_manifest.external_secret` first, then full apply. `count` on resources using secret values fails — remove conditional counts. - **14 hybrid stacks** still keep `data "vault_kv_secret_v2"` for plan-time needs (job commands, Helm templatefile, module inputs). Platform has 48 plan-time refs — no migration possible without restructuring modules. -- **Database rotation**: Vault DB engine rotates passwords every 7 days (604800s). MySQL: speedtest, wrongmove, codimd, nextcloud, shlink, grafana, phpipam. PostgreSQL: health, linkwarden, affine, woodpecker, claude_memory, crowdsec, technitium, matrix. Excluded: authentik (PgBouncer), root users. **Apps that read a rotated secret only at startup** (env var / initContainer, not a hot-reloaded mount) MUST carry a Reloader annotation (`secret.reloader.stakater.com/reload: `) or they keep the stale password and silently fail DB auth on each rotation until manually restarted — matrix's Synapse `inject-db-password` initContainer hit exactly this (found via Loki 2026-06-05, ~12.9k auth-fail lines/hr). Technitium uses a password-sync CronJob (every 6h) to push rotated password to the Technitium app config via API, disable SQLite + MySQL logging, check PG plugin is loaded, configure PG query logging (90-day retention), and disable SQLite on secondary/tertiary instances. 
+- **Database rotation**: Vault DB engine rotates passwords every 7 days (604800s). MySQL: speedtest, wrongmove, codimd, nextcloud, shlink, grafana, phpipam. PostgreSQL: health, linkwarden, affine, woodpecker, claude_memory, crowdsec, technitium. Excluded: authentik (PgBouncer), root users. **Apps that read a rotated secret only at startup** (env var / initContainer, not a hot-reloaded mount) MUST carry a Reloader annotation (`secret.reloader.stakater.com/reload: SECRET_NAME`) or they keep the stale password and silently fail DB auth on each rotation until manually restarted — matrix's Synapse `inject-db-password` initContainer hit exactly this (found via Loki 2026-06-05, ~12.9k auth-fail lines/hr); matrix has since migrated to tuwunel (RocksDB, no Postgres) on 2026-06-08 and is no longer in the rotation list above. Technitium uses a password-sync CronJob (every 6h) to push rotated password to the Technitium app config via API, disable SQLite + MySQL logging, check PG plugin is loaded, configure PG query logging (90-day retention), and disable SQLite on secondary/tertiary instances. - **K8s credentials**: Vault K8s secrets engine. Roles: `dashboard-admin`, `ci-deployer`, `openclaw`, `local-admin`. Use `vault write kubernetes/creds/ROLE kubernetes_namespace=NS`. Helper: `scripts/vault-kubeconfig`. - **CI/CD (GHA + Woodpecker)**: Docker builds run on **GitHub Actions** (free on public repos). Woodpecker is **deploy-only** — receives image tag via API POST, runs `kubectl set image`. Woodpecker authenticates via K8s SA JWT → Vault K8s auth. Sync CronJob pushes `secret/ci/global` → Woodpecker API every 6h. Shell scripts in HCL heredocs: escape `$` → `$$`, `%{}` → `%%{}`. - **Platform cannot depend on vault** (circular). Apply order: vault first, then platform. Platform has 48 vault refs, all in module inputs — no ESO migration possible. 
diff --git a/.claude/reference/authentik-state.md b/.claude/reference/authentik-state.md index 28ac87ba..31fb102c 100644 --- a/.claude/reference/authentik-state.md +++ b/.claude/reference/authentik-state.md @@ -14,7 +14,7 @@ | Kubernetes | OAuth2/OIDC (public) | implicit consent | | Kubernetes Dashboard | OAuth2/OIDC (confidential) | implicit consent | | linkwarden | OAuth2/OIDC | explicit consent | -| Matrix | OAuth2/OIDC | implicit consent | +| Matrix | OAuth2/OIDC | ⚠️ orphaned — Matrix migrated to tuwunel 2026-06-08 (native password auth); this OAuth app is unused | | wrongmove | OAuth2/OIDC | implicit consent | > **Kubernetes Dashboard** (TF-managed in `stacks/k8s-dashboard/authentik.tf`): diff --git a/.claude/reference/service-catalog.md b/.claude/reference/service-catalog.md index 85fa1cf1..1a722452 100644 --- a/.claude/reference/service-catalog.md +++ b/.claude/reference/service-catalog.md @@ -86,7 +86,7 @@ | diun | Docker image update notifier — detects new versions, fires webhook to n8n upgrade agent | diun | | meshcentral | Remote management | meshcentral | | homepage | Dashboard/startpage | homepage | -| matrix | Matrix chat server | matrix | +| matrix | Matrix homeserver (tuwunel — Rust, RocksDB; native password auth) | matrix | | linkwarden | Bookmark manager | linkwarden | | changedetection | Web change detection | changedetection | | tandoor | Recipe manager | tandoor | diff --git a/.claude/reference/upgrade-config.json b/.claude/reference/upgrade-config.json index cfbb5ec8..7f2c4712 100644 --- a/.claude/reference/upgrade-config.json +++ b/.claude/reference/upgrade-config.json @@ -7,7 +7,6 @@ "docker.io/mailserver/docker-mailserver": "docker-mailserver/docker-mailserver", "mailserver/docker-mailserver": "docker-mailserver/docker-mailserver", "docker.n8n.io/n8nio/n8n": "n8n-io/n8n", - "matrixdotorg/synapse": "element-hq/synapse", "headscale/headscale": "juanfont/headscale", "technitium/dns-server": "TechnitiumSoftware/DnsServer", 
"ghcr.io/paperless-ngx/paperless-ngx": "paperless-ngx/paperless-ngx", @@ -82,7 +81,6 @@ "dawarich": { "type": "postgresql", "db_name": "dawarich", "shared": true }, "health": { "type": "postgresql", "db_name": "health", "shared": true }, "linkwarden": { "type": "postgresql", "db_name": "linkwarden", "shared": true }, - "matrix": { "type": "postgresql", "db_name": "matrix", "shared": true }, "n8n": { "type": "postgresql", "db_name": "n8n", "shared": true }, "netbox": { "type": "postgresql", "db_name": "netbox", "shared": true }, "rybbit": { "type": "postgresql", "db_name": "rybbit", "shared": true }, diff --git a/docs/architecture/authentication.md b/docs/architecture/authentication.md index e87b586f..bd0b5941 100644 --- a/docs/architecture/authentication.md +++ b/docs/architecture/authentication.md @@ -102,7 +102,7 @@ Authentik provides OIDC for 10 applications: | Kubernetes | OIDC (public client) | K8s API authentication (kubectl / kubelogin CLI) | | Kubernetes Dashboard | OIDC (confidential) | Built for dashboard SSO — currently **idle** (apiserver OIDC blocked; dashboard uses forward-auth + token-paste) | | Linkwarden | OIDC | Bookmark manager SSO | -| Matrix | OIDC | Matrix homeserver SSO | +| Matrix | OIDC | ⚠️ Legacy/orphaned — Synapse→tuwunel migration 2026-06-08; tuwunel uses native password auth, OIDC SSO not wired | | Wrongmove | OIDC | Real estate app SSO | ### Kubernetes API authentication (OIDC) — CURRENTLY NON-FUNCTIONAL diff --git a/docs/plans/2026-06-08-matrix-synapse-to-tuwunel-design.md b/docs/plans/2026-06-08-matrix-synapse-to-tuwunel-design.md new file mode 100644 index 00000000..2a9cbdeb --- /dev/null +++ b/docs/plans/2026-06-08-matrix-synapse-to-tuwunel-design.md @@ -0,0 +1,52 @@ +# Matrix: Synapse → tuwunel migration — Design + +**Date:** 2026-06-08 +**Status:** Implemented +**Stack:** `stacks/matrix` (+ `stacks/vault` cleanup) + +## Context + +The `matrix` homeserver ran **Synapse** (`matrixdotorg/synapse:v1.151.0`) on a +cramped 
`256Mi/512Mi` allocation. Synapse (Python) wants 1–2 GB; at 512Mi it was +starved. During a Slack-vs-Discord-vs-Matrix evaluation Viktor confirmed Slack +stays his primary hub, but wanted a **working, federated Matrix server kept +available "in case I need it."** The resource pain was Synapse-specific — not +inherent to Matrix — so the fix was to swap the homeserver implementation, not +abandon Matrix. + +## Decision + +Replace Synapse with **tuwunel v1.7.1** (Rust, RocksDB) — the +enterprise/Swiss-government-backed official successor to the (archived 2026-01-19) +conduwuit. + +| Choice | Decision | Rationale | +|---|---|---| +| Homeserver | **tuwunel** (vs continuwuity) | Corporate-backed, full-time staff → best longevity for a set-and-forget server | +| Data | **Fresh start** (no migration) | No supported Synapse(Postgres)→RocksDB path; Viktor confirmed old rooms/messages disposable | +| Federation | **ON** | A backup server is only useful if it can reach the wider Matrix network | +| `server_name` | **unchanged** (`matrix.viktorbarzin.me`) | Element clients keep pointing at the same place; only a re-login needed | +| Database | **embedded RocksDB** on the existing encrypted PVC | Drops the entire CNPG dependency; local-SSD LUKS2 suits RocksDB's small writes (NFS would be wrong) | +| Registration | token-gated, then **disabled** | First user = admin; locked down after registering `@viktor` | +| Auth | **native password** | tuwunel OIDC SSO not wired — Authentik Matrix OAuth app is now orphaned (harmless) | +| Media cap | **50 MiB** | Kept under Cloudflare's 100 MB proxied-request ceiling | + +## Alternatives considered + +- **Keep Synapse, bump to 2 GB** — zero-migration, but stays the heavy Python + server; rejected in favour of the lightweight Rust target Viktor asked for. +- **continuwuity** — community continuation; viable, but backed only by a volunteer community; + tuwunel's corporate backing won on longevity. 
+- **Synapse → tuwunel data migration** — not possible (different storage + engines); fresh start is the only path. + +## As-built + +- Fully env-var configured (`TUWUNEL_*`, `__` for nested) — no TOML ConfigMap. +- tuwunel serves its own `.well-known/matrix/{client,server}` → federation + resolves to Cloudflare-proxied `:443` (no 8448 / SRV needed). +- Ingress unchanged: `auth = "none"` (Matrix uses bearer/signed requests), + `dns_type = "proxied"`. +- Pod `securityContext` `runAsUser/runAsGroup/fsGroup = 1000` so uid 1000 can + write the encrypted RocksDB PVC. +- Image kept under Keel + diun semver management (`^v\d+\.\d+\.\d+$`). diff --git a/docs/plans/2026-06-08-matrix-synapse-to-tuwunel-plan.md b/docs/plans/2026-06-08-matrix-synapse-to-tuwunel-plan.md new file mode 100644 index 00000000..e8633886 --- /dev/null +++ b/docs/plans/2026-06-08-matrix-synapse-to-tuwunel-plan.md @@ -0,0 +1,58 @@ +# Matrix: Synapse → tuwunel migration — Plan (executed) + +**Date:** 2026-06-08 · **Companion:** `2026-06-08-matrix-synapse-to-tuwunel-design.md` + +## Executed steps + +1. **Vault** — generated a 32-byte `registration_token`, stored at + `secret/matrix`. +2. **`stacks/matrix` rewrite** — replaced Synapse with tuwunel: removed the + `matrix-db-creds` ExternalSecret, both init-containers (`install-psycopg2`, + `inject-db-password`), the `extra-packages` volume, and the Reloader + annotation; added the `matrix-secrets` ExternalSecret (vault-kv `dataFrom`), + the `TUWUNEL_*` env, `securityContext` 1000, and the tuwunel image. Encrypted + PVC, Service (`80→8008`), and ingress (`auth="none"`, proxied) unchanged. + - The image is in the deployment's `ignore_changes` (KEEL_IGNORE_IMAGE); it + was **temporarily un-ignored** for this base-image swap, then re-added at + step 5 so Keel resumes tag management. + - `tg init -reconfigure` was required first (Tier-1 PG-backend creds rotate + weekly → "Backend configuration block has changed"). +3. 
**Apply** — `Plan: 1 to add, 2 to change, 1 to destroy`. tuwunel 1.7.1 came up + 1/1, created a fresh RocksDB on the encrypted PVC (no permission errors — + fsGroup worked). +4. **Verify** — all `200`: `/_tuwunel/server_version`, + `.well-known/matrix/{client,server}`, `/_matrix/client/versions`, `/_matrix/federation/v1/version`. + Registered `@viktor:matrix.viktorbarzin.me` (first user → admin) via the token + flow; `whoami` confirmed. Creds stored at `secret/matrix` + (`admin_user`, `admin_password`) — note the `matrix-secrets` ExternalSecret extracts the whole `secret/matrix` path, so these also sync into the K8s Secret. +5. **Lock down** — `TUWUNEL_ALLOW_REGISTRATION=false` + re-added image + `ignore_changes`; applied. Registration now returns `403 M_FORBIDDEN`. +6. **Cleanup** — + - `stacks/vault`: removed the `pg_matrix` static role + its `allowed_roles` + entry (targeted apply — the full plan also wanted an **unrelated** OIDC + `tune`-TTL change, deliberately NOT applied; see residual items). + - Dropped the orphaned `matrix` Postgres DB (16 MB) + `matrix` role on the + CNPG primary (`pg-cluster-2`). + - Docs updated: `.claude/CLAUDE.md` (PG-rotation list), `service-catalog.md`, + `upgrade-config.json` (removed synapse image-rename + matrix PG entry), + `authentication.md` + `authentik-state.md` (Matrix OIDC → orphaned). + +## Rollback + +Fresh start was confirmed, so there is no Synapse data to preserve. To revert the +*service*: restore the Synapse `main.tf` from git, re-add the `pg_matrix` Vault +role, and restore the `matrix` Postgres DB from the daily per-db dump +(`/backup/per-db/matrix/`). The reused encrypted PVC still holds Synapse's old +`homeserver.yaml` / signing key / media at the volume root, side by side with the new +RocksDB files (the volume is mounted directly at `TUWUNEL_DATABASE_PATH`). + +## Residual / follow-up items (flagged to user) + +- **Authentik Matrix OAuth2 app is now orphaned** — tuwunel uses native password + auth (OIDC SSO not wired). Harmless; can be removed from the authentik stack + later if desired. 
+- **Pre-existing drift in `stacks/vault`**: `vault_jwt_auth_backend.oidc` shows a + `tune` diff (explicit `768h` default/max lease TTLs being dropped). This + predates this migration and was **not** applied. Resolve separately. +- **Synapse leftover files** remain on the encrypted PVC volume root (unused by + tuwunel). Can be `rm`'d after confidence in the new server. diff --git a/stacks/matrix/main.tf b/stacks/matrix/main.tf index b83fe49a..b3c545c7 100644 --- a/stacks/matrix/main.tf +++ b/stacks/matrix/main.tf @@ -10,7 +10,7 @@ resource "kubernetes_namespace" "matrix" { name = "matrix" labels = { "istio-injection" : "disabled" - tier = local.tiers.aux + tier = local.tiers.aux "keel.sh/enrolled" = "true" } } @@ -20,34 +20,30 @@ resource "kubernetes_namespace" "matrix" { } } -# DB credentials from Vault database engine (rotated every 24h) -resource "kubernetes_manifest" "db_external_secret" { +# Registration token from Vault KV (secret/matrix). Token-gated registration: +# enabled transiently to register the admin account, then allow_registration is +# flipped to false. The token stays in Vault so registration can be re-opened +# later (e.g. to add family) without regenerating it. +resource "kubernetes_manifest" "secrets_external_secret" { manifest = { apiVersion = "external-secrets.io/v1beta1" kind = "ExternalSecret" metadata = { - name = "matrix-db-creds" + name = "matrix-secrets" namespace = "matrix" } spec = { refreshInterval = "15m" secretStoreRef = { - name = "vault-database" + name = "vault-kv" kind = "ClusterSecretStore" } target = { - name = "matrix-db-creds" - template = { - data = { - DB_PASSWORD = "{{ .password }}" - } - } + name = "matrix-secrets" } - data = [{ - secretKey = "password" - remoteRef = { - key = "static-creds/pg-matrix" - property = "password" + dataFrom = [{ + extract = { + key = "matrix" } }] } @@ -61,6 +57,8 @@ module "tls_secret" { tls_secret_name = var.tls_secret_name } +# RocksDB lives here. 
proxmox-lvm-encrypted (local SSD, LUKS2) suits the +# homeserver DB's many small writes; NFS would be the wrong backend. resource "kubernetes_persistent_volume_claim" "data_encrypted" { wait_until_bound = false metadata { @@ -98,18 +96,6 @@ resource "kubernetes_deployment" "matrix" { app = "matrix" tier = local.tiers.aux } - annotations = { - # Synapse reads the DB password ONLY at startup: the inject-db-password - # initContainer seds matrix-db-creds into homeserver.yaml. That secret is - # rotated by Vault via the ESO above (15m refresh), so without an - # auto-reload the running pod keeps a stale password and Synapse's DB - # auth fails on every rotation until a manual `rollout restart` (observed - # 2026-06-05). Reloader watches the named secret and rolls the deployment - # when it changes. Explicit form (not auto/search) because the secret is - # referenced only in an initContainer env var, not a mount/envFrom, so - # Reloader's reference auto-discovery is unreliable here. - "secret.reloader.stakater.com/reload" = "matrix-db-creds" - } } spec { replicas = 1 @@ -127,79 +113,99 @@ resource "kubernetes_deployment" "matrix" { app = "matrix" } annotations = { - "diun.enable" = "true" - "diun.include_tags" = "^v\\d+\\.\\d+\\.\\d+$" - "dependency.kyverno.io/wait-for" = "pg-cluster-rw.dbaas:5432" + "diun.enable" = "true" + "diun.include_tags" = "^v\\d+\\.\\d+\\.\\d+$" } } spec { - init_container { - name = "install-psycopg2" - image = "matrixdotorg/synapse:v1.151.0" - command = ["/bin/sh", "-c", "pip install --target=/extra-packages psycopg2-binary 2>/dev/null"] - volume_mount { - name = "extra-packages" - mount_path = "/extra-packages" - } - } - init_container { - name = "inject-db-password" - image = "busybox:1.37" - command = ["/bin/sh", "-c", <<-EOF - # Update database config in homeserver.yaml with current Vault-managed password - sed -i "s|host: .*dbaas.*|host: pg-cluster-rw.dbaas.svc.cluster.local|" /data/homeserver.yaml - sed -i "s|user: .*|user: matrix|" 
/data/homeserver.yaml - sed -i "s|password: .*|password: $DB_PASSWORD|" /data/homeserver.yaml - echo "DB password injected" - EOF - ] - env { - name = "DB_PASSWORD" - value_from { - secret_key_ref { - name = "matrix-db-creds" - key = "DB_PASSWORD" - } - } - } - volume_mount { - name = "data" - mount_path = "/data" - } + # tuwunel runs as an unprivileged static binary; fsGroup makes the + # encrypted RocksDB volume group-writable so uid 1000 can write it + # (avoids the init-chown/fsGroup mismatch that parked hermes-agent). + security_context { + run_as_user = 1000 + run_as_group = 1000 + fs_group = 1000 } container { - image = "matrixdotorg/synapse:v1.151.0" + image = "ghcr.io/matrix-construct/tuwunel:v1.7.1" name = "matrix" port { container_port = 8008 } env { - name = "SYNAPSE_SERVER_NAME" + name = "TUWUNEL_SERVER_NAME" value = "matrix.viktorbarzin.me" } env { - name = "SYNAPSE_REPORT_STATS" - value = "yes" + name = "TUWUNEL_DATABASE_PATH" + value = "/var/lib/tuwunel" } env { - name = "PYTHONPATH" - value = "/extra-packages" + name = "TUWUNEL_PORT" + value = "8008" + } + env { + name = "TUWUNEL_ADDRESS" + value = "0.0.0.0" + } + env { + name = "TUWUNEL_ALLOW_FEDERATION" + value = "true" + } + env { + name = "TUWUNEL_TRUSTED_SERVERS" + value = jsonencode(["matrix.org"]) + } + # Registration disabled. To add a user later: set "true", apply, + # register with the Vault token (secret/matrix), then set back to "false". + env { + name = "TUWUNEL_ALLOW_REGISTRATION" + value = "false" + } + env { + name = "TUWUNEL_REGISTRATION_TOKEN" + value_from { + secret_key_ref { + name = "matrix-secrets" + key = "registration_token" + } + } + } + # 50 MiB — kept under Cloudflare's 100 MB proxied-request ceiling. + env { + name = "TUWUNEL_MAX_REQUEST_SIZE" + value = "52428800" + } + # tuwunel serves its own .well-known so federation resolves to 443 + # (Cloudflare-proxied) without a separate 8448 / SRV record. 
+ env { + name = "TUWUNEL_WELL_KNOWN__CLIENT" + value = "https://matrix.viktorbarzin.me" + } + env { + name = "TUWUNEL_WELL_KNOWN__SERVER" + value = "matrix.viktorbarzin.me:443" + } + # Real client IP for rate-limiting: behind Cloudflare's CF-Connecting-IP. + env { + name = "TUWUNEL_IP_SOURCE" + value = "cf_connecting_ip" + } + env { + name = "TUWUNEL_LOG" + value = "warn,tuwunel=info" } volume_mount { name = "data" - mount_path = "/data" - } - volume_mount { - name = "extra-packages" - mount_path = "/extra-packages" + mount_path = "/var/lib/tuwunel" } resources { requests = { - cpu = "25m" + cpu = "100m" memory = "256Mi" } limits = { - memory = "512Mi" + memory = "1Gi" } } } @@ -209,10 +215,6 @@ resource "kubernetes_deployment" "matrix" { claim_name = kubernetes_persistent_volume_claim.data_encrypted.metadata[0].name } } - volume { - name = "extra-packages" - empty_dir {} - } } } } @@ -224,8 +226,6 @@ resource "kubernetes_deployment" "matrix" { metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2 metadata[0].annotations["keel.sh/match-tag"], spec[0].template[0].spec[0].container[0].image, # KEEL_IGNORE_IMAGE — Keel manages tag updates - spec[0].template[0].spec[0].init_container[0].image, - spec[0].template[0].spec[0].init_container[1].image, metadata[0].annotations["kubernetes.io/change-cause"], metadata[0].annotations["deployment.kubernetes.io/revision"], spec[0].template[0].metadata[0].annotations["keel.sh/update-time"], # KEEL_LIFECYCLE_V1 @@ -269,12 +269,9 @@ module "ingress" { extra_annotations = { "gethomepage.dev/enabled" = "true" "gethomepage.dev/name" = "Matrix" - "gethomepage.dev/description" = "Secure messaging" + "gethomepage.dev/description" = "Secure messaging (tuwunel)" "gethomepage.dev/icon" = "matrix.png" "gethomepage.dev/group" = "Other" "gethomepage.dev/pod-selector" = "" } } - -# CI retrigger 2026-05-16T13:42:57+00:00 — bulk enrollment apply (pipeline #689 killed) -# CI retrigger v2 2026-05-16T13:46:35+00:00 diff --git 
a/stacks/vault/main.tf b/stacks/vault/main.tf index d17c9e01..394a6577 100644 --- a/stacks/vault/main.tf +++ b/stacks/vault/main.tf @@ -660,7 +660,7 @@ resource "vault_database_secret_backend_connection" "postgresql" { "pg-postiz", "pg-instagram-poster", "pg-recruiter-responder", "pg-tripit", "pg-nextcloud-todos", - "pg-matrix", "pg-technitium", + "pg-technitium", ] postgresql { @@ -870,14 +870,6 @@ resource "vault_database_secret_backend_static_role" "pg_tripit" { rotation_period = 604800 } -resource "vault_database_secret_backend_static_role" "pg_matrix" { - backend = vault_mount.database.path - db_name = vault_database_secret_backend_connection.postgresql.name - name = "pg-matrix" - username = "matrix" - rotation_period = 86400 -} - resource "vault_database_secret_backend_static_role" "pg_technitium" { backend = vault_mount.database.path db_name = vault_database_secret_backend_connection.postgresql.name