From 116c4d9c30263c41f2a2c1937e66d79a93085da5 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sun, 22 Feb 2026 15:23:27 +0000 Subject: [PATCH] [ci skip] Remove legacy files and orphaned modules Delete 20 orphaned module directories and 3 stray files from modules/kubernetes/ that are no longer referenced by any stack. Remove 7 root-level legacy files including the empty tfstate, 27MB terraform zip, commented-out main.tf, and migration notes. Clean up commented-out dockerhub_secret and oauth-proxy references in blog, travel_blog, and city-guesser stacks. Remove stale frigate config.yaml entry from .gitignore. Remove ephemeral docs/plans/ directory. --- .claude/CLAUDE.md | 5 +- .gitignore | 2 - .terraform.lock.hcl | 130 -- corefils.yaml | 42 - ...02-13-centralized-log-collection-design.md | 140 -- ...6-02-13-centralized-log-collection-plan.md | 532 ------- ...2026-02-17-multi-user-k8s-access-design.md | 154 -- .../2026-02-17-multi-user-k8s-access-plan.md | 1175 ---------------- ...026-02-21-openclaw-cluster-agent-design.md | 111 -- .../2026-02-21-openclaw-cluster-agent-plan.md | 800 ----------- .../2026-02-22-terragrunt-migration-design.md | 387 ------ .../2026-02-22-terragrunt-migration-plan.md | 1235 ----------------- main.tf | 326 ----- migrate_tfstate.txt | 4 - modules/kubernetes/authelia/main.tf | 178 --- .../kubernetes/authelia/users_database.yml | 10 - modules/kubernetes/authelia/values.yaml | 24 - .../bind/deployment-factory/main.tf | 93 -- modules/kubernetes/bind/extra/viktorbarzin.me | 180 --- modules/kubernetes/bind/main.tf | 77 - .../kubernetes/bind/service-factory/main.tf | 28 - modules/kubernetes/bind/variables.tf | 98 -- modules/kubernetes/discount-bandit/main.tf | 107 -- modules/kubernetes/dnscat2/main.tf | 80 -- modules/kubernetes/dnscrypt/main.tf | 92 -- modules/kubernetes/dockerhub_secret/main.tf | 23 - modules/kubernetes/finance_app/main.tf | 315 ----- .../home_assistant_chart_values.tpl | 74 - modules/kubernetes/home_assistant/main.tf | 238 ---- modules/kubernetes/idrac-power-cycle.sh | 12 - modules/kubernetes/istio/base.yaml | 40 - modules/kubernetes/istio/istiod.yaml | 520 ------- modules/kubernetes/istio/kiali.yaml | 122 -- modules/kubernetes/istio/main.tf | 116 -- modules/kubernetes/jellyfin/main.tf | 117 -- .../kubernetes/kafka/kafka_chart_values.tpl | 9 - modules/kubernetes/kafka/main.tf | 142 -- modules/kubernetes/keyserver/index.md | 73 - modules/kubernetes/keyserver/inventory.ini | 2 - modules/kubernetes/kured/main.tf | 31 - modules/kubernetes/kured/values.yaml | 12 - modules/kubernetes/localai/chart_values.tpl | 93 -- modules/kubernetes/localai/main.tf | 21 - modules/kubernetes/main.tf | 3 - modules/kubernetes/mcaptcha/main.tf | 310 ----- modules/kubernetes/oauth-proxy/main.tf | 400 ------ modules/kubernetes/openid_help_page/main.tf | 87 -- modules/kubernetes/pihole/main.tf | 201 --- modules/kubernetes/vault/chart_values.tpl | 23 - modules/kubernetes/vault/main.tf | 61 - modules/kubernetes/versions.tf | 8 - modules/kubernetes/vikunja/main.tf | 216 --- stacks/blog/main.tf | 8 - stacks/city-guesser/main.tf | 62 - stacks/travel_blog/main.tf | 6 - versions.tf | 49 - 56 files changed, 2 insertions(+), 9402 deletions(-) delete mode 100644 .terraform.lock.hcl delete mode 100644 corefils.yaml delete mode 100644 docs/plans/2026-02-13-centralized-log-collection-design.md delete mode 100644 docs/plans/2026-02-13-centralized-log-collection-plan.md delete mode 100644 docs/plans/2026-02-17-multi-user-k8s-access-design.md delete mode 100644 
docs/plans/2026-02-17-multi-user-k8s-access-plan.md delete mode 100644 docs/plans/2026-02-21-openclaw-cluster-agent-design.md delete mode 100644 docs/plans/2026-02-21-openclaw-cluster-agent-plan.md delete mode 100644 docs/plans/2026-02-22-terragrunt-migration-design.md delete mode 100644 docs/plans/2026-02-22-terragrunt-migration-plan.md delete mode 100644 main.tf delete mode 100644 migrate_tfstate.txt delete mode 100644 modules/kubernetes/authelia/main.tf delete mode 100644 modules/kubernetes/authelia/users_database.yml delete mode 100644 modules/kubernetes/authelia/values.yaml delete mode 100644 modules/kubernetes/bind/deployment-factory/main.tf delete mode 100644 modules/kubernetes/bind/extra/viktorbarzin.me delete mode 100644 modules/kubernetes/bind/main.tf delete mode 100644 modules/kubernetes/bind/service-factory/main.tf delete mode 100644 modules/kubernetes/bind/variables.tf delete mode 100644 modules/kubernetes/discount-bandit/main.tf delete mode 100644 modules/kubernetes/dnscat2/main.tf delete mode 100644 modules/kubernetes/dnscrypt/main.tf delete mode 100644 modules/kubernetes/dockerhub_secret/main.tf delete mode 100644 modules/kubernetes/finance_app/main.tf delete mode 100644 modules/kubernetes/home_assistant/home_assistant_chart_values.tpl delete mode 100644 modules/kubernetes/home_assistant/main.tf delete mode 100644 modules/kubernetes/idrac-power-cycle.sh delete mode 100644 modules/kubernetes/istio/base.yaml delete mode 100644 modules/kubernetes/istio/istiod.yaml delete mode 100644 modules/kubernetes/istio/kiali.yaml delete mode 100644 modules/kubernetes/istio/main.tf delete mode 100644 modules/kubernetes/jellyfin/main.tf delete mode 100644 modules/kubernetes/kafka/kafka_chart_values.tpl delete mode 100644 modules/kubernetes/kafka/main.tf delete mode 100644 modules/kubernetes/keyserver/index.md delete mode 100644 modules/kubernetes/keyserver/inventory.ini delete mode 100644 modules/kubernetes/kured/main.tf delete mode 100644 modules/kubernetes/kured/values.yaml delete mode 100644 modules/kubernetes/localai/chart_values.tpl delete mode 100644 modules/kubernetes/localai/main.tf delete mode 100644 modules/kubernetes/main.tf delete mode 100644 modules/kubernetes/mcaptcha/main.tf delete mode 100644 modules/kubernetes/oauth-proxy/main.tf delete mode 100644 modules/kubernetes/openid_help_page/main.tf delete mode 100644 modules/kubernetes/pihole/main.tf delete mode 100644 modules/kubernetes/vault/chart_values.tpl delete mode 100644 modules/kubernetes/vault/main.tf delete mode 100644 modules/kubernetes/versions.tf delete mode 100644 modules/kubernetes/vikunja/main.tf delete mode 100644 versions.tf diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md index 87927201..4fe01c15 100755 --- a/.claude/CLAUDE.md +++ b/.claude/CLAUDE.md @@ -39,9 +39,8 @@ Terragrunt-based infrastructure repository managing a home Kubernetes cluster on - **Per-stack state**: `state/stacks//terraform.tfstate` - Per-stack state files (gitignored) - **Service resources**: `stacks//main.tf` - Service resources defined directly in stack root - **Platform modules**: `stacks/platform/modules//` - Platform service modules -- **Shared modules**: `modules/kubernetes/ingress_factory/`, `modules/kubernetes/setup_tls_secret/`, `modules/kubernetes/dockerhub_secret/`, `modules/kubernetes/oauth-proxy/` +- **Shared modules**: `modules/kubernetes/ingress_factory/`, `modules/kubernetes/setup_tls_secret/` - **Secrets**: `secrets/` - git-crypt encrypted TLS certs and keys -- **Legacy (unused)**: `main.tf`, `modules/kubernetes/main.tf` 
- Old monolithic entry points (kept for reference) ## Network Topology (Static IPs) ``` @@ -81,7 +80,7 @@ Terragrunt-based infrastructure repository managing a home Kubernetes cluster on - `stacks/platform/` - Core infrastructure (22 services in `stacks/platform/modules/`) - `stacks//` - Individual service stacks (resources directly in `main.tf`) - `stacks/platform/modules//` - Platform service module source code -- `modules/kubernetes/` - **Only shared utility modules**: `ingress_factory/`, `setup_tls_secret/`, `dockerhub_secret/`, `oauth-proxy/` +- `modules/kubernetes/` - **Only shared utility modules**: `ingress_factory/`, `setup_tls_secret/` - `modules/create-vm/` - Proxmox VM creation module - `state/` - Per-stack Terraform state files (gitignored) - `secrets/` - Encrypted secrets (TLS certs, keys) via git-crypt diff --git a/.gitignore b/.gitignore index 11e07321..24e79937 100755 --- a/.gitignore +++ b/.gitignore @@ -35,8 +35,6 @@ override.tf.json git_crypt.key -modules/kubernetes/frigate/config.yaml - # Claude Code - temporary/sensitive files .claude/cmd_input.txt .claude/cmd_output.txt diff --git a/.terraform.lock.hcl b/.terraform.lock.hcl deleted file mode 100644 index 68e2effc..00000000 --- a/.terraform.lock.hcl +++ /dev/null @@ -1,130 +0,0 @@ -# This file is maintained automatically by "terraform init". -# Manual edits may be lost in future updates. - -provider "registry.terraform.io/cloudflare/cloudflare" { - version = "4.52.5" - constraints = "~> 4.0" - hashes = [ - "h1:+rfzF+16ZcWZWnTyW/p1HHTzYbPKX8Zt2nIFtR/+f+E=", - "h1:18bXaaOSq8MWKuMxo/4y7EB7/i7G90y5QsKHZRmkoDo=", - "zh:1a3400cb38863b2585968d1876706bcfc67a148e1318a1d325c6c7704adc999b", - "zh:4c5062cb9e9da1676f06ae92b8370186d98976cc4c7030d3cd76df12af54282a", - "zh:52110f493b5f0587ef77a1cfd1a67001fd4c617b14c6502d732ab47352bdc2f7", - "zh:5aa536f9eaeb43823aaf2aa80e7d39b25ef2b383405ed034aa16a28b446a9238", - "zh:5cc39459a1c6be8a918f17054e4fbba573825ed5597dcada588fe99614d98a5b", - "zh:629ae6a7ba298815131da826474d199312d21cec53a4d5ded4fa56a692e6f072", - "zh:719cc7c75dc1d3eb30c22ff5102a017996d9788b948078c7e1c5b3446aeca661", - "zh:8698635a3ca04383c1e93b21d6963346bdae54d27177a48e4b1435b7f731731c", - "zh:890df766e9b839623b1f0437355032a3c006226a6c200cd911e15ee1a9014e9f", - "zh:8a9993f1dcadf1dd6ca43b23348abe374605d29945a2fafc07fb3457644e6a54", - "zh:b1b9a1e6bcc24d5863a664a411d2dc906373ae7a2399d2d65548ce7377057852", - "zh:b270184cdeec277218e84b94cb136fead753da717f9b9dc378e51907f3f00bb0", - "zh:dff2bc10071210181726ce270f954995fe42c696e61e2e8f874021fed02521e5", - "zh:e8e87b40b6a87dc097b0fdc20d3f725cec0d82abc9cc3755c1f89f8f6e8b0036", - "zh:ee964a6573d399a5dd22ce328fb38ca1207797a02248f14b2e4913ee390e7803", - ] -} - -provider "registry.terraform.io/hashicorp/helm" { - version = "3.1.1" - hashes = [ - "h1:47CqNwkxctJtL/N/JuEj+8QMg8mRNI/NWeKO5/ydfZU=", - "h1:5b2ojWKT0noujHiweCds37ZreRFRQLNaErdJLusJN88=", - "zh:1a6d5ce931708aec29d1f3d9e360c2a0c35ba5a54d03eeaff0ce3ca597cd0275", - "zh:3411919ba2a5941801e677f0fea08bdd0ae22ba3c9ce3309f55554699e06524a", - "zh:81b36138b8f2320dc7f877b50f9e38f4bc614affe68de885d322629dd0d16a29", - "zh:95a2a0a497a6082ee06f95b38bd0f0d6924a65722892a856cfd914c0d117f104", - "zh:9d3e78c2d1bb46508b972210ad706dd8c8b106f8b206ecf096cd211c54f46990", - "zh:a79139abf687387a6efdbbb04289a0a8e7eaca2bd91cdc0ce68ea4f3286c2c34", - "zh:aaa8784be125fbd50c48d84d6e171d3fb6ef84a221dbc5165c067ce05faab4c8", - "zh:afecd301f469975c9d8f350cc482fe656e082b6ab0f677d1a816c3c615837cc1", - 
"zh:c54c22b18d48ff9053d899d178d9ffef7d9d19785d9bf310a07d648b7aac075b", - "zh:db2eefd55aea48e73384a555c72bac3f7d428e24147bedb64e1a039398e5b903", - "zh:ee61666a233533fd2be971091cecc01650561f1585783c381b6f6e8a390198a4", - "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", - ] -} - -provider "registry.terraform.io/hashicorp/kubernetes" { - version = "3.0.1" - constraints = "3.0.1" - hashes = [ - "h1:P0c8knzZnouTNFIRij8IS7+pqd0OKaFDYX0j4GRsiqo=", - "h1:vyHdH0p6bf9xp1NPePObAJkXTJb/I09FQQmmevTzZe0=", - "zh:02d55b0b2238fd17ffa12d5464593864e80f402b90b31f6e1bd02249b9727281", - "zh:20b93a51bfeed82682b3c12f09bac3031f5bdb4977c47c97a042e4df4fb2f9ba", - "zh:6e14486ecfaee38c09ccf33d4fdaf791409f90795c1b66e026c226fad8bc03c7", - "zh:8d0656ff422df94575668e32c310980193fccb1c28117e5c78dd2d4050a760a6", - "zh:9795119b30ec0c1baa99a79abace56ac850b6e6fbce60e7f6067792f6eb4b5f4", - "zh:b388c87acc40f6bd9620f4e23f01f3c7b41d9b88a68d5255dec0a72f0bdec249", - "zh:b59abd0a980649c2f97f172392f080eaeb18e486b603f83bf95f5d93aeccc090", - "zh:ba6e3060fddf4a022087d8f09e38aa0001c705f21170c2ded3d1c26c12f70d97", - "zh:c12626d044b1d5501cf95ca78cbe507c13ad1dd9f12d4736df66eb8e5f336eb8", - "zh:c55203240d50f4cdeb3df1e1760630d677679f5b1a6ffd9eba23662a4ad05119", - "zh:ea206a5a32d6e0d6e32f1849ad703da9a28355d9c516282a8458b5cf1502b2a1", - "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", - ] -} - -provider "registry.terraform.io/hashicorp/null" { - version = "3.2.4" - hashes = [ - "h1:L5V05xwp/Gto1leRryuesxjMfgZwjb7oool4WS1UEFQ=", - "h1:hkf5w5B6q8e2A42ND2CjAvgvSN3puAosDmOJb3zCVQM=", - "zh:59f6b52ab4ff35739647f9509ee6d93d7c032985d9f8c6237d1f8a59471bbbe2", - "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", - "zh:795c897119ff082133150121d39ff26cb5f89a730a2c8c26f3a9c1abf81a9c43", - "zh:7b9c7b16f118fbc2b05a983817b8ce2f86df125857966ad356353baf4bff5c0a", - "zh:85e33ab43e0e1726e5f97a874b8e24820b6565ff8076523cc2922ba671492991", - "zh:9d32ac3619cfc93eb3c4f423492a8e0f79db05fec58e449dee9b2d5873d5f69f", - "zh:9e15c3c9dd8e0d1e3731841d44c34571b6c97f5b95e8296a45318b94e5287a6e", - "zh:b4c2ab35d1b7696c30b64bf2c0f3a62329107bd1a9121ce70683dec58af19615", - "zh:c43723e8cc65bcdf5e0c92581dcbbdcbdcf18b8d2037406a5f2033b1e22de442", - "zh:ceb5495d9c31bfb299d246ab333f08c7fb0d67a4f82681fbf47f2a21c3e11ab5", - "zh:e171026b3659305c558d9804062762d168f50ba02b88b231d20ec99578a6233f", - "zh:ed0fe2acdb61330b01841fa790be00ec6beaac91d41f311fb8254f74eb6a711f", - ] -} - -provider "registry.terraform.io/hashicorp/random" { - version = "3.8.1" - hashes = [ - "h1:Eexl06+6J+s75uD46+WnZtpJZYRVUMB0AiuPBifK6Jc=", - "h1:u8AKlWVDTH5r9YLSeswoVEjiY72Rt4/ch7U+61ZDkiQ=", - "zh:08dd03b918c7b55713026037c5400c48af5b9f468f483463321bd18e17b907b4", - "zh:0eee654a5542dc1d41920bbf2419032d6f0d5625b03bd81339e5b33394a3e0ae", - "zh:229665ddf060aa0ed315597908483eee5b818a17d09b6417a0f52fd9405c4f57", - "zh:2469d2e48f28076254a2a3fc327f184914566d9e40c5780b8d96ebf7205f8bc0", - "zh:37d7eb334d9561f335e748280f5535a384a88675af9a9eac439d4cfd663bcb66", - "zh:741101426a2f2c52dee37122f0f4a2f2d6af6d852cb1db634480a86398fa3511", - "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", - "zh:a902473f08ef8df62cfe6116bd6c157070a93f66622384300de235a533e9d4a9", - "zh:b85c511a23e57a2147355932b3b6dce2a11e856b941165793a0c3d7578d94d05", - "zh:c5172226d18eaac95b1daac80172287b69d4ce32750c82ad77fa0768be4ea4b8", - "zh:dab4434dba34aad569b0bc243c2d3f3ff86dd7740def373f2a49816bd2ff819b", - "zh:f49fd62aa8c5525a5c17abd51e27ca5e213881d58882fd42fec4a545b53c9699", - ] -} 
- -provider "registry.terraform.io/telmate/proxmox" { - version = "3.0.2-rc07" - constraints = "3.0.2-rc07" - hashes = [ - "h1:0UpRJ8PFsu9lhD3p2KUdUNVsDPbjZLPR46wYRpt1dxc=", - "h1:zp5hpQJQ4t4zROSLqdltVpBO+Riy9VugtfFbpyTw1aM=", - "zh:2ee860cd0a368b3eaa53f4a9ea46f16dab8a97929e813ea6ef55183f8112c2ca", - "zh:415965fd915bae2040d7f79e45f64d6e3ae61149c10114efeac1b34687d7296c", - "zh:6584b2055df0e32062561c615e3b6b2c291ca8c959440adda09ef3ec1e1436bd", - "zh:65dcfad71928e0a8dd9befc22524ed686be5020b0024dc5cca5184c7420eeb6b", - "zh:7253dc29bd265d33f2791ac4f779c5413f16720bb717de8e6c5fcb2c858648ea", - "zh:7ec8993da10a47606670f9f67cfd10719a7580641d11c7aa761121c4a2bd66fb", - "zh:999a3f7a9dcf517967fc537e6ec930a8172203642fb01b8e1f78f908373db210", - "zh:a50e6df7280eb6584a5fd2456e3f5b6df13b2ec8a7fa4605511e438e1863be42", - "zh:b25b329a1e42681c509d027fee0365414f0cc5062b65690cfc3386aab16132ae", - "zh:c028877fdb438ece48f7bc02b65bbae9ca7b7befbd260e519ccab6c0cbb39f26", - "zh:cf0eaa3ea9fcc6d62793637947f1b8d7c885b6ad74695ab47e134e4ff132190f", - "zh:d5ade3fae031cc629b7c512a7b60e46570f4c41665e88a595d7efd943dde5ab2", - "zh:f388c15ad1ecfc09e7361e3b98bae9b627a3a85f7b908c9f40650969c949901c", - "zh:f415cc6f735a3971faae6ac24034afdb9ee83373ef8de19a9631c187d5adc7db", - ] -} diff --git a/corefils.yaml b/corefils.yaml deleted file mode 100644 index d797ee9e..00000000 --- a/corefils.yaml +++ /dev/null @@ -1,42 +0,0 @@ -apiVersion: v1 -data: - Corefile: | - .:53 { - #log - errors - health { - lameduck 5s - } - ready - kubernetes cluster.local in-addr.arpa ip6.arpa { - pods insecure - fallthrough in-addr.arpa ip6.arpa - ttl 30 - } - prometheus :9153 - #forward . 1.1.1.1 - forward . 10.0.20.1 - #forward . /etc/resolv.conf - cache { - success 10000 300 6 - denial 10000 300 60 - } - loop - reload - loadbalance - } - viktorbarzin.lan:53 { - #log - errors - #forward . 10.102.184.76 - forward . 10.0.20.101:30053 # this must be the same as the technitium nodeport svc - #forward . technitium-dns.technitium.svc.cluster.local - cache { - success 10000 300 6 - denial 10000 300 60 - } - } -kind: ConfigMap -metadata: - name: coredns - namespace: kube-system diff --git a/docs/plans/2026-02-13-centralized-log-collection-design.md b/docs/plans/2026-02-13-centralized-log-collection-design.md deleted file mode 100644 index 5f8ca12c..00000000 --- a/docs/plans/2026-02-13-centralized-log-collection-design.md +++ /dev/null @@ -1,140 +0,0 @@ -# Centralized Log Collection Design - -## Date: 2026-02-13 - -## Goal - -Centrally collect logs from all Kubernetes pods for monitoring and alerting. Minimize disk I/O by holding logs in memory for extended periods, flushing to NFS once daily. Alert on log patterns via existing Alertmanager pipeline. 
- -## Requirements - -- **Primary use case**: Monitoring and alerting (log-based alert rules evaluated in real-time) -- **Retention**: 7 days on disk after flush -- **Memory budget**: 4-8GB total (~6.6GB used) -- **Disk strategy**: 24h in-memory chunks, WAL on tmpfs, single daily flush to NFS -- **Crash policy**: Accept up to 24h log loss on pod/node crash (alerts still fire in real-time before flush) -- **Alert delivery**: Loki Ruler -> existing Alertmanager -> Slack/email - -## Architecture - -``` -┌──────────────────┐ ┌──────────────────────┐ ┌──────────────┐ -│ Alloy DaemonSet │ │ Loki SingleBinary │ │ Grafana │ -│ 5 pods, 128Mi ea │────>│ 1 pod, 6Gi RAM │<────│ (existing) │ -│ tails /var/log/ │ │ │ │ + Loki │ -│ pods on each node│ │ Ingester: 24h chunks │ │ datasource │ -└──────────────────┘ │ WAL: tmpfs (in-memory) │ └──────────────┘ - │ Storage: NFS 15Gi │ -┌──────────────────┐ │ Ruler ──> Alertmanager │ -│ Sysctl DaemonSet │ └──────────────────────┘ -│ 5 pods (pause) │ -│ sets inotify │ -│ limits on nodes │ -└──────────────────┘ -``` - -## Components - -### 1. Sysctl DaemonSet - -Solves the `too many open files` / fsnotify watcher exhaustion problem that previously blocked Alloy. - -- Privileged init container runs `sysctl -w` on each node -- Settings: `fs.inotify.max_user_watches=1048576`, `fs.inotify.max_user_instances=512`, `fs.inotify.max_queued_events=1048576` -- Main container: `pause` image (near-zero resources) -- Survives node reboots (DaemonSet recreates pod) -- Namespace: `monitoring` - -### 2. Loki (Helm Release) - -Single-binary deployment. Existing Helm chart config in `loki.yaml`, updated with: - -**Ingester tuning (disk-friendly):** -- `chunk_idle_period: 12h` — don't flush idle streams quickly -- `max_chunk_age: 24h` — hold chunks in memory for full day -- `chunk_retain_period: 1m` — brief retain after flush -- `chunk_target_size: 1572864` (1.5MB) — larger chunks = fewer writes -- WAL: tmpfs emptyDir (`medium: Memory`, 2Gi limit) - -**Retention:** -- `retention_period: 168h` (7 days) -- Compactor enabled for retention enforcement - -**Ruler:** -- Evaluates LogQL alert rules in real-time (before chunk flush) -- Fires to `http://prometheus-alertmanager.monitoring.svc.cluster.local:9093` - -**Storage:** -- NFS PV/PVC at `/mnt/main/loki/loki` (15Gi, existing) -- TSDB index with 24h period - -**Resources:** -- Memory: 6Gi limit -- CPU: 1 limit - -### 3. Alloy (Helm Release) - -DaemonSet log collector. Existing config in `alloy.yaml` is complete: -- Discovers pods via `discovery.kubernetes` -- Labels: namespace, pod, container, app, job, container_runtime, cluster -- Tails `/var/log/pods/` on each node -- Forwards to `http://loki.monitoring.svc.cluster.local:3100/loki/api/v1/push` - -**Resources per pod:** -- Memory: 128Mi limit -- CPU: 200m limit - -### 4. Grafana Datasource - -ConfigMap with label `grafana_datasource: "1"` for sidecar auto-discovery: -- Name: Loki -- Type: loki -- URL: `http://loki.monitoring.svc.cluster.local:3100` -- Existing `loki.json` dashboard already in dashboards directory - -### 5. 
Starter Alert Rules - -Configured in Loki Ruler (evaluated in real-time, before disk flush): - -| Alert | LogQL Expression | Severity | -|-------|-----------------|----------| -| HighErrorRate | `sum(rate({namespace=~".+"} \|= "error" [5m])) by (namespace) > 10` | warning | -| PodCrashLoopBackOff | `count_over_time({namespace=~".+"} \|= "CrashLoopBackOff" [5m]) > 0` | critical | -| OOMKilled | `count_over_time({namespace=~".+"} \|= "OOMKilled" [5m]) > 0` | critical | - -## Memory Budget - -| Component | Per-pod | Pods | Total | -|-----------|---------|------|-------| -| Alloy | 128Mi | 5 | 640Mi | -| Loki | 6Gi | 1 | 6Gi | -| Sysctl DS | ~0 (pause) | 5 | ~0 | -| **Total** | | | **~6.6 GB** | - -## Files to Change - -| File | Action | -|------|--------| -| `modules/kubernetes/monitoring/loki.tf` | Uncomment Loki + Alloy helm releases, add sysctl DaemonSet, add Grafana Loki datasource ConfigMap | -| `modules/kubernetes/monitoring/loki.yaml` | Update with ingester tuning, ruler config, retention, resource limits | -| `modules/kubernetes/monitoring/alloy.yaml` | Add resource limits in Helm values wrapper | -| `secrets/nfs_directories.txt` | Ensure `/mnt/main/loki` entries exist | - -## Implementation Steps - -1. Add sysctl DaemonSet to `loki.tf` -2. Update `loki.yaml` with disk-friendly tuning, ruler, retention, resources -3. Update `alloy.yaml` with resource limits -4. Uncomment Loki Helm release in `loki.tf`, wire up NFS PV/PVC -5. Uncomment Alloy Helm release in `loki.tf` -6. Add Grafana Loki datasource ConfigMap to `loki.tf` -7. Add alert rules to Loki config -8. Ensure NFS exports exist in `secrets/nfs_directories.txt` -9. `terraform apply -target=module.kubernetes_cluster.module.monitoring` -10. Verify: Grafana Explore -> Loki datasource -> query `{namespace="monitoring"}` - -## Risks - -- **24h data loss on crash**: Accepted trade-off. Alerts fire in real-time before flush, so alert coverage is not affected — only historical log browsing is at risk. -- **Memory pressure**: 6Gi for Loki on a 16GB node is significant. Monitor with existing Prometheus memory alerts. -- **Log volume spikes**: A chatty pod could cause Loki to OOM. Alloy can be configured with rate limiting if needed (future enhancement). diff --git a/docs/plans/2026-02-13-centralized-log-collection-plan.md b/docs/plans/2026-02-13-centralized-log-collection-plan.md deleted file mode 100644 index 3cf38fdb..00000000 --- a/docs/plans/2026-02-13-centralized-log-collection-plan.md +++ /dev/null @@ -1,532 +0,0 @@ -# Centralized Log Collection Implementation Plan - -> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. - -**Goal:** Deploy Loki + Alloy for centralized Kubernetes log collection with 24h in-memory chunks, 7-day disk retention, and log-based alerting via existing Alertmanager. - -**Architecture:** Alloy DaemonSet tails pod logs on all 5 nodes, forwards to single-binary Loki which holds chunks in 6Gi RAM for 24h before flushing to NFS. Loki Ruler evaluates LogQL alert rules in real-time and fires to Alertmanager. Grafana gets a Loki datasource via sidecar auto-provisioning. - -**Tech Stack:** Terraform, Helm (Loki chart, Alloy chart), Kubernetes DaemonSet, NFS, Grafana - -**Design doc:** `docs/plans/2026-02-13-centralized-log-collection-design.md` - ---- - -### Task 1: Add sysctl DaemonSet for inotify limits - -Alloy uses fsnotify to tail log files. Default kernel limits cause "too many open files" errors. This DaemonSet sets the limits on every node persistently. 
- -**Files:** -- Modify: `modules/kubernetes/monitoring/loki.tf` (replace the comment block at lines 67-71) - -**Step 1: Write the sysctl DaemonSet resource** - -Replace lines 67-71 (the comment block about sysctl) with this Terraform resource in `loki.tf`: - -```hcl -resource "kubernetes_daemon_set_v1" "sysctl-inotify" { - metadata { - name = "sysctl-inotify" - namespace = kubernetes_namespace.monitoring.metadata[0].name - labels = { - app = "sysctl-inotify" - } - } - spec { - selector { - match_labels = { - app = "sysctl-inotify" - } - } - template { - metadata { - labels = { - app = "sysctl-inotify" - } - } - spec { - init_container { - name = "sysctl" - image = "busybox:1.37" - command = [ - "sh", "-c", - "sysctl -w fs.inotify.max_user_watches=1048576 && sysctl -w fs.inotify.max_user_instances=512 && sysctl -w fs.inotify.max_queued_events=1048576" - ] - security_context { - privileged = true - } - } - container { - name = "pause" - image = "registry.k8s.io/pause:3.10" - resources { - requests = { - cpu = "1m" - memory = "4Mi" - } - limits = { - cpu = "1m" - memory = "4Mi" - } - } - } - host_pid = true - toleration { - operator = "Exists" - } - } - } - } -} -``` - -**Step 2: Run terraform fmt** - -Run: `terraform fmt -recursive modules/kubernetes/monitoring/` - -**Step 3: Run terraform plan to verify** - -Run: `terraform plan -target=module.kubernetes_cluster.module.monitoring -var="kube_config_path=$(pwd)/config" 2>&1 | tail -30` -Expected: Plan shows 1 resource to add (kubernetes_daemon_set_v1.sysctl-inotify) - -**Step 4: Commit** - -```bash -git add modules/kubernetes/monitoring/loki.tf -git commit -m "[ci skip] Add sysctl DaemonSet for inotify limits" -``` - ---- - -### Task 2: Update Loki Helm values with disk-friendly tuning - -Configure ingester for 24h in-memory chunks, WAL on tmpfs, 7-day retention, ruler for alerting, and resource limits. 
- -**Files:** -- Modify: `modules/kubernetes/monitoring/loki.yaml` (full rewrite) - -**Step 1: Write updated loki.yaml** - -Replace entire contents of `loki.yaml` with: - -```yaml -loki: - commonConfig: - replication_factor: 1 - schemaConfig: - configs: - - from: "2025-04-01" - store: tsdb - object_store: filesystem - schema: v13 - index: - prefix: loki_index_ - period: 24h - ingester: - chunk_idle_period: 12h - max_chunk_age: 24h - chunk_retain_period: 1m - chunk_target_size: 1572864 - wal: - dir: /loki-wal - pattern_ingester: - enabled: true - limits_config: - allow_structured_metadata: true - volume_enabled: true - retention_period: 168h - compactor: - retention_enabled: true - working_directory: /loki/compactor - compaction_interval: 1h - delete_request_store: filesystem - ruler: - enable_api: true - storage: - type: local - local: - directory: /loki/rules - alertmanager_url: http://alertmanager.monitoring.svc.cluster.local:9093 - ring: - kvstore: - store: inmemory - rule_path: /loki/scratch - storage: - type: "filesystem" - auth_enabled: false - -minio: - enabled: false - -deploymentMode: SingleBinary - -singleBinary: - replicas: 1 - persistence: - enabled: true - size: 15Gi - storageClass: "" - extraVolumes: - - name: wal - emptyDir: - medium: Memory - sizeLimit: 2Gi - - name: rules - configMap: - name: loki-alert-rules - extraVolumeMounts: - - name: wal - mountPath: /loki-wal - - name: rules - mountPath: /loki/rules/fake - resources: - requests: - cpu: 250m - memory: 4Gi - limits: - cpu: "1" - memory: 6Gi - -# Zero out replica counts of other deployment modes -backend: - replicas: 0 -read: - replicas: 0 -write: - replicas: 0 -ingester: - replicas: 0 -querier: - replicas: 0 -queryFrontend: - replicas: 0 -queryScheduler: - replicas: 0 -distributor: - replicas: 0 -compactor: - replicas: 0 -indexGateway: - replicas: 0 -bloomCompactor: - replicas: 0 -bloomGateway: - replicas: 0 -``` - -**Step 2: Commit** - -```bash -git add modules/kubernetes/monitoring/loki.yaml -git commit -m "[ci skip] Update Loki config with disk-friendly tuning and ruler" -``` - ---- - -### Task 3: Update Alloy Helm values with resource limits - -The Alloy config content is already complete. Wrap it in proper Helm values with resource limits. - -**Files:** -- Modify: `modules/kubernetes/monitoring/alloy.yaml` (add resource limits) - -**Step 1: Add resource limits to alloy.yaml** - -Append after the existing `alloy.configMap.content` block (after the last line): - -```yaml - - # Resource limits for DaemonSet pods - resources: - requests: - cpu: 50m - memory: 64Mi - limits: - cpu: 200m - memory: 128Mi -``` - -The final file should have the `alloy.configMap.content` block unchanged, with `alloy.resources` added as a sibling under `alloy:`. - -**Step 2: Commit** - -```bash -git add modules/kubernetes/monitoring/alloy.yaml -git commit -m "[ci skip] Add resource limits to Alloy config" -``` - ---- - -### Task 4: Uncomment Loki Helm release and PV in loki.tf - -Enable the Loki Helm release and its NFS persistent volume. Remove minio PV (not needed with filesystem storage). 
- -**Files:** -- Modify: `modules/kubernetes/monitoring/loki.tf` (uncomment Loki resources, remove minio PV) - -**Step 1: Uncomment the Loki Helm release (lines 1-12)** - -Uncomment and update the helm_release to: - -```hcl -resource "helm_release" "loki" { - namespace = kubernetes_namespace.monitoring.metadata[0].name - create_namespace = true - name = "loki" - - repository = "https://grafana.github.io/helm-charts" - chart = "loki" - - values = [templatefile("${path.module}/loki.yaml", {})] - timeout = 300 - - depends_on = [kubernetes_config_map.loki_alert_rules] -} -``` - -**Step 2: Uncomment the Loki NFS PV (lines 14-32)** - -Uncomment the `kubernetes_persistent_volume.loki` resource as-is. - -**Step 3: Remove the minio PV block (lines 34-52)** - -Delete the entire `kubernetes_persistent_volume.loki-minio` commented block — minio is disabled. - -**Step 4: Run terraform fmt** - -Run: `terraform fmt -recursive modules/kubernetes/monitoring/` - -**Step 5: Commit** - -```bash -git add modules/kubernetes/monitoring/loki.tf -git commit -m "[ci skip] Enable Loki Helm release and NFS PV" -``` - ---- - -### Task 5: Uncomment Alloy Helm release in loki.tf - -Enable the Alloy Helm release. - -**Files:** -- Modify: `modules/kubernetes/monitoring/loki.tf` (uncomment Alloy helm release) - -**Step 1: Uncomment and update the Alloy Helm release** - -Replace the commented Alloy block with: - -```hcl -resource "helm_release" "alloy" { - namespace = kubernetes_namespace.monitoring.metadata[0].name - create_namespace = true - name = "alloy" - - repository = "https://grafana.github.io/helm-charts" - chart = "alloy" - - values = [file("${path.module}/alloy.yaml")] - atomic = true - - depends_on = [helm_release.loki] -} -``` - -**Step 2: Run terraform fmt** - -Run: `terraform fmt -recursive modules/kubernetes/monitoring/` - -**Step 3: Commit** - -```bash -git add modules/kubernetes/monitoring/loki.tf -git commit -m "[ci skip] Enable Alloy Helm release" -``` - ---- - -### Task 6: Add Grafana Loki datasource ConfigMap - -Grafana's sidecar auto-discovers ConfigMaps with label `grafana_datasource: "1"`. Create one for Loki. - -**Files:** -- Modify: `modules/kubernetes/monitoring/loki.tf` (add ConfigMap resource) - -**Step 1: Add the datasource ConfigMap** - -Add to `loki.tf`: - -```hcl -resource "kubernetes_config_map" "grafana_loki_datasource" { - metadata { - name = "grafana-loki-datasource" - namespace = kubernetes_namespace.monitoring.metadata[0].name - labels = { - grafana_datasource = "1" - } - } - data = { - "loki-datasource.yaml" = yamlencode({ - apiVersion = 1 - datasources = [{ - name = "Loki" - type = "loki" - access = "proxy" - url = "http://loki.monitoring.svc.cluster.local:3100" - isDefault = false - }] - }) - } -} -``` - -**Step 2: Run terraform fmt** - -Run: `terraform fmt -recursive modules/kubernetes/monitoring/` - -**Step 3: Commit** - -```bash -git add modules/kubernetes/monitoring/loki.tf -git commit -m "[ci skip] Add Grafana Loki datasource ConfigMap" -``` - ---- - -### Task 7: Add Loki alert rules ConfigMap - -Create the ConfigMap that Loki's ruler reads for alert rules. Mounted into the Loki pod at `/loki/rules/fake/`. 
- -**Files:** -- Modify: `modules/kubernetes/monitoring/loki.tf` (add alert rules ConfigMap) - -**Step 1: Add the alert rules ConfigMap** - -Add to `loki.tf`: - -```hcl -resource "kubernetes_config_map" "loki_alert_rules" { - metadata { - name = "loki-alert-rules" - namespace = kubernetes_namespace.monitoring.metadata[0].name - } - data = { - "rules.yaml" = yamlencode({ - groups = [{ - name = "log-alerts" - rules = [ - { - alert = "HighErrorRate" - expr = "sum(rate({namespace=~\".+\"} |= \"error\" [5m])) by (namespace) > 10" - for = "5m" - labels = { - severity = "warning" - } - annotations = { - summary = "High error rate in {{ $labels.namespace }}" - } - }, - { - alert = "PodCrashLoopBackOff" - expr = "count_over_time({namespace=~\".+\"} |= \"CrashLoopBackOff\" [5m]) > 0" - for = "1m" - labels = { - severity = "critical" - } - annotations = { - summary = "CrashLoopBackOff detected in {{ $labels.namespace }}" - } - }, - { - alert = "OOMKilled" - expr = "count_over_time({namespace=~\".+\"} |= \"OOMKilled\" [5m]) > 0" - for = "1m" - labels = { - severity = "critical" - } - annotations = { - summary = "OOMKilled detected in {{ $labels.namespace }}" - } - } - ] - }] - }) - } -} -``` - -**Step 2: Run terraform fmt** - -Run: `terraform fmt -recursive modules/kubernetes/monitoring/` - -**Step 3: Commit** - -```bash -git add modules/kubernetes/monitoring/loki.tf -git commit -m "[ci skip] Add Loki alert rules ConfigMap" -``` - ---- - -### Task 8: Deploy and verify - -Apply all changes via Terraform and verify the stack is working. - -**Files:** None (deployment only) - -**Step 1: Run terraform apply for monitoring module** - -Run: `terraform apply -target=module.kubernetes_cluster.module.monitoring -var="kube_config_path=$(pwd)/config" -auto-approve` -Expected: Multiple resources created (sysctl DaemonSet, Loki Helm release, Alloy Helm release, PV, ConfigMaps) - -**Step 2: Verify sysctl DaemonSet is running on all nodes** - -Run: `kubectl --kubeconfig $(pwd)/config get ds -n monitoring sysctl-inotify` -Expected: DESIRED=5, CURRENT=5, READY=5 - -**Step 3: Verify Loki pod is running** - -Run: `kubectl --kubeconfig $(pwd)/config get pods -n monitoring -l app.kubernetes.io/name=loki` -Expected: 1/1 Running - -**Step 4: Verify Alloy DaemonSet is running** - -Run: `kubectl --kubeconfig $(pwd)/config get ds -n monitoring -l app.kubernetes.io/name=alloy` -Expected: DESIRED=5, CURRENT=5, READY=5 - -**Step 5: Verify Loki is receiving logs** - -Run: `kubectl --kubeconfig $(pwd)/config exec -n monitoring deploy/loki -- wget -qO- 'http://localhost:3100/loki/api/v1/labels'` -Expected: JSON response with labels like `namespace`, `pod`, `container` - -**Step 6: Verify Grafana has Loki datasource** - -Open `https://grafana.viktorbarzin.me/explore`, select "Loki" datasource, run query: `{namespace="monitoring"}` -Expected: Log lines from monitoring namespace pods - -**Step 7: Commit final state** - -```bash -git add -A -git commit -m "[ci skip] Deploy centralized log collection (Loki + Alloy)" -``` - ---- - -### Troubleshooting - -**If Alloy pods crash with inotify errors:** -- Check sysctl DaemonSet init logs: `kubectl --kubeconfig $(pwd)/config logs -n monitoring ds/sysctl-inotify -c sysctl` -- Verify sysctl values on node: `kubectl --kubeconfig $(pwd)/config debug node/k8s-node2 -it --image=busybox -- sysctl fs.inotify.max_user_watches` - -**If Loki OOMs:** -- Check memory usage: `kubectl --kubeconfig $(pwd)/config top pod -n monitoring -l app.kubernetes.io/name=loki` -- Reduce `max_chunk_age` from 24h to 12h in 
`loki.yaml` to flush more frequently - -**If Grafana doesn't show Loki datasource:** -- Verify ConfigMap has correct label: `kubectl --kubeconfig $(pwd)/config get cm -n monitoring grafana-loki-datasource -o yaml` -- Restart Grafana sidecar: `kubectl --kubeconfig $(pwd)/config rollout restart deploy -n monitoring grafana` - -**If Loki PV won't bind:** -- Check NFS export exists: `ssh root@10.0.10.15 'showmount -e localhost | grep loki'` -- Run NFS export script: `cd secrets && bash nfs_exports.sh` diff --git a/docs/plans/2026-02-17-multi-user-k8s-access-design.md b/docs/plans/2026-02-17-multi-user-k8s-access-design.md deleted file mode 100644 index 4b17da20..00000000 --- a/docs/plans/2026-02-17-multi-user-k8s-access-design.md +++ /dev/null @@ -1,154 +0,0 @@ -# Multi-User Kubernetes Access Design - -**Date**: 2026-02-17 -**Status**: Approved - -## Problem - -The cluster uses a single `kubernetes-admin` client certificate for all access. There is no way to: -- Give different users different levels of access -- Track who performed which actions -- Enforce resource limits per user -- Onboard new users without sharing admin credentials - -## Decision - -Native OIDC authentication on the kube-apiserver using Authentik as the identity provider, with Terraform-managed RBAC and a self-service Svelte portal for user onboarding. - -### Alternatives Considered - -1. **Pinniped (Concierge + Supervisor)**: Avoids API server changes but adds two components to maintain. Requires Pinniped CLI on user machines. Overkill for a single-cluster setup. -2. **kube-oidc-proxy**: Avoids API server changes but adds a proxy in the request path (single point of failure, extra latency). Sporadic maintenance from JetStack. - -## Architecture - -``` -User → Self-Service Portal → Authentik Login → Download Kubeconfig - │ -User → kubectl (with kubelogin) → kube-apiserver → OIDC validation → Authentik - │ - RBAC evaluation - │ - Audit logging → Alloy → Loki → Grafana -``` - -### User Roles - -| Role | Scope | Access | -|------|-------|--------| -| `admin` | Cluster-wide | Full `cluster-admin` access | -| `power-user` | Cluster-wide | Deploy/manage workloads, view all resources, no RBAC/node modification | -| `namespace-owner` | Specific namespaces | Full `admin` within assigned namespaces only | - -## Components - -### 1. Authentik OIDC Provider - -New OAuth2/OIDC application in Authentik configured via Terraform (`modules/kubernetes/authentik/`). - -- **Application name**: `kubernetes` -- **Provider type**: OAuth2/OpenID Connect -- **Client type**: Public (no client secret, kubelogin uses PKCE) -- **Redirect URIs**: `http://localhost:8000/callback` (kubelogin default) -- **Scopes**: `openid`, `email`, `profile`, `groups` -- **Property mappings**: Include `groups` claim for RBAC group matching - -### 2. kube-apiserver OIDC Flags - -One-time change on k8s-master (`10.0.20.100`), automated via Terraform `null_resource` with `remote-exec`. - -Added to `/etc/kubernetes/manifests/kube-apiserver.yaml`: - -```yaml -- --oidc-issuer-url=https://authentik.viktorbarzin.me/application/o/kubernetes/ -- --oidc-client-id=kubernetes -- --oidc-username-claim=email -- --oidc-groups-claim=groups -``` - -Kubelet auto-restarts the API server pod when the manifest changes. These flags persist through `kubeadm upgrade apply`. - -### 3. 
RBAC (Terraform-managed) - -New module: `modules/kubernetes/rbac/main.tf` - -**User definition** in `terraform.tfvars`: - -```hcl -k8s_users = { - "viktor" = { - role = "admin" - email = "viktor@viktorbarzin.me" - } - "alice" = { - role = "power-user" - email = "alice@example.com" - } - "bob" = { - role = "namespace-owner" - namespaces = ["bob-apps", "bob-dev"] - email = "bob@example.com" - } -} -``` - -**Resources created per role:** - -| Role | Terraform Resources | -|------|-------------------| -| `admin` | `ClusterRoleBinding` → `cluster-admin` for user email | -| `power-user` | Custom `ClusterRole` (workload management, no RBAC/node access) + `ClusterRoleBinding` | -| `namespace-owner` | `Namespace`(s) + `RoleBinding` → built-in `admin` ClusterRole + `ResourceQuota` per namespace | - -### 4. Self-Service Portal - -Svelte (SvelteKit) app at `https://k8s-portal.viktorbarzin.me`. - -**Flow:** -1. User visits portal → Authentik login via Traefik forward auth -2. Portal displays user's role and assigned namespaces -3. User downloads pre-configured kubeconfig (generated server-side) -4. Portal shows setup instructions (install kubectl + kubelogin) - -**Kubeconfig template** includes: -- Cluster: `https://10.0.20.100:6443` with CA cert -- Auth: `exec` credential plugin pointing to kubelogin -- OIDC issuer URL and client ID pre-configured - -**Deployment**: Standard Kubernetes deployment + service + ingress, Terraform-managed like other services. No database needed — user role info read from Kubernetes RBAC bindings or a Terraform-generated ConfigMap. - -### 5. Audit Logging - -Kubernetes audit policy deployed to master via the same `null_resource`. - -**Policy** (`/etc/kubernetes/audit-policy.yaml`): -- `RequestResponse` level for OIDC-authenticated users (captures what they changed) -- `Metadata` level for system/service accounts (keeps volume down) -- Secrets logged at `Metadata` level only (no request/response bodies) - -**Log pipeline**: Audit log file → Alloy (DaemonSet on master) → Loki → Grafana dashboard - -**Grafana dashboard** shows: who accessed what resource, when, from where, and the outcome (allow/deny). - -### 6. Resource Quotas - -Each namespace-owner namespace gets a `ResourceQuota`: - -```hcl -requests.cpu = "2" -requests.memory = "4Gi" -limits.cpu = "4" -limits.memory = "8Gi" -pods = "20" -``` - -Defaults can be overridden per-user via an optional `quota` field in the `k8s_users` variable. - -## Implementation Order - -1. Authentik OIDC application setup -2. kube-apiserver OIDC flag configuration -3. RBAC Terraform module -4. Audit logging -5. Self-service portal -6. Grafana dashboard for audit logs diff --git a/docs/plans/2026-02-17-multi-user-k8s-access-plan.md b/docs/plans/2026-02-17-multi-user-k8s-access-plan.md deleted file mode 100644 index 126fa637..00000000 --- a/docs/plans/2026-02-17-multi-user-k8s-access-plan.md +++ /dev/null @@ -1,1175 +0,0 @@ -# Multi-User Kubernetes Access Implementation Plan - -> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. - -**Goal:** Enable multi-user access to the Kubernetes cluster with OIDC authentication via Authentik, Terraform-managed RBAC, audit logging, and a self-service onboarding portal. - -**Architecture:** Native OIDC on kube-apiserver (4 flags), Authentik as IdP, three user roles (admin/power-user/namespace-owner), SvelteKit portal for kubeconfig distribution, audit logs to Loki/Grafana. 
- -**Tech Stack:** Terraform (RBAC, deployments), Authentik (OIDC), SvelteKit (portal), kubelogin (kubectl plugin), Loki (audit logs) - -**Design document:** `docs/plans/2026-02-17-multi-user-k8s-access-design.md` - ---- - -### Task 1: Create Authentik OIDC Application for Kubernetes - -OAuth2 applications are currently created manually in the Authentik UI (not via Terraform provider). Follow this pattern. - -**Step 1: Create the OAuth2/OIDC application in Authentik** - -Log into Authentik admin at `https://authentik.viktorbarzin.me/if/admin/`. - -1. Go to **Applications → Providers → Create** -2. Select **OAuth2/OpenID Connect** -3. Configure: - - Name: `kubernetes` - - Authorization flow: `default-provider-authorization-implicit-consent` - - Client type: `Public` - - Client ID: `kubernetes` (set manually, don't auto-generate) - - Redirect URIs: `http://localhost:8000/callback` and `http://localhost:18000/callback` (kubelogin defaults) - - Scopes: `openid`, `email`, `profile` - - Subject mode: `Based on the User's Email` - - Include claims in id_token: **Yes** - -4. Go to **Applications → Applications → Create** - - Name: `Kubernetes` - - Slug: `kubernetes` - - Provider: Select the `kubernetes` provider just created - -**Step 2: Create a custom scope mapping for groups** - -1. Go to **Customization → Property Mappings → Create** -2. Select **Scope Mapping** -3. Configure: - - Name: `Kubernetes Groups` - - Scope name: `groups` - - Expression: - ```python - return { - "groups": [group.name for group in request.user.ak_groups.all()] - } - ``` - -4. Go back to the `kubernetes` provider → Edit → add the `Kubernetes Groups` scope mapping - -**Step 3: Create Authentik groups for Kubernetes roles** - -1. Go to **Directory → Groups → Create** -2. Create groups: - - `kubernetes-admins` - - `kubernetes-power-users` - - `kubernetes-namespace-owners` -3. Assign your own user to `kubernetes-admins` - -**Step 4: Verify OIDC discovery endpoint** - -```bash -curl -s https://authentik.viktorbarzin.me/application/o/kubernetes/.well-known/openid-configuration | jq . -``` - -Expected: JSON with `issuer`, `authorization_endpoint`, `token_endpoint`, `jwks_uri` fields. - -**Step 5: Commit a note about the Authentik setup** - -No Terraform changes for this step — Authentik apps are managed via UI. Document the client ID in the design doc. - ---- - -### Task 2: Configure kube-apiserver OIDC Flags - -The API server runs as a static pod on k8s-master (10.0.20.100). The manifest is at `/etc/kubernetes/manifests/kube-apiserver.yaml`. Kubelet watches this file and auto-restarts the pod on changes. - -**Files:** -- Create: `modules/kubernetes/rbac/apiserver-oidc.tf` -- Modify: `modules/kubernetes/main.tf` (add rbac module call) -- Modify: `modules/kubernetes/rbac/main.tf` (will be created in Task 3, but apiserver config is separate) - -**Step 1: Create the rbac module directory** - -```bash -mkdir -p modules/kubernetes/rbac -``` - -**Step 2: Create the API server OIDC configuration** - -Create `modules/kubernetes/rbac/apiserver-oidc.tf`: - -```hcl -# Configure kube-apiserver for OIDC authentication -# This SSHs to k8s-master and adds OIDC flags to the static pod manifest. -# Kubelet auto-restarts the API server when the manifest changes. 
- -variable "k8s_master_host" { - type = string - default = "10.0.20.100" -} - -variable "ssh_private_key" { - type = string - sensitive = true -} - -variable "oidc_issuer_url" { - type = string - default = "https://authentik.viktorbarzin.me/application/o/kubernetes/" -} - -variable "oidc_client_id" { - type = string - default = "kubernetes" -} - -resource "null_resource" "apiserver_oidc_config" { - connection { - type = "ssh" - user = "wizard" - host = var.k8s_master_host - private_key = var.ssh_private_key - } - - provisioner "remote-exec" { - inline = [ - # Check if OIDC flags already present - "if grep -q 'oidc-issuer-url' /etc/kubernetes/manifests/kube-apiserver.yaml; then echo 'OIDC flags already configured'; exit 0; fi", - - # Backup the manifest - "sudo cp /etc/kubernetes/manifests/kube-apiserver.yaml /etc/kubernetes/manifests/kube-apiserver.yaml.bak", - - # Add OIDC flags after the last --etcd flag (safe insertion point) - "sudo sed -i '/- --tls-private-key-file/a\\ - --oidc-issuer-url=${var.oidc_issuer_url}\\n - --oidc-client-id=${var.oidc_client_id}\\n - --oidc-username-claim=email\\n - --oidc-groups-claim=groups' /etc/kubernetes/manifests/kube-apiserver.yaml", - - # Wait for API server to restart (kubelet watches the manifest) - "echo 'Waiting for API server to restart...'", - "sleep 30", - "sudo kubectl --kubeconfig=/etc/kubernetes/admin.conf get nodes || echo 'API server still restarting, check manually'", - ] - } - - triggers = { - oidc_issuer_url = var.oidc_issuer_url - oidc_client_id = var.oidc_client_id - } -} -``` - -**Step 3: Verify the API server accepts OIDC (manual check)** - -```bash -ssh wizard@10.0.20.100 "sudo grep oidc /etc/kubernetes/manifests/kube-apiserver.yaml" -``` - -Expected output: -``` - - --oidc-issuer-url=https://authentik.viktorbarzin.me/application/o/kubernetes/ - - --oidc-client-id=kubernetes - - --oidc-username-claim=email - - --oidc-groups-claim=groups -``` - ---- - -### Task 3: Create RBAC Terraform Module - -**Files:** -- Create: `modules/kubernetes/rbac/main.tf` -- Modify: `modules/kubernetes/main.tf` (add module call + variables) -- Modify: `main.tf` (root, pass ssh_private_key and k8s_users) -- Modify: `terraform.tfvars` (add k8s_users definition) - -**Step 1: Create `modules/kubernetes/rbac/main.tf`** - -```hcl -variable "tls_secret_name" {} -variable "tier" { type = string } - -variable "k8s_users" { - type = map(object({ - role = string # "admin", "power-user", "namespace-owner" - email = string # OIDC email claim - namespaces = optional(list(string), []) # for namespace-owners - quota = optional(object({ - cpu_requests = optional(string, "2") - memory_requests = optional(string, "4Gi") - cpu_limits = optional(string, "4") - memory_limits = optional(string, "8Gi") - pods = optional(string, "20") - }), {}) - })) - default = {} -} - -# --- Admin role --- -# Binds to built-in cluster-admin ClusterRole - -resource "kubernetes_cluster_role_binding" "admin_users" { - for_each = { for name, user in var.k8s_users : name => user if user.role == "admin" } - - metadata { - name = "oidc-admin-${each.key}" - } - - role_ref { - api_group = "rbac.authorization.k8s.io" - kind = "ClusterRole" - name = "cluster-admin" - } - - subject { - kind = "User" - name = each.value.email - api_group = "rbac.authorization.k8s.io" - } -} - -# --- Power-user role --- -# Can manage workloads cluster-wide but cannot modify RBAC, nodes, or persistent volumes - -resource "kubernetes_cluster_role" "power_user" { - metadata { - name = "oidc-power-user" - } - - # Core 
resources - rule { - api_groups = [""] - resources = ["pods", "pods/log", "pods/exec", "services", "endpoints", "configmaps", "secrets", "persistentvolumeclaims", "events", "namespaces"] - verbs = ["get", "list", "watch"] - } - - rule { - api_groups = [""] - resources = ["pods", "services", "configmaps", "secrets", "persistentvolumeclaims"] - verbs = ["create", "update", "patch", "delete"] - } - - # Apps - rule { - api_groups = ["apps"] - resources = ["deployments", "statefulsets", "daemonsets", "replicasets"] - verbs = ["get", "list", "watch", "create", "update", "patch", "delete"] - } - - # Batch - rule { - api_groups = ["batch"] - resources = ["jobs", "cronjobs"] - verbs = ["get", "list", "watch", "create", "update", "patch", "delete"] - } - - # Networking - rule { - api_groups = ["networking.k8s.io"] - resources = ["ingresses", "networkpolicies"] - verbs = ["get", "list", "watch", "create", "update", "patch", "delete"] - } - - # Autoscaling - rule { - api_groups = ["autoscaling"] - resources = ["horizontalpodautoscalers"] - verbs = ["get", "list", "watch", "create", "update", "patch", "delete"] - } - - # Read-only on cluster-level resources - rule { - api_groups = [""] - resources = ["nodes"] - verbs = ["get", "list", "watch"] - } - - rule { - api_groups = ["storage.k8s.io"] - resources = ["storageclasses"] - verbs = ["get", "list", "watch"] - } - - rule { - api_groups = ["rbac.authorization.k8s.io"] - resources = ["clusterroles", "clusterrolebindings", "roles", "rolebindings"] - verbs = ["get", "list", "watch"] - } -} - -resource "kubernetes_cluster_role_binding" "power_users" { - for_each = { for name, user in var.k8s_users : name => user if user.role == "power-user" } - - metadata { - name = "oidc-power-user-${each.key}" - } - - role_ref { - api_group = "rbac.authorization.k8s.io" - kind = "ClusterRole" - name = kubernetes_cluster_role.power_user.metadata[0].name - } - - subject { - kind = "User" - name = each.value.email - api_group = "rbac.authorization.k8s.io" - } -} - -# --- Namespace-owner role --- -# Full admin within assigned namespaces + read-only cluster-wide - -locals { - # Flatten user→namespace pairs for iteration - namespace_owner_pairs = flatten([ - for name, user in var.k8s_users : [ - for ns in user.namespaces : { - user_key = name - namespace = ns - email = user.email - quota = user.quota - } - ] if user.role == "namespace-owner" - ]) -} - -resource "kubernetes_namespace" "user_namespaces" { - for_each = { for pair in local.namespace_owner_pairs : "${pair.user_key}-${pair.namespace}" => pair } - - metadata { - name = each.value.namespace - labels = { - tier = var.tier - "k8s-portal/owner" = each.value.user_key - "k8s-portal/managed-by" = "rbac-module" - } - } -} - -resource "kubernetes_role_binding" "namespace_owner" { - for_each = { for pair in local.namespace_owner_pairs : "${pair.user_key}-${pair.namespace}" => pair } - - metadata { - name = "namespace-owner-${each.value.user_key}" - namespace = each.value.namespace - } - - role_ref { - api_group = "rbac.authorization.k8s.io" - kind = "ClusterRole" - name = "admin" # Built-in ClusterRole with full namespace access - } - - subject { - kind = "User" - name = each.value.email - api_group = "rbac.authorization.k8s.io" - } - - depends_on = [kubernetes_namespace.user_namespaces] -} - -# Read-only cluster-wide access for namespace owners -resource "kubernetes_cluster_role" "namespace_owner_readonly" { - metadata { - name = "oidc-namespace-owner-readonly" - } - - rule { - api_groups = [""] - resources = ["namespaces", 
"nodes"] - verbs = ["get", "list", "watch"] - } - - rule { - api_groups = [""] - resources = ["pods", "services", "configmaps", "events"] - verbs = ["get", "list", "watch"] - } - - rule { - api_groups = ["apps"] - resources = ["deployments", "statefulsets", "daemonsets"] - verbs = ["get", "list", "watch"] - } -} - -resource "kubernetes_cluster_role_binding" "namespace_owner_readonly" { - for_each = { for name, user in var.k8s_users : name => user if user.role == "namespace-owner" } - - metadata { - name = "oidc-ns-owner-readonly-${each.key}" - } - - role_ref { - api_group = "rbac.authorization.k8s.io" - kind = "ClusterRole" - name = kubernetes_cluster_role.namespace_owner_readonly.metadata[0].name - } - - subject { - kind = "User" - name = each.value.email - api_group = "rbac.authorization.k8s.io" - } -} - -# Resource quotas per user namespace -resource "kubernetes_resource_quota" "user_namespace_quota" { - for_each = { for pair in local.namespace_owner_pairs : "${pair.user_key}-${pair.namespace}" => pair } - - metadata { - name = "user-quota" - namespace = each.value.namespace - } - - spec { - hard = { - "requests.cpu" = each.value.quota.cpu_requests - "requests.memory" = each.value.quota.memory_requests - "limits.cpu" = each.value.quota.cpu_limits - "limits.memory" = each.value.quota.memory_limits - "pods" = each.value.quota.pods - } - } - - depends_on = [kubernetes_namespace.user_namespaces] -} - -# ConfigMap with user-role mapping for the self-service portal -resource "kubernetes_config_map" "user_roles" { - metadata { - name = "k8s-user-roles" - namespace = "k8s-portal" - } - - data = { - "users.json" = jsonencode({ - for name, user in var.k8s_users : user.email => { - role = user.role - namespaces = user.namespaces - } - }) - } -} -``` - -**Step 2: Add variables and module call to `modules/kubernetes/main.tf`** - -Add these variables at the top of the file (after existing variables): - -```hcl -variable "k8s_users" { - type = map(any) - default = {} -} -variable "ssh_private_key" { - type = string - default = "" - sensitive = true -} -``` - -Add the module call (after the authentik module block, around line 830): - -```hcl -module "rbac" { - source = "./rbac" - for_each = contains(local.active_modules, "authentik") ? { rbac = true } : {} - tier = local.tiers.cluster - tls_secret_name = var.tls_secret_name - k8s_users = var.k8s_users - ssh_private_key = var.ssh_private_key -} -``` - -**Step 3: Pass variables from root `main.tf`** - -Add to the `module "kubernetes_cluster"` block (around line 514): - -```hcl - k8s_users = var.k8s_users - ssh_private_key = var.ssh_private_key -``` - -Add the `k8s_users` variable definition at the root level: - -```hcl -variable "k8s_users" { - type = map(any) - default = {} -} -``` - -**Step 4: Add users to `terraform.tfvars`** - -```hcl -k8s_users = { - "viktor" = { - role = "admin" - email = "viktor@viktorbarzin.me" - namespaces = [] - } -} -``` - -**Step 5: Run terraform plan to verify** - -```bash -terraform plan -target=module.kubernetes_cluster.module.rbac -var="kube_config_path=$(pwd)/config" -``` - -Expected: Plan shows ClusterRoleBinding for admin user, power-user ClusterRole, namespace-owner ClusterRole, and ConfigMap creation. 
- -**Step 6: Apply** - -```bash -terraform apply -target=module.kubernetes_cluster.module.rbac -var="kube_config_path=$(pwd)/config" -auto-approve -``` - -**Step 7: Commit** - -```bash -git add modules/kubernetes/rbac/ modules/kubernetes/main.tf main.tf -git commit -m "[ci skip] Add RBAC module for multi-user Kubernetes access" -``` - ---- - -### Task 4: Configure Audit Logging on kube-apiserver - -**Files:** -- Create: `modules/kubernetes/rbac/audit-policy.tf` - -**Step 1: Create the audit policy configuration** - -Create `modules/kubernetes/rbac/audit-policy.tf`: - -```hcl -# Deploy audit policy to k8s-master and configure kube-apiserver to use it. -# Audit logs are written to /var/log/kubernetes/audit.log on the master node. -# Alloy (log collector DaemonSet) will pick them up and ship to Loki. - -resource "null_resource" "audit_policy" { - connection { - type = "ssh" - user = "wizard" - host = var.k8s_master_host - private_key = var.ssh_private_key - } - - # Upload audit policy file - provisioner "file" { - content = yamlencode({ - apiVersion = "audit.k8s.io/v1" - kind = "Policy" - rules = [ - { - # Don't log requests to the API discovery endpoints (very noisy) - level = "None" - resources = [{ - group = "" - resources = ["endpoints", "services", "services/status"] - }] - users = ["system:kube-proxy"] - }, - { - # Don't log watch requests (very noisy) - level = "None" - verbs = ["watch"] - }, - { - # Don't log health checks - level = "None" - nonResourceURLs = ["/healthz*", "/readyz*", "/livez*"] - }, - { - # Log secret access at Metadata level only (no request/response bodies) - level = "Metadata" - resources = [{ - group = "" - resources = ["secrets"] - }] - }, - { - # Log all other mutating requests at RequestResponse level - level = "RequestResponse" - verbs = ["create", "update", "patch", "delete"] - }, - { - # Log read requests at Metadata level - level = "Metadata" - verbs = ["get", "list"] - }, - ] - }) - destination = "/tmp/audit-policy.yaml" - } - - provisioner "remote-exec" { - inline = [ - # Move audit policy to proper location - "sudo mkdir -p /etc/kubernetes/policies", - "sudo mv /tmp/audit-policy.yaml /etc/kubernetes/policies/audit-policy.yaml", - "sudo chown root:root /etc/kubernetes/policies/audit-policy.yaml", - - # Create audit log directory - "sudo mkdir -p /var/log/kubernetes", - - # Check if audit flags already present - "if grep -q 'audit-policy-file' /etc/kubernetes/manifests/kube-apiserver.yaml; then echo 'Audit flags already configured'; exit 0; fi", - - # Add audit flags to kube-apiserver manifest - "sudo sed -i '/- --oidc-groups-claim/a\\ - --audit-policy-file=/etc/kubernetes/policies/audit-policy.yaml\\n - --audit-log-path=/var/log/kubernetes/audit.log\\n - --audit-log-maxage=7\\n - --audit-log-maxbackup=3\\n - --audit-log-maxsize=100' /etc/kubernetes/manifests/kube-apiserver.yaml", - - # Add volume mount for audit policy (hostPath) - # The kube-apiserver pod needs access to the policy file and log directory - "sudo sed -i '/volumes:/a\\ - hostPath:\\n path: /etc/kubernetes/policies\\n type: DirectoryOrCreate\\n name: audit-policy\\n - hostPath:\\n path: /var/log/kubernetes\\n type: DirectoryOrCreate\\n name: audit-log' /etc/kubernetes/manifests/kube-apiserver.yaml", - - "sudo sed -i '/volumeMounts:/a\\ - mountPath: /etc/kubernetes/policies\\n name: audit-policy\\n readOnly: true\\n - mountPath: /var/log/kubernetes\\n name: audit-log' /etc/kubernetes/manifests/kube-apiserver.yaml", - - # Wait for API server to restart - "echo 'Waiting for API server to 
restart with audit logging...'", - "sleep 30", - "sudo kubectl --kubeconfig=/etc/kubernetes/admin.conf get nodes || echo 'API server still restarting'", - ] - } - - triggers = { - policy_version = "v1" # Bump to re-apply - } - - depends_on = [null_resource.apiserver_oidc_config] -} -``` - -**Step 2: Apply** - -```bash -terraform apply -target=module.kubernetes_cluster.module.rbac -var="kube_config_path=$(pwd)/config" -auto-approve -``` - -**Step 3: Verify audit log is being written** - -```bash -ssh wizard@10.0.20.100 "sudo tail -5 /var/log/kubernetes/audit.log | jq -r '.user.username + \" \" + .verb + \" \" + .objectRef.resource'" -``` - -Expected: Lines showing API server requests with usernames and resources. - -**Step 4: Commit** - -```bash -git add modules/kubernetes/rbac/audit-policy.tf -git commit -m "[ci skip] Add Kubernetes audit logging to kube-apiserver" -``` - ---- - -### Task 5: Configure Alloy to Collect Audit Logs - -The Alloy DaemonSet (log collector) needs to be configured to also collect `/var/log/kubernetes/audit.log` from the master node and ship it to Loki. - -**Files:** -- Modify: `modules/kubernetes/monitoring/alloy.yaml` (add audit log scrape config) - -**Step 1: Add audit log collection to Alloy config** - -In `modules/kubernetes/monitoring/alloy.yaml`, add a new `local.file_match` and `loki.source.file` block for audit logs: - -``` -local.file_match "audit_logs" { - path_targets = [{ - __path__ = "/var/log/kubernetes/audit.log" - job = "kubernetes-audit" - node = env("HOSTNAME") - }] -} - -loki.source.file "audit_logs" { - targets = local.file_match.audit_logs.targets - forward_to = [loki.write.default.receiver] -} -``` - -**Step 2: Ensure Alloy DaemonSet mounts `/var/log/kubernetes`** - -The Alloy Helm values need to mount `/var/log/kubernetes` from the host. Check if the existing `/var/log` hostPath mount already covers this (it likely does, since `/var/log/kubernetes` is a subdirectory). - -**Step 3: Apply monitoring module** - -```bash -terraform apply -target=module.kubernetes_cluster.module.monitoring -var="kube_config_path=$(pwd)/config" -auto-approve -``` - -**Step 4: Verify in Grafana** - -Go to Grafana → Explore → Loki datasource. Run: - -```logql -{job="kubernetes-audit"} | json | line_format "{{.user.username}} {{.verb}} {{.objectRef.resource}}" -``` - -**Step 5: Commit** - -```bash -git add modules/kubernetes/monitoring/alloy.yaml -git commit -m "[ci skip] Add Kubernetes audit log collection to Alloy" -``` - ---- - -### Task 6: Build Self-Service Portal (SvelteKit App) - -**Files:** -- Create: `modules/kubernetes/k8s-portal/` (entire module) -- Create: `modules/kubernetes/k8s-portal/files/` (SvelteKit app source) -- Modify: `modules/kubernetes/main.tf` (add module call) -- Modify: `terraform.tfvars` (add DNS entry) - -**Step 1: Create the SvelteKit app** - -```bash -mkdir -p modules/kubernetes/k8s-portal/files -cd modules/kubernetes/k8s-portal/files -npm create svelte@latest . -- --template skeleton --types typescript -npm install -``` - -**Step 2: Create the portal pages** - -The portal has three pages: -1. **`/`** — Landing page showing user's role and namespaces -2. **`/download`** — Generates and serves the kubeconfig file -3. **`/setup`** — Instructions for installing kubectl and kubelogin - -The app reads user identity from Traefik forward auth headers (`X-authentik-email`, `X-authentik-username`, `X-authentik-groups`) and user role data from the `k8s-user-roles` ConfigMap (mounted as a volume). 
- -Create `src/routes/+page.server.ts`: -```typescript -import type { PageServerLoad } from './$types'; -import { readFileSync } from 'fs'; - -interface UserRole { - role: string; - namespaces: string[]; -} - -export const load: PageServerLoad = async ({ request }) => { - const email = request.headers.get('x-authentik-email') || 'unknown'; - const username = request.headers.get('x-authentik-username') || 'unknown'; - const groups = request.headers.get('x-authentik-groups') || ''; - - // Read user roles from ConfigMap-mounted file - let userRole: UserRole = { role: 'unknown', namespaces: [] }; - try { - const usersJson = readFileSync('/config/users.json', 'utf-8'); - const users = JSON.parse(usersJson); - if (users[email]) { - userRole = users[email]; - } - } catch { - // ConfigMap not mounted or parse error - } - - return { - email, - username, - groups: groups.split('|').filter(Boolean), - role: userRole.role, - namespaces: userRole.namespaces, - }; -}; -``` - -Create `src/routes/+page.svelte`: -```svelte - - -
-<script lang="ts">
-  import type { PageData } from './$types';
-  export let data: PageData;
-</script>
-
-<h1>Kubernetes Access Portal</h1>
-
-<section>
-  <h2>Your Identity</h2>
-  <p>Username: {data.username}</p>
-  <p>Email: {data.email}</p>
-  <p>Role: {data.role}</p>
-  {#if data.namespaces.length > 0}
-    <p>Namespaces: {data.namespaces.join(', ')}</p>
-  {/if}
-</section>
-
-<section>
-  <h2>Get Started</h2>
-  <ol>
-    <li><a href="/setup">Install kubectl and kubelogin</a></li>
-    <li><a href="/download">Download your kubeconfig</a></li>
-    <li>Run <code>kubectl get pods</code> to verify access</li>
-  </ol>
-</section>
- - -``` - -Create `src/routes/download/+server.ts`: -```typescript -import type { RequestHandler } from './$types'; -import { readFileSync } from 'fs'; - -const CLUSTER_SERVER = 'https://10.0.20.100:6443'; -const OIDC_ISSUER = 'https://authentik.viktorbarzin.me/application/o/kubernetes/'; -const OIDC_CLIENT_ID = 'kubernetes'; - -export const GET: RequestHandler = async ({ request }) => { - const email = request.headers.get('x-authentik-email') || 'user'; - - // Read CA cert from mounted kubeconfig or file - let caCert = ''; - try { - caCert = readFileSync('/config/ca.crt', 'utf-8'); - } catch { - // CA cert not available - } - - const caCertBase64 = Buffer.from(caCert).toString('base64'); - const sanitizedEmail = email.replace(/[^a-zA-Z0-9@._-]/g, ''); - - const kubeconfig = `apiVersion: v1 -kind: Config -clusters: -- cluster: - server: ${CLUSTER_SERVER} - certificate-authority-data: ${caCertBase64} - name: home-cluster -contexts: -- context: - cluster: home-cluster - user: oidc-${sanitizedEmail} - name: home-cluster -current-context: home-cluster -users: -- name: oidc-${sanitizedEmail} - user: - exec: - apiVersion: client.authentication.k8s.io/v1beta1 - command: kubectl - args: - - oidc-login - - get-token - - --oidc-issuer-url=${OIDC_ISSUER} - - --oidc-client-id=${OIDC_CLIENT_ID} - interactiveMode: IfAvailable -`; - - return new Response(kubeconfig, { - headers: { - 'Content-Type': 'application/yaml', - 'Content-Disposition': `attachment; filename="kubeconfig-home-cluster.yaml"`, - }, - }); -}; -``` - -Create `src/routes/setup/+page.svelte`: -```svelte -
-<h1>Setup Instructions</h1>
-
-<section>
-  <h2>1. Install kubectl</h2>
-  <h3>macOS</h3>
-  <pre><code>brew install kubectl</code></pre>
-  <h3>Linux</h3>
-  <pre><code>curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
-chmod +x kubectl && sudo mv kubectl /usr/local/bin/</code></pre>
-</section>
-
-<section>
-  <h2>2. Install kubelogin (OIDC plugin)</h2>
-  <h3>macOS</h3>
-  <pre><code>brew install int128/kubelogin/kubelogin</code></pre>
-  <h3>Linux</h3>
-  <pre><code>curl -LO https://github.com/int128/kubelogin/releases/latest/download/kubelogin_linux_amd64.zip
-unzip kubelogin_linux_amd64.zip && sudo mv kubelogin /usr/local/bin/kubectl-oidc_login</code></pre>
-</section>
-
-<section>
-  <h2>3. Download and use your kubeconfig</h2>
-  <pre><code># Download from the portal
-curl -o ~/.kube/config-home https://k8s-portal.viktorbarzin.me/download
-
-# Set the KUBECONFIG environment variable
-export KUBECONFIG=~/.kube/config-home
-
-# Test access (opens browser for login)
-kubectl get namespaces</code></pre>
-</section>
-
-<p><a href="/">← Back to portal</a></p>
- - -``` - -Create `Dockerfile`: -```dockerfile -FROM node:22-alpine AS build -WORKDIR /app -COPY package*.json ./ -RUN npm ci -COPY . . -RUN npm run build - -FROM node:22-alpine -WORKDIR /app -COPY --from=build /app/build ./build -COPY --from=build /app/package.json ./ -COPY --from=build /app/node_modules ./node_modules -ENV PORT=3000 -EXPOSE 3000 -CMD ["node", "build"] -``` - -Ensure SvelteKit uses the Node adapter. Update `svelte.config.js`: -```javascript -import adapter from '@sveltejs/adapter-node'; -export default { kit: { adapter: adapter() } }; -``` - -Install the Node adapter: -```bash -cd modules/kubernetes/k8s-portal/files -npm install -D @sveltejs/adapter-node -``` - -**Step 3: Create the Terraform module** - -Create `modules/kubernetes/k8s-portal/main.tf`: - -```hcl -variable "tls_secret_name" {} -variable "tier" { type = string } - -resource "kubernetes_namespace" "k8s_portal" { - metadata { - name = "k8s-portal" - labels = { - tier = var.tier - } - } -} - -module "tls_secret" { - source = "../setup_tls_secret" - namespace = kubernetes_namespace.k8s_portal.metadata[0].name - tls_secret_name = var.tls_secret_name -} - -resource "kubernetes_deployment" "k8s_portal" { - metadata { - name = "k8s-portal" - namespace = kubernetes_namespace.k8s_portal.metadata[0].name - labels = { - app = "k8s-portal" - tier = var.tier - } - } - - spec { - replicas = 1 - selector { - match_labels = { - app = "k8s-portal" - } - } - - template { - metadata { - labels = { - app = "k8s-portal" - } - } - - spec { - container { - name = "portal" - image = "10.0.20.10:5000/k8s-portal:latest" - port { - container_port = 3000 - } - - volume_mount { - name = "config" - mount_path = "/config" - read_only = true - } - } - - volume { - name = "config" - config_map { - name = "k8s-portal-config" - } - } - } - } - } -} - -resource "kubernetes_config_map" "k8s_portal_config" { - metadata { - name = "k8s-portal-config" - namespace = kubernetes_namespace.k8s_portal.metadata[0].name - } - - data = { - # CA cert extracted from kubeconfig — pass via variable or read from file - "ca.crt" = "" # Will be populated with cluster CA cert - } -} - -resource "kubernetes_service" "k8s_portal" { - metadata { - name = "k8s-portal" - namespace = kubernetes_namespace.k8s_portal.metadata[0].name - } - - spec { - selector = { - app = "k8s-portal" - } - port { - port = 80 - target_port = 3000 - } - } -} - -module "ingress" { - source = "../ingress_factory" - namespace = kubernetes_namespace.k8s_portal.metadata[0].name - name = "k8s-portal" - tls_secret_name = var.tls_secret_name - protected = true # Require Authentik login -} -``` - -**Step 4: Add module call to `modules/kubernetes/main.tf`** - -```hcl -module "k8s-portal" { - source = "./k8s-portal" - for_each = contains(local.active_modules, "authentik") ? { portal = true } : {} - tier = local.tiers.edge - tls_secret_name = var.tls_secret_name -} -``` - -**Step 5: Add DNS record** - -Add `k8s-portal` to `cloudflare_non_proxied_names` in `terraform.tfvars`. - -**Step 6: Build and push Docker image** - -```bash -cd modules/kubernetes/k8s-portal/files -docker build -t 10.0.20.10:5000/k8s-portal:latest . 
-docker push 10.0.20.10:5000/k8s-portal:latest -``` - -**Step 7: Apply** - -```bash -terraform apply -target=module.kubernetes_cluster.module.k8s-portal -var="kube_config_path=$(pwd)/config" -auto-approve -terraform apply -target=module.kubernetes_cluster.module.cloudflared -var="kube_config_path=$(pwd)/config" -auto-approve -``` - -**Step 8: Verify portal works** - -Visit `https://k8s-portal.viktorbarzin.me` — should redirect to Authentik login, then show your role and kubeconfig download. - -**Step 9: Commit** - -```bash -git add modules/kubernetes/k8s-portal/ modules/kubernetes/main.tf -git commit -m "[ci skip] Add self-service Kubernetes access portal" -``` - ---- - -### Task 7: Create Grafana Dashboard for Audit Logs - -**Files:** -- Create: `modules/kubernetes/monitoring/dashboards/k8s-audit.json` - -**Step 1: Create Grafana dashboard** - -Create a dashboard JSON file that queries Loki for audit logs. The dashboard should show: -- **Panel 1**: Table of recent actions (user, verb, resource, namespace, timestamp) -- **Panel 2**: Time series of request count by user -- **Panel 3**: Table of denied requests - -LogQL queries: -- Recent actions: `{job="kubernetes-audit"} | json | line_format "{{.user.username}} {{.verb}} {{.objectRef.resource}} {{.objectRef.namespace}}"` -- By user: `sum by (user_username) (count_over_time({job="kubernetes-audit"} | json [5m]))` -- Denied: `{job="kubernetes-audit"} | json | responseStatus_code >= 403` - -Store the dashboard JSON in `modules/kubernetes/monitoring/dashboards/k8s-audit.json` and provision it via Grafana's file provisioning (same pattern as other dashboards). - -**Step 2: Apply monitoring** - -```bash -terraform apply -target=module.kubernetes_cluster.module.monitoring -var="kube_config_path=$(pwd)/config" -auto-approve -``` - -**Step 3: Commit** - -```bash -git add modules/kubernetes/monitoring/dashboards/k8s-audit.json -git commit -m "[ci skip] Add Grafana dashboard for Kubernetes audit logs" -``` - ---- - -### Task 8: End-to-End Verification - -**Step 1: Test OIDC login with kubelogin** - -```bash -# Install kubelogin -brew install int128/kubelogin/kubelogin - -# Download kubeconfig from portal -curl -H "X-authentik-email: viktor@viktorbarzin.me" -o /tmp/test-kubeconfig https://k8s-portal.viktorbarzin.me/download - -# Test kubectl with OIDC -KUBECONFIG=/tmp/test-kubeconfig kubectl get namespaces -``` - -This should open a browser for Authentik login, then return the namespace list. - -**Step 2: Test RBAC enforcement** - -Create a test namespace-owner user in `terraform.tfvars`, apply, then verify they can only access their namespace. - -**Step 3: Test audit logging** - -After running kubectl commands, verify they appear in Grafana: -- Go to Grafana → Explore → Loki -- Query: `{job="kubernetes-audit"} | json | user_username="viktor@viktorbarzin.me"` - -**Step 4: Final commit and push** - -```bash -git add -A -git commit -m "[ci skip] Multi-user Kubernetes access: complete implementation" -git push origin master -``` diff --git a/docs/plans/2026-02-21-openclaw-cluster-agent-design.md b/docs/plans/2026-02-21-openclaw-cluster-agent-design.md deleted file mode 100644 index fba6446e..00000000 --- a/docs/plans/2026-02-21-openclaw-cluster-agent-design.md +++ /dev/null @@ -1,111 +0,0 @@ -# OpenClaw Cluster Management Agent — Design - -**Date**: 2026-02-21 -**Status**: Approved - -## Goal - -Build a proactive cluster management agent that runs scheduled health checks every 30 minutes, auto-fixes safe issues, and alerts via Slack. 
The agent is "taught" via an OpenClaw skill and a reusable health check script. - -## Architecture - -``` -CronJob (every 30min) - └─ kubectl exec into OpenClaw pod - └─ /workspace/infra/.claude/cluster-health.sh - ├─ kubectl get nodes (check health) - ├─ kubectl get pods -A (find problems) - ├─ kubectl delete pod (evicted/stuck) - └─ curl Slack webhook (report) -``` - -Interactive path: User asks OpenClaw via UI -> `cluster-health` skill triggers -> runs same script -> LLM analyzes output and can do deeper investigation. - -## Components - -### 1. `cluster-health` skill (`.claude/skills/cluster-health/SKILL.md`) - -Teaches OpenClaw: -- What health checks to run -- What's safe to auto-fix vs alert-only -- How to format Slack alerts -- How to do deeper investigation when asked interactively - -Trigger conditions: "check cluster", "cluster health", "what's wrong", "health check", etc. - -### 2. `cluster-health.sh` helper script (`.claude/cluster-health.sh`) - -Reusable script that performs all checks: - -**Checks:** -- Node health (NotReady, MemoryPressure, DiskPressure, PIDPressure) -- Pod health (CrashLoopBackOff, ImagePullBackOff, Error, OOMKilled, Pending) -- Evicted pods -- Failed deployments (unavailable replicas) -- Pending PVCs -- Resource pressure (high CPU/memory allocation) -- Failed CronJobs -- DaemonSet health (missing pods) - -**Safe auto-fix actions:** -- Delete evicted pods -- Delete completed/succeeded pods older than 24h -- Restart (delete) pods in CrashLoopBackOff for more than 1 hour - -**Alert-only (never auto-fix):** -- Node NotReady -- Persistent OOMKilled -- ImagePullBackOff -- Pending PVCs -- Failed deployments with 0 available replicas - -**Output:** -- Structured text summary -- Posts to Slack via webhook -- Exit code 0 = healthy, 1 = issues found - -### 3. Kubernetes CronJob (in `modules/kubernetes/openclaw/main.tf`) - -- Schedule: `*/30 * * * *` -- Container: `bitnami/kubectl` (minimal image with kubectl) -- Command: `kubectl exec deploy/openclaw -n openclaw -- /bin/bash /workspace/infra/.claude/cluster-health.sh` -- ServiceAccount with RBAC to exec into pods in `openclaw` namespace -- `concurrencyPolicy: Forbid` -- `failedJobsHistoryLimit: 3` -- `successfulJobsHistoryLimit: 3` - -### 4. 
Slack Integration - -- Webhook URL from `openclaw_skill_secrets["slack"]` (already configured) -- Passed as `SLACK_WEBHOOK_URL` env var to the OpenClaw pod - -## Slack Message Format - -``` -:white_check_mark: Cluster Health Check — All Clear -Nodes: 5/5 Ready | Pods: 142 Running | 0 Issues -``` - -``` -:warning: Cluster Health Check — 3 Issues Found - -Auto-fixed: -- Deleted 4 evicted pods in monitoring namespace -- Restarted stuck pod calibre-web-xyz (CrashLoopBackOff >1h) - -Needs attention: -- Node k8s-node3: MemoryPressure condition detected -- PVC data-tandoor pending for 45 minutes -``` - -## Decisions - -| Decision | Choice | Rationale | -|----------|--------|-----------| -| Mode | Proactive (scheduled) | Want automated monitoring | -| Alert channel | Slack | Existing webhook in openclaw_skill_secrets | -| Auto-fix | Safe fixes only | Delete evicted, restart stuck; alert for the rest | -| Frequency | 30 minutes | Balance between detection speed and overhead | -| Checks scope | Standard K8s health | Pod/node/deployment/PVC/CronJob/DaemonSet | -| Trigger mechanism | CronJob execs into OpenClaw pod | Reuses OpenClaw's tools; LLM available interactively | -| Fallback | None | Uptime Kuma monitors OpenClaw availability | diff --git a/docs/plans/2026-02-21-openclaw-cluster-agent-plan.md b/docs/plans/2026-02-21-openclaw-cluster-agent-plan.md deleted file mode 100644 index 9f51df67..00000000 --- a/docs/plans/2026-02-21-openclaw-cluster-agent-plan.md +++ /dev/null @@ -1,800 +0,0 @@ -# OpenClaw Cluster Management Agent — Implementation Plan - -> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. - -**Goal:** Build a proactive cluster health agent — a skill that teaches OpenClaw to check the cluster, a helper script that runs the checks and posts to Slack, and a CronJob that triggers it every 30 minutes via `kubectl exec`. - -**Architecture:** CronJob (bitnami/kubectl) -> `kubectl exec` into OpenClaw pod -> runs `cluster-health.sh` which performs 8 health checks, auto-fixes safe issues, and posts a summary to Slack. The same script is available as an OpenClaw skill for interactive use. - -**Tech Stack:** Bash (health check script), Terraform/HCL (CronJob + RBAC), Slack webhook API, kubectl - ---- - -### Task 1: Add Slack webhook to openclaw_skill_secrets - -**Files:** -- Modify: `terraform.tfvars:1291-1295` (add slack_webhook key) -- Modify: `modules/kubernetes/openclaw/main.tf:350-376` (add SLACK_WEBHOOK_URL env var) - -**Step 1: Add slack_webhook to openclaw_skill_secrets in tfvars** - -Add a new key `slack_webhook` to the existing `openclaw_skill_secrets` map. The user must provide the webhook URL. For now, use the existing `alertmanager_slack_api_url` value or a dedicated one. - -In `terraform.tfvars`, change: -```hcl -openclaw_skill_secrets = { - home_assistant_token = "..." - home_assistant_sofia_token = "..." - uptime_kuma_password = "..." -} -``` -to: -```hcl -openclaw_skill_secrets = { - home_assistant_token = "..." - home_assistant_sofia_token = "..." - uptime_kuma_password = "..." - slack_webhook = "https://hooks.slack.com/services/YOUR/WEBHOOK/URL" -} -``` - -**NOTE:** Ask the user which Slack webhook URL to use. 
Candidates: -- `alertmanager_slack_api_url` (line 4 in tfvars) -- `tiny_tuya_slack_url` (line 1213, comment says "K8s bot slack") -- A new webhook the user creates - -**Step 2: Add SLACK_WEBHOOK_URL env var to OpenClaw container** - -In `modules/kubernetes/openclaw/main.tf`, add after the `UPTIME_KUMA_PASSWORD` env block (around line 370): -```hcl - # Skill secrets - Slack - env { - name = "SLACK_WEBHOOK_URL" - value = var.skill_secrets["slack_webhook"] - } -``` - -**Step 3: Commit** - -```bash -git add modules/kubernetes/openclaw/main.tf -git commit -m "[ci skip] Add Slack webhook env var to OpenClaw deployment" -``` - -Do NOT commit `terraform.tfvars` separately — it will be committed with the full set of changes at the end. - ---- - -### Task 2: Create the cluster-health.sh helper script - -**Files:** -- Create: `.claude/cluster-health.sh` - -**Step 1: Write the health check script** - -Create `.claude/cluster-health.sh` with the following structure. The script: -- Uses `$KUBECONFIG` (already set in OpenClaw pod) or falls back to in-cluster config -- Runs 8 checks: nodes, pods, evicted, deployments, PVCs, resources, CronJobs, DaemonSets -- Auto-fixes: deletes evicted pods, restarts CrashLoopBackOff pods stuck >1 hour -- Posts structured Slack message via `$SLACK_WEBHOOK_URL` -- Exit code 0 = healthy, 1 = issues found, 2 = critical - -```bash -#!/usr/bin/env bash -# Cluster health check script for OpenClaw. -# Runs health checks, auto-fixes safe issues, posts to Slack. -# Designed to run inside the OpenClaw pod (has kubectl via $KUBECONFIG). -# -# Usage: ./cluster-health.sh [--no-slack] [--no-fix] -# --no-slack Skip Slack notification (useful for interactive/debug runs) -# --no-fix Skip auto-fix actions (report only) - -set -euo pipefail - -SEND_SLACK=true -AUTO_FIX=true -ISSUES=() -FIXES=() -WARNINGS=() - -# --- Argument parsing --- -for arg in "$@"; do - case "$arg" in - --no-slack) SEND_SLACK=false ;; - --no-fix) AUTO_FIX=false ;; - esac -done - -KUBECTL="kubectl" - -# --- 1. Node Health --- -check_nodes() { - local nodes not_ready - nodes=$($KUBECTL get nodes --no-headers 2>&1) || { ISSUES+=("Cannot reach cluster API"); return; } - not_ready=$(echo "$nodes" | awk '$2 != "Ready" {print $1}' || true) - - if [[ -n "$not_ready" ]]; then - while IFS= read -r node; do - ISSUES+=("Node NotReady: $node") - done <<< "$not_ready" - fi - - # Check conditions - local conditions - conditions=$($KUBECTL get nodes -o json | python3 -c ' -import json, sys -data = json.load(sys.stdin) -for node in data["items"]: - name = node["metadata"]["name"] - for c in node["status"]["conditions"]: - if c["type"] in ("MemoryPressure","DiskPressure","PIDPressure") and c["status"] == "True": - print(name + ": " + c["type"]) -' 2>/dev/null) || true - - if [[ -n "$conditions" ]]; then - while IFS= read -r line; do - ISSUES+=("$line") - done <<< "$conditions" - fi -} - -# --- 2. 
Pod Health --- -check_pods() { - local bad - bad=$( { - $KUBECTL get pods -A --no-headers 2>/dev/null \ - | grep -E 'CrashLoopBackOff|ImagePullBackOff|ErrImagePull|Error' || true - } | awk '!seen[$1,$2]++' | sed '/^$/d') || true - - if [[ -z "$bad" ]]; then return; fi - - while IFS= read -r line; do - local ns pod status - ns=$(echo "$line" | awk '{print $1}') - pod=$(echo "$line" | awk '{print $2}') - status=$(echo "$line" | awk '{print $4}') - - if [[ "$status" == "CrashLoopBackOff" ]]; then - # Check if stuck for >1 hour - local restart_count - restart_count=$(echo "$line" | awk '{print $5}') - if [[ "$AUTO_FIX" == true && "$restart_count" -gt 10 ]]; then - $KUBECTL delete pod -n "$ns" "$pod" --grace-period=30 2>/dev/null && \ - FIXES+=("Restarted $ns/$pod (CrashLoopBackOff, $restart_count restarts)") || \ - WARNINGS+=("Failed to restart $ns/$pod") - else - ISSUES+=("CrashLoopBackOff: $ns/$pod ($restart_count restarts)") - fi - elif [[ "$status" == "ImagePullBackOff" || "$status" == "ErrImagePull" ]]; then - ISSUES+=("ImagePullBackOff: $ns/$pod") - else - ISSUES+=("Error: $ns/$pod ($status)") - fi - done <<< "$bad" -} - -# --- 3. Evicted/Failed Pods --- -check_evicted() { - local evicted count - evicted=$($KUBECTL get pods -A --no-headers --field-selector=status.phase=Failed 2>/dev/null || true) - - if [[ -z "$evicted" ]]; then return; fi - count=$(echo "$evicted" | wc -l | tr -d ' ') - - if [[ "$AUTO_FIX" == true && "$count" -gt 0 ]]; then - $KUBECTL delete pods -A --field-selector=status.phase=Failed 2>/dev/null && \ - FIXES+=("Deleted $count evicted/failed pod(s)") || \ - WARNINGS+=("Failed to delete evicted pods") - else - ISSUES+=("$count evicted/failed pod(s)") - fi -} - -# --- 4. Failed Deployments --- -check_deployments() { - local deps - deps=$($KUBECTL get deployments -A --no-headers 2>/dev/null) || return - - while IFS= read -r line; do - local ns name ready current desired - ns=$(echo "$line" | awk '{print $1}') - name=$(echo "$line" | awk '{print $2}') - ready=$(echo "$line" | awk '{print $3}') - current=$(echo "$ready" | cut -d/ -f1) - desired=$(echo "$ready" | cut -d/ -f2) - - if [[ "$current" != "$desired" ]]; then - ISSUES+=("Deployment $ns/$name: $current/$desired ready") - fi - done <<< "$deps" -} - -# --- 5. Pending PVCs --- -check_pvcs() { - local pvcs - pvcs=$($KUBECTL get pvc -A --no-headers 2>/dev/null) || return - - if [[ -z "$pvcs" || "$pvcs" == *"No resources found"* ]]; then return; fi - - while IFS= read -r line; do - local ns name status - ns=$(echo "$line" | awk '{print $1}') - name=$(echo "$line" | awk '{print $2}') - status=$(echo "$line" | awk '{print $3}') - - if [[ "$status" != "Bound" ]]; then - ISSUES+=("PVC $ns/$name: $status") - fi - done <<< "$pvcs" -} - -# --- 6. Resource Pressure --- -check_resources() { - local top - top=$($KUBECTL top nodes --no-headers 2>/dev/null) || return - - while IFS= read -r line; do - local node cpu_pct mem_pct - node=$(echo "$line" | awk '{print $1}') - cpu_pct=$(echo "$line" | awk '{print $3}' | tr -d '%') - mem_pct=$(echo "$line" | awk '{print $5}' | tr -d '%') - - [[ "$cpu_pct" == *"unknown"* || "$mem_pct" == *"unknown"* ]] && continue - - if [[ "$cpu_pct" -gt 90 || "$mem_pct" -gt 90 ]]; then - ISSUES+=("High resource usage on $node: CPU ${cpu_pct}%, Mem ${mem_pct}%") - elif [[ "$cpu_pct" -gt 80 || "$mem_pct" -gt 80 ]]; then - WARNINGS+=("Elevated resource usage on $node: CPU ${cpu_pct}%, Mem ${mem_pct}%") - fi - done <<< "$top" -} - -# --- 7. 
CronJob Failures --- -check_cronjobs() { - local failures - failures=$($KUBECTL get jobs -A -o json 2>/dev/null | python3 -c ' -import json, sys -from datetime import datetime, timezone, timedelta - -data = json.load(sys.stdin) -cutoff = datetime.now(timezone.utc) - timedelta(hours=24) - -for job in data.get("items", []): - meta = job.get("metadata", {}) - ns = meta.get("namespace", "") - name = meta.get("name", "") - owners = meta.get("ownerReferences", []) - if not any(o.get("kind") == "CronJob" for o in owners): - continue - for c in job.get("status", {}).get("conditions", []): - if c.get("type") == "Failed" and c.get("status") == "True": - ts = c.get("lastTransitionTime", "") - if ts: - try: - t = datetime.fromisoformat(ts.replace("Z", "+00:00")) - if t > cutoff: - print(f"{ns}/{name}") - except: - print(f"{ns}/{name}") -' 2>/dev/null) || true - - if [[ -n "$failures" ]]; then - local count - count=$(echo "$failures" | wc -l | tr -d ' ') - ISSUES+=("$count CronJob failure(s) in last 24h") - fi -} - -# --- 8. DaemonSet Health --- -check_daemonsets() { - local ds - ds=$($KUBECTL get daemonsets -A --no-headers 2>/dev/null) || return - - while IFS= read -r line; do - local ns name desired ready - ns=$(echo "$line" | awk '{print $1}') - name=$(echo "$line" | awk '{print $2}') - desired=$(echo "$line" | awk '{print $3}') - ready=$(echo "$line" | awk '{print $5}') - - if [[ "$desired" != "$ready" ]]; then - ISSUES+=("DaemonSet $ns/$name: desired=$desired ready=$ready") - fi - done <<< "$ds" -} - -# --- Cluster summary stats --- -get_summary_stats() { - local node_count ready_count pod_count - node_count=$($KUBECTL get nodes --no-headers 2>/dev/null | wc -l | tr -d ' ') - ready_count=$($KUBECTL get nodes --no-headers 2>/dev/null | awk '$2 == "Ready"' | wc -l | tr -d ' ') - pod_count=$($KUBECTL get pods -A --no-headers --field-selector=status.phase=Running 2>/dev/null | wc -l | tr -d ' ') - echo "${ready_count}/${node_count} nodes | ${pod_count} pods running" -} - -# --- Send Slack message --- -send_slack() { - local webhook_url="$SLACK_WEBHOOK_URL" - if [[ -z "${webhook_url:-}" ]]; then - echo "WARNING: SLACK_WEBHOOK_URL not set, skipping Slack notification" - return - fi - - local summary issue_count fix_count warning_count - summary=$(get_summary_stats) - issue_count=${#ISSUES[@]} - fix_count=${#FIXES[@]} - warning_count=${#WARNINGS[@]} - - local text="" - local total_problems=$((issue_count + warning_count)) - - if [[ "$total_problems" -eq 0 && "$fix_count" -eq 0 ]]; then - text=":white_check_mark: *Cluster Health Check — All Clear*\n${summary} | 0 issues" - else - if [[ "$issue_count" -gt 0 ]]; then - text=":rotating_light: *Cluster Health Check — ${issue_count} Issue(s) Found*\n${summary}" - elif [[ "$warning_count" -gt 0 ]]; then - text=":warning: *Cluster Health Check — ${warning_count} Warning(s)*\n${summary}" - else - text=":white_check_mark: *Cluster Health Check — All Clear (auto-fixed ${fix_count})*\n${summary}" - fi - - if [[ "$fix_count" -gt 0 ]]; then - text+="\n\n*Auto-fixed:*" - for fix in "${FIXES[@]}"; do - text+="\n• ${fix}" - done - fi - - if [[ "$issue_count" -gt 0 ]]; then - text+="\n\n*Needs attention:*" - for issue in "${ISSUES[@]}"; do - text+="\n• ${issue}" - done - fi - - if [[ "$warning_count" -gt 0 ]]; then - text+="\n\n*Warnings:*" - for warning in "${WARNINGS[@]}"; do - text+="\n• ${warning}" - done - fi - fi - - curl -s -X POST "$webhook_url" \ - -H 'Content-Type: application/json' \ - -d "{\"text\": \"${text}\"}" > /dev/null 2>&1 -} - -# --- Main --- -main() { 
- echo "=== Cluster Health Check — $(date '+%Y-%m-%d %H:%M:%S') ===" - - check_nodes - check_pods - check_evicted - check_deployments - check_pvcs - check_resources - check_cronjobs - check_daemonsets - - local issue_count=${#ISSUES[@]} - local fix_count=${#FIXES[@]} - local warning_count=${#WARNINGS[@]} - - echo "" - echo "Results: ${issue_count} issue(s), ${fix_count} fix(es), ${warning_count} warning(s)" - - if [[ "$fix_count" -gt 0 ]]; then - echo "" - echo "Auto-fixed:" - for fix in "${FIXES[@]}"; do echo " - $fix"; done - fi - - if [[ "$issue_count" -gt 0 ]]; then - echo "" - echo "Issues:" - for issue in "${ISSUES[@]}"; do echo " - $issue"; done - fi - - if [[ "$warning_count" -gt 0 ]]; then - echo "" - echo "Warnings:" - for warning in "${WARNINGS[@]}"; do echo " - $warning"; done - fi - - if [[ "$SEND_SLACK" == true ]]; then - send_slack - echo "" - echo "Slack notification sent." - fi - - # Exit code - if [[ "$issue_count" -gt 0 ]]; then - exit 1 - fi - exit 0 -} - -main "$@" -``` - -**Step 2: Make it executable** - -```bash -chmod +x .claude/cluster-health.sh -``` - -**Step 3: Test locally (dry run)** - -```bash -KUBECONFIG=$(pwd)/config SLACK_WEBHOOK_URL="" bash .claude/cluster-health.sh --no-slack -``` - -Expected: Script runs, prints check results, no Slack post. - -**Step 4: Commit** - -```bash -git add .claude/cluster-health.sh -git commit -m "[ci skip] Add cluster health check script for OpenClaw agent" -``` - ---- - -### Task 3: Create the cluster-health skill - -**Files:** -- Create: `.claude/skills/cluster-health/SKILL.md` - -**Step 1: Write the skill document** - -```markdown ---- -name: cluster-health -description: | - Check Kubernetes cluster health and fix common issues. Use when: - (1) User asks to check the cluster, check health, or "what's wrong", - (2) User asks about pod status, node health, or deployment issues, - (3) User asks to fix stuck pods, evicted pods, or CrashLoopBackOff, - (4) User mentions "health check", "cluster status", "cluster health", - (5) User asks "is everything running" or "any problems". - Runs 8 standard K8s health checks with safe auto-fix for evicted pods - and stuck CrashLoopBackOff pods. -author: Claude Code -version: 1.0.0 -date: 2026-02-21 ---- - -# Cluster Health Check - -## Overview -- **Script**: `/workspace/infra/.claude/cluster-health.sh` -- **Schedule**: CronJob runs every 30 minutes, execs into this pod -- **Slack**: Posts results to `$SLACK_WEBHOOK_URL` -- **Auto-fix**: Deletes evicted pods, restarts CrashLoopBackOff pods (>10 restarts) - -## Quick Check - -Run the health check script: -```bash -bash /workspace/infra/.claude/cluster-health.sh --no-slack -``` - -Or with Slack notification: -```bash -bash /workspace/infra/.claude/cluster-health.sh -``` - -Report-only (no auto-fix): -```bash -bash /workspace/infra/.claude/cluster-health.sh --no-fix -``` - -## What It Checks - -| # | Check | Auto-Fix | Alert | -|---|-------|----------|-------| -| 1 | Node health (NotReady, conditions) | No | Yes | -| 2 | Pod health (CrashLoopBackOff, ImagePullBackOff, Error) | Restart if >10 restarts | Yes | -| 3 | Evicted/failed pods | Delete all | Yes | -| 4 | Deployment availability (current != desired) | No | Yes | -| 5 | PVC status (not Bound) | No | Yes | -| 6 | Resource pressure (CPU/Mem >80%) | No | Yes | -| 7 | CronJob failures (last 24h) | No | Yes | -| 8 | DaemonSet health (desired != ready) | No | Yes | - -## Safe Auto-Fix Rules - -These are the ONLY things the script auto-fixes: -1. 
**Evicted/failed pods**: `kubectl delete pods -A --field-selector=status.phase=Failed` -2. **CrashLoopBackOff pods with >10 restarts**: `kubectl delete pod -n --grace-period=30` - -Everything else is alert-only. NEVER auto-fix: -- Node NotReady (could be maintenance) -- ImagePullBackOff (needs image tag or registry fix) -- Pending PVCs (needs storage investigation) -- Failed deployments (needs config investigation) - -## Deep Investigation - -When the script reports issues and the user asks for more detail, use these commands: - -### Node issues -```bash -kubectl describe node -kubectl top node -kubectl get events --field-selector involvedObject.name= -``` - -### Pod issues -```bash -kubectl describe pod -n -kubectl logs -n --tail=100 -kubectl logs -n --previous --tail=100 -kubectl get events -n --field-selector involvedObject.name= -``` - -### Deployment issues -```bash -kubectl describe deployment -n -kubectl rollout status deployment -n -kubectl rollout history deployment -n -``` - -### PVC issues -```bash -kubectl describe pvc -n -kubectl get pv -kubectl get events -n --field-selector involvedObject.name= -``` - -### Resource pressure -```bash -kubectl top nodes -kubectl top pods -A --sort-by=memory | head -20 -kubectl top pods -A --sort-by=cpu | head -20 -``` - -## Common Remediation - -### CrashLoopBackOff (persistent) -1. Check logs: `kubectl logs -n --previous --tail=100` -2. Check events: `kubectl describe pod -n ` -3. Common causes: OOMKilled (increase memory limit), bad config, missing env var -4. If image issue: check if newer image exists, update in Terraform - -### OOMKilled -1. Check current limits: `kubectl describe pod -n | grep -A2 Limits` -2. Fix: Update resource limits in Terraform module for the service -3. Apply: `terraform apply -target=module.kubernetes_cluster.module. -var="kube_config_path=$(pwd)/config"` - -### ImagePullBackOff -1. Check image: `kubectl describe pod -n | grep Image` -2. Check registry: Is the image tag valid? Is the registry reachable? -3. Check pull-through cache: Docker registry at 10.0.20.10 - -### Node NotReady -1. Check kubelet: SSH to node, `systemctl status kubelet` -2. Check resources: `kubectl top node ` -3. Check conditions: `kubectl describe node | grep -A10 Conditions` - -## Slack Webhook - -Messages are posted to the webhook at `$SLACK_WEBHOOK_URL`. 
Format: -- All clear: green check + summary stats -- Issues found: red siren + list of issues + auto-fix actions taken -- Warnings only: yellow warning + elevated metrics - -## Infrastructure - -- **Terraform module**: `modules/kubernetes/openclaw/main.tf` -- **CronJob**: Runs in `openclaw` namespace every 30 min -- **Existing healthcheck**: `scripts/cluster_healthcheck.sh` (local-only, not for OpenClaw) -- **Repo path inside pod**: `/workspace/infra/` -``` - -**Step 2: Commit** - -```bash -git add .claude/skills/cluster-health/SKILL.md -git commit -m "[ci skip] Add cluster-health skill for OpenClaw agent" -``` - ---- - -### Task 4: Add CronJob and RBAC to Terraform - -**Files:** -- Modify: `modules/kubernetes/openclaw/main.tf` (append CronJob + ServiceAccount + Role + RoleBinding) - -**Step 1: Add CronJob resources** - -Append the following to `modules/kubernetes/openclaw/main.tf` after the `module "ingress"` block: - -```hcl -# --- CronJob: Scheduled cluster health check --- - -resource "kubernetes_service_account" "healthcheck" { - metadata { - name = "cluster-healthcheck" - namespace = kubernetes_namespace.openclaw.metadata[0].name - } -} - -resource "kubernetes_role" "healthcheck_exec" { - metadata { - name = "healthcheck-pod-exec" - namespace = kubernetes_namespace.openclaw.metadata[0].name - } - rule { - api_groups = [""] - resources = ["pods"] - verbs = ["get", "list"] - } - rule { - api_groups = [""] - resources = ["pods/exec"] - verbs = ["create"] - } -} - -resource "kubernetes_role_binding" "healthcheck_exec" { - metadata { - name = "healthcheck-pod-exec" - namespace = kubernetes_namespace.openclaw.metadata[0].name - } - subject { - kind = "ServiceAccount" - name = kubernetes_service_account.healthcheck.metadata[0].name - namespace = kubernetes_namespace.openclaw.metadata[0].name - } - role_ref { - api_group = "rbac.authorization.k8s.io" - kind = "Role" - name = kubernetes_role.healthcheck_exec.metadata[0].name - } -} - -resource "kubernetes_cron_job_v1" "cluster_healthcheck" { - metadata { - name = "cluster-healthcheck" - namespace = kubernetes_namespace.openclaw.metadata[0].name - labels = { - app = "cluster-healthcheck" - tier = var.tier - } - } - spec { - schedule = "*/30 * * * *" - concurrency_policy = "Forbid" - failed_jobs_history_limit = 3 - successful_jobs_history_limit = 3 - - job_template { - metadata { - labels = { - app = "cluster-healthcheck" - } - } - spec { - active_deadline_seconds = 300 - template { - metadata { - labels = { - app = "cluster-healthcheck" - } - } - spec { - service_account_name = kubernetes_service_account.healthcheck.metadata[0].name - restart_policy = "Never" - - container { - name = "healthcheck" - image = "bitnami/kubectl:1.34" - command = ["bash", "-c", <<-EOF - # Find the openclaw pod - POD=$(kubectl get pods -n openclaw -l app=openclaw -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) - if [ -z "$POD" ]; then - echo "ERROR: OpenClaw pod not found" - exit 1 - fi - echo "Executing health check in pod $POD..." 
- kubectl exec -n openclaw "$POD" -c openclaw -- bash /workspace/infra/.claude/cluster-health.sh - EOF - ] - - resources { - requests = { - cpu = "50m" - memory = "64Mi" - } - limits = { - memory = "128Mi" - } - } - } - } - } - } - } - } -} -``` - -**Step 2: Verify Terraform formatting** - -```bash -terraform fmt modules/kubernetes/openclaw/main.tf -``` - -**Step 3: Verify Terraform plan** - -```bash -terraform plan -target=module.kubernetes_cluster.module.openclaw -var="kube_config_path=$(pwd)/config" -``` - -Expected: Plan shows 4 new resources (ServiceAccount, Role, RoleBinding, CronJobV1). No destructive changes to existing resources. - -**Step 4: Commit** - -```bash -git add modules/kubernetes/openclaw/main.tf -git commit -m "[ci skip] Add cluster health check CronJob to OpenClaw module" -``` - ---- - -### Task 5: Deploy and verify - -**Step 1: Apply Terraform** - -```bash -terraform apply -target=module.kubernetes_cluster.module.openclaw -var="kube_config_path=$(pwd)/config" -auto-approve -``` - -**Step 2: Verify CronJob exists** - -```bash -kubectl --kubeconfig $(pwd)/config get cronjob -n openclaw -``` - -Expected: `cluster-healthcheck` with schedule `*/30 * * * *` - -**Step 3: Verify RBAC** - -```bash -kubectl --kubeconfig $(pwd)/config get serviceaccount,role,rolebinding -n openclaw -``` - -Expected: `cluster-healthcheck` SA, `healthcheck-pod-exec` role and rolebinding - -**Step 4: Trigger a manual run** - -```bash -kubectl --kubeconfig $(pwd)/config create job --from=cronjob/cluster-healthcheck healthcheck-manual-test -n openclaw -``` - -**Step 5: Check job output** - -```bash -kubectl --kubeconfig $(pwd)/config wait --for=condition=complete job/healthcheck-manual-test -n openclaw --timeout=120s -kubectl --kubeconfig $(pwd)/config logs job/healthcheck-manual-test -n openclaw -``` - -Expected: Health check output with results. If `SLACK_WEBHOOK_URL` is set, check Slack for the message. 
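-
-Once the CronJob has fired on its own schedule, the same output can be read from the most recent scheduled run. A sketch, relying on the `app=cluster-healthcheck` label set in the job template above (substitute the actual job name in the second command):
-
-```bash
-# List jobs spawned by the CronJob, oldest first, then read the newest one's logs
-kubectl --kubeconfig $(pwd)/config get jobs -n openclaw -l app=cluster-healthcheck --sort-by=.metadata.creationTimestamp
-kubectl --kubeconfig $(pwd)/config logs -n openclaw job/<latest-job-name>
-```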
- -**Step 6: Clean up test job** - -```bash -kubectl --kubeconfig $(pwd)/config delete job healthcheck-manual-test -n openclaw -``` - -**Step 7: Final commit** - -```bash -git add -A modules/kubernetes/openclaw/ .claude/skills/cluster-health/ .claude/cluster-health.sh -git commit -m "[ci skip] OpenClaw cluster health agent: script + skill + CronJob" -``` diff --git a/docs/plans/2026-02-22-terragrunt-migration-design.md b/docs/plans/2026-02-22-terragrunt-migration-design.md deleted file mode 100644 index 417db72e..00000000 --- a/docs/plans/2026-02-22-terragrunt-migration-design.md +++ /dev/null @@ -1,387 +0,0 @@ -# Terragrunt Migration Design - -**Date**: 2026-02-22 -**Status**: Approved - -## Problem - -The infrastructure repo has a monolithic Terraform setup: -- 15MB state file, 857 resources, 85+ service modules in a single root -- `terraform plan/apply` evaluates all modules even when targeting one service -- `null_resource.core_services` bottleneck blocks 73 services behind 12 core modules -- 150+ variables passed through root -> kubernetes_cluster -> individual services -- 3 providers (kubernetes, helm, proxmox) initialize on every run - -## Goals - -- **Speed**: Faster plan/apply by splitting state into independent stacks -- **Blast radius isolation**: Bad apply can't break unrelated services -- **DRY config**: Shared provider/backend configuration via Terragrunt -- **Proper DAG**: Full references between stacks (not hardcoded DNS strings) -- **Bootstrappable**: `terragrunt run-all apply` works from scratch -- **CI/CD**: Changed-stack detection in Drone CI - -## Architecture: Flat Stacks - -### Directory Structure - -``` -infra/ -├── terragrunt.hcl # Root config (providers, backend, common vars) -├── stacks/ -│ ├── infra/ # Proxmox VMs, templates, docker-registry -│ │ ├── terragrunt.hcl -│ │ └── main.tf -│ ├── platform/ # Core: traefik, metallb, redis, dbaas, authentik, etc. -│ │ ├── terragrunt.hcl -│ │ └── main.tf -│ ├── blog/ # One dir per user service -│ │ ├── terragrunt.hcl -│ │ └── main.tf -│ ├── immich/ -│ │ ├── terragrunt.hcl -│ │ └── main.tf -│ └── ... (~65 service dirs) -├── modules/ # UNCHANGED — existing modules stay where they are -│ ├── kubernetes/ -│ │ ├── ingress_factory/ -│ │ ├── setup_tls_secret/ -│ │ ├── blog/ -│ │ ├── immich/ -│ │ └── ... -│ ├── create-vm/ -│ └── create-template-vm/ -├── state/ # Per-stack state files -│ ├── infra/terraform.tfstate -│ ├── platform/terraform.tfstate -│ ├── blog/terraform.tfstate -│ └── ... -├── terraform.tfvars # UNCHANGED — encrypted secrets -├── secrets/ # UNCHANGED — TLS certs -├── main.tf # LEGACY — gradually emptied during migration -└── terraform.tfstate # LEGACY — gradually emptied during migration -``` - -Each stack has a thin `main.tf` wrapper that calls the existing module via -`source = "../../modules/kubernetes/"`. We do NOT use Terragrunt's -`terraform { source }` directive because our modules use relative paths -(`../ingress_factory`, `../setup_tls_secret`) that would break when Terragrunt -copies them to `.terragrunt-cache/`. 
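-
-As an illustration, a thin wrapper for the blog stack would look roughly like this; the exact variables the blog module expects are an assumption here, only the pattern matters:
-
-```hcl
-# stacks/blog/main.tf: thin wrapper around the unchanged module
-variable "tls_secret_name" {}
-
-module "blog" {
-  # Plain Terraform source: the module stays in place, so its own relative
-  # references (../ingress_factory, ../setup_tls_secret) keep resolving.
-  source          = "../../modules/kubernetes/blog"
-  tls_secret_name = var.tls_secret_name
-}
-```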
- -### Stack Composition - -**Infra stack** (~10 resources): -- Proxmox VM templates (k8s, non-k8s, docker-registry) -- Docker registry VM -- Uses proxmox provider (not kubernetes/helm) - -**Platform stack** (~200 resources, ~20 services): -- traefik, metallb, redis, dbaas, technitium, authentik, crowdsec, cloudflared -- monitoring (prometheus, alertmanager, grafana, loki, alloy) -- kyverno, metrics-server, nvidia, mailserver, authelia -- wireguard, headscale, xray, uptime-kuma, vaultwarden, reverse-proxy -- Exports outputs consumed by service stacks - -**Per-service stacks** (~65, each 5-25 resources): -- One stack per user-facing service -- Each depends on platform via Terragrunt `dependency` block -- Some depend on other services (f1-stream -> coturn, etc.) - -### Dependency Graph - -``` - ┌─────────┐ - │ infra │ - └────┬────┘ - │ - ┌────▼────┐ - │platform │ exports: redis_host, postgresql_host, - │ │ mysql_host, smtp_host, tls_secret_name, ... - └────┬────┘ - │ - ┌────────┬───────────┼───────────┬────────┐ - │ │ │ │ │ - ┌────▼──┐ ┌───▼───┐ ┌────▼───┐ ┌─────▼──┐ ┌──▼───┐ - │ blog │ │immich │ │ affine │ │ollama │ │coturn│ ... - └───────┘ └───────┘ └────────┘ └───┬────┘ └──┬───┘ - │ │ - ┌────▼───┐ ┌───▼──────┐ - │openclaw│ │f1-stream │ - │gramps │ └──────────┘ - │ytdlp │ - └────────┘ -``` - -### Platform Stack Outputs - -| Output | Value | Consumers | -|--------|-------|-----------| -| `redis_host` | `redis.redis.svc.cluster.local` | 10 services | -| `postgresql_host` | `postgresql.dbaas.svc.cluster.local` | 10 services | -| `postgresql_port` | `5432` | 10 services | -| `mysql_host` | `mysql.dbaas.svc.cluster.local` | 8 services | -| `mysql_port` | `3306` | 8 services | -| `smtp_host` | `mail.viktorbarzin.me` | 6 services | -| `smtp_port` | `587` | 6 services | -| `tls_secret_name` | from variable | all services | -| `authentik_outpost_url` | `http://ak-outpost-...` | traefik | -| `crowdsec_lapi_host` | `crowdsec-service...` | traefik | -| `alertmanager_url` | `http://prometheus-alertmanager...` | loki | -| `loki_push_url` | `http://loki...` | alloy | - -Service-to-service dependencies: - -| Service | Depends on | Outputs consumed | -|---------|-----------|-----------------| -| f1-stream | coturn | `coturn_host`, `coturn_port` | -| real-estate-crawler | osm-routing | `osrm_foot_host`, `osrm_bicycle_host` | -| openclaw, grampsweb, ytdlp | ollama | `ollama_host` | - -### Module Modifications - -Service modules that hardcode DNS names need modification to accept hosts as variables. -~20 modules affected. 
Example for affine: - -**Before:** -```hcl -# modules/kubernetes/affine/main.tf -DATABASE_URL = "postgresql://...@postgresql.dbaas.svc.cluster.local:5432/affine" -REDIS_SERVER_HOST = "redis.redis.svc.cluster.local" -``` - -**After:** -```hcl -variable "redis_host" { type = string } -variable "postgresql_host" { type = string } -variable "postgresql_port" { type = number } - -DATABASE_URL = "postgresql://...@${var.postgresql_host}:${var.postgresql_port}/affine" -REDIS_SERVER_HOST = var.redis_host -``` - -## Root Terragrunt Configuration - -```hcl -# infra/terragrunt.hcl - -remote_state { - backend = "local" - generate = { - path = "backend.tf" - if_exists = "overwrite_terragrunt" - } - config = { - path = "${get_repo_root()}/state/${path_relative_to_include()}/terraform.tfstate" - } -} - -terraform { - extra_arguments "common_vars" { - commands = get_terraform_commands_that_need_vars() - required_var_files = [ - "${get_repo_root()}/terraform.tfvars" - ] - } -} - -generate "k8s_providers" { - path = "providers.tf" - if_exists = "overwrite_terragrunt" - contents = < coturn) - -```hcl -# stacks/f1-stream/terragrunt.hcl -include "root" { - path = find_in_parent_folders() -} - -dependency "platform" { - config_path = "../platform" -} - -dependency "coturn" { - config_path = "../coturn" -} - -inputs = { - tls_secret_name = dependency.platform.outputs.tls_secret_name - coturn_host = dependency.coturn.outputs.coturn_host - coturn_port = dependency.coturn.outputs.coturn_port -} -``` - -## Migration Strategy - -### Phase 0: Setup -- Install Terragrunt -- Create root `terragrunt.hcl`, `stacks/`, `state/` directories -- No state changes, no risk - -### Phase 1: Infra Stack (VMs) -- Create `stacks/infra/` with Proxmox provider + VM module calls -- `terraform state mv` 4 root-level module resources to `state/infra/` -- Remove from root `main.tf` -- Verify: `cd stacks/infra && terragrunt plan` shows no changes - -### Phase 2: Platform Stack (Core Services) -- Create `stacks/platform/main.tf` with ~20 core services + outputs -- `terraform state mv` ~200 resources from `module.kubernetes_cluster.module.` -- Remove `null_resource.core_services` (Terragrunt handles ordering) -- Verify: `cd stacks/platform && terragrunt plan` shows no changes - -### Phase 3: Simple Services (No DB Dependencies) -- blog, echo, privatebin, excalidraw, city-guesser, dashy, etc. -- Create stack, move state, verify — one at a time - -### Phase 4: Database-Backed Services -- Modify modules to accept hosts as variables -- affine, immich, linkwarden, nextcloud, grampsweb, etc. -- Create stack, move state, verify - -### Phase 5: Service-to-Service Dependencies -- ollama -> openclaw, grampsweb, ytdlp -- coturn -> f1-stream -- osm-routing -> real-estate-crawler - -### Phase 6: Cleanup -- Delete DEFCON system from `modules/kubernetes/main.tf` -- Delete legacy `terraform.tfstate` -- Delete root `main.tf` kubernetes_cluster module call -- Update CI/CD to Terragrunt - -### Rollback -At any phase, `terraform state mv` resources back to monolith state and -restore module calls. 
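-
-A sketch of the rollback for the infra stack, mirroring the forward move (the resource addresses depend on which phase is being unwound):
-
-```bash
-# Move the stack's resources back into the monolith state...
-terraform state mv \
-  -state=state/infra/terraform.tfstate \
-  -state-out=terraform.tfstate \
-  'module.docker-registry-vm' 'module.docker-registry-vm'
-# ...then restore the corresponding module blocks in the root main.tf
-# (revert the commit that removed them) and re-run terraform plan.
-```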
- -## CI/CD: Changed-Stack Detection - -Drone CI pipeline detects changed files per commit and maps to affected stacks: - -| Changed file | Affected stack | -|-------------|---------------| -| `stacks/blog/*` | blog | -| `modules/kubernetes/blog/*` | blog | -| `terraform.tfvars` | all stacks | -| `terragrunt.hcl` | all stacks | -| `modules/kubernetes/ingress_factory/*` | all stacks | - -### Manual Workflow - -```bash -# Apply single service -cd stacks/blog && terragrunt apply - -# Apply everything (respects DAG ordering) -cd stacks && terragrunt run-all apply - -# Plan everything -cd stacks && terragrunt run-all plan -``` - -## Decisions Made - -| Decision | Choice | Rationale | -|----------|--------|-----------| -| Tool | Terragrunt | DRY config, dependency management, run-all orchestration | -| Stack granularity | 1 platform + 1 per service | Max isolation for apps, grouped core | -| Migration | Incremental | Lower risk, verify each step | -| Shared modules | Relative paths | Simple, no registry overhead | -| State backend | Local files | No external dependencies | -| Cross-stack refs | Full references via outputs | Proper DAG, bootstrappable from scratch | -| CI/CD | Changed-stack detection | Only apply what changed | diff --git a/docs/plans/2026-02-22-terragrunt-migration-plan.md b/docs/plans/2026-02-22-terragrunt-migration-plan.md deleted file mode 100644 index fd1e9446..00000000 --- a/docs/plans/2026-02-22-terragrunt-migration-plan.md +++ /dev/null @@ -1,1235 +0,0 @@ -# Terragrunt Migration Implementation Plan - -> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. - -**Goal:** Migrate the monolithic Terraform setup (857 resources, 15MB state) to Terragrunt with per-service state isolation, proper DAG dependencies, and changed-stack CI/CD detection. - -**Architecture:** Flat stacks under `stacks/` with thin `main.tf` wrappers calling existing modules. Root `terragrunt.hcl` provides DRY provider/backend config. Platform stack groups ~20 core services and exports outputs (redis_host, postgresql_host, etc.) consumed by ~65 per-service stacks via Terragrunt `dependency` blocks. - -**Tech Stack:** Terragrunt, Terraform 1.14.x, local state backend, Drone CI - -**Design Doc:** `docs/plans/2026-02-22-terragrunt-migration-design.md` - ---- - -## Task 1: Install Terragrunt and Create Directory Skeleton - -**Files:** -- Create: `stacks/` directory -- Create: `state/` directory -- Create: `.gitignore` updates - -**Step 1: Install Terragrunt** - -Run: -```bash -brew install terragrunt -``` -Expected: Terragrunt available at `terragrunt --version` - -**Step 2: Create directory skeleton** - -Run: -```bash -mkdir -p stacks/{infra,platform} -mkdir -p state -``` - -**Step 3: Update `.gitignore`** - -Add to `.gitignore`: -``` -# Terragrunt -.terragrunt-cache/ -state/ -``` - -The `state/` directory contains per-stack terraform state files. These are local-only and should not be committed (they contain resource IDs and potentially sensitive data, same as the current `terraform.tfstate`). - -**Step 4: Commit** - -```bash -git add stacks/ .gitignore -git commit -m "[ci skip] Add Terragrunt directory skeleton" -``` - ---- - -## Task 2: Create Root Terragrunt Configuration - -**Files:** -- Create: `terragrunt.hcl` - -**Step 1: Write root terragrunt.hcl** - -```hcl -# Root Terragrunt configuration -# Provides DRY provider, backend, and variable loading for all stacks. 
- -# Each stack gets its own local state file under state// -remote_state { - backend = "local" - generate = { - path = "backend.tf" - if_exists = "overwrite_terragrunt" - } - config = { - path = "${get_repo_root()}/state/${path_relative_to_include()}/terraform.tfstate" - } -} - -# Load terraform.tfvars for all stacks. -# Variables not declared by a stack are silently ignored (Terraform 1.x behavior). -terraform { - extra_arguments "common_vars" { - commands = get_terraform_commands_that_need_vars() - required_var_files = [ - "${get_repo_root()}/terraform.tfvars" - ] - } - - extra_arguments "kube_config" { - commands = get_terraform_commands_that_need_vars() - arguments = [ - "-var", "kube_config_path=${get_repo_root()}/config" - ] - } -} - -# Generate kubernetes + helm providers for K8s stacks. -# The infra stack overrides this to add the proxmox provider. -generate "k8s_providers" { - path = "providers.tf" - if_exists = "overwrite_terragrunt" - contents = <&1 | head -5 -``` -Expected: No parse errors (may show warnings about missing main.tf, that's fine) - -**Step 3: Commit** - -```bash -git add terragrunt.hcl -git commit -m "[ci skip] Add root Terragrunt configuration" -``` - ---- - -## Task 3: Create Infra Stack (Proxmox VMs) - -**Files:** -- Create: `stacks/infra/terragrunt.hcl` -- Create: `stacks/infra/main.tf` - -**Step 1: Write infra terragrunt.hcl** - -This stack needs the proxmox provider instead of (or in addition to) the default k8s providers. - -```hcl -# stacks/infra/terragrunt.hcl -include "root" { - path = find_in_parent_folders() -} - -# Override provider generation to include proxmox -generate "providers" { - path = "providers.tf" - if_exists = "overwrite_terragrunt" - contents = < /etc/containerd/certs.d/docker.io/hosts.toml - mkdir -p /etc/containerd/certs.d/ghcr.io - printf 'server = "https://ghcr.io"\n\n[host."http://10.0.20.10:5010"]\n capabilities = ["pull", "resolve"]\n' > /etc/containerd/certs.d/ghcr.io/hosts.toml - mkdir -p /etc/containerd/certs.d/quay.io - printf 'server = "https://quay.io"\n\n[host."http://10.0.20.10:5020"]\n capabilities = ["pull", "resolve"]\n' > /etc/containerd/certs.d/quay.io/hosts.toml - mkdir -p /etc/containerd/certs.d/registry.k8s.io - printf 'server = "https://registry.k8s.io"\n\n[host."http://10.0.20.10:5030"]\n capabilities = ["pull", "resolve"]\n' > /etc/containerd/certs.d/registry.k8s.io/hosts.toml - mkdir -p /etc/containerd/certs.d/reg.kyverno.io - printf 'server = "https://reg.kyverno.io"\n\n[host."http://10.0.20.10:5040"]\n capabilities = ["pull", "resolve"]\n' > /etc/containerd/certs.d/reg.kyverno.io/hosts.toml - sed -i 's/.*max_concurrent_downloads = 3/max_concurrent_downloads = 20/g' /etc/containerd/config.toml - sudo sed -i '/serializeImagePulls:/d' /var/lib/kubelet/config.yaml && \ - sudo sed -i '/maxParallelImagePulls:/d' /var/lib/kubelet/config.yaml && \ - echo -e 'serializeImagePulls: false\nmaxParallelImagePulls: 50' | sudo tee -a /var/lib/kubelet/config.yaml - EOF - k8s_join_command = var.k8s_join_command -} - -module "non-k8s-node-template" { - source = "../../modules/create-template-vm" - proxmox_host = var.proxmox_host - proxmox_user = "root" - - ssh_private_key = var.ssh_private_key - ssh_public_key = var.ssh_public_key - - cloud_image_url = local.cloud_init_image_url - image_path = local.non_k8s_cloud_init_image_path - template_id = 1000 - template_name = local.non_k8s_vm_template - user_passwd = var.vm_wizard_password - - is_k8s_template = false - snippet_name = local.non_k8s_cloud_init_snippet_name -} - -module 
"docker-registry-template" { - source = "../../modules/create-template-vm" - proxmox_host = var.proxmox_host - proxmox_user = "root" - - ssh_private_key = var.ssh_private_key - ssh_public_key = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIDHLhYDfyx237eJgOGVoJRECpUS95+7rEBS9vacsIxtx devvm" - - cloud_image_url = local.cloud_init_image_url - image_path = local.non_k8s_cloud_init_image_path - template_id = 1001 - template_name = "docker-registry-template" - - user_passwd = var.vm_wizard_password - is_k8s_template = false - snippet_name = "docker-registry.yaml" - - provision_cmds = [ - "mkdir -p /etc/docker-registry", - format("echo %s | base64 -d > /etc/docker-registry/config.yml", - base64encode( - templatefile("${path.root}/../../modules/docker-registry/config.yaml", { - password = var.dockerhub_registry_password - }) - ) - ), - # ... (copy remaining provision_cmds from main.tf lines 305-371) - ] -} - -module "docker-registry-vm" { - source = "../../modules/create-vm" - vmid = 220 - vm_cpus = 4 - vm_mem_mb = 4196 - vm_disk_size = "64G" - template_name = "docker-registry-template" - vm_name = "docker-registry" - cisnippet_name = "docker-registry.yaml" - vm_mac_address = "DE:AD:BE:EF:22:22" - bridge = "vmbr1" - vlan_tag = "20" - ipconfig0 = "ip=10.0.20.10/24,gw=10.0.20.1" -} -``` - -**Note:** The `provision_cmds` for docker-registry-template is long (~60 lines). Copy it exactly from the current `main.tf` lines 296-371. The only change is `templatefile` paths: prefix with `${path.root}/../../` since the working directory is now `stacks/infra/`. - -**Step 3: Verify with init (do NOT apply yet)** - -Run: -```bash -cd stacks/infra && terragrunt init -``` -Expected: Successful init, providers downloaded - -**Step 4: Commit** - -```bash -git add stacks/infra/ -git commit -m "[ci skip] Add infra stack (Proxmox VMs)" -``` - ---- - -## Task 4: Migrate Infra Stack State - -**CRITICAL: This task modifies live state. Take a backup first.** - -**Step 1: Backup current state** - -Run: -```bash -cp terraform.tfstate terraform.tfstate.backup-pre-terragrunt -``` - -**Step 2: List current infra resources in state** - -Run: -```bash -terraform state list | grep -E '^module\.(k8s-node-template|non-k8s-node-template|docker-registry-template|docker-registry-vm)\.' -``` -Expected: List of ~10 resources belonging to these 4 modules - -**Step 3: Move resources to new state file** - -For each resource listed in step 2, run: -```bash -mkdir -p state/infra -terraform state mv \ - -state=terraform.tfstate \ - -state-out=state/infra/terraform.tfstate \ - 'module.k8s-node-template' 'module.k8s-node-template' -terraform state mv \ - -state=terraform.tfstate \ - -state-out=state/infra/terraform.tfstate \ - 'module.non-k8s-node-template' 'module.non-k8s-node-template' -terraform state mv \ - -state=terraform.tfstate \ - -state-out=state/infra/terraform.tfstate \ - 'module.docker-registry-template' 'module.docker-registry-template' -terraform state mv \ - -state=terraform.tfstate \ - -state-out=state/infra/terraform.tfstate \ - 'module.docker-registry-vm' 'module.docker-registry-vm' -``` - -**Step 4: Verify no changes in new state** - -Run: -```bash -cd stacks/infra && terragrunt plan -``` -Expected: `No changes. Infrastructure is up-to-date.` - -If there are changes, something went wrong — restore from backup and investigate. 
- -**Step 5: Remove infra modules from root main.tf** - -Remove (or comment out) the `module.k8s-node-template`, `module.non-k8s-node-template`, `module.docker-registry-template`, and `module.docker-registry-vm` blocks from `main.tf` (lines 208-400). - -Also remove the corresponding `locals` block (lines 196-206) since they're now in `stacks/infra/main.tf`. - -**Step 6: Verify legacy state is clean** - -Run: -```bash -terraform plan -var="kube_config_path=$(pwd)/config" -``` -Expected: No changes (the moved resources are gone from this state but also from main.tf) - -**Step 7: Commit** - -```bash -git add main.tf stacks/infra/ -git commit -m "[ci skip] Migrate infra stack (VMs) to Terragrunt" -``` - ---- - -## Task 5: Create Platform Stack - -**Files:** -- Create: `stacks/platform/terragrunt.hcl` -- Create: `stacks/platform/main.tf` - -This is the largest task — it groups ~20 core services into one stack. - -**Step 1: Write platform terragrunt.hcl** - -```hcl -# stacks/platform/terragrunt.hcl -include "root" { - path = find_in_parent_folders() -} - -dependency "infra" { - config_path = "../infra" - skip_outputs = true -} -``` - -**Step 2: Write platform main.tf** - -This file contains all core/cluster service module calls. Copy each from `modules/kubernetes/main.tf`, adjusting `source` paths from `"./"` to `"../../modules/kubernetes/"`. Remove `for_each` conditionals (core services are always present). Remove `depends_on = [null_resource.core_services]`. - -Platform services (from `modules/kubernetes/main.tf`): - -```hcl -# stacks/platform/main.tf - -# Variables — declare all variables needed by platform services -variable "kube_config_path" { default = "~/.kube/config" } -variable "tls_secret_name" {} -variable "prod" { default = false } - -# dbaas vars -variable "dbaas_root_password" {} -variable "dbaas_postgresql_root_password" {} -variable "dbaas_pgadmin_password" {} - -# traefik vars -variable "ingress_crowdsec_api_key" {} - -# technitium vars -variable "technitium_db_password" {} -variable "homepage_credentials" { type = map(any) } - -# headscale vars -variable "headscale_config" {} -variable "headscale_acl" {} - -# authentik vars -variable "authentik_secret_key" {} -variable "authentik_postgres_password" {} -variable "k8s_users" { type = map(any); default = {} } -variable "ssh_private_key" { type = string; default = ""; sensitive = true } - -# crowdsec vars -variable "crowdsec_enroll_key" { type = string } -variable "crowdsec_db_password" { type = string } -variable "crowdsec_dash_api_key" { type = string } -variable "crowdsec_dash_machine_id" { type = string } -variable "crowdsec_dash_machine_password" { type = string } -variable "alertmanager_slack_api_url" {} - -# cloudflared vars -variable "cloudflare_api_key" {} -variable "cloudflare_email" {} -variable "cloudflare_account_id" {} -variable "cloudflare_zone_id" {} -variable "cloudflare_tunnel_id" {} -variable "public_ip" {} -variable "cloudflare_proxied_names" {} -variable "cloudflare_non_proxied_names" {} -variable "cloudflare_tunnel_token" {} - -# monitoring vars -variable "alertmanager_account_password" {} -variable "idrac_username" { default = "" } -variable "idrac_password" { default = "" } -variable "tiny_tuya_service_secret" { type = string } -variable "haos_api_token" { type = string } -variable "pve_password" { type = string } -variable "grafana_db_password" { type = string } -variable "grafana_admin_password" { type = string } - -# vaultwarden vars -variable "vaultwarden_smtp_password" {} - -# reverse-proxy vars 
(homepage tokens are in homepage_credentials) - -# wireguard vars -variable "wireguard_wg_0_conf" {} -variable "wireguard_wg_0_key" {} -variable "wireguard_firewall_sh" {} - -# xray vars -variable "xray_reality_clients" { type = list(map(string)) } -variable "xray_reality_private_key" { type = string } -variable "xray_reality_short_ids" { type = list(string) } - -# nvidia vars (none beyond tls_secret_name + tier) - -# mailserver vars -variable "mailserver_accounts" {} -variable "mailserver_aliases" {} -variable "mailserver_opendkim_key" {} -variable "mailserver_sasl_passwd" {} -variable "mailserver_roundcubemail_db_password" { type = string } - -# infra-maintenance vars -variable "webhook_handler_git_user" {} -variable "webhook_handler_git_token" {} -variable "technitium_username" {} -variable "technitium_password" {} - -# uptime-kuma (no extra vars) -# metrics-server (no extra vars) -# kyverno (no extra vars) - -locals { - tiers = { - core = "0-core" - cluster = "1-cluster" - gpu = "2-gpu" - edge = "3-edge" - aux = "4-aux" - } -} - -# --- Core Services (no dependencies, deployed first) --- - -module "metallb" { - source = "../../modules/kubernetes/metallb" - tier = local.tiers.core -} - -module "dbaas" { - source = "../../modules/kubernetes/dbaas" - prod = var.prod - tls_secret_name = var.tls_secret_name - dbaas_root_password = var.dbaas_root_password - postgresql_root_password = var.dbaas_postgresql_root_password - pgadmin_password = var.dbaas_pgadmin_password - tier = local.tiers.cluster -} - -module "redis" { - source = "../../modules/kubernetes/redis" - tls_secret_name = var.tls_secret_name - tier = local.tiers.cluster -} - -module "traefik" { - source = "../../modules/kubernetes/traefik" - tier = local.tiers.core - crowdsec_api_key = var.ingress_crowdsec_api_key - tls_secret_name = var.tls_secret_name -} - -module "technitium" { - source = "../../modules/kubernetes/technitium" - tls_secret_name = var.tls_secret_name - homepage_token = var.homepage_credentials["technitium"]["token"] - technitium_db_password = var.technitium_db_password - tier = local.tiers.core -} - -module "headscale" { - source = "../../modules/kubernetes/headscale" - tls_secret_name = var.tls_secret_name - headscale_config = var.headscale_config - headscale_acl = var.headscale_acl - tier = local.tiers.core -} - -module "authentik" { - source = "../../modules/kubernetes/authentik" - tier = local.tiers.cluster - tls_secret_name = var.tls_secret_name - secret_key = var.authentik_secret_key - postgres_password = var.authentik_postgres_password -} - -module "rbac" { - source = "../../modules/kubernetes/rbac" - tier = local.tiers.cluster - tls_secret_name = var.tls_secret_name - k8s_users = var.k8s_users - ssh_private_key = var.ssh_private_key -} - -module "k8s-portal" { - source = "../../modules/kubernetes/k8s-portal" - tier = local.tiers.edge - tls_secret_name = var.tls_secret_name -} - -module "crowdsec" { - source = "../../modules/kubernetes/crowdsec" - tier = local.tiers.cluster - tls_secret_name = var.tls_secret_name - homepage_username = var.homepage_credentials["crowdsec"]["username"] - homepage_password = var.homepage_credentials["crowdsec"]["password"] - enroll_key = var.crowdsec_enroll_key - db_password = var.crowdsec_db_password - crowdsec_dash_api_key = var.crowdsec_dash_api_key - crowdsec_dash_machine_id = var.crowdsec_dash_machine_id - crowdsec_dash_machine_password = var.crowdsec_dash_machine_password - slack_webhook_url = var.alertmanager_slack_api_url -} - -module "cloudflared" { - source = 
"../../modules/kubernetes/cloudflared" - tier = local.tiers.core - tls_secret_name = var.tls_secret_name - cloudflare_api_key = var.cloudflare_api_key - cloudflare_email = var.cloudflare_email - cloudflare_account_id = var.cloudflare_account_id - cloudflare_zone_id = var.cloudflare_zone_id - cloudflare_tunnel_id = var.cloudflare_tunnel_id - public_ip = var.public_ip - cloudflare_proxied_names = var.cloudflare_proxied_names - cloudflare_non_proxied_names = var.cloudflare_non_proxied_names - cloudflare_tunnel_token = var.cloudflare_tunnel_token -} - -module "monitoring" { - source = "../../modules/kubernetes/monitoring" - tls_secret_name = var.tls_secret_name - alertmanager_account_password = var.alertmanager_account_password - idrac_username = var.idrac_username - idrac_password = var.idrac_password - alertmanager_slack_api_url = var.alertmanager_slack_api_url - tiny_tuya_service_secret = var.tiny_tuya_service_secret - haos_api_token = var.haos_api_token - pve_password = var.pve_password - grafana_db_password = var.grafana_db_password - grafana_admin_password = var.grafana_admin_password - tier = local.tiers.cluster -} - -module "vaultwarden" { - source = "../../modules/kubernetes/vaultwarden" - tls_secret_name = var.tls_secret_name - smtp_password = var.vaultwarden_smtp_password - tier = local.tiers.edge -} - -module "reverse-proxy" { - source = "../../modules/kubernetes/reverse_proxy" - tls_secret_name = var.tls_secret_name - truenas_homepage_token = var.homepage_credentials["reverse_proxy"]["truenas_token"] - pfsense_homepage_token = var.homepage_credentials["reverse_proxy"]["pfsense_token"] -} - -module "metrics-server" { - source = "../../modules/kubernetes/metrics-server" - tier = local.tiers.cluster - tls_secret_name = var.tls_secret_name -} - -module "nvidia" { - source = "../../modules/kubernetes/nvidia" - tls_secret_name = var.tls_secret_name - tier = local.tiers.gpu -} - -module "kyverno" { - source = "../../modules/kubernetes/kyverno" -} - -module "uptime-kuma" { - source = "../../modules/kubernetes/uptime-kuma" - tls_secret_name = var.tls_secret_name - tier = local.tiers.cluster -} - -module "wireguard" { - source = "../../modules/kubernetes/wireguard" - tls_secret_name = var.tls_secret_name - wg_0_conf = var.wireguard_wg_0_conf - wg_0_key = var.wireguard_wg_0_key - firewall_sh = var.wireguard_firewall_sh - tier = local.tiers.core -} - -module "xray" { - source = "../../modules/kubernetes/xray" - tls_secret_name = var.tls_secret_name - tier = local.tiers.core - xray_reality_clients = var.xray_reality_clients - xray_reality_private_key = var.xray_reality_private_key - xray_reality_short_ids = var.xray_reality_short_ids -} - -module "mailserver" { - source = "../../modules/kubernetes/mailserver" - tls_secret_name = var.tls_secret_name - mailserver_accounts = var.mailserver_accounts - postfix_account_aliases = var.mailserver_aliases - opendkim_key = var.mailserver_opendkim_key - sasl_passwd = var.mailserver_sasl_passwd - roundcube_db_password = var.mailserver_roundcubemail_db_password - tier = local.tiers.edge -} - -module "infra-maintenance" { - source = "../../modules/kubernetes/infra-maintenance" - git_user = var.webhook_handler_git_user - git_token = var.webhook_handler_git_token - technitium_username = var.technitium_username - technitium_password = var.technitium_password -} - -# --- OUTPUTS (consumed by service stacks via Terragrunt dependency) --- - -output "tls_secret_name" { value = var.tls_secret_name } -output "redis_host" { value = "redis.redis.svc.cluster.local" } 
-output "postgresql_host" { value = "postgresql.dbaas.svc.cluster.local" } -output "postgresql_port" { value = 5432 } -output "mysql_host" { value = "mysql.dbaas.svc.cluster.local" } -output "mysql_port" { value = 3306 } -output "smtp_host" { value = "mail.viktorbarzin.me" } -output "smtp_port" { value = 587 } -``` - -**Step 3: Verify init succeeds** - -Run: -```bash -cd stacks/platform && terragrunt init -``` - -**Step 4: Commit** - -```bash -git add stacks/platform/ -git commit -m "[ci skip] Add platform stack (core services)" -``` - ---- - -## Task 6: Migrate Platform Stack State - -**CRITICAL: Largest state migration. Backup first.** - -**Step 1: Backup** - -```bash -cp terraform.tfstate terraform.tfstate.backup-pre-platform -``` - -**Step 2: Move core service resources** - -The resources are currently at `module.kubernetes_cluster.module.[""]` (the `for_each` key). Services without `for_each` are at `module.kubernetes_cluster.module.`. - -Run state mv for each platform service. Example pattern: -```bash -# Services WITH for_each (note the ["key"] suffix): -terraform state mv \ - -state=terraform.tfstate \ - -state-out=state/platform/terraform.tfstate \ - 'module.kubernetes_cluster.module.redis["redis"]' \ - 'module.redis' - -terraform state mv \ - -state=terraform.tfstate \ - -state-out=state/platform/terraform.tfstate \ - 'module.kubernetes_cluster.module.traefik["traefik"]' \ - 'module.traefik' - -# Services WITHOUT for_each: -terraform state mv \ - -state=terraform.tfstate \ - -state-out=state/platform/terraform.tfstate \ - 'module.kubernetes_cluster.module.metallb' \ - 'module.metallb' - -terraform state mv \ - -state=terraform.tfstate \ - -state-out=state/platform/terraform.tfstate \ - 'module.kubernetes_cluster.module.dbaas' \ - 'module.dbaas' - -terraform state mv \ - -state=terraform.tfstate \ - -state-out=state/platform/terraform.tfstate \ - 'module.kubernetes_cluster.module.cloudflared' \ - 'module.cloudflared' - -terraform state mv \ - -state=terraform.tfstate \ - -state-out=state/platform/terraform.tfstate \ - 'module.kubernetes_cluster.module.infra-maintenance' \ - 'module.infra-maintenance' - -terraform state mv \ - -state=terraform.tfstate \ - -state-out=state/platform/terraform.tfstate \ - 'module.kubernetes_cluster.module.reverse-proxy["reverse-proxy"]' \ - 'module.reverse-proxy' -``` - -Repeat for all platform services. Check whether each has `for_each` by looking at the state list: -```bash -terraform state list | grep 'module.kubernetes_cluster.module' | sort -``` - -Services with `for_each` have `["key"]` suffix; those without don't. - -**Step 3: Also move null_resource.core_services** - -```bash -# This resource can be dropped — don't move it, just remove it -terraform state rm 'module.kubernetes_cluster.null_resource.core_services' -``` - -**Step 4: Verify platform state** - -Run: -```bash -cd stacks/platform && terragrunt plan -``` -Expected: `No changes.` (or only expected diffs from removed for_each wrappers) - -**Step 5: Remove platform services from modules/kubernetes/main.tf** - -Remove the module blocks for all services that moved to the platform stack. Also remove `null_resource.core_services` and the `defcon_modules`/`active_modules` locals that reference these modules. 
- -**Step 6: Verify legacy state** - -Run: -```bash -terraform plan -var="kube_config_path=$(pwd)/config" -``` -Expected: No changes for remaining services - -**Step 7: Commit** - -```bash -git add main.tf modules/kubernetes/main.tf stacks/platform/ -git commit -m "[ci skip] Migrate platform stack (core services) to Terragrunt" -``` - ---- - -## Task 7: Create Simple Service Stack Template + Migrate First Service (blog) - -**Files:** -- Create: `stacks/blog/terragrunt.hcl` -- Create: `stacks/blog/main.tf` - -**Step 1: Write blog terragrunt.hcl** - -```hcl -# stacks/blog/terragrunt.hcl -include "root" { - path = find_in_parent_folders() -} - -dependency "platform" { - config_path = "../platform" -} - -inputs = { - tls_secret_name = dependency.platform.outputs.tls_secret_name -} -``` - -**Step 2: Write blog main.tf** - -```hcl -# stacks/blog/main.tf -variable "tls_secret_name" {} -variable "kube_config_path" { default = "~/.kube/config" } - -module "blog" { - source = "../../modules/kubernetes/blog" - tls_secret_name = var.tls_secret_name - tier = "4-aux" -} -``` - -**Step 3: Move blog state** - -```bash -terraform state mv \ - -state=terraform.tfstate \ - -state-out=state/blog/terraform.tfstate \ - 'module.kubernetes_cluster.module.blog["blog"]' \ - 'module.blog' -``` - -**Step 4: Verify** - -```bash -cd stacks/blog && terragrunt plan -``` -Expected: `No changes.` - -**Step 5: Remove blog from modules/kubernetes/main.tf** - -Delete the `module "blog" { ... }` block (lines 197-205). - -**Step 6: Commit** - -```bash -git add stacks/blog/ modules/kubernetes/main.tf -git commit -m "[ci skip] Migrate blog to Terragrunt stack" -``` - ---- - -## Task 8: Batch-Migrate Remaining Simple Services - -Simple services only need `tls_secret_name` (and possibly a few non-DB variables). These follow the exact same pattern as blog. - -**Simple services to migrate** (one stack each): -- echo, privatebin, excalidraw, city-guesser, dashy, travel_blog, jsoncrack, cyberchef, stirling-pdf, networking-toolbox, meshcentral, ntfy, plotting-book, reloader, descheduler, homepage, tor-proxy, forgejo, freshrss, navidrome, audiobookshelf, ebook2audiobook, whisper, frigate, matrix, changedetection, isponsorblocktv - -**Services with a few extra variables** (still no DB host refs): -- shadowsocks (password), kms, hackmd (db_password), drone (github creds, rpc_secret), diun (nfty_token, slack_url), calibre (homepage creds), owntracks (credentials), webhook_handler (many tokens), coturn (turn_secret, public_ip), wealthfolio (password_hash), actualbudget (credentials), servarr (aiostreams), onlyoffice (db_password, jwt_token), xray (reality vars), tuya-bridge (api keys), openclaw (ssh_key, api keys), f1-stream (turn_secret), paperless-ngx (db_password), freedify (credentials), netbox - -For each service, create: -1. `stacks//terragrunt.hcl` — include root, dependency on platform, inputs from platform outputs -2. `stacks//main.tf` — variable declarations + module call with `source = "../../modules/kubernetes/"` -3. `terraform state mv` from legacy state -4. Remove module block from `modules/kubernetes/main.tf` -5. 
Verify with `terragrunt plan` - -**Automation script** (run for each simple service): -```bash -#!/bin/bash -# Usage: ./migrate-service.sh -# Example: ./migrate-service.sh echo echo echo 3-edge - -SERVICE=$1 -SOURCE_DIR=${2:-$1} -FOR_EACH_KEY=${3:-$1} -TIER=${4:-4-aux} - -mkdir -p stacks/$SERVICE - -cat > stacks/$SERVICE/terragrunt.hcl <<'TGEOF' -include "root" { - path = find_in_parent_folders() -} - -dependency "platform" { - config_path = "../platform" -} - -inputs = { - tls_secret_name = dependency.platform.outputs.tls_secret_name -} -TGEOF - -cat > stacks/$SERVICE/main.tf < to Terragrunt stacks" -``` - ---- - -## Task 9: Modify Service Modules to Accept Host Variables - -**~20 modules need modification** to replace hardcoded DNS names with variables. - -For each module, the change is mechanical: -1. Add `variable "redis_host" { type = string }` (and/or postgresql_host, etc.) -2. Replace the hardcoded string with `var.redis_host` - -**Modules to modify and their needed variables:** - -| Module | Add variables | Replace in | -|--------|-------------|-----------| -| affine | redis_host, postgresql_host, postgresql_port, smtp_host, smtp_port | main.tf:25,29,50-64 | -| immich | redis_host, postgresql_host | main.tf:80,96 | -| nextcloud | redis_host, mysql_host | chart_values.yaml:31,37 | -| grampsweb | redis_host, smtp_host, smtp_port | main.tf:37,41,45,57 | -| dawarich | redis_host, postgresql_host | main.tf:75,79,147 | -| send | redis_host | main.tf:75 | -| linkwarden | postgresql_host, postgresql_port | main.tf:67 | -| n8n | postgresql_host | main.tf:56 | -| health | postgresql_host, postgresql_port | main.tf:54 | -| tandoor | postgresql_host, smtp_host, smtp_port | main.tf:66,98 | -| rybbit | postgresql_host | main.tf:162 | -| netbox | postgresql_host | main.tf:73 | -| speedtest | mysql_host | main.tf:85 | -| real-estate-crawler | redis_host, mysql_host | main.tf:140,153,157,301,305,309,401,405,409 | -| ytdlp | redis_host, ollama_host | main.tf:241,255 | -| resume | smtp_host, smtp_port | main.tf:186 | -| monitoring | mysql_host, smtp_host | grafana_chart_values.yaml:51, prometheus_chart_values.tpl:35,37 | - -**Example modification for affine:** - -In `modules/kubernetes/affine/main.tf`, add variables: -```hcl -variable "redis_host" { type = string } -variable "postgresql_host" { type = string } -variable "postgresql_port" { type = number } -variable "smtp_host" { type = string } -variable "smtp_port" { type = number } -``` - -Replace: -```hcl -# Before: -DATABASE_URL = "postgresql://postgres:${var.postgresql_password}@postgresql.dbaas.svc.cluster.local:5432/affine" -# After: -DATABASE_URL = "postgresql://postgres:${var.postgresql_password}@${var.postgresql_host}:${var.postgresql_port}/affine" - -# Before: -REDIS_SERVER_HOST = "redis.redis.svc.cluster.local" -# After: -REDIS_SERVER_HOST = var.redis_host -``` - -**Step: Commit each module modification** - -```bash -git add modules/kubernetes// -git commit -m "[ci skip] Accept host variables in module" -``` - ---- - -## Task 10: Migrate Database-Backed Services to Terragrunt Stacks - -After modules are modified (Task 9), create stacks that wire platform outputs to module inputs. 
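
Before wiring each stack, it is worth confirming that the Task 9 table caught every hardcoded endpoint. A search along these lines (patterns taken from the hosts used above; extend it with any others) helps — after conversion, the only remaining matches should be in the platform modules that provide those services themselves:

```bash
# After Task 9, matches should only remain in the modules that *provide* these
# services (redis, dbaas), not in their consumers.
grep -rn \
  -e 'redis\.redis\.svc\.cluster\.local' \
  -e '\.dbaas\.svc\.cluster\.local' \
  --include='*.tf' --include='*.yaml' --include='*.tpl' \
  modules/kubernetes/
```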
- -**Example: stacks/affine/terragrunt.hcl** - -```hcl -include "root" { - path = find_in_parent_folders() -} - -dependency "platform" { - config_path = "../platform" -} - -inputs = { - tls_secret_name = dependency.platform.outputs.tls_secret_name - redis_host = dependency.platform.outputs.redis_host - postgresql_host = dependency.platform.outputs.postgresql_host - postgresql_port = dependency.platform.outputs.postgresql_port - smtp_host = dependency.platform.outputs.smtp_host - smtp_port = dependency.platform.outputs.smtp_port -} -``` - -**stacks/affine/main.tf:** - -```hcl -variable "tls_secret_name" {} -variable "kube_config_path" { default = "~/.kube/config" } -variable "affine_postgresql_password" {} -variable "redis_host" { type = string } -variable "postgresql_host" { type = string } -variable "postgresql_port" { type = number } -variable "smtp_host" { type = string } -variable "smtp_port" { type = number } - -module "affine" { - source = "../../modules/kubernetes/affine" - tls_secret_name = var.tls_secret_name - postgresql_password = var.affine_postgresql_password - redis_host = var.redis_host - postgresql_host = var.postgresql_host - postgresql_port = var.postgresql_port - smtp_host = var.smtp_host - smtp_port = var.smtp_port - tier = "4-aux" -} -``` - -State migration follows the same pattern as Task 7. - -Repeat for all DB-backed services from the table in Task 9. - ---- - -## Task 11: Migrate Service-to-Service Dependencies - -Services that depend on other non-platform services need multi-dependency stacks. - -**Step 1: Create ollama stack with outputs** - -```hcl -# stacks/ollama/main.tf -variable "tls_secret_name" {} -variable "kube_config_path" { default = "~/.kube/config" } -variable "ollama_api_credentials" {} - -module "ollama" { - source = "../../modules/kubernetes/ollama" - tls_secret_name = var.tls_secret_name - tier = "2-gpu" - ollama_api_credentials = var.ollama_api_credentials -} - -output "ollama_host" { - value = "ollama.ollama.svc.cluster.local" -} -``` - -**Step 2: Create openclaw stack with ollama dependency** - -```hcl -# stacks/openclaw/terragrunt.hcl -include "root" { - path = find_in_parent_folders() -} - -dependency "platform" { - config_path = "../platform" -} - -dependency "ollama" { - config_path = "../ollama" -} - -inputs = { - tls_secret_name = dependency.platform.outputs.tls_secret_name - ollama_host = dependency.ollama.outputs.ollama_host -} -``` - -**Step 3: Similarly for coturn → f1-stream and osm-routing → real-estate-crawler** - ---- - -## Task 12: Final Cleanup - -**Step 1: Remove legacy modules/kubernetes/main.tf** - -After all services are migrated, this file should be empty (or contain only commented-out blocks). Delete it. - -**Step 2: Remove kubernetes_cluster module call from root main.tf** - -The root `main.tf` should now only contain provider blocks (which can also be removed since Terragrunt generates them) and the `variable` declarations for `terraform.tfvars` loading. - -**Step 3: Archive legacy state** - -```bash -mv terraform.tfstate terraform.tfstate.legacy -mv terraform.tfstate.backup-* state/backups/ -``` - -**Step 4: Verify full DAG** - -```bash -cd stacks && terragrunt run-all plan -``` -Expected: All stacks show `No changes.` - -**Step 5: Update CLAUDE.md** - -Update the knowledge file to reflect the new Terragrunt architecture, commands, and workflow. 
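
For Step 5, a minimal sketch of the command reference the knowledge file could gain — it uses only commands already introduced earlier in this plan, so nothing here is new:

```bash
# Per-stack workflow
cd stacks/<service> && terragrunt plan
cd stacks/<service> && terragrunt apply

# Whole-repo verification (runs the full dependency DAG)
cd stacks && terragrunt run-all plan
```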
- -**Step 6: Final commit** - -```bash -git add -A -git commit -m "[ci skip] Complete Terragrunt migration — remove legacy monolith" -``` - ---- - -## Task 13: Update CI/CD (Drone Pipeline) - -**Files:** -- Modify: `.drone.yml` - -Create a Drone pipeline that: -1. Detects changed files -2. Maps to affected stacks -3. Runs `terragrunt plan` (on PR) or `terragrunt apply` (on master merge) - -See design doc section "CI/CD: Changed-Stack Detection" for the pipeline logic. - ---- - -## Execution Order Summary - -| Task | Phase | Risk | Reversible | -|------|-------|------|-----------| -| 1. Install + skeleton | 0 | None | Yes (delete dirs) | -| 2. Root terragrunt.hcl | 0 | None | Yes (delete file) | -| 3. Infra stack files | 1 | None | Yes (delete stack) | -| 4. Infra state migration | 1 | Medium | Yes (state mv back) | -| 5. Platform stack files | 2 | None | Yes (delete stack) | -| 6. Platform state migration | 2 | High | Yes (state mv back) | -| 7. First simple service (blog) | 3 | Low | Yes (state mv back) | -| 8. Batch simple services | 3 | Low | Yes (state mv back) | -| 9. Module host variable mods | 4 | Low | Yes (revert changes) | -| 10. DB service stacks | 4 | Low | Yes (state mv back) | -| 11. Service-to-service deps | 5 | Low | Yes (state mv back) | -| 12. Final cleanup | 6 | Medium | Harder to reverse | -| 13. CI/CD update | 6 | Low | Yes (revert .drone.yml) | diff --git a/main.tf b/main.tf deleted file mode 100644 index 78559292..00000000 --- a/main.tf +++ /dev/null @@ -1,326 +0,0 @@ -variable "prod" { - type = bool - default = false -} -variable "proxmox_pm_api_url" { type = string } -variable "proxmox_pm_api_token_id" { type = string } -variable "proxmox_pm_api_token_secret" { type = string } -variable "k8s_join_command" { type = string } -variable "vm_wizard_password" { type = string } -variable "proxmox_host" { type = string } -variable "ssh_private_key" { - type = string - default = "" -} -variable "ssh_public_key" { - type = string - default = "" -} -variable "tls_secret_name" {} -variable "tls_crt" { - default = "" -} -variable "tls_key" { - default = "" -} -variable "client_certificate_secret_name" {} -variable "mailserver_accounts" {} -variable "mailserver_aliases" {} -variable "mailserver_opendkim_key" {} -variable "mailserver_roundcubemail_db_password" { type = string } -variable "mailserver_sasl_passwd" {} -variable "pihole_web_password" {} -variable "webhook_handler_secret" {} -variable "wireguard_wg_0_conf" {} -variable "wireguard_firewall_sh" {} -variable "hackmd_db_password" {} -variable "bind_db_viktorbarzin_me" {} -variable "bind_db_viktorbarzin_lan" {} -variable "bind_named_conf_options" {} -variable "alertmanager_account_password" {} -variable "wireguard_wg_0_key" {} -variable "dbaas_root_password" {} -variable "dbaas_postgresql_root_password" {} -variable "dbaas_pgadmin_password" {} -variable "drone_github_client_id" {} -variable "drone_github_client_secret" {} -variable "drone_rpc_secret" {} -variable "drone_webhook_secret" {} -variable "dockerhub_registry_password" {} -variable "oauth2_proxy_client_id" {} -variable "oauth2_proxy_client_secret" {} -variable "oauth2_proxy_authenticated_emails" {} -variable "url_shortener_mysql_password" {} -variable "url_shortener_geolite_license_key" {} -variable "url_shortener_api_key" {} -variable "webhook_handler_fb_verify_token" {} -variable "webhook_handler_fb_page_token" {} -variable "webhook_handler_fb_app_secret" {} -variable "webhook_handler_git_user" {} -variable "technitium_username" {} -variable 
"technitium_password" {} -variable "technitium_db_password" {} -variable "webhook_handler_git_token" {} -variable "webhook_handler_ssh_key" {} -variable "monitoring_idrac_username" {} -variable "monitoring_idrac_password" {} -variable "alertmanager_slack_api_url" {} -variable "home_assistant_configuration" {} -variable "shadowsocks_password" {} -variable "finance_app_db_connection_string" {} -variable "finance_app_currency_converter_api_key" {} -variable "finance_app_graphql_api_secret" {} -variable "finance_app_gocardless_secret_key" {} -variable "finance_app_gocardless_secret_id" {} -variable "headscale_config" {} -variable "headscale_acl" {} -variable "immich_postgresql_password" {} -variable "immich_frame_api_key" {} -variable "ingress_crowdsec_api_key" {} -variable "crowdsec_enroll_key" { type = string } -variable "crowdsec_db_password" { type = string } -variable "crowdsec_dash_api_key" { type = string } -variable "crowdsec_dash_machine_id" { type = string } -variable "crowdsec_dash_machine_password" { type = string } -variable "vaultwarden_smtp_password" {} -variable "resume_database_url" {} -variable "resume_database_password" {} -variable "resume_redis_url" {} -variable "resume_auth_secret" { type = string } -variable "frigate_valchedrym_camera_credentials" { default = "" } -variable "paperless_db_password" {} -variable "diun_nfty_token" {} -variable "diun_slack_url" {} -variable "docker_config" {} -variable "nextcloud_db_password" {} -variable "homepage_credentials" { - type = map(any) -} -variable "authentik_secret_key" {} -variable "authentik_postgres_password" {} - -variable "ansible_prefix" { - default = "ANSIBLE_VAULT_PASSWORD_FILE=~/.ansible/vault_pass.txt ansible-playbook -i playbook/hosts.yaml playbook/linux.yml -t linux/initial_setup" - description = "Provisioner command" -} -variable "linkwarden_postgresql_password" {} -variable "linkwarden_authentik_client_id" {} -variable "linkwarden_authentik_client_secret" {} -variable "cloudflare_api_key" {} -variable "cloudflare_email" {} -variable "cloudflare_account_id" {} -variable "cloudflare_zone_id" {} -variable "cloudflare_tunnel_id" {} -variable "public_ip" {} -variable "cloudflare_proxied_names" {} -variable "cloudflare_non_proxied_names" {} -variable "cloudflare_tunnel_token" {} -variable "owntracks_credentials" {} -variable "ollama_api_credentials" {} -variable "dawarich_database_password" {} -variable "geoapify_api_key" {} -variable "tandoor_database_password" {} -variable "n8n_postgresql_password" {} -variable "realestate_crawler_db_password" {} -variable "realestate_crawler_notification_settings" { - type = map(string) -} -variable "kured_notify_url" {} -variable "onlyoffice_db_password" { type = string } -variable "onlyoffice_jwt_token" { type = string } -variable "xray_reality_clients" { type = list(map(string)) } -variable "xray_reality_private_key" { type = string } -variable "xray_reality_short_ids" { type = list(string) } -variable "tiny_tuya_api_key" { type = string } -variable "tiny_tuya_api_secret" { type = string } -variable "tiny_tuya_service_secret" { type = string } -variable "tiny_tuya_slack_url" { type = string } -variable "haos_api_token" { type = string } -variable "pve_password" { type = string } -variable "grafana_db_password" { type = string } -variable "grafana_admin_password" { type = string } -variable "clickhouse_password" { type = string } -variable "clickhouse_postgres_password" { type = string } -variable "wealthfolio_password_hash" { type = string } -variable 
"aiostreams_database_connection_string" { type = string } -variable "actualbudget_credentials" { type = map(any) } -variable "speedtest_db_password" { type = string } -variable "freedify_credentials" { type = map(any) } -variable "mcaptcha_postgresql_password" { type = string } -variable "mcaptcha_cookie_secret" { type = string } -variable "mcaptcha_captcha_salt" { type = string } -variable "openrouter_api_key" { type = string } -variable "slack_bot_token" { type = string } -variable "slack_channel" { type = string } -variable "affine_postgresql_password" { type = string } -variable "health_postgresql_password" { type = string } -variable "health_secret_key" { type = string } -variable "openclaw_ssh_key" { type = string } -variable "openclaw_skill_secrets" { type = map(string) } -variable "gemini_api_key" { type = string } -variable "llama_api_key" { type = string } -variable "brave_api_key" { type = string } -variable "modal_api_key" { type = string } -variable "coturn_turn_secret" { type = string } - -variable "k8s_users" { - type = map(any) - default = {} -} - -variable "kube_config_path" { - type = string - default = "~/.kube/config" -} - -provider "kubernetes" { - config_path = var.prod ? "" : var.kube_config_path -} - -provider "helm" { - kubernetes = { - config_path = var.prod ? "" : var.kube_config_path - } -} - -provider "proxmox" { - pm_api_url = var.proxmox_pm_api_url - pm_api_token_id = var.proxmox_pm_api_token_id - pm_api_token_secret = var.proxmox_pm_api_token_secret - pm_tls_insecure = true -} -# TODO: add DEFCON levels - -# --------------------------------------------------------------------------- -# Infra modules (VM templates, docker-registry) migrated to stacks/infra/ -# Manage with: cd stacks/infra && terragrunt apply -# --------------------------------------------------------------------------- - -# module that provisions the proxmox host? -# make dns stateless? 
-# pfsense/truenas configs in code -# etcd db backup in code - -# module "k8s_node5" { -# template_name = local.vm_template_name -# source = "./modules/create-vm" -# vm_name = "k8s-node5" -# vmid = 205 -# cisnippet_name = local.vm_cloud_init_snippet_name - -# vm_mac_address = "00:50:56:87:4a:2d" -# bridge = "vmbr1" -# vlan_tag = "20" -# } - -# module "k8s_master" { -# source = "./modules/create-vm" -# vm_name = "k8s-master" -# vm_mac_address = "00:50:56:b0:a1:39" -# network = "dKubernetes" -# provisioner_command = "${var.ansible_prefix} -t linux/k8s/master -e hostname=k8s-master" - -# vsphere_password = var.vsphere_password -# vsphere_user = var.vsphere_user -# vsphere_server = var.vsphere_server -# vsphere_datastore = "r730-datastore" -# vsphere_resource_pool = "R730" - -# } -# module "k8s_node1" { -# source = "./modules/create-vm" -# vm_name = "k8s-node1" -# vm_mac_address = "00:50:56:b0:e0:c9" -# network = "dKubernetes" -# provisioner_command = "${var.ansible_prefix} -t linux/k8s/node -e hostname=k8s-node1 -e k8s_master='wizard@${module.k8s_master.guest_ip}'" - -# vsphere_password = var.vsphere_password -# vsphere_user = var.vsphere_user -# vsphere_server = var.vsphere_server -# vsphere_datastore = "r730-datastore" -# vsphere_resource_pool = "R730" - -# } - -# module "k8s_node2" { -# source = "./modules/create-vm" -# vm_name = "k8s-node2" -# vm_mac_address = "00:50:56:b0:a1:36" -# network = "dKubernetes" -# provisioner_command = "${var.ansible_prefix} -t linux/k8s/node -e hostname=k8s-node2 -e k8s_master='wizard@${module.k8s_master.guest_ip}'" - -# vsphere_password = var.vsphere_password -# vsphere_user = var.vsphere_user -# vsphere_server = var.vsphere_server -# vsphere_datastore = "r730-datastore" -# vsphere_resource_pool = "R730" -# } - -# module "k8s_node3" { -# source = "./modules/create-vm" -# vm_name = "k8s-node3" -# vm_mac_address = "00:50:56:b0:a1:37" -# network = "dKubernetes" -# provisioner_command = "${var.ansible_prefix} -t linux/k8s/node -e hostname=k8s-node3 -e k8s_master='wizard@${module.k8s_master.guest_ip}'" - -# vsphere_password = var.vsphere_password -# vsphere_user = var.vsphere_user -# vsphere_server = var.vsphere_server -# vsphere_datastore = "r730-datastore" -# vsphere_resource_pool = "R730" -# } - -# module "k8s_node4" { -# source = "./modules/create-vm" -# vm_name = "k8s-node4" -# vmid = 204 -# template_name = local.vm_template_name -# cisnippet_name = local.vm_cloud_init_snippet_name - -# vm_mac_address = "00:50:56:b0:a1:38" -# bridge = "vmbr1" -# vlan_tag = "20" -# } - -# module "k8s_node5" { -# source = "./modules/create-vm" -# vm_name = "k8s-node5" -# vm_mac_address = "00:50:56:b0:a1:40" -# network = "dKubernetes" -# provisioner_command = "${var.ansible_prefix} -t linux/k8s/node -e hostname=k8s-node5 -e k8s_master='wizard@${module.k8s_master.guest_ip}'" - -# vsphere_password = var.vsphere_password -# vsphere_user = var.vsphere_user -# vsphere_server = var.vsphere_server -# vsphere_datastore = "r730-datastore" -# vsphere_resource_pool = "R730" - -# } -# module "devvm" { -# source = "./modules/create-vm" -# vm_name = "devvm" -# vm_mac_address = "00:50:56:b0:a1:41" -# network = "dKubernetes" -# # provisioner_command = "${var.ansible_prefix} -t linux/k8s/node -e hostname=k8s-node5 -e k8s_master='wizard@${module.k8s_master.guest_ip}'" - -# vsphere_password = var.vsphere_password -# vsphere_user = var.vsphere_user -# vsphere_server = var.vsphere_server -# vsphere_datastore = "r730-datastore" -# vsphere_resource_pool = "R730" -# } - -# resource "null_resource" 
"test" { -# provisioner "local-exec" { -# working_dir = "/home/viktor/" -# command = "ANSIBLE_VAULT_PASSWORD_FILE=~/.ansible/vault_pass.txt ansible-playbook -i playbook/hosts.yaml playbook/linux.yml -t linux/k8s/node -e host='10.0.40.126'" -# } -# } - -# --------------------------------------------------------------------------- -# The kubernetes_cluster module (modules/kubernetes/) has been migrated to -# individual Terragrunt stacks under stacks/. -# See stacks//main.tf for each service's configuration. -# --------------------------------------------------------------------------- - - diff --git a/migrate_tfstate.txt b/migrate_tfstate.txt deleted file mode 100644 index 4ebc3c7e..00000000 --- a/migrate_tfstate.txt +++ /dev/null @@ -1,4 +0,0 @@ -# Steps to migrate 1 .tfstate into another - -# Inside the dir to be migrated out from do: -for s in $(tf state list); do tf state mv -state-out=../../terraform.tfstate $s "module.UPPER_WORKSPACE_MODULE_NAME.$s"; done diff --git a/modules/kubernetes/authelia/main.tf b/modules/kubernetes/authelia/main.tf deleted file mode 100644 index 133f5670..00000000 --- a/modules/kubernetes/authelia/main.tf +++ /dev/null @@ -1,178 +0,0 @@ -variable "tls_secret_name" {} - -resource "kubernetes_namespace" "authelia" { - metadata { - name = "authelia" - labels = { - "istio-injection" : "disabled" - } - } -} - -module "tls_secret" { - source = "../setup_tls_secret" - namespace = kubernetes_namespace.authelia.metadata[0].name - tls_secret_name = var.tls_secret_name -} - -resource "helm_release" "authelia" { - namespace = kubernetes_namespace.authelia.metadata[0].name - name = "authelia" - atomic = true - - repository = "https://charts.authelia.com" - chart = "authelia" - version = "0.10.49" - - depends_on = [kubernetes_namespace.authelia] - - values = [templatefile("${path.module}/values.yaml", {})] -} - -# resource "kubernetes_config_map" "configuration" { -# metadata { -# name = "configuration" -# namespace = kubernetes_namespace.authelia.metadata[0].name - -# labels = { -# app = "configuration" -# } -# annotations = { -# "reloader.stakater.com/match" = "true" -# } -# } - -# data = { -# # "configuration.yml" = yamldecode(file("${path.module}/configuration.yml")) -# "configuration.yml" = file("${path.module}/configuration.yml") -# "users_database.yml" = file("${path.module}/users_database.yml") -# } -# } - - -# resource "kubernetes_deployment" "authelia" { -# metadata { -# name = "authelia" -# namespace = kubernetes_namespace.authelia.metadata[0].name -# labels = { -# app = "authelia" -# } -# annotations = { -# "reloader.stakater.com/search" = "true" -# } -# } -# spec { -# replicas = 1 -# selector { -# match_labels = { -# app = "authelia" -# } -# } -# template { -# metadata { -# labels = { -# app = "authelia" -# } -# } -# spec { -# container { -# image = "authelia/authelia:4.38" -# name = "authelia" -# # command = ["tail", "-f", "/etc/passwd"] - -# port { -# container_port = 9091 -# } -# port { -# container_port = 8080 -# } -# volume_mount { -# name = "config" -# # mount_path = "/etc/authelia/configuration.yml" -# mount_path = "/config/configuration.yml" -# sub_path = "configuration.yml" -# } -# volume_mount { -# name = "users-database" -# # mount_path = "/etc/authelia/users_database.yml" -# mount_path = "/config/users_database.yml" -# sub_path = "users_database.yml" -# } -# } -# volume { -# name = "config" -# config_map { -# name = "configuration" -# } -# } -# volume { -# name = "users-database" -# config_map { -# name = "configuration" -# } -# } -# } -# } -# } -# 
} - -# resource "kubernetes_service" "authelia" { -# metadata { -# name = "authelia" -# namespace = kubernetes_namespace.authelia.metadata[0].name -# labels = { -# "app" = "authelia" -# } -# } - -# spec { -# selector = { -# app = "authelia" -# } -# port { -# name = "http" -# port = 80 -# protocol = "TCP" -# # target_port = 8080 -# target_port = 9091 -# } -# } -# } - -# resource "kubernetes_ingress_v1" "authelia" { -# metadata { -# name = "authelia" -# namespace = kubernetes_namespace.authelia.metadata[0].name -# annotations = { -# "kubernetes.io/ingress.class" = "nginx" -# # "nginx.ingress.kubernetes.io/affinity" = "cookie" -# # "nginx.ingress.kubernetes.io/auth-tls-verify-client" = "on" -# # "nginx.ingress.kubernetes.io/auth-tls-secret" = "default/ca-secret" -# # "nginx.ingress.kubernetes.io/auth-url" : "https://oauth2.viktorbarzin.me/oauth2/auth" -# # "nginx.ingress.kubernetes.io/auth-signin" : "https://oauth2.viktorbarzin.me/oauth2/start?rd=/redirect/$http_host$escaped_request_uri" -# } -# } - -# spec { -# tls { -# hosts = ["auth.viktorbarzin.me"] -# secret_name = var.tls_secret_name -# } -# rule { -# host = "auth.viktorbarzin.me" -# http { -# path { -# path = "/" -# backend { -# service { -# name = "authelia" -# port { -# number = 80 -# } -# } -# } -# } -# } -# } -# } -# } diff --git a/modules/kubernetes/authelia/users_database.yml b/modules/kubernetes/authelia/users_database.yml deleted file mode 100644 index 9c311d0b..00000000 --- a/modules/kubernetes/authelia/users_database.yml +++ /dev/null @@ -1,10 +0,0 @@ -users: - authelia: - disabled: false - displayname: "Viktor" - # Password is authelia - password: "$6$rounds=50000$BpLnfgDsc2WD8F2q$Zis.ixdg9s/UOJYrs56b5QEZFiZECu0qZVNsIYxBaNJ7ucIL.nlxVCT5tqh8KHG8X4tlwCFm5r6NTOZZ5qRFN/" # yamllint disable-line rule:line-length - email: me@viktorbarzin.me - groups: - - admins - - dev diff --git a/modules/kubernetes/authelia/values.yaml b/modules/kubernetes/authelia/values.yaml deleted file mode 100644 index d2b0afb1..00000000 --- a/modules/kubernetes/authelia/values.yaml +++ /dev/null @@ -1,24 +0,0 @@ -configMap: - session: - cookies: - - domain: 'authelia.viktorbarzin.me' - authelia_url: 'https://authelia.viktorbarzin.me' - - storage: - local: - path: '/config/db.sqlite3' - - - theme: light - # Error 1: access_control (The warning) - access_control: - default_policy: 'one_factor' # Change to 'two_factor' once you have 2FA set up - rules: - - domain: "*.viktorbarzin.me" - policy: one_factor - - # Error 2: authentication_backend (Where users are stored) - authentication_backend: - file: - path: /config/users.yml - \ No newline at end of file diff --git a/modules/kubernetes/bind/deployment-factory/main.tf b/modules/kubernetes/bind/deployment-factory/main.tf deleted file mode 100644 index 78b4841f..00000000 --- a/modules/kubernetes/bind/deployment-factory/main.tf +++ /dev/null @@ -1,93 +0,0 @@ -variable "named_conf_mounts" {} -variable "deployment_name" {} - -resource "kubernetes_deployment" "bind" { - metadata { - name = var.deployment_name - namespace = "bind" - labels = { - "app" = "bind" - "kubernetes.io/cluster-service" : "true" - } - annotations = { - "reloader.stakater.com/search" = "true" - } - } - spec { - replicas = "3" - selector { - match_labels = { - "app" = var.deployment_name - } - } - template { - metadata { - labels = { - "app" = var.deployment_name - "kubernetes.io/cluster-service" : "true" - } - } - spec { - container { - name = "bind" - image = "resystit/bind9:latest" - image_pull_policy = "IfNotPresent" - port { - 
container_port = 53 - protocol = "UDP" - } - volume_mount { - mount_path = "/etc/bind/named.conf" - sub_path = "named.conf" - name = "bindconf" - } - - dynamic "volume_mount" { - for_each = [for m in var.named_conf_mounts : - { - name = m.name - mount_path = m.mount_path - sub_path = m.sub_path - }] - content { - name = volume_mount.value.name - mount_path = volume_mount.value.mount_path - sub_path = volume_mount.value.sub_path - } - } - - volume_mount { - mount_path = "/etc/bind/db.viktorbarzin.me" - sub_path = "db.viktorbarzin.me" - name = "bindconf" - } - volume_mount { - mount_path = "/etc/bind/db.viktorbarzin.lan" - sub_path = "db.viktorbarzin.lan" - name = "bindconf" - } - volume_mount { - mount_path = "/etc/bind/db.181.191.213.in-addr.arpa" - sub_path = "db.181.191.213.in-addr.arpa" - name = "bindconf" - } - } - container { - name = "bind-exporter" - image = "prometheuscommunity/bind-exporter:latest" - image_pull_policy = "IfNotPresent" - port { - container_port = 9119 - } - } - - volume { - name = "bindconf" - config_map { - name = "bind-configmap" - } - } - } - } - } -} diff --git a/modules/kubernetes/bind/extra/viktorbarzin.me b/modules/kubernetes/bind/extra/viktorbarzin.me deleted file mode 100644 index 83abebd7..00000000 --- a/modules/kubernetes/bind/extra/viktorbarzin.me +++ /dev/null @@ -1,180 +0,0 @@ -; additional bind records added via terraform automation -; entries are usually programmatically added to this file - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/modules/kubernetes/bind/main.tf b/modules/kubernetes/bind/main.tf deleted file mode 100644 index 2b79b5fc..00000000 --- a/modules/kubernetes/bind/main.tf +++ /dev/null @@ -1,77 +0,0 @@ -variable "db_viktorbarzin_me" {} -variable "db_viktorbarzin_lan" {} -variable "named_conf_options" {} - -resource "kubernetes_namespace" "bind" { - metadata { - name = "bind" - } -} - -resource "kubernetes_config_map" "bind_configmap" { - metadata { - name = "bind-configmap" - namespace = "bind" - - annotations = { - "reloader.stakater.com/match" = "true" - } - } - - data = { - "db.viktorbarzin.lan" = var.db_viktorbarzin_lan - "db.viktorbarzin.me" = format("%s%s", var.db_viktorbarzin_me, file("${path.module}/extra/viktorbarzin.me")) - "db.181.191.213.in-addr.arpa" = var.db_ptr - "named.conf" = var.named_conf - "named.conf.local" = var.named_conf_local - "named.conf.options" = var.named_conf_options - "public-named.conf.local" = var.public_named_conf_local - "public-named.conf.options" = var.public_named_conf_options - } -} - -module "bind-local-deployment" { - source = "./deployment-factory" - deployment_name = "bind" - named_conf_mounts = [ - { - "mount_path" = "/etc/bind/named.conf.local" - "sub_path" = "named.conf.local" - "name" = "bindconf" - }, - { - mount_path = "/etc/bind/named.conf.options" - sub_path = "named.conf.options" - name = "bindconf" - } - ] -} - -module "bind-local-service" { - source = "./service-factory" - service_name = "bind" - port = 5354 -} - -module "bind-public-deployment" { - source = "./deployment-factory" - deployment_name = "bind-public" - named_conf_mounts = [ - { - "mount_path" = "/etc/bind/named.conf.local" - "sub_path" = "public-named.conf.local" - "name" = "bindconf" - }, - { - 
mount_path = "/etc/bind/named.conf.options" - sub_path = "public-named.conf.options" - name = "bindconf" - } - ] -} - -module "bind-public-service" { - source = "./service-factory" - service_name = "bind-public" - port = 10053 -} diff --git a/modules/kubernetes/bind/service-factory/main.tf b/modules/kubernetes/bind/service-factory/main.tf deleted file mode 100644 index d64a4a7e..00000000 --- a/modules/kubernetes/bind/service-factory/main.tf +++ /dev/null @@ -1,28 +0,0 @@ -variable "service_name" {} -variable "port" {} - -resource "kubernetes_service" "bind" { - metadata { - name = var.service_name - namespace = "bind" - annotations = { - "metallb.universe.tf/allow-shared-ip" = "shared" - } - labels = { - "app" = var.service_name - } - } - spec { - type = "LoadBalancer" - external_traffic_policy = "Cluster" - selector = { - "app" = var.service_name - } - port { - name = "dns" - protocol = "UDP" - port = var.port - target_port = "53" - } - } -} diff --git a/modules/kubernetes/bind/variables.tf b/modules/kubernetes/bind/variables.tf deleted file mode 100644 index e6327f47..00000000 --- a/modules/kubernetes/bind/variables.tf +++ /dev/null @@ -1,98 +0,0 @@ -variable "named_conf" { - default = <=1.19 - ingressClassName: # "nginx" - - ## Configure the hosts for the ingress - hosts: - - # -- Host address. Helm template can be passed. - host: home-assistant.viktorbarzin.me - ## Configure the paths for the host - paths: - - # -- Path. Helm template can be passed. - path: / - # -- Ignored if not kubeVersion >= 1.14-0 - pathType: Prefix - service: - # -- Overrides the service name reference for this path - name: home-assistant - # -- Overrides the service port reference for this path - port: 8123 - - # -- Configure TLS for the ingress. Both secretName and hosts can process a Helm template. - tls: #[] - - secretName: ${tls_secret_name} - hosts: - - home-assistant.viktorbarzin.me - -# -- Configure persistence for the chart here. -# Additional items can be added by adding a dictionary key similar to the 'config' key. -# [[ref]](http://docs.k8s-at-home.com/our-helm-charts/common-library-storage) -# @default -- See below -persistence: - # -- Default persistence for configuration files. - # @default -- See below - config: - # -- Enables or disables the persistence item - enabled: false - - # -- Sets the persistence type - # Valid options are pvc, emptyDir, hostPath, secret, configMap or custom - type: configMap - name: home-assistant-configmap - - # -- Where to mount the volume in the main container. - # Defaults to `/`, - # setting to '-' creates the volume but disables the volumeMount. - mountPath: /config - # -- Specify if the volume should be mounted read-only. 
- readOnly: true diff --git a/modules/kubernetes/home_assistant/main.tf b/modules/kubernetes/home_assistant/main.tf deleted file mode 100644 index 8d025efa..00000000 --- a/modules/kubernetes/home_assistant/main.tf +++ /dev/null @@ -1,238 +0,0 @@ -variable "tls_secret_name" {} -variable "client_certificate_secret_name" {} -variable "configuration_yaml" {} - -resource "kubernetes_namespace" "home_assistant" { - metadata { - name = "home-assistant" - } -} - -resource "kubernetes_config_map" "home_assistant_config_map" { - metadata { - name = "home-assistant-configmap" - namespace = kubernetes_namespace.home_assistant.metadata[0].name - - annotations = { - "reloader.stakater.com/match" = "true" - } - } - - data = { - # "db.viktorbarzin.lan" = var.db_viktorbarzin_lan - # "db.viktorbarzin.me" = format("%s%s", var.db_viktorbarzin_me, file("${path.module}/extra/viktorbarzin.me")) - # "db.181.191.213.in-addr.arpa" = var.db_ptr - "configuration.yaml" = var.configuration_yaml - } -} - -module "tls_secret" { - source = "../setup_tls_secret" - namespace = kubernetes_namespace.home_assistant.metadata[0].name - tls_secret_name = var.tls_secret_name -} - -resource "helm_release" "home_assistant" { - namespace = kubernetes_namespace.home_assistant.metadata[0].name - create_namespace = true - name = "home-assistant" - - repository = "https://k8s-at-home.com/charts/" - chart = "home-assistant" - - values = [templatefile("${path.module}/home_assistant_chart_values.tpl", { tls_secret_name = var.tls_secret_name, client_certificate_secret_name = var.client_certificate_secret_name })] -} - -resource "kubernetes_deployment" "home_assistant" { - metadata { - name = "home-assistant" - namespace = kubernetes_namespace.home_assistant.metadata[0].name - - labels = { - "app.kubernetes.io/instance" = "home-assistant" - "app.kubernetes.io/name" = "home-assistant" - "app.kubernetes.io/version" = "2022.5.4" - } - } - - spec { - replicas = 1 - - selector { - match_labels = { - "app.kubernetes.io/instance" = "home-assistant" - "app.kubernetes.io/name" = "home-assistant" - } - } - - template { - metadata { - labels = { - "app.kubernetes.io/instance" = "home-assistant" - - "app.kubernetes.io/name" = "home-assistant" - } - } - - spec { - container { - name = "home-assistant" - # image = "ghcr.io/home-assistant/home-assistant:2022.5.4" - image = "ghcr.io/home-assistant/home-assistant:2022.5.5" - # image = "ghcr.io/home-assistant/home-assistant" - port { - name = "http" - container_port = 8123 - protocol = "TCP" - } - env { - name = "TZ" - value = "UTC+3" - } - - volume_mount { - name = "configuration" - mount_path = "/config" - # sub_path = "hackmd" - } - liveness_probe { - tcp_socket { - port = "8123" - } - timeout_seconds = 1 - period_seconds = 10 - success_threshold = 1 - failure_threshold = 3 - } - - readiness_probe { - tcp_socket { - port = "8123" - } - - timeout_seconds = 1 - period_seconds = 10 - success_threshold = 1 - failure_threshold = 3 - } - - startup_probe { - tcp_socket { - port = "8123" - } - - timeout_seconds = 1 - period_seconds = 5 - success_threshold = 1 - failure_threshold = 30 - } - - termination_message_path = "/dev/termination-log" - image_pull_policy = "IfNotPresent" - } - - volume { - name = "configuration" - iscsi { - target_portal = "iscsi.viktorbarzin.lan:3260" - fs_type = "ext4" - iqn = "iqn.2020-12.lan.viktorbarzin:storage:home-assistant" - lun = 0 - read_only = false - } - } - - restart_policy = "Always" - termination_grace_period_seconds = 30 - dns_policy = "ClusterFirst" - service_account_name = 
"default" - } - } - - strategy { - type = "Recreate" - } - revision_history_limit = 3 - } -} -resource "kubernetes_service" "home_assistant" { - metadata { - name = "home-assistant" - namespace = kubernetes_namespace.home_assistant.metadata[0].name - - labels = { - "app.kubernetes.io/instance" = "home-assistant" - - "app.kubernetes.io/managed-by" = "Helm" - - "app.kubernetes.io/name" = "home-assistant" - - "app.kubernetes.io/version" = "2022.5.4" - - "helm.sh/chart" = "home-assistant-13.2.0" - } - - annotations = { - "meta.helm.sh/release-name" = "home-assistant" - - "meta.helm.sh/release-namespace" = "home-assistant" - } - } - - spec { - port { - name = "http" - protocol = "TCP" - port = 8123 - target_port = "http" - } - - selector = { - "app.kubernetes.io/instance" = "home-assistant" - - "app.kubernetes.io/name" = "home-assistant" - } - - # cluster_ip = "10.102.20.150" - type = "ClusterIP" - session_affinity = "None" - } -} - - - -resource "kubernetes_ingress_v1" "home-assistant-ui" { - metadata { - name = "home-assistant-ui-ingress" - namespace = kubernetes_namespace.home_assistant.metadata[0].name - annotations = { - "traefik.ingress.kubernetes.io/router.middlewares" = "traefik-rate-limit@kubernetescrd,traefik-csp-headers@kubernetescrd,traefik-crowdsec@kubernetescrd" - "traefik.ingress.kubernetes.io/router.entrypoints" = "websecure" - "traefik.ingress.kubernetes.io/router.tls.options" = "traefik-mtls@kubernetescrd" - } - } - - spec { - ingress_class_name = "traefik" - tls { - hosts = ["home-assistant.viktorbarzin.me"] - secret_name = var.tls_secret_name - } - rule { - host = "home-assistant.viktorbarzin.me" - http { - path { - path = "/" - backend { - service { - name = "home-assistant" - port { - number = 8123 - } - } - } - } - } - } - } -} diff --git a/modules/kubernetes/idrac-power-cycle.sh b/modules/kubernetes/idrac-power-cycle.sh deleted file mode 100644 index 41b7f497..00000000 --- a/modules/kubernetes/idrac-power-cycle.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -user="user" -pass="pass" -# Get power supply on outside system voltage -curl -s -k -u $user:$pass -H"Content-type: application/json" -X GET https://idrac/redfish/v1/Chassis/System.Embedded.1/Power/PowerSupplies/PSU.Slot.2 |jq .LineInputVoltage - -# Power off -curl -s -k -u $user:$pass -X POST -d '{"Action": "Reset", "ResetType": "GracefulShutdown"}' -H"Content-type: application/json" https://idrac/redfish/v1/Systems/System.Embedded.1/Actions/ComputerSystem.Reset - -# Power on -curl -s -k -u $user:$pass -X POST -d '{"Action": "Reset", "ResetType": "On"}' -H"Content-type: application/json" https://idrac/redfish/v1/Systems/System.Embedded.1/Actions/ComputerSystem.Reset diff --git a/modules/kubernetes/istio/base.yaml b/modules/kubernetes/istio/base.yaml deleted file mode 100644 index 1495030a..00000000 --- a/modules/kubernetes/istio/base.yaml +++ /dev/null @@ -1,40 +0,0 @@ -global: - - # ImagePullSecrets for control plane ServiceAccount, list of secrets in the same namespace - # to use for pulling any images in pods that reference this ServiceAccount. - # Must be set for any cluster configured with private docker registry. - imagePullSecrets: [] - - # Used to locate istiod. - istioNamespace: istio-system - - istiod: - enableAnalysis: false - - configValidation: true - externalIstiod: false - remotePilotAddress: "" - - # Platform where Istio is deployed. Possible values are: "openshift", "gcp". - # An empty value means it is a vanilla Kubernetes distribution, therefore no special - # treatment will be considered. 
- platform: "" - - # Setup how istiod Service is configured. See https://kubernetes.io/docs/concepts/services-networking/dual-stack/#services - # This is intended only for use with external istiod. - ipFamilyPolicy: "" - ipFamilies: [] - -base: - # Used for helm2 to add the CRDs to templates. - enableCRDTemplates: false - - # Validation webhook configuration url - # For example: https://$remotePilotAddress:15017/validate - validationURL: "" - - # For istioctl usage to disable istio config crds in base - enableIstioConfigCRDs: true - -defaultRevision: "default" - diff --git a/modules/kubernetes/istio/istiod.yaml b/modules/kubernetes/istio/istiod.yaml deleted file mode 100644 index 0b3363d9..00000000 --- a/modules/kubernetes/istio/istiod.yaml +++ /dev/null @@ -1,520 +0,0 @@ -#.Values.pilot for discovery and mesh wide config - -## Discovery Settings -pilot: - autoscaleEnabled: true - autoscaleMin: 1 - autoscaleMax: 5 - autoscaleBehavior: {} - replicaCount: 1 - rollingMaxSurge: 100% - rollingMaxUnavailable: 25% - - hub: "" - tag: "" - variant: "" - - # Can be a full hub/image:tag - image: pilot - traceSampling: 1.0 - - # Resources for a small pilot install - resources: - requests: - cpu: 500m - memory: 2048Mi - - # Set to `type: RuntimeDefault` to use the default profile if available. - seccompProfile: {} - - # Additional container arguments - extraContainerArgs: [] - - env: {} - - cpu: - targetAverageUtilization: 80 - - # Additional volumeMounts to the istiod container - volumeMounts: [] - - # Additional volumes to the istiod pod - volumes: [] - - nodeSelector: {} - podAnnotations: {} - serviceAnnotations: {} - - topologySpreadConstraints: [] - - # You can use jwksResolverExtraRootCA to provide a root certificate - # in PEM format. This will then be trusted by pilot when resolving - # JWKS URIs. - jwksResolverExtraRootCA: "" - - # This is used to set the source of configuration for - # the associated address in configSource, if nothing is specified - # the default MCP is assumed. - configSource: - subscribedResources: [] - - plugins: [] - - # The following is used to limit how long a sidecar can be connected - # to a pilot. It balances out load across pilot instances at the cost of - # increasing system churn. - keepaliveMaxServerConnectionAge: 30m - - # Additional labels to apply to the deployment. - deploymentLabels: {} - - ## Mesh config settings - - # Install the mesh config map, generated from values.yaml. - # If false, pilot wil use default values (by default) or user-supplied values. - configMap: true - - # Additional labels to apply on the pod level for monitoring and logging configuration. - podLabels: {} - - # Setup how istiod Service is configured. See https://kubernetes.io/docs/concepts/services-networking/dual-stack/#services - ipFamilyPolicy: "" - ipFamilies: [] - -sidecarInjectorWebhook: - # You can use the field called alwaysInjectSelector and neverInjectSelector which will always inject the sidecar or - # always skip the injection on pods that match that label selector, regardless of the global policy. - # See https://istio.io/docs/setup/kubernetes/additional-setup/sidecar-injection/#more-control-adding-exceptions - neverInjectSelector: [] - alwaysInjectSelector: [] - - # injectedAnnotations are additional annotations that will be added to the pod spec after injection - # This is primarily to support PSP annotations. 
For example, if you defined a PSP with the annotations: - # - # annotations: - # apparmor.security.beta.kubernetes.io/allowedProfileNames: runtime/default - # apparmor.security.beta.kubernetes.io/defaultProfileName: runtime/default - # - # The PSP controller would add corresponding annotations to the pod spec for each container. However, this happens before - # the inject adds additional containers, so we must specify them explicitly here. With the above example, we could specify: - # injectedAnnotations: - # container.apparmor.security.beta.kubernetes.io/istio-init: runtime/default - # container.apparmor.security.beta.kubernetes.io/istio-proxy: runtime/default - injectedAnnotations: {} - - # This enables injection of sidecar in all namespaces, - # with the exception of namespaces with "istio-injection:disabled" annotation - # Only one environment should have this enabled. - enableNamespacesByDefault: false - - # Mutations that occur after the sidecar injector are not handled by default, as the Istio sidecar injector is only run - # once. For example, an OPA sidecar injected after the Istio sidecar will not have it's liveness/readiness probes rewritten. - # Setting this to `IfNeeded` will result in the sidecar injector being run again if additional mutations occur. - reinvocationPolicy: Never - - rewriteAppHTTPProbe: true - - # Templates defines a set of custom injection templates that can be used. For example, defining: - # - # templates: - # hello: | - # metadata: - # labels: - # hello: world - # - # Then starting a pod with the `inject.istio.io/templates: hello` annotation, will result in the pod - # being injected with the hello=world labels. - # This is intended for advanced configuration only; most users should use the built in template - templates: {} - - # Default templates specifies a set of default templates that are used in sidecar injection. - # By default, a template `sidecar` is always provided, which contains the template of default sidecar. - # To inject other additional templates, define it using the `templates` option, and add it to - # the default templates list. - # For example: - # - # templates: - # hello: | - # metadata: - # labels: - # hello: world - # - # defaultTemplates: ["sidecar", "hello"] - defaultTemplates: [] -istiodRemote: - # Sidecar injector mutating webhook configuration clientConfig.url value. - # For example: https://$remotePilotAddress:15017/inject - # The host should not refer to a service running in the cluster; use a service reference by specifying - # the clientConfig.service field instead. - injectionURL: "" - - # Sidecar injector mutating webhook configuration path value for the clientConfig.service field. - # Override to pass env variables, for example: /inject/cluster/remote/net/network2 - injectionPath: "/inject" -telemetry: - enabled: true - v2: - # For Null VM case now. - # This also enables metadata exchange. - enabled: true - metadataExchange: - # Indicates whether to enable WebAssembly runtime for metadata exchange filter. - wasmEnabled: false - # Indicate if prometheus stats filter is enabled or not - prometheus: - enabled: true - # Indicates whether to enable WebAssembly runtime for stats filter. - wasmEnabled: false - # overrides stats EnvoyFilter configuration. - configOverride: - gateway: {} - inboundSidecar: {} - outboundSidecar: {} - # stackdriver filter settings. - stackdriver: - enabled: false - logging: false - monitoring: false - topology: false # deprecated. 
setting this to true will have no effect, as this option is no longer supported. - disableOutbound: false - # configOverride parts give you the ability to override the low level configuration params passed to envoy filter. - - configOverride: {} - # e.g. - # disable_server_access_logging: false - # disable_host_header_fallback: true - # Access Log Policy Filter Settings. This enables filtering of access logs from stackdriver. - accessLogPolicy: - enabled: false - # To reduce the number of successful logs, default log window duration is - # set to 12 hours. - logWindowDuration: "43200s" -# Revision is set as 'version' label and part of the resource names when installing multiple control planes. -revision: "" - -# Revision tags are aliases to Istio control plane revisions -revisionTags: [] - -# For Helm compatibility. -ownerName: "" - -# meshConfig defines runtime configuration of components, including Istiod and istio-agent behavior -# See https://istio.io/docs/reference/config/istio.mesh.v1alpha1/ for all available options -meshConfig: - enablePrometheusMerge: true - -global: - # Used to locate istiod. - istioNamespace: istio-system - # List of cert-signers to allow "approve" action in the istio cluster role - # - # certSigners: - # - clusterissuers.cert-manager.io/istio-ca - certSigners: [] - # enable pod disruption budget for the control plane, which is used to - # ensure Istio control plane components are gradually upgraded or recovered. - defaultPodDisruptionBudget: - enabled: true - # The values aren't mutable due to a current PodDisruptionBudget limitation - # minAvailable: 1 - - # A minimal set of requested resources to applied to all deployments so that - # Horizontal Pod Autoscaler will be able to function (if set). - # Each component can overwrite these default values by adding its own resources - # block in the relevant section below and setting the desired resources values. - defaultResources: - requests: - cpu: 10m - # memory: 128Mi - # limits: - # cpu: 100m - # memory: 128Mi - - # Default hub for Istio images. - # Releases are published to docker hub under 'istio' project. - # Dev builds from prow are on gcr.io - hub: docker.io/istio - # Default tag for Istio images. - tag: 1.20.1 - # Variant of the image to use. - # Currently supported are: [debug, distroless] - variant: "" - - # Specify image pull policy if default behavior isn't desired. - # Default behavior: latest images will be Always else IfNotPresent. - imagePullPolicy: "" - - # ImagePullSecrets for all ServiceAccount, list of secrets in the same namespace - # to use for pulling any images in pods that reference this ServiceAccount. - # For components that don't use ServiceAccounts (i.e. grafana, servicegraph, tracing) - # ImagePullSecrets will be added to the corresponding Deployment(StatefulSet) objects. - # Must be set for any cluster configured with private docker registry. - imagePullSecrets: [] - # - private-registry-key - - # Enabled by default in master for maximising testing. 
- istiod: - enableAnalysis: false - - # To output all istio components logs in json format by adding --log_as_json argument to each container argument - logAsJson: false - - # Comma-separated minimum per-scope logging level of messages to output, in the form of :,: - # The control plane has different scopes depending on component, but can configure default log level across all components - # If empty, default scope and level will be used as configured in code - logging: - level: "default:info" - - omitSidecarInjectorConfigMap: false - - # Whether to restrict the applications namespace the controller manages; - # If not set, controller watches all namespaces - oneNamespace: false - - # Configure whether Operator manages webhook configurations. The current behavior - # of Istiod is to manage its own webhook configurations. - # When this option is set as true, Istio Operator, instead of webhooks, manages the - # webhook configurations. When this option is set as false, webhooks manage their - # own webhook configurations. - operatorManageWebhooks: false - - # Custom DNS config for the pod to resolve names of services in other - # clusters. Use this to add additional search domains, and other settings. - # see - # https://kubernetes.io/docs/concepts/services-networking/dns-pod-service/#dns-config - # This does not apply to gateway pods as they typically need a different - # set of DNS settings than the normal application pods (e.g., in - # multicluster scenarios). - # NOTE: If using templates, follow the pattern in the commented example below. - #podDNSSearchNamespaces: - #- global - #- "{{ valueOrDefault .DeploymentMeta.Namespace \"default\" }}.global" - - # Kubernetes >=v1.11.0 will create two PriorityClass, including system-cluster-critical and - # system-node-critical, it is better to configure this in order to make sure your Istio pods - # will not be killed because of low priority class. - # Refer to https://kubernetes.io/docs/concepts/configuration/pod-priority-preemption/#priorityclass - # for more detail. - priorityClassName: "" - - proxy: - image: proxyv2 - - # This controls the 'policy' in the sidecar injector. - autoInject: enabled - - # CAUTION: It is important to ensure that all Istio helm charts specify the same clusterDomain value - # cluster domain. Default value is "cluster.local". - clusterDomain: "cluster.local" - - # Per Component log level for proxy, applies to gateways and sidecars. If a component level is - # not set, then the global "logLevel" will be used. - componentLogLevel: "misc:error" - - # If set, newly injected sidecars will have core dumps enabled. - enableCoreDump: false - - # istio ingress capture allowlist - # examples: - # Redirect only selected ports: --includeInboundPorts="80,8080" - excludeInboundPorts: "" - includeInboundPorts: "*" - - # istio egress capture allowlist - # https://istio.io/docs/tasks/traffic-management/egress.html#calling-external-services-directly - # example: includeIPRanges: "172.30.0.0/16,172.20.0.0/16" - # would only capture egress traffic on those two IP Ranges, all other outbound traffic would - # be allowed by the sidecar - includeIPRanges: "*" - excludeIPRanges: "" - includeOutboundPorts: "" - excludeOutboundPorts: "" - - # Log level for proxy, applies to gateways and sidecars. 
- # Expected values are: trace|debug|info|warning|error|critical|off - logLevel: warning - - #If set to true, istio-proxy container will have privileged securityContext - privileged: false - - # The number of successive failed probes before indicating readiness failure. - readinessFailureThreshold: 4 - - # The initial delay for readiness probes in seconds. - readinessInitialDelaySeconds: 0 - - # The period between readiness probes. - readinessPeriodSeconds: 15 - - # Enables or disables a startup probe. - # For optimal startup times, changing this should be tied to the readiness probe values. - # - # If the probe is enabled, it is recommended to have delay=0s,period=15s,failureThreshold=4. - # This ensures the pod is marked ready immediately after the startup probe passes (which has a 1s poll interval), - # and doesn't spam the readiness endpoint too much - # - # If the probe is disabled, it is recommended to have delay=1s,period=2s,failureThreshold=30. - # This ensures the startup is reasonable fast (polling every 2s). 1s delay is used since the startup is not often ready instantly. - startupProbe: - enabled: true - failureThreshold: 600 # 10 minutes - - # Resources for the sidecar. - resources: - requests: - cpu: 100m - memory: 128Mi - limits: - cpu: 2000m - memory: 1024Mi - - # Default port for Pilot agent health checks. A value of 0 will disable health checking. - statusPort: 15020 - - # Specify which tracer to use. One of: zipkin, lightstep, datadog, stackdriver. - # If using stackdriver tracer outside GCP, set env GOOGLE_APPLICATION_CREDENTIALS to the GCP credential file. - tracer: "zipkin" - - proxy_init: - # Base name for the proxy_init container, used to configure iptables. - image: proxyv2 - - # configure remote pilot and istiod service and endpoint - remotePilotAddress: "" - - ############################################################################################## - # The following values are found in other charts. To effectively modify these values, make # - # make sure they are consistent across your Istio helm charts # - ############################################################################################## - - # The customized CA address to retrieve certificates for the pods in the cluster. - # CSR clients such as the Istio Agent and ingress gateways can use this to specify the CA endpoint. - # If not set explicitly, default to the Istio discovery address. - caAddress: "" - - # Configure a remote cluster data plane controlled by an external istiod. - # When set to true, istiod is not deployed locally and only a subset of the other - # discovery charts are enabled. - externalIstiod: false - - # Configure a remote cluster as the config cluster for an external istiod. - configCluster: false - - # Configure the policy for validating JWT. - # Currently, two options are supported: "third-party-jwt" and "first-party-jwt". - jwtPolicy: "third-party-jwt" - - # Mesh ID means Mesh Identifier. It should be unique within the scope where - # meshes will interact with each other, but it is not required to be - # globally/universally unique. For example, if any of the following are true, - # then two meshes must have different Mesh IDs: - # - Meshes will have their telemetry aggregated in one place - # - Meshes will be federated together - # - Policy will be written referencing one mesh from the other - # - # If an administrator expects that any of these conditions may become true in - # the future, they should ensure their meshes have different Mesh IDs - # assigned. 
- # - # Within a multicluster mesh, each cluster must be (manually or auto) - # configured to have the same Mesh ID value. If an existing cluster 'joins' a - # multicluster mesh, it will need to be migrated to the new mesh ID. Details - # of migration TBD, and it may be a disruptive operation to change the Mesh - # ID post-install. - # - # If the mesh admin does not specify a value, Istio will use the value of the - # mesh's Trust Domain. The best practice is to select a proper Trust Domain - # value. - meshID: "" - - # Configure the mesh networks to be used by the Split Horizon EDS. - # - # The following example defines two networks with different endpoints association methods. - # For `network1` all endpoints that their IP belongs to the provided CIDR range will be - # mapped to network1. The gateway for this network example is specified by its public IP - # address and port. - # The second network, `network2`, in this example is defined differently with all endpoints - # retrieved through the specified Multi-Cluster registry being mapped to network2. The - # gateway is also defined differently with the name of the gateway service on the remote - # cluster. The public IP for the gateway will be determined from that remote service (only - # LoadBalancer gateway service type is currently supported, for a NodePort type gateway service, - # it still need to be configured manually). - # - # meshNetworks: - # network1: - # endpoints: - # - fromCidr: "192.168.0.1/24" - # gateways: - # - address: 1.1.1.1 - # port: 80 - # network2: - # endpoints: - # - fromRegistry: reg1 - # gateways: - # - registryServiceName: istio-ingressgateway.istio-system.svc.cluster.local - # port: 443 - # - meshNetworks: {} - - # Use the user-specified, secret volume mounted key and certs for Pilot and workloads. - mountMtlsCerts: false - - multiCluster: - # Set to true to connect two kubernetes clusters via their respective - # ingressgateway services when pods in each cluster cannot directly - # talk to one another. All clusters should be using Istio mTLS and must - # have a shared root CA for this model to work. - enabled: false - # Should be set to the name of the cluster this installation will run in. This is required for sidecar injection - # to properly label proxies - clusterName: "" - - # Network defines the network this cluster belong to. This name - # corresponds to the networks in the map of mesh networks. - network: "" - - # Configure the certificate provider for control plane communication. - # Currently, two providers are supported: "kubernetes" and "istiod". - # As some platforms may not have kubernetes signing APIs, - # Istiod is the default - pilotCertProvider: istiod - - sds: - # The JWT token for SDS and the aud field of such JWT. See RFC 7519, section 4.1.3. - # When a CSR is sent from Istio Agent to the CA (e.g. Istiod), this aud is to make sure the - # JWT is intended for the CA. - token: - aud: istio-ca - - sts: - # The service port used by Security Token Service (STS) server to handle token exchange requests. - # Setting this port to a non-zero value enables STS server. - servicePort: 0 - - # The name of the CA for workload certificates. - # For example, when caName=GkeWorkloadCertificate, GKE workload certificates - # will be used as the certificates for workloads. - # The default value is "" and when caName="", the CA will be configured by other - # mechanisms (e.g., environmental variable CA_PROVIDER). 
- caName: "" - - # whether to use autoscaling/v2 template for HPA settings - # for internal usage only, not to be configured by users. - autoscalingv2API: true - -base: - # For istioctl usage to disable istio config crds in base - enableIstioConfigCRDs: true - - # If enabled, gateway-api types will be validated using the standard upstream validation logic. - # This is an alternative to deploying the standalone validation server the project provides. - # This is disabled by default, as the cluster may already have a validation server; while technically - # it works to have multiple redundant validations, this adds complexity and operational risks. - # Users should consider enabling this if they want full gateway-api validation but don't have other validation servers. - validateGateway: false - -# keep in sync with settings used when installing the Istio CNI chart -istio_cni: - enabled: false - chained: true - diff --git a/modules/kubernetes/istio/kiali.yaml b/modules/kubernetes/istio/kiali.yaml deleted file mode 100644 index 7b82f383..00000000 --- a/modules/kubernetes/istio/kiali.yaml +++ /dev/null @@ -1,122 +0,0 @@ -nameOverride: "" -fullnameOverride: "" - -image: # see: https://quay.io/repository/kiali/kiali-operator?tab=tags - repo: quay.io/kiali/kiali-operator # quay.io/kiali/kiali-operator - tag: v1.78.0 # version string like v1.39.0 or a digest hash - digest: "" # use "sha256" if tag is a sha256 hash (do NOT prefix this value with a "@") - pullPolicy: Always - pullSecrets: [] - -# Deployment options for the operator pod. -nodeSelector: {} -podAnnotations: {} -podLabels: {} -env: [] -tolerations: [] -resources: - requests: - cpu: "10m" - memory: "64Mi" -affinity: {} -replicaCount: 1 -priorityClassName: "" -securityContext: {} - -# metrics.enabled: set to true if you want Prometheus to collect metrics from the operator -metrics: - enabled: true - -# debug.enabled: when true the full ansible logs are dumped after each reconciliation run -# debug.verbosity: defines the amount of details the operator will log (higher numbers are more noisy) -# debug.enableProfiler: when true (regardless of debug.enabled), timings for the most expensive tasks will be logged after each reconciliation loop -debug: - enabled: true - verbosity: "1" - enableProfiler: false - -# Defines where the operator will look for Kial CR resources. "" means "all namespaces". -watchNamespace: "" - -# Set to true if you want the operator to be able to create cluster roles. This is necessary -# if you want to support Kiali CRs with spec.deployment.accessible_namespaces of '**'. -# Setting this to "true" requires allowAllAccessibleNamespaces to be "true" also. -# Note that this will be overriden to "true" if cr.create is true and cr.spec.deployment.accessible_namespaces is ['**']. -clusterRoleCreator: true - -# Set to a list of secrets in the cluster that the operator will be allowed to read. This is necessary if you want to -# support Kiali CRs with spec.kiali_feature_flags.certificates_information_indicators.enabled=true. -# The secrets in this list will be the only ones allowed to be specified in any Kiali CR (in the setting -# spec.kiali_feature_flags.certificates_information_indicators.secrets). -# If you set this to an empty list, the operator will not be given permission to read any additional secrets -# found in the cluster, and thus will only support a value of "false" in the Kiali CR setting -# spec.kiali_feature_flags.certificates_information_indicators.enabled. 
-secretReader: ["cacerts", "istio-ca-secret"] - -# Set to true if you want to allow the operator to only be able to install Kiali in view-only-mode. -# The purpose for this setting is to allow you to restrict the permissions given to the operator itself. -onlyViewOnlyMode: false - -# allowAdHocKialiNamespace tells the operator to allow a user to be able to install a Kiali CR in one namespace but -# be able to install Kiali in another namespace. In other words, it will allow the Kiali CR spec.deployment.namespace -# to be something other than the namespace where the CR is installed. You may want to disable this if you are -# running in a multi-tenant scenario in which you only want a user to be able to install Kiali in the same namespace -# where the user has permissions to install a Kiali CR. -allowAdHocKialiNamespace: true - -# allowAdHocKialiImage tells the operator to allow a user to be able to install a custom Kiali image as opposed -# to the image the operator will install by default. In other words, it will allow the -# Kiali CR spec.deployment.image_name and spec.deployment.image_version to be configured by the user. -# You may want to disable this if you do not want users to install their own Kiali images. -allowAdHocKialiImage: false - -# allowAdHocOSSMConsoleImage tells the operator to allow a user to be able to install a custom OSSMC image as opposed -# to the image the operator will install by default. In other words, it will allow the -# OSSMConsole CR spec.deployment.imageName and spec.deployment.imageVersion to be configured by the user. -# You may want to disable this if you do not want users to install their own OSSMC images. -# This is only applicable when running on OpenShift. -allowAdHocOSSMConsoleImage: false - -# allowSecurityContextOverride tells the operator to allow a user to be able to fully override the Kiali -# container securityContext. If this is false, certain securityContext settings must exist on the Kiali -# container and any attempt to override them will be ignored. -allowSecurityContextOverride: false - -# allowAllAccessibleNamespaces tells the operator to allow a user to be able to configure Kiali -# to access all namespaces in the cluster via spec.deployment.accessible_namespaces=['**']. -# If this is false, the user must specify an explicit list of namespaces in the Kiali CR. -# Setting this to "true" requires clusterRoleCreator to be "true" also. -# Note that this will be overriden to "true" if cr.create is true and cr.spec.deployment.accessible_namespaces is ['**']. -allowAllAccessibleNamespaces: true - -# accessibleNamespacesLabel restricts the namespaces that a user can add to the Kiali CR spec.deployment.accessible_namespaces. -# This value is either an empty string (which disables this feature) or a label name with an optional label value -# (e.g. "mylabel" or "mylabel=myvalue"). Only namespaces that have that label will be permitted in -# spec.deployment.accessible_namespaces. Any namespace not labeled properly but specified in accessible_namespaces will cause -# the operator to abort the Kiali installation. -# If just a label name (but no label value) is specified, the label value the operator will look for is the value of -# the Kiali CR's spec.istio_namespace. In other words, the operator will look for the named label whose value must be the name -# of the Istio control plane namespace (which is typically, but not necessarily, "istio-system"). 
-accessibleNamespacesLabel: "" - -# For what a Kiali CR spec can look like, see: -# https://github.com/kiali/kiali-operator/blob/master/deploy/kiali/kiali_cr.yaml -cr: - create: false - name: kiali - # If you elect to create a Kiali CR (--set cr.create=true) - # and the operator is watching all namespaces (--set watchNamespace="") - # then this is the namespace where the CR will be created (the default will be the operator namespace). - namespace: "" - - # Annotations to place in the Kiali CR metadata. - annotations: {} - - spec: - deployment: - accessible_namespaces: - - "**" - external_services: - prometheus: - # Prometheus service name is "metrics" and is in the "telemetry" namespace - url: "http://prometheus-server.monitoring:80/" diff --git a/modules/kubernetes/istio/main.tf b/modules/kubernetes/istio/main.tf deleted file mode 100644 index 3f3021d8..00000000 --- a/modules/kubernetes/istio/main.tf +++ /dev/null @@ -1,116 +0,0 @@ -variable "tls_secret_name" {} - -resource "kubernetes_namespace" "istio" { - metadata { - name = "istio-system" - } -} - -module "tls_secret" { - source = "../setup_tls_secret" - namespace = kubernetes_namespace.istio.metadata[0].name - tls_secret_name = var.tls_secret_name -} - -# to delete all CRDS: kubectl get crd -oname | grep --color=never 'istio.io' | xargs kubectl delete -resource "helm_release" "istio-base" { - namespace = kubernetes_namespace.istio.metadata[0].name - create_namespace = false - name = "istio-base" - atomic = true - - repository = "https://istio-release.storage.googleapis.com/charts" - chart = "base" - depends_on = [kubernetes_namespace.istio] -} - -resource "helm_release" "istiod" { - namespace = kubernetes_namespace.istio.metadata[0].name - create_namespace = false - name = "istiod" - atomic = true - - repository = "https://istio-release.storage.googleapis.com/charts" - chart = "istiod" - depends_on = [kubernetes_namespace.istio] -} - -resource "helm_release" "istio-gateway" { - namespace = kubernetes_namespace.istio.metadata[0].name - create_namespace = false - name = "istio-gateway" - atomic = true - - repository = "https://istio-release.storage.googleapis.com/charts" - chart = "gateway" - depends_on = [kubernetes_namespace.istio] -} - -# Kiali dashboard -resource "helm_release" "kiali" { - namespace = kubernetes_namespace.istio.metadata[0].name - create_namespace = false - name = "kiali" - atomic = true - - repository = "https://kiali.org/helm-charts" - chart = "kiali-operator" - set { - name = "cr.create" - value = "true" - } - set { - name = "cr.namespace" - value = "istio-system" - } - values = [templatefile("${path.module}/kiali.yaml", {})] - - depends_on = [kubernetes_namespace.istio] -} - -resource "kubernetes_secret" "kiali-token" { - metadata { - name = "kiali-secret" - namespace = kubernetes_namespace.istio.metadata[0].name - annotations = { - "kubernetes.io/service-account.name" : "kiali-service-account" - } - } - type = "kubernetes.io/service-account-token" -} - -# Gets auto removed. 
revisit after finishing power consmption analysis -# resource "kubernetes_ingress_v1" "kiali" { -# metadata { -# name = "kiali" -# namespace = kubernetes_namespace.istio.metadata[0].name -# annotations = { -# "kubernetes.io/ingress.class" = "nginx" -# "nginx.ingress.kubernetes.io/auth-url" : "https://oauth2.viktorbarzin.me/oauth2/auth" -# "nginx.ingress.kubernetes.io/auth-signin" : "https://oauth2.viktorbarzin.me/oauth2/start?rd=/redirect/$http_host$escaped_request_uri" -# } -# } - -# spec { -# tls { -# hosts = ["kiali.viktorbarzin.me"] -# secret_name = var.tls_secret_name -# } -# rule { -# host = "kiali.viktorbarzin.me" -# http { -# path { -# path = "/" -# backend { -# service { -# name = "kiali" -# port { -# number = 20001 -# } -# } -# } -# } -# } -# } -# } -# } diff --git a/modules/kubernetes/jellyfin/main.tf b/modules/kubernetes/jellyfin/main.tf deleted file mode 100644 index 31768890..00000000 --- a/modules/kubernetes/jellyfin/main.tf +++ /dev/null @@ -1,117 +0,0 @@ -variable "tls_secret_name" {} - -resource "kubernetes_namespace" "jellyfin" { - metadata { - name = "jellyfin" - } -} - -module "tls_secret" { - source = "../setup_tls_secret" - namespace = kubernetes_namespace.jellyfin.metadata[0].name - tls_secret_name = var.tls_secret_name -} - -resource "kubernetes_deployment" "jellyfin" { - metadata { - name = "jellyfin" - namespace = kubernetes_namespace.jellyfin.metadata[0].name - labels = { - app = "jellyfin" - } - annotations = { - "reloader.stakater.com/search" = "true" - } - } - spec { - replicas = 1 - strategy { - type = "Recreate" - } - selector { - match_labels = { - app = "jellyfin" - } - } - template { - metadata { - labels = { - app = "jellyfin" - } - } - spec { - container { - image = "jellyfin/jellyfin" - name = "jellyfin" - - port { - container_port = 8096 - } - volume_mount { - name = "media" - mount_path = "/media" - } - volume_mount { - name = "config" - mount_path = "/config" - } - volume_mount { - name = "cache" - mount_path = "/cache" - } - } - volume { - name = "media" - nfs { - path = "/mnt/main/jellyfin/media" - server = "10.0.10.15" - } - } - volume { - name = "config" - nfs { - path = "/mnt/main/jellyfin/config" - server = "10.0.10.15" - } - } - volume { - name = "cache" - nfs { - path = "/mnt/main/jellyfin/cache" - server = "10.0.10.15" - } - } - } - } - } -} - -resource "kubernetes_service" "jellyfin" { - metadata { - name = "jellyfin" - namespace = kubernetes_namespace.jellyfin.metadata[0].name - labels = { - "app" = "jellyfin" - } - } - - spec { - selector = { - app = "jellyfin" - } - port { - name = "http" - target_port = 8096 - port = 80 - protocol = "TCP" - } - } -} - -module "ingress" { - source = "../ingress_factory" - namespace = kubernetes_namespace.jellyfin.metadata[0].name - name = "jellyfin" - tls_secret_name = var.tls_secret_name -} - diff --git a/modules/kubernetes/kafka/kafka_chart_values.tpl b/modules/kubernetes/kafka/kafka_chart_values.tpl deleted file mode 100644 index 871df263..00000000 --- a/modules/kubernetes/kafka/kafka_chart_values.tpl +++ /dev/null @@ -1,9 +0,0 @@ -metrics: - kafka: - enabled: true -persistence: - enabled: false -zookeeper: - persistence: - enabled: false -replicaCount: 3 diff --git a/modules/kubernetes/kafka/main.tf b/modules/kubernetes/kafka/main.tf deleted file mode 100644 index a9cd605f..00000000 --- a/modules/kubernetes/kafka/main.tf +++ /dev/null @@ -1,142 +0,0 @@ -variable "tls_secret_name" {} -variable "client_certificate_secret_name" {} - -module "tls_secret" { - source = "../setup_tls_secret" - namespace = 
kubernetes_namespace.kafka.metadata[0].name - tls_secret_name = var.tls_secret_name -} - -resource "helm_release" "kafka" { - namespace = kubernetes_namespace.kafka.metadata[0].name - create_namespace = true - name = "kafka" - - repository = "https://charts.bitnami.com/bitnami" - chart = "kafka" - - values = [templatefile("${path.module}/kafka_chart_values.tpl", {})] -} - -resource "kubernetes_deployment" "kafka-ui" { - metadata { - name = "kafka-ui" - namespace = kubernetes_namespace.kafka.metadata[0].name - labels = { - run = "kafka-ui" - } - } - spec { - replicas = 1 - selector { - match_labels = { - run = "kafka-ui" - } - } - template { - metadata { - labels = { - run = "kafka-ui" - } - } - spec { - container { - image = "provectuslabs/kafka-ui:latest" - name = "kafka-ui" - resources { - limits = { - cpu = "0.5" - memory = "512Mi" - } - requests = { - cpu = "250m" - memory = "50Mi" - } - } - port { - container_port = 8080 - } - env { - name = "KAFKA_CLUSTERS_0_NAME" - value = "local" - } - env { - name = "KAFKA_CLUSTERS_0_BOOTSTRAPSERVERS" - value = "kafka:9092" - } - env { - name = "KAFKA_CLUSTERS_0_ZOOKEEPER" - value = "kafka-zookeeper:2181" - } - } - } - } - } -} - -resource "kubernetes_service" "kafka-ui" { - metadata { - name = "kafka-ui" - namespace = kubernetes_namespace.kafka.metadata[0].name - labels = { - "run" = "kafka-ui" - } - # annotations = { - # "prometheus.io/scrape" = "true" - # "prometheus.io/path" = "/metrics" - # "prometheus.io/port" = "9113" - # } - } - - spec { - selector = { - run = "kafka-ui" - } - port { - name = "http" - port = "80" - target_port = "8080" - } - # port { - # name = "prometheus" - # port = "9113" - # target_port = "9113" - # } - } -} - -resource "kubernetes_ingress_v1" "kafka-ui" { - metadata { - name = "kafka-ui-ingress" - namespace = kubernetes_namespace.kafka.metadata[0].name - annotations = { - "traefik.ingress.kubernetes.io/router.middlewares" = "traefik-rate-limit@kubernetescrd,traefik-csp-headers@kubernetescrd,traefik-crowdsec@kubernetescrd" - "traefik.ingress.kubernetes.io/router.entrypoints" = "websecure" - "traefik.ingress.kubernetes.io/router.tls.options" = "traefik-mtls@kubernetescrd" - } - } - - spec { - ingress_class_name = "traefik" - tls { - hosts = ["kafka.viktorbarzin.me"] - secret_name = var.tls_secret_name - } - rule { - host = "kafka.viktorbarzin.me" - http { - path { - path = "/" - backend { - service { - name = "kafka-ui" - port { - number = 80 - } - } - } - } - } - } - } -} diff --git a/modules/kubernetes/keyserver/index.md b/modules/kubernetes/keyserver/index.md deleted file mode 100644 index 8b6a7b66..00000000 --- a/modules/kubernetes/keyserver/index.md +++ /dev/null @@ -1,73 +0,0 @@ -This contains the setup for setting up a remote machine that serves a keyfile for decrypting a luks volume - -1. Install nginx -``` -sudo apt update -sudo apt install nginx apache2-utils -y -``` - -2. Create User for basic auth - -``` -sudo htpasswd -c /etc/nginx/.htpasswd truenas -``` - -3. Create secure directory and key file - -``` -sudo mkdir -p /srv/keys -head -c 128 /dev/urandom | sudo tee /srv/keys/truenas.key >/dev/null -``` - -4. Create rate limit zone -``` -# /etc/nginx/conf.d/ratelimit.conf - -# Allow only 3 key requests per minute per IP -limit_req_zone $binary_remote_addr zone=keylimit:10m rate=3r/m; -``` - -5. 
Configure nginx virtual host -``` -# /etc/nginx/sites-available/keyserver.conf - -server { - listen 443 ssl; - server_name ; - - # TLS certificate and key (we will set these in the next step) - ssl_certificate /etc/ssl/certs/keyserver.crt; - ssl_certificate_key /etc/ssl/private/keyserver.key; - - # Enforce strong TLS - ssl_protocols TLSv1.2 TLSv1.3; - ssl_prefer_server_ciphers on; - - # Rate limiting zone created earlier - limit_req zone=keylimit burst=2 nodelay; - - location /keys/ { - alias /srv/keys/; - - # Basic auth - auth_basic "Restricted"; - auth_basic_user_file /etc/nginx/.htpasswd; - - # Disable directory listing - autoindex off; - - # Prevent caching - add_header Cache-Control "no-store, no-cache, must-revalidate, max-age=0" always; - } -} -``` - -6. Enable the host: -``` -sudo ln -s /etc/nginx/sites-available/keyserver.conf /etc/nginx/sites-enabled/ -``` - -7. Disable default host: -``` -sudo rm /etc/nginx/sites-enabled/default -``` diff --git a/modules/kubernetes/keyserver/inventory.ini b/modules/kubernetes/keyserver/inventory.ini deleted file mode 100644 index 55ba5d2e..00000000 --- a/modules/kubernetes/keyserver/inventory.ini +++ /dev/null @@ -1,2 +0,0 @@ -[keyserver] -130.162.165.220 ansible_user=ubuntu ansible_ssh_private_key_file=~/.ssh/id_ed25519 diff --git a/modules/kubernetes/kured/main.tf b/modules/kubernetes/kured/main.tf deleted file mode 100644 index 4e997fa7..00000000 --- a/modules/kubernetes/kured/main.tf +++ /dev/null @@ -1,31 +0,0 @@ -variable "tls_secret_name" {} -variable "notify_url" {} - -resource "kubernetes_namespace" "kured" { - metadata { - name = "kured" - labels = { - "istio-injection" : "disabled" - } - } -} - -module "tls_secret" { - source = "../setup_tls_secret" - namespace = kubernetes_namespace.kured.metadata[0].name - tls_secret_name = var.tls_secret_name -} - -resource "helm_release" "kured" { - namespace = kubernetes_namespace.kured.metadata[0].name - create_namespace = false - name = "kured" - - repository = "https://kubereboot.github.io/charts" - chart = "kured" - - values = [templatefile("${path.module}/values.yaml", { notify_url : var.notify_url })] - atomic = true - - depends_on = [kubernetes_namespace.kured] -} diff --git a/modules/kubernetes/kured/values.yaml b/modules/kubernetes/kured/values.yaml deleted file mode 100644 index 8b04c1fc..00000000 --- a/modules/kubernetes/kured/values.yaml +++ /dev/null @@ -1,12 +0,0 @@ -window_start: "22:00" -window_end: "06:00" -reboot_days: "mon,tue,wed,thu,fri" - -service: - annotations: - prometheus.io/scrape: "true" - prometheus.io/path: "/metrics" - prometheus.io/port: "8080" - -configuration: - notifyUrl: "${notify_url}" diff --git a/modules/kubernetes/localai/chart_values.tpl b/modules/kubernetes/localai/chart_values.tpl deleted file mode 100644 index 7fb70512..00000000 --- a/modules/kubernetes/localai/chart_values.tpl +++ /dev/null @@ -1,93 +0,0 @@ -replicaCount: 1 - -deployment: - image: quay.io/go-skynet/local-ai:latest - env: - threads: 4 - context_size: 512 - modelsPath: "/models" - -resources: - {} - # We usually recommend not to specify default resources and to leave this as a conscious - # choice for the user. This also increases chances charts run on environments with little - # resources, such as Minikube. If you do want to specify resources, uncomment the following - # lines, adjust them as necessary, and remove the curly braces after 'resources:'. 
- # limits: - # cpu: 100m - # memory: 128Mi - # requests: - # cpu: 100m - # memory: 128Mi - -# Prompt templates to include -# Note: the keys of this map will be the names of the prompt template files -promptTemplates: - {} - # ggml-gpt4all-j.tmpl: | - # The prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response. - # ### Prompt: - # {{.Input}} - # ### Response: - -# Models to download at runtime -models: - # Whether to force download models even if they already exist - forceDownload: false - - # The list of URLs to download models from - # Note: the name of the file will be the name of the loaded model - list: - - url: - "https://gpt4all.io/models/ggml-gpt4all-j.bin" - # basicAuth: base64EncodedCredentials - - # Persistent storage for models and prompt templates. - # PVC and HostPath are mutually exclusive. If both are enabled, - # PVC configuration takes precedence. If neither are enabled, ephemeral - # storage is used. - persistence: - pvc: - enabled: false - size: 2Gi - accessModes: - - ReadWriteOnce - - annotations: {} - - # Optional - storageClass: ~ - - hostPath: - enabled: false - path: "/models" - -service: - type: ClusterIP - port: 80 - annotations: {} - # If using an AWS load balancer, you'll need to override the default 60s load balancer idle timeout - # service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: "1200" - -ingress: - enabled: true - className: "nginx" - annotations: - {} - # kubernetes.io/ingress.class: nginx - # kubernetes.io/tls-acme: "true" - hosts: - - host: ai.viktorbarzin.me - paths: - - path: / - pathType: ImplementationSpecific - tls: - - secretName: "${tls_secret}" - hosts: - - ai.viktorbarzin.me - -nodeSelector: {} - -tolerations: [] - -affinity: {} diff --git a/modules/kubernetes/localai/main.tf b/modules/kubernetes/localai/main.tf deleted file mode 100644 index 32f66e20..00000000 --- a/modules/kubernetes/localai/main.tf +++ /dev/null @@ -1,21 +0,0 @@ -variable "tls_secret_name" {} - -resource "helm_release" "prometheus" { - namespace = "localai" - create_namespace = true - name = "localai" - - repository = "https://go-skynet.github.io/helm-charts/" - chart = "local-ai" - # version = "15.0.2" - # atomic = true - # cleanup_on_fail = true - - values = [templatefile("${path.module}/chart_values.tpl", { tls_secret = var.tls_secret_name })] -} - -module "tls_secret" { - source = "../setup_tls_secret" - namespace = "localai" - tls_secret_name = var.tls_secret_name -} diff --git a/modules/kubernetes/main.tf b/modules/kubernetes/main.tf deleted file mode 100644 index d1bc22e3..00000000 --- a/modules/kubernetes/main.tf +++ /dev/null @@ -1,3 +0,0 @@ -# All service modules have been migrated to individual Terragrunt stacks under stacks/. -# See stacks//main.tf for each service's configuration. -# This file is no longer used. 
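The deleted placeholder above points at the per-stack layout that replaced these modules. As a rough illustration only, a minimal `stacks/<service>/main.tf` in that layout might look like the sketch below; the relative module paths, the `example` names, and the `tls_secret_name` variable are assumptions for illustration, while the argument names mirror how the deleted modules above (for example jellyfin and mcaptcha) already wired the `setup_tls_secret` and `ingress_factory` factories.

```
# Hypothetical stacks/<service>/main.tf sketch - names and paths are illustrative.
# Assumes the kubernetes provider is configured elsewhere in the stack.
variable "tls_secret_name" {}

resource "kubernetes_namespace" "example" {
  metadata {
    name = "example"
  }
}

module "tls_secret" {
  # Assumed relative path from stacks/<service>/ to the surviving shared module.
  source          = "../../modules/kubernetes/setup_tls_secret"
  namespace       = kubernetes_namespace.example.metadata[0].name
  tls_secret_name = var.tls_secret_name
}

module "ingress" {
  # Assumed relative path; arguments match the calls made by the deleted modules.
  source          = "../../modules/kubernetes/ingress_factory"
  namespace       = kubernetes_namespace.example.metadata[0].name
  name            = "example"
  tls_secret_name = var.tls_secret_name
}
```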
diff --git a/modules/kubernetes/mcaptcha/main.tf b/modules/kubernetes/mcaptcha/main.tf deleted file mode 100644 index 0d65ee38..00000000 --- a/modules/kubernetes/mcaptcha/main.tf +++ /dev/null @@ -1,310 +0,0 @@ -variable "tls_secret_name" {} -variable "tier" { type = string } -variable "postgresql_password" {} -variable "cookie_secret" {} -variable "captcha_salt" {} - -locals { - domain = "mcaptcha.viktorbarzin.me" - port = 7000 -} - -resource "kubernetes_namespace" "mcaptcha" { - metadata { - name = "mcaptcha" - labels = { - "istio-injection" : "disabled" - tier = var.tier - } - } -} - -module "tls_secret" { - source = "../setup_tls_secret" - namespace = kubernetes_namespace.mcaptcha.metadata[0].name - tls_secret_name = var.tls_secret_name -} - -# mCaptcha requires a special Redis with the mcaptcha/cache module loaded -resource "kubernetes_deployment" "mcaptcha_redis" { - metadata { - name = "mcaptcha-redis" - namespace = kubernetes_namespace.mcaptcha.metadata[0].name - labels = { - app = "mcaptcha-redis" - tier = var.tier - } - } - - spec { - replicas = 1 - selector { - match_labels = { - app = "mcaptcha-redis" - } - } - - strategy { - type = "Recreate" - } - - template { - metadata { - labels = { - app = "mcaptcha-redis" - } - } - - spec { - container { - image = "mcaptcha/cache:latest" - name = "redis" - - port { - container_port = 6379 - } - - resources { - requests = { - memory = "64Mi" - cpu = "25m" - } - limits = { - memory = "128Mi" - cpu = "200m" - } - } - - liveness_probe { - tcp_socket { - port = 6379 - } - initial_delay_seconds = 10 - period_seconds = 10 - } - - readiness_probe { - tcp_socket { - port = 6379 - } - initial_delay_seconds = 5 - period_seconds = 5 - } - } - } - } - } -} - -resource "kubernetes_service" "mcaptcha_redis" { - metadata { - name = "mcaptcha-redis" - namespace = kubernetes_namespace.mcaptcha.metadata[0].name - labels = { - app = "mcaptcha-redis" - } - } - - spec { - selector = { - app = "mcaptcha-redis" - } - port { - name = "redis" - port = 6379 - target_port = 6379 - } - } -} - -resource "kubernetes_deployment" "mcaptcha" { - metadata { - name = "mcaptcha" - namespace = kubernetes_namespace.mcaptcha.metadata[0].name - labels = { - app = "mcaptcha" - tier = var.tier - } - annotations = { - "reloader.stakater.com/search" = "true" - } - } - - spec { - replicas = 1 - selector { - match_labels = { - app = "mcaptcha" - } - } - - strategy { - type = "Recreate" - } - - template { - metadata { - labels = { - app = "mcaptcha" - } - annotations = { - "diun.enable" = "true" - "diun.include_tags" = "^\\d+(?:\\.\\d+)?(?:\\.\\d+)?$" - } - } - - spec { - container { - image = "mcaptcha/mcaptcha:latest" - name = "mcaptcha" - - port { - container_port = local.port - } - - # Required configuration - env { - name = "MCAPTCHA_server_DOMAIN" - value = local.domain - } - - env { - name = "MCAPTCHA_server_COOKIE_SECRET" - value = var.cookie_secret - } - - env { - name = "MCAPTCHA_captcha_SALT" - value = var.captcha_salt - } - - # Server configuration - env { - name = "PORT" - value = tostring(local.port) - } - - env { - name = "MCAPTCHA_server_IP" - value = "0.0.0.0" - } - - env { - name = "MCAPTCHA_server_PROXY_HAS_TLS" - value = "true" - } - - # Database configuration (PostgreSQL) - env { - name = "DATABASE_URL" - value = "postgres://mcaptcha:${var.postgresql_password}@postgresql.dbaas.svc.cluster.local:5432/mcaptcha" - } - - # Redis configuration (using mcaptcha/cache module) - env { - name = "MCAPTCHA_redis_URL" - value = 
"redis://mcaptcha-redis.mcaptcha.svc.cluster.local:6379" - } - - # Feature flags - env { - name = "MCAPTCHA_allow_registration" - # value = "true" - value = "false" - } - - env { - name = "MCAPTCHA_allow_demo" - value = "false" - } - - env { - name = "MCAPTCHA_commercial" - value = "false" - } - - env { - name = "MCAPTCHA_captcha_ENABLE_STATS" - value = "true" - } - - env { - name = "MCAPTCHA_captcha_GC" - value = "30" - } - - env { - name = "MCAPTCHA_debug" - value = "false" - } - env { - name = "RUST_BACKTRACE" - value = "1" - } - - resources { - requests = { - memory = "64Mi" - cpu = "50m" - } - limits = { - memory = "256Mi" - cpu = "500m" - } - } - - # Health checks - liveness_probe { - http_get { - path = "/" - port = local.port - } - initial_delay_seconds = 30 - period_seconds = 10 - timeout_seconds = 5 - failure_threshold = 3 - } - - readiness_probe { - http_get { - path = "/" - port = local.port - } - initial_delay_seconds = 10 - period_seconds = 5 - timeout_seconds = 3 - failure_threshold = 3 - } - } - } - } - } -} - -resource "kubernetes_service" "mcaptcha" { - metadata { - name = "mcaptcha" - namespace = kubernetes_namespace.mcaptcha.metadata[0].name - labels = { - "app" = "mcaptcha" - } - } - - spec { - selector = { - app = "mcaptcha" - } - port { - name = "http" - port = 80 - target_port = local.port - } - } -} - -module "ingress" { - source = "../ingress_factory" - namespace = kubernetes_namespace.mcaptcha.metadata[0].name - name = "mcaptcha" - tls_secret_name = var.tls_secret_name -} diff --git a/modules/kubernetes/oauth-proxy/main.tf b/modules/kubernetes/oauth-proxy/main.tf deleted file mode 100644 index 0f46bb02..00000000 --- a/modules/kubernetes/oauth-proxy/main.tf +++ /dev/null @@ -1,400 +0,0 @@ -# variable "host" { -# type = string -# } - -resource "kubernetes_namespace" "oauth2" { - metadata { - name = "oauth2" - # cookie seems to be not set and auth fails - # labels = { - # "istio-injection" : "enabled" - # } - } -} -variable "tls_secret_name" { - type = string -} - -variable "oauth2_proxy_client_secret" { - type = string -} - -variable "oauth2_proxy_client_id" { - type = string -} -variable "authenticated_emails" { - type = string - default = "" -} - -module "tls_secret" { - source = "../setup_tls_secret" - namespace = "oauth2" - tls_secret_name = var.tls_secret_name -} - -resource "kubernetes_config_map" "config" { - metadata { - name = "oauth2-proxy-nginx" - namespace = "oauth2" - - annotations = { - "reloader.stakater.com/match" = "true" - } - } - - data = { - "nginx.conf" = <<-EOT - worker_processes 5; - - events { - } - - http { - server { - listen 80 default_server; - - location = /healthcheck { - add_header Content-Type text/plain; - return 200 'ok'; - } - - location ~ /redirect/(.*) { - return 307 https://$1$is_args$args; - } - } - } - EOT - } -} - -resource "kubernetes_config_map" "authorized-emails" { - metadata { - name = "authorized-emails" - namespace = "oauth2" - - annotations = { - "reloader.stakater.com/match" = "true" - } - } - - data = { - "authorized_emails.txt" = var.authenticated_emails - } -} - -resource "random_password" "cookie" { - length = 16 - special = true - override_special = "_%@" -} - -resource "kubernetes_deployment" "oauth2-proxy" { - metadata { - name = "oauth2-proxy" - namespace = "oauth2" - labels = { - app = "oauth2" - } - annotations = { - "reloader.stakater.com/search" = "true" - } - } - spec { - replicas = 1 - selector { - match_labels = { - app = "oauth2" - } - } - template { - metadata { - labels = { - app = "oauth2" - } - } 
- spec { - container { - image = "nginx:latest" - name = "nginx" - - port { - name = "http" - container_port = 80 - protocol = "TCP" - } - volume_mount { - name = "config" - mount_path = "/etc/nginx/" - } - liveness_probe { - http_get { - path = "/healthcheck" - port = 80 - } - } - } - container { - image = "quay.io/pusher/oauth2_proxy:latest" - name = "oauth2-proxy" - args = ["--provider=google", "--upstream=file:///dev/null", "--upstream=http://localhost/redirect/", "--http-address=0.0.0.0:4180", "--cookie-domain=.viktorbarzin.me", "--footer=-", "--authenticated-emails-file=/etc/authorized_emails/authorized_emails.txt"] - # args = ["--provider=google", "--upstream=file:///dev/null", "--upstream=http://localhost/redirect/", "--http-address=0.0.0.0:4180", "--cookie-domain=.viktorbarzin.me", "--footer=-", "--email-domain=*", "--google-group=barzini-lab-admins@googlegroups.com", "--google-admin-email=vbarzin@gmail.com", "--google-service-account-json=/etc/google_service_account/google_service_account.json"] - # args = ["--provider=google", "--upstream=file:///dev/null", "--upstream=http://localhost/redirect/", "--http-address=0.0.0.0:4180", "--cookie-domain=.viktorbarzin.me", "--footer=-", "--email-domain=*", "--google-group=barzini-lab-admins", "--google-admin-email=533122798643-compute@developer.gserviceaccount.com", "--google-service-account-json=/etc/google_service_account/google_service_account.json"] - env { - name = "OAUTH2_PROXY_CLIENT_ID" - value = var.oauth2_proxy_client_id - } - env { - name = "OAUTH2_PROXY_CLIENT_SECRET" - value = var.oauth2_proxy_client_secret - } - env { - name = "OAUTH2_PROXY_COOKIE_SECRET" - value = random_password.cookie.result - } - port { - name = "oauth" - container_port = 4180 - protocol = "TCP" - } - volume_mount { - name = "authorized-emails" - mount_path = "/etc/authorized_emails" - } - # volume_mount { - # name = "sa-json" - # mount_path = "/etc/google_service_account/" - # } - } - volume { - name = "config" - config_map { - name = "oauth2-proxy-nginx" - } - } - volume { - name = "authorized-emails" - config_map { - name = "authorized-emails" - } - } - # volume { - # name = "sa-json" - # config_map { - # name = "google-service-account" - # } - # } - } - } - } -} - -resource "kubernetes_service" "oauth_proxy" { - metadata { - name = "oauth2" - namespace = "oauth2" - labels = { - app = "oauth2" - } - } - - spec { - selector = { - app = "oauth2" - } - port { - name = "http" - port = "80" - target_port = 4180 - } - } -} - -module "ingress" { - source = "../ingress_factory" - namespace = "oauth2" - name = "oauth2" - tls_secret_name = var.tls_secret_name -} - - - - - -# variable "svc_name" { -# type = string -# } -# variable "client_id" {} -# variable "client_secret" {} - - -# resource "kubernetes_deployment" "oauth_proxy" { -# metadata { -# name = "oauth-proxy" -# namespace = var.namespace -# labels = { -# run = "oauth-proxy" -# } -# } -# spec { -# replicas = 1 -# selector { -# match_labels = { -# run = "oauth-proxy" -# } -# } -# template { -# metadata { -# labels = { -# run = "oauth-proxy" -# } -# } -# spec { -# container { -# image = "quay.io/oauth2-proxy/oauth2-proxy:latest" -# args = ["--provider=google", "--email-domain=*", "upstream=file:///dev/null", "--http-address=0.0.0.0:4180"] -# name = "oauth-proxy" -# image_pull_policy = "IfNotPresent" -# resources { -# limits = { -# cpu = "0.5" -# memory = "512Mi" -# } -# requests = { -# cpu = "250m" -# memory = "50Mi" -# } -# } -# port { -# container_port = 4180 -# } -# env { -# name = 
"OAUTH2_PROXY_CLIENT_ID" -# value = var.client_id -# } -# env { -# name = "OAUTH2_PROXY_CLIENT_SECRET" -# value = var.client_secret -# } -# env { -# name = "OAUTH2_PROXY_COOKIE_SECRET" -# value = random_password.cookie.result -# } -# } -# } -# } -# } -# } - -# resource "kubernetes_service" "oauth_proxy" { -# metadata { -# name = var.svc_name -# namespace = var.namespace -# labels = { -# run = "oauth-proxy" -# } -# } - -# spec { -# selector = { -# run = "oauth-proxy" -# } -# port { -# name = "http" -# port = "80" -# target_port = "4180" -# } -# } -# } - -# resource "kubernetes_ingress_v1" "oauth" { -# metadata { -# name = "oauth-ingress" -# namespace = var.namespace -# annotations = { -# "kubernetes.io/ingress.class" = "nginx" -# "nginx.ingress.kubernetes.io/use-regex" = "true" -# } -# } - -# spec { -# tls { -# hosts = [var.host] -# secret_name = var.tls_secret_name -# } -# rule { -# host = var.host -# http { -# path { -# path = "/oauth2/.*" -# backend { -# service { -# name = var.svc_name -# port { -# number = 80 -# } -# } -# } -# } -# } -# } -# } -# } - -# apiVersion: apps/v1 -# kind: Deployment -# metadata: -# labels: -# k8s-app: oauth2-proxy -# name: oauth2-proxy -# namespace: kube-system -# spec: -# replicas: 1 -# selector: -# matchLabels: -# k8s-app: oauth2-proxy -# template: -# metadata: -# labels: -# k8s-app: oauth2-proxy -# spec: -# containers: -# - args: -# - --provider=github -# - --email-domain=* -# - --upstream=file:///dev/null -# - --http-address=0.0.0.0:4180 -# # Register a new application -# # https://github.com/settings/applications/new -# env: -# - name: OAUTH2_PROXY_CLIENT_ID -# value: -# - name: OAUTH2_PROXY_CLIENT_SECRET -# value: -# # docker run -ti --rm python:3-alpine python -c 'import secrets,base64; print(base64.b64encode(base64.b64encode(secrets.token_bytes(16))));' -# - name: OAUTH2_PROXY_COOKIE_SECRET -# value: SECRET -# image: quay.io/oauth2-proxy/oauth2-proxy:latest -# imagePullPolicy: Always -# name: oauth2-proxy -# ports: -# - containerPort: 4180 -# protocol: TCP - -# --- - -# apiVersion: v1 -# kind: Service -# metadata: -# labels: -# k8s-app: oauth2-proxy -# name: oauth2-proxy -# namespace: kube-system -# spec: -# ports: -# - name: http -# port: 4180 -# protocol: TCP -# targetPort: 4180 -# selector: -# k8s-app: oauth2-proxy diff --git a/modules/kubernetes/openid_help_page/main.tf b/modules/kubernetes/openid_help_page/main.tf deleted file mode 100644 index 59151979..00000000 --- a/modules/kubernetes/openid_help_page/main.tf +++ /dev/null @@ -1,87 +0,0 @@ -variable "tls_secret_name" {} - -resource "kubernetes_namespace" "openid_help_page" { - metadata { - name = "openid-help-page" - } -} - -module "tls_secret" { - source = "../setup_tls_secret" - namespace = "openid-help-page" - tls_secret_name = var.tls_secret_name -} - -resource "kubernetes_deployment" "openid_help_page" { - metadata { - name = "openid-help-page" - namespace = "openid-help-page" - labels = { - app = "openid-help-page" - } - } - spec { - replicas = 3 - selector { - match_labels = { - app = "openid-help-page" - } - } - template { - metadata { - labels = { - app = "openid-help-page" - } - } - spec { - container { - image = "viktorbarzin/openid-create-account-help-webpage:latest" - name = "openid-help-page" - resources { - limits = { - cpu = "0.5" - memory = "512Mi" - } - requests = { - cpu = "250m" - memory = "50Mi" - } - } - port { - container_port = 80 - } - } - } - } - } -} - -resource "kubernetes_service" "openid_help_page" { - metadata { - name = "openid-help-page" - namespace = 
"openid-help-page" - } - - spec { - port { - name = "service-port" - protocol = "TCP" - port = 80 - target_port = "80" - } - - selector = { - app = "openid-help-page" - } - type = "ClusterIP" - session_affinity = "None" - } -} - -module "ingress" { - source = "../ingress_factory" - namespace = "openid-help-page" - name = "openid-help-page" - host = "kubectl" - tls_secret_name = var.tls_secret_name -} diff --git a/modules/kubernetes/pihole/main.tf b/modules/kubernetes/pihole/main.tf deleted file mode 100644 index ede0c04a..00000000 --- a/modules/kubernetes/pihole/main.tf +++ /dev/null @@ -1,201 +0,0 @@ -variable "tls_secret_name" {} -variable "web_password" {} - -resource "kubernetes_namespace" "pihole" { - metadata { - name = "pihole" - } -} - -module "tls_secret" { - source = "../setup_tls_secret" - namespace = kubernetes_namespace.pihole.metadata[0].name - tls_secret_name = var.tls_secret_name -} - - -resource "kubernetes_config_map" "external_conf" { - metadata { - name = "external-conf" - namespace = kubernetes_namespace.pihole.metadata[0].name - - labels = { - app = "pihole" - } - } - data = { - "external.conf" = "$HTTP[\"host\"] == \"pihole.viktorbarzin.me\" {\n server.document-root = \"/var/www/html/admin/\"\n}\n" - } -} - -resource "kubernetes_deployment" "pihole" { - metadata { - name = "pihole" - namespace = kubernetes_namespace.pihole.metadata[0].name - labels = { - app = "pihole" - } - } - spec { - replicas = 1 - selector { - match_labels = { - app = "pihole" - } - } - template { - metadata { - labels = { - app = "pihole" - } - } - spec { - container { - image = "pihole/pihole:latest" - name = "pihole" - resources { - limits = { - cpu = "1" - memory = "1Gi" - } - requests = { - cpu = "1" - memory = "1Gi" - } - } - port { - container_port = 80 - } - env { - name = "DNS1" - value = "10.0.20.200#5354" # bind - } - env { - name = "VIRTUAL_HOST" - value = "pihole.viktorbarzin.me" - } - env { - name = "WEBPASSWORD" - value = var.web_password - } - env { - name = "TZ" - value = "Europe/Sofia" - } - volume_mount { - name = "external-conf" - mount_path = "/tmp/external.conf" - sub_path = "external.conf" - } - volume_mount { - name = "pihole-local-etc-volume" - mount_path = "/etc/pihole" - } - volume_mount { - name = "pihole-local-dnsmasq-volume" - mount_path = "/etc/dnsmasq.d" - } - } - volume { - name = "external-conf" - config_map { - name = "external-conf" - } - } - volume { - name = "pihole-local-etc-volume" - empty_dir {} # no hard dependencies on truenas which needs dns - } - volume { - name = "pihole-local-dnsmasq-volume" - empty_dir {} # no hard dependencies on truenas which needs dns - } - } - } - } -} - -resource "kubernetes_service" "pihole-dns" { - metadata { - name = "pihole-dns" - namespace = kubernetes_namespace.pihole.metadata[0].name - labels = { - "app" = "pihole" - } - annotations = { - "metallb.universe.tf/allow-shared-ip" : "shared" - } - } - - spec { - # type = "LoadBalancer" - # external_traffic_policy = "Cluster" - selector = { - app = "pihole" - } - port { - name = "dns-udp" - port = "53" - protocol = "UDP" - } - } -} - -resource "kubernetes_service" "pihole-web" { - metadata { - name = "pihole-web" - namespace = kubernetes_namespace.pihole.metadata[0].name - labels = { - "app" = "pihole" - } - annotations = { - "metallb.universe.tf/allow-shared-ip" : "shared" - } - } - - spec { - selector = { - app = "pihole" - } - port { - name = "dns-web" - port = "80" - } - } -} - -resource "kubernetes_ingress_v1" "pihole" { - metadata { - name = "pihole-ingress" - namespace 
= kubernetes_namespace.pihole.metadata[0].name - annotations = { - "traefik.ingress.kubernetes.io/router.middlewares" = "traefik-rate-limit@kubernetescrd,traefik-csp-headers@kubernetescrd,traefik-crowdsec@kubernetescrd" - "traefik.ingress.kubernetes.io/router.entrypoints" = "websecure" - "traefik.ingress.kubernetes.io/router.tls.options" = "traefik-mtls@kubernetescrd" - } - } - - spec { - ingress_class_name = "traefik" - tls { - hosts = ["pihole.viktorbarzin.me"] - secret_name = var.tls_secret_name - } - rule { - host = "pihole.viktorbarzin.me" - http { - path { - path = "/" - backend { - service { - name = "pihole-web" - port { - number = 80 - } - } - } - } - } - } - } -} diff --git a/modules/kubernetes/vault/chart_values.tpl b/modules/kubernetes/vault/chart_values.tpl deleted file mode 100644 index a0bc188a..00000000 --- a/modules/kubernetes/vault/chart_values.tpl +++ /dev/null @@ -1,23 +0,0 @@ -global: - namespace: "vault" - image: - repository: "hashicorp/vault-k8s" - tag: "1.7.0" - agentImage: - repository: "hashicorp/vault" - tag: "1.20.4" -injector: - metrics: - enabled: true -server: - image: - repository: "hashicorp/vault" - tag: "1.20.4" - enabled: true - volumes: - - name: data - emptyDir: {} - ingress: - enabled: false -ui: - enabled: true diff --git a/modules/kubernetes/vault/main.tf b/modules/kubernetes/vault/main.tf deleted file mode 100644 index 8d4d4ded..00000000 --- a/modules/kubernetes/vault/main.tf +++ /dev/null @@ -1,61 +0,0 @@ -variable "tls_secret_name" {} -variable "host" { - default = "vault.viktorbarzin.me" -} -variable "tier" { type = string } - -resource "kubernetes_namespace" "vault" { - metadata { - name = "vault" - labels = { - tier = var.tier - } - } -} - -module "tls_secret" { - source = "../setup_tls_secret" - namespace = kubernetes_namespace.vault.metadata[0].name - tls_secret_name = var.tls_secret_name -} - -resource "kubernetes_persistent_volume" "vault_data" { - metadata { - name = "vault-data-pv" - } - spec { - capacity = { - "storage" = "10Gi" - } - access_modes = ["ReadWriteOnce"] - persistent_volume_source { - nfs { - server = "10.0.10.15" - path = "/mnt/main/vault" - } - } - } -} - -resource "helm_release" "vault" { - namespace = kubernetes_namespace.vault.metadata[0].name - name = "vault" - atomic = true - - repository = "https://helm.releases.hashicorp.com" - chart = "vault" - - values = [templatefile("${path.module}/chart_values.tpl", { host = var.host, tls_secret_name = var.tls_secret_name })] - - depends_on = [kubernetes_persistent_volume.vault_data] -} - -module "ingress" { - source = "../ingress_factory" - namespace = kubernetes_namespace.vault.metadata[0].name - name = "vault" - service_name = "vault-ui" - port = 8200 - tls_secret_name = var.tls_secret_name - protected = true -} diff --git a/modules/kubernetes/versions.tf b/modules/kubernetes/versions.tf deleted file mode 100644 index 8a49eebc..00000000 --- a/modules/kubernetes/versions.tf +++ /dev/null @@ -1,8 +0,0 @@ -terraform { - required_providers { - kubernetes = { - source = "hashicorp/kubernetes" - version = "3.0.1" - } - } -} diff --git a/modules/kubernetes/vikunja/main.tf b/modules/kubernetes/vikunja/main.tf deleted file mode 100644 index 1936ab66..00000000 --- a/modules/kubernetes/vikunja/main.tf +++ /dev/null @@ -1,216 +0,0 @@ -variable "tls_secret_name" {} - -resource "kubernetes_namespace" "vikunja" { - metadata { - name = "vikunja" - } -} - -module "tls_secret" { - source = "../setup_tls_secret" - namespace = kubernetes_namespace.vikunja.metadata[0].name - tls_secret_name = 
var.tls_secret_name -} - -resource "kubernetes_deployment" "vikunja" { - metadata { - name = "vikunja" - namespace = kubernetes_namespace.vikunja.metadata[0].name - labels = { - app = "vikunja" - } - annotations = { - "reloader.stakater.com/search" = "true" - } - } - spec { - replicas = 1 - strategy { - type = "Recreate" - } - selector { - match_labels = { - app = "vikunja" - } - } - template { - metadata { - labels = { - app = "vikunja" - } - } - spec { - container { - image = "vikunja/api" - name = "api" - # General settings - env { - name = "VIKUNJA_SERVICE_TIMEZONE" - value = "Europe/London" - } - env { - name = "VIKUNJA_SERVICE_ENABLEREGISTRATION" - value = "true" - } - env { - name = "VIKUNJA_LOG_LEVEL" - value = "DEBUG" - } - # Frontend Settings - env { - name = "VIKUNJA_SERVICE_JWTSECRET" - value = "vikunja" - } - env { - name = "VIKUNJA_SERVICE_FRONTENDURL" - value = "https://todo.viktorbarzin.me/" - } - # DB Settings - env { - name = "VIKUNJA_DATABASE_HOST" - value = "mysql.dbaas.svc.cluster.local" - } - env { - name = "VIKUNJA_DATABASE_PASSWORD" - value = "" # ADD ME - } - env { - name = "VIKUNJA_DATABASE_TYPE" - value = "mysql" - } - env { - name = "VIKUNJA_DATABASE_USER" - value = "vikunja" - } - env { - name = "VIKUNJA_DATABASE_DATABASE" - value = "vikunja" - } - env { - name = "VIKUNJA_LOG_DATABASE" - value = "true" - } - env { - name = "VIKUNJA_LOG_DATABASELEVEL" - value = "DEBUG" - } - # Mailser settings - env { - name = "VIKUNJA_MAILER_ENABLED" - value = "true" - } - env { - name = "VIKUNJA_MAILER_HOST" - value = "mailserver.mailserver.svc.cluster.local" - } - env { - name = "VIKUNJA_MAILER_USERNAME" - value = "me@viktorbarzin.me" - } - env { - name = "VIKUNJA_MAILER_PASSWORD" - value = "" # TODO: add me - } - env { - name = "VIKUNJA_MAILER_FROMEMAIL" - value = "todo@viktorbarzin.me" - } - # TODOIST settings - env { - name = "VIKUNJA_MIGRATION_TODOIST_ENABLE" - value = "true" - } - env { - name = "VIKUNJA_MIGRATION_TODOIST_CLIENTID" - value = "" # TODO: add me - } - env { - name = "VIKUNJA_MIGRATION_TODOIST_CLIENTSECRET" - value = "" # TODO: add me - } - env { - name = "VIKUNJA_MIGRATION_TODOIST_REDIRECTURL" - value = "https://todo.viktorbarzin.me/migrate/todoist" - } - port { - name = "api" - container_port = 3456 - } - } - - container { - image = "vikunja/frontend" - name = "frontend" - port { - name = "http" - container_port = 80 - } - } - } - } - } -} - -resource "kubernetes_service" "vikunja" { - metadata { - name = "vikunja" - namespace = kubernetes_namespace.vikunja.metadata[0].name - labels = { - "app" = "vikunja" - } - } - - spec { - selector = { - app = "vikunja" - } - port { - name = "http" - target_port = 80 - port = 80 - protocol = "TCP" - } - } -} - -resource "kubernetes_service" "api" { - metadata { - name = "api" - namespace = kubernetes_namespace.vikunja.metadata[0].name - labels = { - "app" = "vikunja" - } - } - - spec { - selector = { - app = "vikunja" - } - port { - name = "api" - target_port = 3456 - port = 3456 - protocol = "TCP" - } - } -} - -module "ingress" { - source = "../ingress_factory" - namespace = kubernetes_namespace.vikunja.metadata[0].name - name = "vikunja" - host = "todo" - tls_secret_name = var.tls_secret_name -} - -module "ingress-api" { - source = "../ingress_factory" - namespace = kubernetes_namespace.vikunja.metadata[0].name - name = "vikunja-api" - host = "todo" - service_name = "api" - port = 3456 - ingress_path = ["/api/"] - tls_secret_name = var.tls_secret_name -} - diff --git a/stacks/blog/main.tf b/stacks/blog/main.tf index 
6cebc2c7..0235cb53 100644 --- a/stacks/blog/main.tf +++ b/stacks/blog/main.tf @@ -10,8 +10,6 @@ locals { } } -# variable "dockerhub_password" {} - resource "kubernetes_namespace" "website" { metadata { name = "website" @@ -28,12 +26,6 @@ module "tls_secret" { tls_secret_name = var.tls_secret_name } -# module "dockerhub_creds" { -# source = "../../modules/kubernetes/dockerhub_secret" -# namespace = kubernetes_namespace.website.metadata[0].name -# password = var.dockerhub_password -# } - resource "kubernetes_deployment" "blog" { metadata { name = "blog" diff --git a/stacks/city-guesser/main.tf b/stacks/city-guesser/main.tf index d946f936..cd402610 100644 --- a/stacks/city-guesser/main.tf +++ b/stacks/city-guesser/main.tf @@ -91,26 +91,6 @@ resource "kubernetes_service" "city-guesser" { } } } -# resource "kubernetes_service" "city-guesser-oauth" { -# metadata { -# name = "city-guesser-oauth" -# namespace = "city-guesser" -# labels = { -# "run" = "city-guesser-oauth" -# } -# } - -# spec { -# type = "ExternalName" -# external_name = "oauth-proxy.oauth.svc.cluster.local" - -# # port { -# # name = "tcp" -# # port = "80" -# # target_port = "80" -# # } -# } -# } module "ingress" { source = "../../modules/kubernetes/ingress_factory" @@ -119,45 +99,3 @@ module "ingress" { tls_secret_name = var.tls_secret_name protected = true } - -# resource "kubernetes_ingress_v1" "city-guesser-oauth" { -# metadata { -# name = "city-guesser-ingress-oauth" -# namespace = "city-guesser" -# annotations = { -# "kubernetes.io/ingress.class" = "nginx" -# } -# } - -# spec { -# tls { -# hosts = ["city-guesser.viktorbarzin.me"] -# secret_name = var.tls_secret_name -# } -# rule { -# host = "city-guesser.viktorbarzin.me" -# http { -# path { -# path = "/oauth2" -# backend { -# service_name = "city-guesser-oauth" -# service_port = "80" -# } -# } -# } -# } -# } -# } - - -# module "oauth" { -# source = "../../modules/kubernetes/oauth-proxy" -# # oauth_client_id = "3d8ce4bf7b893899d967" -# # oauth_client_secret = "REDACTED_OAUTH_SECRET" -# client_id = "3d8ce4bf7b893899d967" -# client_secret = "REDACTED_OAUTH_SECRET" -# namespace = "city-guesser" -# host = "city-guesser.viktorbarzin.me" -# tls_secret_name = var.tls_secret_name -# svc_name = "city-guesser-oauth" -# } diff --git a/stacks/travel_blog/main.tf b/stacks/travel_blog/main.tf index 185a39c3..8e6d699c 100644 --- a/stacks/travel_blog/main.tf +++ b/stacks/travel_blog/main.tf @@ -26,12 +26,6 @@ module "tls_secret" { tls_secret_name = var.tls_secret_name } -# module "dockerhub_creds" { -# source = "../../modules/kubernetes/dockerhub_secret" -# namespace = kubernetes_namespace.travel.metadata[0].name -# password = var.dockerhub_password -# } - resource "kubernetes_deployment" "blog" { metadata { name = "travel-blog" diff --git a/versions.tf b/versions.tf deleted file mode 100644 index b1ead449..00000000 --- a/versions.tf +++ /dev/null @@ -1,49 +0,0 @@ -# terraform { -# required_providers { -# kubernetes = { -# source = "hashicorp/kubernetes" -# } -# kubectl = { -# source = "gavinbunney/kubectl" -# version = ">= 1.10.0" -# } -# } -# required_version = ">= 0.13" -# } - -# terraform { -# required_providers { -# proxmox = { -# source = "telmate/proxmox" -# version = "2.9.14" -# } -# } -# } - -# provides more resources -# terraform { -# required_providers { -# proxmox = { -# source = "bpg/proxmox" -# version = "0.39.0" -# } -# } -# } - -# terraform { -# required_providers { -# cloudflare = { -# source = "cloudflare/cloudflare" -# version = "~> 4.0" -# } -# } -# } - -terraform { - 
required_providers { - proxmox = { - source = "telmate/proxmox" - version = "3.0.2-rc07" - } - } -}