diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md index 8540cb3d..46d57e11 100755 --- a/.claude/CLAUDE.md +++ b/.claude/CLAUDE.md @@ -61,6 +61,14 @@ For platform modules, use `source = "../../../../modules/kubernetes/nfs_volume"` **StorageClass**: `nfs-truenas` (deployed via `stacks/platform/modules/nfs-csi/`). **DO NOT use inline `nfs {}` blocks** — they mount with `hard,timeo=600` defaults which hang forever on stale mounts. +### iSCSI Storage for Databases +**StorageClass**: `iscsi-truenas` (deployed via `stacks/platform/modules/iscsi-csi/` using democratic-csi). +- Used by: PostgreSQL (CNPG), MySQL (InnoDB Cluster) — any pod, any node, same data +- Driver: `freenas-iscsi` (SSH-based, NOT `freenas-api-iscsi` which is TrueNAS SCALE only) +- ZFS datasets: `main/iscsi` (zvols), `main/iscsi-snaps` (snapshots) +- All K8s nodes have `open-iscsi` + `iscsid` running +- Redis stays on `local-path` (StatefulSet `volumeClaimTemplates` are immutable) + ### Adding NFS Exports 1. **Create the directory on TrueNAS first**: `ssh root@10.0.10.15 "mkdir -p /mnt/main/ && chmod 777 /mnt/main/"` 2. Edit `secrets/nfs_directories.txt` — add path, keep sorted diff --git a/.claude/cluster-health.sh b/.claude/cluster-health.sh index d14187c0..0d4f4ad5 100755 --- a/.claude/cluster-health.sh +++ b/.claude/cluster-health.sh @@ -1522,12 +1522,9 @@ main() { print_summary send_slack - # Exit code: 2 for failures, 1 for warnings, 0 for clean - if [[ "$FAIL_COUNT" -gt 0 ]]; then - exit 2 - elif [[ "$WARN_COUNT" -gt 0 ]]; then - exit 1 - fi + # Always exit 0 — reporting is done via Slack notification. + # Non-zero exits mark the CronJob as Failed, which triggers Prometheus + # JobFailed alerts, creating a circular alert loop. exit 0 } diff --git a/modules/create-template-vm/cloud_init.yaml b/modules/create-template-vm/cloud_init.yaml index 5c6678ef..b62430fb 100644 --- a/modules/create-template-vm/cloud_init.yaml +++ b/modules/create-template-vm/cloud_init.yaml @@ -38,6 +38,8 @@ packages: # kubernetes - kubeadm - kubelet + # iSCSI client for CSI-backed database storage + - open-iscsi %{endif} apt: @@ -60,6 +62,7 @@ runcmd: - containerd config default | sudo tee /etc/containerd/config.toml - ${containerd_config_update_command} - systemctl restart containerd + - systemctl enable --now iscsid - ${k8s_join_command} - systemctl enable kubelet - systemctl start kubelet diff --git a/stacks/audiobookshelf/main.tf b/stacks/audiobookshelf/main.tf index e1176926..8249b014 100644 --- a/stacks/audiobookshelf/main.tf +++ b/stacks/audiobookshelf/main.tf @@ -86,6 +86,26 @@ resource "kubernetes_deployment" "audiobookshelf" { port { container_port = 80 } + liveness_probe { + http_get { + path = "/healthcheck" + port = 80 + } + initial_delay_seconds = 15 + period_seconds = 30 + timeout_seconds = 5 + failure_threshold = 5 + } + readiness_probe { + http_get { + path = "/healthcheck" + port = 80 + } + initial_delay_seconds = 5 + period_seconds = 30 + timeout_seconds = 5 + failure_threshold = 3 + } volume_mount { name = "audiobooks" mount_path = "/audiobooks" diff --git a/stacks/immich/main.tf b/stacks/immich/main.tf index 6cc19d96..85bdbaf0 100644 --- a/stacks/immich/main.tf +++ b/stacks/immich/main.tf @@ -515,6 +515,26 @@ resource "kubernetes_deployment" "immich-machine-learning" { "nvidia.com/gpu" = "1" } } + liveness_probe { + http_get { + path = "/ping" + port = 3003 + } + initial_delay_seconds = 30 + period_seconds = 30 + timeout_seconds = 5 + failure_threshold = 5 + } + readiness_probe { + http_get { + path = "/ping" + port = 3003 + } + initial_delay_seconds = 15 + period_seconds = 30 + timeout_seconds = 5 + failure_threshold = 3 + } } volume { name = "cache" diff --git a/stacks/ntfy/main.tf b/stacks/ntfy/main.tf index 0c571540..485607ca 100644 --- a/stacks/ntfy/main.tf +++ b/stacks/ntfy/main.tf @@ -62,6 +62,26 @@ resource "kubernetes_deployment" "ntfy" { port { container_port = 80 } + liveness_probe { + http_get { + path = "/v1/health" + port = 80 + } + initial_delay_seconds = 15 + period_seconds = 30 + timeout_seconds = 5 + failure_threshold = 5 + } + readiness_probe { + http_get { + path = "/v1/health" + port = 80 + } + initial_delay_seconds = 5 + period_seconds = 30 + timeout_seconds = 5 + failure_threshold = 3 + } env { name = "NTFY_BASE_URL" value = "https://ntfy.viktorbarzin.me" diff --git a/stacks/platform/main.tf b/stacks/platform/main.tf index 98100aed..7ad0ac6a 100644 --- a/stacks/platform/main.tf +++ b/stacks/platform/main.tf @@ -123,6 +123,16 @@ variable "webhook_handler_git_token" { type = string } variable "technitium_username" { type = string } variable "technitium_password" { type = string } +# --- iscsi-csi --- +variable "truenas_api_key" { + type = string + sensitive = true +} +variable "truenas_ssh_private_key" { + type = string + sensitive = true +} + # ============================================================================= # Module Calls # ============================================================================= @@ -318,6 +328,17 @@ module "nfs-csi" { nfs_server = var.nfs_server } +# ----------------------------------------------------------------------------- +# iSCSI CSI — democratic-csi for TrueNAS iSCSI (database storage) +# ----------------------------------------------------------------------------- +module "iscsi-csi" { + source = "./modules/iscsi-csi" + tier = local.tiers.cluster + truenas_host = var.nfs_server # Same TrueNAS host + truenas_api_key = var.truenas_api_key + truenas_ssh_private_key = var.truenas_ssh_private_key +} + # ----------------------------------------------------------------------------- # CNPG — CloudNativePG Operator + local-path-provisioner for database storage # ----------------------------------------------------------------------------- diff --git a/stacks/platform/modules/dbaas/main.tf b/stacks/platform/modules/dbaas/main.tf index c68574e3..47fda94c 100644 --- a/stacks/platform/modules/dbaas/main.tf +++ b/stacks/platform/modules/dbaas/main.tf @@ -128,7 +128,7 @@ resource "helm_release" "mysql_cluster" { } datadirVolumeClaimTemplate = { - storageClassName = "local-path" + storageClassName = "iscsi-truenas" resources = { requests = { storage = "30Gi" @@ -799,7 +799,7 @@ resource "null_resource" "pg_cluster" { instances = "2" image = "ghcr.io/cloudnative-pg/postgis:16" storage_size = "20Gi" - storage_class = "local-path" + storage_class = "iscsi-truenas" memory_limit = "4Gi" cpu_limit = "2" } @@ -822,7 +822,7 @@ resource "null_resource" "pg_cluster" { enableSuperuserAccess: true storage: size: 20Gi - storageClass: local-path + storageClass: iscsi-truenas resources: requests: cpu: "250m" diff --git a/stacks/platform/modules/headscale/main.tf b/stacks/platform/modules/headscale/main.tf index b18c069c..c41cff65 100644 --- a/stacks/platform/modules/headscale/main.tf +++ b/stacks/platform/modules/headscale/main.tf @@ -92,6 +92,27 @@ resource "kubernetes_deployment" "headscale" { container_port = 41641 } + liveness_probe { + http_get { + path = "/health" + port = 8080 + } + initial_delay_seconds = 15 + period_seconds = 30 + timeout_seconds = 5 + failure_threshold = 5 + } + readiness_probe { + http_get { + path = "/health" + port = 8080 + } + initial_delay_seconds = 5 + period_seconds = 30 + timeout_seconds = 5 + failure_threshold = 3 + } + volume_mount { name = "config-volume" mount_path = "/etc/headscale" diff --git a/stacks/platform/modules/infra-maintenance/main.tf b/stacks/platform/modules/infra-maintenance/main.tf index 6df6c57a..1f1451d1 100644 --- a/stacks/platform/modules/infra-maintenance/main.tf +++ b/stacks/platform/modules/infra-maintenance/main.tf @@ -100,9 +100,9 @@ resource "kubernetes_cron_job_v1" "backup-etcd" { host_network = true container { name = "backup-etcd" - image = "registry.k8s.io/etcd:3.6.5-0" - command = ["/bin/sh"] - args = ["-c", "etcdctl --endpoints=https://127.0.0.1:2379 --cacert=/etc/kubernetes/pki/etcd/ca.crt --cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt --key=/etc/kubernetes/pki/etcd/healthcheck-client.key snapshot save /backup/etcd-snapshot-$(date +%Y_%m_%d_%H:%M:%S_%Z).db"] + image = "registry.k8s.io/etcd:3.5.21-0" + command = ["etcdctl"] + args = ["--endpoints=https://127.0.0.1:2379", "--cacert=/etc/kubernetes/pki/etcd/ca.crt", "--cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt", "--key=/etc/kubernetes/pki/etcd/healthcheck-client.key", "snapshot", "save", "/backup/etcd-snapshot-latest.db"] env { name = "ETCDCTL_API" value = "3" diff --git a/stacks/platform/modules/iscsi-csi/main.tf b/stacks/platform/modules/iscsi-csi/main.tf new file mode 100644 index 00000000..9c0a578e --- /dev/null +++ b/stacks/platform/modules/iscsi-csi/main.tf @@ -0,0 +1,102 @@ +resource "kubernetes_namespace" "iscsi_csi" { + metadata { + name = "iscsi-csi" + labels = { + tier = var.tier + } + } +} + +resource "helm_release" "democratic_csi" { + namespace = kubernetes_namespace.iscsi_csi.metadata[0].name + create_namespace = false + name = "democratic-csi-iscsi" + atomic = true + timeout = 300 + + repository = "https://democratic-csi.github.io/charts/" + chart = "democratic-csi" + + values = [yamlencode({ + csiDriver = { + name = "org.democratic-csi.iscsi" + } + + storageClasses = [{ + name = "iscsi-truenas" + defaultClass = false + reclaimPolicy = "Retain" + volumeBindingMode = "Immediate" + allowVolumeExpansion = true + parameters = { + fsType = "ext4" + } + mountOptions = [] + }] + + controller = { + driver = { + resources = { + requests = { cpu = "25m", memory = "64Mi" } + limits = { cpu = "250m", memory = "256Mi" } + } + } + } + + node = { + driver = { + resources = { + requests = { cpu = "25m", memory = "64Mi" } + limits = { cpu = "250m", memory = "256Mi" } + } + } + + hostPID = true + hostPath = "/lib/modules" + } + + driver = { + config = { + driver = "freenas-iscsi" + + instance_id = "truenas-iscsi" + + httpConnection = { + protocol = "http" + host = var.truenas_host + port = 80 + apiKey = var.truenas_api_key + } + + sshConnection = { + host = var.truenas_host + port = 22 + username = "root" + privateKey = var.truenas_ssh_private_key + } + + zfs = { + datasetParentName = "main/iscsi" + detachedSnapshotsDatasetParentName = "main/iscsi-snaps" + } + + iscsi = { + targetPortal = "${var.truenas_host}:3260" + namePrefix = "csi-" + nameSuffix = "" + targetGroups = [{ + targetGroupPortalGroup = 1 + targetGroupInitiatorGroup = 1 + targetGroupAuthType = "None" + }] + extentInsecureTpc = true + extentXenCompat = false + extentDisablePhysicalBlocksize = true + extentBlocksize = 512 + extentRpm = "SSD" + extentAvailThreshold = 0 + } + } + } + })] +} diff --git a/stacks/platform/modules/iscsi-csi/variables.tf b/stacks/platform/modules/iscsi-csi/variables.tf new file mode 100644 index 00000000..893fe396 --- /dev/null +++ b/stacks/platform/modules/iscsi-csi/variables.tf @@ -0,0 +1,10 @@ +variable "tier" { type = string } +variable "truenas_host" { type = string } +variable "truenas_api_key" { + type = string + sensitive = true +} +variable "truenas_ssh_private_key" { + type = string + sensitive = true +} diff --git a/stacks/platform/modules/uptime-kuma/main.tf b/stacks/platform/modules/uptime-kuma/main.tf index 52bd1bdf..6e69b850 100644 --- a/stacks/platform/modules/uptime-kuma/main.tf +++ b/stacks/platform/modules/uptime-kuma/main.tf @@ -79,6 +79,26 @@ resource "kubernetes_deployment" "uptime-kuma" { port { container_port = 3001 } + liveness_probe { + http_get { + path = "/" + port = 3001 + } + initial_delay_seconds = 15 + period_seconds = 30 + timeout_seconds = 5 + failure_threshold = 5 + } + readiness_probe { + http_get { + path = "/" + port = 3001 + } + initial_delay_seconds = 5 + period_seconds = 30 + timeout_seconds = 5 + failure_threshold = 3 + } volume_mount { name = "data" mount_path = "/app/data" diff --git a/stacks/platform/modules/vaultwarden/main.tf b/stacks/platform/modules/vaultwarden/main.tf index 8b84afbf..f1560228 100644 --- a/stacks/platform/modules/vaultwarden/main.tf +++ b/stacks/platform/modules/vaultwarden/main.tf @@ -109,6 +109,26 @@ resource "kubernetes_deployment" "vaultwarden" { port { container_port = 80 } + liveness_probe { + http_get { + path = "/alive" + port = 80 + } + initial_delay_seconds = 15 + period_seconds = 30 + timeout_seconds = 5 + failure_threshold = 5 + } + readiness_probe { + http_get { + path = "/alive" + port = 80 + } + initial_delay_seconds = 5 + period_seconds = 30 + timeout_seconds = 5 + failure_threshold = 3 + } volume_mount { name = "data" mount_path = "/data" diff --git a/stacks/rybbit/main.tf b/stacks/rybbit/main.tf index 70a123e0..f166602f 100644 --- a/stacks/rybbit/main.tf +++ b/stacks/rybbit/main.tf @@ -49,6 +49,9 @@ resource "kubernetes_deployment" "clickhouse" { } spec { replicas = 1 + strategy { + type = "Recreate" + } selector { match_labels = { app = "clickhouse" @@ -77,6 +80,26 @@ resource "kubernetes_deployment" "clickhouse" { protocol = "TCP" container_port = 8123 } + liveness_probe { + http_get { + path = "/ping" + port = 8123 + } + initial_delay_seconds = 15 + period_seconds = 30 + timeout_seconds = 5 + failure_threshold = 5 + } + readiness_probe { + http_get { + path = "/ping" + port = 8123 + } + initial_delay_seconds = 5 + period_seconds = 30 + timeout_seconds = 5 + failure_threshold = 3 + } volume_mount { name = "data" mount_path = "/var/lib/clickhouse" @@ -133,7 +156,7 @@ resource "kubernetes_cron_job_v1" "clickhouse_truncate_logs" { namespace = kubernetes_namespace.rybbit.metadata[0].name } spec { - schedule = "0 */6 * * *" + schedule = "0 */6 * * *" successful_jobs_history_limit = 1 failed_jobs_history_limit = 1 job_template { @@ -252,6 +275,26 @@ resource "kubernetes_deployment" "rybbit" { port { container_port = 3001 } + liveness_probe { + http_get { + path = "/api/health" + port = 3001 + } + initial_delay_seconds = 15 + period_seconds = 30 + timeout_seconds = 5 + failure_threshold = 5 + } + readiness_probe { + http_get { + path = "/api/health" + port = 3001 + } + initial_delay_seconds = 5 + period_seconds = 30 + timeout_seconds = 5 + failure_threshold = 3 + } resources { requests = { cpu = "25m" @@ -328,6 +371,26 @@ resource "kubernetes_deployment" "rybbit-client" { protocol = "TCP" container_port = 3002 } + liveness_probe { + http_get { + path = "/" + port = 3002 + } + initial_delay_seconds = 15 + period_seconds = 30 + timeout_seconds = 5 + failure_threshold = 5 + } + readiness_probe { + http_get { + path = "/" + port = 3002 + } + initial_delay_seconds = 5 + period_seconds = 30 + timeout_seconds = 5 + failure_threshold = 3 + } resources { requests = { cpu = "10m" diff --git a/stacks/url/main.tf b/stacks/url/main.tf index 4a8e9397..87206463 100644 --- a/stacks/url/main.tf +++ b/stacks/url/main.tf @@ -147,6 +147,26 @@ resource "kubernetes_deployment" "shlink" { port { container_port = 8080 } + liveness_probe { + http_get { + path = "/rest/v3/health" + port = 8080 + } + initial_delay_seconds = 15 + period_seconds = 30 + timeout_seconds = 5 + failure_threshold = 5 + } + readiness_probe { + http_get { + path = "/rest/v3/health" + port = 8080 + } + initial_delay_seconds = 5 + period_seconds = 30 + timeout_seconds = 5 + failure_threshold = 3 + } } } } @@ -252,6 +272,26 @@ resource "kubernetes_deployment" "shlink-web" { port { container_port = 8080 } + liveness_probe { + http_get { + path = "/" + port = 8080 + } + initial_delay_seconds = 15 + period_seconds = 30 + timeout_seconds = 5 + failure_threshold = 5 + } + readiness_probe { + http_get { + path = "/" + port = 8080 + } + initial_delay_seconds = 5 + period_seconds = 30 + timeout_seconds = 5 + failure_threshold = 3 + } } volume { name = "config" diff --git a/terraform.tfvars b/terraform.tfvars index 98b61e01..a6719ec3 100644 Binary files a/terraform.tfvars and b/terraform.tfvars differ