[ci skip] iSCSI migration, healthcheck fixes, health probes, etcd backup

- Migrate MySQL/PostgreSQL storage from local-path to iscsi-truenas
- Add democratic-csi iSCSI driver module for TrueNAS
- Add open-iscsi to cloud-init VM template
- Fix Shlink health probe path (/api/v3 -> /rest/v3 for Shlink 5.0)
- Fix etcd backup: use etcd 3.5.21-0 (3.6.x is distroless, no /bin/sh)
- Fix cluster healthcheck CronJob: always exit 0 to prevent circular
  JobFailed alerts (reporting via Slack, not exit codes)
- Fix Uptime Kuma nested list handling in cluster-health.sh
- Add health probes to: audiobookshelf, immich ML, ntfy, headscale,
  uptime-kuma, vaultwarden, rybbit (clickhouse + server + client),
  shlink, shlink-web
- Add iSCSI storage documentation to CLAUDE.md
This commit is contained in:
Viktor Barzin 2026-03-06 19:54:21 +00:00
parent a8e07ad930
commit 1d80c49201
No known key found for this signature in database
GPG key ID: 0EB088298288D958
17 changed files with 378 additions and 13 deletions

View file

@ -61,6 +61,14 @@ For platform modules, use `source = "../../../../modules/kubernetes/nfs_volume"`
**StorageClass**: `nfs-truenas` (deployed via `stacks/platform/modules/nfs-csi/`).
**DO NOT use inline `nfs {}` blocks** — they mount with `hard,timeo=600` defaults which hang forever on stale mounts.
### iSCSI Storage for Databases
**StorageClass**: `iscsi-truenas` (deployed via `stacks/platform/modules/iscsi-csi/` using democratic-csi).
- Used by: PostgreSQL (CNPG), MySQL (InnoDB Cluster) — any pod, any node, same data
- Driver: `freenas-iscsi` (SSH-based, NOT `freenas-api-iscsi` which is TrueNAS SCALE only)
- ZFS datasets: `main/iscsi` (zvols), `main/iscsi-snaps` (snapshots)
- All K8s nodes have `open-iscsi` + `iscsid` running
- Redis stays on `local-path` (StatefulSet `volumeClaimTemplates` are immutable)
### Adding NFS Exports
1. **Create the directory on TrueNAS first**: `ssh root@10.0.10.15 "mkdir -p /mnt/main/<service> && chmod 777 /mnt/main/<service>"`
2. Edit `secrets/nfs_directories.txt` — add path, keep sorted

View file

@ -1522,12 +1522,9 @@ main() {
print_summary
send_slack
# Exit code: 2 for failures, 1 for warnings, 0 for clean
if [[ "$FAIL_COUNT" -gt 0 ]]; then
exit 2
elif [[ "$WARN_COUNT" -gt 0 ]]; then
exit 1
fi
# Always exit 0 — reporting is done via Slack notification.
# Non-zero exits mark the CronJob as Failed, which triggers Prometheus
# JobFailed alerts, creating a circular alert loop.
exit 0
}

View file

@ -38,6 +38,8 @@ packages:
# kubernetes
- kubeadm
- kubelet
# iSCSI client for CSI-backed database storage
- open-iscsi
%{endif}
apt:
@ -60,6 +62,7 @@ runcmd:
- containerd config default | sudo tee /etc/containerd/config.toml
- ${containerd_config_update_command}
- systemctl restart containerd
- systemctl enable --now iscsid
- ${k8s_join_command}
- systemctl enable kubelet
- systemctl start kubelet

View file

@ -86,6 +86,26 @@ resource "kubernetes_deployment" "audiobookshelf" {
port {
container_port = 80
}
liveness_probe {
http_get {
path = "/healthcheck"
port = 80
}
initial_delay_seconds = 15
period_seconds = 30
timeout_seconds = 5
failure_threshold = 5
}
readiness_probe {
http_get {
path = "/healthcheck"
port = 80
}
initial_delay_seconds = 5
period_seconds = 30
timeout_seconds = 5
failure_threshold = 3
}
volume_mount {
name = "audiobooks"
mount_path = "/audiobooks"

View file

@ -515,6 +515,26 @@ resource "kubernetes_deployment" "immich-machine-learning" {
"nvidia.com/gpu" = "1"
}
}
liveness_probe {
http_get {
path = "/ping"
port = 3003
}
initial_delay_seconds = 30
period_seconds = 30
timeout_seconds = 5
failure_threshold = 5
}
readiness_probe {
http_get {
path = "/ping"
port = 3003
}
initial_delay_seconds = 15
period_seconds = 30
timeout_seconds = 5
failure_threshold = 3
}
}
volume {
name = "cache"

View file

@ -62,6 +62,26 @@ resource "kubernetes_deployment" "ntfy" {
port {
container_port = 80
}
liveness_probe {
http_get {
path = "/v1/health"
port = 80
}
initial_delay_seconds = 15
period_seconds = 30
timeout_seconds = 5
failure_threshold = 5
}
readiness_probe {
http_get {
path = "/v1/health"
port = 80
}
initial_delay_seconds = 5
period_seconds = 30
timeout_seconds = 5
failure_threshold = 3
}
env {
name = "NTFY_BASE_URL"
value = "https://ntfy.viktorbarzin.me"

View file

@ -123,6 +123,16 @@ variable "webhook_handler_git_token" { type = string }
variable "technitium_username" { type = string }
variable "technitium_password" { type = string }
# --- iscsi-csi ---
variable "truenas_api_key" {
type = string
sensitive = true
}
variable "truenas_ssh_private_key" {
type = string
sensitive = true
}
# =============================================================================
# Module Calls
# =============================================================================
@ -318,6 +328,17 @@ module "nfs-csi" {
nfs_server = var.nfs_server
}
# -----------------------------------------------------------------------------
# iSCSI CSI democratic-csi for TrueNAS iSCSI (database storage)
# -----------------------------------------------------------------------------
module "iscsi-csi" {
source = "./modules/iscsi-csi"
tier = local.tiers.cluster
truenas_host = var.nfs_server # Same TrueNAS host
truenas_api_key = var.truenas_api_key
truenas_ssh_private_key = var.truenas_ssh_private_key
}
# -----------------------------------------------------------------------------
# CNPG CloudNativePG Operator + local-path-provisioner for database storage
# -----------------------------------------------------------------------------

View file

@ -128,7 +128,7 @@ resource "helm_release" "mysql_cluster" {
}
datadirVolumeClaimTemplate = {
storageClassName = "local-path"
storageClassName = "iscsi-truenas"
resources = {
requests = {
storage = "30Gi"
@ -799,7 +799,7 @@ resource "null_resource" "pg_cluster" {
instances = "2"
image = "ghcr.io/cloudnative-pg/postgis:16"
storage_size = "20Gi"
storage_class = "local-path"
storage_class = "iscsi-truenas"
memory_limit = "4Gi"
cpu_limit = "2"
}
@ -822,7 +822,7 @@ resource "null_resource" "pg_cluster" {
enableSuperuserAccess: true
storage:
size: 20Gi
storageClass: local-path
storageClass: iscsi-truenas
resources:
requests:
cpu: "250m"

View file

@ -92,6 +92,27 @@ resource "kubernetes_deployment" "headscale" {
container_port = 41641
}
liveness_probe {
http_get {
path = "/health"
port = 8080
}
initial_delay_seconds = 15
period_seconds = 30
timeout_seconds = 5
failure_threshold = 5
}
readiness_probe {
http_get {
path = "/health"
port = 8080
}
initial_delay_seconds = 5
period_seconds = 30
timeout_seconds = 5
failure_threshold = 3
}
volume_mount {
name = "config-volume"
mount_path = "/etc/headscale"

View file

@ -100,9 +100,9 @@ resource "kubernetes_cron_job_v1" "backup-etcd" {
host_network = true
container {
name = "backup-etcd"
image = "registry.k8s.io/etcd:3.6.5-0"
command = ["/bin/sh"]
args = ["-c", "etcdctl --endpoints=https://127.0.0.1:2379 --cacert=/etc/kubernetes/pki/etcd/ca.crt --cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt --key=/etc/kubernetes/pki/etcd/healthcheck-client.key snapshot save /backup/etcd-snapshot-$(date +%Y_%m_%d_%H:%M:%S_%Z).db"]
image = "registry.k8s.io/etcd:3.5.21-0"
command = ["etcdctl"]
args = ["--endpoints=https://127.0.0.1:2379", "--cacert=/etc/kubernetes/pki/etcd/ca.crt", "--cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt", "--key=/etc/kubernetes/pki/etcd/healthcheck-client.key", "snapshot", "save", "/backup/etcd-snapshot-latest.db"]
env {
name = "ETCDCTL_API"
value = "3"

View file

@ -0,0 +1,102 @@
# Dedicated namespace for the democratic-csi iSCSI controller and node pods.
# The helm_release below deploys into it (create_namespace = false).
resource "kubernetes_namespace" "iscsi_csi" {
metadata {
name = "iscsi-csi"
labels = {
# Deployment tier label supplied by the calling stack (stacks/platform).
tier = var.tier
}
}
}
# Deploys democratic-csi via Helm to provide the `iscsi-truenas` StorageClass,
# backed by zvols on the TrueNAS host. Used for database storage (MySQL/CNPG
# PostgreSQL are migrated onto this class elsewhere in this commit).
resource "helm_release" "democratic_csi" {
namespace = kubernetes_namespace.iscsi_csi.metadata[0].name
create_namespace = false
name = "democratic-csi-iscsi"
# atomic: roll back automatically if the release fails within the timeout.
atomic = true
timeout = 300
repository = "https://democratic-csi.github.io/charts/"
chart = "democratic-csi"
values = [yamlencode({
csiDriver = {
# CSI driver name registered with kubelet; must be unique per install.
name = "org.democratic-csi.iscsi"
}
storageClasses = [{
name = "iscsi-truenas"
defaultClass = false
# Retain: the backing zvol survives PVC deletion, so database data is
# not destroyed by accident; cleanup is manual on the TrueNAS side.
reclaimPolicy = "Retain"
volumeBindingMode = "Immediate"
allowVolumeExpansion = true
parameters = {
fsType = "ext4"
}
mountOptions = []
}]
controller = {
driver = {
resources = {
requests = { cpu = "25m", memory = "64Mi" }
limits = { cpu = "250m", memory = "256Mi" }
}
}
}
node = {
driver = {
resources = {
requests = { cpu = "25m", memory = "64Mi" }
limits = { cpu = "250m", memory = "256Mi" }
}
}
# NOTE(review): presumably the node plugin needs host PID namespace and
# the host kernel modules to drive the host's iscsid (open-iscsi is
# installed on nodes via cloud-init in this commit) — confirm against
# the democratic-csi chart docs.
hostPID = true
hostPath = "/lib/modules"
}
driver = {
config = {
# freenas-iscsi is the SSH-based driver; freenas-api-iscsi is TrueNAS
# SCALE only (see the CLAUDE.md note added in this commit).
driver = "freenas-iscsi"
instance_id = "truenas-iscsi"
httpConnection = {
# NOTE(review): plain HTTP on port 80 means the API key travels
# unencrypted on the LAN — consider HTTPS if TrueNAS has a cert.
protocol = "http"
host = var.truenas_host
port = 80
apiKey = var.truenas_api_key
}
sshConnection = {
host = var.truenas_host
port = 22
username = "root"
privateKey = var.truenas_ssh_private_key
}
zfs = {
# ZFS parent datasets: provisioned zvols under main/iscsi, detached
# snapshots under main/iscsi-snaps.
datasetParentName = "main/iscsi"
detachedSnapshotsDatasetParentName = "main/iscsi-snaps"
}
iscsi = {
# Standard iSCSI portal on the same TrueNAS host.
targetPortal = "${var.truenas_host}:3260"
namePrefix = "csi-"
nameSuffix = ""
targetGroups = [{
# assumes portal group 1 / initiator group 1 already exist on the
# TrueNAS side with no CHAP auth — TODO confirm against the TrueNAS
# sharing configuration.
targetGroupPortalGroup = 1
targetGroupInitiatorGroup = 1
targetGroupAuthType = "None"
}]
extentInsecureTpc = true
extentXenCompat = false
extentDisablePhysicalBlocksize = true
extentBlocksize = 512
extentRpm = "SSD"
# 0 disables the free-space warning threshold for extents.
extentAvailThreshold = 0
}
}
}
})]
}

View file

@ -0,0 +1,10 @@
# Input variables for the democratic-csi iSCSI driver module (TrueNAS backend).
# Descriptions added per Terraform convention so `terraform-docs` / plan output
# are self-explanatory; interface is unchanged.

variable "tier" {
  type        = string
  description = "Deployment tier label applied to the iscsi-csi namespace."
}

variable "truenas_host" {
  type        = string
  description = "Hostname or IP of the TrueNAS server (serves the HTTP API, SSH, and the iSCSI portal on :3260)."
}

variable "truenas_api_key" {
  type        = string
  description = "TrueNAS HTTP API key used by democratic-csi to provision zvols/extents."
  sensitive   = true
}

variable "truenas_ssh_private_key" {
  type        = string
  description = "SSH private key for root on the TrueNAS host (the freenas-iscsi driver is SSH-based)."
  sensitive   = true
}

View file

@ -79,6 +79,26 @@ resource "kubernetes_deployment" "uptime-kuma" {
port {
container_port = 3001
}
liveness_probe {
http_get {
path = "/"
port = 3001
}
initial_delay_seconds = 15
period_seconds = 30
timeout_seconds = 5
failure_threshold = 5
}
readiness_probe {
http_get {
path = "/"
port = 3001
}
initial_delay_seconds = 5
period_seconds = 30
timeout_seconds = 5
failure_threshold = 3
}
volume_mount {
name = "data"
mount_path = "/app/data"

View file

@ -109,6 +109,26 @@ resource "kubernetes_deployment" "vaultwarden" {
port {
container_port = 80
}
liveness_probe {
http_get {
path = "/alive"
port = 80
}
initial_delay_seconds = 15
period_seconds = 30
timeout_seconds = 5
failure_threshold = 5
}
readiness_probe {
http_get {
path = "/alive"
port = 80
}
initial_delay_seconds = 5
period_seconds = 30
timeout_seconds = 5
failure_threshold = 3
}
volume_mount {
name = "data"
mount_path = "/data"

View file

@ -49,6 +49,9 @@ resource "kubernetes_deployment" "clickhouse" {
}
spec {
replicas = 1
strategy {
type = "Recreate"
}
selector {
match_labels = {
app = "clickhouse"
@ -77,6 +80,26 @@ resource "kubernetes_deployment" "clickhouse" {
protocol = "TCP"
container_port = 8123
}
liveness_probe {
http_get {
path = "/ping"
port = 8123
}
initial_delay_seconds = 15
period_seconds = 30
timeout_seconds = 5
failure_threshold = 5
}
readiness_probe {
http_get {
path = "/ping"
port = 8123
}
initial_delay_seconds = 5
period_seconds = 30
timeout_seconds = 5
failure_threshold = 3
}
volume_mount {
name = "data"
mount_path = "/var/lib/clickhouse"
@ -133,7 +156,7 @@ resource "kubernetes_cron_job_v1" "clickhouse_truncate_logs" {
namespace = kubernetes_namespace.rybbit.metadata[0].name
}
spec {
schedule = "0 */6 * * *"
schedule = "0 */6 * * *"
successful_jobs_history_limit = 1
failed_jobs_history_limit = 1
job_template {
@ -252,6 +275,26 @@ resource "kubernetes_deployment" "rybbit" {
port {
container_port = 3001
}
liveness_probe {
http_get {
path = "/api/health"
port = 3001
}
initial_delay_seconds = 15
period_seconds = 30
timeout_seconds = 5
failure_threshold = 5
}
readiness_probe {
http_get {
path = "/api/health"
port = 3001
}
initial_delay_seconds = 5
period_seconds = 30
timeout_seconds = 5
failure_threshold = 3
}
resources {
requests = {
cpu = "25m"
@ -328,6 +371,26 @@ resource "kubernetes_deployment" "rybbit-client" {
protocol = "TCP"
container_port = 3002
}
liveness_probe {
http_get {
path = "/"
port = 3002
}
initial_delay_seconds = 15
period_seconds = 30
timeout_seconds = 5
failure_threshold = 5
}
readiness_probe {
http_get {
path = "/"
port = 3002
}
initial_delay_seconds = 5
period_seconds = 30
timeout_seconds = 5
failure_threshold = 3
}
resources {
requests = {
cpu = "10m"

View file

@ -147,6 +147,26 @@ resource "kubernetes_deployment" "shlink" {
port {
container_port = 8080
}
liveness_probe {
http_get {
path = "/rest/v3/health"
port = 8080
}
initial_delay_seconds = 15
period_seconds = 30
timeout_seconds = 5
failure_threshold = 5
}
readiness_probe {
http_get {
path = "/rest/v3/health"
port = 8080
}
initial_delay_seconds = 5
period_seconds = 30
timeout_seconds = 5
failure_threshold = 3
}
}
}
}
@ -252,6 +272,26 @@ resource "kubernetes_deployment" "shlink-web" {
port {
container_port = 8080
}
liveness_probe {
http_get {
path = "/"
port = 8080
}
initial_delay_seconds = 15
period_seconds = 30
timeout_seconds = 5
failure_threshold = 5
}
readiness_probe {
http_get {
path = "/"
port = 8080
}
initial_delay_seconds = 5
period_seconds = 30
timeout_seconds = 5
failure_threshold = 3
}
}
volume {
name = "config"

Binary file not shown.