[ci skip] iSCSI migration, healthcheck fixes, health probes, etcd backup

- Migrate MySQL/PostgreSQL storage from local-path to iscsi-truenas
- Add democratic-csi iSCSI driver module for TrueNAS
- Add open-iscsi to cloud-init VM template
- Fix Shlink health probe path (/api/v3 -> /rest/v3 for Shlink 5.0)
- Fix etcd backup: use etcd 3.5.21-0 (3.6.x is distroless, no /bin/sh)
- Fix cluster healthcheck CronJob: always exit 0 to prevent circular
  JobFailed alerts (reporting via Slack, not exit codes)
- Fix Uptime Kuma nested list handling in cluster-health.sh
- Add health probes to: audiobookshelf, immich ML, ntfy, headscale,
  uptime-kuma, vaultwarden, rybbit (clickhouse + server + client),
  shlink, shlink-web
- Add iSCSI storage documentation to CLAUDE.md
This commit is contained in:
Viktor Barzin 2026-03-06 19:54:21 +00:00
parent a8e07ad930
commit 1d80c49201
No known key found for this signature in database
GPG key ID: 0EB088298288D958
17 changed files with 378 additions and 13 deletions

View file

@ -61,6 +61,14 @@ For platform modules, use `source = "../../../../modules/kubernetes/nfs_volume"`
**StorageClass**: `nfs-truenas` (deployed via `stacks/platform/modules/nfs-csi/`).
**DO NOT use inline `nfs {}` blocks** — they mount with `hard,timeo=600` defaults which hang forever on stale mounts.
### iSCSI Storage for Databases
**StorageClass**: `iscsi-truenas` (deployed via `stacks/platform/modules/iscsi-csi/` using democratic-csi).
- Used by: PostgreSQL (CNPG), MySQL (InnoDB Cluster) — any pod, any node, same data
- Driver: `freenas-iscsi` (SSH-based, NOT `freenas-api-iscsi` which is TrueNAS SCALE only)
- ZFS datasets: `main/iscsi` (zvols), `main/iscsi-snaps` (snapshots)
- All K8s nodes have `open-iscsi` + `iscsid` running
- Redis stays on `local-path` (StatefulSet `volumeClaimTemplates` are immutable)
### Adding NFS Exports
1. **Create the directory on TrueNAS first**: `ssh root@10.0.10.15 "mkdir -p /mnt/main/<service> && chmod 777 /mnt/main/<service>"`
2. Edit `secrets/nfs_directories.txt` — add path, keep sorted

View file

@ -1522,12 +1522,9 @@ main() {
print_summary
send_slack
# Exit code: 2 for failures, 1 for warnings, 0 for clean
if [[ "$FAIL_COUNT" -gt 0 ]]; then
exit 2
elif [[ "$WARN_COUNT" -gt 0 ]]; then
exit 1
fi
# Always exit 0 — reporting is done via Slack notification.
# Non-zero exits mark the CronJob as Failed, which triggers Prometheus
# JobFailed alerts, creating a circular alert loop.
exit 0
}

View file

@ -38,6 +38,8 @@ packages:
# kubernetes
- kubeadm
- kubelet
# iSCSI client for CSI-backed database storage
- open-iscsi
%{endif}
apt:
@ -60,6 +62,7 @@ runcmd:
- containerd config default | sudo tee /etc/containerd/config.toml
- ${containerd_config_update_command}
- systemctl restart containerd
- systemctl enable --now iscsid
- ${k8s_join_command}
- systemctl enable kubelet
- systemctl start kubelet

View file

@ -86,6 +86,26 @@ resource "kubernetes_deployment" "audiobookshelf" {
port {
container_port = 80
}
liveness_probe {
http_get {
path = "/healthcheck"
port = 80
}
initial_delay_seconds = 15
period_seconds = 30
timeout_seconds = 5
failure_threshold = 5
}
readiness_probe {
http_get {
path = "/healthcheck"
port = 80
}
initial_delay_seconds = 5
period_seconds = 30
timeout_seconds = 5
failure_threshold = 3
}
volume_mount {
name = "audiobooks"
mount_path = "/audiobooks"

View file

@ -515,6 +515,26 @@ resource "kubernetes_deployment" "immich-machine-learning" {
"nvidia.com/gpu" = "1"
}
}
liveness_probe {
http_get {
path = "/ping"
port = 3003
}
initial_delay_seconds = 30
period_seconds = 30
timeout_seconds = 5
failure_threshold = 5
}
readiness_probe {
http_get {
path = "/ping"
port = 3003
}
initial_delay_seconds = 15
period_seconds = 30
timeout_seconds = 5
failure_threshold = 3
}
}
volume {
name = "cache"

View file

@ -62,6 +62,26 @@ resource "kubernetes_deployment" "ntfy" {
port {
container_port = 80
}
liveness_probe {
http_get {
path = "/v1/health"
port = 80
}
initial_delay_seconds = 15
period_seconds = 30
timeout_seconds = 5
failure_threshold = 5
}
readiness_probe {
http_get {
path = "/v1/health"
port = 80
}
initial_delay_seconds = 5
period_seconds = 30
timeout_seconds = 5
failure_threshold = 3
}
env {
name = "NTFY_BASE_URL"
value = "https://ntfy.viktorbarzin.me"

View file

@ -123,6 +123,16 @@ variable "webhook_handler_git_token" { type = string }
variable "technitium_username" { type = string }
variable "technitium_password" { type = string }
# --- iscsi-csi ---
variable "truenas_api_key" {
type = string
sensitive = true
}
variable "truenas_ssh_private_key" {
type = string
sensitive = true
}
# =============================================================================
# Module Calls
# =============================================================================
@ -318,6 +328,17 @@ module "nfs-csi" {
nfs_server = var.nfs_server
}
# -----------------------------------------------------------------------------
# iSCSI CSI democratic-csi for TrueNAS iSCSI (database storage)
# -----------------------------------------------------------------------------
module "iscsi-csi" {
source = "./modules/iscsi-csi"
tier = local.tiers.cluster
truenas_host = var.nfs_server # Same TrueNAS host
truenas_api_key = var.truenas_api_key
truenas_ssh_private_key = var.truenas_ssh_private_key
}
# -----------------------------------------------------------------------------
# CNPG CloudNativePG Operator + local-path-provisioner for database storage
# -----------------------------------------------------------------------------

View file

@ -128,7 +128,7 @@ resource "helm_release" "mysql_cluster" {
}
datadirVolumeClaimTemplate = {
storageClassName = "local-path"
storageClassName = "iscsi-truenas"
resources = {
requests = {
storage = "30Gi"
@ -799,7 +799,7 @@ resource "null_resource" "pg_cluster" {
instances = "2"
image = "ghcr.io/cloudnative-pg/postgis:16"
storage_size = "20Gi"
storage_class = "local-path"
storage_class = "iscsi-truenas"
memory_limit = "4Gi"
cpu_limit = "2"
}
@ -822,7 +822,7 @@ resource "null_resource" "pg_cluster" {
enableSuperuserAccess: true
storage:
size: 20Gi
storageClass: local-path
storageClass: iscsi-truenas
resources:
requests:
cpu: "250m"

View file

@ -92,6 +92,27 @@ resource "kubernetes_deployment" "headscale" {
container_port = 41641
}
liveness_probe {
http_get {
path = "/health"
port = 8080
}
initial_delay_seconds = 15
period_seconds = 30
timeout_seconds = 5
failure_threshold = 5
}
readiness_probe {
http_get {
path = "/health"
port = 8080
}
initial_delay_seconds = 5
period_seconds = 30
timeout_seconds = 5
failure_threshold = 3
}
volume_mount {
name = "config-volume"
mount_path = "/etc/headscale"

View file

@ -100,9 +100,9 @@ resource "kubernetes_cron_job_v1" "backup-etcd" {
host_network = true
container {
name = "backup-etcd"
image = "registry.k8s.io/etcd:3.6.5-0"
command = ["/bin/sh"]
args = ["-c", "etcdctl --endpoints=https://127.0.0.1:2379 --cacert=/etc/kubernetes/pki/etcd/ca.crt --cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt --key=/etc/kubernetes/pki/etcd/healthcheck-client.key snapshot save /backup/etcd-snapshot-$(date +%Y_%m_%d_%H:%M:%S_%Z).db"]
image = "registry.k8s.io/etcd:3.5.21-0"
command = ["etcdctl"]
args = ["--endpoints=https://127.0.0.1:2379", "--cacert=/etc/kubernetes/pki/etcd/ca.crt", "--cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt", "--key=/etc/kubernetes/pki/etcd/healthcheck-client.key", "snapshot", "save", "/backup/etcd-snapshot-latest.db"]
env {
name = "ETCDCTL_API"
value = "3"

View file

@ -0,0 +1,102 @@
# Dedicated namespace for the democratic-csi iSCSI controller and node pods.
# The helm_release below deploys into it (create_namespace = false).
resource "kubernetes_namespace" "iscsi_csi" {
metadata {
name = "iscsi-csi"
labels = {
# Deployment tier label supplied by the calling stack (stacks/platform).
tier = var.tier
}
}
}
# Deploys democratic-csi via Helm to provide the `iscsi-truenas` StorageClass,
# backed by zvols on the TrueNAS host. Used for database storage (MySQL/CNPG
# PostgreSQL are migrated onto this class elsewhere in this commit).
resource "helm_release" "democratic_csi" {
namespace = kubernetes_namespace.iscsi_csi.metadata[0].name
create_namespace = false
name = "democratic-csi-iscsi"
# atomic: roll back automatically if the release fails within the timeout.
atomic = true
timeout = 300
repository = "https://democratic-csi.github.io/charts/"
chart = "democratic-csi"
values = [yamlencode({
csiDriver = {
# CSI driver name registered with kubelet; must be unique per install.
name = "org.democratic-csi.iscsi"
}
storageClasses = [{
name = "iscsi-truenas"
defaultClass = false
# Retain: the backing zvol survives PVC deletion, so database data is
# not destroyed by accident; cleanup is manual on the TrueNAS side.
reclaimPolicy = "Retain"
volumeBindingMode = "Immediate"
allowVolumeExpansion = true
parameters = {
fsType = "ext4"
}
mountOptions = []
}]
controller = {
driver = {
resources = {
requests = { cpu = "25m", memory = "64Mi" }
limits = { cpu = "250m", memory = "256Mi" }
}
}
}
node = {
driver = {
resources = {
requests = { cpu = "25m", memory = "64Mi" }
limits = { cpu = "250m", memory = "256Mi" }
}
}
# NOTE(review): presumably the node plugin needs host PID namespace and
# the host kernel modules to drive the host's iscsid (open-iscsi is
# installed on nodes via cloud-init in this commit) — confirm against
# the democratic-csi chart docs.
hostPID = true
hostPath = "/lib/modules"
}
driver = {
config = {
# freenas-iscsi is the SSH-based driver; freenas-api-iscsi is TrueNAS
# SCALE only (see the CLAUDE.md note added in this commit).
driver = "freenas-iscsi"
instance_id = "truenas-iscsi"
httpConnection = {
# NOTE(review): plain HTTP on port 80 means the API key travels
# unencrypted on the LAN — consider HTTPS if TrueNAS has a cert.
protocol = "http"
host = var.truenas_host
port = 80
apiKey = var.truenas_api_key
}
sshConnection = {
host = var.truenas_host
port = 22
username = "root"
privateKey = var.truenas_ssh_private_key
}
zfs = {
# ZFS parent datasets: provisioned zvols under main/iscsi, detached
# snapshots under main/iscsi-snaps.
datasetParentName = "main/iscsi"
detachedSnapshotsDatasetParentName = "main/iscsi-snaps"
}
iscsi = {
# Standard iSCSI portal on the same TrueNAS host.
targetPortal = "${var.truenas_host}:3260"
namePrefix = "csi-"
nameSuffix = ""
targetGroups = [{
# assumes portal group 1 / initiator group 1 already exist on the
# TrueNAS side with no CHAP auth — TODO confirm against the TrueNAS
# sharing configuration.
targetGroupPortalGroup = 1
targetGroupInitiatorGroup = 1
targetGroupAuthType = "None"
}]
extentInsecureTpc = true
extentXenCompat = false
extentDisablePhysicalBlocksize = true
extentBlocksize = 512
extentRpm = "SSD"
# 0 disables the free-space warning threshold for extents.
extentAvailThreshold = 0
}
}
}
})]
}

View file

@ -0,0 +1,10 @@
# Input variables for the democratic-csi iSCSI driver module (TrueNAS backend).
# Descriptions added per Terraform convention so `terraform-docs` / plan output
# are self-explanatory; interface is unchanged.

variable "tier" {
  type        = string
  description = "Deployment tier label applied to the iscsi-csi namespace."
}

variable "truenas_host" {
  type        = string
  description = "Hostname or IP of the TrueNAS server (serves the HTTP API, SSH, and the iSCSI portal on :3260)."
}

variable "truenas_api_key" {
  type        = string
  description = "TrueNAS HTTP API key used by democratic-csi to provision zvols/extents."
  sensitive   = true
}

variable "truenas_ssh_private_key" {
  type        = string
  description = "SSH private key for root on the TrueNAS host (the freenas-iscsi driver is SSH-based)."
  sensitive   = true
}

View file

@ -79,6 +79,26 @@ resource "kubernetes_deployment" "uptime-kuma" {
port {
container_port = 3001
}
liveness_probe {
http_get {
path = "/"
port = 3001
}
initial_delay_seconds = 15
period_seconds = 30
timeout_seconds = 5
failure_threshold = 5
}
readiness_probe {
http_get {
path = "/"
port = 3001
}
initial_delay_seconds = 5
period_seconds = 30
timeout_seconds = 5
failure_threshold = 3
}
volume_mount {
name = "data"
mount_path = "/app/data"

View file

@ -109,6 +109,26 @@ resource "kubernetes_deployment" "vaultwarden" {
port {
container_port = 80
}
liveness_probe {
http_get {
path = "/alive"
port = 80
}
initial_delay_seconds = 15
period_seconds = 30
timeout_seconds = 5
failure_threshold = 5
}
readiness_probe {
http_get {
path = "/alive"
port = 80
}
initial_delay_seconds = 5
period_seconds = 30
timeout_seconds = 5
failure_threshold = 3
}
volume_mount {
name = "data"
mount_path = "/data"

View file

@ -49,6 +49,9 @@ resource "kubernetes_deployment" "clickhouse" {
}
spec {
replicas = 1
strategy {
type = "Recreate"
}
selector {
match_labels = {
app = "clickhouse"
@ -77,6 +80,26 @@ resource "kubernetes_deployment" "clickhouse" {
protocol = "TCP"
container_port = 8123
}
liveness_probe {
http_get {
path = "/ping"
port = 8123
}
initial_delay_seconds = 15
period_seconds = 30
timeout_seconds = 5
failure_threshold = 5
}
readiness_probe {
http_get {
path = "/ping"
port = 8123
}
initial_delay_seconds = 5
period_seconds = 30
timeout_seconds = 5
failure_threshold = 3
}
volume_mount {
name = "data"
mount_path = "/var/lib/clickhouse"
@ -133,7 +156,7 @@ resource "kubernetes_cron_job_v1" "clickhouse_truncate_logs" {
namespace = kubernetes_namespace.rybbit.metadata[0].name
}
spec {
schedule = "0 */6 * * *"
schedule = "0 */6 * * *"
successful_jobs_history_limit = 1
failed_jobs_history_limit = 1
job_template {
@ -252,6 +275,26 @@ resource "kubernetes_deployment" "rybbit" {
port {
container_port = 3001
}
liveness_probe {
http_get {
path = "/api/health"
port = 3001
}
initial_delay_seconds = 15
period_seconds = 30
timeout_seconds = 5
failure_threshold = 5
}
readiness_probe {
http_get {
path = "/api/health"
port = 3001
}
initial_delay_seconds = 5
period_seconds = 30
timeout_seconds = 5
failure_threshold = 3
}
resources {
requests = {
cpu = "25m"
@ -328,6 +371,26 @@ resource "kubernetes_deployment" "rybbit-client" {
protocol = "TCP"
container_port = 3002
}
liveness_probe {
http_get {
path = "/"
port = 3002
}
initial_delay_seconds = 15
period_seconds = 30
timeout_seconds = 5
failure_threshold = 5
}
readiness_probe {
http_get {
path = "/"
port = 3002
}
initial_delay_seconds = 5
period_seconds = 30
timeout_seconds = 5
failure_threshold = 3
}
resources {
requests = {
cpu = "10m"

View file

@ -147,6 +147,26 @@ resource "kubernetes_deployment" "shlink" {
port {
container_port = 8080
}
liveness_probe {
http_get {
path = "/rest/v3/health"
port = 8080
}
initial_delay_seconds = 15
period_seconds = 30
timeout_seconds = 5
failure_threshold = 5
}
readiness_probe {
http_get {
path = "/rest/v3/health"
port = 8080
}
initial_delay_seconds = 5
period_seconds = 30
timeout_seconds = 5
failure_threshold = 3
}
}
}
}
@ -252,6 +272,26 @@ resource "kubernetes_deployment" "shlink-web" {
port {
container_port = 8080
}
liveness_probe {
http_get {
path = "/"
port = 8080
}
initial_delay_seconds = 15
period_seconds = 30
timeout_seconds = 5
failure_threshold = 5
}
readiness_probe {
http_get {
path = "/"
port = 8080
}
initial_delay_seconds = 5
period_seconds = 30
timeout_seconds = 5
failure_threshold = 3
}
}
volume {
name = "config"

Binary file not shown.