[ci skip] iSCSI migration, healthcheck fixes, health probes, etcd backup
- Migrate MySQL/PostgreSQL storage from local-path to iscsi-truenas
- Add democratic-csi iSCSI driver module for TrueNAS
- Add open-iscsi to cloud-init VM template
- Fix Shlink health probe path (/api/v3 -> /rest/v3 for Shlink 5.0)
- Fix etcd backup: use etcd 3.5.21-0 (3.6.x is distroless, no /bin/sh)
- Fix cluster healthcheck CronJob: always exit 0 to prevent circular JobFailed alerts (reporting via Slack, not exit codes)
- Fix Uptime Kuma nested list handling in cluster-health.sh
- Add health probes to: audiobookshelf, immich ML, ntfy, headscale, uptime-kuma, vaultwarden, rybbit (clickhouse + server + client), shlink, shlink-web
- Add iSCSI storage documentation to CLAUDE.md
This commit is contained in:
parent
a8e07ad930
commit
1d80c49201
17 changed files with 378 additions and 13 deletions
|
|
@ -61,6 +61,14 @@ For platform modules, use `source = "../../../../modules/kubernetes/nfs_volume"`
|
|||
**StorageClass**: `nfs-truenas` (deployed via `stacks/platform/modules/nfs-csi/`).
|
||||
**DO NOT use inline `nfs {}` blocks** — they mount with `hard,timeo=600` defaults which hang forever on stale mounts.
|
||||
|
||||
### iSCSI Storage for Databases
|
||||
**StorageClass**: `iscsi-truenas` (deployed via `stacks/platform/modules/iscsi-csi/` using democratic-csi).
|
||||
- Used by: PostgreSQL (CNPG), MySQL (InnoDB Cluster) — any pod, any node, same data
|
||||
- Driver: `freenas-iscsi` (SSH-based, NOT `freenas-api-iscsi` which is TrueNAS SCALE only)
|
||||
- ZFS datasets: `main/iscsi` (zvols), `main/iscsi-snaps` (snapshots)
|
||||
- All K8s nodes have `open-iscsi` + `iscsid` running
|
||||
- Redis stays on `local-path` (StatefulSet `volumeClaimTemplates` are immutable)
|
||||
|
||||
### Adding NFS Exports
|
||||
1. **Create the directory on TrueNAS first**: `ssh root@10.0.10.15 "mkdir -p /mnt/main/<service> && chmod 777 /mnt/main/<service>"`
|
||||
2. Edit `secrets/nfs_directories.txt` — add path, keep sorted
|
||||
|
|
|
|||
|
|
@ -1522,12 +1522,9 @@ main() {
|
|||
print_summary
|
||||
send_slack
|
||||
|
||||
# Exit code: 2 for failures, 1 for warnings, 0 for clean
|
||||
if [[ "$FAIL_COUNT" -gt 0 ]]; then
|
||||
exit 2
|
||||
elif [[ "$WARN_COUNT" -gt 0 ]]; then
|
||||
exit 1
|
||||
fi
|
||||
# Always exit 0 — reporting is done via Slack notification.
|
||||
# Non-zero exits mark the CronJob as Failed, which triggers Prometheus
|
||||
# JobFailed alerts, creating a circular alert loop.
|
||||
exit 0
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -38,6 +38,8 @@ packages:
|
|||
# kubernetes
|
||||
- kubeadm
|
||||
- kubelet
|
||||
# iSCSI client for CSI-backed database storage
|
||||
- open-iscsi
|
||||
%{endif}
|
||||
|
||||
apt:
|
||||
|
|
@ -60,6 +62,7 @@ runcmd:
|
|||
- containerd config default | sudo tee /etc/containerd/config.toml
|
||||
- ${containerd_config_update_command}
|
||||
- systemctl restart containerd
|
||||
- systemctl enable --now iscsid
|
||||
- ${k8s_join_command}
|
||||
- systemctl enable kubelet
|
||||
- systemctl start kubelet
|
||||
|
|
|
|||
|
|
@ -86,6 +86,26 @@ resource "kubernetes_deployment" "audiobookshelf" {
|
|||
port {
|
||||
container_port = 80
|
||||
}
|
||||
liveness_probe {
|
||||
http_get {
|
||||
path = "/healthcheck"
|
||||
port = 80
|
||||
}
|
||||
initial_delay_seconds = 15
|
||||
period_seconds = 30
|
||||
timeout_seconds = 5
|
||||
failure_threshold = 5
|
||||
}
|
||||
readiness_probe {
|
||||
http_get {
|
||||
path = "/healthcheck"
|
||||
port = 80
|
||||
}
|
||||
initial_delay_seconds = 5
|
||||
period_seconds = 30
|
||||
timeout_seconds = 5
|
||||
failure_threshold = 3
|
||||
}
|
||||
volume_mount {
|
||||
name = "audiobooks"
|
||||
mount_path = "/audiobooks"
|
||||
|
|
|
|||
|
|
@ -515,6 +515,26 @@ resource "kubernetes_deployment" "immich-machine-learning" {
|
|||
"nvidia.com/gpu" = "1"
|
||||
}
|
||||
}
|
||||
liveness_probe {
|
||||
http_get {
|
||||
path = "/ping"
|
||||
port = 3003
|
||||
}
|
||||
initial_delay_seconds = 30
|
||||
period_seconds = 30
|
||||
timeout_seconds = 5
|
||||
failure_threshold = 5
|
||||
}
|
||||
readiness_probe {
|
||||
http_get {
|
||||
path = "/ping"
|
||||
port = 3003
|
||||
}
|
||||
initial_delay_seconds = 15
|
||||
period_seconds = 30
|
||||
timeout_seconds = 5
|
||||
failure_threshold = 3
|
||||
}
|
||||
}
|
||||
volume {
|
||||
name = "cache"
|
||||
|
|
|
|||
|
|
@ -62,6 +62,26 @@ resource "kubernetes_deployment" "ntfy" {
|
|||
port {
|
||||
container_port = 80
|
||||
}
|
||||
liveness_probe {
|
||||
http_get {
|
||||
path = "/v1/health"
|
||||
port = 80
|
||||
}
|
||||
initial_delay_seconds = 15
|
||||
period_seconds = 30
|
||||
timeout_seconds = 5
|
||||
failure_threshold = 5
|
||||
}
|
||||
readiness_probe {
|
||||
http_get {
|
||||
path = "/v1/health"
|
||||
port = 80
|
||||
}
|
||||
initial_delay_seconds = 5
|
||||
period_seconds = 30
|
||||
timeout_seconds = 5
|
||||
failure_threshold = 3
|
||||
}
|
||||
env {
|
||||
name = "NTFY_BASE_URL"
|
||||
value = "https://ntfy.viktorbarzin.me"
|
||||
|
|
|
|||
|
|
@ -123,6 +123,16 @@ variable "webhook_handler_git_token" { type = string }
|
|||
variable "technitium_username" { type = string }
|
||||
variable "technitium_password" { type = string }
|
||||
|
||||
# --- iscsi-csi ---
|
||||
variable "truenas_api_key" {
|
||||
type = string
|
||||
sensitive = true
|
||||
}
|
||||
variable "truenas_ssh_private_key" {
|
||||
type = string
|
||||
sensitive = true
|
||||
}
|
||||
|
||||
# =============================================================================
|
||||
# Module Calls
|
||||
# =============================================================================
|
||||
|
|
@ -318,6 +328,17 @@ module "nfs-csi" {
|
|||
nfs_server = var.nfs_server
|
||||
}
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# iSCSI CSI — democratic-csi for TrueNAS iSCSI (database storage)
|
||||
# -----------------------------------------------------------------------------
|
||||
module "iscsi-csi" {
|
||||
source = "./modules/iscsi-csi"
|
||||
tier = local.tiers.cluster
|
||||
truenas_host = var.nfs_server # Same TrueNAS host
|
||||
truenas_api_key = var.truenas_api_key
|
||||
truenas_ssh_private_key = var.truenas_ssh_private_key
|
||||
}
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# CNPG — CloudNativePG Operator + local-path-provisioner for database storage
|
||||
# -----------------------------------------------------------------------------
|
||||
|
|
|
|||
|
|
@ -128,7 +128,7 @@ resource "helm_release" "mysql_cluster" {
|
|||
}
|
||||
|
||||
datadirVolumeClaimTemplate = {
|
||||
storageClassName = "local-path"
|
||||
storageClassName = "iscsi-truenas"
|
||||
resources = {
|
||||
requests = {
|
||||
storage = "30Gi"
|
||||
|
|
@ -799,7 +799,7 @@ resource "null_resource" "pg_cluster" {
|
|||
instances = "2"
|
||||
image = "ghcr.io/cloudnative-pg/postgis:16"
|
||||
storage_size = "20Gi"
|
||||
storage_class = "local-path"
|
||||
storage_class = "iscsi-truenas"
|
||||
memory_limit = "4Gi"
|
||||
cpu_limit = "2"
|
||||
}
|
||||
|
|
@ -822,7 +822,7 @@ resource "null_resource" "pg_cluster" {
|
|||
enableSuperuserAccess: true
|
||||
storage:
|
||||
size: 20Gi
|
||||
storageClass: local-path
|
||||
storageClass: iscsi-truenas
|
||||
resources:
|
||||
requests:
|
||||
cpu: "250m"
|
||||
|
|
|
|||
|
|
@ -92,6 +92,27 @@ resource "kubernetes_deployment" "headscale" {
|
|||
container_port = 41641
|
||||
}
|
||||
|
||||
liveness_probe {
|
||||
http_get {
|
||||
path = "/health"
|
||||
port = 8080
|
||||
}
|
||||
initial_delay_seconds = 15
|
||||
period_seconds = 30
|
||||
timeout_seconds = 5
|
||||
failure_threshold = 5
|
||||
}
|
||||
readiness_probe {
|
||||
http_get {
|
||||
path = "/health"
|
||||
port = 8080
|
||||
}
|
||||
initial_delay_seconds = 5
|
||||
period_seconds = 30
|
||||
timeout_seconds = 5
|
||||
failure_threshold = 3
|
||||
}
|
||||
|
||||
volume_mount {
|
||||
name = "config-volume"
|
||||
mount_path = "/etc/headscale"
|
||||
|
|
|
|||
|
|
@ -100,9 +100,9 @@ resource "kubernetes_cron_job_v1" "backup-etcd" {
|
|||
host_network = true
|
||||
container {
|
||||
name = "backup-etcd"
|
||||
image = "registry.k8s.io/etcd:3.6.5-0"
|
||||
command = ["/bin/sh"]
|
||||
args = ["-c", "etcdctl --endpoints=https://127.0.0.1:2379 --cacert=/etc/kubernetes/pki/etcd/ca.crt --cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt --key=/etc/kubernetes/pki/etcd/healthcheck-client.key snapshot save /backup/etcd-snapshot-$(date +%Y_%m_%d_%H:%M:%S_%Z).db"]
|
||||
image = "registry.k8s.io/etcd:3.5.21-0"
|
||||
command = ["etcdctl"]
|
||||
args = ["--endpoints=https://127.0.0.1:2379", "--cacert=/etc/kubernetes/pki/etcd/ca.crt", "--cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt", "--key=/etc/kubernetes/pki/etcd/healthcheck-client.key", "snapshot", "save", "/backup/etcd-snapshot-latest.db"]
|
||||
env {
|
||||
name = "ETCDCTL_API"
|
||||
value = "3"
|
||||
|
|
|
|||
102
stacks/platform/modules/iscsi-csi/main.tf
Normal file
102
stacks/platform/modules/iscsi-csi/main.tf
Normal file
|
|
@ -0,0 +1,102 @@
|
|||
resource "kubernetes_namespace" "iscsi_csi" {
|
||||
metadata {
|
||||
name = "iscsi-csi"
|
||||
labels = {
|
||||
tier = var.tier
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
resource "helm_release" "democratic_csi" {
|
||||
namespace = kubernetes_namespace.iscsi_csi.metadata[0].name
|
||||
create_namespace = false
|
||||
name = "democratic-csi-iscsi"
|
||||
atomic = true
|
||||
timeout = 300
|
||||
|
||||
repository = "https://democratic-csi.github.io/charts/"
|
||||
chart = "democratic-csi"
|
||||
|
||||
values = [yamlencode({
|
||||
csiDriver = {
|
||||
name = "org.democratic-csi.iscsi"
|
||||
}
|
||||
|
||||
storageClasses = [{
|
||||
name = "iscsi-truenas"
|
||||
defaultClass = false
|
||||
reclaimPolicy = "Retain"
|
||||
volumeBindingMode = "Immediate"
|
||||
allowVolumeExpansion = true
|
||||
parameters = {
|
||||
fsType = "ext4"
|
||||
}
|
||||
mountOptions = []
|
||||
}]
|
||||
|
||||
controller = {
|
||||
driver = {
|
||||
resources = {
|
||||
requests = { cpu = "25m", memory = "64Mi" }
|
||||
limits = { cpu = "250m", memory = "256Mi" }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
node = {
|
||||
driver = {
|
||||
resources = {
|
||||
requests = { cpu = "25m", memory = "64Mi" }
|
||||
limits = { cpu = "250m", memory = "256Mi" }
|
||||
}
|
||||
}
|
||||
|
||||
hostPID = true
|
||||
hostPath = "/lib/modules"
|
||||
}
|
||||
|
||||
driver = {
|
||||
config = {
|
||||
driver = "freenas-iscsi"
|
||||
|
||||
instance_id = "truenas-iscsi"
|
||||
|
||||
httpConnection = {
|
||||
protocol = "http"
|
||||
host = var.truenas_host
|
||||
port = 80
|
||||
apiKey = var.truenas_api_key
|
||||
}
|
||||
|
||||
sshConnection = {
|
||||
host = var.truenas_host
|
||||
port = 22
|
||||
username = "root"
|
||||
privateKey = var.truenas_ssh_private_key
|
||||
}
|
||||
|
||||
zfs = {
|
||||
datasetParentName = "main/iscsi"
|
||||
detachedSnapshotsDatasetParentName = "main/iscsi-snaps"
|
||||
}
|
||||
|
||||
iscsi = {
|
||||
targetPortal = "${var.truenas_host}:3260"
|
||||
namePrefix = "csi-"
|
||||
nameSuffix = ""
|
||||
targetGroups = [{
|
||||
targetGroupPortalGroup = 1
|
||||
targetGroupInitiatorGroup = 1
|
||||
targetGroupAuthType = "None"
|
||||
}]
|
||||
extentInsecureTpc = true
|
||||
extentXenCompat = false
|
||||
extentDisablePhysicalBlocksize = true
|
||||
extentBlocksize = 512
|
||||
extentRpm = "SSD"
|
||||
extentAvailThreshold = 0
|
||||
}
|
||||
}
|
||||
}
|
||||
})]
|
||||
}
|
||||
10
stacks/platform/modules/iscsi-csi/variables.tf
Normal file
10
stacks/platform/modules/iscsi-csi/variables.tf
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
variable "tier" { type = string }
|
||||
variable "truenas_host" { type = string }
|
||||
variable "truenas_api_key" {
|
||||
type = string
|
||||
sensitive = true
|
||||
}
|
||||
variable "truenas_ssh_private_key" {
|
||||
type = string
|
||||
sensitive = true
|
||||
}
|
||||
|
|
@ -79,6 +79,26 @@ resource "kubernetes_deployment" "uptime-kuma" {
|
|||
port {
|
||||
container_port = 3001
|
||||
}
|
||||
liveness_probe {
|
||||
http_get {
|
||||
path = "/"
|
||||
port = 3001
|
||||
}
|
||||
initial_delay_seconds = 15
|
||||
period_seconds = 30
|
||||
timeout_seconds = 5
|
||||
failure_threshold = 5
|
||||
}
|
||||
readiness_probe {
|
||||
http_get {
|
||||
path = "/"
|
||||
port = 3001
|
||||
}
|
||||
initial_delay_seconds = 5
|
||||
period_seconds = 30
|
||||
timeout_seconds = 5
|
||||
failure_threshold = 3
|
||||
}
|
||||
volume_mount {
|
||||
name = "data"
|
||||
mount_path = "/app/data"
|
||||
|
|
|
|||
|
|
@ -109,6 +109,26 @@ resource "kubernetes_deployment" "vaultwarden" {
|
|||
port {
|
||||
container_port = 80
|
||||
}
|
||||
liveness_probe {
|
||||
http_get {
|
||||
path = "/alive"
|
||||
port = 80
|
||||
}
|
||||
initial_delay_seconds = 15
|
||||
period_seconds = 30
|
||||
timeout_seconds = 5
|
||||
failure_threshold = 5
|
||||
}
|
||||
readiness_probe {
|
||||
http_get {
|
||||
path = "/alive"
|
||||
port = 80
|
||||
}
|
||||
initial_delay_seconds = 5
|
||||
period_seconds = 30
|
||||
timeout_seconds = 5
|
||||
failure_threshold = 3
|
||||
}
|
||||
volume_mount {
|
||||
name = "data"
|
||||
mount_path = "/data"
|
||||
|
|
|
|||
|
|
@ -49,6 +49,9 @@ resource "kubernetes_deployment" "clickhouse" {
|
|||
}
|
||||
spec {
|
||||
replicas = 1
|
||||
strategy {
|
||||
type = "Recreate"
|
||||
}
|
||||
selector {
|
||||
match_labels = {
|
||||
app = "clickhouse"
|
||||
|
|
@ -77,6 +80,26 @@ resource "kubernetes_deployment" "clickhouse" {
|
|||
protocol = "TCP"
|
||||
container_port = 8123
|
||||
}
|
||||
liveness_probe {
|
||||
http_get {
|
||||
path = "/ping"
|
||||
port = 8123
|
||||
}
|
||||
initial_delay_seconds = 15
|
||||
period_seconds = 30
|
||||
timeout_seconds = 5
|
||||
failure_threshold = 5
|
||||
}
|
||||
readiness_probe {
|
||||
http_get {
|
||||
path = "/ping"
|
||||
port = 8123
|
||||
}
|
||||
initial_delay_seconds = 5
|
||||
period_seconds = 30
|
||||
timeout_seconds = 5
|
||||
failure_threshold = 3
|
||||
}
|
||||
volume_mount {
|
||||
name = "data"
|
||||
mount_path = "/var/lib/clickhouse"
|
||||
|
|
@ -133,7 +156,7 @@ resource "kubernetes_cron_job_v1" "clickhouse_truncate_logs" {
|
|||
namespace = kubernetes_namespace.rybbit.metadata[0].name
|
||||
}
|
||||
spec {
|
||||
schedule = "0 */6 * * *"
|
||||
schedule = "0 */6 * * *"
|
||||
successful_jobs_history_limit = 1
|
||||
failed_jobs_history_limit = 1
|
||||
job_template {
|
||||
|
|
@ -252,6 +275,26 @@ resource "kubernetes_deployment" "rybbit" {
|
|||
port {
|
||||
container_port = 3001
|
||||
}
|
||||
liveness_probe {
|
||||
http_get {
|
||||
path = "/api/health"
|
||||
port = 3001
|
||||
}
|
||||
initial_delay_seconds = 15
|
||||
period_seconds = 30
|
||||
timeout_seconds = 5
|
||||
failure_threshold = 5
|
||||
}
|
||||
readiness_probe {
|
||||
http_get {
|
||||
path = "/api/health"
|
||||
port = 3001
|
||||
}
|
||||
initial_delay_seconds = 5
|
||||
period_seconds = 30
|
||||
timeout_seconds = 5
|
||||
failure_threshold = 3
|
||||
}
|
||||
resources {
|
||||
requests = {
|
||||
cpu = "25m"
|
||||
|
|
@ -328,6 +371,26 @@ resource "kubernetes_deployment" "rybbit-client" {
|
|||
protocol = "TCP"
|
||||
container_port = 3002
|
||||
}
|
||||
liveness_probe {
|
||||
http_get {
|
||||
path = "/"
|
||||
port = 3002
|
||||
}
|
||||
initial_delay_seconds = 15
|
||||
period_seconds = 30
|
||||
timeout_seconds = 5
|
||||
failure_threshold = 5
|
||||
}
|
||||
readiness_probe {
|
||||
http_get {
|
||||
path = "/"
|
||||
port = 3002
|
||||
}
|
||||
initial_delay_seconds = 5
|
||||
period_seconds = 30
|
||||
timeout_seconds = 5
|
||||
failure_threshold = 3
|
||||
}
|
||||
resources {
|
||||
requests = {
|
||||
cpu = "10m"
|
||||
|
|
|
|||
|
|
@ -147,6 +147,26 @@ resource "kubernetes_deployment" "shlink" {
|
|||
port {
|
||||
container_port = 8080
|
||||
}
|
||||
liveness_probe {
|
||||
http_get {
|
||||
path = "/rest/v3/health"
|
||||
port = 8080
|
||||
}
|
||||
initial_delay_seconds = 15
|
||||
period_seconds = 30
|
||||
timeout_seconds = 5
|
||||
failure_threshold = 5
|
||||
}
|
||||
readiness_probe {
|
||||
http_get {
|
||||
path = "/rest/v3/health"
|
||||
port = 8080
|
||||
}
|
||||
initial_delay_seconds = 5
|
||||
period_seconds = 30
|
||||
timeout_seconds = 5
|
||||
failure_threshold = 3
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -252,6 +272,26 @@ resource "kubernetes_deployment" "shlink-web" {
|
|||
port {
|
||||
container_port = 8080
|
||||
}
|
||||
liveness_probe {
|
||||
http_get {
|
||||
path = "/"
|
||||
port = 8080
|
||||
}
|
||||
initial_delay_seconds = 15
|
||||
period_seconds = 30
|
||||
timeout_seconds = 5
|
||||
failure_threshold = 5
|
||||
}
|
||||
readiness_probe {
|
||||
http_get {
|
||||
path = "/"
|
||||
port = 8080
|
||||
}
|
||||
initial_delay_seconds = 5
|
||||
period_seconds = 30
|
||||
timeout_seconds = 5
|
||||
failure_threshold = 3
|
||||
}
|
||||
}
|
||||
volume {
|
||||
name = "config"
|
||||
|
|
|
|||
BIN
terraform.tfvars
BIN
terraform.tfvars
Binary file not shown.
Loading…
Add table
Add a link
Reference in a new issue