Add HAProxy for Redis HA master-only routing

The Redis K8s Service was load-balancing across both master and replica
nodes, causing READONLY errors when clients hit the replica. This broke
Nextcloud (DAV 500s, liveness probe timeouts, crash loops) and
potentially other services.

Replace the direct Service with HAProxy (2 replicas) that health-checks
each Redis node via INFO replication and only routes to role:master.
On Sentinel failover, HAProxy detects the new master within ~9 seconds.
This commit is contained in:
Viktor Barzin 2026-03-13 20:54:32 +00:00
parent d05ff57b11
commit b323e567e4

View file

@@ -18,9 +18,9 @@ module "tls_secret" {
}
# Redis with Sentinel HA via Bitnami Helm chart
# Architecture: 1 master + 2 replicas + 3 sentinels
# Architecture: 1 master + 1 replica + 2 sentinels (one per node)
# Sentinel automatically promotes a replica if master fails
# The K8s Service always points at the current master
# HAProxy sits in front and routes only to the current master (see below)
resource "helm_release" "redis" {
namespace = kubernetes_namespace.redis.metadata[0].name
create_namespace = false
@@ -109,9 +109,127 @@ resource "helm_release" "redis" {
})]
}
# Override the Helm-managed service to pin to master pod
# Sentinel clients can use the headless service for discovery,
# but simple redis:// clients (paperless-ngx, etc.) need to hit the master
# HAProxy-based master-only proxy for simple redis:// clients.
# Health-checks each Redis node via INFO replication and only routes
# to the current master. On Sentinel failover, HAProxy detects the
# new master within seconds via its health check interval.
# Previously this was a K8s Service that routed to all nodes, causing
# READONLY errors when clients hit a replica.
# HAProxy config: master-only routing for simple redis:// clients.
# Each backend server is health-checked with INFO replication and is
# only eligible when it reports role:master, so READONLY errors from
# replicas cannot occur. fall 3 x inter 3s => failover detected in ~9s.
resource "kubernetes_config_map" "haproxy" {
  metadata {
    name      = "redis-haproxy"
    namespace = kubernetes_namespace.redis.metadata[0].name
  }

  data = {
    "haproxy.cfg" = <<-EOT
      global
        maxconn 256

      # Runtime DNS re-resolution via the cluster's /etc/resolv.conf
      # (CoreDNS). Without a resolvers section HAProxy resolves the
      # headless-service pod hostnames exactly once at startup, so a
      # rescheduled Redis pod (new IP) is never picked up and its
      # server stays down forever.
      resolvers k8s
        parse-resolv-conf
        hold valid 10s

      defaults
        mode tcp
        timeout connect 5s
        timeout client 30s
        timeout server 30s
        timeout check 3s
        # init-addr ... none: do not fail HAProxy startup when a node's
        # DNS record is temporarily missing (e.g. its pod is down during
        # the very failover we exist to handle); resolve it later.
        default-server init-addr last,libc,none resolvers k8s

      frontend redis_front
        bind *:6379
        default_backend redis_master

      frontend sentinel_front
        bind *:26379
        default_backend redis_sentinel

      backend redis_master
        option tcp-check
        tcp-check connect
        tcp-check send "PING\r\n"
        tcp-check expect string +PONG
        tcp-check send "INFO replication\r\n"
        tcp-check expect string role:master
        tcp-check send "QUIT\r\n"
        tcp-check expect string +OK
        server redis-node-0 redis-node-0.redis-headless.redis.svc.cluster.local:6379 check inter 3s fall 3 rise 2
        server redis-node-1 redis-node-1.redis-headless.redis.svc.cluster.local:6379 check inter 3s fall 3 rise 2

      backend redis_sentinel
        balance roundrobin
        server redis-node-0 redis-node-0.redis-headless.redis.svc.cluster.local:26379 check inter 5s
        server redis-node-1 redis-node-1.redis-headless.redis.svc.cluster.local:26379 check inter 5s
    EOT
  }
}
# Two-replica HAProxy deployment fronting Redis with master-only routing.
resource "kubernetes_deployment" "haproxy" {
  metadata {
    name      = "redis-haproxy"
    namespace = kubernetes_namespace.redis.metadata[0].name
    labels = {
      app = "redis-haproxy"
    }
  }

  spec {
    replicas = 2

    selector {
      match_labels = {
        app = "redis-haproxy"
      }
    }

    template {
      metadata {
        labels = {
          app = "redis-haproxy"
        }
        annotations = {
          # Roll the pods whenever haproxy.cfg changes; without this a
          # ConfigMap update would leave the running proxies on stale
          # config until they happen to restart.
          "checksum/config" = sha1(kubernetes_config_map.haproxy.data["haproxy.cfg"])
        }
      }

      spec {
        # Prefer spreading the two proxies across nodes so a single
        # node failure cannot take out both replicas (soft constraint:
        # still schedulable on a one-node cluster).
        affinity {
          pod_anti_affinity {
            preferred_during_scheduling_ignored_during_execution {
              weight = 100
              pod_affinity_term {
                label_selector {
                  match_labels = {
                    app = "redis-haproxy"
                  }
                }
                topology_key = "kubernetes.io/hostname"
              }
            }
          }
        }

        container {
          name  = "haproxy"
          image = "docker.io/library/haproxy:3.1-alpine"

          port {
            container_port = 6379
            name           = "redis"
          }
          port {
            container_port = 26379
            name           = "sentinel"
          }

          volume_mount {
            name       = "config"
            mount_path = "/usr/local/etc/haproxy"
            read_only  = true
          }

          resources {
            requests = {
              cpu    = "10m"
              memory = "16Mi"
            }
            limits = {
              cpu    = "100m"
              memory = "32Mi"
            }
          }

          # Restart the container if it stops accepting on the Redis port.
          liveness_probe {
            tcp_socket {
              port = 6379
            }
            initial_delay_seconds = 5
            period_seconds        = 10
          }

          # Keep the pod out of Service endpoints until HAProxy is
          # actually listening, so clients never hit a starting pod.
          readiness_probe {
            tcp_socket {
              port = 6379
            }
            initial_delay_seconds = 2
            period_seconds        = 5
          }
        }

        volume {
          name = "config"
          config_map {
            name = kubernetes_config_map.haproxy.metadata[0].name
          }
        }
      }
    }
  }

  depends_on = [helm_release.redis]
}
resource "kubernetes_service" "redis" {
metadata {
name = "redis"
@@ -119,10 +237,7 @@ resource "kubernetes_service" "redis" {
}
spec {
selector = {
"app.kubernetes.io/component" = "node"
"app.kubernetes.io/instance" = "redis"
"app.kubernetes.io/name" = "redis"
"statefulset.kubernetes.io/pod-name" = "redis-node-0"
app = "redis-haproxy"
}
port {
name = "tcp-redis"
@@ -136,7 +251,7 @@ resource "kubernetes_service" "redis" {
}
}
depends_on = [helm_release.redis]
depends_on = [kubernetes_deployment.haproxy]
}
module "nfs_backup" {