[redis] Restore dynamic DNS in HAProxy to fix stale-IP outage

HAProxy resolved `redis-node-{0,1}.redis-headless.redis.svc.cluster.local`
once at pod startup and cached the IPs forever. When redis-node pods
cycled and got new pod IPs, HAProxy kept connecting to the dead ones:
backends flapped between "Connection refused" and "Layer4 timeout", and
Immich's ioredis client hit EPIPE until its retry limit was exhausted
and the pod entered CrashLoopBackOff. This caused an Immich outage on
2026-04-19.

Fix:
- Add a `resolvers kubernetes` stanza pointing at kube-dns, with a 10s
  hold on every response category so pod IP changes are picked up
  within a single DNS TTL window.
- Add `resolvers kubernetes init-addr last,libc,none` to every backend
  server line, so HAProxy resolves at startup (server-state file, then
  libc, then start with no address) and keeps refreshing via the
  dynamic resolver at runtime (see the verification sketch below).
- Add a `checksum/config` pod annotation to the HAProxy Deployment so a
  haproxy.cfg change actually rolls the pods (including this one).
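
Verification sketch (assumptions: socat is present in the haproxy
image, a stats socket is enabled at /var/run/haproxy.sock, and the
Deployment is reachable as deploy/redis-haproxy in the redis
namespace; adjust to the actual deployment):

    # Dump the currently resolved backend IPs via the runtime API,
    # cycle a redis pod, then dump again: the new pod IP should show
    # up within the 10s hold window, with no haproxy restart.
    kubectl -n redis exec deploy/redis-haproxy -- \
      sh -c 'echo "show servers state" | socat stdio /var/run/haproxy.sock'
    kubectl -n redis delete pod redis-node-0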

Closes: code-fd6
Viktor Barzin 2026-04-19 12:39:09 +00:00
parent 43fe11fffc
commit d5a47e35fc

@@ -148,6 +148,24 @@ resource "kubernetes_config_map" "haproxy" {
     timeout server 30s
     timeout check 3s
+
+# Dynamic DNS resolution via cluster CoreDNS. Without this, haproxy
+# resolves server hostnames once at startup and caches forever, so
+# when redis-node-X pods restart and get new IPs, haproxy keeps
+# connecting to the old (dead) IPs and returns "Connection refused"
+# until haproxy itself is restarted. This caused an immich outage
+# on 2026-04-19 after a redis pod cycle.
+resolvers kubernetes
+    nameserver coredns kube-dns.kube-system.svc.cluster.local:53
+    resolve_retries 3
+    timeout resolve 1s
+    timeout retry 1s
+    hold other 10s
+    hold refused 10s
+    hold nx 10s
+    hold timeout 10s
+    hold valid 10s
+    hold obsolete 10s
 
 frontend redis_front
     bind *:6379
     default_backend redis_master
@@ -167,13 +185,13 @@ resource "kubernetes_config_map" "haproxy" {
     tcp-check expect rstring role:master
     tcp-check send "QUIT\r\n"
     tcp-check expect string +OK
-    server redis-node-0 redis-node-0.redis-headless.redis.svc.cluster.local:6379 check inter 1s fall 2 rise 2
-    server redis-node-1 redis-node-1.redis-headless.redis.svc.cluster.local:6379 check inter 1s fall 2 rise 2
+    server redis-node-0 redis-node-0.redis-headless.redis.svc.cluster.local:6379 check inter 1s fall 2 rise 2 resolvers kubernetes init-addr last,libc,none
+    server redis-node-1 redis-node-1.redis-headless.redis.svc.cluster.local:6379 check inter 1s fall 2 rise 2 resolvers kubernetes init-addr last,libc,none
 
 backend redis_sentinel
     balance roundrobin
-    server redis-node-0 redis-node-0.redis-headless.redis.svc.cluster.local:26379 check inter 5s
-    server redis-node-1 redis-node-1.redis-headless.redis.svc.cluster.local:26379 check inter 5s
+    server redis-node-0 redis-node-0.redis-headless.redis.svc.cluster.local:26379 check inter 5s resolvers kubernetes init-addr last,libc,none
+    server redis-node-1 redis-node-1.redis-headless.redis.svc.cluster.local:26379 check inter 5s resolvers kubernetes init-addr last,libc,none
     EOT
   }
 }
@@ -198,6 +216,11 @@ resource "kubernetes_deployment" "haproxy" {
       labels = {
         app = "redis-haproxy"
       }
+      annotations = {
+        # Roll the deployment whenever haproxy.cfg content changes so a
+        # config update (e.g. DNS resolver tweaks) actually takes effect.
+        "checksum/config" = sha256(kubernetes_config_map.haproxy.data["haproxy.cfg"])
+      }
     }
     spec {
       container {
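
The `checksum/config` annotation is the usual Helm-style trick: hashing
the rendered config into the pod template metadata changes the template
whenever haproxy.cfg changes, which forces a rolling update. A quick
way to confirm the roll, assuming the same namespace and names as
above:

    terraform apply
    kubectl -n redis rollout status deploy/redis-haproxy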