From d5a47e35fcacb99d5d3b752ea2e4683df96cf2e1 Mon Sep 17 00:00:00 2001
From: Viktor Barzin
Date: Sun, 19 Apr 2026 12:39:09 +0000
Subject: [PATCH] [redis] Restore dynamic DNS in HAProxy to fix stale-IP outage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

HAProxy resolved `redis-node-{0,1}.redis-headless.redis.svc.cluster.local`
once at pod startup and cached the IPs forever. When the redis-node pods
cycled and came back with new IPs, HAProxy kept connecting to the dead
ones: the backends flapped between "Connection refused" and "Layer4
timeout", and Immich's ioredis client hit EPIPE until its retry limit was
exhausted and the pod entered CrashLoopBackOff. This caused an Immich
outage on 2026-04-19.

Fix:
- Add a `resolvers kubernetes` section pointing at kube-dns, with a 10s
  hold on every response category so that pod IP changes are picked up
  within one DNS TTL window.
- Add `resolvers kubernetes init-addr last,libc,none` to every backend
  `server` line, so HAProxy still resolves at startup (via libc) and then
  keeps re-resolving through the dynamic resolver at runtime.
- Add a `checksum/config` pod annotation to the HAProxy Deployment so that
  a haproxy.cfg change actually rolls the pods (including this one).
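
A sketch of how to check the resolver change end-to-end (assumes the
`redis` namespace from the DNS names above, a HAProxy Deployment named
redis-haproxy, and that a busybox image is pullable in-cluster):

    # kube-dns answers for the headless record:
    kubectl -n redis run dnscheck --rm -it --image=busybox --restart=Never -- \
      nslookup redis-node-0.redis-headless.redis.svc.cluster.local

    # Cycle a redis pod so it comes back with a new IP...
    kubectl -n redis delete pod redis-node-0

    # ...and watch haproxy pick it up; with the resolver in place it logs
    # a line like "redis_master/redis-node-0 changed its IP from X to Y".
    kubectl -n redis logs deploy/redis-haproxy -f | grep -i "changed its ip"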
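
For a point-in-time view of what each backend has resolved, HAProxy's
runtime API command `show servers state` works too. Note this config does
not declare a stats socket, so the path below is a hypothetical `stats
socket` addition to the global section, and it assumes socat is present
in the image:

    kubectl -n redis exec deploy/redis-haproxy -- sh -c \
      'echo "show servers state redis_master" | socat stdio /var/run/haproxy.sock'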
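
To confirm the checksum annotation actually rolls the pods after a config
tweak (bracket quoting in the jsonpath because the key contains a slash):

    # The pod template should carry the new hash after `terraform apply`...
    kubectl -n redis get deploy redis-haproxy \
      -o jsonpath="{.spec.template.metadata.annotations['checksum/config']}"

    # ...and the rollout should then complete on its own:
    kubectl -n redis rollout status deploy/redis-haproxy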

Closes: code-fd6
---
 stacks/redis/modules/redis/main.tf | 31 +++++++++++++++++++++++++++----
 1 file changed, 27 insertions(+), 4 deletions(-)

diff --git a/stacks/redis/modules/redis/main.tf b/stacks/redis/modules/redis/main.tf
index c0c9d7b6..50c4e3a4 100644
--- a/stacks/redis/modules/redis/main.tf
+++ b/stacks/redis/modules/redis/main.tf
@@ -148,6 +148,24 @@ resource "kubernetes_config_map" "haproxy" {
       timeout server 30s
       timeout check 3s
 
+    # Dynamic DNS resolution via cluster CoreDNS. Without this, haproxy
+    # resolves server hostnames once at startup and caches forever, so
+    # when redis-node-X pods restart and get new IPs, haproxy keeps
+    # connecting to the old (dead) IPs and returns "Connection refused"
+    # until haproxy itself is restarted. This caused an immich outage
+    # on 2026-04-19 after a redis pod cycle.
+    resolvers kubernetes
+      nameserver coredns kube-dns.kube-system.svc.cluster.local:53
+      resolve_retries 3
+      timeout resolve 1s
+      timeout retry 1s
+      hold other 10s
+      hold refused 10s
+      hold nx 10s
+      hold timeout 10s
+      hold valid 10s
+      hold obsolete 10s
+
     frontend redis_front
       bind *:6379
       default_backend redis_master
@@ -167,13 +185,13 @@ resource "kubernetes_config_map" "haproxy" {
       tcp-check expect rstring role:master
       tcp-check send "QUIT\r\n"
       tcp-check expect string +OK
-      server redis-node-0 redis-node-0.redis-headless.redis.svc.cluster.local:6379 check inter 1s fall 2 rise 2
-      server redis-node-1 redis-node-1.redis-headless.redis.svc.cluster.local:6379 check inter 1s fall 2 rise 2
+      server redis-node-0 redis-node-0.redis-headless.redis.svc.cluster.local:6379 check inter 1s fall 2 rise 2 resolvers kubernetes init-addr last,libc,none
+      server redis-node-1 redis-node-1.redis-headless.redis.svc.cluster.local:6379 check inter 1s fall 2 rise 2 resolvers kubernetes init-addr last,libc,none
 
     backend redis_sentinel
       balance roundrobin
-      server redis-node-0 redis-node-0.redis-headless.redis.svc.cluster.local:26379 check inter 5s
-      server redis-node-1 redis-node-1.redis-headless.redis.svc.cluster.local:26379 check inter 5s
+      server redis-node-0 redis-node-0.redis-headless.redis.svc.cluster.local:26379 check inter 5s resolvers kubernetes init-addr last,libc,none
+      server redis-node-1 redis-node-1.redis-headless.redis.svc.cluster.local:26379 check inter 5s resolvers kubernetes init-addr last,libc,none
   EOT
   }
 }
@@ -198,6 +216,11 @@ resource "kubernetes_deployment" "haproxy" {
         labels = {
           app = "redis-haproxy"
         }
+        annotations = {
+          # Roll the deployment whenever haproxy.cfg content changes so a
+          # config update (e.g. DNS resolver tweaks) actually takes effect.
+          "checksum/config" = sha256(kubernetes_config_map.haproxy.data["haproxy.cfg"])
+        }
       }
       spec {
         container {