infra/stacks/platform/modules/redis/main.tf
Viktor Barzin b323e567e4 Add HAProxy for Redis HA master-only routing
The Redis K8s Service was load-balancing across both master and replica
nodes, causing READONLY errors when clients hit the replica. This broke
Nextcloud (DAV 500s, liveness probe timeouts, crash loops) and
potentially other services.

Replace the direct Service with HAProxy (2 replicas) that health-checks
each Redis node via INFO replication and only routes to role:master.
On Sentinel failover, HAProxy detects the new master within ~9 seconds.
2026-03-13 22:21:10 +00:00

314 lines
7.7 KiB
HCL

# Module inputs.

variable "tls_secret_name" {
  type        = string
  description = "TLS secret name forwarded to the setup_tls_secret module for the redis namespace."
}

variable "tier" {
  type        = string
  description = "Value applied as the `tier` label on the redis namespace."
}

variable "nfs_server" {
  type        = string
  description = "NFS server address forwarded to the nfs_volume module that backs the Redis backup volume."
}
# Namespace that every resource in this file is placed in.
# The only label it carries is the caller-supplied tier.
resource "kubernetes_namespace" "redis" {
  metadata {
    name   = "redis"
    labels = { tier = var.tier }
  }
}
# Invokes the shared setup_tls_secret module against the redis namespace,
# passing through the secret name supplied by the caller.
module "tls_secret" {
  source = "../../../../modules/kubernetes/setup_tls_secret"

  tls_secret_name = var.tls_secret_name
  namespace       = kubernetes_namespace.redis.metadata[0].name
}
# Redis with Sentinel HA, installed from the Bitnami "redis" Helm chart.
#
# Topology as configured here: replication architecture with Sentinel enabled
# and a replica count of 2. The HAProxy config in this file targets the pods
# redis-node-0 and redis-node-1, i.e. one master + one replica, each with a
# sentinel. HAProxy (defined below in this file) fronts the nodes and forwards
# client traffic only to whichever node currently reports role:master.
#
# NOTE(review): quorum is 2 with only two sentinels. Redis Sentinel guidance
# recommends at least three sentinels; with two, losing one node (and its
# sentinel) presumably leaves no majority to authorize a failover. Moving to
# three nodes would also require a third backend in the HAProxy config, so it
# is not changed here - confirm and plan separately.
resource "helm_release" "redis" {
  name             = "redis"
  namespace        = kubernetes_namespace.redis.metadata[0].name
  create_namespace = false

  repository = "oci://10.0.20.10:5000/bitnamicharts"
  chart      = "redis"
  version    = "25.3.2"

  # Roll back automatically if the release does not become healthy in time.
  atomic  = true
  timeout = 600

  values = [yamlencode({
    # Keep the chart's resource names based on "redis".
    # NOTE(review): an earlier comment here stated that the chart's regular
    # Service points at the master. The `redis` Service clients actually use
    # is the HAProxy-backed kubernetes_service.redis in this file - confirm
    # how the chart-managed Service of the same base name is handled.
    nameOverride = "redis"

    architecture = "replication"

    auth = {
      enabled = false
    }

    # Prometheus metrics exporter is switched off.
    metrics = {
      enabled = false
    }

    sentinel = {
      enabled         = true
      masterSet       = "mymaster"
      quorum          = 2
      automateCluster = true
      resources = {
        limits = {
          cpu    = "200m"
          memory = "128Mi"
        }
        requests = {
          cpu    = "50m"
          memory = "64Mi"
        }
      }
    }

    master = {
      persistence = {
        enabled      = true
        size         = "2Gi"
        storageClass = "iscsi-truenas"
      }
      resources = {
        limits = {
          cpu    = "500m"
          memory = "256Mi"
        }
        requests = {
          cpu    = "100m"
          memory = "64Mi"
        }
      }
    }

    replica = {
      replicaCount = 2
      persistence = {
        enabled      = true
        size         = "2Gi"
        storageClass = "iscsi-truenas"
      }
      resources = {
        limits = {
          cpu    = "500m"
          memory = "256Mi"
        }
        requests = {
          cpu    = "50m"
          memory = "64Mi"
        }
      }
    }
  })]
}
# HAProxy-based master-only proxy for simple redis:// clients.
#
# Each Redis node is health-checked with PING + INFO replication and is only
# considered up while it answers role:master, so client traffic on 6379 always
# lands on the current master. After a Sentinel failover the old master fails
# the check (inter 3s x fall 3, roughly 9s) and the promoted node passes it.
# Previously this was a K8s Service that routed to all nodes, causing READONLY
# errors when clients hit a replica.
#
# Two behaviours are configured explicitly:
#   * Runtime DNS resolution (resolvers + init-addr): without it HAProxy
#     resolves the pod hostnames once at start-up, so a Redis pod that is
#     rescheduled with a new IP would stay "down" until HAProxy is restarted.
#     `init-addr ... none` also lets HAProxy start when a pod name does not
#     resolve yet.
#   * on-marked-down shutdown-sessions: when a node stops being master, the
#     already-established client connections to it are closed, forcing clients
#     to reconnect to the new master instead of receiving READONLY errors on
#     long-lived connections.
resource "kubernetes_config_map" "haproxy" {
  metadata {
    name      = "redis-haproxy"
    namespace = kubernetes_namespace.redis.metadata[0].name
  }

  data = {
    "haproxy.cfg" = <<-EOT
      global
        maxconn 256

      # Re-resolve backend hostnames at runtime using the pod's /etc/resolv.conf.
      resolvers k8s
        parse-resolv-conf
        resolve_retries 3
        timeout resolve 1s
        timeout retry 1s
        hold valid 5s

      defaults
        mode tcp
        timeout connect 5s
        timeout client 30s
        timeout server 30s
        timeout check 3s

      frontend redis_front
        bind *:6379
        default_backend redis_master

      frontend sentinel_front
        bind *:26379
        default_backend redis_sentinel

      backend redis_master
        option tcp-check
        tcp-check connect
        tcp-check send "PING\r\n"
        tcp-check expect string +PONG
        tcp-check send "INFO replication\r\n"
        tcp-check expect string role:master
        tcp-check send "QUIT\r\n"
        tcp-check expect string +OK
        server redis-node-0 redis-node-0.redis-headless.redis.svc.cluster.local:6379 check inter 3s fall 3 rise 2 on-marked-down shutdown-sessions resolvers k8s init-addr last,libc,none
        server redis-node-1 redis-node-1.redis-headless.redis.svc.cluster.local:6379 check inter 3s fall 3 rise 2 on-marked-down shutdown-sessions resolvers k8s init-addr last,libc,none

      backend redis_sentinel
        balance roundrobin
        server redis-node-0 redis-node-0.redis-headless.redis.svc.cluster.local:26379 check inter 5s resolvers k8s init-addr last,libc,none
        server redis-node-1 redis-node-1.redis-headless.redis.svc.cluster.local:26379 check inter 5s resolvers k8s init-addr last,libc,none
    EOT
  }
}
# Runs two HAProxy pods that load the redis-haproxy ConfigMap and expose the
# Redis (6379) and Sentinel (26379) ports. Created only after the Redis Helm
# release so the backend pods exist when HAProxy starts.
resource "kubernetes_deployment" "haproxy" {
  metadata {
    name      = "redis-haproxy"
    namespace = kubernetes_namespace.redis.metadata[0].name
    labels = {
      app = "redis-haproxy"
    }
  }

  spec {
    replicas = 2

    selector {
      match_labels = {
        app = "redis-haproxy"
      }
    }

    template {
      metadata {
        labels = {
          app = "redis-haproxy"
        }
        # HAProxy only reads its config at start-up, so a ConfigMap change
        # would otherwise never reach running pods. Hashing the config into a
        # pod-template annotation makes any config edit trigger a rollout.
        annotations = {
          "checksum/config" = sha256(kubernetes_config_map.haproxy.data["haproxy.cfg"])
        }
      }

      spec {
        container {
          name  = "haproxy"
          image = "docker.io/library/haproxy:3.1-alpine"

          port {
            container_port = 6379
            name           = "redis"
          }
          port {
            container_port = 26379
            name           = "sentinel"
          }

          # The image's default config path; haproxy.cfg comes from the ConfigMap.
          volume_mount {
            name       = "config"
            mount_path = "/usr/local/etc/haproxy"
            read_only  = true
          }

          resources {
            requests = {
              cpu    = "10m"
              memory = "16Mi"
            }
            limits = {
              cpu    = "100m"
              memory = "32Mi"
            }
          }

          # Keep the pod out of the Service endpoints until HAProxy is
          # actually listening on the Redis port.
          readiness_probe {
            tcp_socket {
              port = 6379
            }
            initial_delay_seconds = 2
            period_seconds        = 5
          }

          # Restart the container if the Redis listener stops accepting connections.
          liveness_probe {
            tcp_socket {
              port = 6379
            }
            initial_delay_seconds = 5
            period_seconds        = 10
          }
        }

        volume {
          name = "config"
          config_map {
            name = kubernetes_config_map.haproxy.metadata[0].name
          }
        }
      }
    }
  }

  depends_on = [helm_release.redis]
}
# Service named "redis" in the redis namespace. Its selector matches the
# HAProxy pods (app = redis-haproxy), so both the Redis and Sentinel ports are
# served through the proxy rather than by the Redis pods directly.
resource "kubernetes_service" "redis" {
  depends_on = [kubernetes_deployment.haproxy]

  metadata {
    namespace = kubernetes_namespace.redis.metadata[0].name
    name      = "redis"
  }

  spec {
    selector = { app = "redis-haproxy" }

    # Redis client traffic.
    port {
      name        = "tcp-redis"
      port        = 6379
      target_port = 6379
    }

    # Sentinel traffic.
    port {
      name        = "tcp-sentinel"
      port        = 26379
      target_port = 26379
    }
  }
}
# Invokes the shared nfs_volume module for the backup export; the backup
# CronJob mounts the claim this module exposes as `claim_name`.
module "nfs_backup" {
  source = "../../../../modules/kubernetes/nfs_volume"

  namespace  = kubernetes_namespace.redis.metadata[0].name
  name       = "redis-backup"
  nfs_path   = "/mnt/main/redis-backup"
  nfs_server = var.nfs_server
}
# Hourly backup: pull an RDB snapshot from the master (via the `redis`
# Service, i.e. through HAProxy) and store it on the NFS backup volume.
#
# The snapshot is streamed into a temporary file and only renamed over
# /backup/dump.rdb once the transfer has succeeded and the file is non-empty,
# so a failed or interrupted run never destroys the previous good backup.
resource "kubernetes_cron_job_v1" "redis-backup" {
  metadata {
    name      = "redis-backup"
    namespace = kubernetes_namespace.redis.metadata[0].name
  }

  spec {
    schedule           = "0 * * * *"
    concurrency_policy = "Replace"
    # NOTE(review): Kubernetes documents that the CronJob controller checks
    # schedules roughly every 10 seconds, so a 10s deadline leaves little
    # margin before a run is counted as missed - confirm this is intended.
    starting_deadline_seconds     = 10
    failed_jobs_history_limit     = 3
    successful_jobs_history_limit = 3

    job_template {
      metadata {}

      spec {
        backoff_limit              = 2
        ttl_seconds_after_finished = 60

        template {
          metadata {}

          spec {
            container {
              name  = "redis-backup"
              image = "redis:7-alpine"
              command = ["/bin/sh", "-c", <<-EOT
                set -eux
                tmp=/backup/dump.rdb.tmp
                # Trigger a fresh RDB save on the master
                redis-cli -h redis.redis BGSAVE
                sleep 5
                # Copy the RDB via redis-cli --rdb into a temp file first
                redis-cli -h redis.redis --rdb "$tmp"
                # Refuse to publish an empty snapshot
                test -s "$tmp"
                # Rename within the same directory so the swap is a single step
                mv -f "$tmp" /backup/dump.rdb
                echo "Backup complete: $(ls -lh /backup/dump.rdb)"
              EOT
              ]

              volume_mount {
                name       = "backup"
                mount_path = "/backup"
              }
            }

            volume {
              name = "backup"
              persistent_volume_claim {
                claim_name = module.nfs_backup.claim_name
              }
            }
          }
        }
      }
    }
  }
}