The redis-v2 Sentinel cluster split-brained: redis-v2-0 booted during a network partition, hit the init script's deterministic "pod-0 = bootstrap master" fallback, and became a SECOND master alongside the sentinel-elected redis-v2-2. HAProxy's `expect rstring role:master` matched both and round-robined client connections across the two diverging masters, so Immich enqueued BullMQ jobs on one while its workers blocked-popped on the other -> every queue wedged and new-upload thumbnails 404'd cluster-wide. Third Sentinel-class incident in ~6 weeks (after the 2026-04-19 PM quorum drift and 2026-04-22 flap cascade). Revert to a single standalone instance: replicas=1; drop Sentinel + HAProxy + init bootstrap configmap + both PDBs; redis container only (+ exporter). maxmemory-policy allkeys-lru -> volatile-lru so one shared instance serves both workload classes correctly: evict only TTL'd cache keys, never TTL-less Immich BullMQ / Celery job keys. redis-master service name/DNS unchanged -> no consumer edits; collapsed onto redis-v2-0's existing dataset (queued jobs preserved). Applied via tg (Tier 1 / PG-authoritative state); this commit syncs source + docs only, hence [ci skip]. Monitoring: drop RedisReplicationLagHigh + RedisReplicasMissing (no replicas now; the latter would false-fire), RedisMemoryPressure 85%->80% volatile-lru backstop. Docs: rewrite databases.md Redis section (single-instance design + incident history); add post-mortem 2026-05-30-redis-split-brain.md. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
405 lines
13 KiB
HCL
405 lines
13 KiB
HCL
variable "tls_secret_name" {}
|
|
variable "tier" { type = string }
|
|
variable "nfs_server" { type = string }
|
|
|
|
resource "kubernetes_namespace" "redis" {
|
|
metadata {
|
|
name = "redis"
|
|
labels = {
|
|
tier = var.tier
|
|
"keel.sh/enrolled" = "true"
|
|
}
|
|
}
|
|
lifecycle {
|
|
# KYVERNO_LIFECYCLE_V1: goldilocks-vpa-auto-mode ClusterPolicy stamps this label on every namespace
|
|
ignore_changes = [metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"]]
|
|
}
|
|
}
|
|
|
|
module "tls_secret" {
|
|
source = "../../../../modules/kubernetes/setup_tls_secret"
|
|
namespace = kubernetes_namespace.redis.metadata[0].name
|
|
tls_secret_name = var.tls_secret_name
|
|
}
|
|
|
|
#### Redis — SINGLE standalone instance (reverted from 3-node Sentinel HA 2026-05-30)
|
|
#
|
|
# History: a 3-node StatefulSet + Sentinel + HAProxy (the "redis-v2" rework of
|
|
# 2026-04-19, beads code-v2b) was built to eliminate the 2-sentinel split-brain
|
|
# of the 2026-04-19 PM incident. It STILL split-brained on 2026-05-30:
|
|
# redis-v2-0 booted during a network partition, hit the init script's
|
|
# "pod-0 is always the bootstrap master" fallback, and became a SECOND master
|
|
# alongside the sentinel-elected redis-v2-2. HAProxy's `expect rstring
|
|
# role:master` matched BOTH, so it round-robined client connections across
|
|
# both masters — Immich enqueued BullMQ jobs on one instance while its workers
|
|
# blocked-popped on the other, wedging every queue (new-upload thumbnails 404'd
|
|
# cluster-wide). Third Redis HA incident in ~6 weeks.
|
|
#
|
|
# Decision (Viktor, 2026-05-30): revert to a SINGLE instance. A homelab
|
|
# cache/broker does not need HA; a few seconds of downtime on a pod restart is
|
|
# an acceptable trade for structurally removing the entire split-brain class
|
|
# (no sentinel quorum, no second master, no HAProxy master fan-out).
|
|
#
|
|
# eviction policy `volatile-lru` (was `allkeys-lru`): the instance is shared by
|
|
# ~15 consumers split between CACHES (want LRU eviction of disposable keys) and
|
|
# QUEUES (Immich BullMQ `bull:*`, Celery `_kombu:*` — must NEVER be evicted or
|
|
# jobs vanish). `volatile-lru` evicts only keys that carry a TTL (caches set
|
|
# them) and never touches TTL-less keys (queue jobs), so it serves both
|
|
# correctly in one instance. Backstop: PrometheusRule RedisMemoryHigh (>80%)
|
|
# in the monitoring stack — if it ever fills with non-volatile keys, writes
|
|
# error like noeviction, and we want to know before that happens.
|
|
#
|
|
# Service name `redis-master.redis.svc.cluster.local:6379` is UNCHANGED so all
|
|
# ~15 consumers keep working without edits — it now selects the redis pod
|
|
# directly instead of HAProxy. Confirmed (2026-05-30) no consumer used the
|
|
# Sentinel port (26379); Nextcloud dropped its in-process sentinel query in the
|
|
# 2026-04-19 rework. Pattern mirrors the MySQL standalone (memory 711).
|
|
|
|
resource "kubernetes_config_map" "redis_v2_conf" {
|
|
metadata {
|
|
name = "redis-v2-conf"
|
|
namespace = kubernetes_namespace.redis.metadata[0].name
|
|
}
|
|
data = {
|
|
"redis.conf" = <<-EOT
|
|
bind 0.0.0.0 -::*
|
|
port 6379
|
|
protected-mode no
|
|
dir /data
|
|
|
|
maxmemory 640mb
|
|
# volatile-lru: evict only keys WITH a TTL (caches) under memory
|
|
# pressure; never evict TTL-less keys (Immich BullMQ + Celery jobs).
|
|
# See the header comment for the full rationale. Was allkeys-lru, which
|
|
# silently evicted queue jobs.
|
|
maxmemory-policy volatile-lru
|
|
|
|
save 900 1
|
|
save 300 100
|
|
save 60 10000
|
|
rdbcompression yes
|
|
rdbchecksum yes
|
|
stop-writes-on-bgsave-error no
|
|
|
|
appendonly yes
|
|
appendfsync everysec
|
|
no-appendfsync-on-rewrite no
|
|
auto-aof-rewrite-percentage 200
|
|
auto-aof-rewrite-min-size 128mb
|
|
aof-load-truncated yes
|
|
aof-use-rdb-preamble yes
|
|
# Allow loading an AOF with up to 1KB of garbage at the tail (post-2026-05-26
|
|
# node2 unclean reboot corrupted an incremental AOF; without this redis
|
|
# crashlooped). Redis truncates the corrupted tail and continues.
|
|
aof-load-corrupt-tail-max-size 1024
|
|
|
|
timeout 0
|
|
tcp-keepalive 300
|
|
tcp-backlog 511
|
|
databases 16
|
|
|
|
loglevel notice
|
|
EOT
|
|
}
|
|
}
|
|
|
|
resource "kubernetes_service" "redis_v2_headless" {
|
|
metadata {
|
|
name = "redis-v2-headless"
|
|
namespace = kubernetes_namespace.redis.metadata[0].name
|
|
labels = {
|
|
app = "redis-v2"
|
|
}
|
|
}
|
|
spec {
|
|
cluster_ip = "None"
|
|
publish_not_ready_addresses = false
|
|
selector = {
|
|
app = "redis-v2"
|
|
}
|
|
port {
|
|
name = "redis"
|
|
port = 6379
|
|
}
|
|
port {
|
|
name = "exporter"
|
|
port = 9121
|
|
}
|
|
}
|
|
}
|
|
|
|
# Stable client-facing service for all ~15 Redis consumers.
|
|
# Name/DNS (redis-master.redis.svc.cluster.local) unchanged across the HA
|
|
# teardown; now selects the redis pod directly (HAProxy removed).
|
|
resource "kubernetes_service" "redis_master" {
|
|
metadata {
|
|
name = "redis-master"
|
|
namespace = kubernetes_namespace.redis.metadata[0].name
|
|
labels = {
|
|
app = "redis-v2"
|
|
}
|
|
}
|
|
spec {
|
|
selector = {
|
|
app = "redis-v2"
|
|
}
|
|
port {
|
|
name = "redis"
|
|
port = 6379
|
|
target_port = 6379
|
|
}
|
|
}
|
|
}
|
|
|
|
module "nfs_backup_host" {
|
|
source = "../../../../modules/kubernetes/nfs_volume"
|
|
name = "redis-backup-host"
|
|
namespace = kubernetes_namespace.redis.metadata[0].name
|
|
nfs_server = "192.168.1.127"
|
|
nfs_path = "/srv/nfs/redis-backup"
|
|
}
|
|
|
|
resource "kubernetes_stateful_set_v1" "redis_v2" {
|
|
metadata {
|
|
name = "redis-v2"
|
|
namespace = kubernetes_namespace.redis.metadata[0].name
|
|
labels = {
|
|
app = "redis-v2"
|
|
# Keel opt-out: a :8-alpine -> :8.0.6-alpine patch bump (also a
|
|
# semantic downgrade) rejected `aof-load-corrupt-tail-max-size` and
|
|
# crashed redis. Both LABEL + ANNOTATION required for full opt-out.
|
|
"keel.sh/policy" = "never"
|
|
}
|
|
annotations = {
|
|
"keel.sh/policy" = "never"
|
|
}
|
|
}
|
|
spec {
|
|
service_name = kubernetes_service.redis_v2_headless.metadata[0].name
|
|
replicas = 1
|
|
# pod_management_policy is immutable on a StatefulSet — kept as "Parallel"
|
|
# (unchanged from the 3-node era) so this revert does NOT force a
|
|
# destroy/recreate of the STS (which would detach the data PVC).
|
|
pod_management_policy = "Parallel"
|
|
|
|
selector {
|
|
match_labels = {
|
|
app = "redis-v2"
|
|
}
|
|
}
|
|
|
|
template {
|
|
metadata {
|
|
labels = {
|
|
app = "redis-v2"
|
|
}
|
|
annotations = {
|
|
"prometheus.io/scrape" = "true"
|
|
"prometheus.io/port" = "9121"
|
|
"checksum/conf" = sha256(kubernetes_config_map.redis_v2_conf.data["redis.conf"])
|
|
}
|
|
}
|
|
spec {
|
|
termination_grace_period_seconds = 30
|
|
|
|
container {
|
|
name = "redis"
|
|
image = "docker.io/library/redis:8-alpine"
|
|
command = ["redis-server", "/etc/redis/redis.conf"]
|
|
|
|
port {
|
|
container_port = 6379
|
|
name = "redis"
|
|
}
|
|
|
|
resources {
|
|
requests = {
|
|
cpu = "100m"
|
|
memory = "768Mi"
|
|
}
|
|
limits = {
|
|
memory = "768Mi"
|
|
}
|
|
}
|
|
|
|
volume_mount {
|
|
name = "data"
|
|
mount_path = "/data"
|
|
}
|
|
volume_mount {
|
|
name = "conf"
|
|
mount_path = "/etc/redis"
|
|
read_only = true
|
|
}
|
|
|
|
liveness_probe {
|
|
exec {
|
|
command = ["redis-cli", "PING"]
|
|
}
|
|
initial_delay_seconds = 15
|
|
period_seconds = 10
|
|
timeout_seconds = 10
|
|
failure_threshold = 5
|
|
}
|
|
readiness_probe {
|
|
exec {
|
|
command = ["redis-cli", "PING"]
|
|
}
|
|
initial_delay_seconds = 5
|
|
period_seconds = 5
|
|
timeout_seconds = 3
|
|
failure_threshold = 3
|
|
}
|
|
}
|
|
|
|
container {
|
|
name = "exporter"
|
|
image = "docker.io/oliver006/redis_exporter:v1.62.0"
|
|
|
|
port {
|
|
container_port = 9121
|
|
name = "exporter"
|
|
}
|
|
|
|
env {
|
|
name = "REDIS_ADDR"
|
|
value = "redis://localhost:6379"
|
|
}
|
|
|
|
resources {
|
|
requests = {
|
|
cpu = "10m"
|
|
memory = "32Mi"
|
|
}
|
|
limits = {
|
|
memory = "32Mi"
|
|
}
|
|
}
|
|
|
|
liveness_probe {
|
|
http_get {
|
|
path = "/"
|
|
port = 9121
|
|
}
|
|
initial_delay_seconds = 15
|
|
period_seconds = 30
|
|
timeout_seconds = 5
|
|
}
|
|
}
|
|
|
|
volume {
|
|
name = "conf"
|
|
config_map {
|
|
name = kubernetes_config_map.redis_v2_conf.metadata[0].name
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
volume_claim_template {
|
|
metadata {
|
|
name = "data"
|
|
annotations = {
|
|
# NOTE: VCT is immutable on a live StatefulSet — this must match the
|
|
# live value (drifted to 80% out-of-band) or apply fails with
|
|
# "updates to statefulset spec ... forbidden". Don't "fix" to 10%.
|
|
"resize.topolvm.io/threshold" = "80%"
|
|
"resize.topolvm.io/increase" = "100%"
|
|
"resize.topolvm.io/storage_limit" = "20Gi"
|
|
}
|
|
}
|
|
spec {
|
|
access_modes = ["ReadWriteOnce"]
|
|
storage_class_name = "proxmox-lvm-encrypted"
|
|
resources {
|
|
requests = {
|
|
storage = "5Gi"
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
lifecycle {
|
|
# KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
|
|
ignore_changes = [spec[0].template[0].spec[0].dns_config]
|
|
}
|
|
}
|
|
|
|
# Weekly backup: copy RDB snapshot to NFS
|
|
resource "kubernetes_cron_job_v1" "redis-backup" {
|
|
metadata {
|
|
name = "redis-backup"
|
|
namespace = kubernetes_namespace.redis.metadata[0].name
|
|
}
|
|
spec {
|
|
concurrency_policy = "Replace"
|
|
failed_jobs_history_limit = 3
|
|
schedule = "0 3 * * 0"
|
|
starting_deadline_seconds = 10
|
|
successful_jobs_history_limit = 3
|
|
job_template {
|
|
metadata {}
|
|
spec {
|
|
backoff_limit = 2
|
|
ttl_seconds_after_finished = 60
|
|
template {
|
|
metadata {}
|
|
spec {
|
|
container {
|
|
name = "redis-backup"
|
|
image = "redis:7-alpine"
|
|
command = ["/bin/sh", "-c", <<-EOT
|
|
set -eux
|
|
_t0=$(date +%s)
|
|
_rb0=$(awk '/^read_bytes/{print $2}' /proc/$$/io 2>/dev/null || echo 0)
|
|
_wb0=$(awk '/^write_bytes/{print $2}' /proc/$$/io 2>/dev/null || echo 0)
|
|
|
|
TIMESTAMP=$(date +%Y%m%d-%H%M)
|
|
# Trigger a fresh RDB save on the master
|
|
redis-cli -h redis-master.redis BGSAVE
|
|
sleep 5
|
|
# Copy the RDB via redis-cli --rdb
|
|
redis-cli -h redis-master.redis --rdb /backup/redis-$TIMESTAMP.rdb
|
|
# Rotate — 28-day retention
|
|
find /backup -name 'redis-*.rdb' -type f -mtime +28 -delete
|
|
|
|
_dur=$(($(date +%s) - _t0))
|
|
_rb1=$(awk '/^read_bytes/{print $2}' /proc/$$/io 2>/dev/null || echo 0)
|
|
_wb1=$(awk '/^write_bytes/{print $2}' /proc/$$/io 2>/dev/null || echo 0)
|
|
echo "=== Backup IO Stats ==="
|
|
echo "duration: $${_dur}s"
|
|
echo "read: $(( (_rb1 - _rb0) / 1048576 )) MiB"
|
|
echo "written: $(( (_wb1 - _wb0) / 1048576 )) MiB"
|
|
echo "output: $(ls -lh /backup/redis-$$TIMESTAMP.rdb | awk '{print $5}')"
|
|
|
|
_out_bytes=$(stat -c%s /backup/redis-$TIMESTAMP.rdb)
|
|
wget -qO- --post-data "backup_duration_seconds $${_dur}
|
|
backup_read_bytes $(( _rb1 - _rb0 ))
|
|
backup_written_bytes $(( _wb1 - _wb0 ))
|
|
backup_output_bytes $${_out_bytes}
|
|
backup_last_success_timestamp $(date +%s)
|
|
" "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/redis-backup" || true
|
|
EOT
|
|
]
|
|
volume_mount {
|
|
name = "backup"
|
|
mount_path = "/backup"
|
|
}
|
|
}
|
|
volume {
|
|
name = "backup"
|
|
persistent_volume_claim {
|
|
claim_name = module.nfs_backup_host.claim_name
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
lifecycle {
|
|
# KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
|
|
ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config]
|
|
}
|
|
}
|