fix: restore tree dropped by 6d224861; land stem95su gdrive-sync (10m) [ci skip]
6d224861 came from a --no-checkout worktree whose empty index made the
commit drop every file except two. This restores 05b50d2b's full tree and
correctly adds stacks/stem95su/gdrive-sync.tf + the service-catalog stem95su
entry. Forward-only (parent=6d224861, no force-push); [ci skip] since the
live infra was never applied from the broken commit.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
6d224861c4
commit
fd0f4a0365
1166 changed files with 358546 additions and 0 deletions
405
stacks/redis/modules/redis/main.tf
Normal file
405
stacks/redis/modules/redis/main.tf
Normal file
|
|
@ -0,0 +1,405 @@
|
|||
variable "tls_secret_name" {}
|
||||
variable "tier" { type = string }
|
||||
variable "nfs_server" { type = string }
|
||||
|
||||
resource "kubernetes_namespace" "redis" {
|
||||
metadata {
|
||||
name = "redis"
|
||||
labels = {
|
||||
tier = var.tier
|
||||
"keel.sh/enrolled" = "true"
|
||||
}
|
||||
}
|
||||
lifecycle {
|
||||
# KYVERNO_LIFECYCLE_V1: goldilocks-vpa-auto-mode ClusterPolicy stamps this label on every namespace
|
||||
ignore_changes = [metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"]]
|
||||
}
|
||||
}
|
||||
|
||||
module "tls_secret" {
|
||||
source = "../../../../modules/kubernetes/setup_tls_secret"
|
||||
namespace = kubernetes_namespace.redis.metadata[0].name
|
||||
tls_secret_name = var.tls_secret_name
|
||||
}
|
||||
|
||||
#### Redis — SINGLE standalone instance (reverted from 3-node Sentinel HA 2026-05-30)
|
||||
#
|
||||
# History: a 3-node StatefulSet + Sentinel + HAProxy (the "redis-v2" rework of
|
||||
# 2026-04-19, beads code-v2b) was built to eliminate the 2-sentinel split-brain
|
||||
# of the 2026-04-19 PM incident. It STILL split-brained on 2026-05-30:
|
||||
# redis-v2-0 booted during a network partition, hit the init script's
|
||||
# "pod-0 is always the bootstrap master" fallback, and became a SECOND master
|
||||
# alongside the sentinel-elected redis-v2-2. HAProxy's `expect rstring
|
||||
# role:master` matched BOTH, so it round-robined client connections across
|
||||
# both masters — Immich enqueued BullMQ jobs on one instance while its workers
|
||||
# blocked-popped on the other, wedging every queue (new-upload thumbnails 404'd
|
||||
# cluster-wide). Third Redis HA incident in ~6 weeks.
|
||||
#
|
||||
# Decision (Viktor, 2026-05-30): revert to a SINGLE instance. A homelab
|
||||
# cache/broker does not need HA; a few seconds of downtime on a pod restart is
|
||||
# an acceptable trade for structurally removing the entire split-brain class
|
||||
# (no sentinel quorum, no second master, no HAProxy master fan-out).
|
||||
#
|
||||
# eviction policy `volatile-lru` (was `allkeys-lru`): the instance is shared by
|
||||
# ~15 consumers split between CACHES (want LRU eviction of disposable keys) and
|
||||
# QUEUES (Immich BullMQ `bull:*`, Celery `_kombu:*` — must NEVER be evicted or
|
||||
# jobs vanish). `volatile-lru` evicts only keys that carry a TTL (caches set
|
||||
# them) and never touches TTL-less keys (queue jobs), so it serves both
|
||||
# correctly in one instance. Backstop: PrometheusRule RedisMemoryHigh (>80%)
|
||||
# in the monitoring stack — if it ever fills with non-volatile keys, writes
|
||||
# error like noeviction, and we want to know before that happens.
|
||||
#
|
||||
# Service name `redis-master.redis.svc.cluster.local:6379` is UNCHANGED so all
|
||||
# ~15 consumers keep working without edits — it now selects the redis pod
|
||||
# directly instead of HAProxy. Confirmed (2026-05-30) no consumer used the
|
||||
# Sentinel port (26379); Nextcloud dropped its in-process sentinel query in the
|
||||
# 2026-04-19 rework. Pattern mirrors the MySQL standalone (memory 711).
|
||||
|
||||
resource "kubernetes_config_map" "redis_v2_conf" {
|
||||
metadata {
|
||||
name = "redis-v2-conf"
|
||||
namespace = kubernetes_namespace.redis.metadata[0].name
|
||||
}
|
||||
data = {
|
||||
"redis.conf" = <<-EOT
|
||||
bind 0.0.0.0 -::*
|
||||
port 6379
|
||||
protected-mode no
|
||||
dir /data
|
||||
|
||||
maxmemory 640mb
|
||||
# volatile-lru: evict only keys WITH a TTL (caches) under memory
|
||||
# pressure; never evict TTL-less keys (Immich BullMQ + Celery jobs).
|
||||
# See the header comment for the full rationale. Was allkeys-lru, which
|
||||
# silently evicted queue jobs.
|
||||
maxmemory-policy volatile-lru
|
||||
|
||||
save 900 1
|
||||
save 300 100
|
||||
save 60 10000
|
||||
rdbcompression yes
|
||||
rdbchecksum yes
|
||||
stop-writes-on-bgsave-error no
|
||||
|
||||
appendonly yes
|
||||
appendfsync everysec
|
||||
no-appendfsync-on-rewrite no
|
||||
auto-aof-rewrite-percentage 200
|
||||
auto-aof-rewrite-min-size 128mb
|
||||
aof-load-truncated yes
|
||||
aof-use-rdb-preamble yes
|
||||
# Allow loading an AOF with up to 1KB of garbage at the tail (post-2026-05-26
|
||||
# node2 unclean reboot corrupted an incremental AOF; without this redis
|
||||
# crashlooped). Redis truncates the corrupted tail and continues.
|
||||
aof-load-corrupt-tail-max-size 1024
|
||||
|
||||
timeout 0
|
||||
tcp-keepalive 300
|
||||
tcp-backlog 511
|
||||
databases 16
|
||||
|
||||
loglevel notice
|
||||
EOT
|
||||
}
|
||||
}
|
||||
|
||||
resource "kubernetes_service" "redis_v2_headless" {
|
||||
metadata {
|
||||
name = "redis-v2-headless"
|
||||
namespace = kubernetes_namespace.redis.metadata[0].name
|
||||
labels = {
|
||||
app = "redis-v2"
|
||||
}
|
||||
}
|
||||
spec {
|
||||
cluster_ip = "None"
|
||||
publish_not_ready_addresses = false
|
||||
selector = {
|
||||
app = "redis-v2"
|
||||
}
|
||||
port {
|
||||
name = "redis"
|
||||
port = 6379
|
||||
}
|
||||
port {
|
||||
name = "exporter"
|
||||
port = 9121
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Stable client-facing service for all ~15 Redis consumers.
|
||||
# Name/DNS (redis-master.redis.svc.cluster.local) unchanged across the HA
|
||||
# teardown; now selects the redis pod directly (HAProxy removed).
|
||||
resource "kubernetes_service" "redis_master" {
|
||||
metadata {
|
||||
name = "redis-master"
|
||||
namespace = kubernetes_namespace.redis.metadata[0].name
|
||||
labels = {
|
||||
app = "redis-v2"
|
||||
}
|
||||
}
|
||||
spec {
|
||||
selector = {
|
||||
app = "redis-v2"
|
||||
}
|
||||
port {
|
||||
name = "redis"
|
||||
port = 6379
|
||||
target_port = 6379
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
module "nfs_backup_host" {
|
||||
source = "../../../../modules/kubernetes/nfs_volume"
|
||||
name = "redis-backup-host"
|
||||
namespace = kubernetes_namespace.redis.metadata[0].name
|
||||
nfs_server = "192.168.1.127"
|
||||
nfs_path = "/srv/nfs/redis-backup"
|
||||
}
|
||||
|
||||
resource "kubernetes_stateful_set_v1" "redis_v2" {
|
||||
metadata {
|
||||
name = "redis-v2"
|
||||
namespace = kubernetes_namespace.redis.metadata[0].name
|
||||
labels = {
|
||||
app = "redis-v2"
|
||||
# Keel opt-out: a :8-alpine -> :8.0.6-alpine patch bump (also a
|
||||
# semantic downgrade) rejected `aof-load-corrupt-tail-max-size` and
|
||||
# crashed redis. Both LABEL + ANNOTATION required for full opt-out.
|
||||
"keel.sh/policy" = "never"
|
||||
}
|
||||
annotations = {
|
||||
"keel.sh/policy" = "never"
|
||||
}
|
||||
}
|
||||
spec {
|
||||
service_name = kubernetes_service.redis_v2_headless.metadata[0].name
|
||||
replicas = 1
|
||||
# pod_management_policy is immutable on a StatefulSet — kept as "Parallel"
|
||||
# (unchanged from the 3-node era) so this revert does NOT force a
|
||||
# destroy/recreate of the STS (which would detach the data PVC).
|
||||
pod_management_policy = "Parallel"
|
||||
|
||||
selector {
|
||||
match_labels = {
|
||||
app = "redis-v2"
|
||||
}
|
||||
}
|
||||
|
||||
template {
|
||||
metadata {
|
||||
labels = {
|
||||
app = "redis-v2"
|
||||
}
|
||||
annotations = {
|
||||
"prometheus.io/scrape" = "true"
|
||||
"prometheus.io/port" = "9121"
|
||||
"checksum/conf" = sha256(kubernetes_config_map.redis_v2_conf.data["redis.conf"])
|
||||
}
|
||||
}
|
||||
spec {
|
||||
termination_grace_period_seconds = 30
|
||||
|
||||
container {
|
||||
name = "redis"
|
||||
image = "docker.io/library/redis:8-alpine"
|
||||
command = ["redis-server", "/etc/redis/redis.conf"]
|
||||
|
||||
port {
|
||||
container_port = 6379
|
||||
name = "redis"
|
||||
}
|
||||
|
||||
resources {
|
||||
requests = {
|
||||
cpu = "100m"
|
||||
memory = "768Mi"
|
||||
}
|
||||
limits = {
|
||||
memory = "768Mi"
|
||||
}
|
||||
}
|
||||
|
||||
volume_mount {
|
||||
name = "data"
|
||||
mount_path = "/data"
|
||||
}
|
||||
volume_mount {
|
||||
name = "conf"
|
||||
mount_path = "/etc/redis"
|
||||
read_only = true
|
||||
}
|
||||
|
||||
liveness_probe {
|
||||
exec {
|
||||
command = ["redis-cli", "PING"]
|
||||
}
|
||||
initial_delay_seconds = 15
|
||||
period_seconds = 10
|
||||
timeout_seconds = 10
|
||||
failure_threshold = 5
|
||||
}
|
||||
readiness_probe {
|
||||
exec {
|
||||
command = ["redis-cli", "PING"]
|
||||
}
|
||||
initial_delay_seconds = 5
|
||||
period_seconds = 5
|
||||
timeout_seconds = 3
|
||||
failure_threshold = 3
|
||||
}
|
||||
}
|
||||
|
||||
container {
|
||||
name = "exporter"
|
||||
image = "docker.io/oliver006/redis_exporter:v1.62.0"
|
||||
|
||||
port {
|
||||
container_port = 9121
|
||||
name = "exporter"
|
||||
}
|
||||
|
||||
env {
|
||||
name = "REDIS_ADDR"
|
||||
value = "redis://localhost:6379"
|
||||
}
|
||||
|
||||
resources {
|
||||
requests = {
|
||||
cpu = "10m"
|
||||
memory = "32Mi"
|
||||
}
|
||||
limits = {
|
||||
memory = "32Mi"
|
||||
}
|
||||
}
|
||||
|
||||
liveness_probe {
|
||||
http_get {
|
||||
path = "/"
|
||||
port = 9121
|
||||
}
|
||||
initial_delay_seconds = 15
|
||||
period_seconds = 30
|
||||
timeout_seconds = 5
|
||||
}
|
||||
}
|
||||
|
||||
volume {
|
||||
name = "conf"
|
||||
config_map {
|
||||
name = kubernetes_config_map.redis_v2_conf.metadata[0].name
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
volume_claim_template {
|
||||
metadata {
|
||||
name = "data"
|
||||
annotations = {
|
||||
# NOTE: VCT is immutable on a live StatefulSet — this must match the
|
||||
# live value (drifted to 80% out-of-band) or apply fails with
|
||||
# "updates to statefulset spec ... forbidden". Don't "fix" to 10%.
|
||||
"resize.topolvm.io/threshold" = "80%"
|
||||
"resize.topolvm.io/increase" = "100%"
|
||||
"resize.topolvm.io/storage_limit" = "20Gi"
|
||||
}
|
||||
}
|
||||
spec {
|
||||
access_modes = ["ReadWriteOnce"]
|
||||
storage_class_name = "proxmox-lvm-encrypted"
|
||||
resources {
|
||||
requests = {
|
||||
storage = "5Gi"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
lifecycle {
|
||||
# KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
|
||||
ignore_changes = [spec[0].template[0].spec[0].dns_config]
|
||||
}
|
||||
}
|
||||
|
||||
# Weekly backup: copy RDB snapshot to NFS
|
||||
resource "kubernetes_cron_job_v1" "redis-backup" {
|
||||
metadata {
|
||||
name = "redis-backup"
|
||||
namespace = kubernetes_namespace.redis.metadata[0].name
|
||||
}
|
||||
spec {
|
||||
concurrency_policy = "Replace"
|
||||
failed_jobs_history_limit = 3
|
||||
schedule = "0 3 * * 0"
|
||||
starting_deadline_seconds = 10
|
||||
successful_jobs_history_limit = 3
|
||||
job_template {
|
||||
metadata {}
|
||||
spec {
|
||||
backoff_limit = 2
|
||||
ttl_seconds_after_finished = 60
|
||||
template {
|
||||
metadata {}
|
||||
spec {
|
||||
container {
|
||||
name = "redis-backup"
|
||||
image = "redis:7-alpine"
|
||||
command = ["/bin/sh", "-c", <<-EOT
|
||||
set -eux
|
||||
_t0=$(date +%s)
|
||||
_rb0=$(awk '/^read_bytes/{print $2}' /proc/$$/io 2>/dev/null || echo 0)
|
||||
_wb0=$(awk '/^write_bytes/{print $2}' /proc/$$/io 2>/dev/null || echo 0)
|
||||
|
||||
TIMESTAMP=$(date +%Y%m%d-%H%M)
|
||||
# Trigger a fresh RDB save on the master
|
||||
redis-cli -h redis-master.redis BGSAVE
|
||||
sleep 5
|
||||
# Copy the RDB via redis-cli --rdb
|
||||
redis-cli -h redis-master.redis --rdb /backup/redis-$TIMESTAMP.rdb
|
||||
# Rotate — 28-day retention
|
||||
find /backup -name 'redis-*.rdb' -type f -mtime +28 -delete
|
||||
|
||||
_dur=$(($(date +%s) - _t0))
|
||||
_rb1=$(awk '/^read_bytes/{print $2}' /proc/$$/io 2>/dev/null || echo 0)
|
||||
_wb1=$(awk '/^write_bytes/{print $2}' /proc/$$/io 2>/dev/null || echo 0)
|
||||
echo "=== Backup IO Stats ==="
|
||||
echo "duration: $${_dur}s"
|
||||
echo "read: $(( (_rb1 - _rb0) / 1048576 )) MiB"
|
||||
echo "written: $(( (_wb1 - _wb0) / 1048576 )) MiB"
|
||||
echo "output: $(ls -lh /backup/redis-$$TIMESTAMP.rdb | awk '{print $5}')"
|
||||
|
||||
_out_bytes=$(stat -c%s /backup/redis-$TIMESTAMP.rdb)
|
||||
wget -qO- --post-data "backup_duration_seconds $${_dur}
|
||||
backup_read_bytes $(( _rb1 - _rb0 ))
|
||||
backup_written_bytes $(( _wb1 - _wb0 ))
|
||||
backup_output_bytes $${_out_bytes}
|
||||
backup_last_success_timestamp $(date +%s)
|
||||
" "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/redis-backup" || true
|
||||
EOT
|
||||
]
|
||||
volume_mount {
|
||||
name = "backup"
|
||||
mount_path = "/backup"
|
||||
}
|
||||
}
|
||||
volume {
|
||||
name = "backup"
|
||||
persistent_volume_claim {
|
||||
claim_name = module.nfs_backup_host.claim_name
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
lifecycle {
|
||||
# KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
|
||||
ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config]
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue