From 6cfc4b78362a66bf4c04d40491a89f1217249974 Mon Sep 17 00:00:00 2001
From: Viktor Barzin
Date: Sun, 19 Apr 2026 00:10:35 +0000
Subject: [PATCH] [mailserver] Add backup CronJob for Roundcube html + enigma PVCs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Context

Roundcube webmail runs with two encrypted RWO PVCs (see roundcubemail.tf:
`roundcubemail-html-encrypted`, `roundcubemail-enigma-encrypted`). These
carry user-visible state that is NOT regenerable without user action:

- `html` PVC → Apache docroot, plugin installs, skin overrides, session
  artefacts (two_factor_webauthn keys, persistent_login tokens, rcguard
  throttle state)
- `enigma` PVC → user-uploaded PGP private keyrings

Per the subdir CLAUDE.md "Storage & Backup Architecture" rule every
proxmox-lvm* PVC MUST have a backup CronJob writing to NFS
`/mnt/main/<app>-backup/`. Mailserver already complies via code-z26's
`mailserver-backup` CronJob; Roundcube does not. Losing either Roundcube
PVC means users must re-add 2FA devices, re-install plugins, and re-import
PGP keys — none of it recoverable from a database dump.

Target task: `code-1f6`.

## This change

- Adds `module.nfs_roundcube_backup_host` sourcing
  `modules/kubernetes/nfs_volume` pointed at `/srv/nfs/roundcube-backup` on
  the Proxmox host (NFSv4, inotify change-tracker picks it up for Synology
  offsite).
- Adds `kubernetes_cron_job_v1.roundcube-backup`:
  - Schedule `10 3 * * *` — 10 minutes after `mailserver-backup`
    (`0 3 * * *`) to avoid NFS write-window contention. Roundcube PVCs are
    tiny (<200 MiB combined on current cluster) so the window is well under
    10 min.
  - `pod_affinity` on `app=roundcubemail` (Roundcube runs 1 replica with
    `Recreate` strategy on a fresh node per pod; the backup pod must
    co-locate because both PVCs are RWO).
  - `rsync -aH --delete --link-dest=/backup/<prev-week>` into
    `/backup/<YYYY-WW>/{html,enigma}/` — hardlinks unchanged files vs the
    previous weekly snapshot, keeping storage cost ~= delta only.
  - Weekly rotation retains 8 snapshots (~2 months), matching
    `mailserver-backup`.
  - Pushgateway metrics under `job=roundcube-backup` so existing
    `BackupDurationHigh` / `BackupStale` alert patterns detect regressions
    without extra wiring.
  - `KYVERNO_LIFECYCLE_V1` `ignore_changes` for mutated `dns_config`.

## Layout

```
NFS server 192.168.1.127:/srv/nfs/
├── mailserver-backup/    (0 3 * * * — code-z26)
│   └── <YYYY-WW>/{data,state,log}/
└── roundcube-backup/     (10 3 * * * — this change)
    └── <YYYY-WW>/{html,enigma}/
```

## What is NOT in this change

- Changing the mailserver-backup CronJob to also cover Roundcube. Two
  separate CronJobs keep the concerns (and pod anti-affinity/affinity)
  clean; the 10-min stagger eliminates the contention justification for
  merging them.
- Retention alerting tuning — existing Pushgateway/Prometheus rule
  ecosystem suffices for now.
- Restore tooling — follows the standard pattern in `docs/runbooks/`
  (rsync back, fix perms).

## Reproduce locally

1. Plan: `cd stacks/mailserver && scripts/tg plan -lock=false` → 2 new
   resources (nfs_volume module + CronJob).
2. Apply, then trigger a one-shot run:
   `kubectl -n mailserver create job --from=cronjob/roundcube-backup roundcube-backup-manual-1`
3. Expected on success:
   - `kubectl -n mailserver logs job/roundcube-backup-manual-1` →
     "=== Backup IO Stats ===".
   - On Proxmox host: `ls /srv/nfs/roundcube-backup/$(date +%Y-%W)/` →
     `html`, `enigma`.
   - `/mnt/backup/.nfs-changes.log` (Proxmox) lists fresh paths under
     `roundcube-backup/` within ~1s of the rsync finishing.
   - Pushgateway:
     `curl -s prometheus-prometheus-pushgateway.monitoring:9091/metrics | grep roundcube`
     shows `backup_duration_seconds`, `backup_last_success_timestamp`.

## Automated

- `terraform fmt -check -recursive stacks/mailserver/modules/mailserver/` →
  clean.
- `scripts/tg plan -lock=false` in stacks/mailserver expected to show
  `+ module.nfs_roundcube_backup_host.*`,
  `+ kubernetes_cron_job_v1.roundcube-backup`.

Closes: code-1f6
Co-Authored-By: Claude Opus 4.7 (1M context)
---
 stacks/mailserver/modules/mailserver/main.tf | 161 +++++++++++++++++++
 1 file changed, 161 insertions(+)

diff --git a/stacks/mailserver/modules/mailserver/main.tf b/stacks/mailserver/modules/mailserver/main.tf
index f4ee2c8a..d85b0fdd 100644
--- a/stacks/mailserver/modules/mailserver/main.tf
+++ b/stacks/mailserver/modules/mailserver/main.tf
@@ -974,3 +974,164 @@ resource "kubernetes_cron_job_v1" "mailserver-backup" {
   }
 }
 
+# =============================================================================
+# Roundcube Backup — Daily rsync of html + enigma PVCs to NFS
+# Roundcube uses two encrypted RWO PVCs (see roundcubemail.tf):
+#   - roundcubemail-html-encrypted   → /var/www/html (plugins, user sessions, skin overrides)
+#   - roundcubemail-enigma-encrypted → /var/roundcube/enigma (user-uploaded PGP keys)
+# Losing either one = users lose plugin state + have to re-import PGP keys.
+# Mirrors the mailserver-backup pattern but:
+#   - pod_affinity targets app=roundcubemail (both PVCs attach to the
+#     Roundcube pod, not mailserver)
+#   - schedule offset by +10m (03:10) so two NFS-writers don't overlap
+#   - writes to /srv/nfs/roundcube-backup/<YYYY-WW>/{html,enigma}/
+# =============================================================================
+module "nfs_roundcube_backup_host" {
+  source     = "../../../../modules/kubernetes/nfs_volume"
+  name       = "roundcube-backup-host"
+  namespace  = kubernetes_namespace.mailserver.metadata[0].name
+  nfs_server = var.nfs_server
+  nfs_path   = "/srv/nfs/roundcube-backup"
+}
+
+resource "kubernetes_cron_job_v1" "roundcube-backup" {
+  metadata {
+    name      = "roundcube-backup"
+    namespace = kubernetes_namespace.mailserver.metadata[0].name
+  }
+  spec {
+    concurrency_policy        = "Replace"
+    failed_jobs_history_limit = 5
+    # +10 min offset vs mailserver-backup (03:00) to avoid NFS contention.
+    schedule                      = "10 3 * * *"
+    starting_deadline_seconds     = 10
+    successful_jobs_history_limit = 10
+    job_template {
+      metadata {}
+      spec {
+        backoff_limit              = 3
+        ttl_seconds_after_finished = 10
+        template {
+          metadata {}
+          spec {
+            # RWO co-location: Roundcube PVCs are ReadWriteOnce; the backup
+            # pod must land on the same node as the Roundcube pod (single
+            # replica, Recreate strategy — see roundcubemail.tf).
+            affinity {
+              pod_affinity {
+                required_during_scheduling_ignored_during_execution {
+                  label_selector {
+                    match_labels = {
+                      app = "roundcubemail"
+                    }
+                  }
+                  topology_key = "kubernetes.io/hostname"
+                }
+              }
+            }
+            container {
+              name  = "roundcube-backup"
+              image = "docker.io/library/alpine"
+              command = ["/bin/sh", "-c", <<-EOT
+                set -euxo pipefail
+                apk add --no-cache rsync
+                _t0=$(date +%s)
+                _rb0=$(awk '/^read_bytes/{print $2}' /proc/$$/io 2>/dev/null || echo 0)
+                _wb0=$(awk '/^write_bytes/{print $2}' /proc/$$/io 2>/dev/null || echo 0)
+
+                week=$(date +"%Y-%W")
+                # BusyBox date (Alpine) rejects relative strings such as "-7 days", which
+                # silently disabled --link-dest; use an epoch offset (GNU date accepts @N too).
+                prev_week=$(date -d "@$(( _t0 - 604800 ))" +"%Y-%W" 2>/dev/null || echo "")
+                dst=/backup/$week
+                mkdir -p "$dst"
+
+                # Use --link-dest against previous week for space-efficient
+                # incrementals (unchanged files are hardlinked, not re-copied).
+                link_dest_arg=""
+                if [ -n "$prev_week" ] && [ -d "/backup/$prev_week" ]; then
+                  link_dest_arg="--link-dest=/backup/$prev_week"
+                fi
+
+                # Roundcube data layout (from deployment volume mounts in roundcubemail.tf):
+                #   /src/html   -> roundcubemail-html-encrypted (html PVC)
+                #   /src/enigma -> roundcubemail-enigma-encrypted (enigma PVC, PGP keys)
+                for src in /src/html /src/enigma; do
+                  [ -d "$src" ] || { echo "SKIP missing $src"; continue; }
+                  name=$(basename "$src")
+                  rsync -aH --delete $link_dest_arg "$src/" "$dst/$name/"
+                done
+
+                # Rotate — keep 8 weekly snapshots (~2 months)
+                find /backup -maxdepth 1 -mindepth 1 -type d -regex '.*/[0-9]+-[0-9]+$' | sort | head -n -8 | xargs -r rm -rf
+
+                _dur=$(($(date +%s) - _t0))
+                _rb1=$(awk '/^read_bytes/{print $2}' /proc/$$/io 2>/dev/null || echo 0)
+                _wb1=$(awk '/^write_bytes/{print $2}' /proc/$$/io 2>/dev/null || echo 0)
+                echo "=== Backup IO Stats ==="
+                echo "duration: $${_dur}s"
+                echo "read:    $(( (_rb1 - _rb0) / 1048576 )) MiB"
+                echo "written: $(( (_wb1 - _wb0) / 1048576 )) MiB"
+                echo "output:  $(du -sh "$dst" | awk '{print $1}')"
+
+                _out_bytes=$(du -sb "$dst" | awk '{print $1}')
+                wget -qO- --post-data "backup_duration_seconds $${_dur}
+                backup_read_bytes $(( _rb1 - _rb0 ))
+                backup_written_bytes $(( _wb1 - _wb0 ))
+                backup_output_bytes $${_out_bytes}
+                backup_last_success_timestamp $(date +%s)
+                " "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/roundcube-backup" || true
+              EOT
+              ]
+              volume_mount {
+                name       = "html"
+                mount_path = "/src/html"
+                read_only  = true
+              }
+              volume_mount {
+                name       = "enigma"
+                mount_path = "/src/enigma"
+                read_only  = true
+              }
+              volume_mount {
+                name       = "backup"
+                mount_path = "/backup"
+              }
+            }
+            volume {
+              name = "html"
+              persistent_volume_claim {
+                claim_name = kubernetes_persistent_volume_claim.roundcube_html_encrypted.metadata[0].name
+                read_only  = true
+              }
+            }
+            volume {
+              name = "enigma"
+              persistent_volume_claim {
+                claim_name = kubernetes_persistent_volume_claim.roundcube_enigma_encrypted.metadata[0].name
+                read_only  = true
+              }
+            }
+            volume {
+              name = "backup"
+              persistent_volume_claim {
+                claim_name = module.nfs_roundcube_backup_host.claim_name
+              }
+            }
+            dns_config {
+              option {
+                name  = "ndots"
+                value = "2"
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  lifecycle {
+    # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
+    ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config]
+  }
+}
+