feat: add per-database backups for PostgreSQL and MySQL

Add separate CronJobs that dump each database individually:
- postgresql-backup-per-db: pg_dump -Fc per DB (daily 00:15)
- mysql-backup-per-db: mysqldump per DB (daily 00:45)

Dumps go to /backup/per-db/<dbname>/ on the same NFS PVC.
Enables single-database restore without affecting other databases.
Also fix CNPG superuser password sync and add --single-transaction and
--set-gtid-purged=OFF to the MySQL per-db dumps.

Updated restore runbooks with per-database restore procedures.

[ci skip]

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Viktor Barzin 2026-04-14 22:39:33 +00:00
parent ca1ae23f34
commit 0256ccdccc
3 changed files with 271 additions and 0 deletions


@@ -96,6 +96,38 @@ kubectl exec -it mysql-cluster-0 -n dbaas -c mysql -- mysqlsh root@localhost --p
kubectl exec -it mysql-cluster-0 -n dbaas -c mysql -- mysqlsh root@localhost --password="$ROOT_PWD" -- cluster rejoinInstance root@mysql-cluster-1:3306
```
## Restore Single Database (from per-db backup)
Per-database backups are stored at `/mnt/main/mysql-backup/per-db/<dbname>/` as gzipped SQL dumps.
### 1. List available per-db backups
```bash
ls -lt /mnt/main/mysql-backup/per-db/<dbname>/
```
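Optionally sanity-check a dump before restoring. A quick sketch (the file name is illustrative; use one from the listing above):
```bash
# Verify the gzip file is intact and peek at the first statements of the dump
gzip -t /mnt/main/mysql-backup/per-db/<dbname>/dump_YYYY_MM_DD_HH_MM.sql.gz
zcat /mnt/main/mysql-backup/per-db/<dbname>/dump_YYYY_MM_DD_HH_MM.sql.gz | head -n 40
```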
### 2. Restore a single database
```bash
# Port-forward to MySQL
kubectl port-forward svc/mysql -n dbaas 3307:3306 &
ROOT_PWD=$(kubectl get secret cluster-secret -n dbaas -o jsonpath='{.data.ROOT_PASSWORD}' | base64 -d)
# Restore single database (this replaces only the target database)
zcat /path/to/per-db/<dbname>/dump_YYYY_MM_DD_HH_MM.sql.gz | mysql -u root -p"$ROOT_PWD" --host 127.0.0.1 --port 3307 <dbname>
```
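Note that the per-database dumps are taken as `mysqldump <dbname>` (without `--databases`), so they contain no `CREATE DATABASE` statement. If the target database was dropped entirely, recreate it before piping in the dump; a minimal sketch:
```bash
# Recreate an empty database so the dump has somewhere to restore into
mysql -u root -p"$ROOT_PWD" --host 127.0.0.1 --port 3307 -e \
  "CREATE DATABASE IF NOT EXISTS <dbname>;"
```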
### 3. Verify
```bash
mysql -u root -p"$ROOT_PWD" --host 127.0.0.1 --port 3307 -e \
"SELECT TABLE_NAME, TABLE_ROWS FROM information_schema.TABLES WHERE TABLE_SCHEMA='<dbname>' ORDER BY TABLE_ROWS DESC LIMIT 10;"
```
### 4. Restart the affected service only
```bash
kubectl rollout restart deployment -n <namespace>
```
**Advantages over full restore**: Only the target database is affected. All other databases continue running with their current data.
## Alternative: Restore from sda Backup
If TrueNAS NFS is unavailable but the PVE host is accessible:


@@ -84,6 +84,43 @@ kubectl rollout restart deployment -n linkwarden
# ... repeat for all PG-dependent services (excluding trading — disabled)
```
## Restore Single Database (from per-db backup)
Per-database backups use `pg_dump -Fc` (custom format) and are stored at `/mnt/main/postgresql-backup/per-db/<dbname>/`.
### 1. List available per-db backups
```bash
ls -lt /mnt/main/postgresql-backup/per-db/<dbname>/
# Or via a pod:
kubectl exec -n dbaas pg-cluster-1 -c postgres -- ls -lt /backup/per-db/<dbname>/ 2>/dev/null || \
echo "Mount a backup pod — see Option A below"
```
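Since these are `pg_dump -Fc` custom-format archives, their contents can be inspected without restoring anything (a quick sketch; the file name is illustrative):
```bash
# Print the archive's table of contents (schemas, tables, sequences, ...)
pg_restore --list /path/to/per-db/<dbname>/dump_YYYY_MM_DD_HH_MM.dump | head -n 40
```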
### 2. Restore a single database
```bash
# Port-forward to the CNPG primary
kubectl port-forward svc/pg-cluster-rw -n dbaas 5433:5432 &
# Restore single database (drops and recreates objects in that DB only)
export PGPASSWORD=$(kubectl get secret pg-cluster-superuser -n dbaas -o jsonpath='{.data.password}' | base64 -d)
pg_restore -h 127.0.0.1 -p 5433 -U postgres -d <dbname> --clean --if-exists \
/path/to/per-db/<dbname>/dump_YYYY_MM_DD_HH_MM.dump
```
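If overwriting the live database in place feels risky, one variation (not part of the runbook; the `<dbname>_restore_check` name is illustrative) is to restore into a throwaway database first and inspect it:
```bash
# Restore into a scratch database for inspection, then drop it when satisfied
createdb -h 127.0.0.1 -p 5433 -U postgres <dbname>_restore_check
pg_restore -h 127.0.0.1 -p 5433 -U postgres -d <dbname>_restore_check \
  /path/to/per-db/<dbname>/dump_YYYY_MM_DD_HH_MM.dump
# (add --no-owner if role ownership errors appear)
dropdb -h 127.0.0.1 -p 5433 -U postgres <dbname>_restore_check
```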
### 3. Verify
```bash
psql -h 127.0.0.1 -p 5433 -U postgres -d <dbname> -c \
"SELECT schemaname, relname, n_live_tup FROM pg_stat_user_tables ORDER BY n_live_tup DESC LIMIT 10;"
```
### 4. Restart the affected service only
```bash
kubectl rollout restart deployment -n <namespace>
```
**Advantages over full restore**: Only the target database is affected. All other databases continue running with their current data.
## Alternative: Restore from sda Backup
If TrueNAS NFS is unavailable but the PVE host is accessible:


@@ -516,6 +516,101 @@ resource "kubernetes_cron_job_v1" "mysql-backup" {
}
}
# Per-database MySQL backups (enables single-database restore without affecting others)
resource "kubernetes_cron_job_v1" "mysql-backup-per-db" {
metadata {
name = "mysql-backup-per-db"
namespace = kubernetes_namespace.dbaas.metadata[0].name
}
spec {
concurrency_policy = "Replace"
failed_jobs_history_limit = 3
schedule = "45 0 * * *"
starting_deadline_seconds = 10
successful_jobs_history_limit = 3
job_template {
metadata {}
spec {
backoff_limit = 3
ttl_seconds_after_finished = 10
template {
metadata {}
spec {
container {
name = "mysql-backup-per-db"
image = "docker.io/library/mysql:8.0"
env {
name = "MYSQL_PWD"
value_from {
secret_key_ref {
name = "cluster-secret"
key = "ROOT_PASSWORD"
}
}
}
command = ["/bin/bash", "-c", <<-EOT
set -euo pipefail
_t0=$(date +%s)
now=$(date +"%Y_%m_%d_%H_%M")
MYSQL_HOST=mysql.dbaas.svc.cluster.local
failed=0
total=0
ok=0
# Discover all user databases
dbs=$(mysql -u root --host $MYSQL_HOST -N -e \
"SELECT schema_name FROM information_schema.schemata WHERE schema_name NOT IN ('mysql','information_schema','performance_schema','sys','mysql_innodb_cluster_metadata');")
for db in $dbs; do
total=$((total + 1))
mkdir -p /backup/per-db/$db
echo "=== Backing up $db ==="
if mysqldump -u root --host $MYSQL_HOST --single-transaction --set-gtid-purged=OFF "$db" | gzip -9 > "/backup/per-db/$db/dump_$now.sql.gz"; then
_size=$(stat -c%s "/backup/per-db/$db/dump_$now.sql.gz")
echo " OK — $(( _size / 1024 )) KiB"
ok=$((ok + 1))
else
echo " FAILED"
rm -f "/backup/per-db/$db/dump_$now.sql.gz"
failed=$((failed + 1))
fi
done
# Rotation: delete per-database dumps older than 14 days
find /backup/per-db -name "dump_*.sql.gz" -type f -mtime +14 -delete
_dur=$(($(date +%s) - _t0))
echo "=== Per-DB Backup Summary ==="
echo "databases: $total (ok: $ok, failed: $failed)"
echo "duration: $${_dur}s"
curl -sf --data-binary @- "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/mysql-backup-per-db" <<PGEOF || true
backup_duration_seconds $${_dur}
backup_databases_total $total
backup_databases_ok $ok
backup_databases_failed $failed
backup_last_success_timestamp $(date +%s)
PGEOF
EOT
]
volume_mount {
name = "mysql-backup"
mount_path = "/backup"
}
}
volume {
name = "mysql-backup"
persistent_volume_claim {
claim_name = module.nfs_mysql_backup_host.claim_name
}
}
}
}
}
}
}
}
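# The metrics pushed above (backup_last_success_timestamp, backup_databases_failed, ...)
# can back a staleness/failure alert; roughly something like the PromQL below,
# though the exact labels depend on how Prometheus scrapes the pushgateway:
#   time() - backup_last_success_timestamp{job="mysql-backup-per-db"} > 2 * 86400
#   backup_databases_failed{job="mysql-backup-per-db"} > 0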
# resource "kubernetes_persistent_volume" "mysql" {
# metadata {
# name = "mysql-pv"
@@ -1283,3 +1378,110 @@ resource "kubernetes_cron_job_v1" "postgresql-backup" {
}
}
}
# Per-database PostgreSQL backups (enables single-database restore without affecting others)
resource "kubernetes_cron_job_v1" "postgresql-backup-per-db" {
metadata {
name = "postgresql-backup-per-db"
namespace = kubernetes_namespace.dbaas.metadata[0].name
}
spec {
concurrency_policy = "Replace"
failed_jobs_history_limit = 3
schedule = "15 0 * * *"
starting_deadline_seconds = 10
successful_jobs_history_limit = 3
job_template {
metadata {}
spec {
backoff_limit = 3
ttl_seconds_after_finished = 10
template {
metadata {}
spec {
container {
name = "postgresql-backup-per-db"
image = "docker.io/library/postgres:16.4-bullseye"
env {
name = "PGPASSWORD"
value_from {
secret_key_ref {
name = "pg-cluster-superuser"
key = "password"
}
}
}
command = ["/bin/bash", "-c", <<-EOT
set -euo pipefail
apt-get update -qq && apt-get install -yqq curl >/dev/null 2>&1 || true
_t0=$(date +%s)
now=$(date +"%Y_%m_%d_%H_%M")
PGHOST=pg-cluster-rw.dbaas
PGUSER=postgres
failed=0
total=0
ok=0
# Discover all user databases
dbs=$(PGPASSWORD=$PGPASSWORD psql -h $PGHOST -U $PGUSER -t -A -c \
"SELECT datname FROM pg_database WHERE datistemplate = false AND datname != 'postgres' ORDER BY datname;")
for db in $dbs; do
total=$((total + 1))
mkdir -p /backup/per-db/$db
echo "=== Backing up $db ==="
if PGPASSWORD=$PGPASSWORD pg_dump -Fc -h $PGHOST -U $PGUSER "$db" > "/backup/per-db/$db/dump_$now.dump"; then
_size=$(stat -c%s "/backup/per-db/$db/dump_$now.dump")
echo " OK — $(( _size / 1024 )) KiB"
ok=$((ok + 1))
else
echo " FAILED"
rm -f "/backup/per-db/$db/dump_$now.dump"
failed=$((failed + 1))
fi
done
# Rotation: delete per-database dumps older than 14 days
find /backup/per-db -name "dump_*.dump" -type f -mtime +14 -delete
_dur=$(($(date +%s) - _t0))
echo "=== Per-DB Backup Summary ==="
echo "databases: $total (ok: $ok, failed: $failed)"
echo "duration: $${_dur}s"
curl -sf --data-binary @- "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/postgresql-backup-per-db" <<PGEOF || true
backup_duration_seconds $${_dur}
backup_databases_total $total
backup_databases_ok $ok
backup_databases_failed $failed
backup_last_success_timestamp $(date +%s)
PGEOF
EOT
]
volume_mount {
name = "postgresql-backup"
mount_path = "/backup"
}
resources {
requests = {
memory = "256Mi"
cpu = "50m"
}
limits = {
memory = "512Mi"
}
}
}
volume {
name = "postgresql-backup"
persistent_volume_claim {
claim_name = module.nfs_postgresql_backup_host.claim_name
}
}
}
}
}
}
}
}
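# The new per-db CronJobs can be exercised without waiting for their schedules by
# creating one-off Jobs from them (a sketch, assuming kubectl access to the dbaas namespace):
#   kubectl create job --from=cronjob/postgresql-backup-per-db pg-per-db-manual -n dbaas
#   kubectl create job --from=cronjob/mysql-backup-per-db mysql-per-db-manual -n dbaas
#   kubectl logs -f job/pg-per-db-manual -n dbaas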