harden vaultwarden iSCSI storage and increase backup frequency

- Increase backup frequency from daily to every 6 hours (0 */6 * * *)
- Add pre/post-flight SQLite integrity checks to backup job
- Harden iSCSI on all nodes: increase recovery timeout (300s),
  enable CRC32C data/header digests for bit-flip detection
- Fix restore runbook PVC name (vaultwarden-data-iscsi)

Motivated by SQLite corruption from iSCSI I/O errors.
This commit is contained in:
Viktor Barzin 2026-03-23 00:36:11 +02:00
parent 469fcb12b5
commit a44f35bcf8
4 changed files with 41 additions and 6 deletions

View file

@@ -9,7 +9,8 @@
- Each backup contains: `db.sqlite3`, `rsa_key.pem`, `rsa_key.pub.pem`, `attachments/`, `sends/`, `config.json`
- Replicated to Synology NAS (192.168.1.13) via TrueNAS ZFS replication
- Retention: 30 days
- Schedule: Daily at 00:00
- Schedule: Every 6 hours (00:00, 06:00, 12:00, 18:00)
- Integrity check: Both source and backup are verified before/after each backup
## Backup Contents
| File | Purpose | Critical? |
@@ -40,7 +41,7 @@ kubectl scale deployment vaultwarden -n vaultwarden --replicas=0
BACKUP_DIR="YYYY_MM_DD_HH_MM" # Set to desired backup
kubectl run vw-restore --rm -it --image=alpine \
--overrides='{"spec":{"volumes":[{"name":"backup","persistentVolumeClaim":{"claimName":"vaultwarden-backup"}},{"name":"data","persistentVolumeClaim":{"claimName":"vaultwarden-data"}}],"containers":[{"name":"vw-restore","image":"alpine","volumeMounts":[{"name":"backup","mountPath":"/backup"},{"name":"data","mountPath":"/data"}],"command":["/bin/sh","-c","cp /backup/'$BACKUP_DIR'/db.sqlite3 /data/db.sqlite3 && cp /backup/'$BACKUP_DIR'/rsa_key.pem /data/ && cp /backup/'$BACKUP_DIR'/rsa_key.pub.pem /data/ && cp -a /backup/'$BACKUP_DIR'/attachments /data/ 2>/dev/null; echo Restore complete"]}]}}' \
--overrides='{"spec":{"volumes":[{"name":"backup","persistentVolumeClaim":{"claimName":"vaultwarden-backup"}},{"name":"data","persistentVolumeClaim":{"claimName":"vaultwarden-data-iscsi"}}],"containers":[{"name":"vw-restore","image":"alpine","volumeMounts":[{"name":"backup","mountPath":"/backup"},{"name":"data","mountPath":"/data"}],"command":["/bin/sh","-c","cp /backup/'$BACKUP_DIR'/db.sqlite3 /data/db.sqlite3 && cp /backup/'$BACKUP_DIR'/rsa_key.pem /data/ && cp /backup/'$BACKUP_DIR'/rsa_key.pub.pem /data/ && cp -a /backup/'$BACKUP_DIR'/attachments /data/ 2>/dev/null; echo Restore complete"]}]}}' \
-n vaultwarden
```

View file

@@ -67,6 +67,18 @@ runcmd:
- ${containerd_config_update_command}
- systemctl restart containerd
- systemctl enable --now iscsid
# Harden iSCSI: increase recovery timeout (300s vs 120s default) and enable
# CRC32C data/header digests to detect bit flips over the network.
# Prevents SQLite corruption from transient iSCSI session drops.
- sed -i 's/^node.session.timeo.replacement_timeout = .*/node.session.timeo.replacement_timeout = 300/' /etc/iscsi/iscsid.conf
- sed -i 's/^node.conn\[0\].timeo.noop_out_interval = .*/node.conn[0].timeo.noop_out_interval = 10/' /etc/iscsi/iscsid.conf
- sed -i 's/^node.conn\[0\].timeo.noop_out_timeout = .*/node.conn[0].timeo.noop_out_timeout = 15/' /etc/iscsi/iscsid.conf
- |
if ! grep -q '^node.conn\[0\].iscsi.HeaderDigest' /etc/iscsi/iscsid.conf; then
echo 'node.conn[0].iscsi.HeaderDigest = CRC32C,None' >> /etc/iscsi/iscsid.conf
echo 'node.conn[0].iscsi.DataDigest = CRC32C,None' >> /etc/iscsi/iscsid.conf
fi
- systemctl restart iscsid
# Create /sentinel directory for kured reboot gating (sentinel gate DaemonSet)
- mkdir -p /sentinel
# Create 4Gi swap file for worker node memory pressure relief (NOT for master — etcd is latency-critical)

View file

@@ -199,7 +199,7 @@ module "ingress" {
}
# -----------------------------------------------------------------------------
# Backup Daily SQLite + data files to NFS
# Backup Every 6h SQLite + data files to NFS
# -----------------------------------------------------------------------------
module "nfs_vaultwarden_backup" {
@@ -218,7 +218,7 @@ resource "kubernetes_cron_job_v1" "vaultwarden-backup" {
spec {
concurrency_policy = "Replace"
failed_jobs_history_limit = 5
schedule = "0 0 * * *"
schedule = "0 */6 * * *"
starting_deadline_seconds = 10
successful_jobs_history_limit = 10
job_template {
@@ -248,9 +248,20 @@ resource "kubernetes_cron_job_v1" "vaultwarden-backup" {
set -euxo pipefail
apk add --no-cache sqlite
now=$(date +"%Y_%m_%d_%H_%M")
# Pre-flight: verify source DB is healthy before backing up
if ! sqlite3 /data/db.sqlite3 "PRAGMA integrity_check;" | grep -q "^ok$"; then
echo "ERROR: source database failed integrity check, skipping backup"
exit 1
fi
mkdir -p /backup/$now
# Safe SQLite backup (handles WAL/locks)
sqlite3 /data/db.sqlite3 ".backup /backup/$now/db.sqlite3"
# Verify the backup copy is also healthy
if ! sqlite3 /backup/$now/db.sqlite3 "PRAGMA integrity_check;" | grep -q "^ok$"; then
echo "ERROR: backup copy failed integrity check, removing"
rm -rf /backup/$now
exit 1
fi
# Copy RSA keys, attachments, sends, config
cp -a /data/rsa_key.pem /data/rsa_key.pub.pem /backup/$now/ 2>/dev/null || true
cp -a /data/attachments /backup/$now/ 2>/dev/null || true

View file

@@ -199,7 +199,7 @@ module "ingress" {
}
# -----------------------------------------------------------------------------
# Backup Daily SQLite + data files to NFS
# Backup Every 6h SQLite + data files to NFS
# -----------------------------------------------------------------------------
module "nfs_vaultwarden_backup" {
@@ -218,7 +218,7 @@ resource "kubernetes_cron_job_v1" "vaultwarden-backup" {
spec {
concurrency_policy = "Replace"
failed_jobs_history_limit = 5
schedule = "0 0 * * *"
schedule = "0 */6 * * *"
starting_deadline_seconds = 10
successful_jobs_history_limit = 10
job_template {
@@ -248,9 +248,20 @@ resource "kubernetes_cron_job_v1" "vaultwarden-backup" {
set -euxo pipefail
apk add --no-cache sqlite
now=$(date +"%Y_%m_%d_%H_%M")
# Pre-flight: verify source DB is healthy before backing up
if ! sqlite3 /data/db.sqlite3 "PRAGMA integrity_check;" | grep -q "^ok$"; then
echo "ERROR: source database failed integrity check, skipping backup"
exit 1
fi
mkdir -p /backup/$now
# Safe SQLite backup (handles WAL/locks)
sqlite3 /data/db.sqlite3 ".backup /backup/$now/db.sqlite3"
# Verify the backup copy is also healthy
if ! sqlite3 /backup/$now/db.sqlite3 "PRAGMA integrity_check;" | grep -q "^ok$"; then
echo "ERROR: backup copy failed integrity check, removing"
rm -rf /backup/$now
exit 1
fi
# Copy RSA keys, attachments, sends, config
cp -a /data/rsa_key.pem /data/rsa_key.pub.pem /backup/$now/ 2>/dev/null || true
cp -a /data/attachments /backup/$now/ 2>/dev/null || true