diff --git a/docs/runbooks/restore-vaultwarden.md b/docs/runbooks/restore-vaultwarden.md
index 7786b770..75ce81ff 100644
--- a/docs/runbooks/restore-vaultwarden.md
+++ b/docs/runbooks/restore-vaultwarden.md
@@ -9,7 +9,8 @@
 - Each backup contains: `db.sqlite3`, `rsa_key.pem`, `rsa_key.pub.pem`, `attachments/`, `sends/`, `config.json`
 - Replicated to Synology NAS (192.168.1.13) via TrueNAS ZFS replication
 - Retention: 30 days
-- Schedule: Daily at 00:00
+- Schedule: Every 6 hours (00:00, 06:00, 12:00, 18:00)
+- Integrity check: Both source and backup are verified before/after each backup
 
 ## Backup Contents
 | File | Purpose | Critical? |
@@ -40,7 +41,7 @@ kubectl scale deployment vaultwarden -n vaultwarden --replicas=0
 
 BACKUP_DIR="YYYY_MM_DD_HH_MM" # Set to desired backup
 kubectl run vw-restore --rm -it --image=alpine \
-  --overrides='{"spec":{"volumes":[{"name":"backup","persistentVolumeClaim":{"claimName":"vaultwarden-backup"}},{"name":"data","persistentVolumeClaim":{"claimName":"vaultwarden-data"}}],"containers":[{"name":"vw-restore","image":"alpine","volumeMounts":[{"name":"backup","mountPath":"/backup"},{"name":"data","mountPath":"/data"}],"command":["/bin/sh","-c","cp /backup/'$BACKUP_DIR'/db.sqlite3 /data/db.sqlite3 && cp /backup/'$BACKUP_DIR'/rsa_key.pem /data/ && cp /backup/'$BACKUP_DIR'/rsa_key.pub.pem /data/ && cp -a /backup/'$BACKUP_DIR'/attachments /data/ 2>/dev/null; echo Restore complete"]}]}}' \
+  --overrides='{"spec":{"volumes":[{"name":"backup","persistentVolumeClaim":{"claimName":"vaultwarden-backup"}},{"name":"data","persistentVolumeClaim":{"claimName":"vaultwarden-data-iscsi"}}],"containers":[{"name":"vw-restore","image":"alpine","volumeMounts":[{"name":"backup","mountPath":"/backup"},{"name":"data","mountPath":"/data"}],"command":["/bin/sh","-c","cp /backup/'$BACKUP_DIR'/db.sqlite3 /data/db.sqlite3 && cp /backup/'$BACKUP_DIR'/rsa_key.pem /data/ && cp /backup/'$BACKUP_DIR'/rsa_key.pub.pem /data/ && cp -a /backup/'$BACKUP_DIR'/attachments /data/ 2>/dev/null; echo Restore complete"]}]}}' \
   -n vaultwarden
 ```
 
diff --git a/modules/create-template-vm/cloud_init.yaml b/modules/create-template-vm/cloud_init.yaml
index 82a57b52..028af5a2 100644
--- a/modules/create-template-vm/cloud_init.yaml
+++ b/modules/create-template-vm/cloud_init.yaml
@@ -67,6 +67,18 @@ runcmd:
   - ${containerd_config_update_command}
   - systemctl restart containerd
   - systemctl enable --now iscsid
+  # Harden iSCSI: increase recovery timeout (300s vs 120s default) and enable
+  # CRC32C data/header digests to detect bit flips over the network.
+  # Prevents SQLite corruption from transient iSCSI session drops.
+  - sed -i 's/^node.session.timeo.replacement_timeout = .*/node.session.timeo.replacement_timeout = 300/' /etc/iscsi/iscsid.conf
+  - sed -i 's/^node.conn\[0\].timeo.noop_out_interval = .*/node.conn[0].timeo.noop_out_interval = 10/' /etc/iscsi/iscsid.conf
+  - sed -i 's/^node.conn\[0\].timeo.noop_out_timeout = .*/node.conn[0].timeo.noop_out_timeout = 15/' /etc/iscsi/iscsid.conf
+  - |
+    if ! grep -q '^node.conn\[0\].iscsi.HeaderDigest' /etc/iscsi/iscsid.conf; then
+      echo 'node.conn[0].iscsi.HeaderDigest = CRC32C,None' >> /etc/iscsi/iscsid.conf
+      echo 'node.conn[0].iscsi.DataDigest = CRC32C,None' >> /etc/iscsi/iscsid.conf
+    fi
+  - systemctl restart iscsid
   # Create /sentinel directory for kured reboot gating (sentinel gate DaemonSet)
   - mkdir -p /sentinel
   # Create 4Gi swap file for worker node memory pressure relief (NOT for master — etcd is latency-critical)
diff --git a/stacks/platform/modules/vaultwarden/main.tf b/stacks/platform/modules/vaultwarden/main.tf
index cc4a9723..7de11e85 100644
--- a/stacks/platform/modules/vaultwarden/main.tf
+++ b/stacks/platform/modules/vaultwarden/main.tf
@@ -199,7 +199,7 @@ module "ingress" {
 }
 
 # -----------------------------------------------------------------------------
-# Backup — Daily SQLite + data files to NFS
+# Backup — Every 6h SQLite + data files to NFS
 # -----------------------------------------------------------------------------
 
 module "nfs_vaultwarden_backup" {
@@ -218,7 +218,7 @@ resource "kubernetes_cron_job_v1" "vaultwarden-backup" {
   spec {
     concurrency_policy        = "Replace"
     failed_jobs_history_limit = 5
-    schedule                  = "0 0 * * *"
+    schedule                  = "0 */6 * * *"
     starting_deadline_seconds = 10
     successful_jobs_history_limit = 10
     job_template {
@@ -248,9 +248,20 @@
             set -euxo pipefail
             apk add --no-cache sqlite
             now=$(date +"%Y_%m_%d_%H_%M")
+            # Pre-flight: verify source DB is healthy before backing up
+            if ! sqlite3 /data/db.sqlite3 "PRAGMA integrity_check;" | grep -q "^ok$"; then
+              echo "ERROR: source database failed integrity check, skipping backup"
+              exit 1
+            fi
             mkdir -p /backup/$now
             # Safe SQLite backup (handles WAL/locks)
             sqlite3 /data/db.sqlite3 ".backup /backup/$now/db.sqlite3"
+            # Verify the backup copy is also healthy
+            if ! sqlite3 /backup/$now/db.sqlite3 "PRAGMA integrity_check;" | grep -q "^ok$"; then
+              echo "ERROR: backup copy failed integrity check, removing"
+              rm -rf /backup/$now
+              exit 1
+            fi
             # Copy RSA keys, attachments, sends, config
             cp -a /data/rsa_key.pem /data/rsa_key.pub.pem /backup/$now/ 2>/dev/null || true
             cp -a /data/attachments /backup/$now/ 2>/dev/null || true
diff --git a/stacks/vaultwarden/modules/vaultwarden/main.tf b/stacks/vaultwarden/modules/vaultwarden/main.tf
index cc4a9723..7de11e85 100644
--- a/stacks/vaultwarden/modules/vaultwarden/main.tf
+++ b/stacks/vaultwarden/modules/vaultwarden/main.tf
@@ -199,7 +199,7 @@ module "ingress" {
 }
 
 # -----------------------------------------------------------------------------
-# Backup — Daily SQLite + data files to NFS
+# Backup — Every 6h SQLite + data files to NFS
 # -----------------------------------------------------------------------------
 
 module "nfs_vaultwarden_backup" {
@@ -218,7 +218,7 @@ resource "kubernetes_cron_job_v1" "vaultwarden-backup" {
   spec {
     concurrency_policy        = "Replace"
     failed_jobs_history_limit = 5
-    schedule                  = "0 0 * * *"
+    schedule                  = "0 */6 * * *"
     starting_deadline_seconds = 10
     successful_jobs_history_limit = 10
     job_template {
@@ -248,9 +248,20 @@
             set -euxo pipefail
             apk add --no-cache sqlite
             now=$(date +"%Y_%m_%d_%H_%M")
+            # Pre-flight: verify source DB is healthy before backing up
+            if ! sqlite3 /data/db.sqlite3 "PRAGMA integrity_check;" | grep -q "^ok$"; then
+              echo "ERROR: source database failed integrity check, skipping backup"
+              exit 1
+            fi
             mkdir -p /backup/$now
             # Safe SQLite backup (handles WAL/locks)
             sqlite3 /data/db.sqlite3 ".backup /backup/$now/db.sqlite3"
+            # Verify the backup copy is also healthy
+            if ! sqlite3 /backup/$now/db.sqlite3 "PRAGMA integrity_check;" | grep -q "^ok$"; then
+              echo "ERROR: backup copy failed integrity check, removing"
+              rm -rf /backup/$now
+              exit 1
+            fi
             # Copy RSA keys, attachments, sends, config
             cp -a /data/rsa_key.pem /data/rsa_key.pub.pem /backup/$now/ 2>/dev/null || true
             cp -a /data/attachments /backup/$now/ 2>/dev/null || true