harden vaultwarden iSCSI storage and increase backup frequency

- Increase backup frequency from daily to every 6 hours (0 */6 * * *)
- Add pre/post-flight SQLite integrity checks to backup job
- Harden iSCSI on all nodes: increase recovery timeout (300s),
  enable CRC32C data/header digests for bit-flip detection
- Fix restore runbook PVC name (vaultwarden-data-iscsi)

Motivated by SQLite corruption from iSCSI I/O errors.
This commit is contained in:
Viktor Barzin 2026-03-23 00:36:11 +02:00
parent 469fcb12b5
commit a44f35bcf8
4 changed files with 41 additions and 6 deletions

View file

@@ -9,7 +9,8 @@
- Each backup contains: `db.sqlite3`, `rsa_key.pem`, `rsa_key.pub.pem`, `attachments/`, `sends/`, `config.json`
- Replicated to Synology NAS (192.168.1.13) via TrueNAS ZFS replication
- Retention: 30 days
- Schedule: Daily at 00:00
- Schedule: Every 6 hours (00:00, 06:00, 12:00, 18:00)
- Integrity check: Both source and backup are verified before/after each backup
## Backup Contents
| File | Purpose | Critical? |
@@ -40,7 +41,7 @@ kubectl scale deployment vaultwarden -n vaultwarden --replicas=0
BACKUP_DIR="YYYY_MM_DD_HH_MM" # Set to desired backup
kubectl run vw-restore --rm -it --image=alpine \
--overrides='{"spec":{"volumes":[{"name":"backup","persistentVolumeClaim":{"claimName":"vaultwarden-backup"}},{"name":"data","persistentVolumeClaim":{"claimName":"vaultwarden-data"}}],"containers":[{"name":"vw-restore","image":"alpine","volumeMounts":[{"name":"backup","mountPath":"/backup"},{"name":"data","mountPath":"/data"}],"command":["/bin/sh","-c","cp /backup/'$BACKUP_DIR'/db.sqlite3 /data/db.sqlite3 && cp /backup/'$BACKUP_DIR'/rsa_key.pem /data/ && cp /backup/'$BACKUP_DIR'/rsa_key.pub.pem /data/ && cp -a /backup/'$BACKUP_DIR'/attachments /data/ 2>/dev/null; echo Restore complete"]}]}}' \
--overrides='{"spec":{"volumes":[{"name":"backup","persistentVolumeClaim":{"claimName":"vaultwarden-backup"}},{"name":"data","persistentVolumeClaim":{"claimName":"vaultwarden-data-iscsi"}}],"containers":[{"name":"vw-restore","image":"alpine","volumeMounts":[{"name":"backup","mountPath":"/backup"},{"name":"data","mountPath":"/data"}],"command":["/bin/sh","-c","cp /backup/'$BACKUP_DIR'/db.sqlite3 /data/db.sqlite3 && cp /backup/'$BACKUP_DIR'/rsa_key.pem /data/ && cp /backup/'$BACKUP_DIR'/rsa_key.pub.pem /data/ && cp -a /backup/'$BACKUP_DIR'/attachments /data/ 2>/dev/null; echo Restore complete"]}]}}' \
-n vaultwarden
```

View file

@@ -67,6 +67,18 @@ runcmd:
- ${containerd_config_update_command}
- systemctl restart containerd
- systemctl enable --now iscsid
# Harden iSCSI: increase recovery timeout (300s vs 120s default) and enable
# CRC32C data/header digests to detect bit flips over the network.
# Prevents SQLite corruption from transient iSCSI session drops.
- sed -i 's/^node.session.timeo.replacement_timeout = .*/node.session.timeo.replacement_timeout = 300/' /etc/iscsi/iscsid.conf
- sed -i 's/^node.conn\[0\].timeo.noop_out_interval = .*/node.conn[0].timeo.noop_out_interval = 10/' /etc/iscsi/iscsid.conf
- sed -i 's/^node.conn\[0\].timeo.noop_out_timeout = .*/node.conn[0].timeo.noop_out_timeout = 15/' /etc/iscsi/iscsid.conf
- |
if ! grep -q '^node.conn\[0\].iscsi.HeaderDigest' /etc/iscsi/iscsid.conf; then
echo 'node.conn[0].iscsi.HeaderDigest = CRC32C,None' >> /etc/iscsi/iscsid.conf
echo 'node.conn[0].iscsi.DataDigest = CRC32C,None' >> /etc/iscsi/iscsid.conf
fi
- systemctl restart iscsid
# Create /sentinel directory for kured reboot gating (sentinel gate DaemonSet)
- mkdir -p /sentinel
# Create 4Gi swap file for worker node memory pressure relief (NOT for master — etcd is latency-critical)

View file

@@ -199,7 +199,7 @@ module "ingress" {
}
# -----------------------------------------------------------------------------
# Backup Daily SQLite + data files to NFS
# Backup Every 6h SQLite + data files to NFS
# -----------------------------------------------------------------------------
module "nfs_vaultwarden_backup" {
@@ -218,7 +218,7 @@ resource "kubernetes_cron_job_v1" "vaultwarden-backup" {
spec {
concurrency_policy = "Replace"
failed_jobs_history_limit = 5
schedule = "0 0 * * *"
schedule = "0 */6 * * *"
starting_deadline_seconds = 10
successful_jobs_history_limit = 10
job_template {
@@ -248,9 +248,20 @@ resource "kubernetes_cron_job_v1" "vaultwarden-backup" {
set -euxo pipefail
apk add --no-cache sqlite
now=$(date +"%Y_%m_%d_%H_%M")
# Pre-flight: verify source DB is healthy before backing up
if ! sqlite3 /data/db.sqlite3 "PRAGMA integrity_check;" | grep -q "^ok$"; then
echo "ERROR: source database failed integrity check, skipping backup"
exit 1
fi
mkdir -p /backup/$now
# Safe SQLite backup (handles WAL/locks)
sqlite3 /data/db.sqlite3 ".backup /backup/$now/db.sqlite3"
# Verify the backup copy is also healthy
if ! sqlite3 /backup/$now/db.sqlite3 "PRAGMA integrity_check;" | grep -q "^ok$"; then
echo "ERROR: backup copy failed integrity check, removing"
rm -rf /backup/$now
exit 1
fi
# Copy RSA keys, attachments, sends, config
cp -a /data/rsa_key.pem /data/rsa_key.pub.pem /backup/$now/ 2>/dev/null || true
cp -a /data/attachments /backup/$now/ 2>/dev/null || true

View file

@@ -199,7 +199,7 @@ module "ingress" {
}
# -----------------------------------------------------------------------------
# Backup Daily SQLite + data files to NFS
# Backup Every 6h SQLite + data files to NFS
# -----------------------------------------------------------------------------
module "nfs_vaultwarden_backup" {
@@ -218,7 +218,7 @@ resource "kubernetes_cron_job_v1" "vaultwarden-backup" {
spec {
concurrency_policy = "Replace"
failed_jobs_history_limit = 5
schedule = "0 0 * * *"
schedule = "0 */6 * * *"
starting_deadline_seconds = 10
successful_jobs_history_limit = 10
job_template {
@@ -248,9 +248,20 @@ resource "kubernetes_cron_job_v1" "vaultwarden-backup" {
set -euxo pipefail
apk add --no-cache sqlite
now=$(date +"%Y_%m_%d_%H_%M")
# Pre-flight: verify source DB is healthy before backing up
if ! sqlite3 /data/db.sqlite3 "PRAGMA integrity_check;" | grep -q "^ok$"; then
echo "ERROR: source database failed integrity check, skipping backup"
exit 1
fi
mkdir -p /backup/$now
# Safe SQLite backup (handles WAL/locks)
sqlite3 /data/db.sqlite3 ".backup /backup/$now/db.sqlite3"
# Verify the backup copy is also healthy
if ! sqlite3 /backup/$now/db.sqlite3 "PRAGMA integrity_check;" | grep -q "^ok$"; then
echo "ERROR: backup copy failed integrity check, removing"
rm -rf /backup/$now
exit 1
fi
# Copy RSA keys, attachments, sends, config
cp -a /data/rsa_key.pem /data/rsa_key.pub.pem /backup/$now/ 2>/dev/null || true
cp -a /data/attachments /backup/$now/ 2>/dev/null || true