harden vaultwarden iSCSI storage and increase backup frequency
- Increase backup frequency from daily to every 6 hours (0 */6 * * *) - Add pre/post-flight SQLite integrity checks to backup job - Harden iSCSI on all nodes: increase recovery timeout (300s), enable CRC32C data/header digests for bit-flip detection - Fix restore runbook PVC name (vaultwarden-data-iscsi) Motivated by SQLite corruption caused by iSCSI I/O errors.
This commit is contained in:
parent
469fcb12b5
commit
a44f35bcf8
4 changed files with 41 additions and 6 deletions
|
|
@@ -9,7 +9,8 @@
|
|||
- Each backup contains: `db.sqlite3`, `rsa_key.pem`, `rsa_key.pub.pem`, `attachments/`, `sends/`, `config.json`
|
||||
- Replicated to Synology NAS (192.168.1.13) via TrueNAS ZFS replication
|
||||
- Retention: 30 days
|
||||
- Schedule: Daily at 00:00
|
||||
- Schedule: Every 6 hours (00:00, 06:00, 12:00, 18:00)
|
||||
- Integrity check: Both source and backup are verified before/after each backup
|
||||
|
||||
## Backup Contents
|
||||
| File | Purpose | Critical? |
|
||||
|
|
@@ -40,7 +41,7 @@ kubectl scale deployment vaultwarden -n vaultwarden --replicas=0
|
|||
BACKUP_DIR="YYYY_MM_DD_HH_MM" # Set to desired backup
|
||||
|
||||
kubectl run vw-restore --rm -it --image=alpine \
|
||||
--overrides='{"spec":{"volumes":[{"name":"backup","persistentVolumeClaim":{"claimName":"vaultwarden-backup"}},{"name":"data","persistentVolumeClaim":{"claimName":"vaultwarden-data"}}],"containers":[{"name":"vw-restore","image":"alpine","volumeMounts":[{"name":"backup","mountPath":"/backup"},{"name":"data","mountPath":"/data"}],"command":["/bin/sh","-c","cp /backup/'$BACKUP_DIR'/db.sqlite3 /data/db.sqlite3 && cp /backup/'$BACKUP_DIR'/rsa_key.pem /data/ && cp /backup/'$BACKUP_DIR'/rsa_key.pub.pem /data/ && cp -a /backup/'$BACKUP_DIR'/attachments /data/ 2>/dev/null; echo Restore complete"]}]}}' \
|
||||
--overrides='{"spec":{"volumes":[{"name":"backup","persistentVolumeClaim":{"claimName":"vaultwarden-backup"}},{"name":"data","persistentVolumeClaim":{"claimName":"vaultwarden-data-iscsi"}}],"containers":[{"name":"vw-restore","image":"alpine","volumeMounts":[{"name":"backup","mountPath":"/backup"},{"name":"data","mountPath":"/data"}],"command":["/bin/sh","-c","cp /backup/'$BACKUP_DIR'/db.sqlite3 /data/db.sqlite3 && cp /backup/'$BACKUP_DIR'/rsa_key.pem /data/ && cp /backup/'$BACKUP_DIR'/rsa_key.pub.pem /data/ && cp -a /backup/'$BACKUP_DIR'/attachments /data/ 2>/dev/null; echo Restore complete"]}]}}' \
|
||||
-n vaultwarden
|
||||
```
|
||||
|
||||
|
|
|
|||
|
|
@@ -67,6 +67,18 @@ runcmd:
|
|||
- ${containerd_config_update_command}
|
||||
- systemctl restart containerd
|
||||
- systemctl enable --now iscsid
|
||||
# Harden iSCSI: increase recovery timeout (300s vs 120s default) and enable
|
||||
# CRC32C data/header digests to detect bit flips over the network.
|
||||
# Prevents SQLite corruption from transient iSCSI session drops.
|
||||
- sed -i 's/^node.session.timeo.replacement_timeout = .*/node.session.timeo.replacement_timeout = 300/' /etc/iscsi/iscsid.conf
|
||||
- sed -i 's/^node.conn\[0\].timeo.noop_out_interval = .*/node.conn[0].timeo.noop_out_interval = 10/' /etc/iscsi/iscsid.conf
|
||||
- sed -i 's/^node.conn\[0\].timeo.noop_out_timeout = .*/node.conn[0].timeo.noop_out_timeout = 15/' /etc/iscsi/iscsid.conf
|
||||
- |
|
||||
if ! grep -q '^node.conn\[0\].iscsi.HeaderDigest' /etc/iscsi/iscsid.conf; then
|
||||
echo 'node.conn[0].iscsi.HeaderDigest = CRC32C,None' >> /etc/iscsi/iscsid.conf
|
||||
echo 'node.conn[0].iscsi.DataDigest = CRC32C,None' >> /etc/iscsi/iscsid.conf
|
||||
fi
|
||||
- systemctl restart iscsid
|
||||
# Create /sentinel directory for kured reboot gating (sentinel gate DaemonSet)
|
||||
- mkdir -p /sentinel
|
||||
# Create 4Gi swap file for worker node memory pressure relief (NOT for master — etcd is latency-critical)
|
||||
|
|
|
|||
|
|
@@ -199,7 +199,7 @@ module "ingress" {
|
|||
}
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Backup — Daily SQLite + data files to NFS
|
||||
# Backup — Every 6h SQLite + data files to NFS
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
module "nfs_vaultwarden_backup" {
|
||||
|
|
@@ -218,7 +218,7 @@ resource "kubernetes_cron_job_v1" "vaultwarden-backup" {
|
|||
spec {
|
||||
concurrency_policy = "Replace"
|
||||
failed_jobs_history_limit = 5
|
||||
schedule = "0 0 * * *"
|
||||
schedule = "0 */6 * * *"
|
||||
starting_deadline_seconds = 10
|
||||
successful_jobs_history_limit = 10
|
||||
job_template {
|
||||
|
|
@@ -248,9 +248,20 @@ resource "kubernetes_cron_job_v1" "vaultwarden-backup" {
|
|||
set -euxo pipefail
|
||||
apk add --no-cache sqlite
|
||||
now=$(date +"%Y_%m_%d_%H_%M")
|
||||
# Pre-flight: verify source DB is healthy before backing up
|
||||
if ! sqlite3 /data/db.sqlite3 "PRAGMA integrity_check;" | grep -q "^ok$"; then
|
||||
echo "ERROR: source database failed integrity check, skipping backup"
|
||||
exit 1
|
||||
fi
|
||||
mkdir -p /backup/$now
|
||||
# Safe SQLite backup (handles WAL/locks)
|
||||
sqlite3 /data/db.sqlite3 ".backup /backup/$now/db.sqlite3"
|
||||
# Verify the backup copy is also healthy
|
||||
if ! sqlite3 /backup/$now/db.sqlite3 "PRAGMA integrity_check;" | grep -q "^ok$"; then
|
||||
echo "ERROR: backup copy failed integrity check, removing"
|
||||
rm -rf /backup/$now
|
||||
exit 1
|
||||
fi
|
||||
# Copy RSA keys, attachments, sends, config
|
||||
cp -a /data/rsa_key.pem /data/rsa_key.pub.pem /backup/$now/ 2>/dev/null || true
|
||||
cp -a /data/attachments /backup/$now/ 2>/dev/null || true
|
||||
|
|
|
|||
|
|
@@ -199,7 +199,7 @@ module "ingress" {
|
|||
}
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Backup — Daily SQLite + data files to NFS
|
||||
# Backup — Every 6h SQLite + data files to NFS
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
module "nfs_vaultwarden_backup" {
|
||||
|
|
@@ -218,7 +218,7 @@ resource "kubernetes_cron_job_v1" "vaultwarden-backup" {
|
|||
spec {
|
||||
concurrency_policy = "Replace"
|
||||
failed_jobs_history_limit = 5
|
||||
schedule = "0 0 * * *"
|
||||
schedule = "0 */6 * * *"
|
||||
starting_deadline_seconds = 10
|
||||
successful_jobs_history_limit = 10
|
||||
job_template {
|
||||
|
|
@@ -248,9 +248,20 @@ resource "kubernetes_cron_job_v1" "vaultwarden-backup" {
|
|||
set -euxo pipefail
|
||||
apk add --no-cache sqlite
|
||||
now=$(date +"%Y_%m_%d_%H_%M")
|
||||
# Pre-flight: verify source DB is healthy before backing up
|
||||
if ! sqlite3 /data/db.sqlite3 "PRAGMA integrity_check;" | grep -q "^ok$"; then
|
||||
echo "ERROR: source database failed integrity check, skipping backup"
|
||||
exit 1
|
||||
fi
|
||||
mkdir -p /backup/$now
|
||||
# Safe SQLite backup (handles WAL/locks)
|
||||
sqlite3 /data/db.sqlite3 ".backup /backup/$now/db.sqlite3"
|
||||
# Verify the backup copy is also healthy
|
||||
if ! sqlite3 /backup/$now/db.sqlite3 "PRAGMA integrity_check;" | grep -q "^ok$"; then
|
||||
echo "ERROR: backup copy failed integrity check, removing"
|
||||
rm -rf /backup/$now
|
||||
exit 1
|
||||
fi
|
||||
# Copy RSA keys, attachments, sends, config
|
||||
cp -a /data/rsa_key.pem /data/rsa_key.pub.pem /backup/$now/ 2>/dev/null || true
|
||||
cp -a /data/attachments /backup/$now/ 2>/dev/null || true
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue