add hourly SQLite integrity check for vaultwarden with Prometheus alerting

- New CronJob runs PRAGMA integrity_check every hour
- Pushes vaultwarden_sqlite_integrity_ok metric to Prometheus pushgateway
- VaultwardenSQLiteCorrupt alert fires immediately on corruption (critical)
- VaultwardenIntegrityCheckStale alert if check hasn't run in 2h (warning)
- Prevents running for days on a corrupted DB unnoticed
This commit is contained in:
Viktor Barzin 2026-03-23 00:50:15 +02:00
parent 3b89a7d7e4
commit 311ff5dd9e
3 changed files with 182 additions and 0 deletions

View file

@ -636,6 +636,20 @@ serverFiles:
severity: critical
annotations:
summary: "Vaultwarden has no available replicas — password manager down"
# Metric is pushed hourly by the vaultwarden-integrity-check CronJob;
# 0 means "PRAGMA integrity_check" did not return "ok".
# for: 0m — fire immediately; any corruption of the password DB is critical.
- alert: VaultwardenSQLiteCorrupt
expr: vaultwarden_sqlite_integrity_ok == 0
for: 0m
labels:
severity: critical
annotations:
summary: "Vaultwarden SQLite database failed integrity check — data corruption detected"
# Detects a stalled integrity pipeline. The "or absent(...)" arm is required:
# if the CronJob never pushes (job broken, pushgateway unreachable) the series
# does not exist, "(time() - metric)" returns no data, and without absent()
# this alert would silently never fire — exactly the failure it should catch.
# Summary avoids "{{ $value | humanizeDuration }}" because $value is a
# meaningless 1 on the absent() branch.
- alert: VaultwardenIntegrityCheckStale
expr: (time() - vaultwarden_sqlite_integrity_check_timestamp) > 7200 or absent(vaultwarden_sqlite_integrity_check_timestamp)
for: 15m
labels:
severity: warning
annotations:
summary: "Vaultwarden integrity check result is stale or missing (expected hourly)"
- alert: RedisBackupStale
expr: (time() - kube_cronjob_status_last_successful_time{cronjob="redis-backup", namespace="redis"}) > 691200
for: 30m

View file

@ -306,3 +306,87 @@ resource "kubernetes_cron_job_v1" "vaultwarden-backup" {
}
}
}
# -----------------------------------------------------------------------------
# Integrity Check Hourly SQLite PRAGMA check, pushes metric to Prometheus
# -----------------------------------------------------------------------------
# -----------------------------------------------------------------------------
# Integrity check — hourly "PRAGMA integrity_check" against the vaultwarden
# SQLite DB; pushes pass/fail + timestamp metrics to the Prometheus pushgateway.
# -----------------------------------------------------------------------------
resource "kubernetes_cron_job_v1" "vaultwarden-integrity-check" {
  metadata {
    name      = "vaultwarden-integrity-check"
    namespace = kubernetes_namespace.vaultwarden.metadata[0].name
  }
  spec {
    concurrency_policy            = "Replace"
    failed_jobs_history_limit     = 5
    schedule                      = "30 * * * *"
    starting_deadline_seconds     = 10
    successful_jobs_history_limit = 3
    job_template {
      metadata {}
      spec {
        backoff_limit              = 1
        ttl_seconds_after_finished = 10
        template {
          metadata {}
          spec {
            # Co-schedule with the vaultwarden pod so the node-local data
            # volume is mountable here (required for hostpath/RWO storage).
            affinity {
              pod_affinity {
                required_during_scheduling_ignored_during_execution {
                  label_selector {
                    match_labels = {
                      app = "vaultwarden"
                    }
                  }
                  topology_key = "kubernetes.io/hostname"
                }
              }
            }
            container {
              name = "integrity-check"
              # Pinned major tag instead of floating "latest".
              image = "docker.io/library/alpine:3"
              # BUGFIX: the previous script used "$$result" / "^ok$$" — Terraform
              # passes "$$" through literally and the shell expands it as its PID,
              # so the grep pattern was "^ok<pid>" and the check ALWAYS reported
              # corruption. Only "$${" needs escaping in Terraform heredocs;
              # plain "$var" and "$(cmd)" are passed to the shell untouched.
              command = ["/bin/sh", "-c", <<-EOT
                set -euo pipefail
                apk add --no-cache sqlite curl >/dev/null 2>&1
                PUSHGW="http://prometheus-prometheus-pushgateway.monitoring.svc.cluster.local:9091"
                # $1 = 1 (ok) or 0 (corrupt); printf guarantees the trailing
                # newline the pushgateway text format requires.
                push_metrics() {
                  printf 'vaultwarden_sqlite_integrity_ok %s\nvaultwarden_sqlite_integrity_check_timestamp %s\n' \
                    "$1" "$(date +%s)" \
                    | curl -s --data-binary @- "$PUSHGW/metrics/job/vaultwarden-integrity/instance/vaultwarden"
                }
                # "|| true": with set -e a hard sqlite3 failure (e.g. unreadable
                # file) would otherwise abort BEFORE pushing the failure metric.
                # NOTE(review): if vaultwarden runs the DB in WAL mode, a
                # read-only mount may be unable to open it — verify in-cluster.
                result=$(sqlite3 /data/db.sqlite3 "PRAGMA integrity_check;" 2>&1 || true)
                if echo "$result" | grep -qx "ok"; then
                  echo "SQLite integrity check passed"
                  push_metrics 1
                else
                  echo "ERROR: SQLite integrity check FAILED: $result"
                  push_metrics 0
                  exit 1
                fi
              EOT
              ]
              volume_mount {
                name       = "data"
                mount_path = "/data"
                read_only  = true
              }
            }
            volume {
              name = "data"
              persistent_volume_claim {
                claim_name = kubernetes_persistent_volume_claim.vaultwarden_data.metadata[0].name
              }
            }
            dns_config {
              option {
                name  = "ndots"
                value = "2"
              }
            }
          }
        }
      }
    }
  }
}

View file

@ -306,3 +306,87 @@ resource "kubernetes_cron_job_v1" "vaultwarden-backup" {
}
}
}
# -----------------------------------------------------------------------------
# Integrity Check Hourly SQLite PRAGMA check, pushes metric to Prometheus
# -----------------------------------------------------------------------------
# -----------------------------------------------------------------------------
# Integrity check — hourly "PRAGMA integrity_check" against the vaultwarden
# SQLite DB; pushes pass/fail + timestamp metrics to the Prometheus pushgateway.
# -----------------------------------------------------------------------------
resource "kubernetes_cron_job_v1" "vaultwarden-integrity-check" {
  metadata {
    name      = "vaultwarden-integrity-check"
    namespace = kubernetes_namespace.vaultwarden.metadata[0].name
  }
  spec {
    concurrency_policy            = "Replace"
    failed_jobs_history_limit     = 5
    schedule                      = "30 * * * *"
    starting_deadline_seconds     = 10
    successful_jobs_history_limit = 3
    job_template {
      metadata {}
      spec {
        backoff_limit              = 1
        ttl_seconds_after_finished = 10
        template {
          metadata {}
          spec {
            # Co-schedule with the vaultwarden pod so the node-local data
            # volume is mountable here (required for hostpath/RWO storage).
            affinity {
              pod_affinity {
                required_during_scheduling_ignored_during_execution {
                  label_selector {
                    match_labels = {
                      app = "vaultwarden"
                    }
                  }
                  topology_key = "kubernetes.io/hostname"
                }
              }
            }
            container {
              name = "integrity-check"
              # Pinned major tag instead of floating "latest".
              image = "docker.io/library/alpine:3"
              # BUGFIX: the previous script used "$$result" / "^ok$$" — Terraform
              # passes "$$" through literally and the shell expands it as its PID,
              # so the grep pattern was "^ok<pid>" and the check ALWAYS reported
              # corruption. Only "$${" needs escaping in Terraform heredocs;
              # plain "$var" and "$(cmd)" are passed to the shell untouched.
              command = ["/bin/sh", "-c", <<-EOT
                set -euo pipefail
                apk add --no-cache sqlite curl >/dev/null 2>&1
                PUSHGW="http://prometheus-prometheus-pushgateway.monitoring.svc.cluster.local:9091"
                # $1 = 1 (ok) or 0 (corrupt); printf guarantees the trailing
                # newline the pushgateway text format requires.
                push_metrics() {
                  printf 'vaultwarden_sqlite_integrity_ok %s\nvaultwarden_sqlite_integrity_check_timestamp %s\n' \
                    "$1" "$(date +%s)" \
                    | curl -s --data-binary @- "$PUSHGW/metrics/job/vaultwarden-integrity/instance/vaultwarden"
                }
                # "|| true": with set -e a hard sqlite3 failure (e.g. unreadable
                # file) would otherwise abort BEFORE pushing the failure metric.
                # NOTE(review): if vaultwarden runs the DB in WAL mode, a
                # read-only mount may be unable to open it — verify in-cluster.
                result=$(sqlite3 /data/db.sqlite3 "PRAGMA integrity_check;" 2>&1 || true)
                if echo "$result" | grep -qx "ok"; then
                  echo "SQLite integrity check passed"
                  push_metrics 1
                else
                  echo "ERROR: SQLite integrity check FAILED: $result"
                  push_metrics 0
                  exit 1
                fi
              EOT
              ]
              volume_mount {
                name       = "data"
                mount_path = "/data"
                read_only  = true
              }
            }
            volume {
              name = "data"
              persistent_volume_claim {
                claim_name = kubernetes_persistent_volume_claim.vaultwarden_data.metadata[0].name
              }
            }
            dns_config {
              option {
                name  = "ndots"
                value = "2"
              }
            }
          }
        }
      }
    }
  }
}