backup & DR: add alerting, fix rotation, secure MySQL password, add runbooks
Phase 1: Add 12 PrometheusRules for backup health alerting
- PostgreSQL, MySQL, Vault, Vaultwarden, Redis staleness + never-succeeded alerts
- CSIDriverCrashLoop alert for nfs-csi/iscsi-csi namespaces
- Generic BackupCronJobFailed alert

Phase 2: Fix backup rotation
- etcd: timestamped snapshots instead of overwriting single file
- Redis: timestamped RDB files with 7-day retention purge
- PostgreSQL: retention increased from 7 to 14 days

Phase 3: Fix MySQL password exposure
- Move root password from command line arg to MYSQL_PWD env var via secretKeyRef

Phase 5: Add restore runbooks
- PostgreSQL, MySQL, Vault, etcd, Vaultwarden, full cluster rebuild
This commit is contained in:
parent
62d42657e6
commit
af2222fce8
9 changed files with 657 additions and 4 deletions
|
|
@ -101,8 +101,8 @@ resource "kubernetes_cron_job_v1" "backup-etcd" {
|
|||
container {
|
||||
name = "backup-etcd"
|
||||
image = "registry.k8s.io/etcd:3.5.21-0"
|
||||
command = ["etcdctl"]
|
||||
args = ["--endpoints=https://127.0.0.1:2379", "--cacert=/etc/kubernetes/pki/etcd/ca.crt", "--cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt", "--key=/etc/kubernetes/pki/etcd/healthcheck-client.key", "snapshot", "save", "/backup/etcd-snapshot-latest.db"]
|
||||
command = ["/bin/sh", "-c"]
|
||||
args = ["ETCDCTL_API=3 etcdctl --endpoints=https://127.0.0.1:2379 --cacert=/etc/kubernetes/pki/etcd/ca.crt --cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt --key=/etc/kubernetes/pki/etcd/healthcheck-client.key snapshot save /backup/etcd-snapshot-$(date +%Y%m%d-%H%M%S).db"]
|
||||
env {
|
||||
name = "ETCDCTL_API"
|
||||
value = "3"
|
||||
|
|
|
|||
|
|
@ -502,6 +502,90 @@ serverFiles:
|
|||
severity: critical
|
||||
annotations:
|
||||
summary: "etcd backup CronJob has never completed successfully"
|
||||
- alert: PostgreSQLBackupStale
|
||||
expr: (time() - kube_cronjob_status_last_successful_time{cronjob="postgresql-backup", namespace="dbaas"}) > 129600
|
||||
for: 30m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "PostgreSQL backup is {{ $value | humanizeDuration }} old (threshold: 36h)"
|
||||
- alert: PostgreSQLBackupNeverSucceeded
|
||||
expr: kube_cronjob_status_last_successful_time{cronjob="postgresql-backup", namespace="dbaas"} == 0
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "PostgreSQL backup CronJob has never completed successfully"
|
||||
- alert: MySQLBackupStale
|
||||
expr: (time() - kube_cronjob_status_last_successful_time{cronjob="mysql-backup", namespace="dbaas"}) > 129600
|
||||
for: 30m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "MySQL backup is {{ $value | humanizeDuration }} old (threshold: 36h)"
|
||||
- alert: MySQLBackupNeverSucceeded
|
||||
expr: kube_cronjob_status_last_successful_time{cronjob="mysql-backup", namespace="dbaas"} == 0
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "MySQL backup CronJob has never completed successfully"
|
||||
- alert: VaultBackupStale
|
||||
expr: (time() - kube_cronjob_status_last_successful_time{cronjob="vault-raft-backup", namespace="vault"}) > 129600
|
||||
for: 30m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Vault backup is {{ $value | humanizeDuration }} old (threshold: 36h)"
|
||||
- alert: VaultBackupNeverSucceeded
|
||||
expr: kube_cronjob_status_last_successful_time{cronjob="vault-raft-backup", namespace="vault"} == 0
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Vault backup CronJob has never completed successfully"
|
||||
- alert: VaultwardenBackupStale
|
||||
expr: (time() - kube_cronjob_status_last_successful_time{cronjob="vaultwarden-backup", namespace="vaultwarden"}) > 129600
|
||||
for: 30m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Vaultwarden backup is {{ $value | humanizeDuration }} old (threshold: 36h)"
|
||||
- alert: VaultwardenBackupNeverSucceeded
|
||||
expr: kube_cronjob_status_last_successful_time{cronjob="vaultwarden-backup", namespace="vaultwarden"} == 0
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Vaultwarden backup CronJob has never completed successfully"
|
||||
- alert: RedisBackupStale
|
||||
expr: (time() - kube_cronjob_status_last_successful_time{cronjob="redis-backup", namespace="redis"}) > 14400
|
||||
for: 30m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Redis backup is {{ $value | humanizeDuration }} old (threshold: 4h)"
|
||||
- alert: RedisBackupNeverSucceeded
|
||||
expr: kube_cronjob_status_last_successful_time{cronjob="redis-backup", namespace="redis"} == 0
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Redis backup CronJob has never completed successfully"
|
||||
- alert: CSIDriverCrashLoop
|
||||
expr: kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", namespace=~"nfs-csi|iscsi-csi"} > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "CSI driver CrashLoopBackOff in {{ $labels.namespace }}/{{ $labels.pod }} — storage-layer failure risk"
|
||||
- alert: BackupCronJobFailed
|
||||
expr: kube_job_status_failed{job_name=~".*backup.*"} > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Backup job failed: {{ $labels.namespace }}/{{ $labels.job_name }}"
|
||||
- alert: NewTailscaleClient
|
||||
expr: irate(headscale_machine_registrations_total{action="reauth"}[5m]) > 0
|
||||
for: 5m
|
||||
|
|
|
|||
|
|
@ -283,12 +283,15 @@ resource "kubernetes_cron_job_v1" "redis-backup" {
|
|||
image = "redis:7-alpine"
|
||||
command = ["/bin/sh", "-c", <<-EOT
|
||||
set -eux
|
||||
TIMESTAMP=$(date +%Y%m%d-%H%M)
|
||||
# Trigger a fresh RDB save on the master
|
||||
redis-cli -h redis.redis BGSAVE
|
||||
sleep 5
|
||||
# Copy the RDB via redis-cli --rdb
|
||||
redis-cli -h redis.redis --rdb /backup/dump.rdb
|
||||
echo "Backup complete: $(ls -lh /backup/dump.rdb)"
|
||||
redis-cli -h redis.redis --rdb /backup/redis-$TIMESTAMP.rdb
|
||||
# Rotate — 7-day retention
|
||||
find /backup -name 'redis-*.rdb' -type f -mtime +7 -delete
|
||||
echo "Backup complete: redis-$TIMESTAMP.rdb"
|
||||
EOT
|
||||
]
|
||||
volume_mount {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue