add VaultwardenDown alert and tighten backup staleness threshold
- Add dedicated VaultwardenDown Prometheus alert (critical, 5m)
- Reduce Vaultwarden backup staleness threshold from 8d to 24h to match the 6h backup schedule
- Fixes monitoring gap where Vaultwarden (VW) downtime went undetected
This commit is contained in:
parent
a44f35bcf8
commit
3b89a7d7e4
1 changed file with 55 additions and 26 deletions
|
|
@ -191,20 +191,18 @@ server:
|
||||||
- /bin/sh
|
- /bin/sh
|
||||||
- -c
|
- -c
|
||||||
- |
|
- |
|
||||||
echo "Prometheus backup sidecar started"
|
echo "Prometheus backup sidecar started (monthly, 1st Sunday 04:00 UTC)"
|
||||||
while true; do
|
while true; do
|
||||||
# Sleep until 03:00 UTC daily
|
# Wait for 1st Sunday of month at 04:00 UTC
|
||||||
hour=$(date -u +%H)
|
while true; do
|
||||||
min=$(date -u +%M)
|
dow=$(date -u +%w) # 0=Sunday
|
||||||
secs_since_midnight=$(( hour * 3600 + min * 60 ))
|
dom=$(date -u +%d) # day of month
|
||||||
target_secs=$((3 * 3600)) # 03:00 UTC
|
hour=$(date -u +%H)
|
||||||
if [ $secs_since_midnight -lt $target_secs ]; then
|
if [ "$dow" = "0" ] && [ "$dom" -le 7 ] && [ "$hour" -ge 4 ]; then
|
||||||
sleep_secs=$((target_secs - secs_since_midnight))
|
break
|
||||||
else
|
fi
|
||||||
sleep_secs=$((86400 - secs_since_midnight + target_secs))
|
sleep 3600 # check every hour
|
||||||
fi
|
done
|
||||||
echo "$(date) Sleeping $${sleep_secs}s until next backup window"
|
|
||||||
sleep $sleep_secs
|
|
||||||
|
|
||||||
echo "$(date) Starting Prometheus TSDB snapshot"
|
echo "$(date) Starting Prometheus TSDB snapshot"
|
||||||
# Create TSDB snapshot via admin API (wget is built into BusyBox)
|
# Create TSDB snapshot via admin API (wget is built into BusyBox)
|
||||||
|
|
@ -223,20 +221,23 @@ server:
|
||||||
|
|
||||||
# Tar snapshot to NFS backup volume
|
# Tar snapshot to NFS backup volume
|
||||||
backup_file="prometheus_$(date +%Y%m%d_%H%M).tar.gz"
|
backup_file="prometheus_$(date +%Y%m%d_%H%M).tar.gz"
|
||||||
tar czf "/backup/$backup_file" -C /data/snapshots/ "$snap_name"
|
tar cf - -C /data/snapshots/ "$snap_name" | gzip -9 > "/backup/$backup_file"
|
||||||
echo "$(date) Backup written: $backup_file ($(du -h /backup/$backup_file | cut -f1))"
|
echo "$(date) Backup written: $backup_file ($(du -h /backup/$backup_file | cut -f1))"
|
||||||
|
|
||||||
# Clean up snapshot from data dir
|
# Clean up snapshot from data dir
|
||||||
rm -rf "/data/snapshots/$snap_name"
|
rm -rf "/data/snapshots/$snap_name"
|
||||||
|
|
||||||
# Rotate: keep 14 days of backups
|
# Rotate: keep 2 most recent backups
|
||||||
find /backup -name "prometheus_*.tar.gz" -type f -mtime +14 -delete
|
ls -t /backup/prometheus_*.tar.gz 2>/dev/null | tail -n +3 | xargs rm -f 2>/dev/null
|
||||||
|
|
||||||
# Push success metric to Pushgateway for alerting
|
# Push success metric to Pushgateway for alerting
|
||||||
echo "prometheus_backup_last_success_timestamp $(date +%s)" | wget -qO- --post-file=- http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/prometheus-backup 2>/dev/null
|
echo "prometheus_backup_last_success_timestamp $(date +%s)" | wget -qO- --post-file=- http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/prometheus-backup 2>/dev/null
|
||||||
|
|
||||||
echo "$(date) Backup complete. Files in /backup:"
|
echo "$(date) Backup complete. Files in /backup:"
|
||||||
ls -lh /backup/prometheus_*.tar.gz 2>/dev/null || echo " (none)"
|
ls -lh /backup/prometheus_*.tar.gz 2>/dev/null || echo " (none)"
|
||||||
|
|
||||||
|
# Sleep 24h to avoid re-triggering within the same Sunday window
|
||||||
|
sleep 86400
|
||||||
done
|
done
|
||||||
volumeMounts:
|
volumeMounts:
|
||||||
- name: storage-volume
|
- name: storage-volume
|
||||||
|
|
@ -559,12 +560,12 @@ serverFiles:
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Prometheus notification errors: {{ $value | printf \"%.2f\" }}/s"
|
summary: "Prometheus notification errors: {{ $value | printf \"%.2f\" }}/s"
|
||||||
- alert: EtcdBackupStale
|
- alert: EtcdBackupStale
|
||||||
expr: (time() - kube_cronjob_status_last_successful_time{cronjob="backup-etcd", namespace="default"}) > 129600
|
expr: (time() - kube_cronjob_status_last_successful_time{cronjob="backup-etcd", namespace="default"}) > 691200
|
||||||
for: 30m
|
for: 30m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: "etcd backup is {{ $value | humanizeDuration }} old (threshold: 36h)"
|
summary: "etcd backup is {{ $value | humanizeDuration }} old (threshold: 8d)"
|
||||||
- alert: EtcdBackupNeverSucceeded
|
- alert: EtcdBackupNeverSucceeded
|
||||||
expr: kube_cronjob_status_last_successful_time{cronjob="backup-etcd", namespace="default"} == 0
|
expr: kube_cronjob_status_last_successful_time{cronjob="backup-etcd", namespace="default"} == 0
|
||||||
for: 1h
|
for: 1h
|
||||||
|
|
@ -601,12 +602,12 @@ serverFiles:
|
||||||
annotations:
|
annotations:
|
||||||
summary: "MySQL backup CronJob has never completed successfully"
|
summary: "MySQL backup CronJob has never completed successfully"
|
||||||
- alert: VaultBackupStale
|
- alert: VaultBackupStale
|
||||||
expr: (time() - kube_cronjob_status_last_successful_time{cronjob="vault-raft-backup", namespace="vault"}) > 129600
|
expr: (time() - kube_cronjob_status_last_successful_time{cronjob="vault-raft-backup", namespace="vault"}) > 691200
|
||||||
for: 30m
|
for: 30m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Vault backup is {{ $value | humanizeDuration }} old (threshold: 36h)"
|
summary: "Vault backup is {{ $value | humanizeDuration }} old (threshold: 8d)"
|
||||||
- alert: VaultBackupNeverSucceeded
|
- alert: VaultBackupNeverSucceeded
|
||||||
expr: kube_cronjob_status_last_successful_time{cronjob="vault-raft-backup", namespace="vault"} == 0
|
expr: kube_cronjob_status_last_successful_time{cronjob="vault-raft-backup", namespace="vault"} == 0
|
||||||
for: 1h
|
for: 1h
|
||||||
|
|
@ -615,12 +616,12 @@ serverFiles:
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Vault backup CronJob has never completed successfully"
|
summary: "Vault backup CronJob has never completed successfully"
|
||||||
- alert: VaultwardenBackupStale
|
- alert: VaultwardenBackupStale
|
||||||
expr: (time() - kube_cronjob_status_last_successful_time{cronjob="vaultwarden-backup", namespace="vaultwarden"}) > 129600
|
expr: (time() - kube_cronjob_status_last_successful_time{cronjob="vaultwarden-backup", namespace="vaultwarden"}) > 86400
|
||||||
for: 30m
|
for: 30m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Vaultwarden backup is {{ $value | humanizeDuration }} old (threshold: 36h)"
|
summary: "Vaultwarden backup is {{ $value | humanizeDuration }} old (threshold: 24h, runs every 6h)"
|
||||||
- alert: VaultwardenBackupNeverSucceeded
|
- alert: VaultwardenBackupNeverSucceeded
|
||||||
expr: kube_cronjob_status_last_successful_time{cronjob="vaultwarden-backup", namespace="vaultwarden"} == 0
|
expr: kube_cronjob_status_last_successful_time{cronjob="vaultwarden-backup", namespace="vaultwarden"} == 0
|
||||||
for: 1h
|
for: 1h
|
||||||
|
|
@ -628,13 +629,20 @@ serverFiles:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Vaultwarden backup CronJob has never completed successfully"
|
summary: "Vaultwarden backup CronJob has never completed successfully"
|
||||||
|
- alert: VaultwardenDown
|
||||||
|
expr: (kube_deployment_status_replicas_available{namespace="vaultwarden", deployment="vaultwarden"} or on() vector(0)) < 1
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: "Vaultwarden has no available replicas — password manager down"
|
||||||
- alert: RedisBackupStale
|
- alert: RedisBackupStale
|
||||||
expr: (time() - kube_cronjob_status_last_successful_time{cronjob="redis-backup", namespace="redis"}) > 14400
|
expr: (time() - kube_cronjob_status_last_successful_time{cronjob="redis-backup", namespace="redis"}) > 691200
|
||||||
for: 30m
|
for: 30m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Redis backup is {{ $value | humanizeDuration }} old (threshold: 4h)"
|
summary: "Redis backup is {{ $value | humanizeDuration }} old (threshold: 8d)"
|
||||||
- alert: RedisBackupNeverSucceeded
|
- alert: RedisBackupNeverSucceeded
|
||||||
expr: kube_cronjob_status_last_successful_time{cronjob="redis-backup", namespace="redis"} == 0
|
expr: kube_cronjob_status_last_successful_time{cronjob="redis-backup", namespace="redis"} == 0
|
||||||
for: 1h
|
for: 1h
|
||||||
|
|
@ -643,12 +651,12 @@ serverFiles:
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Redis backup CronJob has never completed successfully"
|
summary: "Redis backup CronJob has never completed successfully"
|
||||||
- alert: PrometheusBackupStale
|
- alert: PrometheusBackupStale
|
||||||
expr: (time() - prometheus_backup_last_success_timestamp{job="prometheus-backup"}) > 129600
|
expr: (time() - prometheus_backup_last_success_timestamp{job="prometheus-backup"}) > 2764800
|
||||||
for: 30m
|
for: 30m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Prometheus backup is {{ $value | humanizeDuration }} old (threshold: 36h)"
|
summary: "Prometheus backup is {{ $value | humanizeDuration }} old (threshold: 32d)"
|
||||||
- alert: PrometheusBackupNeverRun
|
- alert: PrometheusBackupNeverRun
|
||||||
expr: absent(prometheus_backup_last_success_timestamp{job="prometheus-backup"})
|
expr: absent(prometheus_backup_last_success_timestamp{job="prometheus-backup"})
|
||||||
for: 48h
|
for: 48h
|
||||||
|
|
@ -656,6 +664,27 @@ serverFiles:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Prometheus backup has never reported a successful run"
|
summary: "Prometheus backup has never reported a successful run"
|
||||||
|
- alert: CloudSyncStale
|
||||||
|
expr: (time() - cloudsync_last_success_timestamp{job="cloudsync-monitor"}) > 691200
|
||||||
|
for: 1h
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: "Cloud Sync task {{ $labels.task_id }} is {{ $value | humanizeDuration }} old (threshold: 8d) — off-site backups may have stopped"
|
||||||
|
- alert: CloudSyncNeverRun
|
||||||
|
expr: absent(cloudsync_last_success_timestamp{job="cloudsync-monitor"})
|
||||||
|
for: 48h
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "Cloud Sync monitor has never reported — check cloudsync-monitor CronJob"
|
||||||
|
- alert: CloudSyncFailing
|
||||||
|
expr: cloudsync_job_state{job="cloudsync-monitor"} == 0
|
||||||
|
for: 6h
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "Cloud Sync task {{ $labels.task_id }} last state was not SUCCESS"
|
||||||
- alert: CSIDriverCrashLoop
|
- alert: CSIDriverCrashLoop
|
||||||
expr: kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", namespace=~"nfs-csi|iscsi-csi"} > 0
|
expr: kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", namespace=~"nfs-csi|iscsi-csi"} > 0
|
||||||
for: 10m
|
for: 10m
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue