add backup IO logging, Pushgateway metrics, and Grafana dashboard

- Add /proc/self/io read/write tracking to vault raft-backup and etcd backup
- Push backup_duration_seconds, backup_read_bytes, backup_written_bytes,
  backup_last_success_timestamp to Pushgateway from all 6 backup CronJobs
  (etcd skipped — distroless image has no wget/curl)
- Add cloudsync_duration_seconds metric to cloudsync-monitor
- New "Backup Health" Grafana dashboard with 8 panels: time since last backup,
  overview table, duration/IO trends, cloud sync status, alerts, CronJob schedule
This commit is contained in:
Viktor Barzin 2026-03-23 12:19:01 +02:00
parent 0b595751c5
commit 0a294a30a6
8 changed files with 530 additions and 8 deletions

View file

@ -340,6 +340,10 @@ resource "kubernetes_cron_job_v1" "mysql-backup" {
}
command = ["/bin/bash", "-c", <<-EOT
set -euxo pipefail
_t0=$(date +%s)
_rb0=$(awk '/^read_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0)
_wb0=$(awk '/^write_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0)
export now=$(date +"%Y_%m_%d_%H_%M")
mysqldump --all-databases -u root --host mysql.dbaas.svc.cluster.local | gzip -9 > /backup/dump_$now.sql.gz
@ -347,7 +351,22 @@ resource "kubernetes_cron_job_v1" "mysql-backup" {
cd /backup
find . -name "dump_*.sql.gz" -type f -mtime +14 -delete
find . -name "dump_*.sql" -type f -mtime +14 -delete # clean up old uncompressed
echo Done
_dur=$(($(date +%s) - _t0))
_rb1=$(awk '/^read_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0)
_wb1=$(awk '/^write_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0)
echo "=== Backup IO Stats ==="
echo "duration: $${_dur}s"
echo "read: $(( (_rb1 - _rb0) / 1048576 )) MiB"
echo "written: $(( (_wb1 - _wb0) / 1048576 )) MiB"
echo "output: $(ls -lh /backup/dump_$now.sql.gz | awk '{print $5}')"
curl -sf --data-binary @- "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/mysql-backup" <<PGEOF || true
backup_duration_seconds $${_dur}
backup_read_bytes $(( _rb1 - _rb0 ))
backup_written_bytes $(( _wb1 - _wb0 ))
backup_last_success_timestamp $(date +%s)
PGEOF
EOT
]
# To restore (from outside of the cluster):
@ -1077,6 +1096,10 @@ resource "kubernetes_cron_job_v1" "postgresql-backup" {
}
command = ["/bin/bash", "-c", <<-EOT
set -euxo pipefail
_t0=$(date +%s)
_rb0=$(awk '/^read_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0)
_wb0=$(awk '/^write_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0)
export now=$(date +"%Y_%m_%d_%H_%M")
PGPASSWORD=$PGPASSWORD pg_dumpall -h postgresql.dbaas -U postgres | gzip -9 > /backup/dump_$now.sql.gz
@ -1084,7 +1107,22 @@ resource "kubernetes_cron_job_v1" "postgresql-backup" {
cd /backup
find . -name "dump_*.sql.gz" -type f -mtime +14 -delete
find . -name "dump_*.sql" -type f -mtime +14 -delete # clean up old uncompressed
echo Done
_dur=$(($(date +%s) - _t0))
_rb1=$(awk '/^read_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0)
_wb1=$(awk '/^write_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0)
echo "=== Backup IO Stats ==="
echo "duration: $${_dur}s"
echo "read: $(( (_rb1 - _rb0) / 1048576 )) MiB"
echo "written: $(( (_wb1 - _wb0) / 1048576 )) MiB"
echo "output: $(ls -lh /backup/dump_$now.sql.gz | awk '{print $5}')"
curl -sf --data-binary @- "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/postgresql-backup" <<PGEOF || true
backup_duration_seconds $${_dur}
backup_read_bytes $(( _rb1 - _rb0 ))
backup_written_bytes $(( _wb1 - _wb0 ))
backup_last_success_timestamp $(date +%s)
PGEOF
EOT
]
volume_mount {