add backup_output_bytes metric and cloudsync_transferred_bytes to backup dashboard

- All 7 backup CronJobs now push backup_output_bytes (file size after backup)
- Cloud Sync monitor parses rclone transfer stats into cloudsync_transferred_bytes
- Grafana dashboard: new Output (MiB) table column, Output Size Trend panel,
  Write Throughput panel, Cloud Sync Transfer Volume bargauge
- All timeseries panels use points-only draw style (discrete backup snapshots)
- etcd backup restructured: init_container for etcdctl (distroless image),
  busybox sidecar for metrics push + purge, ClusterFirstWithHostNet DNS
- Fixed pre-existing bug: curl was missing from the postgres:16.4-bullseye image (immich, dbaas PG)
- Fixed cloud sync monitor: replaced grep -oP, which is not available in alpine/busybox, with sed
This commit is contained in:
Viktor Barzin 2026-03-25 10:44:53 +02:00
parent f289f76882
commit d20c5e5535
8 changed files with 186 additions and 46 deletions

View file

@ -361,10 +361,12 @@ resource "kubernetes_cron_job_v1" "mysql-backup" {
echo "written: $(( (_wb1 - _wb0) / 1048576 )) MiB"
echo "output: $(ls -lh /backup/dump_$now.sql.gz | awk '{print $5}')"
_out_bytes=$(stat -c%s /backup/dump_$now.sql.gz)
curl -sf --data-binary @- "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/mysql-backup" <<PGEOF || true
backup_duration_seconds $${_dur}
backup_read_bytes $(( _rb1 - _rb0 ))
backup_written_bytes $(( _wb1 - _wb0 ))
backup_output_bytes $${_out_bytes}
backup_last_success_timestamp $(date +%s)
PGEOF
EOT
@ -1096,6 +1098,7 @@ resource "kubernetes_cron_job_v1" "postgresql-backup" {
}
command = ["/bin/bash", "-c", <<-EOT
set -euxo pipefail
apt-get update -qq && apt-get install -yqq curl >/dev/null 2>&1 || true
_t0=$(date +%s)
_rb0=$(awk '/^read_bytes/{print $2}' /proc/$$/io 2>/dev/null || echo 0)
_wb0=$(awk '/^write_bytes/{print $2}' /proc/$$/io 2>/dev/null || echo 0)
@ -1117,10 +1120,12 @@ resource "kubernetes_cron_job_v1" "postgresql-backup" {
echo "written: $(( (_wb1 - _wb0) / 1048576 )) MiB"
echo "output: $(ls -lh /backup/dump_$now.sql.gz | awk '{print $5}')"
_out_bytes=$(stat -c%s /backup/dump_$now.sql.gz)
curl -sf --data-binary @- "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/postgresql-backup" <<PGEOF || true
backup_duration_seconds $${_dur}
backup_read_bytes $(( _rb1 - _rb0 ))
backup_written_bytes $(( _wb1 - _wb0 ))
backup_output_bytes $${_out_bytes}
backup_last_success_timestamp $(date +%s)
PGEOF
EOT

View file

@ -684,6 +684,7 @@ resource "kubernetes_cron_job_v1" "postgresql-backup" {
name = "postgresql-backup"
image = "postgres:16.4-bullseye"
command = ["/bin/sh", "-c", <<-EOT
apt-get update -qq && apt-get install -yqq curl >/dev/null 2>&1 || true
_t0=$(date +%s)
_rb0=$(awk '/^read_bytes/{print $2}' /proc/$$/io 2>/dev/null || echo 0)
_wb0=$(awk '/^write_bytes/{print $2}' /proc/$$/io 2>/dev/null || echo 0)
@ -704,10 +705,12 @@ resource "kubernetes_cron_job_v1" "postgresql-backup" {
echo "written: $(( (_wb1 - _wb0) / 1048576 )) MiB"
echo "output: $(ls -lh /backup/dump_$now.sql | awk '{print $5}')"
_out_bytes=$(stat -c%s /backup/dump_$now.sql)
curl -sf --data-binary @- "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/immich-postgresql-backup" <<PGEOF || true
backup_duration_seconds $${_dur}
backup_read_bytes $(( _rb1 - _rb0 ))
backup_written_bytes $(( _wb1 - _wb0 ))
backup_output_bytes $${_out_bytes}
backup_last_success_timestamp $(date +%s)
PGEOF
EOT

View file

@ -98,38 +98,40 @@ resource "kubernetes_cron_job_v1" "backup-etcd" {
node_name = "k8s-master"
priority_class_name = "system-cluster-critical"
host_network = true
container {
dns_policy = "ClusterFirstWithHostNet"
init_container {
name = "backup-etcd"
image = "registry.k8s.io/etcd:3.5.21-0"
command = ["/bin/sh", "-c"]
args = [<<-EOT
set -eu
_t0=$(date +%s)
_rb0=$(awk '/^read_bytes/{print $2}' /proc/$$/io 2>/dev/null || echo 0)
_wb0=$(awk '/^write_bytes/{print $2}' /proc/$$/io 2>/dev/null || echo 0)
TIMESTAMP=$(date +%Y%m%d-%H%M%S)
ETCDCTL_API=3 etcdctl \
--endpoints=https://127.0.0.1:2379 \
--cacert=/etc/kubernetes/pki/etcd/ca.crt \
--cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt \
--key=/etc/kubernetes/pki/etcd/healthcheck-client.key \
snapshot save /backup/etcd-snapshot-$TIMESTAMP.db
_dur=$(($(date +%s) - _t0))
_rb1=$(awk '/^read_bytes/{print $2}' /proc/$$/io 2>/dev/null || echo 0)
_wb1=$(awk '/^write_bytes/{print $2}' /proc/$$/io 2>/dev/null || echo 0)
echo "=== Backup IO Stats ==="
echo "duration: $${_dur}s"
echo "read: $(( (_rb1 - _rb0) / 1048576 )) MiB"
echo "written: $(( (_wb1 - _wb0) / 1048576 )) MiB"
echo "output: $(ls -lh /backup/etcd-snapshot-$TIMESTAMP.db | awk '{print $5}')"
EOT
]
command = ["etcdctl", "snapshot", "save", "/backup/etcd-snapshot-latest.db"]
resources {
requests = {
memory = "256Mi"
cpu = "50m"
}
limits = {
memory = "512Mi"
}
}
env {
name = "ETCDCTL_API"
value = "3"
}
env {
name = "ETCDCTL_ENDPOINTS"
value = "https://127.0.0.1:2379"
}
env {
name = "ETCDCTL_CACERT"
value = "/etc/kubernetes/pki/etcd/ca.crt"
}
env {
name = "ETCDCTL_CERT"
value = "/etc/kubernetes/pki/etcd/healthcheck-client.crt"
}
env {
name = "ETCDCTL_KEY"
value = "/etc/kubernetes/pki/etcd/healthcheck-client.key"
}
volume_mount {
mount_path = "/backup"
name = "backup"
@ -141,11 +143,26 @@ resource "kubernetes_cron_job_v1" "backup-etcd" {
}
}
container {
name = "backup-purge"
image = "busybox:1.31.1"
command = ["/bin/sh"]
args = ["-c", "find /backup -type f -mtime +30 -name '*.db' -exec rm -- '{}' \\;"]
name = "backup-manage"
image = "busybox:1.37"
command = ["/bin/sh", "-c"]
args = [<<-EOT
set -eu
# Rename snapshot with timestamp
TIMESTAMP=$(date +%Y%m%d-%H%M%S)
mv /backup/etcd-snapshot-latest.db /backup/etcd-snapshot-$TIMESTAMP.db
_out_bytes=$(stat -c%s /backup/etcd-snapshot-$TIMESTAMP.db 2>/dev/null || echo 0)
echo "Backup done: etcd-snapshot-$TIMESTAMP.db ($${_out_bytes} bytes)"
# Rotate 30 day retention
find /backup -type f -mtime +30 -name '*.db' -exec rm -- '{}' \;
# Push metrics to Pushgateway
wget -qO- --post-data "backup_output_bytes $${_out_bytes}
backup_last_success_timestamp $(date +%s)
" "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/backup-etcd" || true
EOT
]
volume_mount {
mount_path = "/backup"
name = "backup"

View file

@ -84,6 +84,10 @@
"matcher": { "id": "byName", "options": "Written (MiB)" },
"properties": [{ "id": "unit", "value": "decmbytes" }]
},
{
"matcher": { "id": "byName", "options": "Output (MiB)" },
"properties": [{ "id": "unit", "value": "decmbytes" }]
},
{
"matcher": { "id": "byName", "options": "Last Success" },
"properties": [{ "id": "unit", "value": "dateTimeFromNow" }]
@ -106,6 +110,7 @@
"Value #Duration": "Duration (s)",
"Value #Read": "Read (MiB)",
"Value #Written": "Written (MiB)",
"Value #Output": "Output (MiB)",
"Value #LastSuccess": "Last Success",
"job": "Backup"
},
@ -139,6 +144,13 @@
"instant": true,
"format": "table"
},
{
"expr": "backup_output_bytes / 1048576",
"legendFormat": "{{ job }}",
"refId": "Output",
"instant": true,
"format": "table"
},
{
"expr": "backup_last_success_timestamp * 1000",
"legendFormat": "{{ job }}",
@ -157,11 +169,9 @@
"defaults": {
"unit": "s",
"custom": {
"drawStyle": "line",
"lineWidth": 2,
"fillOpacity": 10,
"pointSize": 5,
"showPoints": "auto"
"drawStyle": "points",
"pointSize": 8,
"showPoints": "always"
}
},
"overrides": []
@ -187,11 +197,9 @@
"defaults": {
"unit": "bytes",
"custom": {
"drawStyle": "line",
"lineWidth": 2,
"fillOpacity": 10,
"pointSize": 5,
"showPoints": "auto"
"drawStyle": "points",
"pointSize": 8,
"showPoints": "always"
}
},
"overrides": []
@ -213,10 +221,68 @@
}
]
},
{
"title": "Backup Output Size Trend",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 22 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"unit": "bytes",
"custom": {
"drawStyle": "points",
"pointSize": 8,
"showPoints": "always"
}
},
"overrides": []
},
"options": {
"tooltip": { "mode": "multi", "sort": "desc" },
"legend": { "displayMode": "table", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
},
"targets": [
{
"expr": "backup_output_bytes",
"legendFormat": "{{ job }}",
"refId": "A"
}
]
},
{
"title": "Write Throughput",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 22 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"unit": "Bps",
"custom": {
"drawStyle": "bars",
"lineWidth": 1,
"fillOpacity": 50,
"pointSize": 5,
"showPoints": "never"
}
},
"overrides": []
},
"options": {
"tooltip": { "mode": "multi", "sort": "desc" },
"legend": { "displayMode": "table", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
},
"targets": [
{
  "expr": "backup_written_bytes / clamp_min(backup_duration_seconds, 1)",
  "legendFormat": "{{ job }}",
  "refId": "A"
}
]
},
{
"title": "Cloud Sync Status",
"type": "stat",
"gridPos": { "h": 6, "w": 12, "x": 0, "y": 22 },
"gridPos": { "h": 6, "w": 12, "x": 0, "y": 30 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
@ -250,7 +316,7 @@
{
"title": "Cloud Sync Duration",
"type": "stat",
"gridPos": { "h": 6, "w": 12, "x": 12, "y": 22 },
"gridPos": { "h": 6, "w": 12, "x": 12, "y": 30 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
@ -280,10 +346,43 @@
}
]
},
{
"title": "Cloud Sync Transfer Volume",
"type": "bargauge",
"gridPos": { "h": 6, "w": 24, "x": 0, "y": 36 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"unit": "bytes",
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 1073741824 },
{ "color": "red", "value": 10737418240 }
]
}
},
"overrides": []
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"displayMode": "gradient",
"orientation": "horizontal",
"showUnfilled": true
},
"targets": [
{
"expr": "cloudsync_transferred_bytes",
"legendFormat": "Task {{ task_id }}",
"refId": "A"
}
]
},
{
"title": "Active Backup Alerts",
"type": "alertlist",
"gridPos": { "h": 6, "w": 24, "x": 0, "y": 28 },
"gridPos": { "h": 6, "w": 24, "x": 0, "y": 42 },
"datasource": { "type": "datasource", "uid": "grafana" },
"options": {
"showOptions": "current",
@ -305,7 +404,7 @@
{
"title": "CronJob Last Schedule",
"type": "table",
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 34 },
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 48 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {},

View file

@ -146,7 +146,15 @@ resource "kubernetes_cron_job_v1" "cloudsync_monitor" {
# Extract transfer stats from job progress description (rclone output)
JOB_PROGRESS=$(echo "$task" | jq -r '.job.progress.description // ""')
BYTES_TX=$(echo "$JOB_PROGRESS" | grep -oP 'Transferred:\s+[\d.]+ \w+' | head -1 | awk '{print $2}' || echo 0)
TX_NUM=$(echo "$JOB_PROGRESS" | sed -n 's/.*Transferred:[[:space:]]*\([0-9.]*\).*/\1/p' | head -1)
TX_NUM=$${TX_NUM:-0}
TX_UNIT=$(echo "$JOB_PROGRESS" | sed -n 's/.*Transferred:[[:space:]]*[0-9.]*[[:space:]]*\([A-Za-z]*\).*/\1/p' | head -1)
TX_UNIT=$${TX_UNIT:-Bytes}
# Map the size suffix parsed from the rclone "Transferred:" line to a byte
# multiplier. All suffixes are treated as binary (1024-based), matching the
# existing KiB/kB handling. TiB is included so multi-terabyte syncs are not
# reported as a handful of bytes; the single-letter forms cover the compact
# suffixes (e.g. "1.5G") printed by older rclone releases.
# Unknown suffixes fall back to a multiplier of 1.
case "$TX_UNIT" in
  Bytes|B)      TX_MULT=1 ;;
  KiB|kB|k|K)   TX_MULT=1024 ;;
  MiB|MB|M)     TX_MULT=1048576 ;;
  GiB|GB|G)     TX_MULT=1073741824 ;;
  TiB|TB|T)     TX_MULT=1099511627776 ;;
  *)            TX_MULT=1 ;;
esac
TRANSFERRED_BYTES=$(echo "$TX_NUM $TX_MULT" | awk '{printf "%.0f", $1 * $2}')
JOB_STARTED=$(echo "$task" | jq -r '.job.time_started."$date" // 0')
JOB_FINISHED=$(echo "$task" | jq -r '.job.time_finished."$date" // 0')
if [ "$JOB_STARTED" != "0" ] && [ "$JOB_STARTED" != "null" ] && [ "$JOB_FINISHED" != "0" ] && [ "$JOB_FINISHED" != "null" ]; then
@ -168,6 +176,9 @@ resource "kubernetes_cron_job_v1" "cloudsync_monitor" {
# HELP cloudsync_duration_seconds Duration of the last Cloud Sync run
# TYPE cloudsync_duration_seconds gauge
cloudsync_duration_seconds $SYNC_DURATION
# HELP cloudsync_transferred_bytes Bytes transferred during Cloud Sync run
# TYPE cloudsync_transferred_bytes gauge
cloudsync_transferred_bytes $TRANSFERRED_BYTES
METRICS
done

View file

@ -305,9 +305,11 @@ resource "kubernetes_cron_job_v1" "redis-backup" {
echo "written: $(( (_wb1 - _wb0) / 1048576 )) MiB"
# Log the human-readable size of the backup file. A single "$" is required:
# Terraform only unescapes "$$" when it is followed by "{", so "$$TIMESTAMP"
# would reach the shell as the PID followed by the literal text TIMESTAMP.
echo "output: $(ls -lh /backup/redis-$TIMESTAMP.rdb | awk '{print $5}')"
_out_bytes=$(stat -c%s /backup/redis-$TIMESTAMP.rdb)
wget -qO- --post-data "backup_duration_seconds $${_dur}
backup_read_bytes $(( _rb1 - _rb0 ))
backup_written_bytes $(( _wb1 - _wb0 ))
backup_output_bytes $${_out_bytes}
backup_last_success_timestamp $(date +%s)
" "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/redis-backup" || true
EOT

View file

@ -283,7 +283,8 @@ resource "kubernetes_cron_job_v1" "vault_backup" {
"echo \"read: $(( (_rb1 - _rb0) / 1048576 )) MiB\"; ",
"echo \"written: $(( (_wb1 - _wb0) / 1048576 )) MiB\"; ",
"echo \"output: $(ls -lh /backup/vault-raft-$TIMESTAMP.db | awk '{print $5}')\"; ",
"wget -qO- --post-data \"backup_duration_seconds $${_dur}\nbackup_read_bytes $((_rb1 - _rb0))\nbackup_written_bytes $((_wb1 - _wb0))\nbackup_last_success_timestamp $(date +%s)\n\" ",
"_out_bytes=$(stat -c%s /backup/vault-raft-$TIMESTAMP.db); ",
"wget -qO- --post-data \"backup_duration_seconds $${_dur}\nbackup_read_bytes $((_rb1 - _rb0))\nbackup_written_bytes $((_wb1 - _wb0))\nbackup_output_bytes $${_out_bytes}\nbackup_last_success_timestamp $(date +%s)\n\" ",
"\"http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/vault-raft-backup\" || true"
])]
volume_mount {

View file

@ -283,9 +283,11 @@ resource "kubernetes_cron_job_v1" "vaultwarden-backup" {
echo "written: $(( (_wb1 - _wb0) / 1048576 )) MiB"
# Log the human-readable size of the backup directory. Single "$" is required
# for both the shell variable and the awk field: Terraform only unescapes "$$"
# when it is followed by "{", so "$$now" / "$$1" would be passed through as-is.
echo "output: $(du -sh /backup/$now | awk '{print $1}')"
_out_bytes=$(du -sb /backup/$now | awk '{print $1}')
wget -qO- --post-data "backup_duration_seconds $${_dur}
backup_read_bytes $(( _rb1 - _rb0 ))
backup_written_bytes $(( _wb1 - _wb0 ))
backup_output_bytes $${_out_bytes}
backup_last_success_timestamp $(date +%s)
" "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/vaultwarden-backup" || true
EOT