From d20c5e553562d1e99dda9a67d2d6e47f4f6921f2 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Wed, 25 Mar 2026 10:44:53 +0200 Subject: [PATCH] add backup_output_bytes metric and cloudsync_transferred_bytes to backup dashboard - All 7 backup CronJobs now push backup_output_bytes (file size after backup) - Cloud Sync monitor parses rclone transfer stats into cloudsync_transferred_bytes - Grafana dashboard: new Output (MiB) table column, Output Size Trend panel, Write Throughput panel, Cloud Sync Transfer Volume bargauge - All timeseries panels use points-only draw style (discrete backup snapshots) - etcd backup restructured: init_container for etcdctl (distroless image), busybox sidecar for metrics push + purge, ClusterFirstWithHostNet DNS - Fixed pre-existing curl missing in postgres:16.4-bullseye (immich, dbaas PG) - Fixed grep -oP not available in alpine/busybox (cloud sync monitor) --- stacks/dbaas/modules/dbaas/main.tf | 5 + stacks/immich/main.tf | 3 + .../modules/infra-maintenance/main.tf | 77 ++++++----- .../monitoring/dashboards/backup_health.json | 127 ++++++++++++++++-- stacks/monitoring/modules/monitoring/main.tf | 13 +- stacks/redis/modules/redis/main.tf | 2 + stacks/vault/main.tf | 3 +- .../vaultwarden/modules/vaultwarden/main.tf | 2 + 8 files changed, 186 insertions(+), 46 deletions(-) diff --git a/stacks/dbaas/modules/dbaas/main.tf b/stacks/dbaas/modules/dbaas/main.tf index 54e1d5fb..a0157235 100644 --- a/stacks/dbaas/modules/dbaas/main.tf +++ b/stacks/dbaas/modules/dbaas/main.tf @@ -361,10 +361,12 @@ resource "kubernetes_cron_job_v1" "mysql-backup" { echo "written: $(( (_wb1 - _wb0) / 1048576 )) MiB" echo "output: $(ls -lh /backup/dump_$now.sql.gz | awk '{print $5}')" + _out_bytes=$(stat -c%s /backup/dump_$now.sql.gz) curl -sf --data-binary @- "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/mysql-backup" </dev/null 2>&1 || true _t0=$(date +%s) _rb0=$(awk '/^read_bytes/{print $2}' /proc/$$/io 2>/dev/null || echo 0) _wb0=$(awk '/^write_bytes/{print $2}' /proc/$$/io 2>/dev/null || echo 0) @@ -1117,10 +1120,12 @@ resource "kubernetes_cron_job_v1" "postgresql-backup" { echo "written: $(( (_wb1 - _wb0) / 1048576 )) MiB" echo "output: $(ls -lh /backup/dump_$now.sql.gz | awk '{print $5}')" + _out_bytes=$(stat -c%s /backup/dump_$now.sql.gz) curl -sf --data-binary @- "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/postgresql-backup" </dev/null 2>&1 || true _t0=$(date +%s) _rb0=$(awk '/^read_bytes/{print $2}' /proc/$$/io 2>/dev/null || echo 0) _wb0=$(awk '/^write_bytes/{print $2}' /proc/$$/io 2>/dev/null || echo 0) @@ -704,10 +705,12 @@ resource "kubernetes_cron_job_v1" "postgresql-backup" { echo "written: $(( (_wb1 - _wb0) / 1048576 )) MiB" echo "output: $(ls -lh /backup/dump_$now.sql | awk '{print $5}')" + _out_bytes=$(stat -c%s /backup/dump_$now.sql) curl -sf --data-binary @- "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/immich-postgresql-backup" </dev/null || echo 0) - _wb0=$(awk '/^write_bytes/{print $2}' /proc/$$/io 2>/dev/null || echo 0) - - TIMESTAMP=$(date +%Y%m%d-%H%M%S) - ETCDCTL_API=3 etcdctl \ - --endpoints=https://127.0.0.1:2379 \ - --cacert=/etc/kubernetes/pki/etcd/ca.crt \ - --cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt \ - --key=/etc/kubernetes/pki/etcd/healthcheck-client.key \ - snapshot save /backup/etcd-snapshot-$TIMESTAMP.db - - _dur=$(($(date +%s) - _t0)) - _rb1=$(awk '/^read_bytes/{print $2}' /proc/$$/io 2>/dev/null || echo 0) - _wb1=$(awk '/^write_bytes/{print $2}' /proc/$$/io 2>/dev/null || echo 0) - echo "=== Backup IO Stats ===" - echo "duration: $${_dur}s" - echo "read: $(( (_rb1 - _rb0) / 1048576 )) MiB" - echo "written: $(( (_wb1 - _wb0) / 1048576 )) MiB" - echo "output: $(ls -lh /backup/etcd-snapshot-$TIMESTAMP.db | awk '{print $5}')" - EOT - ] + command = ["etcdctl", "snapshot", "save", "/backup/etcd-snapshot-latest.db"] + resources { + requests = { + memory = "256Mi" + cpu = "50m" + } + limits = { + memory = "512Mi" + } + } env { name = "ETCDCTL_API" value = "3" } + env { + name = "ETCDCTL_ENDPOINTS" + value = "https://127.0.0.1:2379" + } + env { + name = "ETCDCTL_CACERT" + value = "/etc/kubernetes/pki/etcd/ca.crt" + } + env { + name = "ETCDCTL_CERT" + value = "/etc/kubernetes/pki/etcd/healthcheck-client.crt" + } + env { + name = "ETCDCTL_KEY" + value = "/etc/kubernetes/pki/etcd/healthcheck-client.key" + } volume_mount { mount_path = "/backup" name = "backup" @@ -141,11 +143,26 @@ resource "kubernetes_cron_job_v1" "backup-etcd" { } } container { - name = "backup-purge" - image = "busybox:1.31.1" - command = ["/bin/sh"] - args = ["-c", "find /backup -type f -mtime +30 -name '*.db' -exec rm -- '{}' \\;"] + name = "backup-manage" + image = "busybox:1.37" + command = ["/bin/sh", "-c"] + args = [<<-EOT + set -eu + # Rename snapshot with timestamp + TIMESTAMP=$(date +%Y%m%d-%H%M%S) + mv /backup/etcd-snapshot-latest.db /backup/etcd-snapshot-$TIMESTAMP.db + _out_bytes=$(stat -c%s /backup/etcd-snapshot-$TIMESTAMP.db 2>/dev/null || echo 0) + echo "Backup done: etcd-snapshot-$TIMESTAMP.db ($${_out_bytes} bytes)" + # Rotate — 30 day retention + find /backup -type f -mtime +30 -name '*.db' -exec rm -- '{}' \; + + # Push metrics to Pushgateway + wget -qO- --post-data "backup_output_bytes $${_out_bytes} + backup_last_success_timestamp $(date +%s) + " "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/backup-etcd" || true + EOT + ] volume_mount { mount_path = "/backup" name = "backup" diff --git a/stacks/monitoring/modules/monitoring/dashboards/backup_health.json b/stacks/monitoring/modules/monitoring/dashboards/backup_health.json index 2c99019c..0b46fdb1 100644 --- a/stacks/monitoring/modules/monitoring/dashboards/backup_health.json +++ b/stacks/monitoring/modules/monitoring/dashboards/backup_health.json @@ -84,6 +84,10 @@ "matcher": { "id": "byName", "options": "Written (MiB)" }, "properties": [{ "id": "unit", "value": "decmbytes" }] }, + { + "matcher": { "id": "byName", "options": "Output (MiB)" }, + "properties": [{ "id": "unit", "value": "decmbytes" }] + }, { "matcher": { "id": "byName", "options": "Last Success" }, "properties": [{ "id": "unit", "value": "dateTimeFromNow" }] @@ -106,6 +110,7 @@ "Value #Duration": "Duration (s)", "Value #Read": "Read (MiB)", "Value #Written": "Written (MiB)", + "Value #Output": "Output (MiB)", "Value #LastSuccess": "Last Success", "job": "Backup" }, @@ -139,6 +144,13 @@ "instant": true, "format": "table" }, + { + "expr": "backup_output_bytes / 1048576", + "legendFormat": "{{ job }}", + "refId": "Output", + "instant": true, + "format": "table" + }, { "expr": "backup_last_success_timestamp * 1000", "legendFormat": "{{ job }}", @@ -157,11 +169,9 @@ "defaults": { "unit": "s", "custom": { - "drawStyle": "line", - "lineWidth": 2, - "fillOpacity": 10, - "pointSize": 5, - "showPoints": "auto" + "drawStyle": "points", + "pointSize": 8, + "showPoints": "always" } }, "overrides": [] @@ -187,11 +197,9 @@ "defaults": { "unit": "bytes", "custom": { - "drawStyle": "line", - "lineWidth": 2, - "fillOpacity": 10, - "pointSize": 5, - "showPoints": "auto" + "drawStyle": "points", + "pointSize": 8, + "showPoints": "always" } }, "overrides": [] @@ -213,10 +221,68 @@ } ] }, + { + "title": "Backup Output Size Trend", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 22 }, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "unit": "bytes", + "custom": { + "drawStyle": "points", + "pointSize": 8, + "showPoints": "always" + } + }, + "overrides": [] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "table", "placement": "bottom", "calcs": ["lastNotNull", "max"] } + }, + "targets": [ + { + "expr": "backup_output_bytes", + "legendFormat": "{{ job }}", + "refId": "A" + } + ] + }, + { + "title": "Write Throughput", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 22 }, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "unit": "Bps", + "custom": { + "drawStyle": "bars", + "lineWidth": 1, + "fillOpacity": 50, + "pointSize": 5, + "showPoints": "never" + } + }, + "overrides": [] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "table", "placement": "bottom", "calcs": ["lastNotNull", "max"] } + }, + "targets": [ + { + "expr": "backup_written_bytes / backup_duration_seconds", + "legendFormat": "{{ job }}", + "refId": "A" + } + ] + }, { "title": "Cloud Sync Status", "type": "stat", - "gridPos": { "h": 6, "w": 12, "x": 0, "y": 22 }, + "gridPos": { "h": 6, "w": 12, "x": 0, "y": 30 }, "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { @@ -250,7 +316,7 @@ { "title": "Cloud Sync Duration", "type": "stat", - "gridPos": { "h": 6, "w": 12, "x": 12, "y": 22 }, + "gridPos": { "h": 6, "w": 12, "x": 12, "y": 30 }, "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { @@ -280,10 +346,43 @@ } ] }, + { + "title": "Cloud Sync Transfer Volume", + "type": "bargauge", + "gridPos": { "h": 6, "w": 24, "x": 0, "y": 36 }, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "unit": "bytes", + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1073741824 }, + { "color": "red", "value": 10737418240 } + ] + } + }, + "overrides": [] + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "displayMode": "gradient", + "orientation": "horizontal", + "showUnfilled": true + }, + "targets": [ + { + "expr": "cloudsync_transferred_bytes", + "legendFormat": "Task {{ task_id }}", + "refId": "A" + } + ] + }, { "title": "Active Backup Alerts", "type": "alertlist", - "gridPos": { "h": 6, "w": 24, "x": 0, "y": 28 }, + "gridPos": { "h": 6, "w": 24, "x": 0, "y": 42 }, "datasource": { "type": "datasource", "uid": "grafana" }, "options": { "showOptions": "current", @@ -305,7 +404,7 @@ { "title": "CronJob Last Schedule", "type": "table", - "gridPos": { "h": 8, "w": 24, "x": 0, "y": 34 }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 48 }, "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": {}, diff --git a/stacks/monitoring/modules/monitoring/main.tf b/stacks/monitoring/modules/monitoring/main.tf index a49c0587..4ffe4f0f 100644 --- a/stacks/monitoring/modules/monitoring/main.tf +++ b/stacks/monitoring/modules/monitoring/main.tf @@ -146,7 +146,15 @@ resource "kubernetes_cron_job_v1" "cloudsync_monitor" { # Extract transfer stats from job progress description (rclone output) JOB_PROGRESS=$(echo "$task" | jq -r '.job.progress.description // ""') - BYTES_TX=$(echo "$JOB_PROGRESS" | grep -oP 'Transferred:\s+[\d.]+ \w+' | head -1 | awk '{print $2}' || echo 0) + TX_NUM=$(echo "$JOB_PROGRESS" | sed -n 's/.*Transferred:[[:space:]]*\([0-9.]*\).*/\1/p' | head -1) + TX_NUM=$${TX_NUM:-0} + TX_UNIT=$(echo "$JOB_PROGRESS" | sed -n 's/.*Transferred:[[:space:]]*[0-9.]*[[:space:]]*\([A-Za-z]*\).*/\1/p' | head -1) + TX_UNIT=$${TX_UNIT:-Bytes} + case "$TX_UNIT" in + Bytes|B) TX_MULT=1 ;; KiB|kB) TX_MULT=1024 ;; MiB|MB) TX_MULT=1048576 ;; + GiB|GB) TX_MULT=1073741824 ;; *) TX_MULT=1 ;; + esac + TRANSFERRED_BYTES=$(echo "$TX_NUM $TX_MULT" | awk '{printf "%.0f", $1 * $2}') JOB_STARTED=$(echo "$task" | jq -r '.job.time_started."$date" // 0') JOB_FINISHED=$(echo "$task" | jq -r '.job.time_finished."$date" // 0') if [ "$JOB_STARTED" != "0" ] && [ "$JOB_STARTED" != "null" ] && [ "$JOB_FINISHED" != "0" ] && [ "$JOB_FINISHED" != "null" ]; then @@ -168,6 +176,9 @@ resource "kubernetes_cron_job_v1" "cloudsync_monitor" { # HELP cloudsync_duration_seconds Duration of the last Cloud Sync run # TYPE cloudsync_duration_seconds gauge cloudsync_duration_seconds $SYNC_DURATION + # HELP cloudsync_transferred_bytes Bytes transferred during Cloud Sync run + # TYPE cloudsync_transferred_bytes gauge + cloudsync_transferred_bytes $TRANSFERRED_BYTES METRICS done diff --git a/stacks/redis/modules/redis/main.tf b/stacks/redis/modules/redis/main.tf index 49b19fda..2450eda7 100644 --- a/stacks/redis/modules/redis/main.tf +++ b/stacks/redis/modules/redis/main.tf @@ -305,9 +305,11 @@ resource "kubernetes_cron_job_v1" "redis-backup" { echo "written: $(( (_wb1 - _wb0) / 1048576 )) MiB" echo "output: $(ls -lh /backup/redis-$$TIMESTAMP.rdb | awk '{print $5}')" + _out_bytes=$(stat -c%s /backup/redis-$TIMESTAMP.rdb) wget -qO- --post-data "backup_duration_seconds $${_dur} backup_read_bytes $(( _rb1 - _rb0 )) backup_written_bytes $(( _wb1 - _wb0 )) + backup_output_bytes $${_out_bytes} backup_last_success_timestamp $(date +%s) " "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/redis-backup" || true EOT diff --git a/stacks/vault/main.tf b/stacks/vault/main.tf index bb777571..15c14d6c 100644 --- a/stacks/vault/main.tf +++ b/stacks/vault/main.tf @@ -283,7 +283,8 @@ resource "kubernetes_cron_job_v1" "vault_backup" { "echo \"read: $(( (_rb1 - _rb0) / 1048576 )) MiB\"; ", "echo \"written: $(( (_wb1 - _wb0) / 1048576 )) MiB\"; ", "echo \"output: $(ls -lh /backup/vault-raft-$TIMESTAMP.db | awk '{print $5}')\"; ", - "wget -qO- --post-data \"backup_duration_seconds $${_dur}\nbackup_read_bytes $((_rb1 - _rb0))\nbackup_written_bytes $((_wb1 - _wb0))\nbackup_last_success_timestamp $(date +%s)\n\" ", + "_out_bytes=$(stat -c%s /backup/vault-raft-$TIMESTAMP.db); ", + "wget -qO- --post-data \"backup_duration_seconds $${_dur}\nbackup_read_bytes $((_rb1 - _rb0))\nbackup_written_bytes $((_wb1 - _wb0))\nbackup_output_bytes $${_out_bytes}\nbackup_last_success_timestamp $(date +%s)\n\" ", "\"http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/vault-raft-backup\" || true" ])] volume_mount { diff --git a/stacks/vaultwarden/modules/vaultwarden/main.tf b/stacks/vaultwarden/modules/vaultwarden/main.tf index b909b005..03c49fe8 100644 --- a/stacks/vaultwarden/modules/vaultwarden/main.tf +++ b/stacks/vaultwarden/modules/vaultwarden/main.tf @@ -283,9 +283,11 @@ resource "kubernetes_cron_job_v1" "vaultwarden-backup" { echo "written: $(( (_wb1 - _wb0) / 1048576 )) MiB" echo "output: $(du -sh /backup/$$now | awk '{print $$1}')" + _out_bytes=$(du -sb /backup/$now | awk '{print $1}') wget -qO- --post-data "backup_duration_seconds $${_dur} backup_read_bytes $(( _rb1 - _rb0 )) backup_written_bytes $(( _wb1 - _wb0 )) + backup_output_bytes $${_out_bytes} backup_last_success_timestamp $(date +%s) " "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/vaultwarden-backup" || true EOT