add backup_output_bytes metric and cloudsync_transferred_bytes to backup dashboard

- All 7 backup CronJobs now push backup_output_bytes (file size after backup)
- Cloud Sync monitor parses rclone transfer stats into cloudsync_transferred_bytes
- Grafana dashboard: new Output (MiB) table column, Output Size Trend panel,
  Write Throughput panel, Cloud Sync Transfer Volume bargauge
- All timeseries panels use points-only draw style (discrete backup snapshots)
- etcd backup restructured: init_container for etcdctl (distroless image),
  busybox sidecar for metrics push + purge, ClusterFirstWithHostNet DNS
- Fixed pre-existing curl missing in postgres:16.4-bullseye (immich, dbaas PG)
- Fixed grep -oP not available in alpine/busybox (cloud sync monitor)
This commit is contained in:
Viktor Barzin 2026-03-25 10:44:53 +02:00
parent f289f76882
commit d20c5e5535
8 changed files with 186 additions and 46 deletions

View file

@ -84,6 +84,10 @@
"matcher": { "id": "byName", "options": "Written (MiB)" },
"properties": [{ "id": "unit", "value": "decmbytes" }]
},
{
"matcher": { "id": "byName", "options": "Output (MiB)" },
"properties": [{ "id": "unit", "value": "decmbytes" }]
},
{
"matcher": { "id": "byName", "options": "Last Success" },
"properties": [{ "id": "unit", "value": "dateTimeFromNow" }]
@ -106,6 +110,7 @@
"Value #Duration": "Duration (s)",
"Value #Read": "Read (MiB)",
"Value #Written": "Written (MiB)",
"Value #Output": "Output (MiB)",
"Value #LastSuccess": "Last Success",
"job": "Backup"
},
@ -139,6 +144,13 @@
"instant": true,
"format": "table"
},
{
"expr": "backup_output_bytes / 1048576",
"legendFormat": "{{ job }}",
"refId": "Output",
"instant": true,
"format": "table"
},
{
"expr": "backup_last_success_timestamp * 1000",
"legendFormat": "{{ job }}",
@ -157,11 +169,9 @@
"defaults": {
"unit": "s",
"custom": {
"drawStyle": "line",
"lineWidth": 2,
"fillOpacity": 10,
"pointSize": 5,
"showPoints": "auto"
"drawStyle": "points",
"pointSize": 8,
"showPoints": "always"
}
},
"overrides": []
@ -187,11 +197,9 @@
"defaults": {
"unit": "bytes",
"custom": {
"drawStyle": "line",
"lineWidth": 2,
"fillOpacity": 10,
"pointSize": 5,
"showPoints": "auto"
"drawStyle": "points",
"pointSize": 8,
"showPoints": "always"
}
},
"overrides": []
@ -213,10 +221,68 @@
}
]
},
{
"title": "Backup Output Size Trend",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 22 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"unit": "bytes",
"custom": {
"drawStyle": "points",
"pointSize": 8,
"showPoints": "always"
}
},
"overrides": []
},
"options": {
"tooltip": { "mode": "multi", "sort": "desc" },
"legend": { "displayMode": "table", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
},
"targets": [
{
"expr": "backup_output_bytes",
"legendFormat": "{{ job }}",
"refId": "A"
}
]
},
{
"title": "Write Throughput",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 22 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"unit": "Bps",
"custom": {
"drawStyle": "bars",
"lineWidth": 1,
"fillOpacity": 50,
"pointSize": 5,
"showPoints": "never"
}
},
"overrides": []
},
"options": {
"tooltip": { "mode": "multi", "sort": "desc" },
"legend": { "displayMode": "table", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
},
"targets": [
{
"expr": "backup_written_bytes / backup_duration_seconds",
"legendFormat": "{{ job }}",
"refId": "A"
}
]
},
{
"title": "Cloud Sync Status",
"type": "stat",
"gridPos": { "h": 6, "w": 12, "x": 0, "y": 22 },
"gridPos": { "h": 6, "w": 12, "x": 0, "y": 30 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
@ -250,7 +316,7 @@
{
"title": "Cloud Sync Duration",
"type": "stat",
"gridPos": { "h": 6, "w": 12, "x": 12, "y": 22 },
"gridPos": { "h": 6, "w": 12, "x": 12, "y": 30 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
@ -280,10 +346,43 @@
}
]
},
{
"title": "Cloud Sync Transfer Volume",
"type": "bargauge",
"gridPos": { "h": 6, "w": 24, "x": 0, "y": 36 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"unit": "bytes",
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 1073741824 },
{ "color": "red", "value": 10737418240 }
]
}
},
"overrides": []
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"displayMode": "gradient",
"orientation": "horizontal",
"showUnfilled": true
},
"targets": [
{
"expr": "cloudsync_transferred_bytes",
"legendFormat": "Task {{ task_id }}",
"refId": "A"
}
]
},
{
"title": "Active Backup Alerts",
"type": "alertlist",
"gridPos": { "h": 6, "w": 24, "x": 0, "y": 28 },
"gridPos": { "h": 6, "w": 24, "x": 0, "y": 42 },
"datasource": { "type": "datasource", "uid": "grafana" },
"options": {
"showOptions": "current",
@ -305,7 +404,7 @@
{
"title": "CronJob Last Schedule",
"type": "table",
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 34 },
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 48 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {},

View file

@ -146,7 +146,15 @@ resource "kubernetes_cron_job_v1" "cloudsync_monitor" {
# Extract transfer stats from job progress description (rclone output)
JOB_PROGRESS=$(echo "$task" | jq -r '.job.progress.description // ""')
BYTES_TX=$(echo "$JOB_PROGRESS" | grep -oP 'Transferred:\s+[\d.]+ \w+' | head -1 | awk '{print $2}' || echo 0)
# Convert the first "Transferred: <number> <unit>" figure found in rclone
# stats text into a whole number of bytes.
# Arguments: $1 - progress description text (may be empty)
# Sets:      TX_NUM   - numeric part as printed (0 when nothing parses)
#            TX_UNIT  - unit token following the number (Bytes when absent)
#            TX_MULT  - multiplier chosen for TX_UNIT
#            TRANSFERRED_BYTES - TX_NUM * TX_MULT rounded to an integer
# All units are treated as powers of 1024, as before. Besides B..GiB this
# also recognises TiB/PiB and the attached single-letter / "xBytes" suffixes
# (e.g. "398.5M / 1.016 GBytes"); previously those fell through to the
# default multiplier of 1 and a multi-terabyte sync was reported as a few bytes.
# NOTE(review): the single-letter / "xBytes" forms are what older rclone
# releases appear to print - confirm against the rclone version in use.
# NOTE(review): written without dollar-brace parameter expansion on purpose;
# this script appears to live inside a Terraform template where that syntax
# needs escaping - confirm before reintroducing it.
cloudsync_parse_transfer() {
  TX_NUM=$(printf '%s\n' "$1" | sed -n 's/.*Transferred:[[:space:]]*\([0-9.]*\).*/\1/p' | head -1)
  [ -n "$TX_NUM" ] || TX_NUM=0
  TX_UNIT=$(printf '%s\n' "$1" | sed -n 's/.*Transferred:[[:space:]]*[0-9.]*[[:space:]]*\([A-Za-z]*\).*/\1/p' | head -1)
  [ -n "$TX_UNIT" ] || TX_UNIT=Bytes
  case "$TX_UNIT" in
    Bytes|Byte|B)           TX_MULT=1 ;;
    KiB|kB|KB|k|K|kBytes)   TX_MULT=1024 ;;
    MiB|MB|M|MBytes)        TX_MULT=1048576 ;;
    GiB|GB|G|GBytes)        TX_MULT=1073741824 ;;
    TiB|TB|T|TBytes)        TX_MULT=1099511627776 ;;
    PiB|PB|P|PBytes)        TX_MULT=1125899906842624 ;;
    # Unknown token: keep the raw number rather than guessing a scale.
    *)                      TX_MULT=1 ;;
  esac
  # awk does the floating-point multiply; %.0f rounds to an integer sample value.
  TRANSFERRED_BYTES=$(printf '%s %s\n' "$TX_NUM" "$TX_MULT" | awk '{printf "%.0f", $1 * $2}')
}
cloudsync_parse_transfer "$JOB_PROGRESS"
JOB_STARTED=$(echo "$task" | jq -r '.job.time_started."$date" // 0')
JOB_FINISHED=$(echo "$task" | jq -r '.job.time_finished."$date" // 0')
if [ "$JOB_STARTED" != "0" ] && [ "$JOB_STARTED" != "null" ] && [ "$JOB_FINISHED" != "0" ] && [ "$JOB_FINISHED" != "null" ]; then
@ -168,6 +176,9 @@ resource "kubernetes_cron_job_v1" "cloudsync_monitor" {
# HELP cloudsync_duration_seconds Duration of the last Cloud Sync run
# TYPE cloudsync_duration_seconds gauge
cloudsync_duration_seconds $SYNC_DURATION
# HELP cloudsync_transferred_bytes Bytes transferred during Cloud Sync run
# TYPE cloudsync_transferred_bytes gauge
cloudsync_transferred_bytes $TRANSFERRED_BYTES
METRICS
done