add backup IO logging, Pushgateway metrics, and Grafana dashboard

- Add /proc/self/io read/write tracking to vault raft-backup and etcd backup
- Push backup_duration_seconds, backup_read_bytes, backup_written_bytes,
  backup_last_success_timestamp to Pushgateway from all 6 backup CronJobs
  (etcd skipped — distroless image has no wget/curl)
- Add cloudsync_duration_seconds metric to cloudsync-monitor
- New "Backup Health" Grafana dashboard with 8 panels: time since last backup,
  overview table, duration/IO trends, cloud sync status, alerts, CronJob schedule
This commit is contained in:
Viktor Barzin 2026-03-23 12:19:01 +02:00
parent 0b595751c5
commit 0a294a30a6
8 changed files with 530 additions and 8 deletions

View file

@ -0,0 +1,377 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": { "type": "datasource", "uid": "grafana" },
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "Backup health overview — K8s CronJob backups and TrueNAS Cloud Sync status",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": 0,
"links": [],
"panels": [
{
"title": "Time Since Last Successful Backup",
"type": "stat",
"gridPos": { "h": 6, "w": 24, "x": 0, "y": 0 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"unit": "s",
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 90000 },
{ "color": "red", "value": 604800 }
]
},
"mappings": [
{
"type": "special",
"options": { "match": "null", "result": { "text": "No data", "color": "red" } }
}
]
},
"overrides": []
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"orientation": "auto",
"textMode": "auto",
"colorMode": "background",
"graphMode": "none"
},
"targets": [
{
"expr": "time() - backup_last_success_timestamp",
"legendFormat": "{{ job }}",
"refId": "A"
},
{
"expr": "time() - cloudsync_last_success_timestamp",
"legendFormat": "cloudsync-{{ task_id }}",
"refId": "B"
}
]
},
{
"title": "All Backups — Overview",
"type": "table",
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 6 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {},
"overrides": [
{
"matcher": { "id": "byName", "options": "Duration (s)" },
"properties": [{ "id": "unit", "value": "s" }]
},
{
"matcher": { "id": "byName", "options": "Read (MiB)" },
"properties": [{ "id": "unit", "value": "mbytes" }]
},
{
"matcher": { "id": "byName", "options": "Written (MiB)" },
"properties": [{ "id": "unit", "value": "mbytes" }]
},
{
"matcher": { "id": "byName", "options": "Last Success" },
"properties": [{ "id": "unit", "value": "dateTimeFromNow" }]
}
]
},
"options": {
"showHeader": true,
"sortBy": [{ "displayName": "Last Success", "desc": true }]
},
"transformations": [
{
"id": "merge",
"options": {}
},
{
"id": "organize",
"options": {
"renameByName": {
"Value #Duration": "Duration (s)",
"Value #Read": "Read (MiB)",
"Value #Written": "Written (MiB)",
"Value #LastSuccess": "Last Success",
"job": "Backup"
},
"excludeByName": {
"Time": true,
"instance": true,
"__name__": true
}
}
}
],
"targets": [
{
"expr": "backup_duration_seconds",
"legendFormat": "{{ job }}",
"refId": "Duration",
"instant": true,
"format": "table"
},
{
"expr": "backup_read_bytes / 1048576",
"legendFormat": "{{ job }}",
"refId": "Read",
"instant": true,
"format": "table"
},
{
"expr": "backup_written_bytes / 1048576",
"legendFormat": "{{ job }}",
"refId": "Written",
"instant": true,
"format": "table"
},
{
"expr": "backup_last_success_timestamp * 1000",
"legendFormat": "{{ job }}",
"refId": "LastSuccess",
"instant": true,
"format": "table"
}
]
},
{
"title": "Backup Duration Trend",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"unit": "s",
"custom": {
"drawStyle": "line",
"lineWidth": 2,
"fillOpacity": 10,
"pointSize": 5,
"showPoints": "auto"
}
},
"overrides": []
},
"options": {
"tooltip": { "mode": "multi", "sort": "desc" },
"legend": { "displayMode": "table", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
},
"targets": [
{
"expr": "backup_duration_seconds",
"legendFormat": "{{ job }}",
"refId": "A"
}
]
},
{
"title": "Backup IO Trend (Read + Written)",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"unit": "bytes",
"custom": {
"drawStyle": "line",
"lineWidth": 2,
"fillOpacity": 10,
"pointSize": 5,
"showPoints": "auto"
}
},
"overrides": []
},
"options": {
"tooltip": { "mode": "multi", "sort": "desc" },
"legend": { "displayMode": "table", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
},
"targets": [
{
"expr": "backup_read_bytes",
"legendFormat": "{{ job }} read",
"refId": "A"
},
{
"expr": "backup_written_bytes",
"legendFormat": "{{ job }} written",
"refId": "B"
}
]
},
{
"title": "Cloud Sync Status",
"type": "stat",
"gridPos": { "h": 6, "w": 12, "x": 0, "y": 22 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"mappings": [
{ "type": "value", "options": { "1": { "text": "SUCCESS", "color": "green" }, "0": { "text": "FAILED", "color": "red" } } }
],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "green", "value": 1 }
]
}
},
"overrides": []
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"colorMode": "background",
"graphMode": "none",
"textMode": "auto"
},
"targets": [
{
"expr": "cloudsync_job_state",
"legendFormat": "Task {{ task_id }}",
"refId": "A"
}
]
},
{
"title": "Cloud Sync Duration",
"type": "stat",
"gridPos": { "h": 6, "w": 12, "x": 12, "y": 22 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"unit": "s",
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 3600 },
{ "color": "red", "value": 86400 }
]
}
},
"overrides": []
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"colorMode": "background",
"graphMode": "none",
"textMode": "auto"
},
"targets": [
{
"expr": "cloudsync_duration_seconds",
"legendFormat": "Task {{ task_id }}",
"refId": "A"
}
]
},
{
"title": "Active Backup Alerts",
"type": "alertlist",
"gridPos": { "h": 6, "w": 24, "x": 0, "y": 28 },
"datasource": { "type": "datasource", "uid": "grafana" },
"options": {
"showOptions": "current",
"maxItems": 20,
"sortOrder": 1,
"dashboardAlerts": false,
"alertName": "backup",
"stateFilter": {
"firing": true,
"pending": true,
"noData": true,
"normal": false,
"error": true
},
"folder": { "id": null, "title": "" },
"folderId": null
}
},
{
"title": "CronJob Last Schedule",
"type": "table",
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 34 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {},
"overrides": [
{
"matcher": { "id": "byName", "options": "Last Scheduled" },
"properties": [{ "id": "unit", "value": "dateTimeAsIso" }]
}
]
},
"options": {
"showHeader": true,
"sortBy": [{ "displayName": "Last Scheduled", "desc": true }]
},
"transformations": [
{
"id": "organize",
"options": {
"renameByName": {
"cronjob": "CronJob",
"namespace": "Namespace",
"Value": "Last Scheduled"
},
"excludeByName": {
"Time": true,
"__name__": true,
"instance": true,
"job": true,
"uid": true
}
}
}
],
"targets": [
{
"expr": "kube_cronjob_status_last_schedule_time{cronjob=~\".*backup.*|.*cloudsync.*|.*etcd.*|.*raft.*\"} * 1000",
"legendFormat": "",
"refId": "A",
"instant": true,
"format": "table"
}
]
}
],
"templating": {
"list": [
{
"current": {
"text": "Prometheus",
"value": "PBFA97CFB590B2093"
},
"includeAll": false,
"name": "datasource",
"options": [],
"query": "prometheus",
"refresh": 1,
"regex": "",
"type": "datasource"
}
]
},
"time": { "from": "now-7d", "to": "now" },
"timepicker": {},
"timezone": "",
"title": "Backup Health",
"uid": "backup-health",
"version": 1,
"schemaVersion": 39
}

View file

@ -144,7 +144,18 @@ resource "kubernetes_cron_job_v1" "cloudsync_monitor" {
EPOCH_SECS=0
fi
echo "Task $TASK_ID ($TASK_DESC): state=$JOB_STATE, last_finished=$EPOCH_SECS"
# Extract transfer stats from job progress description (rclone output)
JOB_PROGRESS=$(echo "$task" | jq -r '.job.progress.description // ""')
BYTES_TX=$(echo "$JOB_PROGRESS" | grep -oP 'Transferred:\s+[\d.]+ \w+' | head -1 | awk '{print $2}' || echo 0)
JOB_STARTED=$(echo "$task" | jq -r '.job.time_started."$date" // 0')
JOB_FINISHED=$(echo "$task" | jq -r '.job.time_finished."$date" // 0')
if [ "$JOB_STARTED" != "0" ] && [ "$JOB_STARTED" != "null" ] && [ "$JOB_FINISHED" != "0" ] && [ "$JOB_FINISHED" != "null" ]; then
SYNC_DURATION=$(( (JOB_FINISHED - JOB_STARTED) / 1000 ))
else
SYNC_DURATION=0
fi
echo "Task $TASK_ID ($TASK_DESC): state=$JOB_STATE, last_finished=$EPOCH_SECS, duration=$${SYNC_DURATION}s"
# Push metrics to Pushgateway
cat <<METRICS | curl -sf --data-binary @- "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/cloudsync-monitor/task_id/$TASK_ID"
@ -154,6 +165,9 @@ resource "kubernetes_cron_job_v1" "cloudsync_monitor" {
# HELP cloudsync_job_state Cloud Sync job state (1=SUCCESS, 0=other)
# TYPE cloudsync_job_state gauge
cloudsync_job_state $([ "$JOB_STATE" = "SUCCESS" ] && echo 1 || echo 0)
# HELP cloudsync_duration_seconds Duration of the last Cloud Sync run
# TYPE cloudsync_duration_seconds gauge
cloudsync_duration_seconds $SYNC_DURATION
METRICS
done