add backup IO logging, Pushgateway metrics, and Grafana dashboard
- Add /proc/self/io read/write tracking to the vault raft-backup and etcd backups
- Push backup_duration_seconds, backup_read_bytes, backup_written_bytes, and backup_last_success_timestamp to Pushgateway from all 6 backup CronJobs (etcd skipped — its distroless image has no wget/curl)
- Add cloudsync_duration_seconds metric to cloudsync-monitor
- New "Backup Health" Grafana dashboard with 8 panels: time since last backup, overview table, duration/IO trends, cloud sync status, alerts, CronJob schedule
This commit is contained in:
parent
0b595751c5
commit
0a294a30a6
8 changed files with 530 additions and 8 deletions
|
|
@ -0,0 +1,377 @@
|
|||
{
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": { "type": "datasource", "uid": "grafana" },
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"type": "dashboard"
|
||||
}
|
||||
]
|
||||
},
|
||||
"description": "Backup health overview — K8s CronJob backups and TrueNAS Cloud Sync status",
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 1,
|
||||
"id": 0,
|
||||
"links": [],
|
||||
"panels": [
|
||||
{
|
||||
"title": "Time Since Last Successful Backup",
|
||||
"type": "stat",
|
||||
"gridPos": { "h": 6, "w": 24, "x": 0, "y": 0 },
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 90000 },
|
||||
{ "color": "red", "value": 604800 }
|
||||
]
|
||||
},
|
||||
"mappings": [
|
||||
{
|
||||
"type": "special",
|
||||
"options": { "match": "null", "result": { "text": "No data", "color": "red" } }
|
||||
}
|
||||
]
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"orientation": "auto",
|
||||
"textMode": "auto",
|
||||
"colorMode": "background",
|
||||
"graphMode": "none"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "time() - backup_last_success_timestamp",
|
||||
"legendFormat": "{{ job }}",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "time() - cloudsync_last_success_timestamp",
|
||||
"legendFormat": "cloudsync-{{ task_id }}",
|
||||
"refId": "B"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "All Backups — Overview",
|
||||
"type": "table",
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 6 },
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Duration (s)" },
|
||||
"properties": [{ "id": "unit", "value": "s" }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Read (MiB)" },
|
||||
"properties": [{ "id": "unit", "value": "decmbytes" }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Written (MiB)" },
|
||||
"properties": [{ "id": "unit", "value": "decmbytes" }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Last Success" },
|
||||
"properties": [{ "id": "unit", "value": "dateTimeFromNow" }]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"showHeader": true,
|
||||
"sortBy": [{ "displayName": "Last Success", "desc": true }]
|
||||
},
|
||||
"transformations": [
|
||||
{
|
||||
"id": "merge",
|
||||
"options": {}
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"renameByName": {
|
||||
"Value #Duration": "Duration (s)",
|
||||
"Value #Read": "Read (MiB)",
|
||||
"Value #Written": "Written (MiB)",
|
||||
"Value #LastSuccess": "Last Success",
|
||||
"job": "Backup"
|
||||
},
|
||||
"excludeByName": {
|
||||
"Time": true,
|
||||
"instance": true,
|
||||
"__name__": true
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"targets": [
|
||||
{
|
||||
"expr": "backup_duration_seconds",
|
||||
"legendFormat": "{{ job }}",
|
||||
"refId": "Duration",
|
||||
"instant": true,
|
||||
"format": "table"
|
||||
},
|
||||
{
|
||||
"expr": "backup_read_bytes / 1048576",
|
||||
"legendFormat": "{{ job }}",
|
||||
"refId": "Read",
|
||||
"instant": true,
|
||||
"format": "table"
|
||||
},
|
||||
{
|
||||
"expr": "backup_written_bytes / 1048576",
|
||||
"legendFormat": "{{ job }}",
|
||||
"refId": "Written",
|
||||
"instant": true,
|
||||
"format": "table"
|
||||
},
|
||||
{
|
||||
"expr": "backup_last_success_timestamp * 1000",
|
||||
"legendFormat": "{{ job }}",
|
||||
"refId": "LastSuccess",
|
||||
"instant": true,
|
||||
"format": "table"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Backup Duration Trend",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 },
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 10,
|
||||
"pointSize": 5,
|
||||
"showPoints": "auto"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "table", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "backup_duration_seconds",
|
||||
"legendFormat": "{{ job }}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Backup IO Trend (Read + Written)",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 },
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes",
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 10,
|
||||
"pointSize": 5,
|
||||
"showPoints": "auto"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "table", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "backup_read_bytes",
|
||||
"legendFormat": "{{ job }} read",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "backup_written_bytes",
|
||||
"legendFormat": "{{ job }} written",
|
||||
"refId": "B"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Cloud Sync Status",
|
||||
"type": "stat",
|
||||
"gridPos": { "h": 6, "w": 12, "x": 0, "y": 22 },
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"mappings": [
|
||||
{ "type": "value", "options": { "1": { "text": "SUCCESS", "color": "green" }, "0": { "text": "FAILED", "color": "red" } } }
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "cloudsync_job_state",
|
||||
"legendFormat": "Task {{ task_id }}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Cloud Sync Duration",
|
||||
"type": "stat",
|
||||
"gridPos": { "h": 6, "w": 12, "x": 12, "y": 22 },
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 3600 },
|
||||
{ "color": "red", "value": 86400 }
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "cloudsync_duration_seconds",
|
||||
"legendFormat": "Task {{ task_id }}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Active Backup Alerts",
|
||||
"type": "alertlist",
|
||||
"gridPos": { "h": 6, "w": 24, "x": 0, "y": 28 },
|
||||
"datasource": { "type": "datasource", "uid": "grafana" },
|
||||
"options": {
|
||||
"showOptions": "current",
|
||||
"maxItems": 20,
|
||||
"sortOrder": 1,
|
||||
"dashboardAlerts": false,
|
||||
"alertName": "backup",
|
||||
"stateFilter": {
|
||||
"firing": true,
|
||||
"pending": true,
|
||||
"noData": true,
|
||||
"normal": false,
|
||||
"error": true
|
||||
},
|
||||
"folder": { "id": null, "title": "" },
|
||||
"folderId": null
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "CronJob Last Schedule",
|
||||
"type": "table",
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 34 },
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Value" },
|
||||
"properties": [{ "id": "unit", "value": "dateTimeAsIso" }]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"showHeader": true,
|
||||
"sortBy": [{ "displayName": "Value", "desc": true }]
|
||||
},
|
||||
"transformations": [
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"renameByName": {
|
||||
"cronjob": "CronJob",
|
||||
"namespace": "Namespace",
|
||||
"Value": "Last Scheduled"
|
||||
},
|
||||
"excludeByName": {
|
||||
"Time": true,
|
||||
"__name__": true,
|
||||
"instance": true,
|
||||
"job": true,
|
||||
"uid": true
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"targets": [
|
||||
{
|
||||
"expr": "kube_cronjob_status_last_schedule_time{cronjob=~\".*backup.*|.*cloudsync.*|.*etcd.*|.*raft.*\"} * 1000",
|
||||
"legendFormat": "",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"format": "table"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": {
|
||||
"text": "Prometheus",
|
||||
"value": "PBFA97CFB590B2093"
|
||||
},
|
||||
"includeAll": false,
|
||||
"name": "datasource",
|
||||
"options": [],
|
||||
"query": "prometheus",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"type": "datasource"
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": { "from": "now-7d", "to": "now" },
|
||||
"timepicker": {},
|
||||
"timezone": "",
|
||||
"title": "Backup Health",
|
||||
"uid": "backup-health",
|
||||
"version": 1,
|
||||
"schemaVersion": 39
|
||||
}
|
||||
|
|
@ -144,7 +144,18 @@ resource "kubernetes_cron_job_v1" "cloudsync_monitor" {
|
|||
EPOCH_SECS=0
|
||||
fi
|
||||
|
||||
echo "Task $TASK_ID ($TASK_DESC): state=$JOB_STATE, last_finished=$EPOCH_SECS"
|
||||
# Extract transfer stats from job progress description (rclone output)
|
||||
JOB_PROGRESS=$(echo "$task" | jq -r '.job.progress.description // ""')
|
||||
BYTES_TX=$(echo "$JOB_PROGRESS" | grep -oP 'Transferred:\s+[\d.]+ \w+' | head -1 | awk '{print $2}' || echo 0)
|
||||
JOB_STARTED=$(echo "$task" | jq -r '.job.time_started."$date" // 0')
|
||||
JOB_FINISHED=$(echo "$task" | jq -r '.job.time_finished."$date" // 0')
|
||||
if [ "$JOB_STARTED" != "0" ] && [ "$JOB_STARTED" != "null" ] && [ "$JOB_FINISHED" != "0" ] && [ "$JOB_FINISHED" != "null" ]; then
|
||||
SYNC_DURATION=$(( (JOB_FINISHED - JOB_STARTED) / 1000 ))
|
||||
else
|
||||
SYNC_DURATION=0
|
||||
fi
|
||||
|
||||
echo "Task $TASK_ID ($TASK_DESC): state=$JOB_STATE, last_finished=$EPOCH_SECS, duration=$${SYNC_DURATION}s"
|
||||
|
||||
# Push metrics to Pushgateway
|
||||
cat <<METRICS | curl -sf --data-binary @- "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/cloudsync-monitor/task_id/$TASK_ID"
|
||||
|
|
@ -154,6 +165,9 @@ resource "kubernetes_cron_job_v1" "cloudsync_monitor" {
|
|||
# HELP cloudsync_job_state Cloud Sync job state (1=SUCCESS, 0=other)
|
||||
# TYPE cloudsync_job_state gauge
|
||||
cloudsync_job_state $([ "$JOB_STATE" = "SUCCESS" ] && echo 1 || echo 0)
|
||||
# HELP cloudsync_duration_seconds Duration of the last Cloud Sync run
|
||||
# TYPE cloudsync_duration_seconds gauge
|
||||
cloudsync_duration_seconds $SYNC_DURATION
|
||||
METRICS
|
||||
done
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue