add backup_output_bytes metric and cloudsync_transferred_bytes to backup dashboard
- All 7 backup CronJobs now push backup_output_bytes (file size after backup)
- Cloud Sync monitor parses rclone transfer stats into cloudsync_transferred_bytes
- Grafana dashboard: new Output (MiB) table column, Output Size Trend panel, Write Throughput panel, Cloud Sync Transfer Volume bargauge
- All timeseries panels use points-only draw style (discrete backup snapshots)
- etcd backup restructured: init_container for etcdctl (distroless image), busybox sidecar for metrics push + purge, ClusterFirstWithHostNet DNS
- Fixed pre-existing curl missing in postgres:16.4-bullseye (immich, dbaas PG)
- Fixed grep -oP not available in alpine/busybox (cloud sync monitor)
This commit is contained in:
parent
f289f76882
commit
d20c5e5535
8 changed files with 186 additions and 46 deletions
|
|
@ -361,10 +361,12 @@ resource "kubernetes_cron_job_v1" "mysql-backup" {
|
|||
echo "written: $(( (_wb1 - _wb0) / 1048576 )) MiB"
|
||||
echo "output: $(ls -lh /backup/dump_$now.sql.gz | awk '{print $5}')"
|
||||
|
||||
_out_bytes=$(stat -c%s /backup/dump_$now.sql.gz)
|
||||
curl -sf --data-binary @- "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/mysql-backup" <<PGEOF || true
|
||||
backup_duration_seconds $${_dur}
|
||||
backup_read_bytes $(( _rb1 - _rb0 ))
|
||||
backup_written_bytes $(( _wb1 - _wb0 ))
|
||||
backup_output_bytes $${_out_bytes}
|
||||
backup_last_success_timestamp $(date +%s)
|
||||
PGEOF
|
||||
EOT
|
||||
|
|
@ -1096,6 +1098,7 @@ resource "kubernetes_cron_job_v1" "postgresql-backup" {
|
|||
}
|
||||
command = ["/bin/bash", "-c", <<-EOT
|
||||
set -euxo pipefail
|
||||
apt-get update -qq && apt-get install -yqq curl >/dev/null 2>&1 || true
|
||||
_t0=$(date +%s)
|
||||
_rb0=$(awk '/^read_bytes/{print $2}' /proc/$$/io 2>/dev/null || echo 0)
|
||||
_wb0=$(awk '/^write_bytes/{print $2}' /proc/$$/io 2>/dev/null || echo 0)
|
||||
|
|
@ -1117,10 +1120,12 @@ resource "kubernetes_cron_job_v1" "postgresql-backup" {
|
|||
echo "written: $(( (_wb1 - _wb0) / 1048576 )) MiB"
|
||||
echo "output: $(ls -lh /backup/dump_$now.sql.gz | awk '{print $5}')"
|
||||
|
||||
_out_bytes=$(stat -c%s /backup/dump_$now.sql.gz)
|
||||
curl -sf --data-binary @- "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/postgresql-backup" <<PGEOF || true
|
||||
backup_duration_seconds $${_dur}
|
||||
backup_read_bytes $(( _rb1 - _rb0 ))
|
||||
backup_written_bytes $(( _wb1 - _wb0 ))
|
||||
backup_output_bytes $${_out_bytes}
|
||||
backup_last_success_timestamp $(date +%s)
|
||||
PGEOF
|
||||
EOT
|
||||
|
|
|
|||
|
|
@ -684,6 +684,7 @@ resource "kubernetes_cron_job_v1" "postgresql-backup" {
|
|||
name = "postgresql-backup"
|
||||
image = "postgres:16.4-bullseye"
|
||||
command = ["/bin/sh", "-c", <<-EOT
|
||||
apt-get update -qq && apt-get install -yqq curl >/dev/null 2>&1 || true
|
||||
_t0=$(date +%s)
|
||||
_rb0=$(awk '/^read_bytes/{print $2}' /proc/$$/io 2>/dev/null || echo 0)
|
||||
_wb0=$(awk '/^write_bytes/{print $2}' /proc/$$/io 2>/dev/null || echo 0)
|
||||
|
|
@ -704,10 +705,12 @@ resource "kubernetes_cron_job_v1" "postgresql-backup" {
|
|||
echo "written: $(( (_wb1 - _wb0) / 1048576 )) MiB"
|
||||
echo "output: $(ls -lh /backup/dump_$now.sql | awk '{print $5}')"
|
||||
|
||||
_out_bytes=$(stat -c%s /backup/dump_$now.sql)
|
||||
curl -sf --data-binary @- "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/immich-postgresql-backup" <<PGEOF || true
|
||||
backup_duration_seconds $${_dur}
|
||||
backup_read_bytes $(( _rb1 - _rb0 ))
|
||||
backup_written_bytes $(( _wb1 - _wb0 ))
|
||||
backup_output_bytes $${_out_bytes}
|
||||
backup_last_success_timestamp $(date +%s)
|
||||
PGEOF
|
||||
EOT
|
||||
|
|
|
|||
|
|
@ -98,38 +98,40 @@ resource "kubernetes_cron_job_v1" "backup-etcd" {
|
|||
node_name = "k8s-master"
|
||||
priority_class_name = "system-cluster-critical"
|
||||
host_network = true
|
||||
container {
|
||||
dns_policy = "ClusterFirstWithHostNet"
|
||||
init_container {
|
||||
name = "backup-etcd"
|
||||
image = "registry.k8s.io/etcd:3.5.21-0"
|
||||
command = ["/bin/sh", "-c"]
|
||||
args = [<<-EOT
|
||||
set -eu
|
||||
_t0=$(date +%s)
|
||||
_rb0=$(awk '/^read_bytes/{print $2}' /proc/$$/io 2>/dev/null || echo 0)
|
||||
_wb0=$(awk '/^write_bytes/{print $2}' /proc/$$/io 2>/dev/null || echo 0)
|
||||
|
||||
TIMESTAMP=$(date +%Y%m%d-%H%M%S)
|
||||
ETCDCTL_API=3 etcdctl \
|
||||
--endpoints=https://127.0.0.1:2379 \
|
||||
--cacert=/etc/kubernetes/pki/etcd/ca.crt \
|
||||
--cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt \
|
||||
--key=/etc/kubernetes/pki/etcd/healthcheck-client.key \
|
||||
snapshot save /backup/etcd-snapshot-$TIMESTAMP.db
|
||||
|
||||
_dur=$(($(date +%s) - _t0))
|
||||
_rb1=$(awk '/^read_bytes/{print $2}' /proc/$$/io 2>/dev/null || echo 0)
|
||||
_wb1=$(awk '/^write_bytes/{print $2}' /proc/$$/io 2>/dev/null || echo 0)
|
||||
echo "=== Backup IO Stats ==="
|
||||
echo "duration: $${_dur}s"
|
||||
echo "read: $(( (_rb1 - _rb0) / 1048576 )) MiB"
|
||||
echo "written: $(( (_wb1 - _wb0) / 1048576 )) MiB"
|
||||
echo "output: $(ls -lh /backup/etcd-snapshot-$TIMESTAMP.db | awk '{print $5}')"
|
||||
EOT
|
||||
]
|
||||
command = ["etcdctl", "snapshot", "save", "/backup/etcd-snapshot-latest.db"]
|
||||
resources {
|
||||
requests = {
|
||||
memory = "256Mi"
|
||||
cpu = "50m"
|
||||
}
|
||||
limits = {
|
||||
memory = "512Mi"
|
||||
}
|
||||
}
|
||||
env {
|
||||
name = "ETCDCTL_API"
|
||||
value = "3"
|
||||
}
|
||||
env {
|
||||
name = "ETCDCTL_ENDPOINTS"
|
||||
value = "https://127.0.0.1:2379"
|
||||
}
|
||||
env {
|
||||
name = "ETCDCTL_CACERT"
|
||||
value = "/etc/kubernetes/pki/etcd/ca.crt"
|
||||
}
|
||||
env {
|
||||
name = "ETCDCTL_CERT"
|
||||
value = "/etc/kubernetes/pki/etcd/healthcheck-client.crt"
|
||||
}
|
||||
env {
|
||||
name = "ETCDCTL_KEY"
|
||||
value = "/etc/kubernetes/pki/etcd/healthcheck-client.key"
|
||||
}
|
||||
volume_mount {
|
||||
mount_path = "/backup"
|
||||
name = "backup"
|
||||
|
|
@ -141,11 +143,26 @@ resource "kubernetes_cron_job_v1" "backup-etcd" {
|
|||
}
|
||||
}
|
||||
container {
|
||||
name = "backup-purge"
|
||||
image = "busybox:1.31.1"
|
||||
command = ["/bin/sh"]
|
||||
args = ["-c", "find /backup -type f -mtime +30 -name '*.db' -exec rm -- '{}' \\;"]
|
||||
name = "backup-manage"
|
||||
image = "busybox:1.37"
|
||||
command = ["/bin/sh", "-c"]
|
||||
args = [<<-EOT
|
||||
set -eu
|
||||
# Rename snapshot with timestamp
|
||||
TIMESTAMP=$(date +%Y%m%d-%H%M%S)
|
||||
mv /backup/etcd-snapshot-latest.db /backup/etcd-snapshot-$TIMESTAMP.db
|
||||
_out_bytes=$(stat -c%s /backup/etcd-snapshot-$TIMESTAMP.db 2>/dev/null || echo 0)
|
||||
echo "Backup done: etcd-snapshot-$TIMESTAMP.db ($${_out_bytes} bytes)"
|
||||
|
||||
# Rotate — 30 day retention
|
||||
find /backup -type f -mtime +30 -name '*.db' -exec rm -- '{}' \;
|
||||
|
||||
# Push metrics to Pushgateway
|
||||
wget -qO- --post-data "backup_output_bytes $${_out_bytes}
|
||||
backup_last_success_timestamp $(date +%s)
|
||||
" "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/backup-etcd" || true
|
||||
EOT
|
||||
]
|
||||
volume_mount {
|
||||
mount_path = "/backup"
|
||||
name = "backup"
|
||||
|
|
|
|||
|
|
@ -84,6 +84,10 @@
|
|||
"matcher": { "id": "byName", "options": "Written (MiB)" },
|
||||
"properties": [{ "id": "unit", "value": "decmbytes" }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Output (MiB)" },
|
||||
"properties": [{ "id": "unit", "value": "decmbytes" }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Last Success" },
|
||||
"properties": [{ "id": "unit", "value": "dateTimeFromNow" }]
|
||||
|
|
@ -106,6 +110,7 @@
|
|||
"Value #Duration": "Duration (s)",
|
||||
"Value #Read": "Read (MiB)",
|
||||
"Value #Written": "Written (MiB)",
|
||||
"Value #Output": "Output (MiB)",
|
||||
"Value #LastSuccess": "Last Success",
|
||||
"job": "Backup"
|
||||
},
|
||||
|
|
@ -139,6 +144,13 @@
|
|||
"instant": true,
|
||||
"format": "table"
|
||||
},
|
||||
{
|
||||
"expr": "backup_output_bytes / 1048576",
|
||||
"legendFormat": "{{ job }}",
|
||||
"refId": "Output",
|
||||
"instant": true,
|
||||
"format": "table"
|
||||
},
|
||||
{
|
||||
"expr": "backup_last_success_timestamp * 1000",
|
||||
"legendFormat": "{{ job }}",
|
||||
|
|
@ -157,11 +169,9 @@
|
|||
"defaults": {
|
||||
"unit": "s",
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 10,
|
||||
"pointSize": 5,
|
||||
"showPoints": "auto"
|
||||
"drawStyle": "points",
|
||||
"pointSize": 8,
|
||||
"showPoints": "always"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
|
|
@ -187,11 +197,9 @@
|
|||
"defaults": {
|
||||
"unit": "bytes",
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 10,
|
||||
"pointSize": 5,
|
||||
"showPoints": "auto"
|
||||
"drawStyle": "points",
|
||||
"pointSize": 8,
|
||||
"showPoints": "always"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
|
|
@ -213,10 +221,68 @@
|
|||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Backup Output Size Trend",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 22 },
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes",
|
||||
"custom": {
|
||||
"drawStyle": "points",
|
||||
"pointSize": 8,
|
||||
"showPoints": "always"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "table", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "backup_output_bytes",
|
||||
"legendFormat": "{{ job }}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Write Throughput",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 22 },
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps",
|
||||
"custom": {
|
||||
"drawStyle": "bars",
|
||||
"lineWidth": 1,
|
||||
"fillOpacity": 50,
|
||||
"pointSize": 5,
|
||||
"showPoints": "never"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "table", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "backup_written_bytes / backup_duration_seconds",
|
||||
"legendFormat": "{{ job }}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Cloud Sync Status",
|
||||
"type": "stat",
|
||||
"gridPos": { "h": 6, "w": 12, "x": 0, "y": 22 },
|
||||
"gridPos": { "h": 6, "w": 12, "x": 0, "y": 30 },
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
|
|
@ -250,7 +316,7 @@
|
|||
{
|
||||
"title": "Cloud Sync Duration",
|
||||
"type": "stat",
|
||||
"gridPos": { "h": 6, "w": 12, "x": 12, "y": 22 },
|
||||
"gridPos": { "h": 6, "w": 12, "x": 12, "y": 30 },
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
|
|
@ -280,10 +346,43 @@
|
|||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Cloud Sync Transfer Volume",
|
||||
"type": "bargauge",
|
||||
"gridPos": { "h": 6, "w": 24, "x": 0, "y": 36 },
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1073741824 },
|
||||
{ "color": "red", "value": 10737418240 }
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"displayMode": "gradient",
|
||||
"orientation": "horizontal",
|
||||
"showUnfilled": true
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "cloudsync_transferred_bytes",
|
||||
"legendFormat": "Task {{ task_id }}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Active Backup Alerts",
|
||||
"type": "alertlist",
|
||||
"gridPos": { "h": 6, "w": 24, "x": 0, "y": 28 },
|
||||
"gridPos": { "h": 6, "w": 24, "x": 0, "y": 42 },
|
||||
"datasource": { "type": "datasource", "uid": "grafana" },
|
||||
"options": {
|
||||
"showOptions": "current",
|
||||
|
|
@ -305,7 +404,7 @@
|
|||
{
|
||||
"title": "CronJob Last Schedule",
|
||||
"type": "table",
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 34 },
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 48 },
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {},
|
||||
|
|
|
|||
|
|
@ -146,7 +146,15 @@ resource "kubernetes_cron_job_v1" "cloudsync_monitor" {
|
|||
|
||||
# Extract transfer stats from job progress description (rclone output)
|
||||
JOB_PROGRESS=$(echo "$task" | jq -r '.job.progress.description // ""')
|
||||
BYTES_TX=$(echo "$JOB_PROGRESS" | grep -oP 'Transferred:\s+[\d.]+ \w+' | head -1 | awk '{print $2}' || echo 0)
|
||||
TX_NUM=$(echo "$JOB_PROGRESS" | sed -n 's/.*Transferred:[[:space:]]*\([0-9.]*\).*/\1/p' | head -1)
|
||||
TX_NUM=$${TX_NUM:-0}
|
||||
TX_UNIT=$(echo "$JOB_PROGRESS" | sed -n 's/.*Transferred:[[:space:]]*[0-9.]*[[:space:]]*\([A-Za-z]*\).*/\1/p' | head -1)
|
||||
TX_UNIT=$${TX_UNIT:-Bytes}
|
||||
case "$TX_UNIT" in
|
||||
Bytes|B) TX_MULT=1 ;; KiB|kB) TX_MULT=1024 ;; MiB|MB) TX_MULT=1048576 ;;
|
||||
GiB|GB) TX_MULT=1073741824 ;; *) TX_MULT=1 ;;
|
||||
esac
|
||||
TRANSFERRED_BYTES=$(echo "$TX_NUM $TX_MULT" | awk '{printf "%.0f", $1 * $2}')
|
||||
JOB_STARTED=$(echo "$task" | jq -r '.job.time_started."$date" // 0')
|
||||
JOB_FINISHED=$(echo "$task" | jq -r '.job.time_finished."$date" // 0')
|
||||
if [ "$JOB_STARTED" != "0" ] && [ "$JOB_STARTED" != "null" ] && [ "$JOB_FINISHED" != "0" ] && [ "$JOB_FINISHED" != "null" ]; then
|
||||
|
|
@ -168,6 +176,9 @@ resource "kubernetes_cron_job_v1" "cloudsync_monitor" {
|
|||
# HELP cloudsync_duration_seconds Duration of the last Cloud Sync run
|
||||
# TYPE cloudsync_duration_seconds gauge
|
||||
cloudsync_duration_seconds $SYNC_DURATION
|
||||
# HELP cloudsync_transferred_bytes Bytes transferred during Cloud Sync run
|
||||
# TYPE cloudsync_transferred_bytes gauge
|
||||
cloudsync_transferred_bytes $TRANSFERRED_BYTES
|
||||
METRICS
|
||||
done
|
||||
|
||||
|
|
|
|||
|
|
@ -305,9 +305,11 @@ resource "kubernetes_cron_job_v1" "redis-backup" {
|
|||
echo "written: $(( (_wb1 - _wb0) / 1048576 )) MiB"
|
||||
echo "output: $(ls -lh /backup/redis-$$TIMESTAMP.rdb | awk '{print $5}')"
|
||||
|
||||
_out_bytes=$(stat -c%s /backup/redis-$TIMESTAMP.rdb)
|
||||
wget -qO- --post-data "backup_duration_seconds $${_dur}
|
||||
backup_read_bytes $(( _rb1 - _rb0 ))
|
||||
backup_written_bytes $(( _wb1 - _wb0 ))
|
||||
backup_output_bytes $${_out_bytes}
|
||||
backup_last_success_timestamp $(date +%s)
|
||||
" "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/redis-backup" || true
|
||||
EOT
|
||||
|
|
|
|||
|
|
@ -283,7 +283,8 @@ resource "kubernetes_cron_job_v1" "vault_backup" {
|
|||
"echo \"read: $(( (_rb1 - _rb0) / 1048576 )) MiB\"; ",
|
||||
"echo \"written: $(( (_wb1 - _wb0) / 1048576 )) MiB\"; ",
|
||||
"echo \"output: $(ls -lh /backup/vault-raft-$TIMESTAMP.db | awk '{print $5}')\"; ",
|
||||
"wget -qO- --post-data \"backup_duration_seconds $${_dur}\nbackup_read_bytes $((_rb1 - _rb0))\nbackup_written_bytes $((_wb1 - _wb0))\nbackup_last_success_timestamp $(date +%s)\n\" ",
|
||||
"_out_bytes=$(stat -c%s /backup/vault-raft-$TIMESTAMP.db); ",
|
||||
"wget -qO- --post-data \"backup_duration_seconds $${_dur}\nbackup_read_bytes $((_rb1 - _rb0))\nbackup_written_bytes $((_wb1 - _wb0))\nbackup_output_bytes $${_out_bytes}\nbackup_last_success_timestamp $(date +%s)\n\" ",
|
||||
"\"http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/vault-raft-backup\" || true"
|
||||
])]
|
||||
volume_mount {
|
||||
|
|
|
|||
|
|
@ -283,9 +283,11 @@ resource "kubernetes_cron_job_v1" "vaultwarden-backup" {
|
|||
echo "written: $(( (_wb1 - _wb0) / 1048576 )) MiB"
|
||||
echo "output: $(du -sh /backup/$$now | awk '{print $$1}')"
|
||||
|
||||
_out_bytes=$(du -sb /backup/$now | awk '{print $1}')
|
||||
wget -qO- --post-data "backup_duration_seconds $${_dur}
|
||||
backup_read_bytes $(( _rb1 - _rb0 ))
|
||||
backup_written_bytes $(( _wb1 - _wb0 ))
|
||||
backup_output_bytes $${_out_bytes}
|
||||
backup_last_success_timestamp $(date +%s)
|
||||
" "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/vaultwarden-backup" || true
|
||||
EOT
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue