add backup IO logging, Pushgateway metrics, and Grafana dashboard
- Add /proc/self/io read/write tracking to vault raft-backup and etcd backup - Push backup_duration_seconds, backup_read_bytes, backup_written_bytes, backup_last_success_timestamp to Pushgateway from all 6 backup CronJobs (etcd skipped — distroless image has no wget/curl) - Add cloudsync_duration_seconds metric to cloudsync-monitor - New "Backup Health" Grafana dashboard with 8 panels: time since last backup, overview table, duration/IO trends, cloud sync status, alerts, CronJob schedule
This commit is contained in:
parent
0b595751c5
commit
0a294a30a6
8 changed files with 530 additions and 8 deletions
|
|
@ -340,6 +340,10 @@ resource "kubernetes_cron_job_v1" "mysql-backup" {
|
|||
}
|
||||
command = ["/bin/bash", "-c", <<-EOT
|
||||
set -euxo pipefail
|
||||
_t0=$(date +%s)
|
||||
_rb0=$(awk '/^read_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0)
|
||||
_wb0=$(awk '/^write_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0)
|
||||
|
||||
export now=$(date +"%Y_%m_%d_%H_%M")
|
||||
mysqldump --all-databases -u root --host mysql.dbaas.svc.cluster.local | gzip -9 > /backup/dump_$now.sql.gz
|
||||
|
||||
|
|
@ -347,7 +351,22 @@ resource "kubernetes_cron_job_v1" "mysql-backup" {
|
|||
cd /backup
|
||||
find . -name "dump_*.sql.gz" -type f -mtime +14 -delete
|
||||
find . -name "dump_*.sql" -type f -mtime +14 -delete # clean up old uncompressed
|
||||
echo Done
|
||||
|
||||
_dur=$(($(date +%s) - _t0))
|
||||
_rb1=$(awk '/^read_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0)
|
||||
_wb1=$(awk '/^write_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0)
|
||||
echo "=== Backup IO Stats ==="
|
||||
echo "duration: $${_dur}s"
|
||||
echo "read: $(( (_rb1 - _rb0) / 1048576 )) MiB"
|
||||
echo "written: $(( (_wb1 - _wb0) / 1048576 )) MiB"
|
||||
echo "output: $(ls -lh /backup/dump_$now.sql.gz | awk '{print $5}')"
|
||||
|
||||
curl -sf --data-binary @- "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/mysql-backup" <<PGEOF || true
|
||||
backup_duration_seconds $${_dur}
|
||||
backup_read_bytes $(( _rb1 - _rb0 ))
|
||||
backup_written_bytes $(( _wb1 - _wb0 ))
|
||||
backup_last_success_timestamp $(date +%s)
|
||||
PGEOF
|
||||
EOT
|
||||
]
|
||||
# To restore (from outside of the cluster):
|
||||
|
|
@ -1077,6 +1096,10 @@ resource "kubernetes_cron_job_v1" "postgresql-backup" {
|
|||
}
|
||||
command = ["/bin/bash", "-c", <<-EOT
|
||||
set -euxo pipefail
|
||||
_t0=$(date +%s)
|
||||
_rb0=$(awk '/^read_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0)
|
||||
_wb0=$(awk '/^write_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0)
|
||||
|
||||
export now=$(date +"%Y_%m_%d_%H_%M")
|
||||
PGPASSWORD=$PGPASSWORD pg_dumpall -h postgresql.dbaas -U postgres | gzip -9 > /backup/dump_$now.sql.gz
|
||||
|
||||
|
|
@ -1084,7 +1107,22 @@ resource "kubernetes_cron_job_v1" "postgresql-backup" {
|
|||
cd /backup
|
||||
find . -name "dump_*.sql.gz" -type f -mtime +14 -delete
|
||||
find . -name "dump_*.sql" -type f -mtime +14 -delete # clean up old uncompressed
|
||||
echo Done
|
||||
|
||||
_dur=$(($(date +%s) - _t0))
|
||||
_rb1=$(awk '/^read_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0)
|
||||
_wb1=$(awk '/^write_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0)
|
||||
echo "=== Backup IO Stats ==="
|
||||
echo "duration: $${_dur}s"
|
||||
echo "read: $(( (_rb1 - _rb0) / 1048576 )) MiB"
|
||||
echo "written: $(( (_wb1 - _wb0) / 1048576 )) MiB"
|
||||
echo "output: $(ls -lh /backup/dump_$now.sql.gz | awk '{print $5}')"
|
||||
|
||||
curl -sf --data-binary @- "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/postgresql-backup" <<PGEOF || true
|
||||
backup_duration_seconds $${_dur}
|
||||
backup_read_bytes $(( _rb1 - _rb0 ))
|
||||
backup_written_bytes $(( _wb1 - _wb0 ))
|
||||
backup_last_success_timestamp $(date +%s)
|
||||
PGEOF
|
||||
EOT
|
||||
]
|
||||
volume_mount {
|
||||
|
|
|
|||
|
|
@ -684,12 +684,32 @@ resource "kubernetes_cron_job_v1" "postgresql-backup" {
|
|||
name = "postgresql-backup"
|
||||
image = "postgres:16.4-bullseye"
|
||||
command = ["/bin/sh", "-c", <<-EOT
|
||||
_t0=$(date +%s)
|
||||
_rb0=$(awk '/^read_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0)
|
||||
_wb0=$(awk '/^write_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0)
|
||||
|
||||
export now=$(date +"%Y_%m_%d_%H_%M")
|
||||
pg_dumpall -h immich-postgresql -U immich > /backup/dump_$now.sql
|
||||
|
||||
# Rotate - delete last log file
|
||||
cd /backup
|
||||
find . -name "dump_*.sql" -type f -mtime +14 -delete # 14 day retention of backups
|
||||
|
||||
_dur=$(($(date +%s) - _t0))
|
||||
_rb1=$(awk '/^read_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0)
|
||||
_wb1=$(awk '/^write_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0)
|
||||
echo "=== Backup IO Stats ==="
|
||||
echo "duration: $${_dur}s"
|
||||
echo "read: $(( (_rb1 - _rb0) / 1048576 )) MiB"
|
||||
echo "written: $(( (_wb1 - _wb0) / 1048576 )) MiB"
|
||||
echo "output: $(ls -lh /backup/dump_$now.sql | awk '{print $5}')"
|
||||
|
||||
curl -sf --data-binary @- "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/immich-postgresql-backup" <<PGEOF || true
|
||||
backup_duration_seconds $${_dur}
|
||||
backup_read_bytes $(( _rb1 - _rb0 ))
|
||||
backup_written_bytes $(( _wb1 - _wb0 ))
|
||||
backup_last_success_timestamp $(date +%s)
|
||||
PGEOF
|
||||
EOT
|
||||
]
|
||||
env {
|
||||
|
|
|
|||
|
|
@ -102,7 +102,30 @@ resource "kubernetes_cron_job_v1" "backup-etcd" {
|
|||
name = "backup-etcd"
|
||||
image = "registry.k8s.io/etcd:3.5.21-0"
|
||||
command = ["/bin/sh", "-c"]
|
||||
args = ["ETCDCTL_API=3 etcdctl --endpoints=https://127.0.0.1:2379 --cacert=/etc/kubernetes/pki/etcd/ca.crt --cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt --key=/etc/kubernetes/pki/etcd/healthcheck-client.key snapshot save /backup/etcd-snapshot-$(date +%Y%m%d-%H%M%S).db"]
|
||||
args = [<<-EOT
|
||||
set -eu
|
||||
_t0=$(date +%s)
|
||||
_rb0=$(awk '/^read_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0)
|
||||
_wb0=$(awk '/^write_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0)
|
||||
|
||||
TIMESTAMP=$(date +%Y%m%d-%H%M%S)
|
||||
ETCDCTL_API=3 etcdctl \
|
||||
--endpoints=https://127.0.0.1:2379 \
|
||||
--cacert=/etc/kubernetes/pki/etcd/ca.crt \
|
||||
--cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt \
|
||||
--key=/etc/kubernetes/pki/etcd/healthcheck-client.key \
|
||||
snapshot save /backup/etcd-snapshot-$TIMESTAMP.db
|
||||
|
||||
_dur=$(($(date +%s) - _t0))
|
||||
_rb1=$(awk '/^read_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0)
|
||||
_wb1=$(awk '/^write_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0)
|
||||
echo "=== Backup IO Stats ==="
|
||||
echo "duration: $${_dur}s"
|
||||
echo "read: $(( (_rb1 - _rb0) / 1048576 )) MiB"
|
||||
echo "written: $(( (_wb1 - _wb0) / 1048576 )) MiB"
|
||||
echo "output: $(ls -lh /backup/etcd-snapshot-$TIMESTAMP.db | awk '{print $5}')"
|
||||
EOT
|
||||
]
|
||||
env {
|
||||
name = "ETCDCTL_API"
|
||||
value = "3"
|
||||
|
|
|
|||
|
|
@ -0,0 +1,377 @@
|
|||
{
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": { "type": "datasource", "uid": "grafana" },
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"type": "dashboard"
|
||||
}
|
||||
]
|
||||
},
|
||||
"description": "Backup health overview — K8s CronJob backups and TrueNAS Cloud Sync status",
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 1,
|
||||
"id": 0,
|
||||
"links": [],
|
||||
"panels": [
|
||||
{
|
||||
"title": "Time Since Last Successful Backup",
|
||||
"type": "stat",
|
||||
"gridPos": { "h": 6, "w": 24, "x": 0, "y": 0 },
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 90000 },
|
||||
{ "color": "red", "value": 604800 }
|
||||
]
|
||||
},
|
||||
"mappings": [
|
||||
{
|
||||
"type": "special",
|
||||
"options": { "match": "null", "result": { "text": "No data", "color": "red" } }
|
||||
}
|
||||
]
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"orientation": "auto",
|
||||
"textMode": "auto",
|
||||
"colorMode": "background",
|
||||
"graphMode": "none"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "time() - backup_last_success_timestamp",
|
||||
"legendFormat": "{{ job }}",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "time() - cloudsync_last_success_timestamp",
|
||||
"legendFormat": "cloudsync-{{ task_id }}",
|
||||
"refId": "B"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "All Backups — Overview",
|
||||
"type": "table",
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 6 },
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Duration (s)" },
|
||||
"properties": [{ "id": "unit", "value": "s" }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Read (MiB)" },
|
||||
"properties": [{ "id": "unit", "value": "decmbytes" }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Written (MiB)" },
|
||||
"properties": [{ "id": "unit", "value": "decmbytes" }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Last Success" },
|
||||
"properties": [{ "id": "unit", "value": "dateTimeFromNow" }]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"showHeader": true,
|
||||
"sortBy": [{ "displayName": "Last Success", "desc": true }]
|
||||
},
|
||||
"transformations": [
|
||||
{
|
||||
"id": "merge",
|
||||
"options": {}
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"renameByName": {
|
||||
"Value #Duration": "Duration (s)",
|
||||
"Value #Read": "Read (MiB)",
|
||||
"Value #Written": "Written (MiB)",
|
||||
"Value #LastSuccess": "Last Success",
|
||||
"job": "Backup"
|
||||
},
|
||||
"excludeByName": {
|
||||
"Time": true,
|
||||
"instance": true,
|
||||
"__name__": true
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"targets": [
|
||||
{
|
||||
"expr": "backup_duration_seconds",
|
||||
"legendFormat": "{{ job }}",
|
||||
"refId": "Duration",
|
||||
"instant": true,
|
||||
"format": "table"
|
||||
},
|
||||
{
|
||||
"expr": "backup_read_bytes / 1048576",
|
||||
"legendFormat": "{{ job }}",
|
||||
"refId": "Read",
|
||||
"instant": true,
|
||||
"format": "table"
|
||||
},
|
||||
{
|
||||
"expr": "backup_written_bytes / 1048576",
|
||||
"legendFormat": "{{ job }}",
|
||||
"refId": "Written",
|
||||
"instant": true,
|
||||
"format": "table"
|
||||
},
|
||||
{
|
||||
"expr": "backup_last_success_timestamp * 1000",
|
||||
"legendFormat": "{{ job }}",
|
||||
"refId": "LastSuccess",
|
||||
"instant": true,
|
||||
"format": "table"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Backup Duration Trend",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 },
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 10,
|
||||
"pointSize": 5,
|
||||
"showPoints": "auto"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "table", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "backup_duration_seconds",
|
||||
"legendFormat": "{{ job }}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Backup IO Trend (Read + Written)",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 },
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes",
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 10,
|
||||
"pointSize": 5,
|
||||
"showPoints": "auto"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "table", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "backup_read_bytes",
|
||||
"legendFormat": "{{ job }} read",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "backup_written_bytes",
|
||||
"legendFormat": "{{ job }} written",
|
||||
"refId": "B"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Cloud Sync Status",
|
||||
"type": "stat",
|
||||
"gridPos": { "h": 6, "w": 12, "x": 0, "y": 22 },
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"mappings": [
|
||||
{ "type": "value", "options": { "1": { "text": "SUCCESS", "color": "green" }, "0": { "text": "FAILED", "color": "red" } } }
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "cloudsync_job_state",
|
||||
"legendFormat": "Task {{ task_id }}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Cloud Sync Duration",
|
||||
"type": "stat",
|
||||
"gridPos": { "h": 6, "w": 12, "x": 12, "y": 22 },
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 3600 },
|
||||
{ "color": "red", "value": 86400 }
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "cloudsync_duration_seconds",
|
||||
"legendFormat": "Task {{ task_id }}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Active Backup Alerts",
|
||||
"type": "alertlist",
|
||||
"gridPos": { "h": 6, "w": 24, "x": 0, "y": 28 },
|
||||
"datasource": { "type": "datasource", "uid": "grafana" },
|
||||
"options": {
|
||||
"showOptions": "current",
|
||||
"maxItems": 20,
|
||||
"sortOrder": 1,
|
||||
"dashboardAlerts": false,
|
||||
"alertName": "backup",
|
||||
"stateFilter": {
|
||||
"firing": true,
|
||||
"pending": true,
|
||||
"noData": true,
|
||||
"normal": false,
|
||||
"error": true
|
||||
},
|
||||
"folder": { "id": null, "title": "" },
|
||||
"folderId": null
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "CronJob Last Schedule",
|
||||
"type": "table",
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 34 },
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Value" },
|
||||
"properties": [{ "id": "unit", "value": "dateTimeAsIso" }]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"showHeader": true,
|
||||
"sortBy": [{ "displayName": "Value", "desc": true }]
|
||||
},
|
||||
"transformations": [
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"renameByName": {
|
||||
"cronjob": "CronJob",
|
||||
"namespace": "Namespace",
|
||||
"Value": "Last Scheduled"
|
||||
},
|
||||
"excludeByName": {
|
||||
"Time": true,
|
||||
"__name__": true,
|
||||
"instance": true,
|
||||
"job": true,
|
||||
"uid": true
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"targets": [
|
||||
{
|
||||
"expr": "kube_cronjob_status_last_schedule_time{cronjob=~\".*backup.*|.*cloudsync.*|.*etcd.*|.*raft.*\"} * 1000",
|
||||
"legendFormat": "",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"format": "table"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": {
|
||||
"text": "Prometheus",
|
||||
"value": "PBFA97CFB590B2093"
|
||||
},
|
||||
"includeAll": false,
|
||||
"name": "datasource",
|
||||
"options": [],
|
||||
"query": "prometheus",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"type": "datasource"
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": { "from": "now-7d", "to": "now" },
|
||||
"timepicker": {},
|
||||
"timezone": "",
|
||||
"title": "Backup Health",
|
||||
"uid": "backup-health",
|
||||
"version": 1,
|
||||
"schemaVersion": 39
|
||||
}
|
||||
|
|
@ -144,7 +144,18 @@ resource "kubernetes_cron_job_v1" "cloudsync_monitor" {
|
|||
EPOCH_SECS=0
|
||||
fi
|
||||
|
||||
echo "Task $TASK_ID ($TASK_DESC): state=$JOB_STATE, last_finished=$EPOCH_SECS"
|
||||
# Extract transfer stats from job progress description (rclone output)
|
||||
JOB_PROGRESS=$(echo "$task" | jq -r '.job.progress.description // ""')
|
||||
BYTES_TX=$(echo "$JOB_PROGRESS" | grep -oP 'Transferred:\s+[\d.]+ \w+' | head -1 | awk '{print $2}' || echo 0)
|
||||
JOB_STARTED=$(echo "$task" | jq -r '.job.time_started."$date" // 0')
|
||||
JOB_FINISHED=$(echo "$task" | jq -r '.job.time_finished."$date" // 0')
|
||||
if [ "$JOB_STARTED" != "0" ] && [ "$JOB_STARTED" != "null" ] && [ "$JOB_FINISHED" != "0" ] && [ "$JOB_FINISHED" != "null" ]; then
|
||||
SYNC_DURATION=$(( (JOB_FINISHED - JOB_STARTED) / 1000 ))
|
||||
else
|
||||
SYNC_DURATION=0
|
||||
fi
|
||||
|
||||
echo "Task $TASK_ID ($TASK_DESC): state=$JOB_STATE, last_finished=$EPOCH_SECS, duration=$${SYNC_DURATION}s"
|
||||
|
||||
# Push metrics to Pushgateway
|
||||
cat <<METRICS | curl -sf --data-binary @- "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/cloudsync-monitor/task_id/$TASK_ID"
|
||||
|
|
@ -154,6 +165,9 @@ resource "kubernetes_cron_job_v1" "cloudsync_monitor" {
|
|||
# HELP cloudsync_job_state Cloud Sync job state (1=SUCCESS, 0=other)
|
||||
# TYPE cloudsync_job_state gauge
|
||||
cloudsync_job_state $([ "$JOB_STATE" = "SUCCESS" ] && echo 1 || echo 0)
|
||||
# HELP cloudsync_duration_seconds Duration of the last Cloud Sync run
|
||||
# TYPE cloudsync_duration_seconds gauge
|
||||
cloudsync_duration_seconds $SYNC_DURATION
|
||||
METRICS
|
||||
done
|
||||
|
||||
|
|
|
|||
|
|
@ -283,15 +283,33 @@ resource "kubernetes_cron_job_v1" "redis-backup" {
|
|||
image = "redis:7-alpine"
|
||||
command = ["/bin/sh", "-c", <<-EOT
|
||||
set -eux
|
||||
_t0=$(date +%s)
|
||||
_rb0=$(awk '/^read_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0)
|
||||
_wb0=$(awk '/^write_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0)
|
||||
|
||||
TIMESTAMP=$(date +%Y%m%d-%H%M)
|
||||
# Trigger a fresh RDB save on the master
|
||||
redis-cli -h redis.redis BGSAVE
|
||||
sleep 5
|
||||
# Copy the RDB via redis-cli --rdb
|
||||
redis-cli -h redis.redis --rdb /backup/redis-$TIMESTAMP.rdb
|
||||
# Rotate — 7-day retention
|
||||
# Rotate — 28-day retention
|
||||
find /backup -name 'redis-*.rdb' -type f -mtime +28 -delete
|
||||
echo "Backup complete: redis-$TIMESTAMP.rdb"
|
||||
|
||||
_dur=$(($(date +%s) - _t0))
|
||||
_rb1=$(awk '/^read_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0)
|
||||
_wb1=$(awk '/^write_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0)
|
||||
echo "=== Backup IO Stats ==="
|
||||
echo "duration: $${_dur}s"
|
||||
echo "read: $(( (_rb1 - _rb0) / 1048576 )) MiB"
|
||||
echo "written: $(( (_wb1 - _wb0) / 1048576 )) MiB"
|
||||
echo "output: $(ls -lh /backup/redis-$$TIMESTAMP.rdb | awk '{print $5}')"
|
||||
|
||||
wget -qO- --post-data "backup_duration_seconds $${_dur}
|
||||
backup_read_bytes $(( _rb1 - _rb0 ))
|
||||
backup_written_bytes $(( _wb1 - _wb0 ))
|
||||
backup_last_success_timestamp $(date +%s)
|
||||
" "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/redis-backup" || true
|
||||
EOT
|
||||
]
|
||||
volume_mount {
|
||||
|
|
|
|||
|
|
@ -265,12 +265,26 @@ resource "kubernetes_cron_job_v1" "vault_backup" {
|
|||
image = "hashicorp/vault:1.18.1"
|
||||
command = ["/bin/sh", "-c"]
|
||||
args = [join("", [
|
||||
"set -eu; ",
|
||||
"_t0=$(date +%s); ",
|
||||
"_rb0=$(awk '/^read_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0); ",
|
||||
"_wb0=$(awk '/^write_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0); ",
|
||||
"export VAULT_ADDR=http://vault-active.vault.svc.cluster.local:8200 && ",
|
||||
"export VAULT_TOKEN=$(cat /vault/token/vault-root-token) && ",
|
||||
"TIMESTAMP=$(date +%Y%m%d-%H%M%S) && ",
|
||||
"vault operator raft snapshot save /backup/vault-raft-$TIMESTAMP.db && ",
|
||||
"find /backup -name '*.db' -mtime +30 -delete && ",
|
||||
"echo \"Backup done: vault-raft-$TIMESTAMP.db\" && ls -lh /backup/"
|
||||
"echo \"Backup done: vault-raft-$TIMESTAMP.db\" && ls -lh /backup/ && ",
|
||||
"_dur=$(( $(date +%s) - _t0 )); ",
|
||||
"_rb1=$(awk '/^read_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0); ",
|
||||
"_wb1=$(awk '/^write_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0); ",
|
||||
"echo '=== Backup IO Stats ==='; ",
|
||||
"echo \"duration: $${_dur}s\"; ",
|
||||
"echo \"read: $(( (_rb1 - _rb0) / 1048576 )) MiB\"; ",
|
||||
"echo \"written: $(( (_wb1 - _wb0) / 1048576 )) MiB\"; ",
|
||||
"echo \"output: $(ls -lh /backup/vault-raft-$TIMESTAMP.db | awk '{print $5}')\"; ",
|
||||
"wget -qO- --post-data \"backup_duration_seconds $${_dur}\nbackup_read_bytes $((_rb1 - _rb0))\nbackup_written_bytes $((_wb1 - _wb0))\nbackup_last_success_timestamp $(date +%s)\n\" ",
|
||||
"\"http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/vault-raft-backup\" || true"
|
||||
])]
|
||||
volume_mount {
|
||||
mount_path = "/backup"
|
||||
|
|
|
|||
|
|
@ -247,6 +247,10 @@ resource "kubernetes_cron_job_v1" "vaultwarden-backup" {
|
|||
command = ["/bin/sh", "-c", <<-EOT
|
||||
set -euxo pipefail
|
||||
apk add --no-cache sqlite
|
||||
_t0=$(date +%s)
|
||||
_rb0=$(awk '/^read_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0)
|
||||
_wb0=$(awk '/^write_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0)
|
||||
|
||||
now=$(date +"%Y_%m_%d_%H_%M")
|
||||
# Pre-flight: verify source DB is healthy before backing up
|
||||
if ! sqlite3 /data/db.sqlite3 "PRAGMA integrity_check;" | grep -q "^ok$"; then
|
||||
|
|
@ -269,7 +273,21 @@ resource "kubernetes_cron_job_v1" "vaultwarden-backup" {
|
|||
cp -a /data/config.json /backup/$now/ 2>/dev/null || true
|
||||
# Rotate — 30 day retention
|
||||
find /backup -maxdepth 1 -mindepth 1 -type d -mtime +30 -exec rm -rf {} +
|
||||
echo "Backup complete: $now"
|
||||
|
||||
_dur=$(($(date +%s) - _t0))
|
||||
_rb1=$(awk '/^read_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0)
|
||||
_wb1=$(awk '/^write_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0)
|
||||
echo "=== Backup IO Stats ==="
|
||||
echo "duration: $${_dur}s"
|
||||
echo "read: $(( (_rb1 - _rb0) / 1048576 )) MiB"
|
||||
echo "written: $(( (_wb1 - _wb0) / 1048576 )) MiB"
|
||||
echo "output: $(du -sh /backup/$$now | awk '{print $$1}')"
|
||||
|
||||
wget -qO- --post-data "backup_duration_seconds $${_dur}
|
||||
backup_read_bytes $(( _rb1 - _rb0 ))
|
||||
backup_written_bytes $(( _wb1 - _wb0 ))
|
||||
backup_last_success_timestamp $(date +%s)
|
||||
" "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/vaultwarden-backup" || true
|
||||
EOT
|
||||
]
|
||||
volume_mount {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue