diff --git a/stacks/dbaas/modules/dbaas/main.tf b/stacks/dbaas/modules/dbaas/main.tf index 44abb784..1b43b475 100644 --- a/stacks/dbaas/modules/dbaas/main.tf +++ b/stacks/dbaas/modules/dbaas/main.tf @@ -340,6 +340,10 @@ resource "kubernetes_cron_job_v1" "mysql-backup" { } command = ["/bin/bash", "-c", <<-EOT set -euxo pipefail + _t0=$(date +%s) + _rb0=$(awk '/^read_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0) + _wb0=$(awk '/^write_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0) + export now=$(date +"%Y_%m_%d_%H_%M") mysqldump --all-databases -u root --host mysql.dbaas.svc.cluster.local | gzip -9 > /backup/dump_$now.sql.gz @@ -347,7 +351,22 @@ resource "kubernetes_cron_job_v1" "mysql-backup" { cd /backup find . -name "dump_*.sql.gz" -type f -mtime +14 -delete find . -name "dump_*.sql" -type f -mtime +14 -delete # clean up old uncompressed - echo Done + + _dur=$(($(date +%s) - _t0)) + _rb1=$(awk '/^read_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0) + _wb1=$(awk '/^write_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0) + echo "=== Backup IO Stats ===" + echo "duration: $${_dur}s" + echo "read: $(( (_rb1 - _rb0) / 1048576 )) MiB" + echo "written: $(( (_wb1 - _wb0) / 1048576 )) MiB" + echo "output: $(ls -lh /backup/dump_$now.sql.gz | awk '{print $5}')" + + curl -sf --data-binary @- "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/mysql-backup" </dev/null || echo 0) + _wb0=$(awk '/^write_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0) + export now=$(date +"%Y_%m_%d_%H_%M") PGPASSWORD=$PGPASSWORD pg_dumpall -h postgresql.dbaas -U postgres | gzip -9 > /backup/dump_$now.sql.gz @@ -1084,7 +1107,22 @@ resource "kubernetes_cron_job_v1" "postgresql-backup" { cd /backup find . -name "dump_*.sql.gz" -type f -mtime +14 -delete find . 
-name "dump_*.sql" -type f -mtime +14 -delete # clean up old uncompressed - echo Done + + _dur=$(($(date +%s) - _t0)) + _rb1=$(awk '/^read_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0) + _wb1=$(awk '/^write_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0) + echo "=== Backup IO Stats ===" + echo "duration: $${_dur}s" + echo "read: $(( (_rb1 - _rb0) / 1048576 )) MiB" + echo "written: $(( (_wb1 - _wb0) / 1048576 )) MiB" + echo "output: $(ls -lh /backup/dump_$now.sql.gz | awk '{print $5}')" + + curl -sf --data-binary @- "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/postgresql-backup" </dev/null || echo 0) + _wb0=$(awk '/^write_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0) + export now=$(date +"%Y_%m_%d_%H_%M") pg_dumpall -h immich-postgresql -U immich > /backup/dump_$now.sql # Rotate - delete last log file cd /backup find . -name "dump_*.sql" -type f -mtime +14 -delete # 14 day retention of backups + + _dur=$(($(date +%s) - _t0)) + _rb1=$(awk '/^read_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0) + _wb1=$(awk '/^write_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0) + echo "=== Backup IO Stats ===" + echo "duration: $${_dur}s" + echo "read: $(( (_rb1 - _rb0) / 1048576 )) MiB" + echo "written: $(( (_wb1 - _wb0) / 1048576 )) MiB" + echo "output: $(ls -lh /backup/dump_$now.sql | awk '{print $5}')" + + curl -sf --data-binary @- "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/immich-postgresql-backup" </dev/null || echo 0) + _wb0=$(awk '/^write_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0) + + TIMESTAMP=$(date +%Y%m%d-%H%M%S) + ETCDCTL_API=3 etcdctl \ + --endpoints=https://127.0.0.1:2379 \ + --cacert=/etc/kubernetes/pki/etcd/ca.crt \ + --cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt \ + --key=/etc/kubernetes/pki/etcd/healthcheck-client.key \ + snapshot save /backup/etcd-snapshot-$TIMESTAMP.db + + _dur=$(($(date +%s) - _t0)) + _rb1=$(awk '/^read_bytes/{print $2}' /proc/self/io 
2>/dev/null || echo 0) + _wb1=$(awk '/^write_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0) + echo "=== Backup IO Stats ===" + echo "duration: $${_dur}s" + echo "read: $(( (_rb1 - _rb0) / 1048576 )) MiB" + echo "written: $(( (_wb1 - _wb0) / 1048576 )) MiB" + echo "output: $(ls -lh /backup/etcd-snapshot-$TIMESTAMP.db | awk '{print $5}')" + EOT + ] env { name = "ETCDCTL_API" value = "3" diff --git a/stacks/monitoring/modules/monitoring/dashboards/backup_health.json b/stacks/monitoring/modules/monitoring/dashboards/backup_health.json new file mode 100644 index 00000000..2c99019c --- /dev/null +++ b/stacks/monitoring/modules/monitoring/dashboards/backup_health.json @@ -0,0 +1,377 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { "type": "datasource", "uid": "grafana" }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Backup health overview — K8s CronJob backups and TrueNAS Cloud Sync status", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": 0, + "links": [], + "panels": [ + { + "title": "Time Since Last Successful Backup", + "type": "stat", + "gridPos": { "h": 6, "w": 24, "x": 0, "y": 0 }, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "unit": "s", + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 90000 }, + { "color": "red", "value": 604800 } + ] + }, + "mappings": [ + { + "type": "special", + "options": { "match": "null", "result": { "text": "No data", "color": "red" } } + } + ] + }, + "overrides": [] + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "orientation": "auto", + "textMode": "auto", + "colorMode": "background", + "graphMode": "none" + }, + "targets": [ + { + "expr": "time() - 
backup_last_success_timestamp", + "legendFormat": "{{ job }}", + "refId": "A" + }, + { + "expr": "time() - cloudsync_last_success_timestamp", + "legendFormat": "cloudsync-{{ task_id }}", + "refId": "B" + } + ] + }, + { + "title": "All Backups — Overview", + "type": "table", + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 6 }, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": {}, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Duration (s)" }, + "properties": [{ "id": "unit", "value": "s" }] + }, + { + "matcher": { "id": "byName", "options": "Read (MiB)" }, + "properties": [{ "id": "unit", "value": "mbytes" }] + }, + { + "matcher": { "id": "byName", "options": "Written (MiB)" }, + "properties": [{ "id": "unit", "value": "mbytes" }] + }, + { + "matcher": { "id": "byName", "options": "Last Success" }, + "properties": [{ "id": "unit", "value": "dateTimeFromNow" }] + } + ] + }, + "options": { + "showHeader": true, + "sortBy": [{ "displayName": "Last Success", "desc": true }] + }, + "transformations": [ + { + "id": "merge", + "options": {} + }, + { + "id": "organize", + "options": { + "renameByName": { + "Value #Duration": "Duration (s)", + "Value #Read": "Read (MiB)", + "Value #Written": "Written (MiB)", + "Value #LastSuccess": "Last Success", + "job": "Backup" + }, + "excludeByName": { + "Time": true, + "instance": true, + "__name__": true + } + } + } + ], + "targets": [ + { + "expr": "backup_duration_seconds", + "legendFormat": "{{ job }}", + "refId": "Duration", + "instant": true, + "format": "table" + }, + { + "expr": "backup_read_bytes / 1048576", + "legendFormat": "{{ job }}", + "refId": "Read", + "instant": true, + "format": "table" + }, + { + "expr": "backup_written_bytes / 1048576", + "legendFormat": "{{ job }}", + "refId": "Written", + "instant": true, + "format": "table" + }, + { + "expr": "backup_last_success_timestamp * 1000", + "legendFormat": "{{ job }}", + "refId": "LastSuccess", +
"instant": true, + "format": "table" + } + ] + }, + { + "title": "Backup Duration Trend", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 }, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "unit": "s", + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 10, + "pointSize": 5, + "showPoints": "auto" + } + }, + "overrides": [] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "table", "placement": "bottom", "calcs": ["lastNotNull", "max"] } + }, + "targets": [ + { + "expr": "backup_duration_seconds", + "legendFormat": "{{ job }}", + "refId": "A" + } + ] + }, + { + "title": "Backup IO Trend (Read + Written)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 }, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "unit": "bytes", + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 10, + "pointSize": 5, + "showPoints": "auto" + } + }, + "overrides": [] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "table", "placement": "bottom", "calcs": ["lastNotNull", "max"] } + }, + "targets": [ + { + "expr": "backup_read_bytes", + "legendFormat": "{{ job }} read", + "refId": "A" + }, + { + "expr": "backup_written_bytes", + "legendFormat": "{{ job }} written", + "refId": "B" + } + ] + }, + { + "title": "Cloud Sync Status", + "type": "stat", + "gridPos": { "h": 6, "w": 12, "x": 0, "y": 22 }, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "mappings": [ + { "type": "value", "options": { "1": { "text": "SUCCESS", "color": "green" }, "0": { "text": "FAILED", "color": "red" } } } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + } + }, + "overrides": [] + }, + 
"options": { + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "colorMode": "background", + "graphMode": "none", + "textMode": "auto" + }, + "targets": [ + { + "expr": "cloudsync_job_state", + "legendFormat": "Task {{ task_id }}", + "refId": "A" + } + ] + }, + { + "title": "Cloud Sync Duration", + "type": "stat", + "gridPos": { "h": 6, "w": 12, "x": 12, "y": 22 }, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "unit": "s", + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 3600 }, + { "color": "red", "value": 86400 } + ] + } + }, + "overrides": [] + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "colorMode": "background", + "graphMode": "none", + "textMode": "auto" + }, + "targets": [ + { + "expr": "cloudsync_duration_seconds", + "legendFormat": "Task {{ task_id }}", + "refId": "A" + } + ] + }, + { + "title": "Active Backup Alerts", + "type": "alertlist", + "gridPos": { "h": 6, "w": 24, "x": 0, "y": 28 }, + "datasource": { "type": "datasource", "uid": "grafana" }, + "options": { + "showOptions": "current", + "maxItems": 20, + "sortOrder": 1, + "dashboardAlerts": false, + "alertName": "backup", + "stateFilter": { + "firing": true, + "pending": true, + "noData": true, + "normal": false, + "error": true + }, + "folder": { "id": null, "title": "" }, + "folderId": null + } + }, + { + "title": "CronJob Last Schedule", + "type": "table", + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 34 }, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": {}, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Last Scheduled" }, + "properties": [{ "id": "unit", "value": "dateTimeAsIso" }] + } + ] + }, + "options": { + "showHeader": true, + "sortBy": [{ "displayName": "Last Scheduled", "desc": true }] + }, + "transformations": [ + { +
"id": "organize", + "options": { + "renameByName": { + "cronjob": "CronJob", + "namespace": "Namespace", + "Value": "Last Scheduled" + }, + "excludeByName": { + "Time": true, + "__name__": true, + "instance": true, + "job": true, + "uid": true + } + } + } + ], + "targets": [ + { + "expr": "kube_cronjob_status_last_schedule_time{cronjob=~\".*backup.*|.*cloudsync.*|.*etcd.*|.*raft.*\"} * 1000", + "legendFormat": "", + "refId": "A", + "instant": true, + "format": "table" + } + ] + } + ], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + "value": "PBFA97CFB590B2093" + }, + "includeAll": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + } + ] + }, + "time": { "from": "now-7d", "to": "now" }, + "timepicker": {}, + "timezone": "", + "title": "Backup Health", + "uid": "backup-health", + "version": 1, + "schemaVersion": 39 +} diff --git a/stacks/monitoring/modules/monitoring/main.tf b/stacks/monitoring/modules/monitoring/main.tf index 9ffcb1c5..a49c0587 100644 --- a/stacks/monitoring/modules/monitoring/main.tf +++ b/stacks/monitoring/modules/monitoring/main.tf @@ -144,7 +144,18 @@ resource "kubernetes_cron_job_v1" "cloudsync_monitor" { EPOCH_SECS=0 fi - echo "Task $TASK_ID ($TASK_DESC): state=$JOB_STATE, last_finished=$EPOCH_SECS" + # Extract transfer stats from job progress description (rclone output) + JOB_PROGRESS=$(echo "$task" | jq -r '.job.progress.description // ""') + BYTES_TX=$(echo "$JOB_PROGRESS" | grep -oP 'Transferred:\s+[\d.]+ \w+' | head -1 | awk '{print $2}' || echo 0) + JOB_STARTED=$(echo "$task" | jq -r '.job.time_started."$date" // 0') + JOB_FINISHED=$(echo "$task" | jq -r '.job.time_finished."$date" // 0') + if [ "$JOB_STARTED" != "0" ] && [ "$JOB_STARTED" != "null" ] && [ "$JOB_FINISHED" != "0" ] && [ "$JOB_FINISHED" != "null" ]; then + SYNC_DURATION=$(( (JOB_FINISHED - JOB_STARTED) / 1000 )) + else + SYNC_DURATION=0 + fi + + echo "Task $TASK_ID 
($TASK_DESC): state=$JOB_STATE, last_finished=$EPOCH_SECS, duration=$${SYNC_DURATION}s" # Push metrics to Pushgateway cat </dev/null || echo 0) + _wb0=$(awk '/^write_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0) + TIMESTAMP=$(date +%Y%m%d-%H%M) # Trigger a fresh RDB save on the master redis-cli -h redis.redis BGSAVE sleep 5 # Copy the RDB via redis-cli --rdb redis-cli -h redis.redis --rdb /backup/redis-$TIMESTAMP.rdb - # Rotate — 7-day retention + # Rotate — 28-day retention find /backup -name 'redis-*.rdb' -type f -mtime +28 -delete - echo "Backup complete: redis-$TIMESTAMP.rdb" + + _dur=$(($(date +%s) - _t0)) + _rb1=$(awk '/^read_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0) + _wb1=$(awk '/^write_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0) + echo "=== Backup IO Stats ===" + echo "duration: $${_dur}s" + echo "read: $(( (_rb1 - _rb0) / 1048576 )) MiB" + echo "written: $(( (_wb1 - _wb0) / 1048576 )) MiB" + echo "output: $(ls -lh /backup/redis-$TIMESTAMP.rdb | awk '{print $5}')" + + wget -qO- --post-data "backup_duration_seconds $${_dur} + backup_read_bytes $(( _rb1 - _rb0 )) + backup_written_bytes $(( _wb1 - _wb0 )) + backup_last_success_timestamp $(date +%s) + " "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/redis-backup" || true EOT ] volume_mount { diff --git a/stacks/vault/main.tf b/stacks/vault/main.tf index dcfe5ffd..e4f58988 100644 --- a/stacks/vault/main.tf +++ b/stacks/vault/main.tf @@ -265,12 +265,26 @@ resource "kubernetes_cron_job_v1" "vault_backup" { image = "hashicorp/vault:1.18.1" command = ["/bin/sh", "-c"] args = [join("", [ + "set -eu; ", + "_t0=$(date +%s); ", + "_rb0=$(awk '/^read_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0); ", + "_wb0=$(awk '/^write_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0); ", "export VAULT_ADDR=http://vault-active.vault.svc.cluster.local:8200 && ", "export VAULT_TOKEN=$(cat /vault/token/vault-root-token) && ", "TIMESTAMP=$(date +%Y%m%d-%H%M%S) && ",
"vault operator raft snapshot save /backup/vault-raft-$TIMESTAMP.db && ", "find /backup -name '*.db' -mtime +30 -delete && ", - "echo \"Backup done: vault-raft-$TIMESTAMP.db\" && ls -lh /backup/" + "echo \"Backup done: vault-raft-$TIMESTAMP.db\" && ls -lh /backup/ && ", + "_dur=$(( $(date +%s) - _t0 )); ", + "_rb1=$(awk '/^read_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0); ", + "_wb1=$(awk '/^write_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0); ", + "echo '=== Backup IO Stats ==='; ", + "echo \"duration: $${_dur}s\"; ", + "echo \"read: $(( (_rb1 - _rb0) / 1048576 )) MiB\"; ", + "echo \"written: $(( (_wb1 - _wb0) / 1048576 )) MiB\"; ", + "echo \"output: $(ls -lh /backup/vault-raft-$TIMESTAMP.db | awk '{print $5}')\"; ", + "wget -qO- --post-data \"backup_duration_seconds $${_dur}\nbackup_read_bytes $((_rb1 - _rb0))\nbackup_written_bytes $((_wb1 - _wb0))\nbackup_last_success_timestamp $(date +%s)\n\" ", + "\"http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/vault-raft-backup\" || true" ])] volume_mount { mount_path = "/backup" diff --git a/stacks/vaultwarden/modules/vaultwarden/main.tf b/stacks/vaultwarden/modules/vaultwarden/main.tf index 820d6cf0..bc1ea77c 100644 --- a/stacks/vaultwarden/modules/vaultwarden/main.tf +++ b/stacks/vaultwarden/modules/vaultwarden/main.tf @@ -247,6 +247,10 @@ resource "kubernetes_cron_job_v1" "vaultwarden-backup" { command = ["/bin/sh", "-c", <<-EOT set -euxo pipefail apk add --no-cache sqlite + _t0=$(date +%s) + _rb0=$(awk '/^read_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0) + _wb0=$(awk '/^write_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0) + now=$(date +"%Y_%m_%d_%H_%M") # Pre-flight: verify source DB is healthy before backing up if ! 
sqlite3 /data/db.sqlite3 "PRAGMA integrity_check;" | grep -q "^ok$"; then @@ -269,7 +273,21 @@ resource "kubernetes_cron_job_v1" "vaultwarden-backup" { cp -a /data/config.json /backup/$now/ 2>/dev/null || true # Rotate — 30 day retention find /backup -maxdepth 1 -mindepth 1 -type d -mtime +30 -exec rm -rf {} + - echo "Backup complete: $now" + + _dur=$(($(date +%s) - _t0)) + _rb1=$(awk '/^read_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0) + _wb1=$(awk '/^write_bytes/{print $2}' /proc/self/io 2>/dev/null || echo 0) + echo "=== Backup IO Stats ===" + echo "duration: $${_dur}s" + echo "read: $(( (_rb1 - _rb0) / 1048576 )) MiB" + echo "written: $(( (_wb1 - _wb0) / 1048576 )) MiB" + echo "output: $(du -sh /backup/$now | awk '{print $1}')" + + wget -qO- --post-data "backup_duration_seconds $${_dur} + backup_read_bytes $(( _rb1 - _rb0 )) + backup_written_bytes $(( _wb1 - _wb0 )) + backup_last_success_timestamp $(date +%s) + " "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/vaultwarden-backup" || true EOT ] volume_mount {