Add backup_output_bytes and cloudsync_transferred_bytes metrics to the backup dashboard
- All 7 backup CronJobs now push backup_output_bytes (output file size after the backup completes).
- The Cloud Sync monitor parses rclone transfer stats into cloudsync_transferred_bytes.
- Grafana dashboard: new Output (MiB) table column, Output Size Trend panel, Write Throughput panel, and Cloud Sync Transfer Volume bargauge.
- All timeseries panels use the points-only draw style (backups are discrete snapshots).
- etcd backup restructured: an init_container runs etcdctl (distroless image), a busybox sidecar handles the metrics push and purge, and DNS uses ClusterFirstWithHostNet.
- Fixed pre-existing missing curl in postgres:16.4-bullseye (immich, dbaas PG).
- Fixed grep -oP not being available in alpine/busybox (Cloud Sync monitor).
This commit is contained in:
parent
f289f76882
commit
d20c5e5535
8 changed files with 186 additions and 46 deletions
|
|
@ -98,38 +98,40 @@ resource "kubernetes_cron_job_v1" "backup-etcd" {
|
|||
node_name = "k8s-master"
|
||||
priority_class_name = "system-cluster-critical"
|
||||
host_network = true
|
||||
container {
|
||||
dns_policy = "ClusterFirstWithHostNet"
|
||||
init_container {
|
||||
name = "backup-etcd"
|
||||
image = "registry.k8s.io/etcd:3.5.21-0"
|
||||
command = ["/bin/sh", "-c"]
|
||||
args = [<<-EOT
|
||||
set -eu
|
||||
_t0=$(date +%s)
|
||||
_rb0=$(awk '/^read_bytes/{print $2}' /proc/$$/io 2>/dev/null || echo 0)
|
||||
_wb0=$(awk '/^write_bytes/{print $2}' /proc/$$/io 2>/dev/null || echo 0)
|
||||
|
||||
TIMESTAMP=$(date +%Y%m%d-%H%M%S)
|
||||
ETCDCTL_API=3 etcdctl \
|
||||
--endpoints=https://127.0.0.1:2379 \
|
||||
--cacert=/etc/kubernetes/pki/etcd/ca.crt \
|
||||
--cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt \
|
||||
--key=/etc/kubernetes/pki/etcd/healthcheck-client.key \
|
||||
snapshot save /backup/etcd-snapshot-$TIMESTAMP.db
|
||||
|
||||
_dur=$(($(date +%s) - _t0))
|
||||
_rb1=$(awk '/^read_bytes/{print $2}' /proc/$$/io 2>/dev/null || echo 0)
|
||||
_wb1=$(awk '/^write_bytes/{print $2}' /proc/$$/io 2>/dev/null || echo 0)
|
||||
echo "=== Backup IO Stats ==="
|
||||
echo "duration: $${_dur}s"
|
||||
echo "read: $(( (_rb1 - _rb0) / 1048576 )) MiB"
|
||||
echo "written: $(( (_wb1 - _wb0) / 1048576 )) MiB"
|
||||
echo "output: $(ls -lh /backup/etcd-snapshot-$TIMESTAMP.db | awk '{print $5}')"
|
||||
EOT
|
||||
]
|
||||
command = ["etcdctl", "snapshot", "save", "/backup/etcd-snapshot-latest.db"]
|
||||
resources {
|
||||
requests = {
|
||||
memory = "256Mi"
|
||||
cpu = "50m"
|
||||
}
|
||||
limits = {
|
||||
memory = "512Mi"
|
||||
}
|
||||
}
|
||||
env {
|
||||
name = "ETCDCTL_API"
|
||||
value = "3"
|
||||
}
|
||||
env {
|
||||
name = "ETCDCTL_ENDPOINTS"
|
||||
value = "https://127.0.0.1:2379"
|
||||
}
|
||||
env {
|
||||
name = "ETCDCTL_CACERT"
|
||||
value = "/etc/kubernetes/pki/etcd/ca.crt"
|
||||
}
|
||||
env {
|
||||
name = "ETCDCTL_CERT"
|
||||
value = "/etc/kubernetes/pki/etcd/healthcheck-client.crt"
|
||||
}
|
||||
env {
|
||||
name = "ETCDCTL_KEY"
|
||||
value = "/etc/kubernetes/pki/etcd/healthcheck-client.key"
|
||||
}
|
||||
volume_mount {
|
||||
mount_path = "/backup"
|
||||
name = "backup"
|
||||
|
|
@ -141,11 +143,26 @@ resource "kubernetes_cron_job_v1" "backup-etcd" {
|
|||
}
|
||||
}
|
||||
container {
|
||||
name = "backup-purge"
|
||||
image = "busybox:1.31.1"
|
||||
command = ["/bin/sh"]
|
||||
args = ["-c", "find /backup -type f -mtime +30 -name '*.db' -exec rm -- '{}' \\;"]
|
||||
name = "backup-manage"
|
||||
image = "busybox:1.37"
|
||||
command = ["/bin/sh", "-c"]
|
||||
args = [<<-EOT
|
||||
set -eu
|
||||
# Rename snapshot with timestamp
|
||||
TIMESTAMP=$(date +%Y%m%d-%H%M%S)
|
||||
mv /backup/etcd-snapshot-latest.db /backup/etcd-snapshot-$TIMESTAMP.db
|
||||
_out_bytes=$(stat -c%s /backup/etcd-snapshot-$TIMESTAMP.db 2>/dev/null || echo 0)
|
||||
echo "Backup done: etcd-snapshot-$TIMESTAMP.db ($${_out_bytes} bytes)"
|
||||
|
||||
# Rotate — 30 day retention
|
||||
find /backup -type f -mtime +30 -name '*.db' -exec rm -- '{}' \;
|
||||
|
||||
# Push metrics to Pushgateway
|
||||
wget -qO- --post-data "backup_output_bytes $${_out_bytes}
|
||||
backup_last_success_timestamp $(date +%s)
|
||||
" "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/backup-etcd" || true
|
||||
EOT
|
||||
]
|
||||
volume_mount {
|
||||
mount_path = "/backup"
|
||||
name = "backup"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue