diff --git a/scripts/lvm-pvc-snapshot.timer b/scripts/lvm-pvc-snapshot.timer
new file mode 100644
index 00000000..e4d5a2a1
--- /dev/null
+++ b/scripts/lvm-pvc-snapshot.timer
@@ -0,0 +1,10 @@
+[Unit]
+Description=Daily LVM thin snapshots of Proxmox CSI PVCs
+
+[Timer]
+OnCalendar=*-*-* 03:00:00
+Persistent=true
+RandomizedDelaySec=300
+
+[Install]
+WantedBy=timers.target
diff --git a/scripts/offsite-sync-backup.service b/scripts/offsite-sync-backup.service
new file mode 100644
index 00000000..41c63edc
--- /dev/null
+++ b/scripts/offsite-sync-backup.service
@@ -0,0 +1,11 @@
+[Unit]
+Description=Weekly offsite sync: rsync /mnt/backup to Synology NAS
+After=network-online.target weekly-backup.service
+
+[Service]
+Type=oneshot
+ExecStart=/usr/local/bin/offsite-sync-backup
+StandardOutput=journal
+StandardError=journal
+SyslogIdentifier=offsite-sync-backup
+TimeoutStartSec=7200
diff --git a/scripts/offsite-sync-backup.sh b/scripts/offsite-sync-backup.sh
new file mode 100644
index 00000000..9a2aad87
--- /dev/null
+++ b/scripts/offsite-sync-backup.sh
@@ -0,0 +1,90 @@
+#!/usr/bin/env bash
+# offsite-sync-backup — Sync /mnt/backup to Synology NAS using changed-files manifest
+# Deploy to PVE host at /usr/local/bin/offsite-sync-backup
+# Schedule: Weekly Sunday 08:00 via systemd timer (After=weekly-backup.service)
+set -euo pipefail
+
+# --- Configuration ---
+BACKUP_ROOT="/mnt/backup"
+DEST="Administrator@192.168.1.13:/volume1/Backup/Viki/pve-backup"
+MANIFEST="${BACKUP_ROOT}/.changed-files"
+PUSHGATEWAY="${OFFSITE_SYNC_PUSHGATEWAY:-http://10.0.20.100:30091}"
+PUSHGATEWAY_JOB="offsite-backup-sync"
+LOCKFILE="/run/offsite-sync-backup.lock"
+
+# --- Logging ---
+log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"; }
+warn() { log "WARN: $*" >&2; }
+
+# --- Metrics ---
+# Best-effort POST of metric lines read from stdin to the Pushgateway.
+push_metrics() {
+  curl -s --connect-timeout 5 --max-time 10 --data-binary @- \
+    "${PUSHGATEWAY}/metrics/job/${PUSHGATEWAY_JOB}" 2>/dev/null || true
+}
+
+# --- Locking ---
+cleanup() { rm -f "${LOCKFILE}"; }
+if ! ( set -o noclobber; echo $$ > "${LOCKFILE}" ) 2>/dev/null; then
+  log "FATAL: Another instance running"; exit 1
+fi
+# Install the trap only once the lock is ours; a losing instance must not
+# delete the lock file that belongs to the running one.
+trap cleanup EXIT
+
+# --- Main ---
+log "=== Offsite sync starting ==="
+STATUS=0
+
+if ! mountpoint -q "${BACKUP_ROOT}"; then
+  log "FATAL: ${BACKUP_ROOT} is not mounted"; exit 1
+fi
+
+# Test SSH connectivity first
+if ! timeout 10 ssh -o BatchMode=yes -o ConnectTimeout=5 Administrator@192.168.1.13 true 2>/dev/null; then
+  log "FATAL: Cannot SSH to Synology (192.168.1.13)"
+  echo "backup_last_success_timestamp 0" | push_metrics
+  exit 1
+fi
+
+DAY_OF_MONTH=$(date +%d)
+
+if [ "${DAY_OF_MONTH}" -le 7 ]; then
+  # First Sunday of month: full sync with --delete to clean orphans on Synology
+  log "Monthly full sync (1st Sunday)..."
+  rsync -az --delete \
+    --exclude='.changed-files' \
+    --exclude='.last-offsite-sync' \
+    --exclude='.lv-pvc-mapping.json' \
+    "${BACKUP_ROOT}/" "${DEST}/" 2>&1 || STATUS=1
+elif [ -s "${MANIFEST}" ]; then
+  # Incremental: only send files listed in manifest (no remote dir walk)
+  MANIFEST_LINES=$(wc -l < "${MANIFEST}")
+  log "Incremental sync (${MANIFEST_LINES} files from manifest)..."
+  # --files-from already disables the recursion implied by -a, so only the
+  # listed paths are sent; rsync has no --no-traverse option (that is rclone's).
+  rsync -az --files-from="${MANIFEST}" \
+    "${BACKUP_ROOT}/" "${DEST}/" 2>&1 || STATUS=1
+else
+  log "No changed files in manifest, nothing to sync"
+fi
+
+if [ "${STATUS}" -eq 0 ]; then
+  # Only clear manifest + update timestamp on SUCCESS
+  touch "${BACKUP_ROOT}/.last-offsite-sync"
+  > "${MANIFEST}"
+  log "=== Offsite sync complete (success) ==="
+  # The success timestamp feeds the OffsiteBackupSyncStale alert, so it is
+  # pushed only here; a failed run leaves the previous value in place.
+  push_metrics <<EOF
+backup_last_success_timestamp $(date +%s)
+offsite_sync_last_status ${STATUS}
+EOF
+else
+  # Keep manifest for retry next week
+  warn "Offsite sync had errors — manifest preserved for retry"
+  log "=== Offsite sync complete (with errors) ==="
+  echo "offsite_sync_last_status ${STATUS}" | push_metrics
+fi
+
+exit "${STATUS}"
diff --git a/scripts/offsite-sync-backup.timer b/scripts/offsite-sync-backup.timer
new file mode 100644
index 00000000..bcb72813
--- /dev/null
+++ b/scripts/offsite-sync-backup.timer
@@ -0,0 +1,10 @@
+[Unit]
+Description=Weekly offsite sync: rsync /mnt/backup to Synology NAS
+
+[Timer]
+OnCalendar=Sun *-*-* 08:00:00
+Persistent=true
+RandomizedDelaySec=300
+
+[Install]
+WantedBy=timers.target
diff --git a/scripts/weekly-backup.service b/scripts/weekly-backup.service
new file mode 100644
index 00000000..3fac8346
--- /dev/null
+++ b/scripts/weekly-backup.service
@@ -0,0 +1,11 @@
+[Unit]
+Description=Weekly backup: rsync NFS to sda + prune snapshots
+After=network-online.target
+
+[Service]
+Type=oneshot
+ExecStart=/usr/local/bin/weekly-backup
+StandardOutput=journal
+StandardError=journal
+SyslogIdentifier=weekly-backup
+TimeoutStartSec=3600
diff --git a/scripts/weekly-backup.sh b/scripts/weekly-backup.sh
new file mode 100644
index 00000000..c9077ea4
--- /dev/null
+++ b/scripts/weekly-backup.sh
@@ -0,0 +1,271 @@
+#!/usr/bin/env bash
+# weekly-backup — 3-2-1 backup: NFS mirror + PVC file copy + pfsense + PVE config
+# Deploy to PVE host at /usr/local/bin/weekly-backup
+# Schedule: Weekly Sunday 05:00 via systemd timer
+set -euo pipefail
+
+# --- Configuration ---
+BACKUP_ROOT="/mnt/backup"
+NFS_SERVER="10.0.10.15"
+NFS_BASE="/mnt/main"
+NFS_MOUNT="/mnt/nfs-truenas"
+PVC_MOUNT="/tmp/pvc-mount"
+PUSHGATEWAY="${WEEKLY_BACKUP_PUSHGATEWAY:-http://10.0.20.100:30091}"
+PUSHGATEWAY_JOB="weekly-backup"
+LOCKFILE="/run/weekly-backup.lock"
+MANIFEST="${BACKUP_ROOT}/.changed-files"
+MAPPING_CACHE="${BACKUP_ROOT}/.lv-pvc-mapping.json"
+KUBECONFIG="${KUBECONFIG:-/root/.kube/config}"
+export KUBECONFIG
+
+# NFS backup directories to mirror
+BACKUP_DIRS=(
+  mysql-backup
+  postgresql-backup
+  vault-backup
+  vaultwarden-backup
+  redis-backup
+  etcd-backup
+  headscale-backup
+  prometheus-backup
+  plotting-book-backup
+)
+
+# --- Metrics ---
+# Defined before die() so that every die call site, including the lock check
+# below, can report the failure.
+# $1 = run status (0 = ok, default 0), $2 = bytes synced (default 0).
+# Best effort: an unreachable Pushgateway never fails the backup.
+push_metrics() {
+  local status="${1:-0}" bytes="${2:-0}"
+  cat <<EOF | curl -s --connect-timeout 5 --max-time 10 --data-binary @- "${PUSHGATEWAY}/metrics/job/${PUSHGATEWAY_JOB}" 2>/dev/null || true
+weekly_backup_last_run_timestamp $(date +%s)
+weekly_backup_last_status ${status}
+weekly_backup_bytes_synced ${bytes}
+EOF
+}
+
+# --- Logging ---
+log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"; }
+warn() { log "WARN: $*" >&2; }
+die() { log "FATAL: $*" >&2; push_metrics 1 0; exit 1; }
+
+# --- Locking ---
+cleanup() {
+  umount "${PVC_MOUNT}" 2>/dev/null || true
+  umount "${NFS_MOUNT}" 2>/dev/null || true
+  rm -f "${LOCKFILE}"
+}
+if ! ( set -o noclobber; echo $$ > "${LOCKFILE}" ) 2>/dev/null; then
+  die "Another instance is running (PID $(cat "${LOCKFILE}" 2>/dev/null || echo unknown))"
+fi
+# Arm cleanup only after the lock is ours: a losing instance must not unmount
+# the running instance's mounts or remove its lock file.
+trap cleanup EXIT
+
+# --- PVC name resolution ---
+# Print "namespace/name" of the PVC whose CSI volumeHandle ends with LV name $1,
+# looked up in the cached `kubectl get pv -o json` output; prints nothing on error.
+resolve_pvc_name() {
+  local lv="$1"
+  jq -r --arg lv "${lv}" '
+    .items[] |
+    select(.spec.csi.volumeHandle // "" | endswith($lv)) |
+    "\(.spec.claimRef.namespace)/\(.spec.claimRef.name)"
+  ' "${MAPPING_CACHE}" 2>/dev/null
+}
+
+# --- Main ---
+log "=== Weekly backup starting ==="
+
+if ! mountpoint -q "${BACKUP_ROOT}"; then
+  die "${BACKUP_ROOT} is not mounted"
+fi
+
+STATUS=0
+TOTAL_BYTES=0
+
+# Clear manifest for this run
+> "${MANIFEST}"
+
+# ============================================================
+# STEP 1: Mirror NFS backup directories from TrueNAS
+# ============================================================
+log "--- Step 1: NFS backup mirror ---"
+mkdir -p "${NFS_MOUNT}"
+if ! mountpoint -q "${NFS_MOUNT}"; then
+  if ! timeout 30 mount -t nfs -o soft,timeo=30,retrans=3,ro "${NFS_SERVER}:${NFS_BASE}" "${NFS_MOUNT}"; then
+    warn "Failed to mount NFS — skipping NFS mirror step"
+    STATUS=1
+  fi
+fi
+
+if mountpoint -q "${NFS_MOUNT}"; then
+  mkdir -p "${BACKUP_ROOT}/nfs-mirror"
+  for dir in "${BACKUP_DIRS[@]}"; do
+    src="${NFS_MOUNT}/${dir}/"
+    dst="${BACKUP_ROOT}/nfs-mirror/${dir}/"
+    mkdir -p "${dst}"
+    if [ ! -d "${src}" ]; then
+      continue
+    fi
+    log "Syncing ${dir}..."
+    # With a bare '%n' format rsync reports removals as "deleting <path>";
+    # drop those so the manifest lists only paths that exist locally.
+    if rsync -az --delete --out-format='%n' "${src}" "${dst}" 2>/dev/null | \
+      sed -e '/^deleting /d' -e "s|^|nfs-mirror/${dir}/|" >> "${MANIFEST}"; then
+      size=$(du -sb "${dst}" 2>/dev/null | cut -f1 || true)
+      TOTAL_BYTES=$((TOTAL_BYTES + ${size:-0}))
+      log " OK: ${dir} ($(du -sh "${dst}" | cut -f1))"
+    else
+      warn "Failed to sync ${dir}"
+      STATUS=1
+    fi
+  done
+  umount "${NFS_MOUNT}" 2>/dev/null || true
+fi
+
+# ============================================================
+# STEP 2: PVC file-level copy from LVM thin snapshots
+# ============================================================
+log "--- Step 2: PVC file copy from snapshots ---"
+WEEK=$(date +%Y-%W)
+PREV=$(ls -1d "${BACKUP_ROOT}/pvc-data"/????-?? 2>/dev/null | tail -1 || true)
+
+# Cache LV→PVC mapping (fallback if kubectl is down next time)
+if kubectl get pv -o json > /tmp/pv-list.json 2>/dev/null; then
+  cp /tmp/pv-list.json "${MAPPING_CACHE}"
+  rm -f /tmp/pv-list.json
+fi
+
+if [ ! -f "${MAPPING_CACHE}" ]; then
+  warn "No PV mapping cache and kubectl unavailable — skipping PVC copy"
+  STATUS=1
+else
+  mkdir -p "${PVC_MOUNT}"
+  PVC_COUNT=0
+  PVC_FAIL=0
+
+  # Iterate origin LVs (not snapshots), find latest snapshot for each
+  for origin_lv in $(lvs --noheadings -o lv_name pve 2>/dev/null | grep 'vm-9999-pvc-' | grep -v '_snap_' | tr -d ' '); do
+    # Find latest snapshot for this origin
+    snap=$(lvs --noheadings -o lv_name pve 2>/dev/null | tr -d ' ' | grep "^${origin_lv}_snap_" | sort | tail -1 || true)
+    [ -z "${snap}" ] && continue
+
+    # Resolve human-readable name (|| true: a jq error must skip this LV,
+    # not abort the whole run under set -e)
+    ns_pvc=$(resolve_pvc_name "${origin_lv}" || true)
+    if [ -z "${ns_pvc}" ] || [ "${ns_pvc}" = "null/null" ]; then
+      warn "Cannot resolve PVC name for ${origin_lv}, skipping"
+      continue
+    fi
+
+    # Mount snapshot read-only, rsync files
+    if timeout 30 mount -o ro "/dev/pve/${snap}" "${PVC_MOUNT}" 2>&1; then
+      dst="${BACKUP_ROOT}/pvc-data/${WEEK}/${ns_pvc}"
+      mkdir -p "${dst}"
+      if rsync -az --delete \
+        ${PREV:+--link-dest="${PREV}/${ns_pvc}/"} \
+        "${PVC_MOUNT}/" "${dst}/" 2>&1; then
+        PVC_COUNT=$((PVC_COUNT + 1))
+      else
+        warn "rsync failed for ${ns_pvc}"
+        PVC_FAIL=$((PVC_FAIL + 1))
+      fi
+      umount "${PVC_MOUNT}" 2>/dev/null || umount -l "${PVC_MOUNT}" 2>/dev/null || true
+    else
+      warn "Failed to mount snapshot ${snap}"
+      PVC_FAIL=$((PVC_FAIL + 1))
+    fi
+  done
+
+  log " PVC copy: ${PVC_COUNT} OK, ${PVC_FAIL} failed"
+  [ "${PVC_FAIL}" -gt 0 ] && STATUS=1
+
+  # Add PVC files to manifest
+  if [ -d "${BACKUP_ROOT}/pvc-data/${WEEK}" ]; then
+    find "${BACKUP_ROOT}/pvc-data/${WEEK}" -type f 2>/dev/null | \
+      sed "s|^${BACKUP_ROOT}/||" >> "${MANIFEST}"
+  fi
+
+  # Prune old weekly versions (keep 4)
+  ls -1d "${BACKUP_ROOT}/pvc-data"/????-?? 2>/dev/null | head -n -4 | xargs rm -rf 2>/dev/null || true
+
+  PVC_BYTES=$(du -sb "${BACKUP_ROOT}/pvc-data/${WEEK}" 2>/dev/null | cut -f1 || true)
+  TOTAL_BYTES=$((TOTAL_BYTES + ${PVC_BYTES:-0}))
+fi
+
+# ============================================================
+# STEP 3: pfsense backup (config.xml + full tar)
+# ============================================================
+log "--- Step 3: pfsense backup ---"
+PFSENSE_DEST="${BACKUP_ROOT}/pfsense"
+DATE=$(date +%Y%m%d)
+mkdir -p "${PFSENSE_DEST}"
+
+if timeout 10 ssh -o BatchMode=yes -o ConnectTimeout=5 root@10.0.20.1 true 2>/dev/null; then
+  PFSENSE_OK=1
+
+  # config.xml — primary restore artifact
+  if scp -o ConnectTimeout=10 root@10.0.20.1:/cf/conf/config.xml "${PFSENSE_DEST}/config-${DATE}.xml" 2>/dev/null; then
+    log " OK: config.xml"
+    echo "pfsense/config-${DATE}.xml" >> "${MANIFEST}"
+  else
+    warn "Failed to copy pfsense config.xml"
+    STATUS=1
+    PFSENSE_OK=0
+  fi
+
+  # Full filesystem tar
+  if ssh -o ConnectTimeout=10 root@10.0.20.1 \
+    "tar czf - --exclude=/dev --exclude=/proc --exclude=/tmp --exclude=/var/run /" \
+    > "${PFSENSE_DEST}/pfsense-full-${DATE}.tar.gz" 2>/dev/null; then
+    log " OK: full tar ($(du -sh "${PFSENSE_DEST}/pfsense-full-${DATE}.tar.gz" | cut -f1))"
+    echo "pfsense/pfsense-full-${DATE}.tar.gz" >> "${MANIFEST}"
+  else
+    warn "Failed to tar pfsense filesystem"
+    STATUS=1
+    PFSENSE_OK=0
+  fi
+
+  # Retention: keep 4 weekly copies
+  ls -t "${PFSENSE_DEST}"/config-*.xml 2>/dev/null | tail -n +5 | xargs rm -f 2>/dev/null || true
+  ls -t "${PFSENSE_DEST}"/pfsense-full-*.tar.gz 2>/dev/null | tail -n +5 | xargs rm -f 2>/dev/null || true
+
+  # Push the pfsense success timestamp only when both artifacts were captured,
+  # so PfsenseBackupStale can fire when the backup keeps failing.
+  if [ "${PFSENSE_OK}" -eq 1 ]; then
+    echo "backup_last_success_timestamp $(date +%s)" | \
+      curl -s --connect-timeout 5 --max-time 10 --data-binary @- \
+        "${PUSHGATEWAY}/metrics/job/pfsense-backup" 2>/dev/null || true
+  fi
+else
+  warn "Cannot SSH to pfsense (10.0.20.1) — skipping"
+  STATUS=1
+fi
+
+# ============================================================
+# STEP 4: PVE host config backup
+# ============================================================
+log "--- Step 4: PVE host config ---"
+mkdir -p "${BACKUP_ROOT}/pve-config/scripts"
+rsync -az --delete /etc/pve/ "${BACKUP_ROOT}/pve-config/etc-pve/" 2>&1 || { warn "Failed to sync /etc/pve"; STATUS=1; }
+for script in /usr/local/bin/lvm-pvc-snapshot /usr/local/bin/weekly-backup /usr/local/bin/offsite-sync-backup; do
+  [ -f "${script}" ] && cp "${script}" "${BACKUP_ROOT}/pve-config/scripts/" 2>/dev/null || true
+done
+find "${BACKUP_ROOT}/pve-config" -type f 2>/dev/null | sed "s|^${BACKUP_ROOT}/||" >> "${MANIFEST}"
+log " OK: PVE config"
+
+# ============================================================
+# STEP 5: Prune LVM snapshots older than 7 days
+# ============================================================
+log "--- Step 5: Snapshot pruning (7-day retention) ---"
+/usr/local/bin/lvm-pvc-snapshot prune 2>&1 || { warn "Snapshot prune failed"; STATUS=1; }
+
+# ============================================================
+# Done
+# ============================================================
+MANIFEST_LINES=$(wc -l < "${MANIFEST}" 2>/dev/null || echo 0)
+log "=== Weekly backup complete (status=${STATUS}, ${TOTAL_BYTES} bytes, ${MANIFEST_LINES} files in manifest) ==="
+push_metrics "${STATUS}" "${TOTAL_BYTES}"
+exit "${STATUS}"
diff --git a/scripts/weekly-backup.timer b/scripts/weekly-backup.timer
new file mode 100644
index 00000000..3b69cdc5
--- /dev/null
+++ b/scripts/weekly-backup.timer
@@ -0,0 +1,10 @@
+[Unit]
+Description=Weekly backup: rsync NFS to sda + prune snapshots
+
+[Timer]
+OnCalendar=Sun *-*-* 05:00:00
+Persistent=true
+RandomizedDelaySec=300
+
+[Install]
+WantedBy=timers.target
diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
index 8098cf04..8999256e 100755
--- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
+++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
@@ -1074,12 +1074,12 @@ serverFiles:
             annotations:
               summary: "Backup job failed: {{ $labels.namespace }}/{{ $labels.job_name }}"
           - alert: LVMSnapshotStale
-            expr: (time() - lvm_snapshot_last_run_timestamp{job="lvm-pvc-snapshot"}) > 86400
+            expr: (time() - lvm_snapshot_last_run_timestamp{job="lvm-pvc-snapshot"}) > 172800
             for: 30m
             labels:
               severity: critical
             annotations:
-              summary: "LVM PVC snapshots are {{ $value | humanizeDuration }} old (expected every 12h)"
+              summary: "LVM PVC snapshots are {{ $value | humanizeDuration }} old (expected daily)"
           - alert: LVMSnapshotNeverRun
             expr: absent(lvm_snapshot_last_run_timestamp{job="lvm-pvc-snapshot"})
             for: 48h
@@ -1101,6 +1101,42 @@ serverFiles:
               severity: warning
             annotations:
               summary: "LVM thin pool has only {{ $value }}% free — snapshot overhead may cause pool exhaustion"
+          # --- 3-2-1 Backup Pipeline Alerts ---
+          - alert: WeeklyBackupStale
+            expr: (time() - weekly_backup_last_run_timestamp{job="weekly-backup"}) > 777600
+            for: 30m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Weekly backup is {{ $value | humanizeDuration }} old (threshold: 9d)"
+          - alert: WeeklyBackupFailing
+            expr: weekly_backup_last_status{job="weekly-backup"} != 0
+            for: 0m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Weekly backup completed with errors (status={{ $value }})"
+          - alert: PfsenseBackupStale
+            expr: (time() - backup_last_success_timestamp{job="pfsense-backup"}) > 777600
+            for: 30m
+            labels:
+              severity: warning
+            annotations:
+              summary: "pfsense backup is {{ $value | humanizeDuration }} old (threshold: 9d)"
+          - alert: OffsiteBackupSyncStale
+            expr: (time() - backup_last_success_timestamp{job="offsite-backup-sync"}) > 777600
+            for: 30m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Offsite backup sync is {{ $value | humanizeDuration }} old (threshold: 9d)"
+          - alert: BackupDiskFull
+            expr: (1 - node_filesystem_avail_bytes{job="proxmox-host", mountpoint="/mnt/backup"} / node_filesystem_size_bytes{job="proxmox-host", mountpoint="/mnt/backup"}) > 0.85
+            for: 15m
+            labels:
+              severity: critical
+            annotations:
+              summary: "Backup disk /mnt/backup is {{ $value | humanizePercentage }} full"
           - alert: NewTailscaleClient
             expr: irate(headscale_machine_registrations_total{action="reauth"}[5m]) > 0
             for: 5m