[backup] Fix lvm-pvc-snapshot Pushgateway push (stdout pollution in cmd_prune_count)
cmd_prune_count's `log " Pruned: ..."` wrote to stdout, which the
caller captures via `pruned=$(cmd_prune_count)`. From 2026-04-16 onward
(the first night the 7d retention actually pruned anything), those
prune log lines polluted the captured value with multi-line text,
breaking the Prometheus exposition format of the metric push
(`lvm_snapshot_pruned_total ${pruned}` → 400 from Pushgateway).
Snapshots themselves were always fine; only the metric push silently
failed for ~9 nights, eventually triggering LVMSnapshotNeverRun
(the alert has a 48h `for:`).
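A minimal repro of the failure mode (illustrative, not the script itself):

```bash
count_and_log() { echo "[2026-04-16 03:00:12]  Pruned: vm-200-pvc-abc_snap_20260409_0300"; echo 3; }
pruned=$(count_and_log)                        # pruned is now two lines, not "3"
echo "lvm_snapshot_pruned_total ${pruned}"     # invalid exposition text -> Pushgateway 400
```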
Fix: redirect the inner log call to stderr so cmd_prune_count's stdout
contains only the count. Also adopts `infra/scripts/lvm-pvc-snapshot.sh`
as the source of truth (previously it was edited only on the PVE host)
and updates backup-dr.md to point at the .sh and document the scp deploy.
Deploy: scp infra/scripts/lvm-pvc-snapshot.sh root@192.168.1.127:/usr/local/bin/lvm-pvc-snapshot
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
backup-dr.md
@@ -217,7 +217,7 @@ graph LR
 
 Native LVM thin snapshots provide crash-consistent point-in-time recovery for 62 Proxmox CSI PVCs. These are CoW snapshots — instant creation, minimal overhead, sharing the thin pool's free space.
 
-**Script**: `/usr/local/bin/lvm-pvc-snapshot` on PVE host (source: `infra/scripts/lvm-pvc-snapshot`)
+**Script**: `/usr/local/bin/lvm-pvc-snapshot` on PVE host (source: `infra/scripts/lvm-pvc-snapshot.sh`). Deploy: `scp infra/scripts/lvm-pvc-snapshot.sh root@192.168.1.127:/usr/local/bin/lvm-pvc-snapshot`
 **Schedule**: Daily 03:00 via systemd timer, 7-day retention
 **Discovery**: Auto-discovers PVC LVs matching `vm-*-pvc-*` pattern in VG `pve` thin pool `data`
 
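For reference, discovery boils down to this pipeline (a sketch of what the script below does):

```bash
lvs --noheadings -o lv_name,pool_lv pve \
  | awk '$2 == "data" { print $1 }' \
  | grep -E '^vm-[0-9]+-pvc-' | grep -v '_snap_'
```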
@@ -234,7 +234,7 @@ Native LVM thin snapshots provide crash-consistent point-in-time recovery for 62
 
 **Backup disk**: sda (1.1TB RAID1 SAS) → VG `backup` → LV `data` → ext4 → mounted at `/mnt/backup` on PVE host. Dedicated backup disk, independent of live storage.
 
-**Script**: `/usr/local/bin/daily-backup` on PVE host (source: `infra/scripts/daily-backup`)
+**Script**: `/usr/local/bin/daily-backup` on PVE host (source: `infra/scripts/daily-backup.sh`)
 **Schedule**: Daily 05:00 via systemd timer
 **Retention**: 4 weekly versions (weeks 0-3 via `--link-dest` hardlink dedup)
 
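The retention scheme is the standard rsync `--link-dest` rotation; a sketch of one cycle (directory and source names assumed, the real daily-backup script may differ):

```bash
# week.1 serves as the hardlink reference; unchanged files cost no extra space
rsync -a --delete --link-dest=/mnt/backup/week.1 /srv/data/ /mnt/backup/week.0/
```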
scripts/lvm-pvc-snapshot.sh (new executable file, 469 lines)
@@ -0,0 +1,469 @@
#!/usr/bin/env bash
# lvm-pvc-snapshot — LVM thin snapshot management for Proxmox CSI PVCs
# Deploy to PVE host at /usr/local/bin/lvm-pvc-snapshot
set -euo pipefail

# --- Configuration ---
VG="pve"
THINPOOL="data"
SNAP_SUFFIX_FORMAT="%Y%m%d_%H%M"
RETENTION_DAYS=7
MIN_FREE_PCT=10
PUSHGATEWAY="${LVM_SNAP_PUSHGATEWAY:-http://10.0.20.100:30091}"
PUSHGATEWAY_JOB="lvm-pvc-snapshot"
LOCKFILE="/run/lvm-pvc-snapshot.lock"
KUBECONFIG="${KUBECONFIG:-/root/.kube/config}"
export KUBECONFIG

# Namespaces to exclude from snapshots (high-churn, have app-level dumps)
# These PVCs cause significant CoW write amplification (~36% overhead)
EXCLUDE_NAMESPACES="${LVM_SNAP_EXCLUDE_NS:-dbaas,monitoring}"

# --- Logging ---
# NOTE: log() writes to stdout by design; any function whose stdout is
# captured by a caller must redirect its log calls to stderr (see
# cmd_prune_count — this was the bug fixed in this commit).
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"; }
warn() { log "WARN: $*" >&2; }
die() { log "FATAL: $*" >&2; exit 1; }

# --- Helpers ---

get_thinpool_free_pct() {
    local data_pct
    data_pct=$(lvs --noheadings --nosuffix -o data_percent "${VG}/${THINPOOL}" 2>/dev/null | tr -d ' ')
    echo "scale=2; 100 - ${data_pct}" | bc
}
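# Illustrative: with the pool at data_percent 63.37, this prints "36.63".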

build_exclude_lv_list() {
    # Query K8s for PVs in excluded namespaces, extract their LV names
    if [[ -z "${EXCLUDE_NAMESPACES}" ]] || ! command -v kubectl &>/dev/null; then
        return
    fi
    kubectl get pv -o json 2>/dev/null | jq -r --arg ns "${EXCLUDE_NAMESPACES}" '
        ($ns | split(",")) as $excl |
        .items[] |
        select(.spec.csi.driver == "csi.proxmox.sinextra.dev") |
        select(.spec.claimRef.namespace as $n | $excl | index($n)) |
        .spec.csi.volumeHandle | split("/") | last
    ' 2>/dev/null || true
}

discover_pvc_lvs() {
    # List thin LVs matching PVC pattern, excluding snapshots, pre-restore backups,
    # and LVs belonging to excluded namespaces (high-churn databases/metrics)
    local all_lvs exclude_lvs
    all_lvs=$(lvs --noheadings -o lv_name,pool_lv "${VG}" 2>/dev/null \
        | awk -v pool="${THINPOOL}" '$2 == pool { print $1 }' \
        | grep -E '^vm-[0-9]+-pvc-' \
        | grep -v '_snap_' \
        | grep -v '_pre_restore_')

    exclude_lvs=$(build_exclude_lv_list)

    if [[ -n "${exclude_lvs}" ]]; then
        # Filter out excluded LVs
        local exclude_pattern
        exclude_pattern=$(echo "${exclude_lvs}" | paste -sd'|' -)
        echo "${all_lvs}" | grep -vE "(${exclude_pattern})" || true
    else
        echo "${all_lvs}"
    fi
}
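# Illustrative output: bare LV names, one per line (e.g. "vm-200-pvc-<uuid>"),
# with snapshots, _pre_restore_ copies, and excluded-namespace LVs filtered out.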

list_snapshots() {
    lvs --noheadings -o lv_name,pool_lv "${VG}" 2>/dev/null \
        | awk -v pool="${THINPOOL}" '$2 == pool { print $1 }' \
        | grep '_snap_' || true
}

parse_snap_timestamp() {
    # Extract YYYYMMDD_HHMM from snapshot name, convert to epoch
    local snap_name="$1"
    local ts_str
    ts_str=$(echo "${snap_name}" | grep -oE '[0-9]{8}_[0-9]{4}$')
    if [[ -z "${ts_str}" ]]; then
        echo "0"
        return
    fi
    local ymd="${ts_str:0:8}"
    local hm="${ts_str:9:4}"
    date -d "${ymd:0:4}-${ymd:4:2}-${ymd:6:2} ${hm:0:2}:${hm:2:2}" +%s 2>/dev/null || echo "0"
}
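# Example: "vm-200-pvc-abc_snap_20260403_1200" -> epoch seconds for 2026-04-03 12:00.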

get_original_lv_from_snap() {
    # vm-200-pvc-abc_snap_20260403_1200 -> vm-200-pvc-abc
    echo "$1" | sed 's/_snap_[0-9]\{8\}_[0-9]\{4\}$//'
}

push_metrics() {
    local status="$1" created="$2" failed="$3" pruned="$4"
    local free_pct
    free_pct=$(get_thinpool_free_pct)

    cat <<METRICS | curl -sf --connect-timeout 5 --max-time 10 --data-binary @- \
        "${PUSHGATEWAY}/metrics/job/${PUSHGATEWAY_JOB}" 2>/dev/null || warn "Failed to push metrics to Pushgateway"
# HELP lvm_snapshot_last_run_timestamp Unix timestamp of last snapshot run
# TYPE lvm_snapshot_last_run_timestamp gauge
lvm_snapshot_last_run_timestamp $(date +%s)
# HELP lvm_snapshot_last_status Exit status (0=success, 1=partial failure, 2=aborted)
# TYPE lvm_snapshot_last_status gauge
lvm_snapshot_last_status ${status}
# HELP lvm_snapshot_created_total Number of snapshots created in last run
# TYPE lvm_snapshot_created_total gauge
lvm_snapshot_created_total ${created}
# HELP lvm_snapshot_failed_total Number of snapshot failures in last run
# TYPE lvm_snapshot_failed_total gauge
lvm_snapshot_failed_total ${failed}
# HELP lvm_snapshot_pruned_total Number of snapshots pruned in last run
# TYPE lvm_snapshot_pruned_total gauge
lvm_snapshot_pruned_total ${pruned}
# HELP lvm_snapshot_thinpool_free_pct Thin pool free percentage
# TYPE lvm_snapshot_thinpool_free_pct gauge
lvm_snapshot_thinpool_free_pct ${free_pct}
METRICS
}
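# Manual smoke test (illustrative): each exposition line must be "<metric> <number>",
# which can be exercised without a snapshot run via:
#   echo 'lvm_snapshot_pruned_total 0' | curl --data-binary @- \
#     http://10.0.20.100:30091/metrics/job/lvm-pvc-snapshot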

# --- Subcommands ---

cmd_snapshot() {
    log "Starting PVC LVM thin snapshot run"

    # Check thin pool free space
    local free_pct
    free_pct=$(get_thinpool_free_pct)
    log "Thin pool free space: ${free_pct}%"
    if (( $(echo "${free_pct} < ${MIN_FREE_PCT}" | bc -l) )); then
        warn "Thin pool has only ${free_pct}% free (minimum: ${MIN_FREE_PCT}%). Aborting."
        push_metrics 2 0 0 0
        exit 1
    fi

    # Discover PVC LVs
    local lvs_list
    lvs_list=$(discover_pvc_lvs)
    if [[ -z "${lvs_list}" ]]; then
        warn "No PVC LVs found matching pattern"
        push_metrics 2 0 0 0
        exit 1
    fi

    local count=0 failed=0 total
    total=$(echo "${lvs_list}" | wc -l | tr -d ' ')
    local snap_ts
    snap_ts=$(date +"${SNAP_SUFFIX_FORMAT}")

    log "Found ${total} PVC LVs to snapshot"

    while IFS= read -r lv; do
        local snap_name="${lv}_snap_${snap_ts}"
        if lvcreate -s -kn -n "${snap_name}" "${VG}/${lv}" >/dev/null 2>&1; then
            log " Created: ${snap_name}"
            count=$((count + 1))
        else
            warn " Failed to create snapshot for ${lv}"
            failed=$((failed + 1))
        fi
    done <<< "${lvs_list}"

    log "Snapshot run complete: ${count} created, ${failed} failed out of ${total}"

    # Auto-prune
    log "Running auto-prune..."
    local pruned
    pruned=$(cmd_prune_count)

    # Determine status
    local status=0
    if (( failed > 0 && count > 0 )); then
        status=1 # partial
    elif (( failed > 0 && count == 0 )); then
        status=2 # all failed
    fi

    push_metrics "${status}" "${count}" "${failed}" "${pruned}"
    log "Done"
}

cmd_list() {
    printf "%-45s %-50s %8s %8s\n" "ORIGINAL LV" "SNAPSHOT" "AGE" "DATA%"
    printf "%-45s %-50s %8s %8s\n" "-----------" "--------" "---" "-----"

    local now
    now=$(date +%s)

    local snap_lines
    snap_lines=$(lvs --noheadings --nosuffix -o lv_name,lv_size,data_percent "${VG}" 2>/dev/null \
        | grep -E '_snap_|_pre_restore_' || true)

    if [[ -z "${snap_lines}" ]]; then
        echo "(no snapshots found)"
        return
    fi

    echo "${snap_lines}" | while read -r name size data_pct; do
        local original age_str ts epoch
        if [[ "${name}" == *"_pre_restore_"* ]]; then
            original=$(echo "${name}" | sed 's/_pre_restore_[0-9]\{8\}_[0-9]\{4\}$//')
            ts=$(echo "${name}" | grep -oE '[0-9]{8}_[0-9]{4}$')
        else
            original=$(get_original_lv_from_snap "${name}")
            ts=$(echo "${name}" | grep -oE '[0-9]{8}_[0-9]{4}$')
        fi
        epoch=$(parse_snap_timestamp "${name}")
        if (( epoch > 0 )); then
            local age_s=$(( now - epoch ))
            local days=$(( age_s / 86400 ))
            local hours=$(( (age_s % 86400) / 3600 ))
            age_str="${days}d${hours}h"
        else
            age_str="unknown"
        fi
        printf "%-45s %-50s %8s %7s%%\n" "${original}" "${name}" "${age_str}" "${data_pct}"
    done
}

cmd_prune() {
    local pruned
    pruned=$(cmd_prune_count)
    log "Pruned ${pruned} expired snapshots"
}

cmd_prune_count() {
    # NOTE: stdout of this function is captured by callers (`pruned=$(cmd_prune_count)`),
    # so all log/warn output must go to stderr — the only thing on stdout is the count.
    local now cutoff pruned=0
    now=$(date +%s)
    cutoff=$(( now - RETENTION_DAYS * 86400 ))

    local snaps
    snaps=$(lvs --noheadings -o lv_name,pool_lv "${VG}" 2>/dev/null \
        | awk -v pool="${THINPOOL}" '$2 == pool { print $1 }' \
        | grep -E '_snap_|_pre_restore_' || true)

    if [[ -z "${snaps}" ]]; then
        echo "0"
        return
    fi

    while IFS= read -r snap; do
        local epoch
        epoch=$(parse_snap_timestamp "${snap}")
        if (( epoch > 0 && epoch < cutoff )); then
            if lvremove -f "${VG}/${snap}" >/dev/null 2>&1; then
                log " Pruned: ${snap}" >&2
                pruned=$((pruned + 1))
            else
                warn " Failed to prune: ${snap}"
            fi
        fi
    done <<< "${snaps}"

    echo "${pruned}"
}
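# After the fix, capture yields a bare integer (illustrative):
#   pruned=$(cmd_prune_count)   # e.g. "4" — the " Pruned:" lines now go to stderr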

cmd_restore() {
    local pvc_lv="${1:-}" snapshot_lv="${2:-}"

    if [[ -z "${pvc_lv}" || -z "${snapshot_lv}" ]]; then
        die "Usage: $0 restore <pvc-lv-name> <snapshot-lv-name>"
    fi

    # Validate LVs exist
    if ! lvs "${VG}/${pvc_lv}" >/dev/null 2>&1; then
        die "PVC LV '${pvc_lv}' not found in VG '${VG}'"
    fi
    if ! lvs "${VG}/${snapshot_lv}" >/dev/null 2>&1; then
        die "Snapshot LV '${snapshot_lv}' not found in VG '${VG}'"
    fi

    # Discover K8s context
    log "Discovering Kubernetes context for LV '${pvc_lv}'..."

    local volume_handle="local-lvm:${pvc_lv}"
    local pv_info
    pv_info=$(kubectl get pv -o json 2>/dev/null | jq -r \
        --arg vh "${volume_handle}" \
        '.items[] | select(.spec.csi.volumeHandle == $vh) | "\(.metadata.name) \(.spec.claimRef.namespace) \(.spec.claimRef.name)"' \
    ) || die "Failed to query PVs (is kubectl configured?)"

    if [[ -z "${pv_info}" ]]; then
        die "No PV found with volumeHandle '${volume_handle}'"
    fi

    local pv_name pvc_ns pvc_name
    read -r pv_name pvc_ns pvc_name <<< "${pv_info}"
    log "Found: PV=${pv_name}, PVC=${pvc_ns}/${pvc_name}"

    # Find the workload (Deployment or StatefulSet) that uses this PVC
    local workload_type="" workload_name="" original_replicas=""

    # Check StatefulSets first (databases use these)
    local sts_info
    sts_info=$(kubectl get statefulset -n "${pvc_ns}" -o json 2>/dev/null | jq -r \
        --arg pvc "${pvc_name}" \
        '.items[] | select(
            (.spec.template.spec.volumes // [] | .[].persistentVolumeClaim.claimName == $pvc) or
            (.spec.volumeClaimTemplates // [] | .[].metadata.name as $vct |
                .spec.replicas as $r | range($r) | "\($vct)-\(.metadata.name)-\(.)" ) == $pvc
        ) | "\(.metadata.name) \(.spec.replicas)"' 2>/dev/null \
    ) || true

    # If not found via simple volume check, try matching VCT naming pattern
    if [[ -z "${sts_info}" ]]; then
        sts_info=$(kubectl get statefulset -n "${pvc_ns}" -o json 2>/dev/null | jq -r \
            --arg pvc "${pvc_name}" \
            '.items[] | .metadata.name as $sts | .spec.replicas as $r |
            select(.spec.volumeClaimTemplates != null) |
            .spec.volumeClaimTemplates[].metadata.name as $vct |
            [range($r)] | map("\($vct)-\($sts)-\(.)") |
            if any(. == $pvc) then "\($sts) \($r)" else empty end' 2>/dev/null \
        ) || true
    fi

    if [[ -n "${sts_info}" ]]; then
        read -r workload_name original_replicas <<< "${sts_info}"
        workload_type="statefulset"
    else
        # Check Deployments
        local deploy_info
        deploy_info=$(kubectl get deployment -n "${pvc_ns}" -o json 2>/dev/null | jq -r \
            --arg pvc "${pvc_name}" \
            '.items[] | select(
                .spec.template.spec.volumes // [] | .[].persistentVolumeClaim.claimName == $pvc
            ) | "\(.metadata.name) \(.spec.replicas)"' 2>/dev/null \
        ) || true

        if [[ -n "${deploy_info}" ]]; then
            read -r workload_name original_replicas <<< "${deploy_info}"
            workload_type="deployment"
        fi
    fi

    if [[ -z "${workload_type}" ]]; then
        warn "Could not auto-discover workload for PVC '${pvc_name}' in namespace '${pvc_ns}'."
        warn "You may need to scale down the pod manually."
        echo ""
        read -rp "Continue with LV swap anyway? (yes/no): " confirm
        [[ "${confirm}" == "yes" ]] || die "Aborted by user"
        workload_type="manual"
    fi

    # Dry-run output
    local backup_name="${pvc_lv}_pre_restore_$(date +"${SNAP_SUFFIX_FORMAT}")"
    echo ""
    echo "╔══════════════════════════════════════════════════════════════╗"
    echo "║                        RESTORE DRY-RUN                       ║"
    echo "╠══════════════════════════════════════════════════════════════╣"
    echo "║ PVC:      ${pvc_ns}/${pvc_name}"
    echo "║ PV:       ${pv_name}"
    if [[ "${workload_type}" != "manual" ]]; then
        echo "║ Workload: ${workload_type}/${workload_name} (replicas: ${original_replicas}→0→${original_replicas})"
    fi
    echo "║"
    echo "║ Actions:"
    if [[ "${workload_type}" != "manual" ]]; then
        echo "║  1. Scale ${workload_type}/${workload_name} to 0 replicas"
        echo "║  2. Wait for pod termination"
    fi
    echo "║  3. Rename ${pvc_lv} → ${backup_name}"
    echo "║  4. Rename ${snapshot_lv} → ${pvc_lv}"
    if [[ "${workload_type}" != "manual" ]]; then
        echo "║  5. Scale ${workload_type}/${workload_name} back to ${original_replicas} replicas"
    fi
    echo "╚══════════════════════════════════════════════════════════════╝"
    echo ""

    # Interactive confirmation
    read -rp "Type 'yes' to proceed with restore: " confirm
    if [[ "${confirm}" != "yes" ]]; then
        die "Aborted by user"
    fi

    # Scale down
    if [[ "${workload_type}" != "manual" ]]; then
        log "Scaling ${workload_type}/${workload_name} to 0 replicas..."
        kubectl scale "${workload_type}/${workload_name}" -n "${pvc_ns}" --replicas=0

        log "Waiting for pod termination (timeout: 120s)..."
        kubectl wait --for=delete pod -l "app.kubernetes.io/name=${workload_name}" -n "${pvc_ns}" --timeout=120s 2>/dev/null || \
            kubectl wait --for=delete pod -l "app=${workload_name}" -n "${pvc_ns}" --timeout=120s 2>/dev/null || \
            warn "Timeout waiting for pods — continuing anyway (LV may still be in use)"
        sleep 5 # extra grace period for device detach
    fi

    # Verify LV is not active
    local lv_active
    lv_active=$(lvs --noheadings -o lv_active "${VG}/${pvc_lv}" 2>/dev/null | tr -d ' ')
    if [[ "${lv_active}" == "active" ]]; then
        warn "LV ${pvc_lv} is still active. Attempting to deactivate..."
        # Close any LUKS mapper on the LV before deactivation
        if dmsetup ls 2>/dev/null | grep -q "${pvc_lv}"; then
            log "Closing LUKS mapper for ${pvc_lv}..."
            cryptsetup luksClose "${pvc_lv}" 2>/dev/null || true
        fi
        lvchange -an "${VG}/${pvc_lv}" 2>/dev/null || warn "Could not deactivate — proceeding with caution"
    fi

    # LV swap
    log "Renaming ${pvc_lv} → ${backup_name}"
    lvrename "${VG}" "${pvc_lv}" "${backup_name}" || die "Failed to rename original LV"

    log "Renaming ${snapshot_lv} → ${pvc_lv}"
    lvrename "${VG}" "${snapshot_lv}" "${pvc_lv}" || die "Failed to rename snapshot LV"

    # Scale back up
    if [[ "${workload_type}" != "manual" ]]; then
        log "Scaling ${workload_type}/${workload_name} back to ${original_replicas} replicas..."
        kubectl scale "${workload_type}/${workload_name}" -n "${pvc_ns}" --replicas="${original_replicas}"

        log "Waiting for pod to become Ready (timeout: 300s)..."
        kubectl wait --for=condition=Ready pod -l "app.kubernetes.io/name=${workload_name}" -n "${pvc_ns}" --timeout=300s 2>/dev/null || \
            kubectl wait --for=condition=Ready pod -l "app=${workload_name}" -n "${pvc_ns}" --timeout=300s 2>/dev/null || \
            warn "Timeout waiting for pod Ready — check manually"
    fi

    echo ""
    log "Restore complete!"
    log "Old data preserved as: ${backup_name}"
    log "To delete old data after verification: lvremove -f ${VG}/${backup_name}"
}
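# Note: restore is a pure LV name swap — the PV's volumeHandle references the LV
# by name, so renaming the snapshot into place reattaches the old data without
# touching any Kubernetes objects.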

# --- Main ---

usage() {
    cat <<EOF
Usage: $(basename "$0") <command> [args]

Commands:
  snapshot             Create thin snapshots of all PVC LVs
  list                 List existing snapshots with age and data%
  prune                Remove snapshots older than ${RETENTION_DAYS} days
  restore <lv> <snap>  Restore a PVC from a snapshot (interactive)

Environment:
  LVM_SNAP_PUSHGATEWAY  Pushgateway URL (default: ${PUSHGATEWAY})
  KUBECONFIG            Kubeconfig path (default: /root/.kube/config)
EOF
}

main() {
    local cmd="${1:-}"
    shift || true

    # Acquire lock (except for list which is read-only)
    if [[ "${cmd}" != "list" && "${cmd}" != "" && "${cmd}" != "help" && "${cmd}" != "--help" && "${cmd}" != "-h" ]]; then
        exec 200>"${LOCKFILE}"
        if ! flock -n 200; then
            die "Another instance is already running (lockfile: ${LOCKFILE})"
        fi
    fi

    case "${cmd}" in
        snapshot) cmd_snapshot ;;
        list)     cmd_list ;;
        prune)    cmd_prune ;;
        restore)  cmd_restore "$@" ;;
        help|--help|-h|"") usage ;;
        *) die "Unknown command: ${cmd}. Run '$0 help' for usage." ;;
    esac
}

main "$@"
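One way to verify the fix after deploying (Pushgateway address taken from the script defaults):

```bash
ssh root@192.168.1.127 '/usr/local/bin/lvm-pvc-snapshot prune'
# the pushed value must now be a single number
curl -s http://10.0.20.100:30091/metrics | grep '^lvm_snapshot_pruned_total'
```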