backup: fix daily-backup silent failures, postiz pg_dump CronJob, doc reconcile
daily-backup ran out of its 1h budget and SIGTERMed for 10 days straight (Apr 30 → May 9). Each failed run left its snapshot mount stacked on /tmp/pvc-mount, which blocked the next run from completing — root cause of the WeeklyBackupStale alert going silent (the metric never reached its end-of-script push). Fixes: - TimeoutStartSec 1h → 4h (current workload of 118 PVCs needs ~1.5h, was hitting the wall during week 18 runs) - Recursive umount + LUKS cleanup on EXIT trap, plus the same at script start as belt-and-braces for any inherited stuck state from a prior crashed run - TERM/INT trap pushes status=2 metric so WeeklyBackupFailing fires instead of the alert going blind on systemd kills - pfsense metric pushed in BOTH success and failure paths (was only on success; any ssh-to-pfsense outage made PfsenseBackupStale silent until the alert threshold expired) Postiz backup CronJob: bundled bitnami PG/Redis live on local-path (K8s node OS disk) — outside Layer 1+2 of the 3-2-1 pipeline. Added postiz-postgres-backup that pg_dumps postiz + temporal + temporal_visibility daily 03:00 to /srv/nfs/postiz-backup, getting Layer 3 offsite coverage. Verified end-to-end: 3 dumps written, Pushgateway metric received. Note: bitnamilegacy/postgresql image is stripped (no curl/wget/python) — switched to docker.io/library/postgres matching the dbaas/postgresql-backup pattern with apt-installed curl. Doc reconcile (backup-dr.md): metric names had drifted (e.g. the docs claimed backup_weekly_last_success_timestamp but the script pushes daily_backup_last_run_timestamp). Updated to match what's actually emitted, and added a "default-covered" footnote to the Service Protection Matrix so the ~40 services with PVCs not enumerated in the table are no longer ambiguous. Manual PVE-host actions (out-of-band, not in TF): - unmounted 6 stacked snapshots from /tmp/pvc-mount - pruned 5 stale snapshots on vm-9999-pvc-67c90b6b... 
(origin LV that the loop got SIGTERMed against repeatedly, so prune kept failing) - created /srv/nfs/postiz-backup directory - triggered a one-shot daily-backup run with the new TimeoutStartSec to validate the fix end-to-end Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
8f0502230b
commit
cfe969fe43
4 changed files with 174 additions and 12 deletions
|
|
@ -428,6 +428,113 @@ resource "kubernetes_service" "temporal" {
|
|||
# NestJS bootstrap crashes with "cannot have more than 3 search attribute
|
||||
# of type Text" and the backend never starts.
|
||||
# Upstream issue: https://github.com/gitroomhq/postiz-app/issues/1504
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# Backup CronJob — nightly pg_dump of the bundled postiz-postgresql to NFS.
|
||||
#
|
||||
# The bundled PostgreSQL StatefulSet uses local-path storage on the K8s node
|
||||
# OS disk (chart default), which is NOT covered by Layer 1 (LVM thin
|
||||
# snapshots) or Layer 2 (sda file backup) of the 3-2-1 pipeline. A pg_dump
|
||||
# CronJob writing to /srv/nfs/postiz-backup/ closes the gap: dumps land on
|
||||
# Proxmox host NFS → covered by inotify-driven offsite sync to Synology.
|
||||
# Three databases are dumped: postiz (app data), temporal (workflow engine),
|
||||
# temporal_visibility (workflow search). Bitnami chart-default credentials
|
||||
# are used — same creds the Postiz pod itself uses. The Services are
|
||||
# ClusterIP-only, so the database is reachable only from inside the cluster.
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
# NFS volume for the Postiz database dumps. The claim it exposes
# (claim_name) is what the postiz-postgres-backup CronJob mounts at /backup.
module "nfs_backup_host" {
  source = "../../../../modules/kubernetes/nfs_volume"

  # Kubernetes-side identity of the volume.
  name      = "postiz-backup-host"
  namespace = kubernetes_namespace.postiz.metadata[0].name

  # NFS export backing the volume.
  nfs_path   = "/srv/nfs/postiz-backup"
  nfs_server = "192.168.1.127"
}
|
||||
|
||||
# Daily (03:00) pg_dump of the postiz, temporal and temporal_visibility
# databases into the NFS-backed /backup volume, followed by a best-effort
# metric push to the Pushgateway. The job exits non-zero if any dump failed.
resource "kubernetes_cron_job_v1" "postgres_backup" {
  metadata {
    name      = "postiz-postgres-backup"
    namespace = kubernetes_namespace.postiz.metadata[0].name
    labels    = { app = "postiz", component = "backup" }
  }
  spec {
    schedule                      = "0 3 * * *"
    concurrency_policy            = "Forbid"
    successful_jobs_history_limit = 3
    failed_jobs_history_limit     = 5
    job_template {
      metadata {}
      spec {
        backoff_limit              = 1
        ttl_seconds_after_finished = 86400
        template {
          metadata {
            labels = { app = "postiz", component = "backup" }
          }
          spec {
            restart_policy = "OnFailure"
            container {
              name = "backup"
              # Same image/pattern as dbaas/postgresql-backup: official postgres
              # client tools + apt-installed curl for the Pushgateway push. The
              # bitnamilegacy/postgresql variant is stripped (no curl/wget/python),
              # so the metric push silently failed there.
              image   = "docker.io/library/postgres:16.4-bullseye"
              command = ["/bin/bash", "-c"]
              args = [
                <<-EOT
                  # No "set -e" on purpose: every database is attempted and the
                  # metric push must run even when a dump fails.
                  set -uo pipefail
                  # Best-effort: curl is only needed for the metric push below.
                  apt-get update -qq && apt-get install -yqq curl >/dev/null 2>&1 || true
                  TIMESTAMP=$(date +%Y%m%d_%H%M)
                  BACKUP_DIR=/backup
                  STATUS=0
                  for db in postiz temporal temporal_visibility; do
                    DUMP_FILE="$BACKUP_DIR/$db-$TIMESTAMP.dump"
                    echo "Dumping $db..."
                    if PGPASSWORD=postiz-password pg_dump -h postiz-postgresql -U postiz \
                        --format=custom --compress=6 \
                        --file="$DUMP_FILE" \
                        "$db"; then
                      echo " OK: $db ($(du -h "$DUMP_FILE" | cut -f1))"
                    else
                      echo " FAIL: $db" >&2
                      # A failed pg_dump can leave a truncated file behind; remove it
                      # so it is never mistaken for (or synced as) a usable backup.
                      rm -f "$DUMP_FILE"
                      STATUS=1
                    fi
                  done
                  # Apply the 30-day retention only after a fully successful run.
                  # Otherwise a long failure streak would age out and delete the
                  # last good dumps while producing no replacements.
                  if [ "$STATUS" -eq 0 ]; then
                    find "$BACKUP_DIR" -name '*.dump' -mtime +30 -delete 2>/dev/null || true
                  else
                    echo "Skipping retention prune: at least one dump failed" >&2
                  fi
                  # Run/status metrics are pushed on every run; the success
                  # timestamp is only emitted when all dumps succeeded.
                  {
                    echo "backup_last_run_timestamp $(date +%s)"
                    echo "backup_last_status $STATUS"
                    [ "$STATUS" -eq 0 ] && echo "backup_last_success_timestamp $(date +%s)"
                  } | curl -sf --connect-timeout 5 --max-time 10 --data-binary @- \
                    "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/postiz-postgres-backup" || true
                  exit $STATUS
                EOT
              ]
              volume_mount {
                name       = "backup"
                mount_path = "/backup"
              }
              resources {
                requests = { cpu = "10m", memory = "64Mi" }
                limits   = { memory = "256Mi" }
              }
            }
            volume {
              name = "backup"
              persistent_volume_claim {
                claim_name = module.nfs_backup_host.claim_name
              }
            }
          }
        }
      }
    }
  }
  lifecycle {
    ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config] # KYVERNO_LIFECYCLE_V1
  }
  depends_on = [helm_release.postiz]
}
|
||||
|
||||
resource "kubernetes_job" "temporal_search_attr_cleanup" {
|
||||
metadata {
|
||||
name = "temporal-search-attr-cleanup"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue