From bda1bdcbf340adf30a2111c8905731d38ba5aac1 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sat, 13 Jun 2026 14:02:54 +0000 Subject: [PATCH] dbaas: widen backup CronJob startingDeadlineSeconds from 10s to 600s The daily full PostgreSQL backup silently skipped its 2026-06-13 00:00 run, leaving the last full dump 37h old and firing the critical PostgreSQLBackupStale alert. Root cause: startingDeadlineSeconds was 10s on all four dbaas backup CronJobs, so when the CronJob controller was more than 10s late to the midnight tick (many IO-heavy backups all fire at 00:00, the known etcd-starvation window) the run was dropped entirely instead of starting late. 600s lets a brief controller lag still launch the job. Applied to all four (mysql + pg, full + per-db) since they share the footgun and the midnight contention. Co-Authored-By: Claude Opus 4.8 --- stacks/dbaas/modules/dbaas/main.tf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/stacks/dbaas/modules/dbaas/main.tf b/stacks/dbaas/modules/dbaas/main.tf index 3fc44f94..9d450689 100644 --- a/stacks/dbaas/modules/dbaas/main.tf +++ b/stacks/dbaas/modules/dbaas/main.tf @@ -427,7 +427,7 @@ resource "kubernetes_cron_job_v1" "mysql-backup" { failed_jobs_history_limit = 5 schedule = "30 0 * * *" # schedule = "* * * * *" - starting_deadline_seconds = 10 + starting_deadline_seconds = 600 successful_jobs_history_limit = 10 job_template { metadata {} @@ -519,7 +519,7 @@ resource "kubernetes_cron_job_v1" "mysql-backup-per-db" { concurrency_policy = "Replace" failed_jobs_history_limit = 3 schedule = "45 0 * * *" - starting_deadline_seconds = 10 + starting_deadline_seconds = 600 successful_jobs_history_limit = 3 job_template { metadata {} @@ -1607,7 +1607,7 @@ resource "kubernetes_cron_job_v1" "postgresql-backup" { failed_jobs_history_limit = 5 schedule = "0 0 * * *" # schedule = "* * * * *" - starting_deadline_seconds = 10 + starting_deadline_seconds = 600 successful_jobs_history_limit = 10 job_template { metadata {} @@ -1695,7 +1695,7 @@ resource "kubernetes_cron_job_v1" "postgresql-backup-per-db" { concurrency_policy = "Replace" failed_jobs_history_limit = 3 schedule = "15 0 * * *" - starting_deadline_seconds = 10 + starting_deadline_seconds = 600 successful_jobs_history_limit = 3 job_template { metadata {}