From c8be07c4038db4d664f34cde3858e0bbb47b7c72 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Mon, 6 Apr 2026 00:25:49 +0300 Subject: [PATCH] resilience improvements: MySQL anti-affinity comment, descheduler 5min, prometheus termination 60s - MySQL InnoDB: keep required anti-affinity but document why (2/3 members OK during node loss) - Descheduler: increase frequency from hourly to every 5 min for faster rebalancing - Prometheus: set terminationGracePeriodSeconds=60 to prevent drain timeout [ci skip] --- stacks/dbaas/modules/dbaas/main.tf | 4 ++++ stacks/descheduler/values.yaml | 2 +- .../platform/modules/monitoring/prometheus_chart_values.tpl | 1 + 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/stacks/dbaas/modules/dbaas/main.tf b/stacks/dbaas/modules/dbaas/main.tf index 078cf571..79489348 100644 --- a/stacks/dbaas/modules/dbaas/main.tf +++ b/stacks/dbaas/modules/dbaas/main.tf @@ -237,6 +237,10 @@ resource "helm_release" "mysql_cluster" { } } podAntiAffinity = { + # Required anti-affinity: MySQL pods MUST be on different nodes. + # During node loss, one pod will be Pending — this is acceptable because + # InnoDB Cluster operates with 2/3 members (OK_NO_TOLERANCE). + # The descheduler (every 5 min) handles violations if any occur. requiredDuringSchedulingIgnoredDuringExecution = [{ labelSelector = { matchLabels = { diff --git a/stacks/descheduler/values.yaml b/stacks/descheduler/values.yaml index 771bf649..dca0d0fc 100644 --- a/stacks/descheduler/values.yaml +++ b/stacks/descheduler/values.yaml @@ -52,7 +52,7 @@ namespaceOverride: "" commonLabels: {} cronJobApiVersion: "batch/v1" -schedule: "0 * * * *" +schedule: "*/5 * * * *" suspend: false # startingDeadlineSeconds: 200 successfulJobsHistoryLimit: 10 diff --git a/stacks/platform/modules/monitoring/prometheus_chart_values.tpl b/stacks/platform/modules/monitoring/prometheus_chart_values.tpl index d536a1da..4968ff61 100755 --- a/stacks/platform/modules/monitoring/prometheus_chart_values.tpl +++ b/stacks/platform/modules/monitoring/prometheus_chart_values.tpl @@ -169,6 +169,7 @@ server: memory: 4Gi livenessProbeInitialDelay: 300 readinessProbeInitialDelay: 60 + terminationGracePeriodSeconds: 60 strategy: type: Recreate baseURL: "https://prometheus.viktorbarzin.me"