Resilience improvements: MySQL anti-affinity comment, descheduler every 5 min, Prometheus termination grace period 60s

- MySQL InnoDB: keep the required anti-affinity but document why (the cluster stays operational with 2/3 members during node loss)
- Descheduler: increase frequency from hourly to every 5 minutes for faster rebalancing
- Prometheus: set terminationGracePeriodSeconds=60 to prevent drain timeouts

[ci skip]
2026-04-06 00:25:49 +03:00 · 2026-04-06 00:25:49 +03:00 · c8be07c403
commit c8be07c403
parent 3eb15149e1
3 changed files with 6 additions and 1 deletion
--- a/stacks/dbaas/modules/dbaas/main.tf
+++ b/stacks/dbaas/modules/dbaas/main.tf
@@ -237,6 +237,10 @@ resource "helm_release" "mysql_cluster" {
          }
        }
        podAntiAffinity = {
+          # Required anti-affinity: MySQL pods MUST be on different nodes.
+          # During node loss, one pod will be Pending — this is acceptable because
+          # InnoDB Cluster operates with 2/3 members (OK_NO_TOLERANCE).
+          # The descheduler (every 5 min) handles violations if any occur.
          requiredDuringSchedulingIgnoredDuringExecution = [{
            labelSelector = {
              matchLabels = {
--- a/stacks/descheduler/values.yaml
+++ b/stacks/descheduler/values.yaml
@@ -52,7 +52,7 @@ namespaceOverride: ""
 commonLabels: {}

 cronJobApiVersion: "batch/v1"
-schedule: "0 * * * *"
+schedule: "*/5 * * * *"
 suspend: false
 # startingDeadlineSeconds: 200
 successfulJobsHistoryLimit: 10
--- a/stacks/platform/modules/monitoring/prometheus_chart_values.tpl
+++ b/stacks/platform/modules/monitoring/prometheus_chart_values.tpl
@@ -169,6 +169,7 @@ server:
      memory: 4Gi
  livenessProbeInitialDelay: 300
  readinessProbeInitialDelay: 60
+  terminationGracePeriodSeconds: 60
  strategy:
    type: Recreate
  baseURL: "https://prometheus.viktorbarzin.me"