From c034adab5f0c3c3078c5a13e379ba41076ae4df2 Mon Sep 17 00:00:00 2001
From: Viktor Barzin
Date: Sun, 15 Mar 2026 17:23:39 +0000
Subject: [PATCH] mitigate cluster instability during terraform applies

- Recreate strategy for heavy single-replica deployments (onlyoffice,
  stirling-pdf)
- Reduce maxSurge on multi-replica deployments (traefik, authentik,
  grafana, kyverno) to prevent memory request surge overwhelming scheduler
- Weekly etcd defrag CronJob (Sunday 3 AM) to prevent fragmentation buildup
- Disable Kyverno policy reports (ephemeral report cleanup)
- Cloud-init: journald persistence + 4Gi swap for worker nodes
- Kubelet: failSwapOn=false + LimitedSwap behavior for memory pressure relief
---
 modules/create-template-vm/cloud_init.yaml    | 12 +++++
 stacks/infra/main.tf                          |  3 ++
 stacks/onlyoffice/main.tf                     |  3 ++
 stacks/platform/modules/authentik/values.yaml | 10 ++++
 .../modules/infra-maintenance/main.tf         | 54 +++++++++++++++++++
 stacks/platform/modules/kyverno/main.tf       | 11 ++++
 .../monitoring/grafana_chart_values.yaml      |  3 ++
 stacks/platform/modules/traefik/main.tf       |  4 +-
 stacks/stirling-pdf/main.tf                   |  3 ++
 9 files changed, 101 insertions(+), 2 deletions(-)

diff --git a/modules/create-template-vm/cloud_init.yaml b/modules/create-template-vm/cloud_init.yaml
index b62430fb..79d1de8c 100644
--- a/modules/create-template-vm/cloud_init.yaml
+++ b/modules/create-template-vm/cloud_init.yaml
@@ -56,6 +56,10 @@ apt:
     filename: docker.list
 
 runcmd:
+  # Enable persistent journald logging for crash forensics
+  - mkdir -p /var/log/journal
+  - sed -i 's/#Storage=auto/Storage=persistent/' /etc/systemd/journald.conf
+  - systemctl restart systemd-journald
 %{if is_k8s_template}
   - apt-mark hold kubelet kubeadm kubectl
   - systemctl stop kubelet
@@ -63,6 +67,14 @@ runcmd:
   - ${containerd_config_update_command}
   - systemctl restart containerd
   - systemctl enable --now iscsid
+  # 4Gi swap file for memory pressure relief. NOTE: runs on EVERY node built from this k8s template — keep masters on a separate template/flag (etcd is latency-critical; swap off masters)
+  - fallocate -l 4G /swapfile
+  - chmod 600 /swapfile
+  - mkswap /swapfile
+  - swapon /swapfile
+  - echo '/swapfile none swap sw 0 0' >> /etc/fstab
+  - sysctl -w vm.swappiness=10
+  - echo 'vm.swappiness=10' >> /etc/sysctl.d/99-swap.conf
   - ${k8s_join_command}
   - systemctl enable kubelet
   - systemctl start kubelet
diff --git a/stacks/infra/main.tf b/stacks/infra/main.tf
index 5af826fd..5dd2a48f 100644
--- a/stacks/infra/main.tf
+++ b/stacks/infra/main.tf
@@ -130,6 +130,9 @@ evictionSoftGracePeriod:
   memory.available: "30s"
   nodefs.available: "60s"  # Grace period for disk space warnings
   imagefs.available: "30s" # Shorter grace for critical containerd space
+failSwapOn: false # required: with swap enabled, kubelet refuses to start unless failSwapOn is disabled; prerequisite for LimitedSwap
+memorySwap:
+  swapBehavior: "LimitedSwap"
 KUBELET_PATCH
 EOF
   k8s_join_command = var.k8s_join_command
diff --git a/stacks/onlyoffice/main.tf b/stacks/onlyoffice/main.tf
index 86b16bc3..1dda92f3 100644
--- a/stacks/onlyoffice/main.tf
+++ b/stacks/onlyoffice/main.tf
@@ -85,6 +85,9 @@ resource "kubernetes_deployment" "onlyoffice-document-server" {
   }
   spec {
     replicas = 1
+    strategy {
+      type = "Recreate"
+    }
     selector {
       match_labels = {
         app = "onlyoffice-document-server"
diff --git a/stacks/platform/modules/authentik/values.yaml b/stacks/platform/modules/authentik/values.yaml
index 05cc1065..e50719e9 100644
--- a/stacks/platform/modules/authentik/values.yaml
+++ b/stacks/platform/modules/authentik/values.yaml
@@ -17,6 +17,11 @@ authentik:
 
 server:
   replicas: 3
+  strategy:
+    type: RollingUpdate
+    rollingUpdate:
+      maxSurge: 0
+      maxUnavailable: 1
   resources:
     requests:
       cpu: 100m
@@ -45,6 +50,11 @@ global:
 
 worker:
   replicas: 3
+  strategy:
+    type: RollingUpdate
+    rollingUpdate:
+      maxSurge: 0
+      maxUnavailable: 1
   resources:
     requests:
       cpu: 100m
diff --git a/stacks/platform/modules/infra-maintenance/main.tf b/stacks/platform/modules/infra-maintenance/main.tf
index 1f1451d1..6e3e108c 100644
--- a/stacks/platform/modules/infra-maintenance/main.tf
+++ b/stacks/platform/modules/infra-maintenance/main.tf
@@ -150,6 +150,60 @@ resource "kubernetes_cron_job_v1" "backup-etcd" {
   }
 }
 
+# Weekly etcd defragmentation — prevents fragmentation buildup that causes slow requests
+resource "kubernetes_cron_job_v1" "defrag-etcd" {
+  metadata {
+    name      = "defrag-etcd"
+    namespace = "default"
+  }
+  spec {
+    schedule                      = "0 3 * * 0"
+    successful_jobs_history_limit = 1
+    failed_jobs_history_limit     = 1
+    concurrency_policy            = "Forbid"
+    job_template {
+      metadata {
+        name = "defrag-etcd"
+      }
+      spec {
+        template {
+          metadata {
+            name = "defrag-etcd"
+          }
+          spec {
+            node_name           = "k8s-master"
+            priority_class_name = "system-cluster-critical"
+            host_network        = true
+            container {
+              name    = "defrag-etcd"
+              image   = "registry.k8s.io/etcd:3.5.21-0"
+              command = ["etcdctl"]
+              args = ["--endpoints=https://127.0.0.1:2379", "--cacert=/etc/kubernetes/pki/etcd/ca.crt", "--cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt", "--key=/etc/kubernetes/pki/etcd/healthcheck-client.key", "--command-timeout=60s", "defrag"] # defrag blocks writes and often exceeds etcdctl's 5s default timeout on a fragmented DB
+              env {
+                name  = "ETCDCTL_API"
+                value = "3"
+              }
+              volume_mount {
+                mount_path = "/etc/kubernetes/pki/etcd"
+                name       = "etcd-certs"
+                read_only  = true
+              }
+            }
+            volume {
+              name = "etcd-certs"
+              host_path {
+                path = "/etc/kubernetes/pki/etcd"
+                type = "Directory" # etcd PKI must already exist on the master; DirectoryOrCreate would silently mount an empty dir and mask a missing cert tree
+              }
+            }
+            restart_policy = "Never"
+          }
+        }
+      }
+    }
+  }
+}
+
 # Clean up evicted/failed pods cluster-wide daily
 resource "kubernetes_cron_job_v1" "cleanup-failed-pods" {
   metadata {
diff --git a/stacks/platform/modules/kyverno/main.tf b/stacks/platform/modules/kyverno/main.tf
index 83a752cb..405029dc 100644
--- a/stacks/platform/modules/kyverno/main.tf
+++ b/stacks/platform/modules/kyverno/main.tf
@@ -25,6 +25,9 @@ resource "helm_release" "kyverno" {
       forceFailurePolicyIgnore = {
        enabled = true
      }
+      policyReports = {
+        enabled = false
+      }
     }
 
     reportsController = {
@@ -66,6 +69,14 @@ resource "helm_release" "kyverno" {
 
     admissionController = {
       replicas = 2
+      updateStrategy = {
+        type = "RollingUpdate"
+        rollingUpdate = {
+          maxSurge       = 0
+          maxUnavailable = 1
+        }
+      }
+
       container = {
         resources = {
           limits = {
diff --git a/stacks/platform/modules/monitoring/grafana_chart_values.yaml b/stacks/platform/modules/monitoring/grafana_chart_values.yaml
index 44ce866b..a5bc4222 100644
--- a/stacks/platform/modules/monitoring/grafana_chart_values.yaml
+++ b/stacks/platform/modules/monitoring/grafana_chart_values.yaml
@@ -1,5 +1,8 @@
 deploymentStrategy:
   type: RollingUpdate
+  rollingUpdate:
+    maxSurge: 0
+    maxUnavailable: 1
 replicas: 2
 adminPassword: "${grafana_admin_password}"
 resources:
diff --git a/stacks/platform/modules/traefik/main.tf b/stacks/platform/modules/traefik/main.tf
index 39bdcd53..6428322c 100644
--- a/stacks/platform/modules/traefik/main.tf
+++ b/stacks/platform/modules/traefik/main.tf
@@ -64,8 +64,8 @@ resource "helm_release" "traefik" {
     updateStrategy = {
       type = "RollingUpdate"
       rollingUpdate = {
-        maxUnavailable = 1
-        maxSurge       = 2
+        maxUnavailable = 0
+        maxSurge       = 1
       }
     }
 
diff --git a/stacks/stirling-pdf/main.tf b/stacks/stirling-pdf/main.tf
index b06ad539..16ef39a9 100644
--- a/stacks/stirling-pdf/main.tf
+++ b/stacks/stirling-pdf/main.tf
@@ -40,6 +40,9 @@ resource "kubernetes_deployment" "stirling-pdf" {
   }
   spec {
     replicas = 1
+    strategy {
+      type = "Recreate"
+    }
     selector {
       match_labels = {
         app = "stirling-pdf"