mitigate cluster instability during terraform applies

- Recreate strategy for heavy single-replica deployments (onlyoffice, stirling-pdf)
- Reduce maxSurge on multi-replica deployments (traefik, authentik, grafana, kyverno)
  to prevent a surge of memory requests from overwhelming the scheduler
- Weekly etcd defrag CronJob (Sunday 3 AM) to prevent fragmentation buildup
- Disable Kyverno policy reports (ephemeral report cleanup)
- Cloud-init: journald persistence + 4Gi swap for worker nodes
- Kubelet: LimitedSwap behavior for memory pressure relief
This commit is contained in:
Viktor Barzin 2026-03-15 17:23:39 +00:00
parent 1fe7798609
commit c034adab5f
9 changed files with 100 additions and 2 deletions

View file

@ -56,6 +56,10 @@ apt:
filename: docker.list
runcmd:
# Enable persistent journald logging for crash forensics
- mkdir -p /var/log/journal
- sed -i 's/#Storage=auto/Storage=persistent/' /etc/systemd/journald.conf
- systemctl restart systemd-journald
%{if is_k8s_template}
- apt-mark hold kubelet kubeadm kubectl
- systemctl stop kubelet
@ -63,6 +67,14 @@ runcmd:
- ${containerd_config_update_command}
- systemctl restart containerd
- systemctl enable --now iscsid
# Create 4Gi swap file for worker node memory pressure relief (NOT for master — etcd is latency-critical)
- fallocate -l 4G /swapfile
- chmod 600 /swapfile
- mkswap /swapfile
- swapon /swapfile
- echo '/swapfile none swap sw 0 0' >> /etc/fstab
- sysctl -w vm.swappiness=10
- echo 'vm.swappiness=10' >> /etc/sysctl.d/99-swap.conf
- ${k8s_join_command}
- systemctl enable kubelet
- systemctl start kubelet

View file

@ -130,6 +130,8 @@ evictionSoftGracePeriod:
memory.available: "30s"
nodefs.available: "60s" # Grace period for disk space warnings
imagefs.available: "30s" # Shorter grace for critical containerd space
memorySwap:
swapBehavior: "LimitedSwap"
KUBELET_PATCH
EOF
k8s_join_command = var.k8s_join_command

View file

@ -85,6 +85,9 @@ resource "kubernetes_deployment" "onlyoffice-document-server" {
}
spec {
replicas = 1
strategy {
type = "Recreate"
}
selector {
match_labels = {
app = "onlyoffice-document-server"

View file

@ -17,6 +17,11 @@ authentik:
server:
replicas: 3
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 0
maxUnavailable: 1
resources:
requests:
cpu: 100m
@ -45,6 +50,11 @@ global:
worker:
replicas: 3
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 0
maxUnavailable: 1
resources:
requests:
cpu: 100m

View file

@ -150,6 +150,60 @@ resource "kubernetes_cron_job_v1" "backup-etcd" {
}
}
# Weekly etcd defragmentation prevents fragmentation buildup that causes slow requests
resource "kubernetes_cron_job_v1" "defrag-etcd" {
  metadata {
    name      = "defrag-etcd"
    namespace = "default"
  }
  spec {
    # Sunday 03:00 — defrag briefly blocks reads/writes on the member, so run off-peak
    schedule                      = "0 3 * * 0"
    successful_jobs_history_limit = 1
    failed_jobs_history_limit     = 1
    concurrency_policy            = "Forbid" # never allow two defrag runs to overlap
    job_template {
      metadata {
        name = "defrag-etcd"
      }
      spec {
        template {
          metadata {
            name = "defrag-etcd"
          }
          spec {
            # Pin to the control-plane node and share its network namespace so
            # etcdctl can reach the local etcd member on the loopback endpoint.
            node_name           = "k8s-master"
            priority_class_name = "system-cluster-critical"
            host_network        = true
            container {
              name    = "defrag-etcd"
              image   = "registry.k8s.io/etcd:3.5.21-0"
              command = ["etcdctl"]
              # --command-timeout raised above the etcdctl 5s default: defrag is a
              # blocking operation and can easily exceed 5s on a fragmented datastore,
              # which would otherwise make the job fail spuriously.
              args = [
                "--endpoints=https://127.0.0.1:2379",
                "--cacert=/etc/kubernetes/pki/etcd/ca.crt",
                "--cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt",
                "--key=/etc/kubernetes/pki/etcd/healthcheck-client.key",
                "--command-timeout=30s",
                "defrag",
              ]
              env {
                name  = "ETCDCTL_API"
                value = "3"
              }
              volume_mount {
                mount_path = "/etc/kubernetes/pki/etcd"
                name       = "etcd-certs"
                read_only  = true
              }
            }
            volume {
              name = "etcd-certs"
              host_path {
                path = "/etc/kubernetes/pki/etcd"
                # "Directory" (not "DirectoryOrCreate"): the etcd PKI directory must
                # already exist on the master. Silently creating an empty one would
                # only mask a misconfigured node and fail later inside etcdctl with a
                # confusing certificate error.
                type = "Directory"
              }
            }
            restart_policy = "Never"
          }
        }
      }
    }
  }
}
# Clean up evicted/failed pods cluster-wide daily
resource "kubernetes_cron_job_v1" "cleanup-failed-pods" {
metadata {

View file

@ -25,6 +25,9 @@ resource "helm_release" "kyverno" {
forceFailurePolicyIgnore = {
enabled = true
}
policyReports = {
enabled = false
}
}
reportsController = {
@ -66,6 +69,14 @@ resource "helm_release" "kyverno" {
admissionController = {
replicas = 2
updateStrategy = {
type = "RollingUpdate"
rollingUpdate = {
maxSurge = 0
maxUnavailable = 1
}
}
container = {
resources = {
limits = {

View file

@ -1,5 +1,8 @@
deploymentStrategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 0
maxUnavailable: 1
replicas: 2
adminPassword: "${grafana_admin_password}"
resources:

View file

@ -64,8 +64,8 @@ resource "helm_release" "traefik" {
updateStrategy = {
type = "RollingUpdate"
rollingUpdate = {
maxUnavailable = 1
maxSurge = 2
maxUnavailable = 0
maxSurge = 1
}
}

View file

@ -40,6 +40,9 @@ resource "kubernetes_deployment" "stirling-pdf" {
}
spec {
replicas = 1
strategy {
type = "Recreate"
}
selector {
match_labels = {
app = "stirling-pdf"