mitigate cluster instability during terraform applies

- Recreate strategy for heavy single-replica deployments (onlyoffice, stirling-pdf)
- Reduce maxSurge on multi-replica deployments (traefik, authentik, grafana, kyverno) to prevent a surge of memory requests from overwhelming the scheduler
- Weekly etcd defrag CronJob (Sunday 3 AM) to prevent fragmentation buildup
- Disable Kyverno policy reports (ephemeral report cleanup)
- Cloud-init: journald persistence and 4Gi swap for worker nodes
- Kubelet: LimitedSwap behavior for memory pressure relief
2026-03-15 17:23:39 +00:00 · 2026-03-15 17:23:39 +00:00 · c766d849f8
commit c766d849f8
parent 750da49c80
9 changed files with 100 additions and 2 deletions
--- a/modules/create-template-vm/cloud_init.yaml
+++ b/modules/create-template-vm/cloud_init.yaml
@ -56,6 +56,10 @@ apt:
      filename: docker.list
 runcmd:
  # Enable persistent journald logging for crash forensics
  - mkdir -p /var/log/journal
  - sed -i 's/#Storage=auto/Storage=persistent/' /etc/systemd/journald.conf
  - systemctl restart systemd-journald
  %{if is_k8s_template}
  - apt-mark hold kubelet kubeadm kubectl
  - systemctl stop kubelet
@ -63,6 +67,14 @@ runcmd:
  - ${containerd_config_update_command}
  - systemctl restart containerd
  - systemctl enable --now iscsid
  # Create 4Gi swap file for worker node memory pressure relief (NOT for master — etcd is latency-critical)
  - fallocate -l 4G /swapfile
  - chmod 600 /swapfile
  - mkswap /swapfile
  - swapon /swapfile
  - echo '/swapfile none swap sw 0 0' >> /etc/fstab
  - sysctl -w vm.swappiness=10
  - echo 'vm.swappiness=10' >> /etc/sysctl.d/99-swap.conf
  - ${k8s_join_command}
  - systemctl enable kubelet
  - systemctl start kubelet
--- a/stacks/infra/main.tf
+++ b/stacks/infra/main.tf
@ -130,6 +130,8 @@ evictionSoftGracePeriod:
  memory.available: "30s"
  nodefs.available: "60s"  # Grace period for disk space warnings
  imagefs.available: "30s"  # Shorter grace for critical containerd space
 memorySwap:
  swapBehavior: "LimitedSwap"
 KUBELET_PATCH
  EOF
  k8s_join_command                 = var.k8s_join_command
--- a/stacks/onlyoffice/main.tf
+++ b/stacks/onlyoffice/main.tf
@ -85,6 +85,9 @@ resource "kubernetes_deployment" "onlyoffice-document-server" {
  }
  spec {
    replicas = 1
    strategy {
      type = "Recreate"
    }
    selector {
      match_labels = {
        app = "onlyoffice-document-server"
--- a/stacks/platform/modules/authentik/values.yaml
+++ b/stacks/platform/modules/authentik/values.yaml
@ -17,6 +17,11 @@ authentik:
 server:
  replicas: 3
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 0
      maxUnavailable: 1
  resources:
    requests:
      cpu: 100m
@ -45,6 +50,11 @@ global:
 worker:
  replicas: 3
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 0
      maxUnavailable: 1
  resources:
    requests:
      cpu: 100m
--- a/stacks/platform/modules/infra-maintenance/main.tf
+++ b/stacks/platform/modules/infra-maintenance/main.tf
@ -150,6 +150,60 @@ resource "kubernetes_cron_job_v1" "backup-etcd" {
  }
 }
 # Weekly etcd defragmentation — prevents fragmentation buildup that causes slow requests.
 #
 # Runs `etcdctl defrag` against the etcd member listening on 127.0.0.1:2379 of the
 # node named "k8s-master", authenticating with the etcd healthcheck client
 # certificate mounted read-only from the host.
 resource "kubernetes_cron_job_v1" "defrag-etcd" {
  metadata {
    name      = "defrag-etcd"
    namespace = "default"
  }
  spec {
    # Sundays at 03:00 (NOTE(review): presumably in the controller-manager's time zone — confirm).
    schedule                      = "0 3 * * 0"
    successful_jobs_history_limit = 1
    failed_jobs_history_limit     = 1
    # Never start a new defrag while a previous run is still active.
    concurrency_policy            = "Forbid"
    job_template {
      metadata {
        name = "defrag-etcd"
      }
      spec {
        template {
          metadata {
            name = "defrag-etcd"
          }
          spec {
            # Pin the pod directly to the node that hosts etcd; host networking
            # is what makes the 127.0.0.1:2379 endpoint below reach that member.
            node_name           = "k8s-master"
            priority_class_name = "system-cluster-critical"
            host_network        = true
            container {
              name    = "defrag-etcd"
              image   = "registry.k8s.io/etcd:3.5.21-0"
              command = ["etcdctl"]
              args = [
                "--endpoints=https://127.0.0.1:2379",
                "--cacert=/etc/kubernetes/pki/etcd/ca.crt",
                "--cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt",
                "--key=/etc/kubernetes/pki/etcd/healthcheck-client.key",
                # Defrag blocks the member while it rewrites the database and
                # routinely exceeds etcdctl's default 5s command timeout.
                "--command-timeout=60s",
                "defrag",
              ]
              env {
                name  = "ETCDCTL_API"
                value = "3"
              }
              volume_mount {
                mount_path = "/etc/kubernetes/pki/etcd"
                name       = "etcd-certs"
                read_only  = true
              }
            }
            volume {
              name = "etcd-certs"
              host_path {
                path = "/etc/kubernetes/pki/etcd"
                # "Directory" (not "DirectoryOrCreate"): fail at mount time if the
                # certificates are missing instead of creating an empty directory
                # on the host and failing later with a confusing TLS error.
                type = "Directory"
              }
            }
            restart_policy = "Never"
          }
        }
      }
    }
  }
 }
 # Clean up evicted/failed pods cluster-wide daily
 resource "kubernetes_cron_job_v1" "cleanup-failed-pods" {
  metadata {
--- a/stacks/platform/modules/kyverno/main.tf
+++ b/stacks/platform/modules/kyverno/main.tf
@ -25,6 +25,9 @@ resource "helm_release" "kyverno" {
      forceFailurePolicyIgnore = {
        enabled = true
      }
      policyReports = {
        enabled = false
      }
    }
    reportsController = {
@ -66,6 +69,14 @@ resource "helm_release" "kyverno" {
    admissionController = {
      replicas = 2
      updateStrategy = {
        type = "RollingUpdate"
        rollingUpdate = {
          maxSurge       = 0
          maxUnavailable = 1
        }
      }
      container = {
        resources = {
          limits = {
--- a/stacks/platform/modules/monitoring/grafana_chart_values.yaml
+++ b/stacks/platform/modules/monitoring/grafana_chart_values.yaml
@ -1,5 +1,8 @@
 deploymentStrategy:
  type: RollingUpdate
  rollingUpdate:
    maxSurge: 0
    maxUnavailable: 1
 replicas: 2
 adminPassword: "${grafana_admin_password}"
 resources:
--- a/stacks/platform/modules/traefik/main.tf
+++ b/stacks/platform/modules/traefik/main.tf
@ -64,8 +64,8 @@ resource "helm_release" "traefik" {
    updateStrategy = {
      type = "RollingUpdate"
      rollingUpdate = {
-        maxUnavailable = 1
+        maxUnavailable = 0
-        maxSurge       = 2
+        maxSurge       = 1
      }
    }
--- a/stacks/stirling-pdf/main.tf
+++ b/stacks/stirling-pdf/main.tf
@ -40,6 +40,9 @@ resource "kubernetes_deployment" "stirling-pdf" {
  }
  spec {
    replicas = 1
    strategy {
      type = "Recreate"
    }
    selector {
      match_labels = {
        app = "stirling-pdf"