From c034adab5f0c3c3078c5a13e379ba41076ae4df2 Mon Sep 17 00:00:00 2001
From: Viktor Barzin
Date: Sun, 15 Mar 2026 17:23:39 +0000
Subject: [PATCH] mitigate cluster instability during terraform applies

- Recreate strategy for heavy single-replica deployments (onlyoffice,
  stirling-pdf)
- Reduce maxSurge on multi-replica deployments (traefik, authentik,
  grafana, kyverno) to prevent memory request surge overwhelming scheduler
- Weekly etcd defrag CronJob (Sunday 3 AM) to prevent fragmentation buildup
- Disable Kyverno policy reports (ephemeral report cleanup)
- Cloud-init: journald persistence + 4Gi swap for worker nodes
- Kubelet: failSwapOn=false + LimitedSwap behavior for memory pressure relief
---
 modules/create-template-vm/cloud_init.yaml    | 12 +++++
 stacks/infra/main.tf                          |  3 ++
 stacks/onlyoffice/main.tf                     |  3 ++
 stacks/platform/modules/authentik/values.yaml | 10 ++++
 .../modules/infra-maintenance/main.tf         | 54 +++++++++++++++++++
 stacks/platform/modules/kyverno/main.tf       | 11 ++++
 .../monitoring/grafana_chart_values.yaml      |  3 ++
 stacks/platform/modules/traefik/main.tf       |  4 +-
 stacks/stirling-pdf/main.tf                   |  3 ++
 9 files changed, 101 insertions(+), 2 deletions(-)

diff --git a/modules/create-template-vm/cloud_init.yaml b/modules/create-template-vm/cloud_init.yaml
index b62430fb..79d1de8c 100644
--- a/modules/create-template-vm/cloud_init.yaml
+++ b/modules/create-template-vm/cloud_init.yaml
@@ -56,6 +56,10 @@ apt:
     filename: docker.list
 
 runcmd:
+  # Enable persistent journald logging for crash forensics
+  - mkdir -p /var/log/journal
+  - sed -i 's/#Storage=auto/Storage=persistent/' /etc/systemd/journald.conf
+  - systemctl restart systemd-journald
 %{if is_k8s_template}
   - apt-mark hold kubelet kubeadm kubectl
   - systemctl stop kubelet
@@ -63,6 +67,14 @@ runcmd:
   - ${containerd_config_update_command}
   - systemctl restart containerd
   - systemctl enable --now iscsid
+  # 4Gi swap file for memory pressure relief. NOTE: runs on EVERY node built from this k8s template — keep masters on a separate template/flag (etcd is latency-critical; swap off masters)
+  - fallocate -l 4G /swapfile
+  - chmod 600 /swapfile
+  - mkswap /swapfile
+  - swapon /swapfile
+  - echo '/swapfile none swap sw 0 0' >> /etc/fstab
+  - sysctl -w vm.swappiness=10
+  - echo 'vm.swappiness=10' >> /etc/sysctl.d/99-swap.conf
   - ${k8s_join_command}
   - systemctl enable kubelet
   - systemctl start kubelet
diff --git a/stacks/infra/main.tf b/stacks/infra/main.tf
index 5af826fd..5dd2a48f 100644
--- a/stacks/infra/main.tf
+++ b/stacks/infra/main.tf
@@ -130,6 +130,9 @@ evictionSoftGracePeriod:
   memory.available: "30s"
   nodefs.available: "60s"  # Grace period for disk space warnings
   imagefs.available: "30s" # Shorter grace for critical containerd space
+failSwapOn: false # required: with swap enabled, kubelet refuses to start unless failSwapOn is disabled; prerequisite for LimitedSwap
+memorySwap:
+  swapBehavior: "LimitedSwap"
 KUBELET_PATCH
 EOF
   k8s_join_command = var.k8s_join_command
diff --git a/stacks/onlyoffice/main.tf b/stacks/onlyoffice/main.tf
index 86b16bc3..1dda92f3 100644
--- a/stacks/onlyoffice/main.tf
+++ b/stacks/onlyoffice/main.tf
@@ -85,6 +85,9 @@ resource "kubernetes_deployment" "onlyoffice-document-server" {
   }
   spec {
     replicas = 1
+    strategy {
+      type = "Recreate"
+    }
     selector {
       match_labels = {
         app = "onlyoffice-document-server"
diff --git a/stacks/platform/modules/authentik/values.yaml b/stacks/platform/modules/authentik/values.yaml
index 05cc1065..e50719e9 100644
--- a/stacks/platform/modules/authentik/values.yaml
+++ b/stacks/platform/modules/authentik/values.yaml
@@ -17,6 +17,11 @@ authentik:
 
 server:
   replicas: 3
+  strategy:
+    type: RollingUpdate
+    rollingUpdate:
+      maxSurge: 0
+      maxUnavailable: 1
   resources:
     requests:
       cpu: 100m
@@ -45,6 +50,11 @@ global:
 
 worker:
   replicas: 3
+  strategy:
+    type: RollingUpdate
+    rollingUpdate:
+      maxSurge: 0
+      maxUnavailable: 1
   resources:
     requests:
       cpu: 100m
diff --git a/stacks/platform/modules/infra-maintenance/main.tf b/stacks/platform/modules/infra-maintenance/main.tf
index 1f1451d1..6e3e108c 100644
--- a/stacks/platform/modules/infra-maintenance/main.tf
+++ b/stacks/platform/modules/infra-maintenance/main.tf
@@ -150,6 +150,60 @@ resource "kubernetes_cron_job_v1" "backup-etcd" {
   }
 }
 
+# Weekly etcd defragmentation — prevents fragmentation buildup that causes slow requests
+resource "kubernetes_cron_job_v1" "defrag-etcd" {
+  metadata {
+    name      = "defrag-etcd"
+    namespace = "default"
+  }
+  spec {
+    schedule                      = "0 3 * * 0"
+    successful_jobs_history_limit = 1
+    failed_jobs_history_limit     = 1
+    concurrency_policy            = "Forbid"
+    job_template {
+      metadata {
+        name = "defrag-etcd"
+      }
+      spec {
+        template {
+          metadata {
+            name = "defrag-etcd"
+          }
+          spec {
+            node_name           = "k8s-master"
+            priority_class_name = "system-cluster-critical"
+            host_network        = true
+            container {
+              name    = "defrag-etcd"
+              image   = "registry.k8s.io/etcd:3.5.21-0"
+              command = ["etcdctl"]
+              args = ["--endpoints=https://127.0.0.1:2379", "--cacert=/etc/kubernetes/pki/etcd/ca.crt", "--cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt", "--key=/etc/kubernetes/pki/etcd/healthcheck-client.key", "--command-timeout=60s", "defrag"] # defrag blocks writes and often exceeds etcdctl's 5s default timeout on a fragmented DB
+              env {
+                name  = "ETCDCTL_API"
+                value = "3"
+              }
+              volume_mount {
+                mount_path = "/etc/kubernetes/pki/etcd"
+                name       = "etcd-certs"
+                read_only  = true
+              }
+            }
+            volume {
+              name = "etcd-certs"
+              host_path {
+                path = "/etc/kubernetes/pki/etcd"
+                type = "Directory" # etcd PKI must already exist on the master; DirectoryOrCreate would silently mount an empty dir and mask a missing cert tree
+              }
+            }
+            restart_policy = "Never"
+          }
+        }
+      }
+    }
+  }
+}
+
 # Clean up evicted/failed pods cluster-wide daily
 resource "kubernetes_cron_job_v1" "cleanup-failed-pods" {
   metadata {
diff --git a/stacks/platform/modules/kyverno/main.tf b/stacks/platform/modules/kyverno/main.tf
index 83a752cb..405029dc 100644
--- a/stacks/platform/modules/kyverno/main.tf
+++ b/stacks/platform/modules/kyverno/main.tf
@@ -25,6 +25,9 @@ resource "helm_release" "kyverno" {
       forceFailurePolicyIgnore = {
        enabled = true
      }
+      policyReports = {
+        enabled = false
+      }
     }
 
     reportsController = {
@@ -66,6 +69,14 @@ resource "helm_release" "kyverno" {
 
     admissionController = {
       replicas = 2
+      updateStrategy = {
+        type = "RollingUpdate"
+        rollingUpdate = {
+          maxSurge       = 0
+          maxUnavailable = 1
+        }
+      }
+
       container = {
         resources = {
           limits = {
diff --git a/stacks/platform/modules/monitoring/grafana_chart_values.yaml b/stacks/platform/modules/monitoring/grafana_chart_values.yaml
index 44ce866b..a5bc4222 100644
--- a/stacks/platform/modules/monitoring/grafana_chart_values.yaml
+++ b/stacks/platform/modules/monitoring/grafana_chart_values.yaml
@@ -1,5 +1,8 @@
 deploymentStrategy:
   type: RollingUpdate
+  rollingUpdate:
+    maxSurge: 0
+    maxUnavailable: 1
 replicas: 2
 adminPassword: "${grafana_admin_password}"
 resources:
diff --git a/stacks/platform/modules/traefik/main.tf b/stacks/platform/modules/traefik/main.tf
index 39bdcd53..6428322c 100644
--- a/stacks/platform/modules/traefik/main.tf
+++ b/stacks/platform/modules/traefik/main.tf
@@ -64,8 +64,8 @@ resource "helm_release" "traefik" {
     updateStrategy = {
       type = "RollingUpdate"
       rollingUpdate = {
-        maxUnavailable = 1
-        maxSurge       = 2
+        maxUnavailable = 0
+        maxSurge       = 1
       }
     }
 
diff --git a/stacks/stirling-pdf/main.tf b/stacks/stirling-pdf/main.tf
index b06ad539..16ef39a9 100644
--- a/stacks/stirling-pdf/main.tf
+++ b/stacks/stirling-pdf/main.tf
@@ -40,6 +40,9 @@ resource "kubernetes_deployment" "stirling-pdf" {
   }
   spec {
     replicas = 1
+    strategy {
+      type = "Recreate"
+    }
     selector {
       match_labels = {
         app = "stirling-pdf"