mitigate cluster instability during terraform applies

- Recreate strategy for heavy single-replica deployments (onlyoffice, stirling-pdf)
- Reduce maxSurge on multi-replica deployments (traefik, authentik, grafana, kyverno)
  to prevent a surge of memory requests from overwhelming the scheduler
- Weekly etcd defrag CronJob (Sunday 3 AM) to prevent fragmentation buildup
- Disable Kyverno policy reports (ephemeral report cleanup)
- Cloud-init: journald persistence + 4Gi swap for worker nodes
- Kubelet: LimitedSwap behavior for memory pressure relief
This commit is contained in:
Viktor Barzin 2026-03-15 17:23:39 +00:00
parent 1fe7798609
commit c034adab5f
9 changed files with 100 additions and 2 deletions

View file

@ -56,6 +56,10 @@ apt:
filename: docker.list
runcmd:
# Enable persistent journald logging for crash forensics
- mkdir -p /var/log/journal
- sed -i 's/#Storage=auto/Storage=persistent/' /etc/systemd/journald.conf
- systemctl restart systemd-journald
%{if is_k8s_template}
- apt-mark hold kubelet kubeadm kubectl
- systemctl stop kubelet
@ -63,6 +67,14 @@ runcmd:
- ${containerd_config_update_command}
- systemctl restart containerd
- systemctl enable --now iscsid
# Create 4Gi swap file for worker node memory pressure relief (NOT for master — etcd is latency-critical)
- fallocate -l 4G /swapfile
- chmod 600 /swapfile
- mkswap /swapfile
- swapon /swapfile
- echo '/swapfile none swap sw 0 0' >> /etc/fstab
- sysctl -w vm.swappiness=10
- echo 'vm.swappiness=10' >> /etc/sysctl.d/99-swap.conf
- ${k8s_join_command}
- systemctl enable kubelet
- systemctl start kubelet

View file

@ -130,6 +130,8 @@ evictionSoftGracePeriod:
memory.available: "30s"
nodefs.available: "60s" # Grace period for disk space warnings
imagefs.available: "30s" # Shorter grace for critical containerd space
memorySwap:
swapBehavior: "LimitedSwap"
KUBELET_PATCH
EOF
k8s_join_command = var.k8s_join_command

View file

@ -85,6 +85,9 @@ resource "kubernetes_deployment" "onlyoffice-document-server" {
}
spec {
replicas = 1
strategy {
type = "Recreate"
}
selector {
match_labels = {
app = "onlyoffice-document-server"

View file

@ -17,6 +17,11 @@ authentik:
server:
replicas: 3
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 0
maxUnavailable: 1
resources:
requests:
cpu: 100m
@ -45,6 +50,11 @@ global:
worker:
replicas: 3
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 0
maxUnavailable: 1
resources:
requests:
cpu: 100m

View file

@ -150,6 +150,60 @@ resource "kubernetes_cron_job_v1" "backup-etcd" {
}
}
# Weekly etcd defragmentation prevents fragmentation buildup that causes slow requests
resource "kubernetes_cron_job_v1" "defrag-etcd" {
  metadata {
    name      = "defrag-etcd"
    namespace = "default"
  }
  spec {
    # Sunday 03:00 — defrag briefly blocks reads/writes on the member, so run off-peak
    schedule                      = "0 3 * * 0"
    successful_jobs_history_limit = 1
    failed_jobs_history_limit     = 1
    concurrency_policy            = "Forbid" # never allow two defrag runs to overlap
    job_template {
      metadata {
        name = "defrag-etcd"
      }
      spec {
        template {
          metadata {
            name = "defrag-etcd"
          }
          spec {
            # Pin to the control-plane node and share its network namespace so
            # etcdctl can reach the local etcd member on the loopback endpoint.
            node_name           = "k8s-master"
            priority_class_name = "system-cluster-critical"
            host_network        = true
            container {
              name    = "defrag-etcd"
              image   = "registry.k8s.io/etcd:3.5.21-0"
              command = ["etcdctl"]
              # --command-timeout raised above the etcdctl 5s default: defrag is a
              # blocking operation and can easily exceed 5s on a fragmented datastore,
              # which would otherwise make the job fail spuriously.
              args = [
                "--endpoints=https://127.0.0.1:2379",
                "--cacert=/etc/kubernetes/pki/etcd/ca.crt",
                "--cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt",
                "--key=/etc/kubernetes/pki/etcd/healthcheck-client.key",
                "--command-timeout=30s",
                "defrag",
              ]
              env {
                name  = "ETCDCTL_API"
                value = "3"
              }
              volume_mount {
                mount_path = "/etc/kubernetes/pki/etcd"
                name       = "etcd-certs"
                read_only  = true
              }
            }
            volume {
              name = "etcd-certs"
              host_path {
                path = "/etc/kubernetes/pki/etcd"
                # "Directory" (not "DirectoryOrCreate"): the etcd PKI directory must
                # already exist on the master. Silently creating an empty one would
                # only mask a misconfigured node and fail later inside etcdctl with a
                # confusing certificate error.
                type = "Directory"
              }
            }
            restart_policy = "Never"
          }
        }
      }
    }
  }
}
# Clean up evicted/failed pods cluster-wide daily
resource "kubernetes_cron_job_v1" "cleanup-failed-pods" {
metadata {

View file

@ -25,6 +25,9 @@ resource "helm_release" "kyverno" {
forceFailurePolicyIgnore = {
enabled = true
}
policyReports = {
enabled = false
}
}
reportsController = {
@ -66,6 +69,14 @@ resource "helm_release" "kyverno" {
admissionController = {
replicas = 2
updateStrategy = {
type = "RollingUpdate"
rollingUpdate = {
maxSurge = 0
maxUnavailable = 1
}
}
container = {
resources = {
limits = {

View file

@ -1,5 +1,8 @@
deploymentStrategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 0
maxUnavailable: 1
replicas: 2
adminPassword: "${grafana_admin_password}"
resources:

View file

@ -64,8 +64,8 @@ resource "helm_release" "traefik" {
updateStrategy = {
type = "RollingUpdate"
rollingUpdate = {
maxUnavailable = 1
maxSurge = 2
maxUnavailable = 0
maxSurge = 1
}
}

View file

@ -40,6 +40,9 @@ resource "kubernetes_deployment" "stirling-pdf" {
}
spec {
replicas = 1
strategy {
type = "Recreate"
}
selector {
match_labels = {
app = "stirling-pdf"