mitigate cluster instability during terraform applies

- Recreate strategy for heavy single-replica deployments (onlyoffice, stirling-pdf)
- Reduce maxSurge on multi-replica deployments (traefik, authentik, grafana, kyverno)
  to prevent memory request surge overwhelming scheduler
- Weekly etcd defrag CronJob (Sunday 3 AM) to prevent fragmentation buildup
- Disable Kyverno policy reports (ephemeral report cleanup)
- Cloud-init: journald persistence + 4Gi swap for worker nodes
- Kubelet: LimitedSwap behavior for memory pressure relief
This commit is contained in:
Viktor Barzin 2026-03-15 17:23:39 +00:00 committed by Viktor Barzin
parent 750da49c80
commit c766d849f8
9 changed files with 100 additions and 2 deletions

View file

@ -17,6 +17,11 @@ authentik:
server:
replicas: 3
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 0
maxUnavailable: 1
resources:
requests:
cpu: 100m
@ -45,6 +50,11 @@ global:
worker:
replicas: 3
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 0
maxUnavailable: 1
resources:
requests:
cpu: 100m

View file

@ -150,6 +150,60 @@ resource "kubernetes_cron_job_v1" "backup-etcd" {
}
}
# Weekly etcd defragmentation prevents fragmentation buildup that causes slow requests
resource "kubernetes_cron_job_v1" "defrag-etcd" {
metadata {
name = "defrag-etcd"
namespace = "default"
}
spec {
schedule = "0 3 * * 0"
successful_jobs_history_limit = 1
failed_jobs_history_limit = 1
concurrency_policy = "Forbid"
job_template {
metadata {
name = "defrag-etcd"
}
spec {
template {
metadata {
name = "defrag-etcd"
}
spec {
node_name = "k8s-master"
priority_class_name = "system-cluster-critical"
host_network = true
container {
name = "defrag-etcd"
image = "registry.k8s.io/etcd:3.5.21-0"
command = ["etcdctl"]
args = ["--endpoints=https://127.0.0.1:2379", "--cacert=/etc/kubernetes/pki/etcd/ca.crt", "--cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt", "--key=/etc/kubernetes/pki/etcd/healthcheck-client.key", "defrag"]
env {
name = "ETCDCTL_API"
value = "3"
}
volume_mount {
mount_path = "/etc/kubernetes/pki/etcd"
name = "etcd-certs"
read_only = true
}
}
volume {
name = "etcd-certs"
host_path {
path = "/etc/kubernetes/pki/etcd"
type = "DirectoryOrCreate"
}
}
restart_policy = "Never"
}
}
}
}
}
}
# Clean up evicted/failed pods cluster-wide daily
resource "kubernetes_cron_job_v1" "cleanup-failed-pods" {
metadata {

View file

@ -25,6 +25,9 @@ resource "helm_release" "kyverno" {
forceFailurePolicyIgnore = {
enabled = true
}
policyReports = {
enabled = false
}
}
reportsController = {
@ -66,6 +69,14 @@ resource "helm_release" "kyverno" {
admissionController = {
replicas = 2
updateStrategy = {
type = "RollingUpdate"
rollingUpdate = {
maxSurge = 0
maxUnavailable = 1
}
}
container = {
resources = {
limits = {

View file

@ -1,5 +1,8 @@
deploymentStrategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 0
maxUnavailable: 1
replicas: 2
adminPassword: "${grafana_admin_password}"
resources:

View file

@ -64,8 +64,8 @@ resource "helm_release" "traefik" {
updateStrategy = {
type = "RollingUpdate"
rollingUpdate = {
maxUnavailable = 1
maxSurge = 2
maxUnavailable = 0
maxSurge = 1
}
}