Mitigate cluster instability during Terraform applies
- Recreate strategy for heavy single-replica deployments (onlyoffice, stirling-pdf)
- Reduce maxSurge on multi-replica deployments (traefik, authentik, grafana, kyverno) to prevent the memory-request surge from overwhelming the scheduler
- Weekly etcd defrag CronJob (Sunday 3 AM) to prevent fragmentation buildup
- Disable Kyverno policy reports (ephemeral report cleanup)
- Cloud-init: journald persistence + 4Gi swap for worker nodes
- Kubelet: LimitedSwap behavior for memory pressure relief
This commit is contained in:
parent
1fe7798609
commit
c034adab5f
9 changed files with 100 additions and 2 deletions
|
|
@ -56,6 +56,10 @@ apt:
|
|||
filename: docker.list
|
||||
|
||||
runcmd:
|
||||
# Enable persistent journald logging for crash forensics
|
||||
- mkdir -p /var/log/journal
|
||||
- sed -i 's/#Storage=auto/Storage=persistent/' /etc/systemd/journald.conf
|
||||
- systemctl restart systemd-journald
|
||||
%{if is_k8s_template}
|
||||
- apt-mark hold kubelet kubeadm kubectl
|
||||
- systemctl stop kubelet
|
||||
|
|
@ -63,6 +67,14 @@ runcmd:
|
|||
- ${containerd_config_update_command}
|
||||
- systemctl restart containerd
|
||||
- systemctl enable --now iscsid
|
||||
# Create 4Gi swap file for worker node memory pressure relief (NOT for master — etcd is latency-critical)
|
||||
- fallocate -l 4G /swapfile
|
||||
- chmod 600 /swapfile
|
||||
- mkswap /swapfile
|
||||
- swapon /swapfile
|
||||
- echo '/swapfile none swap sw 0 0' >> /etc/fstab
|
||||
- sysctl -w vm.swappiness=10
|
||||
- echo 'vm.swappiness=10' >> /etc/sysctl.d/99-swap.conf
|
||||
- ${k8s_join_command}
|
||||
- systemctl enable kubelet
|
||||
- systemctl start kubelet
|
||||
|
|
|
|||
|
|
@ -130,6 +130,8 @@ evictionSoftGracePeriod:
|
|||
memory.available: "30s"
|
||||
nodefs.available: "60s" # Grace period for disk space warnings
|
||||
imagefs.available: "30s" # Shorter grace for critical containerd space
|
||||
memorySwap:
|
||||
swapBehavior: "LimitedSwap"
|
||||
KUBELET_PATCH
|
||||
EOF
|
||||
k8s_join_command = var.k8s_join_command
|
||||
|
|
|
|||
|
|
@ -85,6 +85,9 @@ resource "kubernetes_deployment" "onlyoffice-document-server" {
|
|||
}
|
||||
spec {
|
||||
replicas = 1
|
||||
strategy {
|
||||
type = "Recreate"
|
||||
}
|
||||
selector {
|
||||
match_labels = {
|
||||
app = "onlyoffice-document-server"
|
||||
|
|
|
|||
|
|
@ -17,6 +17,11 @@ authentik:
|
|||
|
||||
server:
|
||||
replicas: 3
|
||||
strategy:
|
||||
type: RollingUpdate
|
||||
rollingUpdate:
|
||||
maxSurge: 0
|
||||
maxUnavailable: 1
|
||||
resources:
|
||||
requests:
|
||||
cpu: 100m
|
||||
|
|
@ -45,6 +50,11 @@ global:
|
|||
|
||||
worker:
|
||||
replicas: 3
|
||||
strategy:
|
||||
type: RollingUpdate
|
||||
rollingUpdate:
|
||||
maxSurge: 0
|
||||
maxUnavailable: 1
|
||||
resources:
|
||||
requests:
|
||||
cpu: 100m
|
||||
|
|
|
|||
|
|
@ -150,6 +150,60 @@ resource "kubernetes_cron_job_v1" "backup-etcd" {
|
|||
}
|
||||
}
|
||||
|
||||
# Weekly etcd defragmentation — prevents fragmentation buildup that causes slow requests.
# Runs Sundays at 03:00, pinned to the master node with host networking so etcdctl can
# reach the local etcd endpoint (127.0.0.1:2379) using the control plane's client certs.
resource "kubernetes_cron_job_v1" "defrag-etcd" {
  metadata {
    name      = "defrag-etcd"
    namespace = "default"
  }
  spec {
    schedule                      = "0 3 * * 0" # Sunday 03:00
    successful_jobs_history_limit = 1
    failed_jobs_history_limit     = 1
    concurrency_policy            = "Forbid" # never run two defrags at once
    job_template {
      metadata {
        name = "defrag-etcd"
      }
      spec {
        template {
          metadata {
            name = "defrag-etcd"
          }
          spec {
            # node_name bypasses the scheduler, so the pod lands on the master
            # despite any control-plane NoSchedule taints.
            node_name           = "k8s-master"
            priority_class_name = "system-cluster-critical"
            host_network        = true
            container {
              name    = "defrag-etcd"
              image   = "registry.k8s.io/etcd:3.5.21-0"
              command = ["etcdctl"]
              # --command-timeout is raised from etcdctl's 5s default: defrag on a
              # genuinely fragmented keyspace routinely takes longer, and the default
              # would make the job fail exactly when it is most needed.
              args = [
                "--endpoints=https://127.0.0.1:2379",
                "--cacert=/etc/kubernetes/pki/etcd/ca.crt",
                "--cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt",
                "--key=/etc/kubernetes/pki/etcd/healthcheck-client.key",
                "--command-timeout=60s",
                "defrag",
              ]
              env {
                name  = "ETCDCTL_API"
                value = "3"
              }
              volume_mount {
                mount_path = "/etc/kubernetes/pki/etcd"
                name       = "etcd-certs"
                read_only  = true
              }
            }
            volume {
              name = "etcd-certs"
              host_path {
                path = "/etc/kubernetes/pki/etcd"
                # "Directory" (was "DirectoryOrCreate"): the etcd PKI directory must
                # already exist on the master. Auto-creating an empty one would mask a
                # broken control plane; better to fail the pod fast and visibly.
                type = "Directory"
              }
            }
            restart_policy = "Never"
          }
        }
      }
    }
  }
}
|
||||
|
||||
# Clean up evicted/failed pods cluster-wide daily
|
||||
resource "kubernetes_cron_job_v1" "cleanup-failed-pods" {
|
||||
metadata {
|
||||
|
|
|
|||
|
|
@ -25,6 +25,9 @@ resource "helm_release" "kyverno" {
|
|||
forceFailurePolicyIgnore = {
|
||||
enabled = true
|
||||
}
|
||||
policyReports = {
|
||||
enabled = false
|
||||
}
|
||||
}
|
||||
|
||||
reportsController = {
|
||||
|
|
@ -66,6 +69,14 @@ resource "helm_release" "kyverno" {
|
|||
admissionController = {
|
||||
replicas = 2
|
||||
|
||||
updateStrategy = {
|
||||
type = "RollingUpdate"
|
||||
rollingUpdate = {
|
||||
maxSurge = 0
|
||||
maxUnavailable = 1
|
||||
}
|
||||
}
|
||||
|
||||
container = {
|
||||
resources = {
|
||||
limits = {
|
||||
|
|
|
|||
|
|
@ -1,5 +1,8 @@
|
|||
deploymentStrategy:
|
||||
type: RollingUpdate
|
||||
rollingUpdate:
|
||||
maxSurge: 0
|
||||
maxUnavailable: 1
|
||||
replicas: 2
|
||||
adminPassword: "${grafana_admin_password}"
|
||||
resources:
|
||||
|
|
|
|||
|
|
@ -64,8 +64,8 @@ resource "helm_release" "traefik" {
|
|||
updateStrategy = {
|
||||
type = "RollingUpdate"
|
||||
rollingUpdate = {
|
||||
maxUnavailable = 1
|
||||
maxSurge = 2
|
||||
maxUnavailable = 0
|
||||
maxSurge = 1
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -40,6 +40,9 @@ resource "kubernetes_deployment" "stirling-pdf" {
|
|||
}
|
||||
spec {
|
||||
replicas = 1
|
||||
strategy {
|
||||
type = "Recreate"
|
||||
}
|
||||
selector {
|
||||
match_labels = {
|
||||
app = "stirling-pdf"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue