mitigate cluster instability during terraform applies
- Recreate strategy for heavy single-replica deployments (onlyoffice, stirling-pdf)
- Reduce maxSurge on multi-replica deployments (traefik, authentik, grafana, kyverno) to prevent a surge of memory requests from overwhelming the scheduler
- Weekly etcd defrag CronJob (Sunday 3 AM) to prevent fragmentation buildup
- Disable Kyverno policy reports (ephemeral report cleanup)
- Cloud-init: journald persistence + 4Gi swap for worker nodes
- Kubelet: LimitedSwap behavior for memory pressure relief
This commit is contained in:
parent
750da49c80
commit
c766d849f8
9 changed files with 100 additions and 2 deletions
|
|
@ -56,6 +56,10 @@ apt:
|
||||||
filename: docker.list
|
filename: docker.list
|
||||||
|
|
||||||
runcmd:
|
runcmd:
|
||||||
|
# Enable persistent journald logging for crash forensics
|
||||||
|
- mkdir -p /var/log/journal
|
||||||
|
- sed -i 's/#Storage=auto/Storage=persistent/' /etc/systemd/journald.conf
|
||||||
|
- systemctl restart systemd-journald
|
||||||
%{if is_k8s_template}
|
%{if is_k8s_template}
|
||||||
- apt-mark hold kubelet kubeadm kubectl
|
- apt-mark hold kubelet kubeadm kubectl
|
||||||
- systemctl stop kubelet
|
- systemctl stop kubelet
|
||||||
|
|
@ -63,6 +67,14 @@ runcmd:
|
||||||
- ${containerd_config_update_command}
|
- ${containerd_config_update_command}
|
||||||
- systemctl restart containerd
|
- systemctl restart containerd
|
||||||
- systemctl enable --now iscsid
|
- systemctl enable --now iscsid
|
||||||
|
# Create 4Gi swap file for worker node memory pressure relief (NOT for master — etcd is latency-critical)
|
||||||
|
- fallocate -l 4G /swapfile
|
||||||
|
- chmod 600 /swapfile
|
||||||
|
- mkswap /swapfile
|
||||||
|
- swapon /swapfile
|
||||||
|
- echo '/swapfile none swap sw 0 0' >> /etc/fstab
|
||||||
|
- sysctl -w vm.swappiness=10
|
||||||
|
- echo 'vm.swappiness=10' >> /etc/sysctl.d/99-swap.conf
|
||||||
- ${k8s_join_command}
|
- ${k8s_join_command}
|
||||||
- systemctl enable kubelet
|
- systemctl enable kubelet
|
||||||
- systemctl start kubelet
|
- systemctl start kubelet
|
||||||
|
|
|
||||||
|
|
@ -130,6 +130,8 @@ evictionSoftGracePeriod:
|
||||||
memory.available: "30s"
|
memory.available: "30s"
|
||||||
nodefs.available: "60s" # Grace period for disk space warnings
|
nodefs.available: "60s" # Grace period for disk space warnings
|
||||||
imagefs.available: "30s" # Shorter grace for critical containerd space
|
imagefs.available: "30s" # Shorter grace for critical containerd space
|
||||||
|
memorySwap:
|
||||||
|
swapBehavior: "LimitedSwap"
|
||||||
KUBELET_PATCH
|
KUBELET_PATCH
|
||||||
EOF
|
EOF
|
||||||
k8s_join_command = var.k8s_join_command
|
k8s_join_command = var.k8s_join_command
|
||||||
|
|
|
||||||
|
|
@ -85,6 +85,9 @@ resource "kubernetes_deployment" "onlyoffice-document-server" {
|
||||||
}
|
}
|
||||||
spec {
|
spec {
|
||||||
replicas = 1
|
replicas = 1
|
||||||
|
strategy {
|
||||||
|
type = "Recreate"
|
||||||
|
}
|
||||||
selector {
|
selector {
|
||||||
match_labels = {
|
match_labels = {
|
||||||
app = "onlyoffice-document-server"
|
app = "onlyoffice-document-server"
|
||||||
|
|
|
||||||
|
|
@ -17,6 +17,11 @@ authentik:
|
||||||
|
|
||||||
server:
|
server:
|
||||||
replicas: 3
|
replicas: 3
|
||||||
|
strategy:
|
||||||
|
type: RollingUpdate
|
||||||
|
rollingUpdate:
|
||||||
|
maxSurge: 0
|
||||||
|
maxUnavailable: 1
|
||||||
resources:
|
resources:
|
||||||
requests:
|
requests:
|
||||||
cpu: 100m
|
cpu: 100m
|
||||||
|
|
@ -45,6 +50,11 @@ global:
|
||||||
|
|
||||||
worker:
|
worker:
|
||||||
replicas: 3
|
replicas: 3
|
||||||
|
strategy:
|
||||||
|
type: RollingUpdate
|
||||||
|
rollingUpdate:
|
||||||
|
maxSurge: 0
|
||||||
|
maxUnavailable: 1
|
||||||
resources:
|
resources:
|
||||||
requests:
|
requests:
|
||||||
cpu: 100m
|
cpu: 100m
|
||||||
|
|
|
||||||
|
|
@ -150,6 +150,60 @@ resource "kubernetes_cron_job_v1" "backup-etcd" {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Weekly etcd defragmentation — prevents fragmentation buildup that causes slow requests
|
||||||
|
resource "kubernetes_cron_job_v1" "defrag-etcd" {
|
||||||
|
metadata {
|
||||||
|
name = "defrag-etcd"
|
||||||
|
namespace = "default"
|
||||||
|
}
|
||||||
|
spec {
|
||||||
|
schedule = "0 3 * * 0"
|
||||||
|
successful_jobs_history_limit = 1
|
||||||
|
failed_jobs_history_limit = 1
|
||||||
|
concurrency_policy = "Forbid"
|
||||||
|
job_template {
|
||||||
|
metadata {
|
||||||
|
name = "defrag-etcd"
|
||||||
|
}
|
||||||
|
spec {
|
||||||
|
template {
|
||||||
|
metadata {
|
||||||
|
name = "defrag-etcd"
|
||||||
|
}
|
||||||
|
spec {
|
||||||
|
node_name = "k8s-master"
|
||||||
|
priority_class_name = "system-cluster-critical"
|
||||||
|
host_network = true
|
||||||
|
container {
|
||||||
|
name = "defrag-etcd"
|
||||||
|
image = "registry.k8s.io/etcd:3.5.21-0"
|
||||||
|
command = ["etcdctl"]
|
||||||
|
args = ["--endpoints=https://127.0.0.1:2379", "--cacert=/etc/kubernetes/pki/etcd/ca.crt", "--cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt", "--key=/etc/kubernetes/pki/etcd/healthcheck-client.key", "defrag"]
|
||||||
|
env {
|
||||||
|
name = "ETCDCTL_API"
|
||||||
|
value = "3"
|
||||||
|
}
|
||||||
|
volume_mount {
|
||||||
|
mount_path = "/etc/kubernetes/pki/etcd"
|
||||||
|
name = "etcd-certs"
|
||||||
|
read_only = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
volume {
|
||||||
|
name = "etcd-certs"
|
||||||
|
host_path {
|
||||||
|
path = "/etc/kubernetes/pki/etcd"
|
||||||
|
type = "DirectoryOrCreate"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
restart_policy = "Never"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
# Clean up evicted/failed pods cluster-wide daily
|
# Clean up evicted/failed pods cluster-wide daily
|
||||||
resource "kubernetes_cron_job_v1" "cleanup-failed-pods" {
|
resource "kubernetes_cron_job_v1" "cleanup-failed-pods" {
|
||||||
metadata {
|
metadata {
|
||||||
|
|
|
||||||
|
|
@ -25,6 +25,9 @@ resource "helm_release" "kyverno" {
|
||||||
forceFailurePolicyIgnore = {
|
forceFailurePolicyIgnore = {
|
||||||
enabled = true
|
enabled = true
|
||||||
}
|
}
|
||||||
|
policyReports = {
|
||||||
|
enabled = false
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
reportsController = {
|
reportsController = {
|
||||||
|
|
@ -66,6 +69,14 @@ resource "helm_release" "kyverno" {
|
||||||
admissionController = {
|
admissionController = {
|
||||||
replicas = 2
|
replicas = 2
|
||||||
|
|
||||||
|
updateStrategy = {
|
||||||
|
type = "RollingUpdate"
|
||||||
|
rollingUpdate = {
|
||||||
|
maxSurge = 0
|
||||||
|
maxUnavailable = 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
container = {
|
container = {
|
||||||
resources = {
|
resources = {
|
||||||
limits = {
|
limits = {
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,8 @@
|
||||||
deploymentStrategy:
|
deploymentStrategy:
|
||||||
type: RollingUpdate
|
type: RollingUpdate
|
||||||
|
rollingUpdate:
|
||||||
|
maxSurge: 0
|
||||||
|
maxUnavailable: 1
|
||||||
replicas: 2
|
replicas: 2
|
||||||
adminPassword: "${grafana_admin_password}"
|
adminPassword: "${grafana_admin_password}"
|
||||||
resources:
|
resources:
|
||||||
|
|
|
||||||
|
|
@ -64,8 +64,8 @@ resource "helm_release" "traefik" {
|
||||||
updateStrategy = {
|
updateStrategy = {
|
||||||
type = "RollingUpdate"
|
type = "RollingUpdate"
|
||||||
rollingUpdate = {
|
rollingUpdate = {
|
||||||
maxUnavailable = 1
|
maxUnavailable = 0
|
||||||
maxSurge = 2
|
maxSurge = 1
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -40,6 +40,9 @@ resource "kubernetes_deployment" "stirling-pdf" {
|
||||||
}
|
}
|
||||||
spec {
|
spec {
|
||||||
replicas = 1
|
replicas = 1
|
||||||
|
strategy {
|
||||||
|
type = "Recreate"
|
||||||
|
}
|
||||||
selector {
|
selector {
|
||||||
match_labels = {
|
match_labels = {
|
||||||
app = "stirling-pdf"
|
app = "stirling-pdf"
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue