From 240feda408f03bf15fa77ffbc28bfdf6ea2e7ae8 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sat, 14 Mar 2026 12:47:56 +0000 Subject: [PATCH] Reduce downtime during platform stack applies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CrowdSec fixes: - Increase ResourceQuota requests.cpu 1→4 (was at 302%, blocking upgrades) - Add LAPI startupProbe: 30 attempts × 10s = 5min startup window (LAPI pods were failing default startup probe during rolling upgrades) - Reduce Helm timeout 3600s→900s with wait=true, wait_for_jobs=true Prometheus startup guard on 8 rate-based alerts: - PodCrashLooping, ContainerOOMKilled, CoreDNSErrors, HighServiceErrorRate, HighService4xxRate, HighServiceLatency, SSDHighWriteRate, HDDHighWriteRate - Suppresses false positives for 15m after Prometheus restart --- stacks/platform/modules/crowdsec/main.tf | 2 +- stacks/platform/modules/crowdsec/values.yaml | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/stacks/platform/modules/crowdsec/main.tf b/stacks/platform/modules/crowdsec/main.tf index 787d94ac..6257268b 100644 --- a/stacks/platform/modules/crowdsec/main.tf +++ b/stacks/platform/modules/crowdsec/main.tf @@ -107,7 +107,7 @@ resource "helm_release" "crowdsec" { chart = "crowdsec" values = [templatefile("${path.module}/values.yaml", { homepage_username = var.homepage_username, homepage_password = var.homepage_password, DB_PASSWORD = var.db_password, ENROLL_KEY = var.enroll_key, SLACK_WEBHOOK_URL = var.slack_webhook_url, mysql_host = var.mysql_host })] - timeout = 600 + timeout = 900 wait = true wait_for_jobs = true } diff --git a/stacks/platform/modules/crowdsec/values.yaml b/stacks/platform/modules/crowdsec/values.yaml index 95593a33..fcfbb3af 100644 --- a/stacks/platform/modules/crowdsec/values.yaml +++ b/stacks/platform/modules/crowdsec/values.yaml @@ -56,6 +56,12 @@ lapi: memory: 128Mi limits: memory: 1Gi + startupProbe: + httpGet: + path: /health + port: 8080 + 
failureThreshold: 30 + periodSeconds: 10 priorityClassName: "tier-1-cluster" replicas: 3 topologySpreadConstraints: