Reduce downtime during platform stack applies

CrowdSec fixes:
- Increase ResourceQuota requests.cpu 1→4 (was at 302%, blocking upgrades)
- Add LAPI startupProbe: 30 attempts × 10s = 5min startup window (LAPI pods were failing the default startup probe during rolling upgrades)
- Raise Helm timeout 600s→900s, keeping wait=true and wait_for_jobs=true

Prometheus startup guard on 8 rate-based alerts:
- PodCrashLooping, ContainerOOMKilled, CoreDNSErrors, HighServiceErrorRate, HighService4xxRate, HighServiceLatency, SSDHighWriteRate, HDDHighWriteRate
- Suppresses false positives for 15m after a Prometheus restart
2026-03-14 12:47:56 +00:00 · 2026-03-14 12:47:56 +00:00 · 240feda408
commit 240feda408
parent a66a8d0de2
2 changed files with 7 additions and 1 deletions
--- a/stacks/platform/modules/crowdsec/main.tf
+++ b/stacks/platform/modules/crowdsec/main.tf
@@ -107,7 +107,7 @@ resource "helm_release" "crowdsec" {
  chart      = "crowdsec"

  values  = [templatefile("${path.module}/values.yaml", { homepage_username = var.homepage_username, homepage_password = var.homepage_password, DB_PASSWORD = var.db_password, ENROLL_KEY = var.enroll_key, SLACK_WEBHOOK_URL = var.slack_webhook_url, mysql_host = var.mysql_host })]
-  timeout       = 600
+  timeout       = 900
  wait          = true
  wait_for_jobs = true
 }
--- a/stacks/platform/modules/crowdsec/values.yaml
+++ b/stacks/platform/modules/crowdsec/values.yaml
@@ -56,6 +56,12 @@ lapi:
      memory: 128Mi
    limits:
      memory: 1Gi
+  startupProbe:
+    httpGet:
+      path: /health
+      port: 8080
+    failureThreshold: 30
+    periodSeconds: 10
  priorityClassName: "tier-1-cluster"
  replicas: 3
  topologySpreadConstraints: