From 4ea3ffe9d3064c5beb6d7a6f0483408c6f02a39c Mon Sep 17 00:00:00 2001
From: Viktor Barzin <viktorbarzin@meta.com>
Date: Sat, 14 Mar 2026 12:09:09 +0000
Subject: [PATCH] Reduce downtime during platform stack applies
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CrowdSec Helm fix:
- Increase ResourceQuota requests.cpu from 1 to 4 — pods were at 302%
  of quota, preventing scheduling during rolling upgrades
- Reduce Helm timeout from 3600s to 600s — 1 hour hang is excessive
- Add wait=true and wait_for_jobs=true for proper readiness checking

Prometheus startup guard:
- Add startup guard to 8 rate/increase-based alerts that false-fire
  after Prometheus restarts (rate()/increase() need at least 2 samples):
  PodCrashLooping, ContainerOOMKilled, CoreDNSErrors,
  HighServiceErrorRate, HighService4xxRate, HighServiceLatency,
  SSDHighWriteRate, HDDHighWriteRate
- Guard appended to each expr:
    and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
  This suppresses the alerts for 15m after Prometheus startup
---
 stacks/platform/modules/crowdsec/main.tf            |  6 ++++--
 .../modules/monitoring/prometheus_chart_values.tpl  | 13 ++++++++-----
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/stacks/platform/modules/crowdsec/main.tf b/stacks/platform/modules/crowdsec/main.tf
index 9a060f4b..787d94ac 100644
--- a/stacks/platform/modules/crowdsec/main.tf
+++ b/stacks/platform/modules/crowdsec/main.tf
@@ -107,7 +107,9 @@ resource "helm_release" "crowdsec" {
   chart      = "crowdsec"
 
   values  = [templatefile("${path.module}/values.yaml", { homepage_username = var.homepage_username, homepage_password = var.homepage_password, DB_PASSWORD = var.db_password, ENROLL_KEY = var.enroll_key, SLACK_WEBHOOK_URL = var.slack_webhook_url, mysql_host = var.mysql_host })]
-  timeout = 3600
+  timeout       = 600
+  wait          = true
+  wait_for_jobs = true
 }
 
 
@@ -365,7 +367,7 @@ resource "kubernetes_resource_quota" "crowdsec" {
   }
   spec {
     hard = {
-      "requests.cpu"    = "1"
+      "requests.cpu"    = "4"
       "requests.memory" = "8Gi"
       "limits.memory"   = "16Gi"
       pods              = "30"
diff --git a/stacks/platform/modules/monitoring/prometheus_chart_values.tpl b/stacks/platform/modules/monitoring/prometheus_chart_values.tpl
index 4f7912bd..ac74d0a3 100755
--- a/stacks/platform/modules/monitoring/prometheus_chart_values.tpl
+++ b/stacks/platform/modules/monitoring/prometheus_chart_values.tpl
@@ -229,14 +229,14 @@ serverFiles:
             annotations:
               summary: "CPU temp: {{ $value | printf \"%.0f\" }}°C (threshold: 75°C)"
           - alert: SSDHighWriteRate
-            expr: rate(node_disk_written_bytes_total{job="proxmox-host", device="sdb"}[2m]) / 1024 / 1024 > 2 # sdb is SSD; value in MB
+            expr: rate(node_disk_written_bytes_total{job="proxmox-host", device="sdb"}[2m]) / 1024 / 1024 > 2 and on() (time() - process_start_time_seconds{job="prometheus"}) > 900 # sdb is SSD; value in MB
             for: 10m
             labels:
               severity: info
             annotations:
               summary: "SSD write rate: {{ $value | printf \"%.1f\" }} MB/s (threshold: 2 MB/s)"
           - alert: HDDHighWriteRate
-            expr: rate(node_disk_written_bytes_total{job="proxmox-host", device="sdc"}[2m]) / 1024 / 1024 > 10 # sdc is 11TB HDD; value in MB
+            expr: rate(node_disk_written_bytes_total{job="proxmox-host", device="sdc"}[2m]) / 1024 / 1024 > 10 and on() (time() - process_start_time_seconds{job="prometheus"}) > 900 # sdc is 11TB HDD; value in MB
             for: 20m
             labels:
               severity: info
@@ -369,14 +369,14 @@ serverFiles:
       - name: K8s Health
         rules:
           - alert: PodCrashLooping
-            expr: increase(kube_pod_container_status_restarts_total[1h]) > 5
+            expr: increase(kube_pod_container_status_restarts_total[1h]) > 5 and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
             for: 15m
             labels:
               severity: warning
             annotations:
               summary: "{{ $labels.namespace }}/{{ $labels.pod }}: {{ $value | printf \"%.0f\" }} restarts in 1h"
           - alert: ContainerOOMKilled
-            expr: increase(container_oom_events_total{container!=""}[15m]) > 0
+            expr: increase(container_oom_events_total{container!=""}[15m]) > 0 and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
             for: 5m
             labels:
               severity: warning
@@ -416,7 +416,7 @@ serverFiles:
             annotations:
               summary: "Home Assistant down: {{ $labels.instance }}"
           - alert: CoreDNSErrors
-            expr: rate(coredns_dns_responses_total{rcode="SERVFAIL"}[5m]) > 1
+            expr: rate(coredns_dns_responses_total{rcode="SERVFAIL"}[5m]) > 1 and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
             for: 10m
             labels:
               severity: warning
@@ -681,6 +681,7 @@ serverFiles:
                 * 100
               ) > 10
               and sum(rate(traefik_service_requests_total{service!~".*nextcloud.*"}[5m])) by (service) > 0.1
+              and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
             for: 10m
             labels:
               severity: warning
@@ -694,6 +695,7 @@ serverFiles:
                 * 100
               ) > 30
               and sum(rate(traefik_service_requests_total{service!~".*nextcloud.*|.*grafana.*|.*linkwarden.*"}[5m])) by (service) > 0.1
+              and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
             for: 15m
             labels:
               severity: warning
@@ -704,6 +706,7 @@ serverFiles:
               histogram_quantile(0.99,
                 sum(rate(traefik_service_request_duration_seconds_bucket[5m])) by (service, le)
               ) > 10
+              and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
             for: 5m
             labels:
               severity: warning