Reduce downtime during platform stack applies

CrowdSec Helm fix:
- Increase ResourceQuota requests.cpu from 1 to 4 — pods were at 302%
  of quota, preventing scheduling during rolling upgrades
- Reduce Helm timeout from 3600s to 600s — 1 hour hang is excessive
- Add wait=true and wait_for_jobs=true for proper readiness checking

Prometheus startup guard:
- Add startup guard to 8 rate/increase-based alerts that false-fire
  after Prometheus restarts (rate()/increase() need at least two samples in the range window):
  PodCrashLooping, ContainerOOMKilled, CoreDNSErrors,
  HighServiceErrorRate, HighService4xxRate, HighServiceLatency,
  SSDHighWriteRate, HDDHighWriteRate
- Guard: and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
  suppresses these alerts for the first 15m (900s) after Prometheus startup
This commit is contained in:
Viktor Barzin 2026-03-14 12:09:09 +00:00 committed by Viktor Barzin
parent 44f6614bf9
commit a66a8d0de2
2 changed files with 12 additions and 7 deletions

View file

@ -107,7 +107,9 @@ resource "helm_release" "crowdsec" {
chart = "crowdsec" chart = "crowdsec"
values = [templatefile("${path.module}/values.yaml", { homepage_username = var.homepage_username, homepage_password = var.homepage_password, DB_PASSWORD = var.db_password, ENROLL_KEY = var.enroll_key, SLACK_WEBHOOK_URL = var.slack_webhook_url, mysql_host = var.mysql_host })] values = [templatefile("${path.module}/values.yaml", { homepage_username = var.homepage_username, homepage_password = var.homepage_password, DB_PASSWORD = var.db_password, ENROLL_KEY = var.enroll_key, SLACK_WEBHOOK_URL = var.slack_webhook_url, mysql_host = var.mysql_host })]
timeout = 3600 timeout = 600
wait = true
wait_for_jobs = true
} }
@ -365,7 +367,7 @@ resource "kubernetes_resource_quota" "crowdsec" {
} }
spec { spec {
hard = { hard = {
"requests.cpu" = "1" "requests.cpu" = "4"
"requests.memory" = "8Gi" "requests.memory" = "8Gi"
"limits.memory" = "16Gi" "limits.memory" = "16Gi"
pods = "30" pods = "30"

View file

@ -229,14 +229,14 @@ serverFiles:
annotations: annotations:
summary: "CPU temp: {{ $value | printf \"%.0f\" }}°C (threshold: 75°C)" summary: "CPU temp: {{ $value | printf \"%.0f\" }}°C (threshold: 75°C)"
- alert: SSDHighWriteRate - alert: SSDHighWriteRate
expr: rate(node_disk_written_bytes_total{job="proxmox-host", device="sdb"}[2m]) / 1024 / 1024 > 2 # sdb is SSD; value in MB expr: rate(node_disk_written_bytes_total{job="proxmox-host", device="sdb"}[2m]) / 1024 / 1024 > 2 and on() (time() - process_start_time_seconds{job="prometheus"}) > 900 # sdb is SSD; value in MB
for: 10m for: 10m
labels: labels:
severity: info severity: info
annotations: annotations:
summary: "SSD write rate: {{ $value | printf \"%.1f\" }} MB/s (threshold: 2 MB/s)" summary: "SSD write rate: {{ $value | printf \"%.1f\" }} MB/s (threshold: 2 MB/s)"
- alert: HDDHighWriteRate - alert: HDDHighWriteRate
expr: rate(node_disk_written_bytes_total{job="proxmox-host", device="sdc"}[2m]) / 1024 / 1024 > 10 # sdc is 11TB HDD; value in MB expr: rate(node_disk_written_bytes_total{job="proxmox-host", device="sdc"}[2m]) / 1024 / 1024 > 10 and on() (time() - process_start_time_seconds{job="prometheus"}) > 900 # sdc is 11TB HDD; value in MB
for: 20m for: 20m
labels: labels:
severity: info severity: info
@ -369,14 +369,14 @@ serverFiles:
- name: K8s Health - name: K8s Health
rules: rules:
- alert: PodCrashLooping - alert: PodCrashLooping
expr: increase(kube_pod_container_status_restarts_total[1h]) > 5 expr: increase(kube_pod_container_status_restarts_total[1h]) > 5 and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
for: 15m for: 15m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "{{ $labels.namespace }}/{{ $labels.pod }}: {{ $value | printf \"%.0f\" }} restarts in 1h" summary: "{{ $labels.namespace }}/{{ $labels.pod }}: {{ $value | printf \"%.0f\" }} restarts in 1h"
- alert: ContainerOOMKilled - alert: ContainerOOMKilled
expr: increase(container_oom_events_total{container!=""}[15m]) > 0 expr: increase(container_oom_events_total{container!=""}[15m]) > 0 and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
for: 5m for: 5m
labels: labels:
severity: warning severity: warning
@ -416,7 +416,7 @@ serverFiles:
annotations: annotations:
summary: "Home Assistant down: {{ $labels.instance }}" summary: "Home Assistant down: {{ $labels.instance }}"
- alert: CoreDNSErrors - alert: CoreDNSErrors
expr: rate(coredns_dns_responses_total{rcode="SERVFAIL"}[5m]) > 1 expr: rate(coredns_dns_responses_total{rcode="SERVFAIL"}[5m]) > 1 and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
for: 10m for: 10m
labels: labels:
severity: warning severity: warning
@ -681,6 +681,7 @@ serverFiles:
* 100 * 100
) > 10 ) > 10
and sum(rate(traefik_service_requests_total{service!~".*nextcloud.*"}[5m])) by (service) > 0.1 and sum(rate(traefik_service_requests_total{service!~".*nextcloud.*"}[5m])) by (service) > 0.1
and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
for: 10m for: 10m
labels: labels:
severity: warning severity: warning
@ -694,6 +695,7 @@ serverFiles:
* 100 * 100
) > 30 ) > 30
and sum(rate(traefik_service_requests_total{service!~".*nextcloud.*|.*grafana.*|.*linkwarden.*"}[5m])) by (service) > 0.1 and sum(rate(traefik_service_requests_total{service!~".*nextcloud.*|.*grafana.*|.*linkwarden.*"}[5m])) by (service) > 0.1
and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
for: 15m for: 15m
labels: labels:
severity: warning severity: warning
@ -704,6 +706,7 @@ serverFiles:
histogram_quantile(0.99, histogram_quantile(0.99,
sum(rate(traefik_service_request_duration_seconds_bucket[5m])) by (service, le) sum(rate(traefik_service_request_duration_seconds_bucket[5m])) by (service, le)
) > 10 ) > 10
and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
for: 5m for: 5m
labels: labels:
severity: warning severity: warning