reduce alert noise: add cascade inhibitions, increase `for` durations, drop Loki alerts

- NodeDown now suppresses workload and service alerts (PodCrashLooping,
  DeploymentReplicasMismatch, StatefulSetReplicasMismatch, etc.)
- NFSServerUnresponsive suppresses pod-level alerts
- Increased `for` durations on transient alerts (e.g. 15m→30m for replica mismatches)
- NodeDown `for`: 1m→3m to avoid flapping
- Removed all 3 Loki log-based alerts (they duplicated existing Prometheus alerts)
- Downgraded HeadscaleDown critical→warning, mail server page→warning
This commit is contained in:
Viktor Barzin 2026-03-08 21:13:16 +00:00
parent 4978804404
commit e6c0c39ae7
No known key found for this signature in database
GPG key ID: 0EB088298288D958
2 changed files with 30 additions and 50 deletions

View file

@ -109,44 +109,7 @@ resource "kubernetes_config_map" "loki_alert_rules" {
}
data = {
"rules.yaml" = yamlencode({
groups = [{
name = "log-alerts"
rules = [
{
alert = "HighErrorRate"
expr = "sum(rate({namespace=~\".+\"} |= \"error\" [5m])) by (namespace) > 10"
for = "5m"
labels = {
severity = "warning"
}
annotations = {
summary = "High error rate in {{ $labels.namespace }}"
}
},
{
alert = "PodCrashLoopBackOff"
expr = "count_over_time({namespace=~\".+\"} |= \"CrashLoopBackOff\" [5m]) > 0"
for = "1m"
labels = {
severity = "critical"
}
annotations = {
summary = "CrashLoopBackOff detected in {{ $labels.namespace }}"
}
},
{
alert = "OOMKilled"
expr = "count_over_time({namespace=~\".+\"} |= \"OOMKilled\" [5m]) > 0"
for = "1m"
labels = {
severity = "critical"
}
annotations = {
summary = "OOMKilled detected in {{ $labels.namespace }}"
}
}
]
}]
groups = []
})
}
}