reduce alert noise: add cascade inhibitions, increase for durations, drop Loki alerts
- NodeDown now suppresses workload and service alerts (PodCrashLooping, DeploymentReplicasMismatch, StatefulSetReplicasMismatch, etc.)
- NFSServerUnresponsive suppresses pod-level alerts
- Increased `for` durations on transient alerts (e.g. 15m→30m for replica mismatches)
- NodeDown `for`: 1m→3m to avoid flapping
- Removed all 3 Loki log-based alerts (they duplicated Prometheus alerts)
- Downgraded HeadscaleDown critical→warning, mail server page→warning
This commit is contained in:
parent
4978804404
commit
e6c0c39ae7
2 changed files with 30 additions and 50 deletions
|
|
@@ -109,44 +109,7 @@ resource "kubernetes_config_map" "loki_alert_rules" {
|
|||
}
|
||||
data = {
|
||||
"rules.yaml" = yamlencode({
|
||||
groups = [{
|
||||
name = "log-alerts"
|
||||
rules = [
|
||||
{
|
||||
alert = "HighErrorRate"
|
||||
expr = "sum(rate({namespace=~\".+\"} |= \"error\" [5m])) by (namespace) > 10"
|
||||
for = "5m"
|
||||
labels = {
|
||||
severity = "warning"
|
||||
}
|
||||
annotations = {
|
||||
summary = "High error rate in {{ $labels.namespace }}"
|
||||
}
|
||||
},
|
||||
{
|
||||
alert = "PodCrashLoopBackOff"
|
||||
expr = "count_over_time({namespace=~\".+\"} |= \"CrashLoopBackOff\" [5m]) > 0"
|
||||
for = "1m"
|
||||
labels = {
|
||||
severity = "critical"
|
||||
}
|
||||
annotations = {
|
||||
summary = "CrashLoopBackOff detected in {{ $labels.namespace }}"
|
||||
}
|
||||
},
|
||||
{
|
||||
alert = "OOMKilled"
|
||||
expr = "count_over_time({namespace=~\".+\"} |= \"OOMKilled\" [5m]) > 0"
|
||||
for = "1m"
|
||||
labels = {
|
||||
severity = "critical"
|
||||
}
|
||||
annotations = {
|
||||
summary = "OOMKilled detected in {{ $labels.namespace }}"
|
||||
}
|
||||
}
|
||||
]
|
||||
}]
|
||||
groups = []
|
||||
})
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue