reduce alert noise: remove 4 memory alerts, raise latency threshold [ci skip]
- Remove ClusterMemoryRequestsHigh, ContainerNearOOM, NodeLowFreeMemory, NodeMemoryPressureTrending — all fire regularly due to intentional memory overcommit and are not actionable
- Keep ContainerOOMKilled (actionable — container actually died)
- Raise HighServiceLatency p99 threshold from 10s to 30s to ignore transient spikes
This commit is contained in:
parent
2445edea8f
commit
b8a5740138
1 changed files with 2 additions and 30 deletions
|
|
@@ -514,20 +514,6 @@ serverFiles:
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: "{{ $labels.namespace }}/{{ $labels.pod }}/{{ $labels.container }}: {{ $value | printf \"%.0f\" }} OOM kill(s) in 15m"
|
||||
- alert: ClusterMemoryRequestsHigh
|
||||
expr: sum(kube_pod_container_resource_requests{resource="memory"}) / sum(kube_node_status_allocatable{resource="memory"}) * 100 > 92
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Cluster memory requests: {{ $value | printf \"%.0f\" }}% of allocatable (threshold: 92%)"
|
||||
- alert: ContainerNearOOM
|
||||
expr: (container_memory_working_set_bytes{container!=""} / container_spec_memory_limit_bytes{container!=""} * 100 > 90) and container_spec_memory_limit_bytes{container!=""} > 0
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "{{ $labels.namespace }}/{{ $labels.pod }}/{{ $labels.container }}: {{ $value | printf \"%.0f\" }}% of memory limit (threshold: 90%)"
|
||||
- alert: PodUnschedulable
|
||||
expr: kube_pod_status_conditions{condition="PodScheduled", status="false"} == 1
|
||||
for: 5m
|
||||
|
|
@@ -880,13 +866,6 @@ serverFiles:
|
|||
severity: info
|
||||
annotations:
|
||||
summary: "CPU usage on {{ $labels.node }}: {{ $value | printf \"%.0f\" }}% (threshold: 60%)"
|
||||
- alert: NodeLowFreeMemory
|
||||
expr: ((1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) or on() vector(1)) * 100 > 95
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Memory usage on {{ $labels.node }}: {{ $value | printf \"%.0f\" }}% (threshold: 95%)"
|
||||
# - name: PodStuckNotReady
|
||||
# rules:
|
||||
# - alert: PodStuckNotReady
|
||||
|
|
@@ -929,13 +908,6 @@ serverFiles:
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: "{{ $labels.namespace }}/{{ $labels.daemonset }}: {{ $value | printf \"%.0f\" }} pod(s) missing"
|
||||
- alert: NodeMemoryPressureTrending
|
||||
expr: ((1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100) > 92
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Memory usage on {{ $labels.instance }}: {{ $value | printf \"%.0f\" }}% (threshold: 92%)"
|
||||
- alert: NodeExporterDown
|
||||
expr: up{job="prometheus-prometheus-node-exporter"} == 0
|
||||
for: 5m
|
||||
|
|
@@ -1042,13 +1014,13 @@ serverFiles:
|
|||
expr: |
|
||||
histogram_quantile(0.99,
|
||||
sum(rate(traefik_service_request_duration_seconds_bucket{service!~".*idrac.*"}[5m])) by (service, le)
|
||||
) > 10
|
||||
) > 30
|
||||
and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "p99 latency on {{ $labels.service }}: {{ $value | printf \"%.1f\" }}s (threshold: 10s)"
|
||||
summary: "p99 latency on {{ $labels.service }}: {{ $value | printf \"%.1f\" }}s (threshold: 30s)"
|
||||
- alert: TLSCertExpiringSoon
|
||||
expr: (traefik_tls_certs_not_after - time()) / 86400 < 7
|
||||
for: 1h
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue