fix noisy JobFailed and duplicate mail server alerts
- JobFailed: only alert on jobs started within the last hour, so stale failed CronJob runs don't keep firing after subsequent runs succeed
- Mail server alert: renamed to MailServerDown, now targets the specific mailserver deployment instead of all deployments in the namespace (was falsely triggering on roundcubemail going down)
- Updated inhibition rule to use new MailServerDown alert name
This commit is contained in:
parent
33c7976630
commit
ad8b90575e
1 changed file with 8 additions and 5 deletions
|
|
@@ -82,7 +82,7 @@ alertmanager:
|
||||||
- source_matchers:
|
- source_matchers:
|
||||||
- alertname = NodeDown
|
- alertname = NodeDown
|
||||||
target_matchers:
|
target_matchers:
|
||||||
- alertname =~ "PostgreSQLDown|MySQLDown|RedisDown|HeadscaleDown|AuthentikDown|PoisonFountainDown|LokiDown|HackmdDown|PrivatebinDown|Mail server has no replicas available"
|
- alertname =~ "PostgreSQLDown|MySQLDown|RedisDown|HeadscaleDown|AuthentikDown|PoisonFountainDown|LokiDown|HackmdDown|PrivatebinDown|MailServerDown"
|
||||||
# NFS down causes mass pod failures
|
# NFS down causes mass pod failures
|
||||||
- source_matchers:
|
- source_matchers:
|
||||||
- alertname = NFSServerUnresponsive
|
- alertname = NFSServerUnresponsive
|
||||||
|
|
@@ -396,7 +396,10 @@ serverFiles:
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Node {{ $labels.node }}: {{ $labels.condition }}"
|
summary: "Node {{ $labels.node }}: {{ $labels.condition }}"
|
||||||
- alert: JobFailed
|
- alert: JobFailed
|
||||||
expr: kube_job_status_failed > 0
|
expr: |
|
||||||
|
kube_job_status_failed > 0
|
||||||
|
and on(namespace, job_name)
|
||||||
|
(time() - kube_job_status_start_time) < 3600
|
||||||
for: 30m
|
for: 30m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
|
@@ -665,13 +668,13 @@ serverFiles:
|
||||||
# severity: page
|
# severity: page
|
||||||
# annotations:
|
# annotations:
|
||||||
# summary: OpenWRT high memory usage. Can cause services getting stuck.
|
# summary: OpenWRT high memory usage. Can cause services getting stuck.
|
||||||
- alert: Mail server has no replicas available
|
- alert: MailServerDown
|
||||||
expr: (kube_deployment_status_replicas_available{namespace="mailserver"} or on() vector(0)) < 1
|
expr: (kube_deployment_status_replicas_available{namespace="mailserver", deployment="mailserver"} or on() vector(0)) < 1
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Mail server has no available replicas. This means mail may not be received.
|
summary: "Mail server has no available replicas - mail may not be received"
|
||||||
- alert: HackmdDown
|
- alert: HackmdDown
|
||||||
expr: (kube_deployment_status_replicas_available{namespace="hackmd"} or on() vector(0)) < 1
|
expr: (kube_deployment_status_replicas_available{namespace="hackmd"} or on() vector(0)) < 1
|
||||||
for: 5m
|
for: 5m
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue