add e2e email roundtrip monitoring
A CronJob (every 30 min) sends a test email via the Mailgun API to smoke-test@viktorbarzin.me, verifies IMAP delivery in the spam@ catch-all, deletes the test email, and pushes metrics to Pushgateway and Uptime Kuma. Prometheus alerts: EmailRoundtripFailing, EmailRoundtripStale, EmailRoundtripNeverRun. Uptime Kuma: SMTP/IMAP port checks plus E2E push.
This commit is contained in:
parent
b9c2d7c1f6
commit
78dec8f0ad
5 changed files with 256 additions and 19 deletions
|
|
@@ -72,12 +72,12 @@ alertmanager:
|
|||
- source_matchers:
|
||||
- alertname = NodeDown
|
||||
target_matchers:
|
||||
- alertname =~ "NodeNotReady|NodeConditionBad|PodCrashLooping|ContainerOOMKilled|DeploymentReplicasMismatch|StatefulSetReplicasMismatch|DaemonSetMissingPods|ScrapeTargetDown|NodeLowFreeMemory|PostgreSQLDown|MySQLDown|RedisDown|HeadscaleDown|AuthentikDown|PoisonFountainDown|HackmdDown|PrivatebinDown|MailServerDown|NodeExporterDown|DockerRegistryDown|HomeAssistantDown|CloudflaredDown|TechnitiumDNSDown|iDRACRedfishMetricsMissing|iDRACSNMPMetricsMissing|HomeAssistantMetricsMissing"
|
||||
- alertname =~ "NodeNotReady|NodeConditionBad|PodCrashLooping|ContainerOOMKilled|DeploymentReplicasMismatch|StatefulSetReplicasMismatch|DaemonSetMissingPods|ScrapeTargetDown|NodeLowFreeMemory|PostgreSQLDown|MySQLDown|RedisDown|HeadscaleDown|AuthentikDown|PoisonFountainDown|HackmdDown|PrivatebinDown|MailServerDown|EmailRoundtripFailing|EmailRoundtripStale|NodeExporterDown|DockerRegistryDown|HomeAssistantDown|CloudflaredDown|TechnitiumDNSDown|iDRACRedfishMetricsMissing|iDRACSNMPMetricsMissing|HomeAssistantMetricsMissing"
|
||||
# NFS down causes mass pod failures and NFS-dependent service outages
|
||||
- source_matchers:
|
||||
- alertname = NFSServerUnresponsive
|
||||
target_matchers:
|
||||
- alertname =~ "PodCrashLooping|ContainerOOMKilled|DeploymentReplicasMismatch|StatefulSetReplicasMismatch|DaemonSetMissingPods|ScrapeTargetDown|PostgreSQLDown|MySQLDown|RedisDown|AuthentikDown|PoisonFountainDown|HackmdDown|PrivatebinDown|MailServerDown|HomeAssistantDown"
|
||||
- alertname =~ "PodCrashLooping|ContainerOOMKilled|DeploymentReplicasMismatch|StatefulSetReplicasMismatch|DaemonSetMissingPods|ScrapeTargetDown|PostgreSQLDown|MySQLDown|RedisDown|AuthentikDown|PoisonFountainDown|HackmdDown|PrivatebinDown|MailServerDown|EmailRoundtripFailing|EmailRoundtripStale|HomeAssistantDown"
|
||||
# Traefik down makes service-level alerts noise
|
||||
- source_matchers:
|
||||
- alertname = TraefikDown
|
||||
|
|
@@ -1154,6 +1154,27 @@ serverFiles:
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: "Mail server has no available replicas - mail may not be received"
|
||||
- alert: EmailRoundtripFailing
|
||||
expr: email_roundtrip_success{job="email-roundtrip-monitor"} == 0
|
||||
for: 90m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Email round-trip probe failing. Check Mailgun relay, DNS, and IMAP."
|
||||
- alert: EmailRoundtripStale
|
||||
expr: (time() - email_roundtrip_last_success_timestamp{job="email-roundtrip-monitor"}) > 5400
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Email round-trip probe has not succeeded in >90 min"
|
||||
- alert: EmailRoundtripNeverRun
|
||||
expr: absent(email_roundtrip_success{job="email-roundtrip-monitor"})
|
||||
for: 2h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Email round-trip monitor never reported - check CronJob in mailserver namespace"
|
||||
- alert: HackmdDown
|
||||
expr: (kube_deployment_status_replicas_available{namespace="hackmd"} or on() vector(0)) < 1
|
||||
for: 5m
|
||||
|
|
@@ -1225,6 +1246,29 @@ serverFiles:
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: "High DNS SERVFAIL rate: {{ $value | printf \"%.0f\" }} failures detected"
|
||||
- name: qbittorrent
|
||||
rules:
|
||||
- alert: QBittorrentMAMRatioLow
|
||||
expr: qbt_tracker_ratio{tracker="mam"} < 1.0
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "MAM ratio is {{ $value | printf \"%.2f\" }} (must be >= 1.0)"
|
||||
- alert: QBittorrentDisconnected
|
||||
expr: qbt_connected == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "qBittorrent is disconnected from the network"
|
||||
- alert: QBittorrentMAMUnsatisfied
|
||||
expr: qbt_tracker_unsatisfied{tracker="mam"} > 15
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "{{ $value | printf \"%.0f\" }} MAM torrents not yet seeded 72h (limit: 20 for new members)"
|
||||
|
||||
extraScrapeConfigs: |
|
||||
- job_name: 'proxmox-host'
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue