add e2e email roundtrip monitoring

CronJob (every 30 min) sends test email via Mailgun API to
smoke-test@viktorbarzin.me, verifies IMAP delivery in spam@ catch-all,
deletes test email, pushes metrics to Pushgateway + Uptime Kuma.

Prometheus alerts: EmailRoundtripFailing, EmailRoundtripStale,
EmailRoundtripNeverRun. Uptime Kuma: SMTP/IMAP port checks + E2E push.
This commit is contained in:
Viktor Barzin 2026-03-25 22:50:22 +02:00
parent b9c2d7c1f6
commit 78dec8f0ad
5 changed files with 256 additions and 19 deletions

View file

@ -72,12 +72,12 @@ alertmanager:
- source_matchers:
- alertname = NodeDown
target_matchers:
- alertname =~ "NodeNotReady|NodeConditionBad|PodCrashLooping|ContainerOOMKilled|DeploymentReplicasMismatch|StatefulSetReplicasMismatch|DaemonSetMissingPods|ScrapeTargetDown|NodeLowFreeMemory|PostgreSQLDown|MySQLDown|RedisDown|HeadscaleDown|AuthentikDown|PoisonFountainDown|HackmdDown|PrivatebinDown|MailServerDown|NodeExporterDown|DockerRegistryDown|HomeAssistantDown|CloudflaredDown|TechnitiumDNSDown|iDRACRedfishMetricsMissing|iDRACSNMPMetricsMissing|HomeAssistantMetricsMissing"
- alertname =~ "NodeNotReady|NodeConditionBad|PodCrashLooping|ContainerOOMKilled|DeploymentReplicasMismatch|StatefulSetReplicasMismatch|DaemonSetMissingPods|ScrapeTargetDown|NodeLowFreeMemory|PostgreSQLDown|MySQLDown|RedisDown|HeadscaleDown|AuthentikDown|PoisonFountainDown|HackmdDown|PrivatebinDown|MailServerDown|EmailRoundtripFailing|EmailRoundtripStale|NodeExporterDown|DockerRegistryDown|HomeAssistantDown|CloudflaredDown|TechnitiumDNSDown|iDRACRedfishMetricsMissing|iDRACSNMPMetricsMissing|HomeAssistantMetricsMissing"
# NFS down causes mass pod failures and NFS-dependent service outages
- source_matchers:
- alertname = NFSServerUnresponsive
target_matchers:
- alertname =~ "PodCrashLooping|ContainerOOMKilled|DeploymentReplicasMismatch|StatefulSetReplicasMismatch|DaemonSetMissingPods|ScrapeTargetDown|PostgreSQLDown|MySQLDown|RedisDown|AuthentikDown|PoisonFountainDown|HackmdDown|PrivatebinDown|MailServerDown|HomeAssistantDown"
- alertname =~ "PodCrashLooping|ContainerOOMKilled|DeploymentReplicasMismatch|StatefulSetReplicasMismatch|DaemonSetMissingPods|ScrapeTargetDown|PostgreSQLDown|MySQLDown|RedisDown|AuthentikDown|PoisonFountainDown|HackmdDown|PrivatebinDown|MailServerDown|EmailRoundtripFailing|EmailRoundtripStale|HomeAssistantDown"
# Traefik down makes service-level alerts noise
- source_matchers:
- alertname = TraefikDown
@ -1154,6 +1154,27 @@ serverFiles:
severity: warning
annotations:
summary: "Mail server has no available replicas - mail may not be received"
- alert: EmailRoundtripFailing
expr: email_roundtrip_success{job="email-roundtrip-monitor"} == 0
for: 90m
labels:
severity: warning
annotations:
summary: "Email round-trip probe failing. Check Mailgun relay, DNS, and IMAP."
- alert: EmailRoundtripStale
expr: (time() - email_roundtrip_last_success_timestamp{job="email-roundtrip-monitor"}) > 5400
for: 30m
labels:
severity: warning
annotations:
summary: "Email round-trip probe has not succeeded in >90 min"
- alert: EmailRoundtripNeverRun
expr: absent(email_roundtrip_success{job="email-roundtrip-monitor"})
for: 2h
labels:
severity: warning
annotations:
summary: "Email round-trip monitor never reported - check CronJob in mailserver namespace"
- alert: HackmdDown
expr: (kube_deployment_status_replicas_available{namespace="hackmd"} or on() vector(0)) < 1
for: 5m
@ -1225,6 +1246,29 @@ serverFiles:
severity: warning
annotations:
summary: "High DNS SERVFAIL rate: {{ $value | printf \"%.0f\" }} failures detected"
- name: qbittorrent
rules:
- alert: QBittorrentMAMRatioLow
expr: qbt_tracker_ratio{tracker="mam"} < 1.0
for: 1h
labels:
severity: warning
annotations:
summary: "MAM ratio is {{ $value | printf \"%.2f\" }} (must be >= 1.0)"
- alert: QBittorrentDisconnected
expr: qbt_connected == 0
for: 5m
labels:
severity: critical
annotations:
summary: "qBittorrent is disconnected from the network"
- alert: QBittorrentMAMUnsatisfied
expr: qbt_tracker_unsatisfied{tracker="mam"} > 15
for: 10m
labels:
severity: warning
annotations:
summary: "{{ $value | printf \"%.0f\" }} MAM torrents not yet seeded 72h (limit: 20 for new members)"
extraScrapeConfigs: |
- job_name: 'proxmox-host'