From 78dec8f0adf3a3b8a7837c4c8f6a15ba6e772ca7 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Wed, 25 Mar 2026 22:50:22 +0200 Subject: [PATCH] add e2e email roundtrip monitoring CronJob (every 30 min) sends test email via Mailgun API to smoke-test@viktorbarzin.me, verifies IMAP delivery in spam@ catch-all, deletes test email, pushes metrics to Pushgateway + Uptime Kuma. Prometheus alerts: EmailRoundtripFailing, EmailRoundtripStale, EmailRoundtripNeverRun. Uptime Kuma: SMTP/IMAP port checks + E2E push. --- .claude/CLAUDE.md | 1 + docs/architecture/monitoring.md | 31 +++- stacks/mailserver/main.tf | 27 +-- stacks/mailserver/modules/mailserver/main.tf | 168 +++++++++++++++++- .../monitoring/prometheus_chart_values.tpl | 48 ++++- 5 files changed, 256 insertions(+), 19 deletions(-) diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md index 72d03775..0857aedb 100755 --- a/.claude/CLAUDE.md +++ b/.claude/CLAUDE.md @@ -116,6 +116,7 @@ Repo IDs: infra=1, Website=2, finance=3, health=4, travel_blog=5, webhook-handle - Exclude completed CronJob pods from "pod not ready" alerts. - Every new service gets Prometheus scrape config + Uptime Kuma monitor. - Key alerts: OOMKill, pod replica mismatch, 4xx/5xx error rates, UPS battery, CPU temp, SSD writes, NFS responsiveness, ClusterMemoryRequestsHigh (>85%), ContainerNearOOM (>85% limit), PodUnschedulable. +- **E2E email monitoring**: CronJob `email-roundtrip-monitor` (every 30 min) sends test email via Mailgun API to `smoke-test@viktorbarzin.me` (catch-all → `spam@`), verifies IMAP delivery, deletes test email, pushes metrics to Pushgateway + Uptime Kuma. Alerts: `EmailRoundtripFailing` (90m), `EmailRoundtripStale` (90m), `EmailRoundtripNeverRun` (2h). Vault: `mailgun_api_key` in `secret/viktor`. 
## Storage & Backup Architecture diff --git a/docs/architecture/monitoring.md b/docs/architecture/monitoring.md index 960b155a..97477ca1 100644 --- a/docs/architecture/monitoring.md +++ b/docs/architecture/monitoring.md @@ -15,6 +15,7 @@ graph TB GPU[NVIDIA GPU via dcgm-exporter] UPS[UPS Exporter] NFS[NFS Exporter] + EMAIL[Email Roundtrip Probe<br/>
CronJob every 30m] end subgraph "Monitoring Stack (platform stack)" @@ -45,6 +46,8 @@ graph TB AM --> INHIBIT INHIBIT --> NOTIFY + EMAIL -->|Pushgateway| PROM + EMAIL -.->|Push| UPTIME PODS -.->|HTTP Health| UPTIME ``` @@ -52,12 +55,13 @@ graph TB | Component | Version | Location | Purpose | |-----------|---------|----------|---------| -| Prometheus | Latest (Diun monitored) | `stacks/platform/modules/monitoring/` | Metrics collection and storage, scrape configs for all services | -| Grafana | Latest (Diun monitored) | `stacks/platform/modules/monitoring/` | Visualization, 14+ dashboards (API server, CoreDNS, GPU, UPS, etc.) | -| Loki | Latest (Diun monitored) | `stacks/platform/modules/monitoring/` | Log aggregation and querying | -| Alertmanager | Latest (Diun monitored) | `stacks/platform/modules/monitoring/` | Alert routing with cascade inhibitions | -| Uptime Kuma | Latest (Diun monitored) | `stacks/platform/modules/monitoring/` | Per-service HTTP monitors, status page | -| dcgm-exporter | Configurable resources | `stacks/platform/modules/monitoring/` | NVIDIA GPU metrics collection | +| Prometheus | Latest (Diun monitored) | `stacks/monitoring/modules/monitoring/` | Metrics collection and storage, scrape configs for all services | +| Grafana | Latest (Diun monitored) | `stacks/monitoring/modules/monitoring/` | Visualization, 14+ dashboards (API server, CoreDNS, GPU, UPS, etc.) 
| +| Loki | Latest (Diun monitored) | `stacks/monitoring/modules/monitoring/` | Log aggregation and querying | +| Alertmanager | Latest (Diun monitored) | `stacks/monitoring/modules/monitoring/` | Alert routing with cascade inhibitions | +| Uptime Kuma | Latest (Diun monitored) | `stacks/monitoring/modules/monitoring/` | Per-service HTTP monitors, status page | +| dcgm-exporter | Configurable resources | `stacks/monitoring/modules/monitoring/` | NVIDIA GPU metrics collection | +| Email Roundtrip Probe | Python 3.12 | `stacks/mailserver/modules/mailserver/` | E2E email delivery verification via Mailgun API + IMAP | ## How It Works @@ -143,6 +147,21 @@ spec: #### Application Alerts - **4xx/5xx Error Rates**: HTTP error rate threshold exceeded +#### Email Monitoring Alerts +- **EmailRoundtripFailing**: E2E email probe returning failure for >90m +- **EmailRoundtripStale**: Last successful email round-trip is >90m old (condition must hold for a further 30m before the alert fires) +- **EmailRoundtripNeverRun**: No `email_roundtrip_success` metric present for 2h (CronJob not running or not pushing) + +The email monitoring system uses a CronJob (`email-roundtrip-monitor`, every 30 min) in the `mailserver` namespace that: +1. Sends a test email via Mailgun HTTP API to `smoke-test@viktorbarzin.me` +2. Waits for MX delivery to land the email in the `spam@` catch-all mailbox +3. Verifies delivery via IMAP (searches by UUID marker in subject) +4. Deletes the test email immediately +5. Pushes metrics (`email_roundtrip_success`, `email_roundtrip_duration_seconds`, `email_roundtrip_last_success_timestamp`) to Prometheus Pushgateway +6. Pushes an `up` heartbeat to the Uptime Kuma E2E Push monitor (only when the probe succeeded) + +Uptime Kuma also has TCP monitors for SMTP (port 25) and IMAP (port 993) on `10.0.20.200`. 
+ #### Backup Alerts - **PostgreSQLBackupStale**: >36h since last backup - **MySQLBackupStale**: >36h since last backup diff --git a/stacks/mailserver/main.tf b/stacks/mailserver/main.tf index bff786cc..9ca6a504 100644 --- a/stacks/mailserver/main.tf +++ b/stacks/mailserver/main.tf @@ -11,6 +11,11 @@ data "vault_kv_secret_v2" "secrets" { name = "platform" } +data "vault_kv_secret_v2" "viktor" { + mount = "secret" + name = "viktor" +} + locals { mailserver_accounts = jsondecode(data.vault_kv_secret_v2.secrets.data["mailserver_accounts"]) mailserver_aliases = jsondecode(data.vault_kv_secret_v2.secrets.data["mailserver_aliases"]) @@ -19,14 +24,16 @@ locals { } module "mailserver" { - source = "./modules/mailserver" - tls_secret_name = var.tls_secret_name - nfs_server = var.nfs_server - mysql_host = var.mysql_host - mailserver_accounts = local.mailserver_accounts - postfix_account_aliases = local.mailserver_aliases - opendkim_key = local.mailserver_opendkim_key - sasl_passwd = local.mailserver_sasl_passwd - roundcube_db_password = data.vault_kv_secret_v2.secrets.data["mailserver_roundcubemail_db_password"] - tier = local.tiers.edge + source = "./modules/mailserver" + tls_secret_name = var.tls_secret_name + nfs_server = var.nfs_server + mysql_host = var.mysql_host + mailserver_accounts = local.mailserver_accounts + postfix_account_aliases = local.mailserver_aliases + opendkim_key = local.mailserver_opendkim_key + sasl_passwd = local.mailserver_sasl_passwd + roundcube_db_password = data.vault_kv_secret_v2.secrets.data["mailserver_roundcubemail_db_password"] + tier = local.tiers.edge + mailgun_api_key = data.vault_kv_secret_v2.viktor.data["mailgun_api_key"] + email_monitor_imap_password = local.mailserver_accounts["spam@viktorbarzin.me"] } diff --git a/stacks/mailserver/modules/mailserver/main.tf b/stacks/mailserver/modules/mailserver/main.tf index 1f4333f4..2a52f41d 100644 --- a/stacks/mailserver/modules/mailserver/main.tf +++ 
b/stacks/mailserver/modules/mailserver/main.tf
@@ -5,6 +5,14 @@ variable "postfix_account_aliases" {}
 variable "opendkim_key" {}
 variable "sasl_passwd" {} # For sendgrid i.e relayhost
 variable "nfs_server" { type = string }
+variable "mailgun_api_key" {
+  type      = string
+  sensitive = true
+}
+variable "email_monitor_imap_password" {
+  type      = string
+  sensitive = true
+}
 
 resource "kubernetes_namespace" "mailserver" {
   metadata {
@@ -466,7 +474,7 @@ resource "kubernetes_service" "mailserver" {
   }
 
   spec {
-    type = "LoadBalancer"
+    type                    = "LoadBalancer"
     external_traffic_policy = "Cluster"
     selector = {
       app = "mailserver"
@@ -502,3 +510,181 @@ resource "kubernetes_service" "mailserver" {
   }
 }
 
+# =============================================================================
+# E2E Email Roundtrip Monitor
+# Sends test email via Mailgun API, verifies delivery via IMAP, pushes metrics
+#
+# The job exits 0 when the probe mail was found over IMAP and 1 otherwise.
+# Metrics are sent to Pushgateway with POST, and
+# email_roundtrip_last_success_timestamp is only included on success, so a
+# failed probe leaves the previously pushed success time in place.
+# =============================================================================
+resource "kubernetes_cron_job_v1" "email_roundtrip_monitor" {
+  metadata {
+    name      = "email-roundtrip-monitor"
+    namespace = kubernetes_namespace.mailserver.metadata[0].name
+  }
+  spec {
+    concurrency_policy            = "Replace"
+    failed_jobs_history_limit     = 3
+    successful_jobs_history_limit = 3
+    schedule                      = "*/30 * * * *"
+    job_template {
+      metadata {}
+      spec {
+        backoff_limit              = 1
+        ttl_seconds_after_finished = 300
+        template {
+          metadata {}
+          spec {
+            container {
+              name  = "email-roundtrip"
+              image = "docker.io/library/python:3.12-alpine"
+              # NOTE: the script is wrapped in shell single quotes, so it must not
+              # contain a single-quote character anywhere (code or comments).
+              command = ["/bin/sh", "-c", <<-EOT
+                pip install --quiet --disable-pip-version-check requests && python3 -c '
+import requests, imaplib, time, os, uuid, sys, ssl
+
+MAILGUN_API_KEY = os.environ["MAILGUN_API_KEY"]
+IMAP_USER = "spam@viktorbarzin.me"
+IMAP_PASS = os.environ["EMAIL_MONITOR_IMAP_PASSWORD"]
+IMAP_HOST = "mailserver.mailserver.svc.cluster.local"
+PUSHGATEWAY = "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/email-roundtrip-monitor"
+DOMAIN = "viktorbarzin.me"
+
+marker = f"e2e-probe-{uuid.uuid4().hex[:12]}"
+subject = f"[E2E Monitor] {marker}"
+start = time.time()
+success = 0
+duration = 0
+
+try:
+    # Step 1: Send via Mailgun HTTP API to smoke-test@ (hits catch-all -> spam@)
+    resp = requests.post(
+        f"https://api.eu.mailgun.net/v3/{DOMAIN}/messages",
+        auth=("api", MAILGUN_API_KEY),
+        data={
+            "from": f"monitoring@{DOMAIN}",
+            "to": f"smoke-test@{DOMAIN}",
+            "subject": subject,
+            "text": f"E2E email monitoring probe {marker}. Auto-generated, will be deleted.",
+        },
+        timeout=30,
+    )
+    resp.raise_for_status()
+    print(f"Sent test email via Mailgun: {resp.status_code} marker={marker}")
+
+    # Step 2: Wait for delivery, retry IMAP up to 3 min
+    # NOTE(review): certificate verification is disabled; presumably the served cert does not cover the cluster-local name - confirm
+    ctx = ssl.create_default_context()
+    ctx.check_hostname = False
+    ctx.verify_mode = ssl.CERT_NONE
+    found = False
+    for attempt in range(9):
+        time.sleep(20)
+        imap = None
+        try:
+            imap = imaplib.IMAP4_SSL(IMAP_HOST, 993, ssl_context=ctx)
+            imap.login(IMAP_USER, IMAP_PASS)
+            imap.select("INBOX")
+            _, msg_ids = imap.search(None, "SUBJECT", marker)
+            if msg_ids[0]:
+                found = True
+                print(f"Found test email after {attempt+1} attempts")
+                # Delete the test email
+                try:
+                    for mid in msg_ids[0].split():
+                        imap.store(mid, "+FLAGS", "\\Deleted")
+                    imap.expunge()
+                    print("Deleted test email")
+                except Exception as de:
+                    print(f"Delete failed (non-critical): {de}")
+        except Exception as e:
+            print(f"IMAP attempt {attempt+1} failed: {e}")
+        finally:
+            # Close the connection even when login/select/search raised
+            if imap is not None:
+                try:
+                    imap.logout()
+                except Exception as le:
+                    print(f"IMAP logout failed (non-critical): {le}")
+        if found:
+            break
+
+    duration = time.time() - start
+    if found:
+        success = 1
+        print(f"Round-trip SUCCESS in {duration:.1f}s")
+    else:
+        print(f"Round-trip FAILED - email not found after {duration:.1f}s")
+
+except Exception as e:
+    duration = time.time() - start
+    print(f"ERROR: {e}")
+
+# Push metrics to Pushgateway.
+# POST only replaces the metric names present in the body, so leaving
+# email_roundtrip_last_success_timestamp out on failure keeps the previous value.
+metrics = f"""# HELP email_roundtrip_success Whether the last e2e email probe succeeded
+# TYPE email_roundtrip_success gauge
+email_roundtrip_success {success}
+# HELP email_roundtrip_duration_seconds Duration of the last e2e email probe
+# TYPE email_roundtrip_duration_seconds gauge
+email_roundtrip_duration_seconds {duration:.2f}
+"""
+if success:
+    metrics += f"""# HELP email_roundtrip_last_success_timestamp Unix timestamp of last successful probe
+# TYPE email_roundtrip_last_success_timestamp gauge
+email_roundtrip_last_success_timestamp {int(time.time())}
+"""
+try:
+    push = requests.post(PUSHGATEWAY, data=metrics, timeout=10)
+    push.raise_for_status()
+    print("Pushed metrics to Pushgateway")
+except Exception as e:
+    print(f"Failed to push metrics: {e}")
+
+# Push to Uptime Kuma on success
+if success:
+    try:
+        requests.get("https://uptime.viktorbarzin.me/api/push/hLtyRKgeZO?status=up&msg=OK&ping=" + str(int(duration)), timeout=10)
+        print("Pushed to Uptime Kuma")
+    except Exception as e:
+        print(f"Failed to push to Uptime Kuma: {e}")
+
+sys.exit(0 if success else 1)
+'
+              EOT
+              ]
+              env {
+                name  = "MAILGUN_API_KEY"
+                value = var.mailgun_api_key
+              }
+              env {
+                name  = "EMAIL_MONITOR_IMAP_PASSWORD"
+                value = var.email_monitor_imap_password
+              }
+              resources {
+                requests = {
+                  memory = "64Mi"
+                  cpu    = "10m"
+                }
+                limits = {
+                  memory = "128Mi"
+                }
+              }
+            }
+            dns_config {
+              option {
+                name  = "ndots"
+                value = "2"
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl index fac4f055..c7774b33 100755 --- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl +++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl @@ -72,12 +72,12 @@ alertmanager: - source_matchers: - alertname = NodeDown target_matchers: - - alertname =~ 
"NodeNotReady|NodeConditionBad|PodCrashLooping|ContainerOOMKilled|DeploymentReplicasMismatch|StatefulSetReplicasMismatch|DaemonSetMissingPods|ScrapeTargetDown|NodeLowFreeMemory|PostgreSQLDown|MySQLDown|RedisDown|HeadscaleDown|AuthentikDown|PoisonFountainDown|HackmdDown|PrivatebinDown|MailServerDown|NodeExporterDown|DockerRegistryDown|HomeAssistantDown|CloudflaredDown|TechnitiumDNSDown|iDRACRedfishMetricsMissing|iDRACSNMPMetricsMissing|HomeAssistantMetricsMissing" + - alertname =~ "NodeNotReady|NodeConditionBad|PodCrashLooping|ContainerOOMKilled|DeploymentReplicasMismatch|StatefulSetReplicasMismatch|DaemonSetMissingPods|ScrapeTargetDown|NodeLowFreeMemory|PostgreSQLDown|MySQLDown|RedisDown|HeadscaleDown|AuthentikDown|PoisonFountainDown|HackmdDown|PrivatebinDown|MailServerDown|EmailRoundtripFailing|EmailRoundtripStale|NodeExporterDown|DockerRegistryDown|HomeAssistantDown|CloudflaredDown|TechnitiumDNSDown|iDRACRedfishMetricsMissing|iDRACSNMPMetricsMissing|HomeAssistantMetricsMissing" # NFS down causes mass pod failures and NFS-dependent service outages - source_matchers: - alertname = NFSServerUnresponsive target_matchers: - - alertname =~ "PodCrashLooping|ContainerOOMKilled|DeploymentReplicasMismatch|StatefulSetReplicasMismatch|DaemonSetMissingPods|ScrapeTargetDown|PostgreSQLDown|MySQLDown|RedisDown|AuthentikDown|PoisonFountainDown|HackmdDown|PrivatebinDown|MailServerDown|HomeAssistantDown" + - alertname =~ "PodCrashLooping|ContainerOOMKilled|DeploymentReplicasMismatch|StatefulSetReplicasMismatch|DaemonSetMissingPods|ScrapeTargetDown|PostgreSQLDown|MySQLDown|RedisDown|AuthentikDown|PoisonFountainDown|HackmdDown|PrivatebinDown|MailServerDown|EmailRoundtripFailing|EmailRoundtripStale|HomeAssistantDown" # Traefik down makes service-level alerts noise - source_matchers: - alertname = TraefikDown @@ -1154,6 +1154,27 @@ serverFiles: severity: warning annotations: summary: "Mail server has no available replicas - mail may not be received" + - alert: EmailRoundtripFailing + 
expr: email_roundtrip_success{job="email-roundtrip-monitor"} == 0 + for: 90m + labels: + severity: warning + annotations: + summary: "Email round-trip probe failing. Check Mailgun relay, DNS, and IMAP." + - alert: EmailRoundtripStale + expr: (time() - email_roundtrip_last_success_timestamp{job="email-roundtrip-monitor"}) > 5400 + for: 30m + labels: + severity: warning + annotations: + summary: "Email round-trip probe has not succeeded in >90 min" + - alert: EmailRoundtripNeverRun + expr: absent(email_roundtrip_success{job="email-roundtrip-monitor"}) + for: 2h + labels: + severity: warning + annotations: + summary: "Email round-trip monitor never reported - check CronJob in mailserver namespace" - alert: HackmdDown expr: (kube_deployment_status_replicas_available{namespace="hackmd"} or on() vector(0)) < 1 for: 5m @@ -1225,6 +1246,29 @@ serverFiles: severity: warning annotations: summary: "High DNS SERVFAIL rate: {{ $value | printf \"%.0f\" }} failures detected" + - name: qbittorrent + rules: + - alert: QBittorrentMAMRatioLow + expr: qbt_tracker_ratio{tracker="mam"} < 1.0 + for: 1h + labels: + severity: warning + annotations: + summary: "MAM ratio is {{ $value | printf \"%.2f\" }} (must be >= 1.0)" + - alert: QBittorrentDisconnected + expr: qbt_connected == 0 + for: 5m + labels: + severity: critical + annotations: + summary: "qBittorrent is disconnected from the network" + - alert: QBittorrentMAMUnsatisfied + expr: qbt_tracker_unsatisfied{tracker="mam"} > 15 + for: 10m + labels: + severity: warning + annotations: + summary: "{{ $value | printf \"%.0f\" }} MAM torrents not yet seeded 72h (limit: 20 for new members)" extraScrapeConfigs: | - job_name: 'proxmox-host'