diff --git a/stacks/mailserver/modules/mailserver/main.tf b/stacks/mailserver/modules/mailserver/main.tf index f156549a..fd46708a 100644 --- a/stacks/mailserver/modules/mailserver/main.tf +++ b/stacks/mailserver/modules/mailserver/main.tf @@ -139,17 +139,6 @@ resource "kubernetes_config_map" "mailserver_config" { # attempt waits 5s before responding, stretching a 1000-password # dictionary attack from <1s to ~85min. Addresses code-9mi. auth_failure_delay = 5s - # NOTE (code-vnc 2026-04-19): `viktorbarzin/dovecot_exporter` - # expects the legacy old_stats FIFO wire protocol. Dovecot 2.3 still - # supports the `old_stats` plugin, but docker-mailserver 15.0.0 - # ships `service stats` (new architecture) as the default. Mixing - # the two — enabling old_stats + declaring `service old-stats - # unix_listener stats-reader` — makes `doveadm stats dump` fail - # with "Failed to read VERSION line" and the exporter loops on - # "Input does not provide any columns". A real fix requires either - # a newer exporter that speaks Dovecot 2.3 `doveadm-server` / - # HTTP stats, or retiring the exporter entirely. Tracked as a - # follow-up task. 
EOF fail2ban_conf = <<-EOF [DEFAULT] @@ -467,33 +456,6 @@ resource "kubernetes_deployment" "mailserver" { } - container { - name = "dovecot-exporter" - image = "viktorbarzin/dovecot_exporter@sha256:1114224c9bf0261ca8e9949a6b42d3c5a2c923d34ca4593f6b62f034daf14fc5" - command = [ - "/dovecot_exporter/exporter", - "--dovecot.socket-path=/var/run/dovecot/stats-reader" - ] - image_pull_policy = "IfNotPresent" - port { - name = "dovecotexporter" - container_port = 9166 - protocol = "TCP" - } - volume_mount { - name = "var-run-dovecot" - mount_path = "/var/run/dovecot" - } - resources { - requests = { - cpu = "10m" - memory = "32Mi" - } - limits = { - memory = "32Mi" - } - } - } volume { name = "config" @@ -597,35 +559,13 @@ resource "kubernetes_service" "mailserver" { } } -# Split the Dovecot metrics port off the public LB and onto its own -# ClusterIP Service. Port 9166 was only LAN-routable via 10.0.20.202 -# but was over-exposed for a Prometheus-internal metric. Addresses -# code-izl. Prometheus scrape target follows in -# stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl -# (updated to `mailserver-metrics.mailserver.svc.cluster.local:9166`). -resource "kubernetes_service" "mailserver_metrics" { - metadata { - name = "mailserver-metrics" - namespace = kubernetes_namespace.mailserver.metadata[0].name - labels = { - app = "mailserver" - } - } - - spec { - type = "ClusterIP" - selector = { - app = "mailserver" - } - - port { - name = "dovecot-metrics" - protocol = "TCP" - port = 9166 - target_port = 9166 - } - } -} +# The `mailserver-metrics` ClusterIP Service (formerly split from the +# main LB in code-izl) was retired in code-1ik when the Dovecot +# exporter was removed — the exporter spoke the pre-Dovecot-2.3 +# old_stats protocol which docker-mailserver 15.0.0 no longer +# emits, so the scrape was a no-op. If a working exporter is ever +# re-introduced, add back: ClusterIP Service exposing port 9166 +# with selector app=mailserver. 
# ============================================================================= # E2E Email Roundtrip Monitor diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl index c083b2a5..89485cb0 100755 --- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl +++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl @@ -1977,26 +1977,10 @@ serverFiles: severity: warning annotations: summary: "Authentik outpost restarted {{ $value | printf \"%.0f\" }} times in 30m — check for OOM or crash loop" - - name: Mailserver Dovecot - # Dovecot exporter on mailserver:9166 exposes connection-count gauges. - # The Dovecot IMAP login service is capped by `mail_max_userip_connections` - # (50 per user-IP in the deployed config); fire at 85% so we can tune - # before real users get ECONNREFUSED. - rules: - - alert: DovecotConnectionsNearLimit - expr: max(dovecot_imap_connected_users) >= 42 - for: 5m - labels: - severity: warning - annotations: - summary: "Dovecot IMAP connections near cap ({{ $value | printf \"%.0f\" }} / 50) — review mail_max_userip_connections or investigate noisy client" - - alert: DovecotExporterDown - expr: up{job="mailserver-dovecot"} == 0 - for: 10m - labels: - severity: warning - annotations: - summary: "Dovecot exporter unreachable for 10m — check mailserver pod health + port 9166" + # Mailserver Dovecot alerts were removed with the exporter in + # code-1ik (viktorbarzin/dovecot_exporter incompatible with + # Dovecot 2.3 stats architecture). Re-add the rule group if a + # working exporter is introduced. - name: Infrastructure Drift # Metrics pushed by .woodpecker/drift-detection.yml after each cron run. # See Wave 7 of the state-drift consolidation plan. @@ -2031,16 +2015,11 @@ serverFiles: summary: "{{ $value | printf \"%.0f\" }} stacks drifting — likely a systemic cause (new admission webhook, provider upgrade). 
Check the most recent drift-detection run in Woodpecker." extraScrapeConfigs: | - - job_name: 'mailserver-dovecot' - # Dovecot exporter lives on the mailserver pod; port 9166 is exposed by - # the dedicated ClusterIP Service `mailserver-metrics` (split from the - # public LB in code-izl). Kube-prometheus-stack (with ServiceMonitor - # CRDs) isn't deployed here, so we scrape by service DNS. - static_configs: - - targets: - - "mailserver-metrics.mailserver.svc.cluster.local:9166" - metrics_path: '/metrics' - scrape_interval: 30s + # The `mailserver-dovecot` scrape job was retired in code-1ik together + # with the Dovecot exporter. docker-mailserver 15.0.0 ships Dovecot 2.3's + # new `service stats` by default, not the old_stats protocol the exporter + # expected, so the scrape only ever returned `dovecot_up{scope="user"} 0`. + # Re-add here if a working exporter is introduced. - job_name: 'proxmox-host' static_configs: - targets: