From d72c7169c012017cdd2e1d36288dd4ee82f9d911 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Wed, 27 May 2026 18:36:11 +0000 Subject: [PATCH] monitoring: route proxmox-exporter to scrape_slow job (fix flapping alerts) The PVE API endpoint regularly takes ~11s to respond with ~1035 thin LVs on the host (1002 k8s-csi PVCs + 22 VMs + 11 system), blowing past Prometheus's default 10s scrape_timeout and causing the ProxmoxMetricsMissing and ScrapeTargetDown alerts to flap. Switch the Service annotation from prometheus.io/scrape to prometheus.io/scrape_slow so the scrape moves to the existing kubernetes-service-endpoints-slow job (5m interval, 30s timeout). --- stacks/monitoring/modules/monitoring/pve_exporter.tf | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/stacks/monitoring/modules/monitoring/pve_exporter.tf b/stacks/monitoring/modules/monitoring/pve_exporter.tf index 2c24dd42..1d6bbad4 100644 --- a/stacks/monitoring/modules/monitoring/pve_exporter.tf +++ b/stacks/monitoring/modules/monitoring/pve_exporter.tf @@ -100,7 +100,13 @@ resource "kubernetes_service" "proxmox-exporter" { "app" = "proxmox-exporter" } annotations = { - "prometheus.io/scrape" = "true" + # Use scrape_slow (5m interval, 30s timeout in prometheus values) because + # the PVE API endpoint regularly takes ~11s with ~1000 k8s-csi LVs on the + # host, blowing past the default 10s scrape_timeout and flapping the + # ProxmoxMetricsMissing + ScrapeTargetDown alerts. The slow job is gated + # by the `prometheus_io_scrape_slow=true` annotation in + # prometheus_chart_values.tpl and also excludes us from the fast job. + "prometheus.io/scrape_slow" = "true" "prometheus.io/port" = 9221 "prometheus.io/path" = "/pve" "prometheus.io/param_target" = "192.168.1.127"