From 46ffc37dcf0094b05e71e0538f53e2d0a202951a Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Wed, 11 Feb 2026 22:40:56 +0000 Subject: [PATCH] [ci skip] Fix all active Prometheus alerts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - meshcentral: rename port from "https" to "http" — MeshCentral serves plain HTTP when REVERSE_PROXY=true, but Traefik inferred HTTPS from the port name, causing 100% 5xx errors - osm-routing/otp: scale to 0 — TfL GTFS data expired, OTP crash-loops trying to build graph with no valid transit trips - wireguard: add prometheus.io/port=9586 annotation — without it, Prometheus tried scraping all container ports (51820 UDP, 80) - travel-blog: remove stale prometheus.io annotations and dead port 9113 — nginx-exporter sidecar was commented out but annotations remained - dawarich: remove prometheus.io annotations — exporter env vars are commented out so nothing listens on port 9394 - monitoring: raise CPU temp threshold 60°C→75°C (E5-2699 v4 Tcase is 79°C), lower registry cache threshold 50%→25%, add minimum traffic floor (>0.1 req/s) to 4xx/5xx rate alerts to prevent false positives on low-traffic services --- modules/kubernetes/dawarich/main.tf | 3 --- modules/kubernetes/meshcentral/main.tf | 4 ++-- .../kubernetes/monitoring/prometheus_chart_values.tpl | 10 ++++++---- modules/kubernetes/osm-routing/main.tf | 2 +- modules/kubernetes/travel_blog/main.tf | 10 ---------- 5 files changed, 9 insertions(+), 20 deletions(-) diff --git a/modules/kubernetes/dawarich/main.tf b/modules/kubernetes/dawarich/main.tf index aeb345e6..8b1851b7 100644 --- a/modules/kubernetes/dawarich/main.tf +++ b/modules/kubernetes/dawarich/main.tf @@ -52,9 +52,6 @@ resource "kubernetes_deployment" "dawarich" { annotations = { # "diun.enable" = "true" # "diun.include_tags" = "latest" - "prometheus.io/scrape" = "true" - "prometheus.io/path" = "/metrics" - "prometheus.io/port" = 9394 } } spec { diff --git a/modules/kubernetes/meshcentral/main.tf b/modules/kubernetes/meshcentral/main.tf index 563d53dc..8fab8266 100644 --- a/modules/kubernetes/meshcentral/main.tf +++ b/modules/kubernetes/meshcentral/main.tf @@ -55,7 +55,7 @@ resource "kubernetes_deployment" "meshcentral" { image = "typhonragewind/meshcentral:latest" name = "meshcentral" port { - name = "https" + name = "http" container_port = 443 } env { @@ -133,7 +133,7 @@ resource "kubernetes_service" "meshcentral" { app = "meshcentral" } port { - name = "https" + name = "http" port = 443 protocol = "TCP" } diff --git a/modules/kubernetes/monitoring/prometheus_chart_values.tpl b/modules/kubernetes/monitoring/prometheus_chart_values.tpl index bf09e08a..8b401bd4 100755 --- a/modules/kubernetes/monitoring/prometheus_chart_values.tpl +++ b/modules/kubernetes/monitoring/prometheus_chart_values.tpl @@ -155,12 +155,12 @@ serverFiles: - name: R730 Host rules: - alert: HighCPUTemperature - expr: node_hwmon_temp_celsius{instance="pve-node-r730"} * on(chip) group_left(chip_name) node_hwmon_chip_names{instance="pve-node-r730"} > 60 + expr: node_hwmon_temp_celsius{instance="pve-node-r730"} * on(chip) group_left(chip_name) node_hwmon_chip_names{instance="pve-node-r730"} > 75 for: 30m labels: severity: page annotations: - summary: "CPU temp: {{ $value | printf \"%.0f\" }}°C (threshold: 60°C)" + summary: "CPU temp: {{ $value | printf \"%.0f\" }}°C (threshold: 75°C)" - alert: SSDHighWriteRate expr: rate(node_disk_written_bytes_total{job="proxmox-host", device="sdb"}[2m]) / 1024 / 1024 > 2 # sdb is SSD; value in MB for: 10m @@ -361,12 +361,12 @@ serverFiles: annotations: summary: "Docker registry down for 10m" - alert: RegistryLowCacheHitRate - expr: (sum by (job) (rate(registry_registry_storage_cache_total{type="Hit"}[15m]))) / (sum by (job) (rate(registry_registry_storage_cache_total{type="Request"}[15m]))) * 100 < 50 + expr: (sum by (job) (rate(registry_registry_storage_cache_total{type="Hit"}[15m]))) / (sum by (job) (rate(registry_registry_storage_cache_total{type="Request"}[15m]))) * 100 < 25 for: 12h labels: severity: page annotations: - summary: "Registry cache hit rate: {{ $value | printf \"%.0f\" }}% (threshold: 50%)" + summary: "Registry cache hit rate: {{ $value | printf \"%.0f\" }}% (threshold: 25%)" - alert: NodeHighCPUUsage expr: pve_cpu_usage_ratio * 100 > 30 for: 6h @@ -446,6 +446,7 @@ serverFiles: / sum(rate(traefik_service_requests_total[5m])) by (service) * 100 ) > 10 + and sum(rate(traefik_service_requests_total[5m])) by (service) > 0.1 for: 5m labels: severity: page @@ -458,6 +459,7 @@ serverFiles: / sum(rate(traefik_service_requests_total{service!~".*nextcloud.*|.*grafana.*"}[5m])) by (service) * 100 ) > 30 + and sum(rate(traefik_service_requests_total{service!~".*nextcloud.*|.*grafana.*"}[5m])) by (service) > 0.1 for: 10m labels: severity: page diff --git a/modules/kubernetes/osm-routing/main.tf b/modules/kubernetes/osm-routing/main.tf index 8fd54627..be943c04 100644 --- a/modules/kubernetes/osm-routing/main.tf +++ b/modules/kubernetes/osm-routing/main.tf @@ -165,7 +165,7 @@ resource "kubernetes_deployment" "otp" { } } spec { - replicas = 1 + replicas = 0 # Scaled down: TfL GTFS data expired, OTP crash-loops on build strategy { type = "Recreate" } diff --git a/modules/kubernetes/travel_blog/main.tf b/modules/kubernetes/travel_blog/main.tf index 15b29720..fd47fe1b 100644 --- a/modules/kubernetes/travel_blog/main.tf +++ b/modules/kubernetes/travel_blog/main.tf @@ -83,11 +83,6 @@ resource "kubernetes_service" "travel-blog" { labels = { app = "travel-blog" } - annotations = { - "prometheus.io/scrape" = "true" - "prometheus.io/path" = "/metrics" - "prometheus.io/port" = "9113" - } } spec { @@ -99,11 +94,6 @@ resource "kubernetes_service" "travel-blog" { port = "80" target_port = "80" } - port { - name = "prometheus" - port = "9113" - target_port = "9113" - } } }