[ci skip] Fix all active Prometheus alerts

- meshcentral: rename port from "https" to "http" — MeshCentral serves
  plain HTTP when REVERSE_PROXY=true, but Traefik inferred HTTPS from the
  port name, causing 100% 5xx errors
- osm-routing/otp: scale to 0 replicas — TfL GTFS data has expired, so OTP
  crash-loops trying to build a graph with no valid transit trips
- wireguard: add prometheus.io/port=9586 annotation — without it,
  Prometheus tried scraping all container ports (51820 UDP, 80)
- travel-blog: remove stale prometheus.io annotations and dead port 9113
  — nginx-exporter sidecar was commented out but annotations remained
- dawarich: remove prometheus.io annotations — exporter env vars are
  commented out so nothing listens on port 9394
- monitoring: raise CPU temp threshold 60°C→75°C (E5-2699 v4 Tcase is
  79°C), lower registry cache hit-rate threshold 50%→25%, add a minimum
  traffic floor (>0.1 req/s) to the 4xx/5xx rate alerts to prevent false
  positives on low-traffic services
This commit is contained in:
Viktor Barzin 2026-02-11 22:40:56 +00:00
parent 9c3f8adc11
commit 0c18a86a7b
No known key found for this signature in database
GPG key ID: 0EB088298288D958
6 changed files with 10 additions and 20 deletions

View file

@ -52,9 +52,6 @@ resource "kubernetes_deployment" "dawarich" {
annotations = {
# "diun.enable" = "true"
# "diun.include_tags" = "latest"
"prometheus.io/scrape" = "true"
"prometheus.io/path" = "/metrics"
"prometheus.io/port" = 9394
}
}
spec {

View file

@ -55,7 +55,7 @@ resource "kubernetes_deployment" "meshcentral" {
image = "typhonragewind/meshcentral:latest"
name = "meshcentral"
port {
name = "https"
name = "http"
container_port = 443
}
env {
@ -133,7 +133,7 @@ resource "kubernetes_service" "meshcentral" {
app = "meshcentral"
}
port {
name = "https"
name = "http"
port = 443
protocol = "TCP"
}

View file

@ -155,12 +155,12 @@ serverFiles:
- name: R730 Host
rules:
- alert: HighCPUTemperature
expr: node_hwmon_temp_celsius{instance="pve-node-r730"} * on(chip) group_left(chip_name) node_hwmon_chip_names{instance="pve-node-r730"} > 60
expr: node_hwmon_temp_celsius{instance="pve-node-r730"} * on(chip) group_left(chip_name) node_hwmon_chip_names{instance="pve-node-r730"} > 75
for: 30m
labels:
severity: page
annotations:
summary: "CPU temp: {{ $value | printf \"%.0f\" }}°C (threshold: 60°C)"
summary: "CPU temp: {{ $value | printf \"%.0f\" }}°C (threshold: 75°C)"
- alert: SSDHighWriteRate
expr: rate(node_disk_written_bytes_total{job="proxmox-host", device="sdb"}[2m]) / 1024 / 1024 > 2 # sdb is SSD; value in MB
for: 10m
@ -361,12 +361,12 @@ serverFiles:
annotations:
summary: "Docker registry down for 10m"
- alert: RegistryLowCacheHitRate
expr: (sum by (job) (rate(registry_registry_storage_cache_total{type="Hit"}[15m]))) / (sum by (job) (rate(registry_registry_storage_cache_total{type="Request"}[15m]))) * 100 < 50
expr: (sum by (job) (rate(registry_registry_storage_cache_total{type="Hit"}[15m]))) / (sum by (job) (rate(registry_registry_storage_cache_total{type="Request"}[15m]))) * 100 < 25
for: 12h
labels:
severity: page
annotations:
summary: "Registry cache hit rate: {{ $value | printf \"%.0f\" }}% (threshold: 50%)"
summary: "Registry cache hit rate: {{ $value | printf \"%.0f\" }}% (threshold: 25%)"
- alert: NodeHighCPUUsage
expr: pve_cpu_usage_ratio * 100 > 30
for: 6h
@ -446,6 +446,7 @@ serverFiles:
/ sum(rate(traefik_service_requests_total[5m])) by (service)
* 100
) > 10
and sum(rate(traefik_service_requests_total[5m])) by (service) > 0.1
for: 5m
labels:
severity: page
@ -458,6 +459,7 @@ serverFiles:
/ sum(rate(traefik_service_requests_total{service!~".*nextcloud.*|.*grafana.*"}[5m])) by (service)
* 100
) > 30
and sum(rate(traefik_service_requests_total{service!~".*nextcloud.*|.*grafana.*"}[5m])) by (service) > 0.1
for: 10m
labels:
severity: page

View file

@ -165,7 +165,7 @@ resource "kubernetes_deployment" "otp" {
}
}
spec {
replicas = 1
replicas = 0 # Scaled down: TfL GTFS data expired, OTP crash-loops on build
strategy {
type = "Recreate"
}

View file

@ -83,11 +83,6 @@ resource "kubernetes_service" "travel-blog" {
labels = {
app = "travel-blog"
}
annotations = {
"prometheus.io/scrape" = "true"
"prometheus.io/path" = "/metrics"
"prometheus.io/port" = "9113"
}
}
spec {
@ -99,11 +94,6 @@ resource "kubernetes_service" "travel-blog" {
port = "80"
target_port = "80"
}
port {
name = "prometheus"
port = "9113"
target_port = "9113"
}
}
}

View file

@ -84,6 +84,7 @@ resource "kubernetes_deployment" "wireguard" {
}
annotations = {
"prometheus.io/scrape" = "true"
"prometheus.io/port" = "9586"
}
}
spec {