[ci skip] Fix all active Prometheus alerts
- meshcentral: rename port from "https" to "http" — MeshCentral serves plain HTTP when REVERSE_PROXY=true, but Traefik inferred HTTPS from the port name, causing 100% 5xx errors
- osm-routing/otp: scale to 0 — TfL GTFS data expired, OTP crash-loops trying to build graph with no valid transit trips
- wireguard: add prometheus.io/port=9586 annotation — without it, Prometheus tried scraping all container ports (51820 UDP, 80)
- travel-blog: remove stale prometheus.io annotations and dead port 9113 — nginx-exporter sidecar was commented out but annotations remained
- dawarich: remove prometheus.io annotations — exporter env vars are commented out so nothing listens on port 9394
- monitoring: raise CPU temp threshold 60°C→75°C (E5-2699 v4 Tcase is 79°C), lower registry cache threshold 50%→25%, add minimum traffic floor (>0.1 req/s) to 4xx/5xx rate alerts to prevent false positives on low-traffic services
This commit is contained in:
parent
9c3f8adc11
commit
0c18a86a7b
6 changed files with 10 additions and 20 deletions
|
|
@@ -155,12 +155,12 @@ serverFiles:
|
|||
- name: R730 Host
|
||||
rules:
|
||||
- alert: HighCPUTemperature
|
||||
expr: node_hwmon_temp_celsius{instance="pve-node-r730"} * on(chip) group_left(chip_name) node_hwmon_chip_names{instance="pve-node-r730"} > 60
|
||||
expr: node_hwmon_temp_celsius{instance="pve-node-r730"} * on(chip) group_left(chip_name) node_hwmon_chip_names{instance="pve-node-r730"} > 75
|
||||
for: 30m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
summary: "CPU temp: {{ $value | printf \"%.0f\" }}°C (threshold: 60°C)"
|
||||
summary: "CPU temp: {{ $value | printf \"%.0f\" }}°C (threshold: 75°C)"
|
||||
- alert: SSDHighWriteRate
|
||||
expr: rate(node_disk_written_bytes_total{job="proxmox-host", device="sdb"}[2m]) / 1024 / 1024 > 2 # sdb is SSD; value in MB
|
||||
for: 10m
|
||||
|
|
@@ -361,12 +361,12 @@ serverFiles:
|
|||
annotations:
|
||||
summary: "Docker registry down for 10m"
|
||||
- alert: RegistryLowCacheHitRate
|
||||
expr: (sum by (job) (rate(registry_registry_storage_cache_total{type="Hit"}[15m]))) / (sum by (job) (rate(registry_registry_storage_cache_total{type="Request"}[15m]))) * 100 < 50
|
||||
expr: (sum by (job) (rate(registry_registry_storage_cache_total{type="Hit"}[15m]))) / (sum by (job) (rate(registry_registry_storage_cache_total{type="Request"}[15m]))) * 100 < 25
|
||||
for: 12h
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
summary: "Registry cache hit rate: {{ $value | printf \"%.0f\" }}% (threshold: 50%)"
|
||||
summary: "Registry cache hit rate: {{ $value | printf \"%.0f\" }}% (threshold: 25%)"
|
||||
- alert: NodeHighCPUUsage
|
||||
expr: pve_cpu_usage_ratio * 100 > 30
|
||||
for: 6h
|
||||
|
|
@@ -446,6 +446,7 @@ serverFiles:
|
|||
/ sum(rate(traefik_service_requests_total[5m])) by (service)
|
||||
* 100
|
||||
) > 10
|
||||
and sum(rate(traefik_service_requests_total[5m])) by (service) > 0.1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
|
|
@@ -458,6 +459,7 @@ serverFiles:
|
|||
/ sum(rate(traefik_service_requests_total{service!~".*nextcloud.*|.*grafana.*"}[5m])) by (service)
|
||||
* 100
|
||||
) > 30
|
||||
and sum(rate(traefik_service_requests_total{service!~".*nextcloud.*|.*grafana.*"}[5m])) by (service) > 0.1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: page
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue