diff --git a/modules/kubernetes/dawarich/main.tf b/modules/kubernetes/dawarich/main.tf index aeb345e6..8b1851b7 100644 --- a/modules/kubernetes/dawarich/main.tf +++ b/modules/kubernetes/dawarich/main.tf @@ -52,9 +52,6 @@ resource "kubernetes_deployment" "dawarich" { annotations = { # "diun.enable" = "true" # "diun.include_tags" = "latest" - "prometheus.io/scrape" = "true" - "prometheus.io/path" = "/metrics" - "prometheus.io/port" = 9394 } } spec { diff --git a/modules/kubernetes/meshcentral/main.tf b/modules/kubernetes/meshcentral/main.tf index 563d53dc..8fab8266 100644 --- a/modules/kubernetes/meshcentral/main.tf +++ b/modules/kubernetes/meshcentral/main.tf @@ -55,7 +55,7 @@ resource "kubernetes_deployment" "meshcentral" { image = "typhonragewind/meshcentral:latest" name = "meshcentral" port { - name = "https" + name = "http" container_port = 443 } env { @@ -133,7 +133,7 @@ resource "kubernetes_service" "meshcentral" { app = "meshcentral" } port { - name = "https" + name = "http" port = 443 protocol = "TCP" } diff --git a/modules/kubernetes/monitoring/prometheus_chart_values.tpl b/modules/kubernetes/monitoring/prometheus_chart_values.tpl index bf09e08a..8b401bd4 100755 --- a/modules/kubernetes/monitoring/prometheus_chart_values.tpl +++ b/modules/kubernetes/monitoring/prometheus_chart_values.tpl @@ -155,12 +155,12 @@ serverFiles: - name: R730 Host rules: - alert: HighCPUTemperature - expr: node_hwmon_temp_celsius{instance="pve-node-r730"} * on(chip) group_left(chip_name) node_hwmon_chip_names{instance="pve-node-r730"} > 60 + expr: node_hwmon_temp_celsius{instance="pve-node-r730"} * on(chip) group_left(chip_name) node_hwmon_chip_names{instance="pve-node-r730"} > 75 for: 30m labels: severity: page annotations: - summary: "CPU temp: {{ $value | printf \"%.0f\" }}°C (threshold: 60°C)" + summary: "CPU temp: {{ $value | printf \"%.0f\" }}°C (threshold: 75°C)" - alert: SSDHighWriteRate expr: rate(node_disk_written_bytes_total{job="proxmox-host", device="sdb"}[2m]) / 1024 / 1024 > 2 # sdb is SSD; value in MB for: 10m @@ -361,12 +361,12 @@ serverFiles: annotations: summary: "Docker registry down for 10m" - alert: RegistryLowCacheHitRate - expr: (sum by (job) (rate(registry_registry_storage_cache_total{type="Hit"}[15m]))) / (sum by (job) (rate(registry_registry_storage_cache_total{type="Request"}[15m]))) * 100 < 50 + expr: (sum by (job) (rate(registry_registry_storage_cache_total{type="Hit"}[15m]))) / (sum by (job) (rate(registry_registry_storage_cache_total{type="Request"}[15m]))) * 100 < 25 for: 12h labels: severity: page annotations: - summary: "Registry cache hit rate: {{ $value | printf \"%.0f\" }}% (threshold: 50%)" + summary: "Registry cache hit rate: {{ $value | printf \"%.0f\" }}% (threshold: 25%)" - alert: NodeHighCPUUsage expr: pve_cpu_usage_ratio * 100 > 30 for: 6h @@ -446,6 +446,7 @@ serverFiles: / sum(rate(traefik_service_requests_total[5m])) by (service) * 100 ) > 10 + and sum(rate(traefik_service_requests_total[5m])) by (service) > 0.1 for: 5m labels: severity: page @@ -458,6 +459,7 @@ serverFiles: / sum(rate(traefik_service_requests_total{service!~".*nextcloud.*|.*grafana.*"}[5m])) by (service) * 100 ) > 30 + and sum(rate(traefik_service_requests_total{service!~".*nextcloud.*|.*grafana.*"}[5m])) by (service) > 0.1 for: 10m labels: severity: page diff --git a/modules/kubernetes/osm-routing/main.tf b/modules/kubernetes/osm-routing/main.tf index 8fd54627..be943c04 100644 --- a/modules/kubernetes/osm-routing/main.tf +++ b/modules/kubernetes/osm-routing/main.tf @@ -165,7 +165,7 @@ resource "kubernetes_deployment" "otp" { } } spec { - replicas = 1 + replicas = 0 # Scaled down: TfL GTFS data expired, OTP crash-loops on build strategy { type = "Recreate" } diff --git a/modules/kubernetes/travel_blog/main.tf b/modules/kubernetes/travel_blog/main.tf index 15b29720..fd47fe1b 100644 --- a/modules/kubernetes/travel_blog/main.tf +++ b/modules/kubernetes/travel_blog/main.tf @@ -83,11 +83,6 @@ resource "kubernetes_service" "travel-blog" { labels = { app = "travel-blog" } - annotations = { - "prometheus.io/scrape" = "true" - "prometheus.io/path" = "/metrics" - "prometheus.io/port" = "9113" - } } spec { @@ -99,11 +94,6 @@ resource "kubernetes_service" "travel-blog" { port = "80" target_port = "80" } - port { - name = "prometheus" - port = "9113" - target_port = "9113" - } } }