diff --git a/modules/kubernetes/monitoring/dashboards/cluster_health.json b/modules/kubernetes/monitoring/dashboards/cluster_health.json index 971d99f7..00a662d8 100644 --- a/modules/kubernetes/monitoring/dashboards/cluster_health.json +++ b/modules/kubernetes/monitoring/dashboards/cluster_health.json @@ -202,7 +202,7 @@ "x": 0, "y": 1 }, - "id": 71, + "id": 78, "options": { "colorMode": "value", "graphMode": "area", @@ -217,7 +217,7 @@ }, "textMode": "auto" }, - "title": "GPU Temp", + "title": "CPU Temp", "type": "stat", "targets": [ { @@ -225,280 +225,8 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "nvidia_tesla_t4_DCGM_FI_DEV_GPU_TEMP", - "legendFormat": "GPU Temp", - "refId": "A" - } - ] - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "max": 100, - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "orange", - "value": 60 - }, - { - "color": "red", - "value": 85 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 3, - "x": 3, - "y": 1 - }, - "id": 72, - "options": { - "minVizHeight": 75, - "minVizWidth": 75, - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showThresholdLabels": false, - "showThresholdMarkers": true - }, - "title": "GPU Util %", - "type": "gauge", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "expr": "nvidia_tesla_t4_DCGM_FI_DEV_GPU_UTIL", - "legendFormat": "GPU Util", - "refId": "A" - } - ] - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "orange", - "value": 12000 - }, - { - "color": "red", - "value": 14000 - } - ] - }, - "unit": "decmbytes" - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 3, - "x": 6, - "y": 1 - }, - "id": 73, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "auto" - }, - "title": "GPU VRAM Used", - "type": "stat", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "expr": "nvidia_tesla_t4_DCGM_FI_DEV_FB_USED", - "legendFormat": "VRAM", - "refId": "A" - } - ] - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "red", - "value": null - }, - { - "color": "orange", - "value": 10 - }, - { - "color": "green", - "value": 30 - } - ] - }, - "unit": "m" - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 3, - "x": 9, - "y": 1 - }, - "id": 75, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "auto" - }, - "title": "UPS Battery (min)", - "type": "stat", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "expr": "ups_upsEstimatedMinutesRemaining", - "legendFormat": "Battery", - "refId": "A" - } - ] - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "max": 100, - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "red", - "value": null - }, - { - "color": "orange", - "value": 30 - }, - { - "color": "green", - "value": 80 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 3, - "x": 12, - "y": 1 - }, - "id": 76, - "options": { - "minVizHeight": 75, - "minVizWidth": 75, - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showThresholdLabels": false, - "showThresholdMarkers": true - }, - "title": "UPS Charge %", - "type": "gauge", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "expr": "ups_upsEstimatedChargeRemaining", - "legendFormat": "Charge", + "expr": "max(r730_idrac_idrac_sensors_temperature{name=\"CPU1 Temp\"})", + "legendFormat": "CPU Temp", "refId": "A" } ] @@ -538,10 +266,10 @@ "gridPos": { "h": 4, "w": 3, - "x": 15, + "x": 3, "y": 1 }, - "id": 78, + "id": 71, "options": { "colorMode": "value", "graphMode": "area", @@ -556,7 +284,7 @@ }, "textMode": "auto" }, - "title": "CPU Temp", + "title": "GPU Temp", "type": "stat", "targets": [ { @@ -564,8 +292,8 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "r730_idrac_idrac_sensors_temperature{name=\"CPU1 Temp\"}", - "legendFormat": "CPU Temp", + "expr": "max(nvidia_tesla_t4_DCGM_FI_DEV_GPU_TEMP)", + "legendFormat": "GPU Temp", "refId": "A" } ] @@ -621,7 +349,7 @@ "gridPos": { "h": 4, "w": 3, - "x": 18, + "x": 6, "y": 1 }, "id": 93, @@ -647,12 +375,284 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "ups_upsSecondsOnBattery", + "expr": "max(ups_upsSecondsOnBattery)", "legendFormat": "", "refId": "A" } ] }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 60 + }, + { + "color": "red", + "value": 85 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 9, + "y": 1 + }, + "id": 72, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "title": "GPU Util %", + "type": "gauge", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "max(nvidia_tesla_t4_DCGM_FI_DEV_GPU_UTIL)", + "legendFormat": "GPU Util", + "refId": "A" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 12000 + }, + { + "color": "red", + "value": 14000 + } + ] + }, + "unit": "decmbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 12, + "y": 1 + }, + "id": 73, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "title": "GPU VRAM Used", + "type": "stat", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "max(nvidia_tesla_t4_DCGM_FI_DEV_FB_USED)", + "legendFormat": "VRAM", + "refId": "A" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 10 + }, + { + "color": "green", + "value": 30 + } + ] + }, + "unit": "m" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 15, + "y": 1 + }, + "id": 75, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "title": "UPS Battery (min)", + "type": "stat", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "max(ups_upsEstimatedMinutesRemaining)", + "legendFormat": "Battery", + "refId": "A" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 30 + }, + { + "color": "green", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 18, + "y": 1 + }, + "id": 76, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "title": "UPS Charge %", + "type": "gauge", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "max(ups_upsEstimatedChargeRemaining)", + "legendFormat": "Charge", + "refId": "A" + } + ] + }, { "datasource": { "type": "prometheus", @@ -720,7 +720,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "ups_upsAlarmsPresent", + "expr": "max(ups_upsAlarmsPresent)", "legendFormat": "", "refId": "A" } @@ -862,7 +862,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "avg_over_time(r730_idrac_idrac_power_control_avg_consumed_watts[$__rate_interval])", + "expr": "max by() (avg_over_time(r730_idrac_idrac_power_control_avg_consumed_watts[$__rate_interval]))", "legendFormat": "Consumed", "refId": "A" }, @@ -871,7 +871,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "haos_sensor_power_w{entity=\"sensor.fv_b_pv_power\"}", + "expr": "max(haos_sensor_power_w{entity=\"sensor.fv_b_pv_power\"})", "legendFormat": "Solar Production", "refId": "B" }, @@ -880,7 +880,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "r730_idrac_idrac_sensors_temperature{name=\"CPU1 Temp\"}", + "expr": "max(r730_idrac_idrac_sensors_temperature{name=\"CPU1 Temp\"})", "legendFormat": "CPU Temp", "refId": "C" } @@ -1550,6 +1550,39 @@ "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, "mappings": [], "thresholds": { "mode": "absolute", @@ -1560,7 +1593,8 @@ } ] }, - "unit": "bytes" + "unit": "bytes", + "min": 0 }, "overrides": [] }, @@ -1572,75 +1606,32 @@ }, "id": 43, "options": { - "displayMode": "gradient", - "maxVizHeight": 300, - "minVizHeight": 16, - "minVizWidth": 8, - "namePlacement": "left", - "orientation": "horizontal", - "reduceOptions": { + "legend": { "calcs": [ "lastNotNull" ], - "fields": "", - "values": false + "displayMode": "table", + "placement": "bottom", + "showLegend": true }, - "showUnfilled": true, - "sizing": "auto", - "valueMode": "color" + "tooltip": { + "mode": "multi", + "sort": "desc" + } }, - "title": "Top 10 Pods by Memory", - "type": "bargauge", + "title": "GPU Memory by Namespace", + "type": "timeseries", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "topk(10, container_memory_working_set_bytes{container!=\"\"})", - "refId": "A", - "format": "table", - "instant": true - } - ], - "transformations": [ - { - "id": "organize", - "options": { - "excludeByName": { - "Time": true, - "__name__": true, - "container": true, - "endpoint": true, - "instance": true, - "job": true, - "namespace": true, - "service": true, - "uid": true, - "id": true, - "image": true, - "metrics_path": true, - "name": true, - "node": true - }, - "renameByName": {} - } - }, - { - "id": "sortBy", - "options": { - "fields": {}, - "sort": [ - { - "field": "Value", - "desc": true - } - ] - } - }, - { - "id": "rowsToFields", - "options": {} + "expr": "sum by (namespace) (gpu_pod_memory_used_bytes) > 0", + "format": "time_series", + "instant": false, + "legendFormat": "{{namespace}}", + "refId": "A" } ] }, @@ -1657,151 +1648,6 @@ "title": "Key Services", "type": "row" }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [ - { - "options": { - "0": { - "color": "red", - "text": "DOWN" - }, - "1": { - "color": "green", - "text": "UP" - } - }, - "type": "value" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "red", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 0, - "y": 32 - }, - "id": 81, - "options": { - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "auto" - }, - "title": "API Server", - "type": "stat", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "expr": "up{job=\"apiserver\"}", - "legendFormat": "", - "refId": "A" - } - ] - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "orange", - "value": 1 - }, - { - "color": "red", - "value": 5 - } - ] - }, - "unit": "percent", - "noValue": "0", - "decimals": 2 - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 4, - "y": 32 - }, - "id": 82, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "auto" - }, - "title": "API Server Error %", - "type": "stat", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "expr": "sum(rate(apiserver_request_total{code=~\"5..\"}[5m])) / sum(rate(apiserver_request_total[5m])) * 100", - "legendFormat": "", - "refId": "A" - } - ] - }, { "datasource": { "type": "prometheus", @@ -2124,75 +1970,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "sum(traefik_open_connections)", - "legendFormat": "", - "refId": "A" - } - ] - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "orange", - "value": 1 - }, - { - "color": "red", - "value": 5 - } - ] - }, - "unit": "s", - "decimals": 2 - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 4, - "y": 36 - }, - "id": 88, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "auto" - }, - "title": "API Latency P99", - "type": "stat", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{verb!~\"WATCH|CONNECT\"}[5m])) by (le))", + "expr": "sum(max by(entrypoint, protocol) (traefik_open_connections))", "legendFormat": "", "refId": "A" }