diff --git a/modules/kubernetes/monitoring/dashboards/registry.json b/modules/kubernetes/monitoring/dashboards/registry.json index e9d904c7..cf10800f 100644 --- a/modules/kubernetes/monitoring/dashboards/registry.json +++ b/modules/kubernetes/monitoring/dashboards/registry.json @@ -18,7 +18,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": 24, + "id": 0, "links": [], "panels": [ { @@ -182,10 +182,12 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "editorMode": "code", "expr": "registry_registry_storage_cache_total{instance=\"$instance\",type=\"Request\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{ type }}", + "range": true, "refId": "A" } ], @@ -260,10 +262,12 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "editorMode": "code", "expr": "registry_registry_storage_cache_total{instance=\"$instance\",type=\"Hit\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{ type }}", + "range": true, "refId": "A" } ], @@ -353,38 +357,30 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "description": "Process Resident Memory Usage", "fieldConfig": { "defaults": { "color": { - "fixedColor": "rgb(31, 120, 193)", - "mode": "fixed" + "mode": "thresholds" }, - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], + "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { - "color": "green", + "color": "dark-red", "value": 0 }, { - "color": "red", - "value": 80 + "color": "#EAB839", + "value": 0.3 + }, + { + "color": "green", + "value": 0.8 } ] }, - "unit": "decbytes" + "unit": "percentunit" }, "overrides": [] }, @@ -394,17 +390,16 @@ "x": 19, "y": 1 }, - "id": 24, - "maxDataPoints": 100, + "id": 45, "options": { - "colorMode": "none", + "colorMode": "value", "graphMode": "area", "justifyMode": "auto", - "orientation": "horizontal", + "orientation": "auto", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ - "mean" + "lastNotNull" ], "fields": "", "values": false @@ -421,15 +416,14 @@ "uid": "PBFA97CFB590B2093" }, "editorMode": "code", - "expr": "avg(registry_process_resident_memory_bytes{instance=\"$instance\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", + "expr": "(sum by (job) (rate(registry_registry_storage_cache_total{type=\"Hit\"}[15m]))) / (sum by (job) (rate(registry_registry_storage_cache_total{type=\"Request\"}[15m])))", + "instant": false, + "legendFormat": "__auto", "range": true, "refId": "A" } ], - "title": "Resident Memory Usage", + "title": "Cache Hit Rate", "type": "stat" }, { @@ -784,48 +778,24 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "description": "The HTTP requests", + "description": "Process Resident Memory Usage", "fieldConfig": { "defaults": { "color": { - "mode": "palette-classic" + "fixedColor": "rgb(31, 120, 193)", + "mode": "fixed" }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 3, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "showValues": false, - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" } - }, - "mappings": [], - "min": 0, + ], "thresholds": { "mode": "absolute", "steps": [ @@ -839,31 +809,34 @@ } ] }, - "unit": "short" + "unit": "decbytes" }, "overrides": [] }, "gridPos": { - "h": 8, + "h": 5, "w": 8, "x": 0, "y": 12 }, - "id": 26, + "id": 24, + "maxDataPoints": 100, "options": { - "legend": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { "calcs": [ - "lastNotNull" + "mean" ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "fields": "", + "values": false }, - "tooltip": { - "hideZeros": false, - "mode": "multi", - "sort": "none" - } + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true }, "pluginVersion": "12.3.0", "targets": [ @@ -872,15 +845,17 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "expr": "ceil(rate(registry_registry_http_requests_total{instance=\"$instance\"}[5m]))", + "editorMode": "code", + "expr": "avg(registry_process_resident_memory_bytes{instance=\"$instance\"})", "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{ handler }}", + "intervalFactor": 2, + "legendFormat": "", + "range": true, "refId": "A" } ], - "title": "HTTP Requests", - "type": "timeseries" + "title": "Resident Memory Usage", + "type": "stat" }, { "datasource": { @@ -1091,6 +1066,7 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "description": "The HTTP requests", "fieldConfig": { "defaults": { "color": { @@ -1114,7 +1090,7 @@ }, "insertNulls": false, "lineInterpolation": "linear", - "lineWidth": 1, + "lineWidth": 3, "pointSize": 5, "scaleDistribution": { "type": "linear" @@ -1124,7 +1100,7 @@ "spanNulls": false, "stacking": { "group": "A", - "mode": "normal" + "mode": "none" }, "thresholdsStyle": { "mode": "off" @@ -1145,28 +1121,30 @@ } ] }, - "unit": "s" + "unit": "short" }, "overrides": [] }, "gridPos": { - "h": 7, + "h": 8, "w": 8, "x": 0, - "y": 20 + "y": 17 }, - "id": 44, + "id": 26, "options": { "legend": { - "calcs": [], - "displayMode": "table", - "placement": "right", + "calcs": [ + "lastNotNull" + ], + "displayMode": "list", + "placement": "bottom", "showLegend": true }, "tooltip": { "hideZeros": false, "mode": "multi", - "sort": "desc" + "sort": "none" } }, "pluginVersion": "12.3.0", @@ -1176,17 +1154,14 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "expr": "increase(registry_registry_storage_action_seconds_sum{instance=\"$instance\"}[2m]) * 1000", + "expr": "ceil(rate(registry_registry_http_requests_total{instance=\"$instance\"}[5m]))", "format": "time_series", - "instant": false, - "intervalFactor": 2, - "legendFormat": "{{ action }}", - "refId": "A", - "step": 10, - "target": "" + "intervalFactor": 1, + "legendFormat": "{{ handler }}", + "refId": "A" } ], - "title": "Registry Action Latency", + "title": "HTTP Requests", "type": "timeseries" }, { @@ -1364,16 +1339,59 @@ }, "fieldConfig": { "defaults": { + "color": { + "mode": "palette-classic" + }, "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, "scaleDistribution": { "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" } - } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" }, "overrides": [] }, @@ -1381,46 +1399,20 @@ "h": 7, "w": 8, "x": 0, - "y": 27 + "y": 25 }, - "id": 2, + "id": 44, "options": { - "calculate": true, - "calculation": {}, - "cellGap": 2, - "cellValues": {}, - "color": { - "exponent": 0.5, - "fill": "#b4ff00", - "mode": "scheme", - "reverse": false, - "scale": "exponential", - "scheme": "Oranges", - "steps": 128 - }, - "exemplars": { - "color": "rgba(255,0,255,0.7)" - }, - "filterValues": { - "le": 1e-9 - }, "legend": { - "show": false + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": true }, - "rowsFrame": { - "layout": "auto" - }, - "showValue": "never", "tooltip": { - "mode": "single", - "showColorScale": false, - "yHistogram": false - }, - "yAxis": { - "axisPlacement": "left", - "min": "0", - "reverse": false, - "unit": "short" + "hideZeros": false, + "mode": "multi", + "sort": "desc" } }, "pluginVersion": "12.3.0", @@ -1430,16 +1422,18 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "editorMode": "code", - "expr": "rate(registry_http_request_duration_seconds_bucket{handler=\"blob_upload\"}[10m])", - "format": "heatmap", - "intervalFactor": 1, - "range": true, - "refId": "A" + "expr": "increase(registry_registry_storage_action_seconds_sum{instance=\"$instance\"}[2m]) * 1000", + "format": "time_series", + "instant": false, + "intervalFactor": 2, + "legendFormat": "{{ action }}", + "refId": "A", + "step": 10, + "target": "" } ], - "title": "Upload HTTP Request Latencies in seconds (blob_upload)", - "type": "heatmap" + "title": "Registry Action Latency", + "type": "timeseries" }, { "datasource": { @@ -1606,6 +1600,90 @@ ], "title": "Catalog HTTP Request Latencies in seconds (catalog)", "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 32 + }, + "id": 2, + "options": { + "calculate": true, + "calculation": {}, + "cellGap": 2, + "cellValues": {}, + "color": { + "exponent": 0.5, + "fill": "#b4ff00", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Oranges", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": false + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "min": "0", + "reverse": false, + "unit": "short" + } + }, + "pluginVersion": "12.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "rate(registry_http_request_duration_seconds_bucket{handler=\"blob_upload\"}[10m])", + "format": "heatmap", + "intervalFactor": 1, + "range": true, + "refId": "A" + } + ], + "title": "Upload HTTP Request Latencies in seconds (blob_upload)", + "type": "heatmap" } ], "preload": false, @@ -1675,12 +1753,12 @@ ] }, "time": { - "from": "now-6h", + "from": "now-12h", "to": "now" }, "timepicker": {}, "timezone": "", "title": "Docker Registry", "uid": "CoBSgj8iz", - "version": 6 + "version": 10 } diff --git a/modules/kubernetes/monitoring/prometheus_chart_values.tpl b/modules/kubernetes/monitoring/prometheus_chart_values.tpl index ecaee66c..6b5466d8 100644 --- a/modules/kubernetes/monitoring/prometheus_chart_values.tpl +++ b/modules/kubernetes/monitoring/prometheus_chart_values.tpl @@ -212,13 +212,6 @@ serverFiles: severity: page annotations: summary: "High system load: {{ $value }}. Can signal runaway process." - - alert: DockerRegistryDown - expr: (registry_process_start_time_seconds or on() vector(0)) == 0 - for: 10m - labels: - severity: page - annotations: - summary: "Docker registry is down" - name: Nvidia Tesla T4 GPU rules: - alert: HighGPUTemp @@ -294,6 +287,20 @@ serverFiles: severity: page annotations: summary: Node {{$labels.instance}} down. + - alert: DockerRegistryDown + expr: (registry_process_start_time_seconds or on() vector(0)) == 0 + for: 10m + labels: + severity: page + annotations: + summary: "Docker registry is down" + - alert: RegistryLowCacheHitRate + expr: (sum by (job) (rate(registry_registry_storage_cache_total{type="Hit"}[15m]))) / (sum by (job) (rate(registry_registry_storage_cache_total{type="Request"}[15m]))) * 100 < 50 + for: 12h + labels: + severity: page + annotations: + summary: "Low registry cache hit rate" - alert: NodeHighCPUUsage expr: node_load1{instance!="pve-node-r730"} > 2 for: 20m diff --git a/terraform.tfstate b/terraform.tfstate index 7b5417e7..5b056f64 100644 Binary files a/terraform.tfstate and b/terraform.tfstate differ