From 34f90c06dce0fdfad4b72a5488b004295113ebbe Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sun, 28 Dec 2025 20:05:27 +0000 Subject: [PATCH] move grafana into separate file and tunr off persistence as we use external db now [ci skip] --- .../monitoring/dashboards/nvidia.json | 70 ++++++++++--------- modules/kubernetes/monitoring/grafana.tf | 68 ++++++++++++++++++ .../monitoring/grafana_chart_values.yaml | 3 +- 3 files changed, 108 insertions(+), 33 deletions(-) create mode 100644 modules/kubernetes/monitoring/grafana.tf diff --git a/modules/kubernetes/monitoring/dashboards/nvidia.json b/modules/kubernetes/monitoring/dashboards/nvidia.json index 27ca72ea..ee9a21ac 100644 --- a/modules/kubernetes/monitoring/dashboards/nvidia.json +++ b/modules/kubernetes/monitoring/dashboards/nvidia.json @@ -19,9 +19,8 @@ "description": "This dashboard is to display the metrics from DCGM Exporter on a Kubernetes (1.13+) cluster", "editable": true, "fiscalYearStartMonth": 0, - "gnetId": 12239, "graphTooltip": 0, - "id": 26, + "id": 0, "links": [], "panels": [ { @@ -41,6 +40,7 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", @@ -57,6 +57,7 @@ "type": "linear" }, "showPoints": "never", + "showValues": false, "spanNulls": false, "stacking": { "group": "A", @@ -72,7 +73,7 @@ "steps": [ { "color": "green", - "value": null + "value": 0 }, { "color": "red", @@ -103,10 +104,12 @@ "showLegend": true }, "tooltip": { + "hideZeros": false, "mode": "multi", "sort": "none" } }, + "pluginVersion": "12.3.0", "targets": [ { "datasource": { @@ -142,7 +145,7 @@ "steps": [ { "color": "green", - "value": null + "value": 0 }, { "color": "#EAB839", @@ -180,7 +183,7 @@ "showThresholdMarkers": true, "sizing": "auto" }, - "pluginVersion": "11.1.0", + "pluginVersion": "12.3.0", "targets": [ { "datasource": { @@ -215,6 +218,7 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", @@ -231,6 +235,7 @@ "type": "linear" }, "showPoints": "never", + "showValues": false, "spanNulls": false, "stacking": { "group": "A", @@ -246,7 +251,7 @@ "steps": [ { "color": "green", - "value": null + "value": 0 }, { "color": "red", @@ -277,11 +282,12 @@ "showLegend": true }, "tooltip": { + "hideZeros": false, "mode": "multi", "sort": "none" } }, - "pluginVersion": "6.5.2", + "pluginVersion": "12.3.0", "targets": [ { "datasource": { @@ -317,7 +323,7 @@ "steps": [ { "color": "green", - "value": null + "value": 0 }, { "color": "#EAB839", @@ -355,7 +361,7 @@ "showThresholdMarkers": true, "sizing": "auto" }, - "pluginVersion": "11.1.0", + "pluginVersion": "12.3.0", "targets": [ { "datasource": { @@ -363,10 +369,12 @@ "uid": "PBFA97CFB590B2093" }, "editorMode": "code", + "exemplar": false, "expr": "sum(nvidia_tesla_t4_DCGM_FI_DEV_POWER_USAGE)", + "instant": true, "interval": "", "legendFormat": "", - "range": true, + "range": false, "refId": "A" } ], @@ -390,6 +398,7 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", @@ -406,6 +415,7 @@ "type": "linear" }, "showPoints": "never", + "showValues": false, "spanNulls": false, "stacking": { "group": "A", @@ -423,7 +433,7 @@ "steps": [ { "color": "green", - "value": null + "value": 0 }, { "color": "red", @@ -454,10 +464,12 @@ "showLegend": true }, "tooltip": { + "hideZeros": false, "mode": "multi", "sort": "none" } }, + "pluginVersion": "12.3.0", "targets": [ { "datasource": { @@ -492,6 +504,7 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", @@ -508,6 +521,7 @@ "type": "linear" }, "showPoints": "never", + "showValues": false, "spanNulls": false, "stacking": { "group": "A", @@ -523,7 +537,7 @@ "steps": [ { "color": "green", - "value": null + "value": 0 }, { "color": "red", @@ -553,10 +567,12 @@ "showLegend": true }, "tooltip": { + "hideZeros": false, "mode": "multi", "sort": "none" } }, + "pluginVersion": "12.3.0", "targets": [ { "datasource": { @@ -591,6 +607,7 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", @@ -607,6 +624,7 @@ "type": "linear" }, "showPoints": "never", + "showValues": false, "spanNulls": false, "stacking": { "group": "A", @@ -622,7 +640,7 @@ "steps": [ { "color": "green", - "value": null + "value": 0 }, { "color": "red", @@ -641,7 +659,6 @@ "y": 24 }, "id": 2, - "interval": "", "options": { "legend": { "calcs": [ @@ -654,10 +671,12 @@ "showLegend": true }, "tooltip": { + "hideZeros": false, "mode": "multi", "sort": "none" } }, + "pluginVersion": "12.3.0", "targets": [ { "datasource": { @@ -678,33 +697,20 @@ "type": "timeseries" } ], + "preload": false, "refresh": "auto", - "schemaVersion": 39, + "schemaVersion": 42, "tags": [], "templating": { "list": [] }, "time": { - "from": "now-30m", + "from": "now-3h", "to": "now" }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ] - }, + "timepicker": {}, "timezone": "", "title": "NVIDIA DCGM Exporter Dashboard", "uid": "Oxed_c6Wz", - "version": 7, - "weekStart": "" + "version": 8 } diff --git a/modules/kubernetes/monitoring/grafana.tf b/modules/kubernetes/monitoring/grafana.tf new file mode 100644 index 00000000..e31ce1a9 --- /dev/null +++ b/modules/kubernetes/monitoring/grafana.tf @@ -0,0 +1,68 @@ + +# resource "kubernetes_persistent_volume" "prometheus_grafana_pv" { +# metadata { +# name = "grafana-pv" +# } +# spec { +# capacity = { +# "storage" = "2Gi" +# } +# access_modes = ["ReadWriteOnce"] +# persistent_volume_source { +# nfs { +# path = "/mnt/main/grafana" +# server = "10.0.10.15" +# } +# # iscsi { +# # target_portal = "iscsi.viktorbarzin.lan:3260" +# # iqn = "iqn.2020-12.lan.viktorbarzin:storage:monitoring:grafana" +# # lun = 0 +# # fs_type = "ext4" +# # } +# } +# } +# } + +resource "kubernetes_persistent_volume" "alertmanager_pv" { + metadata { + name = "alertmanager-pv" + } + spec { + capacity = { + "storage" = "2Gi" + } + access_modes = ["ReadWriteOnce"] + persistent_volume_source { + nfs { + path = "/mnt/main/alertmanager" + server = "10.0.10.15" + } + } + } +} +# resource "kubernetes_persistent_volume_claim" "grafana_pvc" { +# metadata { +# name = "grafana-pvc" +# namespace = "monitoring" +# } +# spec { +# access_modes = ["ReadWriteOnce"] +# resources { +# requests = { +# "storage" = "2Gi" +# } +# } +# } +# } + +resource "helm_release" "grafana" { + namespace = "monitoring" + create_namespace = true + name = "grafana" + atomic = true + + repository = "https://grafana.github.io/helm-charts" + chart = "grafana" + + values = [templatefile("${path.module}/grafana_chart_values.yaml", { db_password = var.grafana_db_password })] +} diff --git a/modules/kubernetes/monitoring/grafana_chart_values.yaml b/modules/kubernetes/monitoring/grafana_chart_values.yaml index 5b742b3c..812ba4e9 100644 --- a/modules/kubernetes/monitoring/grafana_chart_values.yaml +++ b/modules/kubernetes/monitoring/grafana_chart_values.yaml @@ -1,7 +1,7 @@ deploymentStrategy: type: Recreate persistence: - enabled: true + enabled: false # using external mysql existingClaim: "grafana-pvc" ingress: enabled: "true" @@ -39,6 +39,7 @@ dashboardProviders: path: "/var/lib/grafana/dashboards/default" env: GF_DATABASE_PASSWORD: "${db_password}" + GF_SERVER_ROOT_URL: https://grafana.viktorbarzin.me grafana.ini: database: