diff --git a/modules/kubernetes/nvidia/main.tf b/modules/kubernetes/nvidia/main.tf index 1cef3968..e1c6d425 100644 --- a/modules/kubernetes/nvidia/main.tf +++ b/modules/kubernetes/nvidia/main.tf @@ -49,7 +49,7 @@ resource "helm_release" "nvidia-gpu-operator" { # version = "0.9.3" timeout = 6000 - # values = [templatefile("${path.module}/values.yaml", {})] + values = [templatefile("${path.module}/values.yaml", {})] } resource "kubernetes_deployment" "nvidia-exporter" { diff --git a/modules/kubernetes/nvidia/values.yaml b/modules/kubernetes/nvidia/values.yaml index 97115270..f2611ea0 100644 --- a/modules/kubernetes/nvidia/values.yaml +++ b/modules/kubernetes/nvidia/values.yaml @@ -1,48 +1,15 @@ -# values-cuda12.8.yaml driver: enabled: true - repository: nvcr.io/nvidia/driver + # repository: nvcr.io/nvidia/driver # choose a driver version compatible with your GPU + CUDA 12.x (example) - version: "535.113.01" - -toolkit: - enabled: true - image: - repository: nvcr.io/nvidia/cuda - # use a CUDA 12.8 toolkit image; adjust OS tag as needed (e.g. ubuntu22.04, ubuntu20.04) - tag: "12.8.1-devel-ubuntu22.04" - # (Optional) If you have containerd, you might need environment for containerd config - # env: - # - name: CONTAINERD_CONFIG - # value: /etc/containerd/config.toml - # - name: CONTAINERD_SOCKET - # value: /run/containerd/containerd.sock - # - name: RUNTIME_CONFIG_SOURCE - # value: "command, file" - -devicePlugin: - enabled: true - image: - repository: nvcr.io/nvidia/k8s-device-plugin - # pick a device-plugin build compatible with CUDA 12.8 - tag: "1.14.0-12.8-ubuntu22.04" - -dcgmExporter: - enabled: true - image: - repository: nvcr.io/nvidia/k8s/dcgm-exporter - tag: "2.12.13-12.8-ubuntu22.04" - -# Optional: if you use MIG or want MIG Manager -# migManager: -# enabled: false - -# If you want GPU‑Feature‑Discovery (GFD) — optional -# gfd: -# enabled: false - -# Other optional overrides (keep defaults or adjust as needed) -# ccManager: -# enabled: false -# cdi: -# enabled: true + # NVIDIA GPU driver - https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/platform-support.html#known-issue + # https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/ + # 13.x >= 580 + # 12.x >= 525, <580 + # 11.x >= 450, <525 + # + # Delete the cluster policy before each change + # version: "575.57.08" # CUDA 12.9 + version: "570.195.03" # CUDA 12.8 + upgradePolicy: + autoUpgrade: false