Downgrade NVIDIA driver to work with CUDA 12.8 [ci skip]
This commit is contained in:
parent
397fa0cba7
commit
308ce0019d
2 changed files with 13 additions and 46 deletions
|
|
@ -49,7 +49,7 @@ resource "helm_release" "nvidia-gpu-operator" {
|
|||
# version = "0.9.3"
|
||||
timeout = 6000
|
||||
|
||||
# values = [templatefile("${path.module}/values.yaml", {})]
|
||||
values = [templatefile("${path.module}/values.yaml", {})]
|
||||
}
|
||||
|
||||
resource "kubernetes_deployment" "nvidia-exporter" {
|
||||
|
|
|
|||
|
|
@ -1,48 +1,15 @@
|
|||
# values-cuda12.8.yaml
|
||||
driver:
|
||||
enabled: true
|
||||
repository: nvcr.io/nvidia/driver
|
||||
# repository: nvcr.io/nvidia/driver
|
||||
# choose a driver version compatible with your GPU + CUDA 12.x (example)
|
||||
version: "535.113.01"
|
||||
|
||||
toolkit:
|
||||
enabled: true
|
||||
image:
|
||||
repository: nvcr.io/nvidia/cuda
|
||||
# use a CUDA 12.8 toolkit image; adjust OS tag as needed (e.g. ubuntu22.04, ubuntu20.04)
|
||||
tag: "12.8.1-devel-ubuntu22.04"
|
||||
# (Optional) If you use containerd, you may need to set environment variables for the containerd config
|
||||
# env:
|
||||
# - name: CONTAINERD_CONFIG
|
||||
# value: /etc/containerd/config.toml
|
||||
# - name: CONTAINERD_SOCKET
|
||||
# value: /run/containerd/containerd.sock
|
||||
# - name: RUNTIME_CONFIG_SOURCE
|
||||
# value: "command, file"
|
||||
|
||||
devicePlugin:
|
||||
enabled: true
|
||||
image:
|
||||
repository: nvcr.io/nvidia/k8s-device-plugin
|
||||
# pick a device-plugin build compatible with CUDA 12.8
|
||||
tag: "1.14.0-12.8-ubuntu22.04"
|
||||
|
||||
dcgmExporter:
|
||||
enabled: true
|
||||
image:
|
||||
repository: nvcr.io/nvidia/k8s/dcgm-exporter
|
||||
tag: "2.12.13-12.8-ubuntu22.04"
|
||||
|
||||
# Optional: if you use MIG or want MIG Manager
|
||||
# migManager:
|
||||
# enabled: false
|
||||
|
||||
# Optional: if you want GPU Feature Discovery (GFD)
|
||||
# gfd:
|
||||
# enabled: false
|
||||
|
||||
# Other optional overrides (keep defaults or adjust as needed)
|
||||
# ccManager:
|
||||
# enabled: false
|
||||
# cdi:
|
||||
# enabled: true
|
||||
# NVIDIA GPU driver - https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/platform-support.html#known-issue
|
||||
# Driver version ranges per CUDA major version (see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/):
|
||||
# 13.x >= 580
|
||||
# 12.x >= 525, <580
|
||||
# 11.x >= 450, <525
|
||||
#
|
||||
# Delete the cluster policy before each change
|
||||
# version: "575.57.08" # CUDA 12.9
|
||||
version: "570.195.03" # CUDA 12.8
|
||||
upgradePolicy:
|
||||
autoUpgrade: false
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue