---
# Value overrides for the GPU operator components configured below
# (presumably the NVIDIA GPU Operator Helm chart - confirm against the
# chart version in use).

# NVIDIA GPU driver
# Platform support / known issues:
#   https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/platform-support.html#known-issue
# CUDA toolkit release notes:
#   https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/
driver:
  enabled: true
  # repository: nvcr.io/nvidia/driver
  # Choose a driver version compatible with your GPU + CUDA 12.x (example).
  # Driver range required per CUDA major version:
  #   13.x >= 580
  #   12.x >= 525, <580
  #   11.x >= 450, <525
  #
  # Delete the cluster policy before each change
  # version: "575.57.08"  # CUDA 12.9
  # Quoted so the version is always read as a string, never as a number.
  version: "570.195.03"  # CUDA 12.8
  upgradePolicy:
    autoUpgrade: false

# Device plugin configuration, referenced by name.
# NOTE(review): "time-slicing-config" is presumably defined elsewhere
# (e.g. a ConfigMap) - verify it exists in the operator namespace.
devicePlugin:
  config:
    name: time-slicing-config

# DCGM Exporter - reduced from 2560Mi to 1536Mi based on VPA upper bound of
# 1459Mi (1.05x margin)
dcgmExporter:
  resources:
    requests:
      memory: "1536Mi"
    limits:
      memory: "1536Mi"

# CUDA Validator - reduced from 1024Mi to 256Mi (one-shot job)
validator:
  resources:
    requests:
      memory: "256Mi"
    limits:
      memory: "256Mi"

# Tolerate GPU node taint for all GPU operator components
daemonsets:
  tolerations:
    - key: "nvidia.com/gpu"
      operator: "Equal"
      # Quoted on purpose: an unquoted true would be parsed as a boolean,
      # not as the string "true".
      value: "true"
      effect: "NoSchedule"