# Change notes:
# - immich: add lifecycle ignore_changes for dns_config on all 3 deployments
#   to prevent perpetual plan drift from Kyverno ndots:2 mutation policy
# - nvidia dcgm-exporter: 768Mi → 2560Mi (VPA upper 2091Mi, was under-provisioned)
# - nvidia cuda-validator: 1024Mi → 256Mi (one-shot job, vastly over-provisioned)
---
# NVIDIA GPU Operator chart values.
driver:
  enabled: true
  # repository: nvcr.io/nvidia/driver
  # choose a driver version compatible with your GPU + CUDA 12.x (example)
  # NVIDIA GPU driver - https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/platform-support.html#known-issue
  # https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/
  # 13.x >= 580
  # 12.x >= 525, <580
  # 11.x >= 450, <525
  #
  # Delete the cluster policy before each change
  # version: "575.57.08" # CUDA 12.9
  version: "570.195.03"  # CUDA 12.8
  upgradePolicy:
    autoUpgrade: false

devicePlugin:
  config:
    # ConfigMap with the time-slicing device-plugin configuration
    name: time-slicing-config

# DCGM Exporter - increased from 768Mi to 2560Mi based on VPA upper bound of 2091Mi
dcgmExporter:
  resources:
    requests:
      memory: "2560Mi"
    limits:
      memory: "2560Mi"

# CUDA Validator - reduced from 1024Mi to 256Mi (one-shot job)
validator:
  resources:
    requests:
      memory: "256Mi"
    limits:
      memory: "256Mi"

# Tolerate GPU node taint for all GPU operator components
daemonsets:
  tolerations:
    - key: "nvidia.com/gpu"
      operator: "Equal"
      value: "true"
      effect: "NoSchedule"