# infra/stacks/nvidia/modules/nvidia/values.yaml

driver:
  enabled: true
  # repository: nvcr.io/nvidia/driver
  # Choose a driver version compatible with your GPU + CUDA 12.x (example).
  # NVIDIA GPU driver platform support / known issues:
  #   https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/platform-support.html#known-issue
  # CUDA toolkit release notes:
  #   https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/
  # CUDA release -> required driver branch:
  #   13.x >= 580
  #   12.x >= 525, <580
  #   11.x >= 450, <525
  #
  # Delete the cluster policy before each change.
  # version: "575.57.08" # CUDA 12.9
  #
  # 2026-05-17: tried bumping to 580.x with kernelModuleType=open but
  # NVIDIA has NOT published any nvcr.io/nvidia/driver:*-ubuntu26.04
  # images yet (skopeo list-tags shows 0 ubuntu26.04 tags vs 779 for
  # ubuntu22.04 and 206 for ubuntu24.04). Chart v26.3.1's operator
  # auto-detects the host OS (k8s-node1 was upgraded to Ubuntu 26.04
  # with kernel 7.0.0-15-generic) and picks `<version>-ubuntu26.04` —
  # which then 404s on pull. Rolled back to chart v25.10.1 + this
  # 570.195.03 pin, which uses the ubuntu24.04 image suffix. That
  # image still can't compile against kernel 7.0.0 (apt sources are
  # 24.04 noble, which doesn't ship linux-headers-7.0.0-15-generic),
  # so the host kernel needs to be rolled back to 6.8.0-117-generic
  # (still installed in /boot) before the driver can come up.
  # See post-mortem 2026-05-17-gpu-driver-ubuntu2604-mismatch.md.
  version: "570.195.03" # CUDA 12.8 — pinned until NVIDIA ships ubuntu26.04 images
  # Keep driver upgrades manual so the version pin above is what actually runs.
  upgradePolicy:
    autoUpgrade: false

  # 2026-05-17: bumped from the namespace LimitRange default of 128Mi.
  # The driver-installer's `apt-get install linux-headers-<kernel>` step
  # exceeded 128Mi and OOMKilled (exit 137) before producing any visible
  # output beyond "Installing Linux kernel headers...". 2Gi limit gives
  # the apt + module-compile phase enough headroom (peak observed ~1.4Gi
  # while DKMS builds the kernel module).
  resources:
    requests:
      cpu: "50m"
      memory: "822Mi"
    limits:
      memory: "2Gi"

  # 2026-05-25: extended startup probe from 120 to 300 failures.
  # On k8s-node1 (6 vCPUs, 16Gi RAM, Ubuntu 24.04 + 6.8.0-117-generic),
  # the full driver install sequence — apt install linux-headers (~2min) +
  # gcc make -j16 kernel module compilation (~12min) + nvidia-installer
  # file copy (~7min) = ~21min total, which exactly exhausted the default
  # 120×10s=20min window (exit 137 = SIGKILL from startup probe).
  # 300×10s = 50min gives 2.5× headroom on this hardware.
  startupProbe:
    failureThreshold: 300

# Device plugin settings are a top-level section of the GPU Operator chart
# (a sibling of `driver`, like `dcgmExporter` and `validator` below). Nested
# under `driver:` this reference is not consumed by the chart, so the
# time-slicing configuration never reaches the device plugin.
# `name` is the ConfigMap holding the time-slicing profiles.
# NOTE(review): no `config.default` is set here — presumably nodes select a
# profile via the `nvidia.com/device-plugin.config` label; confirm.
devicePlugin:
  config:
    name: time-slicing-config

# DCGM Exporter: memory request and limit are both pinned at 768Mi.
# This was reduced to 768Mi; actual usage is ~489Mi, so roughly a 1.5x margin.
dcgmExporter:
  resources:
    requests:
      memory: '768Mi'
    limits:
      memory: '768Mi'

# CUDA Validator: memory request and limit are both 256Mi.
# Reduced from 1024Mi because the validator is a one-shot job.
validator:
  resources:
    requests:
      memory: '256Mi'
    limits:
      memory: '256Mi'

# Allow the GPU Operator daemonsets to schedule onto GPU nodes that carry
# the `nvidia.com/gpu=true:NoSchedule` taint.
daemonsets:
  tolerations:
    - key: 'nvidia.com/gpu'
      operator: 'Equal'
      value: 'true'
      effect: 'NoSchedule'