2025-12-14 09:49:58 +00:00
|
|
|
# GPU driver settings: enabled, version pinned to 570.195.03, automatic upgrades disabled.
driver:
|
|
|
|
|
enabled: true
|
2025-12-14 19:09:07 +00:00
|
|
|
# repository: nvcr.io/nvidia/driver
|
2025-12-14 09:49:58 +00:00
|
|
|
# Choose a driver version compatible with your GPU and CUDA release (CUDA 12.x in this example).
|
2025-12-14 19:09:07 +00:00
|
|
|
# NVIDIA GPU driver - platform support and known issues: https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/platform-support.html#known-issue
|
|
|
|
|
# CUDA toolkit release notes: https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/
|
|
|
|
|
# CUDA 13.x: driver >= 580
|
|
|
|
|
# CUDA 12.x: driver >= 525, < 580
|
|
|
|
|
# CUDA 11.x: driver >= 450, < 525
|
|
|
|
|
#
|
|
|
|
|
# Delete the cluster policy before applying each change.
|
|
|
|
|
# version: "575.57.08" # CUDA 12.9
|
|
|
|
|
version: "570.195.03" # CUDA 12.8
|
|
|
|
|
upgradePolicy:
|
|
|
|
|
autoUpgrade: false
|
2025-12-28 08:35:36 +00:00
|
|
|
|
|
|
|
|
# Device plugin: points at the configuration named time-slicing-config.
devicePlugin:
|
|
|
|
|
config:
|
|
|
|
|
name: time-slicing-config
|
2026-02-06 20:19:26 +00:00
|
|
|
|
right-size 14 services and scale down GPU-heavy workloads [ci skip]
Memory right-sizing based on VPA upperBound analysis:
- Increases: stirling-pdf 1200→1536Mi, claude-memory 64→128Mi,
dawarich 512→768Mi, kyverno-cleanup 128→192Mi, linkwarden 768→1Gi,
navidrome 64→128Mi, listenarr 768→896Mi, privatebin 64→128Mi,
ntfy 64→128Mi, health 128→256Mi, dbaas quota 16→20Gi,
mysql-operator 384→512Mi
- Decreases: rybbit 768→384Mi, nvidia-exporter added explicit 192Mi,
dcgm-exporter 2560→1536Mi
- Scale to 0: ebook2audiobook/audiblez-web, whisper (GPU node pressure)
Net effect: -496Mi cluster-wide, 13 ContainerNearOOM alerts resolved,
all ResourceQuota pressures cleared, GPU health green.
2026-03-15 23:00:49 +00:00
|
|
|
# DCGM Exporter - memory reduced from 2560Mi to 1536Mi based on a VPA upper bound of 1459Mi (~1.05x margin); request and limit are set to the same value.
|
2026-03-15 15:36:19 +00:00
|
|
|
dcgmExporter:
|
|
|
|
|
resources:
|
|
|
|
|
requests:
|
right-size 14 services and scale down GPU-heavy workloads [ci skip]
Memory right-sizing based on VPA upperBound analysis:
- Increases: stirling-pdf 1200→1536Mi, claude-memory 64→128Mi,
dawarich 512→768Mi, kyverno-cleanup 128→192Mi, linkwarden 768→1Gi,
navidrome 64→128Mi, listenarr 768→896Mi, privatebin 64→128Mi,
ntfy 64→128Mi, health 128→256Mi, dbaas quota 16→20Gi,
mysql-operator 384→512Mi
- Decreases: rybbit 768→384Mi, nvidia-exporter added explicit 192Mi,
dcgm-exporter 2560→1536Mi
- Scale to 0: ebook2audiobook/audiblez-web, whisper (GPU node pressure)
Net effect: -496Mi cluster-wide, 13 ContainerNearOOM alerts resolved,
all ResourceQuota pressures cleared, GPU health green.
2026-03-15 23:00:49 +00:00
|
|
|
memory: "1536Mi"
|
2026-03-15 15:36:19 +00:00
|
|
|
limits:
|
right-size 14 services and scale down GPU-heavy workloads [ci skip]
Memory right-sizing based on VPA upperBound analysis:
- Increases: stirling-pdf 1200→1536Mi, claude-memory 64→128Mi,
dawarich 512→768Mi, kyverno-cleanup 128→192Mi, linkwarden 768→1Gi,
navidrome 64→128Mi, listenarr 768→896Mi, privatebin 64→128Mi,
ntfy 64→128Mi, health 128→256Mi, dbaas quota 16→20Gi,
mysql-operator 384→512Mi
- Decreases: rybbit 768→384Mi, nvidia-exporter added explicit 192Mi,
dcgm-exporter 2560→1536Mi
- Scale to 0: ebook2audiobook/audiblez-web, whisper (GPU node pressure)
Net effect: -496Mi cluster-wide, 13 ContainerNearOOM alerts resolved,
all ResourceQuota pressures cleared, GPU health green.
2026-03-15 23:00:49 +00:00
|
|
|
memory: "1536Mi"
|
2026-03-15 15:36:19 +00:00
|
|
|
|
|
|
|
|
# CUDA Validator - memory reduced from 1024Mi to 256Mi (one-shot job); request and limit are both 256Mi.
|
|
|
|
|
validator:
|
|
|
|
|
resources:
|
|
|
|
|
requests:
|
|
|
|
|
memory: "256Mi"
|
|
|
|
|
limits:
|
|
|
|
|
memory: "256Mi"
|
|
|
|
|
|
2026-02-06 20:19:26 +00:00
|
|
|
# Tolerate the GPU node taint (nvidia.com/gpu=true:NoSchedule) for all GPU operator components
|
|
|
|
|
daemonsets:
|
|
|
|
|
tolerations:
|
|
|
|
|
- key: "nvidia.com/gpu"
|
|
|
|
|
operator: "Equal"
|
|
|
|
|
value: "true"
|
|
|
|
|
effect: "NoSchedule"
|