extract monitoring, nvidia, mailserver, cloudflared, kyverno from platform [ci skip]
Phase 2 of platform stack split. 5 more modules extracted into independent stacks. All applied successfully with zero destroys. Cloudflared now reads k8s_users from Vault directly to compute user_domains. Woodpecker pipeline runs all 8 extracted stacks in parallel. Memory bumped to 6Gi for 9 concurrent TF processes. Platform reduced from 27 to 19 modules.
This commit is contained in:
parent
3c804aedf8
commit
ae36dc253b
73 changed files with 166093 additions and 96 deletions
27
stacks/nvidia/modules/nvidia/Dockerfile
Normal file
27
stacks/nvidia/modules/nvidia/Dockerfile
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
# GPU container
|
||||
|
||||
FROM ubuntu
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
# Install Python and pip
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
python3 \
|
||||
python3-pip \
|
||||
python3-venv
|
||||
|
||||
# Deps
|
||||
RUN apt-get install -y ffmpeg espeak-ng
|
||||
|
||||
# Set a working directory
|
||||
WORKDIR /app
|
||||
|
||||
RUN python3 -m venv audiblez && ./audiblez/bin/pip install audiblez
|
||||
# RUN python3 -m venv audiblez
|
||||
|
||||
CMD ["/usr/bin/sleep", "86400"]
|
||||
# RUN pip install audiblez
|
||||
|
||||
# # Default command
|
||||
# CMD ["/usr/bin/sleep", "86400"]
|
||||
688
stacks/nvidia/modules/nvidia/main.tf
Normal file
688
stacks/nvidia/modules/nvidia/main.tf
Normal file
|
|
@ -0,0 +1,688 @@
|
|||
variable "tls_secret_name" {}
|
||||
variable "tier" { type = string }
|
||||
|
||||
module "tls_secret" {
|
||||
source = "../../../../modules/kubernetes/setup_tls_secret"
|
||||
namespace = kubernetes_namespace.nvidia.metadata[0].name
|
||||
tls_secret_name = var.tls_secret_name
|
||||
}
|
||||
|
||||
resource "kubernetes_namespace" "nvidia" {
|
||||
metadata {
|
||||
name = "nvidia"
|
||||
labels = {
|
||||
"istio-injection" : "disabled"
|
||||
tier = var.tier
|
||||
"resource-governance/custom-quota" = "true"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
resource "kubernetes_resource_quota" "nvidia_quota" {
|
||||
metadata {
|
||||
name = "tier-quota"
|
||||
namespace = kubernetes_namespace.nvidia.metadata[0].name
|
||||
}
|
||||
spec {
|
||||
hard = {
|
||||
"limits.memory" = "48Gi"
|
||||
"requests.cpu" = "8"
|
||||
"requests.memory" = "12Gi"
|
||||
pods = "40"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Apply GPU taint and label to ensure only GPU workloads run on GPU node
|
||||
resource "null_resource" "gpu_node_config" {
|
||||
provisioner "local-exec" {
|
||||
command = <<-EOT
|
||||
kubectl taint nodes k8s-node1 nvidia.com/gpu=true:PreferNoSchedule --overwrite
|
||||
kubectl label nodes k8s-node1 gpu=true --overwrite
|
||||
EOT
|
||||
}
|
||||
|
||||
# Re-run if namespace changes (proxy for cluster changes)
|
||||
triggers = {
|
||||
namespace = kubernetes_namespace.nvidia.metadata[0].name
|
||||
}
|
||||
}
|
||||
|
||||
# [not needed anymore; part of the chart values] Apply to operator with:
|
||||
# kubectl patch clusterpolicies.nvidia.com/cluster-policy -n gpu-operator --type merge -p '{"spec": {"devicePlugin": {"config": {"name": "time-slicing-config", "default": "any"}}}}'
|
||||
|
||||
resource "kubernetes_config_map" "time_slicing_config" {
|
||||
metadata {
|
||||
name = "time-slicing-config"
|
||||
namespace = kubernetes_namespace.nvidia.metadata[0].name
|
||||
}
|
||||
|
||||
data = {
|
||||
any = <<-EOF
|
||||
flags:
|
||||
migStrategy: none
|
||||
sharing:
|
||||
timeSlicing:
|
||||
renameByDefault: false
|
||||
failRequestsGreaterThanOne: false
|
||||
resources:
|
||||
- name: nvidia.com/gpu
|
||||
replicas: 100
|
||||
EOF
|
||||
}
|
||||
depends_on = [kubernetes_namespace.nvidia]
|
||||
}
|
||||
|
||||
resource "helm_release" "nvidia-gpu-operator" {
|
||||
namespace = kubernetes_namespace.nvidia.metadata[0].name
|
||||
name = "nvidia-gpu-operator"
|
||||
|
||||
repository = "https://helm.ngc.nvidia.com/nvidia"
|
||||
chart = "gpu-operator"
|
||||
atomic = true
|
||||
# version = "0.9.3"
|
||||
timeout = 6000
|
||||
|
||||
values = [templatefile("${path.module}/values.yaml", {})]
|
||||
depends_on = [kubernetes_config_map.time_slicing_config]
|
||||
}
|
||||
|
||||
resource "kubernetes_deployment" "nvidia-exporter" {
|
||||
metadata {
|
||||
name = "nvidia-exporter"
|
||||
namespace = kubernetes_namespace.nvidia.metadata[0].name
|
||||
labels = {
|
||||
app = "nvidia-exporter"
|
||||
tier = var.tier
|
||||
}
|
||||
}
|
||||
spec {
|
||||
replicas = 1
|
||||
selector {
|
||||
match_labels = {
|
||||
app = "nvidia-exporter"
|
||||
}
|
||||
}
|
||||
template {
|
||||
metadata {
|
||||
labels = {
|
||||
app = "nvidia-exporter"
|
||||
}
|
||||
}
|
||||
spec {
|
||||
node_selector = {
|
||||
"gpu" : "true"
|
||||
}
|
||||
toleration {
|
||||
key = "nvidia.com/gpu"
|
||||
operator = "Equal"
|
||||
value = "true"
|
||||
effect = "NoSchedule"
|
||||
}
|
||||
container {
|
||||
image = "nvidia/dcgm-exporter:latest"
|
||||
name = "nvidia-exporter"
|
||||
port {
|
||||
container_port = 9400
|
||||
}
|
||||
security_context {
|
||||
privileged = true
|
||||
capabilities {
|
||||
add = ["SYS_ADMIN"]
|
||||
}
|
||||
}
|
||||
resources {
|
||||
requests = {
|
||||
memory = "192Mi"
|
||||
}
|
||||
limits = {
|
||||
memory = "192Mi"
|
||||
"nvidia.com/gpu" = "1"
|
||||
}
|
||||
}
|
||||
}
|
||||
dns_config {
|
||||
option {
|
||||
name = "ndots"
|
||||
value = "2"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
depends_on = [helm_release.nvidia-gpu-operator]
|
||||
}
|
||||
|
||||
resource "kubernetes_service" "nvidia-exporter" {
|
||||
metadata {
|
||||
name = "nvidia-exporter"
|
||||
namespace = kubernetes_namespace.nvidia.metadata[0].name
|
||||
labels = {
|
||||
"app" = "nvidia-exporter"
|
||||
}
|
||||
}
|
||||
|
||||
spec {
|
||||
selector = {
|
||||
app = "nvidia-exporter"
|
||||
}
|
||||
port {
|
||||
name = "http"
|
||||
port = 80
|
||||
target_port = 9400
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
module "ingress" {
|
||||
source = "../../../../modules/kubernetes/ingress_factory"
|
||||
namespace = kubernetes_namespace.nvidia.metadata[0].name
|
||||
name = "nvidia-exporter"
|
||||
root_domain = "viktorbarzin.lan"
|
||||
tls_secret_name = var.tls_secret_name
|
||||
allow_local_access_only = true
|
||||
ssl_redirect = false
|
||||
}
|
||||
|
||||
# resource "kubernetes_ingress_v1" "nvidia-exporter" {
|
||||
# metadata {
|
||||
# name = "nvidia-exporter"
|
||||
# namespace = kubernetes_namespace.nvidia.metadata[0].name
|
||||
# annotations = {
|
||||
# "kubernetes.io/ingress.class" = "nginx"
|
||||
# "nginx.ingress.kubernetes.io/whitelist-source-range" : "192.168.1.0/24, 10.0.0.0/8"
|
||||
# "nginx.ingress.kubernetes.io/ssl-redirect" : "false" # used only in LAN
|
||||
|
||||
# }
|
||||
# }
|
||||
# spec {
|
||||
# tls {
|
||||
# hosts = ["nvidia-exporter.viktorbarzin.lan"]
|
||||
# secret_name = var.tls_secret_name
|
||||
# }
|
||||
# rule {
|
||||
# host = "nvidia-exporter.viktorbarzin.lan"
|
||||
# http {
|
||||
# path {
|
||||
# backend {
|
||||
# service {
|
||||
# name = "nvidia-exporter"
|
||||
# port {
|
||||
# number = 80
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
|
||||
|
||||
# resource "kubernetes_deployment" "gpu-container" {
|
||||
# metadata {
|
||||
# name = "gpu-container"
|
||||
# namespace = kubernetes_namespace.nvidia.metadata[0].name
|
||||
# labels = {
|
||||
# app = "gpu-container"
|
||||
# }
|
||||
# }
|
||||
# spec {
|
||||
# replicas = 1
|
||||
# selector {
|
||||
# match_labels = {
|
||||
# app = "gpu-container"
|
||||
# }
|
||||
# }
|
||||
# template {
|
||||
# metadata {
|
||||
# labels = {
|
||||
# app = "gpu-container"
|
||||
# }
|
||||
# }
|
||||
# spec {
|
||||
# node_selector = {
|
||||
# "gpu" : "true"
|
||||
# }
|
||||
# container {
|
||||
# image = "ubuntu"
|
||||
# name = "gpu-container"
|
||||
# command = ["/usr/bin/sleep", "3600"]
|
||||
# # security_context {
|
||||
# # privileged = true
|
||||
# # capabilities {
|
||||
# # add = ["SYS_ADMIN"]
|
||||
# # }
|
||||
# # }
|
||||
# resources {
|
||||
# limits = {
|
||||
# "nvidia.com/gpu" = "1"
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# depends_on = [helm_release.nvidia-gpu-operator]
|
||||
# }
|
||||
|
||||
# GPU Pod Memory Exporter - exposes per-pod GPU memory usage as Prometheus metrics
|
||||
resource "kubernetes_config_map" "gpu_pod_exporter_script" {
|
||||
metadata {
|
||||
name = "gpu-pod-exporter-script"
|
||||
namespace = kubernetes_namespace.nvidia.metadata[0].name
|
||||
}
|
||||
|
||||
data = {
|
||||
"exporter.py" = <<-EOF
|
||||
#!/usr/bin/env python3
|
||||
"""GPU Pod Memory Exporter - Collects per-pod GPU memory usage."""
|
||||
|
||||
import subprocess
|
||||
import time
|
||||
import re
|
||||
import os
|
||||
import json
|
||||
import urllib.request
|
||||
import ssl
|
||||
from http.server import HTTPServer, BaseHTTPRequestHandler
|
||||
|
||||
METRICS_PORT = 9401
|
||||
SCRAPE_INTERVAL = 15
|
||||
|
||||
# Kubernetes API configuration
|
||||
K8S_API = "https://kubernetes.default.svc"
|
||||
TOKEN_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/token"
|
||||
CA_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"
|
||||
|
||||
# Cache for container ID to pod info mapping
|
||||
container_cache = {}
|
||||
cache_refresh_time = 0
|
||||
CACHE_TTL = 60 # Refresh cache every 60 seconds
|
||||
|
||||
def get_k8s_token():
|
||||
"""Read Kubernetes service account token."""
|
||||
try:
|
||||
with open(TOKEN_PATH, 'r') as f:
|
||||
return f.read().strip()
|
||||
except:
|
||||
return None
|
||||
|
||||
def refresh_container_cache():
|
||||
"""Refresh the container ID to pod mapping from Kubernetes API."""
|
||||
global container_cache, cache_refresh_time
|
||||
|
||||
token = get_k8s_token()
|
||||
if not token:
|
||||
return
|
||||
|
||||
try:
|
||||
# Create SSL context with K8s CA
|
||||
ctx = ssl.create_default_context()
|
||||
if os.path.exists(CA_PATH):
|
||||
ctx.load_verify_locations(CA_PATH)
|
||||
|
||||
# Get all pods on this node
|
||||
node_name = os.environ.get('NODE_NAME', '')
|
||||
url = f"{K8S_API}/api/v1/pods?fieldSelector=spec.nodeName={node_name}"
|
||||
|
||||
req = urllib.request.Request(url, headers={
|
||||
'Authorization': f'Bearer {token}',
|
||||
'Accept': 'application/json'
|
||||
})
|
||||
|
||||
with urllib.request.urlopen(req, context=ctx, timeout=10) as resp:
|
||||
data = json.loads(resp.read().decode())
|
||||
|
||||
new_cache = {}
|
||||
for pod in data.get('items', []):
|
||||
pod_name = pod['metadata']['name']
|
||||
namespace = pod['metadata']['namespace']
|
||||
|
||||
# Get container statuses
|
||||
for status in pod.get('status', {}).get('containerStatuses', []):
|
||||
container_id = status.get('containerID', '')
|
||||
# Extract the ID part (e.g., "containerd://abc123..." -> "abc123")
|
||||
if '://' in container_id:
|
||||
container_id = container_id.split('://')[-1]
|
||||
if container_id:
|
||||
short_id = container_id[:12]
|
||||
new_cache[short_id] = {
|
||||
'pod': pod_name,
|
||||
'namespace': namespace,
|
||||
'container': status.get('name', 'unknown')
|
||||
}
|
||||
|
||||
container_cache = new_cache
|
||||
cache_refresh_time = time.time()
|
||||
print(f"Refreshed container cache: {len(new_cache)} containers")
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error refreshing container cache: {e}")
|
||||
|
||||
def get_pod_info(container_id):
|
||||
"""Look up pod info for a container ID."""
|
||||
global cache_refresh_time
|
||||
|
||||
# Refresh cache if stale
|
||||
if time.time() - cache_refresh_time > CACHE_TTL:
|
||||
refresh_container_cache()
|
||||
|
||||
return container_cache.get(container_id, {
|
||||
'pod': 'unknown',
|
||||
'namespace': 'unknown',
|
||||
'container': 'unknown'
|
||||
})
|
||||
|
||||
def get_gpu_processes():
|
||||
"""Run nvidia-smi to get GPU process info."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["nvidia-smi", "--query-compute-apps=pid,used_memory,process_name", "--format=csv,noheader,nounits"],
|
||||
capture_output=True, text=True, timeout=10
|
||||
)
|
||||
if result.returncode != 0:
|
||||
print(f"nvidia-smi error: {result.stderr}")
|
||||
return []
|
||||
|
||||
processes = []
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if not line.strip():
|
||||
continue
|
||||
parts = [p.strip() for p in line.split(',')]
|
||||
if len(parts) >= 3:
|
||||
pid, memory_mib, process_name = parts[0], parts[1], parts[2]
|
||||
processes.append({
|
||||
'pid': pid,
|
||||
'memory_bytes': int(memory_mib) * 1024 * 1024,
|
||||
'process_name': process_name
|
||||
})
|
||||
return processes
|
||||
except Exception as e:
|
||||
print(f"Error running nvidia-smi: {e}")
|
||||
return []
|
||||
|
||||
def get_container_id(pid):
|
||||
"""Map PID to container ID via cgroup."""
|
||||
cgroup_path = f"/host_proc/{pid}/cgroup"
|
||||
try:
|
||||
with open(cgroup_path, 'r') as f:
|
||||
for line in f:
|
||||
# Match container ID patterns (docker, containerd, cri-o)
|
||||
match = re.search(r'[:/]([a-f0-9]{64})', line)
|
||||
if match:
|
||||
return match.group(1)[:12]
|
||||
match = re.search(r'cri-containerd-([a-f0-9]{64})', line)
|
||||
if match:
|
||||
return match.group(1)[:12]
|
||||
except (FileNotFoundError, PermissionError):
|
||||
pass
|
||||
return "host"
|
||||
|
||||
# Global metrics storage
|
||||
current_metrics = []
|
||||
|
||||
def collect_metrics():
|
||||
"""Collect GPU memory metrics."""
|
||||
global current_metrics
|
||||
metrics = []
|
||||
processes = get_gpu_processes()
|
||||
|
||||
for proc in processes:
|
||||
container_id = get_container_id(proc['pid'])
|
||||
pod_info = get_pod_info(container_id)
|
||||
metrics.append({
|
||||
'container_id': container_id,
|
||||
'pid': proc['pid'],
|
||||
'process_name': proc['process_name'],
|
||||
'memory_bytes': proc['memory_bytes'],
|
||||
'pod': pod_info['pod'],
|
||||
'namespace': pod_info['namespace'],
|
||||
'container': pod_info['container']
|
||||
})
|
||||
|
||||
current_metrics = metrics
|
||||
|
||||
def format_metrics():
|
||||
"""Format metrics in Prometheus exposition format."""
|
||||
lines = [
|
||||
"# HELP gpu_pod_memory_used_bytes GPU memory used by pod",
|
||||
"# TYPE gpu_pod_memory_used_bytes gauge"
|
||||
]
|
||||
|
||||
for m in current_metrics:
|
||||
labels = ','.join([
|
||||
f'namespace="{m["namespace"]}"',
|
||||
f'pod="{m["pod"]}"',
|
||||
f'container="{m["container"]}"',
|
||||
f'process_name="{m["process_name"]}"',
|
||||
f'pid="{m["pid"]}"'
|
||||
])
|
||||
lines.append(f'gpu_pod_memory_used_bytes{{{labels}}} {m["memory_bytes"]}')
|
||||
|
||||
return '\n'.join(lines) + '\n'
|
||||
|
||||
class MetricsHandler(BaseHTTPRequestHandler):
|
||||
def do_GET(self):
|
||||
if self.path == '/metrics':
|
||||
content = format_metrics()
|
||||
self.send_response(200)
|
||||
self.send_header('Content-Type', 'text/plain; charset=utf-8')
|
||||
self.end_headers()
|
||||
self.wfile.write(content.encode())
|
||||
elif self.path == '/health':
|
||||
self.send_response(200)
|
||||
self.end_headers()
|
||||
self.wfile.write(b'ok')
|
||||
else:
|
||||
self.send_response(404)
|
||||
self.end_headers()
|
||||
|
||||
def log_message(self, format, *args):
|
||||
pass # Suppress request logging
|
||||
|
||||
def background_collector():
|
||||
"""Background thread to collect metrics periodically."""
|
||||
import threading
|
||||
def run():
|
||||
while True:
|
||||
collect_metrics()
|
||||
time.sleep(SCRAPE_INTERVAL)
|
||||
thread = threading.Thread(target=run, daemon=True)
|
||||
thread.start()
|
||||
|
||||
if __name__ == '__main__':
|
||||
print(f"Starting GPU Pod Memory Exporter on port {METRICS_PORT}")
|
||||
refresh_container_cache() # Initial cache load
|
||||
collect_metrics() # Initial collection
|
||||
background_collector()
|
||||
|
||||
server = HTTPServer(('', METRICS_PORT), MetricsHandler)
|
||||
server.serve_forever()
|
||||
EOF
|
||||
}
|
||||
}
|
||||
|
||||
resource "kubernetes_service_account" "gpu_pod_exporter" {
|
||||
metadata {
|
||||
name = "gpu-pod-exporter"
|
||||
namespace = kubernetes_namespace.nvidia.metadata[0].name
|
||||
}
|
||||
}
|
||||
|
||||
resource "kubernetes_cluster_role" "gpu_pod_exporter" {
|
||||
metadata {
|
||||
name = "gpu-pod-exporter"
|
||||
}
|
||||
|
||||
rule {
|
||||
api_groups = [""]
|
||||
resources = ["pods"]
|
||||
verbs = ["list"]
|
||||
}
|
||||
}
|
||||
|
||||
resource "kubernetes_cluster_role_binding" "gpu_pod_exporter" {
|
||||
metadata {
|
||||
name = "gpu-pod-exporter"
|
||||
}
|
||||
|
||||
role_ref {
|
||||
api_group = "rbac.authorization.k8s.io"
|
||||
kind = "ClusterRole"
|
||||
name = kubernetes_cluster_role.gpu_pod_exporter.metadata[0].name
|
||||
}
|
||||
|
||||
subject {
|
||||
kind = "ServiceAccount"
|
||||
name = kubernetes_service_account.gpu_pod_exporter.metadata[0].name
|
||||
namespace = kubernetes_namespace.nvidia.metadata[0].name
|
||||
}
|
||||
}
|
||||
|
||||
resource "kubernetes_daemonset" "gpu_pod_exporter" {
|
||||
metadata {
|
||||
name = "gpu-pod-exporter"
|
||||
namespace = kubernetes_namespace.nvidia.metadata[0].name
|
||||
labels = {
|
||||
app = "gpu-pod-exporter"
|
||||
tier = var.tier
|
||||
}
|
||||
}
|
||||
|
||||
spec {
|
||||
selector {
|
||||
match_labels = {
|
||||
app = "gpu-pod-exporter"
|
||||
}
|
||||
}
|
||||
|
||||
template {
|
||||
metadata {
|
||||
labels = {
|
||||
app = "gpu-pod-exporter"
|
||||
}
|
||||
}
|
||||
|
||||
spec {
|
||||
host_pid = true
|
||||
service_account_name = kubernetes_service_account.gpu_pod_exporter.metadata[0].name
|
||||
|
||||
node_selector = {
|
||||
"gpu" : "true"
|
||||
}
|
||||
|
||||
toleration {
|
||||
key = "nvidia.com/gpu"
|
||||
operator = "Equal"
|
||||
value = "true"
|
||||
effect = "NoSchedule"
|
||||
}
|
||||
|
||||
container {
|
||||
name = "exporter"
|
||||
image = "python:3.11-slim"
|
||||
|
||||
command = ["/bin/bash", "-c"]
|
||||
args = [
|
||||
"python3 /scripts/exporter.py"
|
||||
]
|
||||
|
||||
env {
|
||||
name = "NODE_NAME"
|
||||
value_from {
|
||||
field_ref {
|
||||
field_path = "spec.nodeName"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
port {
|
||||
container_port = 9401
|
||||
name = "metrics"
|
||||
}
|
||||
|
||||
volume_mount {
|
||||
name = "scripts"
|
||||
mount_path = "/scripts"
|
||||
read_only = true
|
||||
}
|
||||
|
||||
volume_mount {
|
||||
name = "host-proc"
|
||||
mount_path = "/host_proc"
|
||||
read_only = true
|
||||
}
|
||||
|
||||
resources {
|
||||
requests = {
|
||||
cpu = "10m"
|
||||
memory = "128Mi"
|
||||
}
|
||||
limits = {
|
||||
memory = "128Mi"
|
||||
"nvidia.com/gpu" = "1"
|
||||
}
|
||||
}
|
||||
|
||||
liveness_probe {
|
||||
http_get {
|
||||
path = "/health"
|
||||
port = 9401
|
||||
}
|
||||
initial_delay_seconds = 30
|
||||
period_seconds = 30
|
||||
timeout_seconds = 5
|
||||
}
|
||||
}
|
||||
|
||||
volume {
|
||||
name = "scripts"
|
||||
config_map {
|
||||
name = kubernetes_config_map.gpu_pod_exporter_script.metadata[0].name
|
||||
default_mode = "0755"
|
||||
}
|
||||
}
|
||||
|
||||
volume {
|
||||
name = "host-proc"
|
||||
host_path {
|
||||
path = "/proc"
|
||||
type = "Directory"
|
||||
}
|
||||
}
|
||||
dns_config {
|
||||
option {
|
||||
name = "ndots"
|
||||
value = "2"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
depends_on = [helm_release.nvidia-gpu-operator]
|
||||
}
|
||||
|
||||
resource "kubernetes_service" "gpu_pod_exporter" {
|
||||
metadata {
|
||||
name = "gpu-pod-exporter"
|
||||
namespace = kubernetes_namespace.nvidia.metadata[0].name
|
||||
labels = {
|
||||
app = "gpu-pod-exporter"
|
||||
}
|
||||
}
|
||||
|
||||
spec {
|
||||
selector = {
|
||||
app = "gpu-pod-exporter"
|
||||
}
|
||||
|
||||
port {
|
||||
name = "metrics"
|
||||
port = 80
|
||||
target_port = 9401
|
||||
}
|
||||
}
|
||||
}
|
||||
43
stacks/nvidia/modules/nvidia/values.yaml
Normal file
43
stacks/nvidia/modules/nvidia/values.yaml
Normal file
|
|
@ -0,0 +1,43 @@
|
|||
driver:
|
||||
enabled: true
|
||||
# repository: nvcr.io/nvidia/driver
|
||||
# choose a driver version compatible with your GPU + CUDA 12.x (example)
|
||||
# NVIDIA GPU driver - https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/platform-support.html#known-issue
|
||||
# https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/
|
||||
# 13.x >= 580
|
||||
# 12.x >= 525, <580
|
||||
# 11.x >= 450, <525
|
||||
#
|
||||
# Delete the cluster policy before each change
|
||||
# version: "575.57.08" # CUDA 12.9
|
||||
version: "570.195.03" # CUDA 12.8
|
||||
upgradePolicy:
|
||||
autoUpgrade: false
|
||||
|
||||
devicePlugin:
|
||||
config:
|
||||
name: time-slicing-config
|
||||
|
||||
# DCGM Exporter - reduced from 2560Mi to 1536Mi based on VPA upper bound of 1459Mi (1.05x margin)
|
||||
dcgmExporter:
|
||||
resources:
|
||||
requests:
|
||||
memory: "1536Mi"
|
||||
limits:
|
||||
memory: "1536Mi"
|
||||
|
||||
# CUDA Validator - reduced from 1024Mi to 256Mi (one-shot job)
|
||||
validator:
|
||||
resources:
|
||||
requests:
|
||||
memory: "256Mi"
|
||||
limits:
|
||||
memory: "256Mi"
|
||||
|
||||
# Tolerate GPU node taint for all GPU operator components
|
||||
daemonsets:
|
||||
tolerations:
|
||||
- key: "nvidia.com/gpu"
|
||||
operator: "Equal"
|
||||
value: "true"
|
||||
effect: "NoSchedule"
|
||||
Loading…
Add table
Add a link
Reference in a new issue