Add per-pod GPU memory metrics exporter
- Add DaemonSet that runs on GPU node and exposes Prometheus metrics
- Uses nvidia-smi to collect per-process GPU memory usage
- Maps PIDs to container IDs via /proc/<pid>/cgroup
- Exposes gpu_pod_memory_used_bytes metric at :9401/metrics
- Add Prometheus scrape config for gpu-pod-memory job

[ci skip]
parent 09a5e3a273
commit 4a857ebefd
2 changed files with 294 additions and 0 deletions
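For reference, a scrape of the new endpoint yields lines like gpu_pod_memory_used_bytes{container_id="0123456789ab",pid="4242",process_name="python3"} 1073741824. Below is a minimal consumer sketch, assuming in-cluster DNS and the Service added in this commit; the URL and metric shape come from the diff, the parsing is illustrative only:

import re
import urllib.request

# Assumed in-cluster URL; the Service maps port 80 -> 9401 (see the diff below).
URL = "http://gpu-pod-exporter.nvidia.svc.cluster.local/metrics"
LINE = re.compile(r'^gpu_pod_memory_used_bytes\{(?P<labels>[^}]*)\}\s+(?P<value>\d+)$')

with urllib.request.urlopen(URL, timeout=5) as resp:
    for raw in resp.read().decode().splitlines():
        m = LINE.match(raw)
        if m:
            print(m.group("labels"), "->", int(m.group("value")), "bytes")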
@@ -623,4 +623,9 @@ extraScrapeConfigs: |
        action: replace
        regex: '(.*)'
        replacement: 'nvidia_tesla_t4_$${1}'
    - job_name: 'gpu-pod-memory'
      static_configs:
        - targets:
            - "gpu-pod-exporter.nvidia.svc.cluster.local"
      metrics_path: '/metrics'
@@ -17,6 +17,18 @@ resource "kubernetes_namespace" "nvidia" {
  }
}

# Apply GPU taint to ensure only GPU workloads run on GPU node
resource "null_resource" "gpu_node_taint" {
  provisioner "local-exec" {
    command = "kubectl taint nodes k8s-node1 nvidia.com/gpu=true:NoSchedule --overwrite"
  }

  # Re-run if namespace changes (proxy for cluster changes)
  triggers = {
    namespace = kubernetes_namespace.nvidia.metadata[0].name
  }
}

# [not needed anymore; part of the chart values] Apply to operator with:
# kubectl patch clusterpolicies.nvidia.com/cluster-policy -n gpu-operator --type merge -p '{"spec": {"devicePlugin": {"config": {"name": "time-slicing-config", "default": "any"}}}}'
@@ -82,6 +94,12 @@ resource "kubernetes_deployment" "nvidia-exporter" {
        node_selector = {
          "gpu" : "true"
        }

        toleration {
          key      = "nvidia.com/gpu"
          operator = "Equal"
          value    = "true"
          effect   = "NoSchedule"
        }

        container {
          image = "nvidia/dcgm-exporter:latest"
          name  = "nvidia-exporter"
@@ -219,3 +237,274 @@ module "ingress" {
# }
#   depends_on = [helm_release.nvidia-gpu-operator]
# }

# GPU Pod Memory Exporter - exposes per-pod GPU memory usage as Prometheus metrics
resource "kubernetes_config_map" "gpu_pod_exporter_script" {
  metadata {
    name      = "gpu-pod-exporter-script"
    namespace = kubernetes_namespace.nvidia.metadata[0].name
  }

  data = {
    "exporter.py" = <<-EOF
      #!/usr/bin/env python3
      """GPU Pod Memory Exporter - Collects per-pod GPU memory usage."""

      import re
      import subprocess
      import threading
      import time
      from http.server import HTTPServer, BaseHTTPRequestHandler

      METRICS_PORT = 9401
      SCRAPE_INTERVAL = 15

      def get_gpu_processes():
          """Run nvidia-smi to get GPU process info."""
          try:
              result = subprocess.run(
                  ["nvidia-smi", "--query-compute-apps=pid,used_memory,process_name", "--format=csv,noheader,nounits"],
                  capture_output=True, text=True, timeout=10
              )
              if result.returncode != 0:
                  print(f"nvidia-smi error: {result.stderr}")
                  return []

              processes = []
              for line in result.stdout.strip().split('\n'):
                  if not line.strip():
                      continue
                  parts = [p.strip() for p in line.split(',')]
                  if len(parts) >= 3:
                      pid, memory_mib = parts[0], parts[1]
                      # Process names may themselves contain commas; rejoin the tail
                      process_name = ','.join(parts[2:])
                      # Skip non-numeric values such as "[N/A]"
                      if not memory_mib.isdigit():
                          continue
                      processes.append({
                          'pid': pid,
                          'memory_bytes': int(memory_mib) * 1024 * 1024,
                          'process_name': process_name
                      })
              return processes
          except Exception as e:
              print(f"Error running nvidia-smi: {e}")
              return []

      def get_container_id(pid):
          """Map PID to container ID via cgroup."""
          cgroup_path = f"/host_proc/{pid}/cgroup"
          try:
              with open(cgroup_path, 'r') as f:
                  for line in f:
                      # Match container ID patterns (docker, containerd, cri-o)
                      # e.g., /kubepods/pod.../containerid or /docker/containerid
                      match = re.search(r'[:/]([a-f0-9]{64})', line)
                      if match:
                          return match.group(1)[:12]  # Return short container ID
                      # Also check for cri-containerd pattern
                      match = re.search(r'cri-containerd-([a-f0-9]{64})', line)
                      if match:
                          return match.group(1)[:12]
          except (FileNotFoundError, PermissionError):
              pass
          return "host"

      # Global metrics storage
      current_metrics = []

      def collect_metrics():
          """Collect GPU memory metrics."""
          global current_metrics
          metrics = []
          processes = get_gpu_processes()

          for proc in processes:
              container_id = get_container_id(proc['pid'])
              metrics.append({
                  'container_id': container_id,
                  'pid': proc['pid'],
                  'process_name': proc['process_name'],
                  'memory_bytes': proc['memory_bytes']
              })

          current_metrics = metrics

      def format_metrics():
          """Format metrics in Prometheus exposition format."""
          lines = [
              "# HELP gpu_pod_memory_used_bytes GPU memory used by container",
              "# TYPE gpu_pod_memory_used_bytes gauge"
          ]

          for m in current_metrics:
              labels = f'container_id="{m["container_id"]}",pid="{m["pid"]}",process_name="{m["process_name"]}"'
              lines.append(f"gpu_pod_memory_used_bytes{{{labels}}} {m['memory_bytes']}")

          return '\n'.join(lines) + '\n'

      class MetricsHandler(BaseHTTPRequestHandler):
          def do_GET(self):
              if self.path == '/metrics':
                  content = format_metrics()
                  self.send_response(200)
                  self.send_header('Content-Type', 'text/plain; charset=utf-8')
                  self.end_headers()
                  self.wfile.write(content.encode())
              elif self.path == '/health':
                  self.send_response(200)
                  self.end_headers()
                  self.wfile.write(b'ok')
              else:
                  self.send_response(404)
                  self.end_headers()

          def log_message(self, format, *args):
              pass  # Suppress request logging

      def background_collector():
          """Background thread to collect metrics periodically."""
          def run():
              while True:
                  collect_metrics()
                  time.sleep(SCRAPE_INTERVAL)
          thread = threading.Thread(target=run, daemon=True)
          thread.start()

      if __name__ == '__main__':
          print(f"Starting GPU Pod Memory Exporter on port {METRICS_PORT}")
          collect_metrics()  # Initial collection
          background_collector()

          server = HTTPServer(('', METRICS_PORT), MetricsHandler)
          server.serve_forever()
    EOF
  }
}

resource "kubernetes_daemonset" "gpu_pod_exporter" {
  metadata {
    name      = "gpu-pod-exporter"
    namespace = kubernetes_namespace.nvidia.metadata[0].name
    labels = {
      app  = "gpu-pod-exporter"
      tier = var.tier
    }
  }

  spec {
    selector {
      match_labels = {
        app = "gpu-pod-exporter"
      }
    }

    template {
      metadata {
        labels = {
          app = "gpu-pod-exporter"
        }
      }

      spec {
        host_pid = true

        node_selector = {
          "gpu" : "true"
        }

        toleration {
          key      = "nvidia.com/gpu"
          operator = "Equal"
          value    = "true"
          effect   = "NoSchedule"
        }

        container {
          name  = "exporter"
          image = "python:3.11-slim"

          command = ["/bin/bash", "-c"]
          args = [
            "python3 /scripts/exporter.py"
          ]

          port {
            container_port = 9401
            name           = "metrics"
          }

          volume_mount {
            name       = "scripts"
            mount_path = "/scripts"
            read_only  = true
          }

          volume_mount {
            name       = "host-proc"
            mount_path = "/host_proc"
            read_only  = true
          }

          resources {
            requests = {
              cpu    = "50m"
              memory = "128Mi"
            }
            limits = {
              cpu    = "200m"
              memory = "256Mi"
              # Requesting the GPU resource makes the NVIDIA runtime inject
              # nvidia-smi into this container (time-sliced, per the operator config)
              "nvidia.com/gpu" = "1"
            }
          }

          liveness_probe {
            http_get {
              path = "/health"
              port = 9401
            }
            initial_delay_seconds = 30
            period_seconds        = 30
          }
        }

        volume {
          name = "scripts"
          config_map {
            name         = kubernetes_config_map.gpu_pod_exporter_script.metadata[0].name
            default_mode = "0755"
          }
        }

        volume {
          name = "host-proc"
          host_path {
            path = "/proc"
            type = "Directory"
          }
        }
      }
    }
  }

  depends_on = [helm_release.nvidia-gpu-operator]
}

resource "kubernetes_service" "gpu_pod_exporter" {
  metadata {
    name      = "gpu-pod-exporter"
    namespace = kubernetes_namespace.nvidia.metadata[0].name
    labels = {
      app = "gpu-pod-exporter"
    }
  }

  spec {
    selector = {
      app = "gpu-pod-exporter"
    }

    port {
      name        = "metrics"
      port        = 80
      target_port = 9401
    }
  }
}
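Note: the metric is labeled with a short container ID rather than a pod name, so making it truly per-pod requires a join against the Kubernetes API, where each pod's container_status.container_id has the form containerd://<64-hex>. A possible follow-up sketch, not part of this commit, assuming the official kubernetes Python client:

from kubernetes import client, config  # pip install kubernetes

def container_id_to_pod(short_id):
    """Resolve the exporter's 12-char container_id label to namespace/pod."""
    config.load_kube_config()  # use config.load_incluster_config() inside a pod
    v1 = client.CoreV1Api()
    for pod in v1.list_pod_for_all_namespaces().items:
        for cs in pod.status.container_statuses or []:
            # container_id looks like "containerd://<64-hex>"; compare prefixes
            full_id = (cs.container_id or "").split("//")[-1]
            if full_id.startswith(short_id):
                return f"{pod.metadata.namespace}/{pod.metadata.name}"
    return None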