diff --git a/modules/kubernetes/monitoring/prometheus_chart_values.tpl b/modules/kubernetes/monitoring/prometheus_chart_values.tpl
index 33374dc0..c65e9358 100755
--- a/modules/kubernetes/monitoring/prometheus_chart_values.tpl
+++ b/modules/kubernetes/monitoring/prometheus_chart_values.tpl
@@ -623,4 +623,9 @@ extraScrapeConfigs: |
         action: replace
         regex: '(.*)'
         replacement: 'nvidia_tesla_t4_$${1}'
+  - job_name: 'gpu-pod-memory'
+    static_configs:
+      - targets:
+        - "gpu-pod-exporter.nvidia.svc.cluster.local"
+    metrics_path: '/metrics'
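
Note on the new scrape job: the static target has no explicit port, so Prometheus should scrape it on port 80 (the HTTP default), which the `gpu-pod-exporter` Service added below maps to container port 9401. The sketch below is a minimal in-cluster check of that same endpoint, assuming it runs from a pod that can resolve cluster DNS; the URL, path, and metric name come from this change, everything else is stdlib:

```python
# Quick check of the endpoint the new 'gpu-pod-memory' job will scrape.
# Assumes it runs inside the cluster so the Service DNS name resolves.
from urllib.request import urlopen

URL = "http://gpu-pod-exporter.nvidia.svc.cluster.local/metrics"  # port 80 -> Service -> 9401

with urlopen(URL, timeout=5) as resp:
    body = resp.read().decode()

# Print only the per-container GPU memory samples the exporter emits.
for line in body.splitlines():
    if line.startswith("gpu_pod_memory_used_bytes"):
        print(line)
```
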
diff --git a/modules/kubernetes/nvidia/main.tf b/modules/kubernetes/nvidia/main.tf
index 59f3619d..dc05a790 100644
--- a/modules/kubernetes/nvidia/main.tf
+++ b/modules/kubernetes/nvidia/main.tf
@@ -17,6 +17,18 @@ resource "kubernetes_namespace" "nvidia" {
   }
 }
 
+# Apply GPU taint to ensure only GPU workloads run on the GPU node
+resource "null_resource" "gpu_node_taint" {
+  provisioner "local-exec" {
+    command = "kubectl taint nodes k8s-node1 nvidia.com/gpu=true:NoSchedule --overwrite"
+  }
+
+  # Re-run if the namespace changes (proxy for cluster changes)
+  triggers = {
+    namespace = kubernetes_namespace.nvidia.metadata[0].name
+  }
+}
+
 # [not needed anymore; part of the chart values] Apply to operator with:
 # kubectl patch clusterpolicies.nvidia.com/cluster-policy -n gpu-operator --type merge -p '{"spec": {"devicePlugin": {"config": {"name": "time-slicing-config", "default": "any"}}}}'
@@ -82,6 +94,12 @@ resource "kubernetes_deployment" "nvidia-exporter" {
         node_selector = {
           "gpu" : "true"
         }
+        toleration {
+          key      = "nvidia.com/gpu"
+          operator = "Equal"
+          value    = "true"
+          effect   = "NoSchedule"
+        }
         container {
           image = "nvidia/dcgm-exporter:latest"
           name  = "nvidia-exporter"
@@ -219,3 +237,274 @@ module "ingress" {
 # }
 #   depends_on = [helm_release.nvidia-gpu-operator]
 # }
+
+# GPU Pod Memory Exporter - exposes per-pod GPU memory usage as Prometheus metrics
+resource "kubernetes_config_map" "gpu_pod_exporter_script" {
+  metadata {
+    name      = "gpu-pod-exporter-script"
+    namespace = kubernetes_namespace.nvidia.metadata[0].name
+  }
+
+  data = {
+    "exporter.py" = <<-EOF
+#!/usr/bin/env python3
+"""GPU Pod Memory Exporter - Collects per-pod GPU memory usage."""
+
+import re
+import subprocess
+import threading
+import time
+from http.server import HTTPServer, BaseHTTPRequestHandler
+
+METRICS_PORT = 9401
+SCRAPE_INTERVAL = 15
+
+def get_gpu_processes():
+    """Run nvidia-smi to get GPU process info."""
+    try:
+        result = subprocess.run(
+            ["nvidia-smi", "--query-compute-apps=pid,used_memory,process_name", "--format=csv,noheader,nounits"],
+            capture_output=True, text=True, timeout=10
+        )
+        if result.returncode != 0:
+            print(f"nvidia-smi error: {result.stderr}")
+            return []
+
+        processes = []
+        for line in result.stdout.strip().split('\n'):
+            if not line.strip():
+                continue
+            parts = [p.strip() for p in line.split(',')]
+            if len(parts) >= 3:
+                pid, memory_mib, process_name = parts[0], parts[1], parts[2]
+                processes.append({
+                    'pid': pid,
+                    'memory_bytes': int(memory_mib) * 1024 * 1024,
+                    'process_name': process_name
+                })
+        return processes
+    except Exception as e:
+        print(f"Error running nvidia-smi: {e}")
+        return []
+
+def get_container_id(pid):
+    """Map PID to container ID via cgroup."""
+    cgroup_path = f"/host_proc/{pid}/cgroup"
+    try:
+        with open(cgroup_path, 'r') as f:
+            for line in f:
+                # Match container ID patterns (docker, containerd, cri-o)
+                # e.g., /kubepods/pod.../containerid or /docker/containerid
+                match = re.search(r'[:/]([a-f0-9]{64})', line)
+                if match:
+                    return match.group(1)[:12]  # Return short container ID
+                # Also check for cri-containerd pattern
+                match = re.search(r'cri-containerd-([a-f0-9]{64})', line)
+                if match:
+                    return match.group(1)[:12]
+    except (FileNotFoundError, PermissionError):
+        pass
+    return "host"
+
+# Global metrics storage
+current_metrics = []
+
+def collect_metrics():
+    """Collect GPU memory metrics."""
+    global current_metrics
+    metrics = []
+    processes = get_gpu_processes()
+
+    for proc in processes:
+        container_id = get_container_id(proc['pid'])
+        metrics.append({
+            'container_id': container_id,
+            'pid': proc['pid'],
+            'process_name': proc['process_name'],
+            'memory_bytes': proc['memory_bytes']
+        })
+
+    current_metrics = metrics
+
+def format_metrics():
+    """Format metrics in Prometheus exposition format."""
+    lines = [
+        "# HELP gpu_pod_memory_used_bytes GPU memory used by container",
+        "# TYPE gpu_pod_memory_used_bytes gauge"
+    ]
+
+    for m in current_metrics:
+        labels = f'container_id="{m["container_id"]}",pid="{m["pid"]}",process_name="{m["process_name"]}"'
+        lines.append(f"gpu_pod_memory_used_bytes{{{labels}}} {m['memory_bytes']}")
+
+    return '\n'.join(lines) + '\n'
+
+class MetricsHandler(BaseHTTPRequestHandler):
+    def do_GET(self):
+        if self.path == '/metrics':
+            content = format_metrics()
+            self.send_response(200)
+            self.send_header('Content-Type', 'text/plain; charset=utf-8')
+            self.end_headers()
+            self.wfile.write(content.encode())
+        elif self.path == '/health':
+            self.send_response(200)
+            self.end_headers()
+            self.wfile.write(b'ok')
+        else:
+            self.send_response(404)
+            self.end_headers()
+
+    def log_message(self, format, *args):
+        pass  # Suppress request logging
+
+def background_collector():
+    """Background thread to collect metrics periodically."""
+    def run():
+        while True:
+            collect_metrics()
+            time.sleep(SCRAPE_INTERVAL)
+    thread = threading.Thread(target=run, daemon=True)
+    thread.start()
+
+if __name__ == '__main__':
+    print(f"Starting GPU Pod Memory Exporter on port {METRICS_PORT}")
+    collect_metrics()  # Initial collection
+    background_collector()
+
+    server = HTTPServer(('', METRICS_PORT), MetricsHandler)
+    server.serve_forever()
+EOF
+  }
+}
+
+resource "kubernetes_daemonset" "gpu_pod_exporter" {
+  metadata {
+    name      = "gpu-pod-exporter"
+    namespace = kubernetes_namespace.nvidia.metadata[0].name
+    labels = {
+      app  = "gpu-pod-exporter"
+      tier = var.tier
+    }
+  }
+
+  spec {
+    selector {
+      match_labels = {
+        app = "gpu-pod-exporter"
+      }
+    }
+
+    template {
+      metadata {
+        labels = {
+          app = "gpu-pod-exporter"
+        }
+      }
+
+      spec {
+        host_pid = true
+
+        node_selector = {
+          "gpu" : "true"
+        }
+
+        toleration {
+          key      = "nvidia.com/gpu"
+          operator = "Equal"
+          value    = "true"
+          effect   = "NoSchedule"
+        }
+
+        container {
+          name  = "exporter"
+          image = "python:3.11-slim"
+
+          command = ["/bin/bash", "-c"]
+          args = [
+            "python3 /scripts/exporter.py"
+          ]
+
+          port {
+            container_port = 9401
+            name           = "metrics"
+          }
+
+          volume_mount {
+            name       = "scripts"
+            mount_path = "/scripts"
+            read_only  = true
+          }
+
+          volume_mount {
+            name       = "host-proc"
+            mount_path = "/host_proc"
+            read_only  = true
+          }
+
+          resources {
+            requests = {
+              cpu    = "50m"
+              memory = "128Mi"
+            }
+            limits = {
+              cpu              = "200m"
+              memory           = "256Mi"
+              "nvidia.com/gpu" = "1"
+            }
+          }
+
+          liveness_probe {
+            http_get {
+              path = "/health"
+              port = 9401
+            }
+            initial_delay_seconds = 30
+            period_seconds        = 30
+          }
+        }
+
+        volume {
+          name = "scripts"
+          config_map {
+            name         = kubernetes_config_map.gpu_pod_exporter_script.metadata[0].name
+            default_mode = "0755"
+          }
+        }
+
+        volume {
+          name = "host-proc"
+          host_path {
+            path = "/proc"
+            type = "Directory"
+          }
+        }
+      }
+    }
+  }
+
+  depends_on = [helm_release.nvidia-gpu-operator]
+}
+
+resource "kubernetes_service" "gpu_pod_exporter" {
+  metadata {
+    name      = "gpu-pod-exporter"
+    namespace = kubernetes_namespace.nvidia.metadata[0].name
+    labels = {
+      app = "gpu-pod-exporter"
+    }
+  }
+
+  spec {
+    selector = {
+      app = "gpu-pod-exporter"
+    }
+
+    port {
+      name        = "metrics"
+      port        = 80
+      target_port = 9401
+    }
+  }
+}
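
The exporter attributes GPU memory to containers by reading /host_proc/<pid>/cgroup and extracting the 64-character container ID from the cgroup path, covering plain docker/containerd paths as well as the cri-containerd naming. Below is a small stand-alone sketch of that matching logic only, for sanity-checking the two patterns; the sample cgroup lines and the helper name are made up for illustration, not captured from a node:

```python
# Stand-alone check of the cgroup patterns used in exporter.py's get_container_id().
# The sample lines below are synthetic examples.
import re

SAMPLES = [
    # cgroup v1 style: the 64-char ID follows a '/'
    "12:memory:/kubepods/burstable/pod1b2c3d4e-5f60-7a8b-9c0d-e1f203040506/"
    "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef",
    # systemd/containerd style: the ID is prefixed with 'cri-containerd-'
    "0::/kubepods.slice/kubepods-burstable.slice/"
    "cri-containerd-fedcba9876543210fedcba9876543210fedcba9876543210fedcba9876543210.scope",
]

PATTERNS = [r'[:/]([a-f0-9]{64})', r'cri-containerd-([a-f0-9]{64})']

def short_container_id(line):
    """Mirror the matching order in get_container_id(): first pattern that hits wins."""
    for pattern in PATTERNS:
        match = re.search(pattern, line)
        if match:
            return match.group(1)[:12]
    return None

for sample in SAMPLES:
    print(short_container_id(sample))
# -> 0123456789ab
# -> fedcba987654
```
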