import subprocess
import time
import re
import os
import json
import urllib.request
import ssl
from http.server import HTTPServer, BaseHTTPRequestHandler

METRICS_PORT = 9401
SCRAPE_INTERVAL = 15

# Kubernetes API configuration
K8S_API = "https://kubernetes.default.svc"
TOKEN_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/token"
CA_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"

# Cache for container ID to pod info mapping
container_cache = {}
cache_refresh_time = 0
CACHE_TTL = 60  # Refresh cache every 60 seconds

# Pod status fields that carry per-container status entries with a containerID.
_STATUS_FIELDS = (
    'containerStatuses',
    'initContainerStatuses',
    'ephemeralContainerStatuses',
)


def get_k8s_token():
    """Read Kubernetes service account token.

    Returns:
        The stripped contents of TOKEN_PATH, or None when the file cannot
        be read (for example when running outside a pod).
    """
    try:
        with open(TOKEN_PATH, 'r') as f:
            return f.read().strip()
    except (OSError, UnicodeDecodeError):
        # Best effort: callers treat a missing token as "no API access".
        return None


def _index_containers(pods):
    """Build the short-container-ID -> pod info mapping from a pod list.

    Args:
        pods: Iterable of pod objects as decoded from the API's JSON.

    Returns:
        Dict keyed by the first 12 characters of each container ID, with
        'pod', 'namespace' and 'container' entries as values.
    """
    index = {}
    for pod in pods:
        metadata = pod.get('metadata') or {}
        pod_name = metadata.get('name', 'unknown')
        namespace = metadata.get('namespace', 'unknown')
        pod_status = pod.get('status') or {}

        for field in _STATUS_FIELDS:
            for status in pod_status.get(field) or []:
                container_id = status.get('containerID') or ''
                # Extract the ID part (e.g., "containerd://abc123..." -> "abc123")
                if '://' in container_id:
                    container_id = container_id.split('://')[-1]
                if not container_id:
                    # Containers that have not started yet have no ID.
                    continue
                index[container_id[:12]] = {
                    'pod': pod_name,
                    'namespace': namespace,
                    'container': status.get('name', 'unknown'),
                }
    return index


def refresh_container_cache():
    """Refresh the container ID to pod mapping from Kubernetes API.

    Lists the pods scheduled on the node named by the NODE_NAME environment
    variable and replaces ``container_cache`` with the result. When NODE_NAME
    is unset the pods of all nodes are listed instead of sending an empty
    node selector. On any failure the previous cache is kept and the error
    is printed.
    """
    global container_cache, cache_refresh_time

    # Record the attempt before doing any work: if the refresh fails, the
    # next attempt happens after CACHE_TTL instead of on every single lookup
    # (each of which could block for the full request timeout).
    cache_refresh_time = time.time()

    token = get_k8s_token()
    if not token:
        return

    try:
        # Create SSL context with K8s CA
        ctx = ssl.create_default_context()
        if os.path.exists(CA_PATH):
            ctx.load_verify_locations(CA_PATH)

        # Get all pods on this node
        node_name = os.environ.get('NODE_NAME', '')
        url = f"{K8S_API}/api/v1/pods"
        if node_name:
            url += f"?fieldSelector=spec.nodeName={node_name}"
        else:
            print("NODE_NAME is not set; listing pods on all nodes")

        req = urllib.request.Request(url, headers={
            'Authorization': f'Bearer {token}',
            'Accept': 'application/json'
        })

        with urllib.request.urlopen(req, context=ctx, timeout=10) as resp:
            data = json.loads(resp.read().decode())

        new_cache = _index_containers(data.get('items') or [])

        container_cache = new_cache
        print(f"Refreshed container cache: {len(new_cache)} containers")

    except Exception as e:
        # Top-level boundary for the collector: log and keep the old cache.
        print(f"Error refreshing container cache: {e}")


def get_pod_info(container_id):
    """Look up pod info for a container ID.

    Args:
        container_id: Short (12 character) container ID, or any value that
            is not present in the cache.

    Returns:
        Dict with 'pod', 'namespace' and 'container' keys; every value is
        'unknown' when the container ID is not in the cache.
    """
    # Refresh cache if stale
    if time.time() - cache_refresh_time > CACHE_TTL:
        refresh_container_cache()

    info = container_cache.get(container_id)
    if info is None:
        return {
            'pod': 'unknown',
            'namespace': 'unknown',
            'container': 'unknown'
        }
    return info
def _escape_label_value(value):
    """Escape a value for use inside a double-quoted Prometheus label."""
    # Backslash must be escaped first so later escapes are not doubled.
    return (
        str(value)
        .replace('\\', '\\\\')
        .replace('"', '\\"')
        .replace('\n', '\\n')
    )


def format_metrics():
    """Format metrics in Prometheus exposition format.

    Reads the module-level ``current_metrics`` list and emits one
    ``gpu_pod_memory_used_bytes`` sample per entry, labelled with
    namespace, pod, container, process_name and pid. Label values are
    escaped so that a process name containing a quote, backslash or
    newline cannot produce an unparseable scrape.

    Returns:
        The exposition text, terminated by a newline.
    """
    lines = [
        "# HELP gpu_pod_memory_used_bytes GPU memory used by pod",
        "# TYPE gpu_pod_memory_used_bytes gauge"
    ]

    label_names = ('namespace', 'pod', 'container', 'process_name', 'pid')
    for m in current_metrics:
        labels = ','.join(
            f'{name}="{_escape_label_value(m[name])}"' for name in label_names
        )
        lines.append(f'gpu_pod_memory_used_bytes{{{labels}}} {m["memory_bytes"]}')

    return '\n'.join(lines) + '\n'
kubernetes_service_account.gpu_pod_exporter.metadata[0].name node_selector = { "gpu" : "true" @@ -426,6 +560,15 @@ resource "kubernetes_daemonset" "gpu_pod_exporter" { "python3 /scripts/exporter.py" ] + env { + name = "NODE_NAME" + value_from { + field_ref { + field_path = "spec.nodeName" + } + } + } + port { container_port = 9401 name = "metrics" diff --git a/modules/kubernetes/nvidia/values.yaml b/modules/kubernetes/nvidia/values.yaml index 9049068c..3f9d4639 100644 --- a/modules/kubernetes/nvidia/values.yaml +++ b/modules/kubernetes/nvidia/values.yaml @@ -17,3 +17,11 @@ driver: devicePlugin: config: name: time-slicing-config + +# Tolerate GPU node taint for all GPU operator components +daemonsets: + tolerations: + - key: "nvidia.com/gpu" + operator: "Equal" + value: "true" + effect: "NoSchedule" diff --git a/modules/kubernetes/ollama/values.yaml b/modules/kubernetes/ollama/values.yaml index 18f78d24..fe92927a 100644 --- a/modules/kubernetes/ollama/values.yaml +++ b/modules/kubernetes/ollama/values.yaml @@ -17,5 +17,12 @@ ollama: persistentVolume: enabled: true existingClaim: "ollama-pvc" -# nodeSelector: -# kubernetes.io/hostname: k8s-node1 + +nodeSelector: + gpu: "true" + +tolerations: + - key: "nvidia.com/gpu" + operator: "Equal" + value: "true" + effect: "NoSchedule"