From ce79bd5c04465d625a446cdcd4c47e8a44404bdb Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Wed, 11 Mar 2026 22:46:33 +0000 Subject: [PATCH] Add node hang instrumentation and scale down chromium services MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add journald collection to Alloy (loki.source.journal) for kernel OOM, panic, hung task, and soft lockup detection — ships system logs off-node so they survive hard resets - Add 5 Loki alerting rules (KernelOOMKiller, KernelPanic, KernelHungTask, KernelSoftLockup, ContainerdDown) evaluating against node-journal logs - Fix Loki ruler config: correct rules mount path (/var/loki/rules/fake), add alertmanager_url and enable_api - Add Prometheus alerts: NodeMemoryPressureTrending (>85%), NodeExporterDown, NodeHighIOWait (>30%) - Add caretta tolerations for control-plane and GPU nodes - Scale down chromium-based services to 0 for cluster stability: f1-stream, flaresolverr, changedetection, resume/printer --- stacks/changedetection/main.tf | 2 +- stacks/f1-stream/main.tf | 2 +- stacks/platform/modules/monitoring/alloy.yaml | 77 ++++ stacks/platform/modules/monitoring/caretta.tf | 12 + .../monitoring/dashboards/cluster_health.json | 430 +++++++++++++++++- stacks/platform/modules/monitoring/loki.yaml | 4 +- stacks/resume/main.tf | 4 +- stacks/servarr/flaresolverr/main.tf | 2 +- 8 files changed, 517 insertions(+), 16 deletions(-) diff --git a/stacks/changedetection/main.tf b/stacks/changedetection/main.tf index 1524688e..47f76c0c 100644 --- a/stacks/changedetection/main.tf +++ b/stacks/changedetection/main.tf @@ -43,7 +43,7 @@ resource "kubernetes_deployment" "changedetection" { } } spec { - replicas = 1 + replicas = 0 # Scaled down — sockpuppetbrowser (headless Chromium sidecar) causes node OOM strategy { type = "Recreate" } diff --git a/stacks/f1-stream/main.tf b/stacks/f1-stream/main.tf index 5a731f2e..33c86ddc 100644 --- a/stacks/f1-stream/main.tf +++ 
b/stacks/f1-stream/main.tf @@ -39,7 +39,7 @@ resource "kubernetes_deployment" "f1-stream" { } } spec { - replicas = 1 + replicas = 0 # Scaled down for cluster stability — periodic scans cause memory pressure selector { match_labels = { app = "f1-stream" diff --git a/stacks/platform/modules/monitoring/alloy.yaml b/stacks/platform/modules/monitoring/alloy.yaml index ac3148e8..ab80e38e 100644 --- a/stacks/platform/modules/monitoring/alloy.yaml +++ b/stacks/platform/modules/monitoring/alloy.yaml @@ -99,6 +99,56 @@ alloy: forward_to = [loki.write.default.receiver] } + // Node-level journal log collection for kernel panics, OOMs, hung tasks, etc. + // Ships system logs off-node so they survive hard resets. + loki.source.journal "node_journal" { + forward_to = [loki.process.journal.receiver] + relabel_rules = loki.relabel.journal.rules + labels = { + job = "node-journal", + } + max_age = "12h" + } + + loki.relabel "journal" { + forward_to = [] + + rule { + source_labels = ["__journal__hostname"] + target_label = "node" + } + rule { + source_labels = ["__journal__systemd_unit"] + target_label = "unit" + } + rule { + source_labels = ["__journal_priority_keyword"] + target_label = "level" + } + rule { + source_labels = ["__journal__transport"] + target_label = "transport" + } + } + + // Forward warning+ journal entries (priority 0-4: emerg, alert, crit, err, warning) + // Also forwards kernel transport entries regardless of priority for OOM/panic detection. 
+ loki.process "journal" { + stage.static_labels { + values = { + cluster = "default", + } + } + + // Drop info/debug/notice entries that aren't from the kernel transport + stage.match { + selector = "{job=\"node-journal\", level=~\"info|notice|debug\", transport!=\"kernel\"}" + action = "drop" + } + + forward_to = [loki.write.default.receiver] + } + // Kubernetes audit log collection from /var/log/kubernetes/audit.log // Requires alloy.mounts.varlog=true to mount /var/log from the host local.file_match "audit_logs" { @@ -117,6 +167,33 @@ alloy: # Mount /var/log from the host for file-based log collection (audit logs) mounts: varlog: true + # Mount journal directories for loki.source.journal + extra: + - name: journal-run + mountPath: /run/log/journal + readOnly: true + - name: journal-var + mountPath: /var/log/journal + readOnly: true + - name: machine-id + mountPath: /etc/machine-id + readOnly: true + +controller: + volumes: + extra: + - name: journal-run + hostPath: + path: /run/log/journal + type: DirectoryOrCreate + - name: journal-var + hostPath: + path: /var/log/journal + type: DirectoryOrCreate + - name: machine-id + hostPath: + path: /etc/machine-id + type: File # Resource limits for DaemonSet pods # Alloy tails logs from all containers on the node via K8s API and batches diff --git a/stacks/platform/modules/monitoring/caretta.tf b/stacks/platform/modules/monitoring/caretta.tf index 5f76ec17..d98e07d0 100644 --- a/stacks/platform/modules/monitoring/caretta.tf +++ b/stacks/platform/modules/monitoring/caretta.tf @@ -14,6 +14,18 @@ resource "helm_release" "caretta" { victoria-metrics-single = { enabled = false } + tolerations = [ + { + key = "node-role.kubernetes.io/control-plane" + operator = "Exists" + effect = "NoSchedule" + }, + { + key = "nvidia.com/gpu" + operator = "Exists" + effect = "NoSchedule" + } + ] resources = { requests = { cpu = "10m" diff --git a/stacks/platform/modules/monitoring/dashboards/cluster_health.json 
b/stacks/platform/modules/monitoring/dashboards/cluster_health.json index 050117ad..3d8dbdea 100644 --- a/stacks/platform/modules/monitoring/dashboards/cluster_health.json +++ b/stacks/platform/modules/monitoring/dashboards/cluster_health.json @@ -4202,7 +4202,7 @@ "h": 8, "w": 12, "x": 12, - "y": 68 + "y": 72 }, "id": 35, "options": { @@ -4233,6 +4233,405 @@ "title": "Restart Rate (24h)", "type": "timeseries" }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 80 + }, + { + "color": "green", + "value": 95 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 12, + "y": 68 + }, + "id": 112, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_pod_status_phase{phase=\"Running\"}) / count(kube_pod_info) * 100", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Healthy Pods %", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 1 + }, + { + "color": "red", + "value": 5 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 16, + "y": 68 + }, + "id": 113, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + 
"orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_pod_status_phase{phase=~\"Failed|Pending|Unknown\"}) OR vector(0)", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Unhealthy Pods", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 80 + }, + { + "color": "green", + "value": 95 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 20, + "y": 68 + }, + "id": 114, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_deployment_status_replicas_available) / sum(kube_deployment_spec_replicas) * 100", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Deployment Readiness %", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Ready Replicas" + }, + "properties": [ + { + "id": "custom.cellOptions", + "value": { + "mode": 
"gradient", + "type": "gauge" + } + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 72 + }, + "id": 115, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": ["sum"], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": false, + "displayName": "namespace" + } + ] + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (namespace) (kube_pod_status_phase{phase=\"Running\"})", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "count by (namespace) (kube_pod_info)", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (namespace) (kube_deployment_status_replicas_available)", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (namespace) (kube_deployment_spec_replicas)", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "count by (namespace) (kube_deployment_spec_replicas)", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "E" + } + ], + "title": "Pod & Deployment Health by Namespace", + "transformations": [ + { + "id": "merge", + "options": {} + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true + }, + "renameByName": { + "Value #A": "Running Pods", + "Value #B": "Total Pods", + "Value #C": "Ready Replicas", + "Value #D": "Desired 
Replicas", + "Value #E": "Deployments" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "loki", + "uid": "${loki_datasource}" + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 80 + }, + "id": 116, + "options": { + "dedupStrategy": "exact", + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": false, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "${loki_datasource}" + }, + "expr": "{namespace=~\".+\"} |~ \"(?i)(error|panic|OOMKilled|CrashLoopBackOff|fatal)\"", + "refId": "A" + } + ], + "title": "Failing Pod Logs", + "type": "logs" + }, { "datasource": { "type": "prometheus", @@ -4314,7 +4713,7 @@ "h": 8, "w": 12, "x": 0, - "y": 72 + "y": 88 }, "id": 36, "options": { @@ -4341,7 +4740,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "topk(15, kube_pod_container_status_restarts_total)", + "expr": "topk(15, round(increase(kube_pod_container_status_restarts_total[$__range])) > 0)", "format": "table", "instant": true, "legendFormat": "", @@ -4379,7 +4778,7 @@ "h": 1, "w": 24, "x": 0, - "y": 80 + "y": 97 }, "id": 50, "panels": [], @@ -4499,7 +4898,7 @@ "h": 8, "w": 12, "x": 0, - "y": 81 + "y": 98 }, "id": 51, "options": { @@ -4642,7 +5041,7 @@ "h": 8, "w": 12, "x": 12, - "y": 81 + "y": 98 }, "id": 52, "options": { @@ -4766,7 +5165,7 @@ "h": 8, "w": 24, "x": 0, - "y": 89 + "y": 106 }, "id": 53, "options": { @@ -4813,7 +5212,7 @@ "h": 1, "w": 24, "x": 0, - "y": 97 + "y": 114 }, "id": 60, "panels": [], @@ -4893,7 +5292,7 @@ "h": 8, "w": 24, "x": 0, - "y": 98 + "y": 115 }, "id": 61, "options": { @@ -4985,6 +5384,19 @@ "refresh": 1, "regex": "", "type": "datasource" + }, + { + "current": { + "text": "Loki", + "value": "P8E80F9AEF21F6940" + }, + "includeAll": false, + "name": "loki_datasource", + "options": [], + "query": "loki", + "refresh": 1, + "regex": "", + "type": "datasource" } ] }, 
diff --git a/stacks/platform/modules/monitoring/loki.yaml b/stacks/platform/modules/monitoring/loki.yaml index ce6c5b69..685031c3 100644 --- a/stacks/platform/modules/monitoring/loki.yaml +++ b/stacks/platform/modules/monitoring/loki.yaml @@ -33,7 +33,7 @@ loki: storage: type: local local: - directory: /loki/rules + directory: /var/loki/rules alertmanager_url: http://prometheus-alertmanager.monitoring.svc.cluster.local:9093 ring: kvstore: @@ -66,7 +66,7 @@ singleBinary: - name: wal mountPath: /loki-wal - name: rules - mountPath: /loki/rules/fake + mountPath: /var/loki/rules/fake resources: requests: cpu: 250m diff --git a/stacks/resume/main.tf b/stacks/resume/main.tf index e1caeccd..d4cc6e32 100644 --- a/stacks/resume/main.tf +++ b/stacks/resume/main.tf @@ -43,7 +43,7 @@ resource "kubernetes_deployment" "printer" { } } spec { - replicas = 1 + replicas = 0 # Scaled down — browserless chromium causes node OOM selector { match_labels = { app = "printer" @@ -147,7 +147,7 @@ resource "kubernetes_deployment" "resume" { } } spec { - replicas = 1 + replicas = 0 # Scaled down with printer — depends on browserless chromium selector { match_labels = { app = "resume" diff --git a/stacks/servarr/flaresolverr/main.tf b/stacks/servarr/flaresolverr/main.tf index 1bd4828a..07b2f717 100644 --- a/stacks/servarr/flaresolverr/main.tf +++ b/stacks/servarr/flaresolverr/main.tf @@ -14,7 +14,7 @@ resource "kubernetes_deployment" "flaresolverr" { } } spec { - replicas = 1 + replicas = 0 # Scaled down — headless Chrome with no effective resource limits causes node OOM selector { match_labels = { app = "flaresolverr"