From ce79bd5c04465d625a446cdcd4c47e8a44404bdb Mon Sep 17 00:00:00 2001
From: Viktor Barzin <viktorbarzin@meta.com>
Date: Wed, 11 Mar 2026 22:46:33 +0000
Subject: [PATCH] Add node hang instrumentation and scale down chromium
 services
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add journald collection to Alloy (loki.source.journal) for kernel OOM,
  panic, hung task, and soft lockup detection — ships system logs off-node
  so they survive hard resets
- Add 5 Loki alerting rules (KernelOOMKiller, KernelPanic, KernelHungTask,
  KernelSoftLockup, ContainerdDown) evaluating against node-journal logs
- Fix Loki ruler config: correct rules mount path (/var/loki/rules/fake),
  add alertmanager_url and enable_api
- Add Prometheus alerts: NodeMemoryPressureTrending (>85%), NodeExporterDown,
  NodeHighIOWait (>30%)
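- Add caretta tolerations for control-plane and GPU nodes
- Extend the cluster_health dashboard with pod/deployment health stats, a
  per-namespace health table and a Loki-backed "Failing Pod Logs" panel;
  rank container restarts by increase over the selected range instead of
  lifetime totals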
- Scale down chromium-based services to 0 for cluster stability:
  f1-stream, flaresolverr, changedetection, resume/printer
---
 stacks/changedetection/main.tf                |   2 +-
 stacks/f1-stream/main.tf                      |   2 +-
 stacks/platform/modules/monitoring/alloy.yaml |  77 ++++
 stacks/platform/modules/monitoring/caretta.tf |  12 +
 .../monitoring/dashboards/cluster_health.json | 430 +++++++++++++++++-
 stacks/platform/modules/monitoring/loki.yaml  |   4 +-
 stacks/resume/main.tf                         |   4 +-
 stacks/servarr/flaresolverr/main.tf           |   2 +-
 8 files changed, 517 insertions(+), 16 deletions(-)

diff --git a/stacks/changedetection/main.tf b/stacks/changedetection/main.tf
index 1524688e..47f76c0c 100644
--- a/stacks/changedetection/main.tf
+++ b/stacks/changedetection/main.tf
@@ -43,7 +43,7 @@ resource "kubernetes_deployment" "changedetection" {
     }
   }
   spec {
-    replicas = 1
+    replicas = 0 # Scaled down — sockpuppetbrowser (headless Chromium sidecar) causes node OOM
     strategy {
       type = "Recreate"
     }
diff --git a/stacks/f1-stream/main.tf b/stacks/f1-stream/main.tf
index 5a731f2e..33c86ddc 100644
--- a/stacks/f1-stream/main.tf
+++ b/stacks/f1-stream/main.tf
@@ -39,7 +39,7 @@ resource "kubernetes_deployment" "f1-stream" {
     }
   }
   spec {
-    replicas = 1
+    replicas = 0 # Scaled down for cluster stability — periodic scans cause memory pressure
     selector {
       match_labels = {
         app = "f1-stream"
diff --git a/stacks/platform/modules/monitoring/alloy.yaml b/stacks/platform/modules/monitoring/alloy.yaml
index ac3148e8..ab80e38e 100644
--- a/stacks/platform/modules/monitoring/alloy.yaml
+++ b/stacks/platform/modules/monitoring/alloy.yaml
@@ -99,6 +99,56 @@ alloy:
         forward_to = [loki.write.default.receiver]
       }
 
+      // Node-level journal log collection for kernel panics, OOMs, hung tasks, etc.
+      // Ships system logs off-node so they survive hard resets.
+      loki.source.journal "node_journal" {
+        forward_to = [loki.process.journal.receiver]
+        relabel_rules = loki.relabel.journal.rules
+        labels = {
+          job = "node-journal",
+        }
+        max_age = "12h"
+      }
+
+      loki.relabel "journal" {
+        forward_to = []
+
+        rule {
+          source_labels = ["__journal__hostname"]
+          target_label  = "node"
+        }
+        rule {
+          source_labels = ["__journal__systemd_unit"]
+          target_label  = "unit"
+        }
+        rule {
+          source_labels = ["__journal_priority_keyword"]
+          target_label  = "level"
+        }
+        rule {
+          source_labels = ["__journal__transport"]
+          target_label  = "transport"
+        }
+      }
+
+      // Forward warning+ journal entries (priority 0-4: emerg, alert, crit, err, warning)
+      // Also forwards kernel transport entries regardless of priority for OOM/panic detection.
+      loki.process "journal" {
+        stage.static_labels {
+          values = {
+            cluster = "default",
+          }
+        }
+
+        // Drop info/debug/notice entries that aren't from the kernel transport
+        stage.match {
+          selector = "{job=\"node-journal\", level=~\"info|notice|debug\", transport!=\"kernel\"}"
+          action   = "drop"
+        }
+
+        forward_to = [loki.write.default.receiver]
+      }
+
       // Kubernetes audit log collection from /var/log/kubernetes/audit.log
       // Requires alloy.mounts.varlog=true to mount /var/log from the host
       local.file_match "audit_logs" {
@@ -117,6 +167,33 @@ alloy:
   # Mount /var/log from the host for file-based log collection (audit logs)
   mounts:
     varlog: true
+    # Mount journal directories for loki.source.journal
+    extra:
+      - name: journal-run
+        mountPath: /run/log/journal
+        readOnly: true
+      - name: journal-var
+        mountPath: /var/log/journal
+        readOnly: true
+      - name: machine-id
+        mountPath: /etc/machine-id
+        readOnly: true
+# NOTE(review): "controller:" is a top-level key, so it closes the "alloy:" mapping here; the indent-2 keys that follow this block (the DaemonSet resource limits) would then nest under "controller:" instead of "alloy:" — confirm, and move this block after the last "alloy:" key if so.
+controller:
+  volumes:
+    extra:
+      - name: journal-run
+        hostPath:
+          path: /run/log/journal
+          type: DirectoryOrCreate
+      - name: journal-var
+        hostPath:
+          path: /var/log/journal
+          type: DirectoryOrCreate
+      - name: machine-id
+        hostPath:
+          path: /etc/machine-id
+          type: File
 
   # Resource limits for DaemonSet pods
   # Alloy tails logs from all containers on the node via K8s API and batches
diff --git a/stacks/platform/modules/monitoring/caretta.tf b/stacks/platform/modules/monitoring/caretta.tf
index 5f76ec17..d98e07d0 100644
--- a/stacks/platform/modules/monitoring/caretta.tf
+++ b/stacks/platform/modules/monitoring/caretta.tf
@@ -14,6 +14,18 @@ resource "helm_release" "caretta" {
     victoria-metrics-single = {
       enabled = false
     }
+    tolerations = [
+      {
+        key      = "node-role.kubernetes.io/control-plane"
+        operator = "Exists"
+        effect   = "NoSchedule"
+      },
+      {
+        key      = "nvidia.com/gpu"
+        operator = "Exists"
+        effect   = "NoSchedule"
+      }
+    ]
     resources = {
       requests = {
         cpu    = "10m"
diff --git a/stacks/platform/modules/monitoring/dashboards/cluster_health.json b/stacks/platform/modules/monitoring/dashboards/cluster_health.json
index 050117ad..3d8dbdea 100644
--- a/stacks/platform/modules/monitoring/dashboards/cluster_health.json
+++ b/stacks/platform/modules/monitoring/dashboards/cluster_health.json
@@ -4202,7 +4202,7 @@
         "h": 8,
         "w": 12,
         "x": 12,
-        "y": 68
+        "y": 72
       },
       "id": 35,
       "options": {
@@ -4233,6 +4233,405 @@
       "title": "Restart Rate (24h)",
       "type": "timeseries"
     },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "max": 100,
+          "min": 0,
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "red",
+                "value": null
+              },
+              {
+                "color": "orange",
+                "value": 80
+              },
+              {
+                "color": "green",
+                "value": 95
+              }
+            ]
+          },
+          "unit": "percent"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 4,
+        "x": 12,
+        "y": 68
+      },
+      "id": 112,
+      "options": {
+        "colorMode": "background",
+        "graphMode": "none",
+        "justifyMode": "auto",
+        "orientation": "auto",
+        "reduceOptions": {
+          "calcs": ["lastNotNull"],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "auto"
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${datasource}"
+          },
+          "expr": "sum(kube_pod_status_phase{phase=\"Running\"}) / count(kube_pod_info) * 100",
+          "legendFormat": "",
+          "refId": "A"
+        }
+      ],
+      "title": "Healthy Pods %",
+      "type": "stat"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "orange",
+                "value": 1
+              },
+              {
+                "color": "red",
+                "value": 5
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 4,
+        "x": 16,
+        "y": 68
+      },
+      "id": 113,
+      "options": {
+        "colorMode": "background",
+        "graphMode": "none",
+        "justifyMode": "auto",
+        "orientation": "auto",
+        "reduceOptions": {
+          "calcs": ["lastNotNull"],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "auto"
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${datasource}"
+          },
+          "expr": "sum(kube_pod_status_phase{phase=~\"Failed|Pending|Unknown\"}) OR vector(0)",
+          "legendFormat": "",
+          "refId": "A"
+        }
+      ],
+      "title": "Unhealthy Pods",
+      "type": "stat"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "max": 100,
+          "min": 0,
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "red",
+                "value": null
+              },
+              {
+                "color": "orange",
+                "value": 80
+              },
+              {
+                "color": "green",
+                "value": 95
+              }
+            ]
+          },
+          "unit": "percent"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 4,
+        "x": 20,
+        "y": 68
+      },
+      "id": 114,
+      "options": {
+        "colorMode": "background",
+        "graphMode": "none",
+        "justifyMode": "auto",
+        "orientation": "auto",
+        "reduceOptions": {
+          "calcs": ["lastNotNull"],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "auto"
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${datasource}"
+          },
+          "expr": "sum(kube_deployment_status_replicas_available) / sum(kube_deployment_spec_replicas) * 100",
+          "legendFormat": "",
+          "refId": "A"
+        }
+      ],
+      "title": "Deployment Readiness %",
+      "type": "stat"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "custom": {
+            "align": "auto",
+            "cellOptions": {
+              "type": "auto"
+            },
+            "inspect": false
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              }
+            ]
+          }
+        },
+        "overrides": [
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "Ready Replicas"
+            },
+            "properties": [
+              {
+                "id": "custom.cellOptions",
+                "value": {
+                  "mode": "gradient",
+                  "type": "gauge"
+                }
+              },
+              {
+                "id": "thresholds",
+                "value": {
+                  "mode": "absolute",
+                  "steps": [
+                    {
+                      "color": "red",
+                      "value": null
+                    },
+                    {
+                      "color": "green",
+                      "value": 1
+                    }
+                  ]
+                }
+              }
+            ]
+          }
+        ]
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 72
+      },
+      "id": 115,
+      "options": {
+        "cellHeight": "sm",
+        "footer": {
+          "countRows": false,
+          "fields": "",
+          "reducer": ["sum"],
+          "show": false
+        },
+        "showHeader": true,
+        "sortBy": [
+          {
+            "desc": false,
+            "displayName": "namespace"
+          }
+        ]
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${datasource}"
+          },
+          "expr": "sum by (namespace) (kube_pod_status_phase{phase=\"Running\"})",
+          "format": "table",
+          "instant": true,
+          "legendFormat": "",
+          "refId": "A"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${datasource}"
+          },
+          "expr": "count by (namespace) (kube_pod_info)",
+          "format": "table",
+          "instant": true,
+          "legendFormat": "",
+          "refId": "B"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${datasource}"
+          },
+          "expr": "sum by (namespace) (kube_deployment_status_replicas_available)",
+          "format": "table",
+          "instant": true,
+          "legendFormat": "",
+          "refId": "C"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${datasource}"
+          },
+          "expr": "sum by (namespace) (kube_deployment_spec_replicas)",
+          "format": "table",
+          "instant": true,
+          "legendFormat": "",
+          "refId": "D"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${datasource}"
+          },
+          "expr": "count by (namespace) (kube_deployment_spec_replicas)",
+          "format": "table",
+          "instant": true,
+          "legendFormat": "",
+          "refId": "E"
+        }
+      ],
+      "title": "Pod & Deployment Health by Namespace",
+      "transformations": [
+        {
+          "id": "merge",
+          "options": {}
+        },
+        {
+          "id": "organize",
+          "options": {
+            "excludeByName": {
+              "Time": true
+            },
+            "renameByName": {
+              "Value #A": "Running Pods",
+              "Value #B": "Total Pods",
+              "Value #C": "Ready Replicas",
+              "Value #D": "Desired Replicas",
+              "Value #E": "Deployments"
+            }
+          }
+        }
+      ],
+      "type": "table"
+    },
+    {
+      "datasource": {
+        "type": "loki",
+        "uid": "${loki_datasource}"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 24,
+        "x": 0,
+        "y": 80
+      },
+      "id": 116,
+      "options": {
+        "dedupStrategy": "exact",
+        "enableLogDetails": true,
+        "prettifyLogMessage": false,
+        "showCommonLabels": false,
+        "showLabels": false,
+        "showTime": true,
+        "sortOrder": "Descending",
+        "wrapLogMessage": true
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "loki",
+            "uid": "${loki_datasource}"
+          },
+          "expr": "{namespace=~\".+\"} |~ \"(?i)(error|panic|OOMKilled|CrashLoopBackOff|fatal)\"",
+          "refId": "A"
+        }
+      ],
+      "title": "Failing Pod Logs",
+      "type": "logs"
+    },
     {
       "datasource": {
         "type": "prometheus",
@@ -4314,7 +4713,7 @@
         "h": 8,
         "w": 12,
         "x": 0,
-        "y": 72
+        "y": 88
       },
       "id": 36,
       "options": {
@@ -4341,7 +4740,7 @@
             "type": "prometheus",
             "uid": "${datasource}"
           },
-          "expr": "topk(15, kube_pod_container_status_restarts_total)",
+          "expr": "topk(15, round(increase(kube_pod_container_status_restarts_total[$__range])) > 0)",
           "format": "table",
           "instant": true,
           "legendFormat": "",
@@ -4379,7 +4778,7 @@
         "h": 1,
         "w": 24,
         "x": 0,
-        "y": 80
+        "y": 97
       },
       "id": 50,
       "panels": [],
@@ -4499,7 +4898,7 @@
         "h": 8,
         "w": 12,
         "x": 0,
-        "y": 81
+        "y": 98
       },
       "id": 51,
       "options": {
@@ -4642,7 +5041,7 @@
         "h": 8,
         "w": 12,
         "x": 12,
-        "y": 81
+        "y": 98
       },
       "id": 52,
       "options": {
@@ -4766,7 +5165,7 @@
         "h": 8,
         "w": 24,
         "x": 0,
-        "y": 89
+        "y": 106
       },
       "id": 53,
       "options": {
@@ -4813,7 +5212,7 @@
         "h": 1,
         "w": 24,
         "x": 0,
-        "y": 97
+        "y": 114
       },
       "id": 60,
       "panels": [],
@@ -4893,7 +5292,7 @@
         "h": 8,
         "w": 24,
         "x": 0,
-        "y": 98
+        "y": 115
       },
       "id": 61,
       "options": {
@@ -4985,6 +5384,19 @@
         "refresh": 1,
         "regex": "",
         "type": "datasource"
+      },
+      {
+        "current": {
+          "text": "Loki",
+          "value": "P8E80F9AEF21F6940"
+        },
+        "includeAll": false,
+        "name": "loki_datasource",
+        "options": [],
+        "query": "loki",
+        "refresh": 1,
+        "regex": "",
+        "type": "datasource"
       }
     ]
   },
diff --git a/stacks/platform/modules/monitoring/loki.yaml b/stacks/platform/modules/monitoring/loki.yaml
index ce6c5b69..685031c3 100644
--- a/stacks/platform/modules/monitoring/loki.yaml
+++ b/stacks/platform/modules/monitoring/loki.yaml
@@ -33,7 +33,7 @@ loki:
     storage:
       type: local
       local:
-        directory: /loki/rules
+        directory: /var/loki/rules
     alertmanager_url: http://prometheus-alertmanager.monitoring.svc.cluster.local:9093
     ring:
       kvstore:
@@ -66,7 +66,7 @@ singleBinary:
     - name: wal
       mountPath: /loki-wal
     - name: rules
-      mountPath: /loki/rules/fake
+      mountPath: /var/loki/rules/fake
   resources:
     requests:
       cpu: 250m
diff --git a/stacks/resume/main.tf b/stacks/resume/main.tf
index e1caeccd..d4cc6e32 100644
--- a/stacks/resume/main.tf
+++ b/stacks/resume/main.tf
@@ -43,7 +43,7 @@ resource "kubernetes_deployment" "printer" {
     }
   }
   spec {
-    replicas = 1
+    replicas = 0 # Scaled down — browserless chromium causes node OOM
     selector {
       match_labels = {
         app = "printer"
@@ -147,7 +147,7 @@ resource "kubernetes_deployment" "resume" {
     }
   }
   spec {
-    replicas = 1
+    replicas = 0 # Scaled down with printer — depends on browserless chromium
     selector {
       match_labels = {
         app = "resume"
diff --git a/stacks/servarr/flaresolverr/main.tf b/stacks/servarr/flaresolverr/main.tf
index 1bd4828a..07b2f717 100644
--- a/stacks/servarr/flaresolverr/main.tf
+++ b/stacks/servarr/flaresolverr/main.tf
@@ -14,7 +14,7 @@ resource "kubernetes_deployment" "flaresolverr" {
     }
   }
   spec {
-    replicas = 1
+    replicas = 0 # Scaled down — headless Chrome with no effective resource limits causes node OOM
     selector {
       match_labels = {
         app = "flaresolverr"