Add node hang instrumentation and scale down Chromium services

- Add journald collection to Alloy (loki.source.journal) for kernel OOM,
  panic, hung task, and soft lockup detection — ships system logs off-node
  so they survive hard resets
- Add 5 Loki alerting rules (KernelOOMKiller, KernelPanic, KernelHungTask,
  KernelSoftLockup, ContainerdDown) evaluating against node-journal logs;
  a representative rule is sketched below
- Fix Loki ruler config: correct rules mount path (/var/loki/rules/fake),
  add alertmanager_url and enable_api
- Add Prometheus alerts: NodeMemoryPressureTrending (>85%), NodeExporterDown,
  NodeHighIOWait (>30%); one is sketched below
- Add caretta tolerations for control-plane and GPU nodes
- Scale down Chromium-based services to 0 for cluster stability:
  f1-stream, flaresolverr, changedetection, resume/printer
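
The rule files themselves aren't among the hunks below, so the following is a sketch only. Assuming the node-journal labels defined by the Alloy relabel rules in this commit, the KernelOOMKiller rule might look roughly like this (the match string and window are assumptions, not the committed rule):

groups:
  - name: node-kernel
    rules:
      - alert: KernelOOMKiller
        # LogQL metric query over kernel-transport journal lines;
        # the exact match pattern is an assumption
        expr: |
          sum by (node) (
            count_over_time({job="node-journal", transport="kernel"}
              |~ "Out of memory: Killed process" [5m])
          ) > 0
        labels:
          severity: critical
        annotations:
          summary: "OOM killer fired on {{ $labels.node }}"

Likewise, a plausible shape for the NodeHighIOWait Prometheus alert, using the 30% threshold from the bullet above (the 15m hold period is an assumption):

- alert: NodeHighIOWait
  expr: avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 30
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: "CPU iowait above 30% on {{ $labels.instance }}"
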
Author: Viktor Barzin
Date:   2026-03-11 22:46:33 +00:00
parent 8029823f79
commit ce79bd5c04
8 changed files with 517 additions and 16 deletions

View file

@@ -43,7 +43,7 @@ resource "kubernetes_deployment" "changedetection" {
}
}
spec {
replicas = 1
replicas = 0 # Scaled down: sockpuppetbrowser (headless Chromium sidecar) causes node OOM
strategy {
type = "Recreate"
}

View file

@@ -39,7 +39,7 @@ resource "kubernetes_deployment" "f1-stream" {
}
}
spec {
replicas = 1
replicas = 0 # Scaled down for cluster stability: periodic scans cause memory pressure
selector {
match_labels = {
app = "f1-stream"

View file

@@ -99,6 +99,56 @@ alloy:
forward_to = [loki.write.default.receiver]
}
// Node-level journal log collection for kernel panics, OOMs, hung tasks, etc.
// Ships system logs off-node so they survive hard resets.
loki.source.journal "node_journal" {
forward_to = [loki.process.journal.receiver]
relabel_rules = loki.relabel.journal.rules
labels = {
job = "node-journal",
}
max_age = "12h"
}
loki.relabel "journal" {
forward_to = []
rule {
source_labels = ["__journal__hostname"]
target_label = "node"
}
rule {
source_labels = ["__journal__systemd_unit"]
target_label = "unit"
}
rule {
source_labels = ["__journal_priority_keyword"]
target_label = "level"
}
rule {
source_labels = ["__journal__transport"]
target_label = "transport"
}
}
// Forward warning+ journal entries (priority 0-4: emerg, alert, crit, err, warning)
// Also forwards kernel transport entries regardless of priority for OOM/panic detection.
loki.process "journal" {
stage.static_labels {
values = {
cluster = "default",
}
}
// Drop info/debug/notice entries that aren't from the kernel transport
stage.match {
selector = "{job=\"node-journal\", level=~\"info|notice|debug\", transport!=\"kernel\"}"
action = "drop"
}
forward_to = [loki.write.default.receiver]
}
// Kubernetes audit log collection from /var/log/kubernetes/audit.log
// Requires alloy.mounts.varlog=true to mount /var/log from the host
local.file_match "audit_logs" {
@@ -117,6 +167,33 @@ alloy:
# Mount /var/log from the host for file-based log collection (audit logs)
mounts:
varlog: true
# Mount journal directories for loki.source.journal
extra:
- name: journal-run
mountPath: /run/log/journal
readOnly: true
- name: journal-var
mountPath: /var/log/journal
readOnly: true
- name: machine-id
mountPath: /etc/machine-id
readOnly: true
controller:
volumes:
extra:
- name: journal-run
hostPath:
path: /run/log/journal
type: DirectoryOrCreate
- name: journal-var
hostPath:
path: /var/log/journal
type: DirectoryOrCreate
- name: machine-id
hostPath:
path: /etc/machine-id
type: File
# Resource limits for DaemonSet pods
# Alloy tails logs from all containers on the node via K8s API and batches
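
Once the DaemonSet rolls with these mounts, the journal pipeline can be sanity-checked from Grafana Explore. A LogQL query along these lines (label names come from the relabel rules above) should return kernel-transport entries, for example OOM or hung-task lines after an incident:

{job="node-journal", transport="kernel"} |~ "(?i)out of memory|hung task|soft lockup"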

View file

@@ -14,6 +14,18 @@ resource "helm_release" "caretta" {
victoria-metrics-single = {
enabled = false
}
tolerations = [
{
key = "node-role.kubernetes.io/control-plane"
operator = "Exists"
effect = "NoSchedule"
},
{
key = "nvidia.com/gpu"
operator = "Exists"
effect = "NoSchedule"
}
]
resources = {
requests = {
cpu = "10m"

View file

@@ -4202,7 +4202,7 @@
"h": 8,
"w": 12,
"x": 12,
"y": 68
"y": 72
},
"id": 35,
"options": {
@@ -4233,6 +4233,405 @@
"title": "Restart Rate (24h)",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "orange",
"value": 80
},
{
"color": "green",
"value": 95
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 4,
"x": 12,
"y": 68
},
"id": 112,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"textMode": "auto"
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(kube_pod_status_phase{phase=\"Running\"}) / count(kube_pod_info) * 100",
"legendFormat": "",
"refId": "A"
}
],
"title": "Healthy Pods %",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "orange",
"value": 1
},
{
"color": "red",
"value": 5
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 4,
"x": 16,
"y": 68
},
"id": 113,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"textMode": "auto"
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "count(kube_pod_status_phase{phase=~\"Failed|Pending|Unknown\"}) OR vector(0)",
"legendFormat": "",
"refId": "A"
}
],
"title": "Unhealthy Pods",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "orange",
"value": 80
},
{
"color": "green",
"value": 95
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 4,
"x": 20,
"y": 68
},
"id": 114,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"textMode": "auto"
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(kube_deployment_status_replicas_available) / sum(kube_deployment_spec_replicas) * 100",
"legendFormat": "",
"refId": "A"
}
],
"title": "Deployment Readiness %",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"custom": {
"align": "auto",
"cellOptions": {
"type": "auto"
},
"inspect": false
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
}
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Ready Replicas"
},
"properties": [
{
"id": "custom.cellOptions",
"value": {
"mode": "gradient",
"type": "gauge"
}
},
{
"id": "thresholds",
"value": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "green",
"value": 1
}
]
}
}
]
}
]
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 72
},
"id": 115,
"options": {
"cellHeight": "sm",
"footer": {
"countRows": false,
"fields": "",
"reducer": ["sum"],
"show": false
},
"showHeader": true,
"sortBy": [
{
"desc": false,
"displayName": "namespace"
}
]
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum by (namespace) (kube_pod_status_phase{phase=\"Running\"})",
"format": "table",
"instant": true,
"legendFormat": "",
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "count by (namespace) (kube_pod_info)",
"format": "table",
"instant": true,
"legendFormat": "",
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum by (namespace) (kube_deployment_status_replicas_available)",
"format": "table",
"instant": true,
"legendFormat": "",
"refId": "C"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum by (namespace) (kube_deployment_spec_replicas)",
"format": "table",
"instant": true,
"legendFormat": "",
"refId": "D"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "count by (namespace) (kube_deployment_spec_replicas)",
"format": "table",
"instant": true,
"legendFormat": "",
"refId": "E"
}
],
"title": "Pod & Deployment Health by Namespace",
"transformations": [
{
"id": "merge",
"options": {}
},
{
"id": "organize",
"options": {
"excludeByName": {
"Time": true
},
"renameByName": {
"Value #A": "Running Pods",
"Value #B": "Total Pods",
"Value #C": "Ready Replicas",
"Value #D": "Desired Replicas",
"Value #E": "Deployments"
}
}
}
],
"type": "table"
},
{
"datasource": {
"type": "loki",
"uid": "${loki_datasource}"
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 80
},
"id": 116,
"options": {
"dedupStrategy": "exact",
"enableLogDetails": true,
"prettifyLogMessage": false,
"showCommonLabels": false,
"showLabels": false,
"showTime": true,
"sortOrder": "Descending",
"wrapLogMessage": true
},
"targets": [
{
"datasource": {
"type": "loki",
"uid": "${loki_datasource}"
},
"expr": "{namespace=~\".+\"} |~ \"(?i)(error|panic|OOMKilled|CrashLoopBackOff|fatal)\"",
"refId": "A"
}
],
"title": "Failing Pod Logs",
"type": "logs"
},
{
"datasource": {
"type": "prometheus",
@@ -4314,7 +4713,7 @@
"h": 8,
"w": 12,
"x": 0,
"y": 72
"y": 88
},
"id": 36,
"options": {
@@ -4341,7 +4740,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "topk(15, kube_pod_container_status_restarts_total)",
"expr": "topk(15, round(increase(kube_pod_container_status_restarts_total[$__range])) > 0)",
"format": "table",
"instant": true,
"legendFormat": "",
@@ -4379,7 +4778,7 @@
"h": 1,
"w": 24,
"x": 0,
"y": 80
"y": 97
},
"id": 50,
"panels": [],
@@ -4499,7 +4898,7 @@
"h": 8,
"w": 12,
"x": 0,
"y": 81
"y": 98
},
"id": 51,
"options": {
@@ -4642,7 +5041,7 @@
"h": 8,
"w": 12,
"x": 12,
"y": 81
"y": 98
},
"id": 52,
"options": {
@@ -4766,7 +5165,7 @@
"h": 8,
"w": 24,
"x": 0,
"y": 89
"y": 106
},
"id": 53,
"options": {
@@ -4813,7 +5212,7 @@
"h": 1,
"w": 24,
"x": 0,
"y": 97
"y": 114
},
"id": 60,
"panels": [],
@@ -4893,7 +5292,7 @@
"h": 8,
"w": 24,
"x": 0,
"y": 98
"y": 115
},
"id": 61,
"options": {
@@ -4985,6 +5384,19 @@
"refresh": 1,
"regex": "",
"type": "datasource"
},
{
"current": {
"text": "Loki",
"value": "P8E80F9AEF21F6940"
},
"includeAll": false,
"name": "loki_datasource",
"options": [],
"query": "loki",
"refresh": 1,
"regex": "",
"type": "datasource"
}
]
},

View file

@@ -33,7 +33,7 @@ loki:
storage:
type: local
local:
directory: /loki/rules
directory: /var/loki/rules
alertmanager_url: http://prometheus-alertmanager.monitoring.svc.cluster.local:9093
ring:
kvstore:
@@ -66,7 +66,7 @@ singleBinary:
- name: wal
mountPath: /loki-wal
- name: rules
mountPath: /loki/rules/fake
mountPath: /var/loki/rules/fake
resources:
requests:
cpu: 250m
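
For reference, the ruler stanza after these two fixes should line up roughly as below. The mount path must end in a per-tenant subdirectory, and fake is the tenant ID Loki uses when multi-tenancy (auth) is disabled, hence /var/loki/rules/fake. enable_api is mentioned in the commit message but its hunk isn't shown here, so its placement is an assumption:

ruler:
  enable_api: true  # assumed from the commit message; hunk not shown
  alertmanager_url: http://prometheus-alertmanager.monitoring.svc.cluster.local:9093
  storage:
    type: local
    local:
      directory: /var/loki/rules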

View file

@@ -43,7 +43,7 @@ resource "kubernetes_deployment" "printer" {
}
}
spec {
replicas = 1
replicas = 0 # Scaled down: browserless Chromium causes node OOM
selector {
match_labels = {
app = "printer"
@@ -147,7 +147,7 @@ resource "kubernetes_deployment" "resume" {
}
}
spec {
replicas = 1
replicas = 0 # Scaled down along with printer; depends on browserless Chromium
selector {
match_labels = {
app = "resume"

View file

@@ -14,7 +14,7 @@ resource "kubernetes_deployment" "flaresolverr" {
}
}
spec {
replicas = 1
replicas = 0 # Scaled down: headless Chrome with no effective resource limits causes node OOM
selector {
match_labels = {
app = "flaresolverr"