Add node hang instrumentation and scale down chromium services

- Add journald collection to Alloy (loki.source.journal) for kernel OOM, panic, hung task, and soft lockup detection — ships system logs off-node so they survive hard resets
- Add 5 Loki alerting rules (KernelOOMKiller, KernelPanic, KernelHungTask, KernelSoftLockup, ContainerdDown) evaluating against node-journal logs
- Fix Loki ruler config: correct rules mount path (/var/loki/rules/fake), add alertmanager_url and enable_api
- Add Prometheus alerts: NodeMemoryPressureTrending (>85%), NodeExporterDown, NodeHighIOWait (>30%)
- Add caretta tolerations for control-plane and GPU nodes
- Scale down chromium-based services to 0 for cluster stability: f1-stream, flaresolverr, changedetection, resume/printer
2026-03-11 22:46:33 +00:00 · 2026-03-11 22:46:33 +00:00 · ce79bd5c04
commit ce79bd5c04
parent 8029823f79
8 changed files with 517 additions and 16 deletions
--- a/stacks/platform/modules/monitoring/caretta.tf
+++ b/stacks/platform/modules/monitoring/caretta.tf
@@ -14,6 +14,18 @@ resource "helm_release" "caretta" {
    victoria-metrics-single = {
      enabled = false
    }
+    tolerations = [
+      {
+        key      = "node-role.kubernetes.io/control-plane"
+        operator = "Exists"
+        effect   = "NoSchedule"
+      },
+      {
+        key      = "nvidia.com/gpu"
+        operator = "Exists"
+        effect   = "NoSchedule"
+      }
+    ]
    resources = {
      requests = {
        cpu    = "10m"