Add node hang instrumentation and scale down Chromium services

- Add journald collection to Alloy (loki.source.journal) for kernel OOM,
  panic, hung task, and soft lockup detection — ships system logs off-node
  so they survive hard resets
- Add 5 Loki alerting rules (KernelOOMKiller, KernelPanic, KernelHungTask,
  KernelSoftLockup, ContainerdDown) evaluating against node-journal logs;
  a representative rule is sketched below
- Fix Loki ruler config: correct rules mount path (/var/loki/rules/fake),
  add alertmanager_url and enable_api
- Add Prometheus alerts: NodeMemoryPressureTrending (>85%), NodeExporterDown,
  NodeHighIOWait (>30%); one is sketched below
- Add caretta tolerations for control-plane and GPU nodes
- Scale down Chromium-based services to 0 for cluster stability:
  f1-stream, flaresolverr, changedetection, resume/printer
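
The rule files themselves aren't among the hunks below, so the following is a sketch only. Assuming the node-journal labels defined by the Alloy relabel rules in this commit, the KernelOOMKiller rule might look roughly like this (the match string and window are assumptions, not the committed rule):

groups:
  - name: node-kernel
    rules:
      - alert: KernelOOMKiller
        # LogQL metric query over kernel-transport journal lines;
        # the exact match pattern is an assumption
        expr: |
          sum by (node) (
            count_over_time({job="node-journal", transport="kernel"}
              |~ "Out of memory: Killed process" [5m])
          ) > 0
        labels:
          severity: critical
        annotations:
          summary: "OOM killer fired on {{ $labels.node }}"

Likewise, a plausible shape for the NodeHighIOWait Prometheus alert, using the 30% threshold from the bullet above (the 15m hold period is an assumption):

- alert: NodeHighIOWait
  expr: avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 30
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: "CPU iowait above 30% on {{ $labels.instance }}"
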
Author: Viktor Barzin
Date:   2026-03-11 22:46:33 +00:00
parent 8029823f79
commit ce79bd5c04
8 changed files with 517 additions and 16 deletions

View file

@@ -43,7 +43,7 @@ resource "kubernetes_deployment" "changedetection" {
}
}
spec {
replicas = 1
replicas = 0 # Scaled down: sockpuppetbrowser (headless Chromium sidecar) causes node OOM
strategy {
type = "Recreate"
}

View file

@@ -39,7 +39,7 @@ resource "kubernetes_deployment" "f1-stream" {
}
}
spec {
replicas = 1
replicas = 0 # Scaled down for cluster stability: periodic scans cause memory pressure
selector {
match_labels = {
app = "f1-stream"

View file

@@ -99,6 +99,56 @@ alloy:
forward_to = [loki.write.default.receiver]
}
// Node-level journal log collection for kernel panics, OOMs, hung tasks, etc.
// Ships system logs off-node so they survive hard resets.
loki.source.journal "node_journal" {
forward_to = [loki.process.journal.receiver]
relabel_rules = loki.relabel.journal.rules
labels = {
job = "node-journal",
}
max_age = "12h"
}
loki.relabel "journal" {
forward_to = []
rule {
source_labels = ["__journal__hostname"]
target_label = "node"
}
rule {
source_labels = ["__journal__systemd_unit"]
target_label = "unit"
}
rule {
source_labels = ["__journal_priority_keyword"]
target_label = "level"
}
rule {
source_labels = ["__journal__transport"]
target_label = "transport"
}
}
// Forward warning+ journal entries (priority 0-4: emerg, alert, crit, err, warning)
// Also forwards kernel transport entries regardless of priority for OOM/panic detection.
loki.process "journal" {
stage.static_labels {
values = {
cluster = "default",
}
}
// Drop info/debug/notice entries that aren't from the kernel transport
stage.match {
selector = "{job=\"node-journal\", level=~\"info|notice|debug\", transport!=\"kernel\"}"
action = "drop"
}
forward_to = [loki.write.default.receiver]
}
// Kubernetes audit log collection from /var/log/kubernetes/audit.log
// Requires alloy.mounts.varlog=true to mount /var/log from the host
local.file_match "audit_logs" {
@@ -117,6 +167,33 @@ alloy:
# Mount /var/log from the host for file-based log collection (audit logs)
mounts:
varlog: true
# Mount journal directories for loki.source.journal
extra:
- name: journal-run
mountPath: /run/log/journal
readOnly: true
- name: journal-var
mountPath: /var/log/journal
readOnly: true
- name: machine-id
mountPath: /etc/machine-id
readOnly: true
controller:
volumes:
extra:
- name: journal-run
hostPath:
path: /run/log/journal
type: DirectoryOrCreate
- name: journal-var
hostPath:
path: /var/log/journal
type: DirectoryOrCreate
- name: machine-id
hostPath:
path: /etc/machine-id
type: File
# Resource limits for DaemonSet pods
# Alloy tails logs from all containers on the node via K8s API and batches
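
Once the DaemonSet rolls with these mounts, the journal pipeline can be sanity-checked from Grafana Explore. A LogQL query along these lines (label names come from the relabel rules above) should return kernel-transport entries, for example OOM or hung-task lines after an incident:

{job="node-journal", transport="kernel"} |~ "(?i)out of memory|hung task|soft lockup"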

View file

@@ -14,6 +14,18 @@ resource "helm_release" "caretta" {
victoria-metrics-single = {
enabled = false
}
tolerations = [
{
key = "node-role.kubernetes.io/control-plane"
operator = "Exists"
effect = "NoSchedule"
},
{
key = "nvidia.com/gpu"
operator = "Exists"
effect = "NoSchedule"
}
]
resources = {
requests = {
cpu = "10m"

View file

@@ -4202,7 +4202,7 @@
"h": 8,
"w": 12,
"x": 12,
"y": 68
"y": 72
},
"id": 35,
"options": {
@@ -4233,6 +4233,405 @@
"title": "Restart Rate (24h)",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "orange",
"value": 80
},
{
"color": "green",
"value": 95
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 4,
"x": 12,
"y": 68
},
"id": 112,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"textMode": "auto"
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(kube_pod_status_phase{phase=\"Running\"}) / count(kube_pod_info) * 100",
"legendFormat": "",
"refId": "A"
}
],
"title": "Healthy Pods %",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "orange",
"value": 1
},
{
"color": "red",
"value": 5
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 4,
"x": 16,
"y": 68
},
"id": 113,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"textMode": "auto"
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "count(kube_pod_status_phase{phase=~\"Failed|Pending|Unknown\"}) OR vector(0)",
"legendFormat": "",
"refId": "A"
}
],
"title": "Unhealthy Pods",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "orange",
"value": 80
},
{
"color": "green",
"value": 95
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 4,
"x": 20,
"y": 68
},
"id": 114,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"textMode": "auto"
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum(kube_deployment_status_replicas_available) / sum(kube_deployment_spec_replicas) * 100",
"legendFormat": "",
"refId": "A"
}
],
"title": "Deployment Readiness %",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"custom": {
"align": "auto",
"cellOptions": {
"type": "auto"
},
"inspect": false
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
}
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Ready Replicas"
},
"properties": [
{
"id": "custom.cellOptions",
"value": {
"mode": "gradient",
"type": "gauge"
}
},
{
"id": "thresholds",
"value": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "green",
"value": 1
}
]
}
}
]
}
]
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 72
},
"id": 115,
"options": {
"cellHeight": "sm",
"footer": {
"countRows": false,
"fields": "",
"reducer": ["sum"],
"show": false
},
"showHeader": true,
"sortBy": [
{
"desc": false,
"displayName": "namespace"
}
]
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum by (namespace) (kube_pod_status_phase{phase=\"Running\"})",
"format": "table",
"instant": true,
"legendFormat": "",
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "count by (namespace) (kube_pod_info)",
"format": "table",
"instant": true,
"legendFormat": "",
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum by (namespace) (kube_deployment_status_replicas_available)",
"format": "table",
"instant": true,
"legendFormat": "",
"refId": "C"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum by (namespace) (kube_deployment_spec_replicas)",
"format": "table",
"instant": true,
"legendFormat": "",
"refId": "D"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "count by (namespace) (kube_deployment_spec_replicas)",
"format": "table",
"instant": true,
"legendFormat": "",
"refId": "E"
}
],
"title": "Pod & Deployment Health by Namespace",
"transformations": [
{
"id": "merge",
"options": {}
},
{
"id": "organize",
"options": {
"excludeByName": {
"Time": true
},
"renameByName": {
"Value #A": "Running Pods",
"Value #B": "Total Pods",
"Value #C": "Ready Replicas",
"Value #D": "Desired Replicas",
"Value #E": "Deployments"
}
}
}
],
"type": "table"
},
{
"datasource": {
"type": "loki",
"uid": "${loki_datasource}"
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 80
},
"id": 116,
"options": {
"dedupStrategy": "exact",
"enableLogDetails": true,
"prettifyLogMessage": false,
"showCommonLabels": false,
"showLabels": false,
"showTime": true,
"sortOrder": "Descending",
"wrapLogMessage": true
},
"targets": [
{
"datasource": {
"type": "loki",
"uid": "${loki_datasource}"
},
"expr": "{namespace=~\".+\"} |~ \"(?i)(error|panic|OOMKilled|CrashLoopBackOff|fatal)\"",
"refId": "A"
}
],
"title": "Failing Pod Logs",
"type": "logs"
},
{
"datasource": {
"type": "prometheus",
@@ -4314,7 +4713,7 @@
"h": 8,
"w": 12,
"x": 0,
"y": 72
"y": 88
},
"id": 36,
"options": {
@@ -4341,7 +4740,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "topk(15, kube_pod_container_status_restarts_total)",
"expr": "topk(15, round(increase(kube_pod_container_status_restarts_total[$__range])) > 0)",
"format": "table",
"instant": true,
"legendFormat": "",
@@ -4379,7 +4778,7 @@
"h": 1,
"w": 24,
"x": 0,
"y": 80
"y": 97
},
"id": 50,
"panels": [],
@@ -4499,7 +4898,7 @@
"h": 8,
"w": 12,
"x": 0,
"y": 81
"y": 98
},
"id": 51,
"options": {
@@ -4642,7 +5041,7 @@
"h": 8,
"w": 12,
"x": 12,
"y": 81
"y": 98
},
"id": 52,
"options": {
@@ -4766,7 +5165,7 @@
"h": 8,
"w": 24,
"x": 0,
"y": 89
"y": 106
},
"id": 53,
"options": {
@@ -4813,7 +5212,7 @@
"h": 1,
"w": 24,
"x": 0,
"y": 97
"y": 114
},
"id": 60,
"panels": [],
@@ -4893,7 +5292,7 @@
"h": 8,
"w": 24,
"x": 0,
"y": 98
"y": 115
},
"id": 61,
"options": {
@@ -4985,6 +5384,19 @@
"refresh": 1,
"regex": "",
"type": "datasource"
},
{
"current": {
"text": "Loki",
"value": "P8E80F9AEF21F6940"
},
"includeAll": false,
"name": "loki_datasource",
"options": [],
"query": "loki",
"refresh": 1,
"regex": "",
"type": "datasource"
}
]
},

View file

@@ -33,7 +33,7 @@ loki:
storage:
type: local
local:
directory: /loki/rules
directory: /var/loki/rules
alertmanager_url: http://prometheus-alertmanager.monitoring.svc.cluster.local:9093
ring:
kvstore:
@@ -66,7 +66,7 @@ singleBinary:
- name: wal
mountPath: /loki-wal
- name: rules
mountPath: /loki/rules/fake
mountPath: /var/loki/rules/fake
resources:
requests:
cpu: 250m
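
For reference, the ruler stanza after these two fixes should line up roughly as below. The mount path must end in a per-tenant subdirectory, and fake is the tenant ID Loki uses when multi-tenancy (auth) is disabled, hence /var/loki/rules/fake. enable_api is mentioned in the commit message but its hunk isn't shown here, so its placement is an assumption:

ruler:
  enable_api: true  # assumed from the commit message; hunk not shown
  alertmanager_url: http://prometheus-alertmanager.monitoring.svc.cluster.local:9093
  storage:
    type: local
    local:
      directory: /var/loki/rules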

View file

@@ -43,7 +43,7 @@ resource "kubernetes_deployment" "printer" {
}
}
spec {
replicas = 1
replicas = 0 # Scaled down: browserless Chromium causes node OOM
selector {
match_labels = {
app = "printer"
@@ -147,7 +147,7 @@ resource "kubernetes_deployment" "resume" {
}
}
spec {
replicas = 1
replicas = 0 # Scaled down along with printer; depends on browserless Chromium
selector {
match_labels = {
app = "resume"

View file

@@ -14,7 +14,7 @@ resource "kubernetes_deployment" "flaresolverr" {
}
}
spec {
replicas = 1
replicas = 0 # Scaled down: headless Chrome with no effective resource limits causes node OOM
selector {
match_labels = {
app = "flaresolverr"