Add node hang instrumentation and scale down chromium services

- Add journald collection to Alloy (loki.source.journal) for kernel OOM,
  panic, hung task, and soft lockup detection — ships system logs off-node
  so they survive hard resets
- Add 5 Loki alerting rules (KernelOOMKiller, KernelPanic, KernelHungTask,
  KernelSoftLockup, ContainerdDown) evaluating against node-journal logs
- Fix Loki ruler config: correct rules mount path (/var/loki/rules/fake),
  add alertmanager_url and enable_api
- Add Prometheus alerts: NodeMemoryPressureTrending (>85%), NodeExporterDown,
  NodeHighIOWait (>30%)
- Add caretta tolerations for control-plane and GPU nodes
- Scale down chromium-based services to 0 for cluster stability:
  f1-stream, flaresolverr, changedetection, resume/printer
This commit is contained in:
Viktor Barzin 2026-03-11 22:46:33 +00:00
parent 8029823f79
commit ce79bd5c04
8 changed files with 517 additions and 16 deletions

View file

@ -99,6 +99,56 @@ alloy:
forward_to = [loki.write.default.receiver]
}
// Node-level journal log collection for kernel panics, OOMs, hung tasks, etc.
// Ships system logs off-node so they survive hard resets.
loki.source.journal "node_journal" {
forward_to = [loki.process.journal.receiver]
relabel_rules = loki.relabel.journal.rules
labels = {
job = "node-journal",
}
max_age = "12h"
}
loki.relabel "journal" {
forward_to = []
rule {
source_labels = ["__journal__hostname"]
target_label = "node"
}
rule {
source_labels = ["__journal__systemd_unit"]
target_label = "unit"
}
rule {
source_labels = ["__journal_priority_keyword"]
target_label = "level"
}
rule {
source_labels = ["__journal__transport"]
target_label = "transport"
}
}
// Forward warning+ journal entries (priority 0-4: emerg, alert, crit, err, warning)
// Also forwards kernel transport entries regardless of priority for OOM/panic detection.
loki.process "journal" {
stage.static_labels {
values = {
cluster = "default",
}
}
// Drop info/debug/notice entries that aren't from the kernel transport
stage.match {
selector = "{job=\"node-journal\", level=~\"info|notice|debug\", transport!=\"kernel\"}"
action = "drop"
}
forward_to = [loki.write.default.receiver]
}
// Kubernetes audit log collection from /var/log/kubernetes/audit.log
// Requires alloy.mounts.varlog=true to mount /var/log from the host
local.file_match "audit_logs" {
@ -117,6 +167,33 @@ alloy:
# Mount /var/log from the host for file-based log collection (audit logs)
mounts:
varlog: true
# Mount journal directories for loki.source.journal
extra:
- name: journal-run
mountPath: /run/log/journal
readOnly: true
- name: journal-var
mountPath: /var/log/journal
readOnly: true
- name: machine-id
mountPath: /etc/machine-id
readOnly: true
controller:
volumes:
extra:
- name: journal-run
hostPath:
path: /run/log/journal
type: DirectoryOrCreate
- name: journal-var
hostPath:
path: /var/log/journal
type: DirectoryOrCreate
- name: machine-id
hostPath:
path: /etc/machine-id
type: File
# Resource limits for DaemonSet pods
# Alloy tails logs from all containers on the node via K8s API and batches