Add node hang instrumentation and scale down chromium services
- Add journald collection to Alloy (loki.source.journal) for kernel OOM, panic, hung task, and soft lockup detection — ships system logs off-node so they survive hard resets - Add 5 Loki alerting rules (KernelOOMKiller, KernelPanic, KernelHungTask, KernelSoftLockup, ContainerdDown) evaluating against node-journal logs - Fix Loki ruler config: correct rules mount path (/var/loki/rules/fake), add alertmanager_url and enable_api - Add Prometheus alerts: NodeMemoryPressureTrending (>85%), NodeExporterDown, NodeHighIOWait (>30%) - Add caretta tolerations for control-plane and GPU nodes - Scale down chromium-based services to 0 for cluster stability: f1-stream, flaresolverr, changedetection, resume/printer
This commit is contained in:
parent
8029823f79
commit
ce79bd5c04
8 changed files with 517 additions and 16 deletions
|
|
@ -99,6 +99,56 @@ alloy:
|
|||
forward_to = [loki.write.default.receiver]
|
||||
}
|
||||
|
||||
// Node-level journal log collection for kernel panics, OOMs, hung tasks, etc.
|
||||
// Ships system logs off-node so they survive hard resets.
|
||||
loki.source.journal "node_journal" {
|
||||
forward_to = [loki.process.journal.receiver]
|
||||
relabel_rules = loki.relabel.journal.rules
|
||||
labels = {
|
||||
job = "node-journal",
|
||||
}
|
||||
max_age = "12h"
|
||||
}
|
||||
|
||||
loki.relabel "journal" {
|
||||
forward_to = []
|
||||
|
||||
rule {
|
||||
source_labels = ["__journal__hostname"]
|
||||
target_label = "node"
|
||||
}
|
||||
rule {
|
||||
source_labels = ["__journal__systemd_unit"]
|
||||
target_label = "unit"
|
||||
}
|
||||
rule {
|
||||
source_labels = ["__journal_priority_keyword"]
|
||||
target_label = "level"
|
||||
}
|
||||
rule {
|
||||
source_labels = ["__journal__transport"]
|
||||
target_label = "transport"
|
||||
}
|
||||
}
|
||||
|
||||
// Forward warning+ journal entries (priority 0-4: emerg, alert, crit, err, warning)
|
||||
// Also forwards kernel transport entries regardless of priority for OOM/panic detection.
|
||||
loki.process "journal" {
|
||||
stage.static_labels {
|
||||
values = {
|
||||
cluster = "default",
|
||||
}
|
||||
}
|
||||
|
||||
// Drop info/debug/notice entries that aren't from the kernel transport
|
||||
stage.match {
|
||||
selector = "{job=\"node-journal\", level=~\"info|notice|debug\", transport!=\"kernel\"}"
|
||||
action = "drop"
|
||||
}
|
||||
|
||||
forward_to = [loki.write.default.receiver]
|
||||
}
|
||||
|
||||
// Kubernetes audit log collection from /var/log/kubernetes/audit.log
|
||||
// Requires alloy.mounts.varlog=true to mount /var/log from the host
|
||||
local.file_match "audit_logs" {
|
||||
|
|
@ -117,6 +167,33 @@ alloy:
|
|||
# Mount /var/log from the host for file-based log collection (audit logs)
|
||||
mounts:
|
||||
varlog: true
|
||||
# Mount journal directories for loki.source.journal
|
||||
extra:
|
||||
- name: journal-run
|
||||
mountPath: /run/log/journal
|
||||
readOnly: true
|
||||
- name: journal-var
|
||||
mountPath: /var/log/journal
|
||||
readOnly: true
|
||||
- name: machine-id
|
||||
mountPath: /etc/machine-id
|
||||
readOnly: true
|
||||
|
||||
controller:
|
||||
volumes:
|
||||
extra:
|
||||
- name: journal-run
|
||||
hostPath:
|
||||
path: /run/log/journal
|
||||
type: DirectoryOrCreate
|
||||
- name: journal-var
|
||||
hostPath:
|
||||
path: /var/log/journal
|
||||
type: DirectoryOrCreate
|
||||
- name: machine-id
|
||||
hostPath:
|
||||
path: /etc/machine-id
|
||||
type: File
|
||||
|
||||
# Resource limits for DaemonSet pods
|
||||
# Alloy tails logs from all containers on the node via K8s API and batches
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue