Add node hang instrumentation and scale down chromium services
- Add journald collection to Alloy (loki.source.journal) for kernel OOM, panic, hung task, and soft lockup detection — ships system logs off-node so they survive hard resets
- Add 5 Loki alerting rules (KernelOOMKiller, KernelPanic, KernelHungTask, KernelSoftLockup, ContainerdDown) evaluating against node-journal logs
- Fix Loki ruler config: correct rules mount path (/var/loki/rules/fake), add alertmanager_url and enable_api
- Add Prometheus alerts: NodeMemoryPressureTrending (>85%), NodeExporterDown, NodeHighIOWait (>30%)
- Add caretta tolerations for control-plane and GPU nodes
- Scale down chromium-based services to 0 for cluster stability: f1-stream, flaresolverr, changedetection, resume/printer
This commit is contained in:
parent
8029823f79
commit
ce79bd5c04
8 changed files with 517 additions and 16 deletions
|
|
@ -43,7 +43,7 @@ resource "kubernetes_deployment" "changedetection" {
|
|||
}
|
||||
}
|
||||
spec {
|
||||
replicas = 1
|
||||
replicas = 0 # Scaled down — sockpuppetbrowser (headless Chromium sidecar) causes node OOM
|
||||
strategy {
|
||||
type = "Recreate"
|
||||
}
|
||||
|
|
|
|||
|
|
@ -39,7 +39,7 @@ resource "kubernetes_deployment" "f1-stream" {
|
|||
}
|
||||
}
|
||||
spec {
|
||||
replicas = 1
|
||||
replicas = 0 # Scaled down for cluster stability — periodic scans cause memory pressure
|
||||
selector {
|
||||
match_labels = {
|
||||
app = "f1-stream"
|
||||
|
|
|
|||
|
|
@ -99,6 +99,56 @@ alloy:
|
|||
forward_to = [loki.write.default.receiver]
|
||||
}
|
||||
|
||||
// Node-level journal log collection for kernel panics, OOMs, hung tasks, etc.
|
||||
// Ships system logs off-node so they survive hard resets.
|
||||
loki.source.journal "node_journal" {
|
||||
forward_to = [loki.process.journal.receiver]
|
||||
relabel_rules = loki.relabel.journal.rules
|
||||
labels = {
|
||||
job = "node-journal",
|
||||
}
|
||||
max_age = "12h"
|
||||
}
|
||||
|
||||
loki.relabel "journal" {
|
||||
forward_to = []
|
||||
|
||||
rule {
|
||||
source_labels = ["__journal__hostname"]
|
||||
target_label = "node"
|
||||
}
|
||||
rule {
|
||||
source_labels = ["__journal__systemd_unit"]
|
||||
target_label = "unit"
|
||||
}
|
||||
rule {
|
||||
source_labels = ["__journal_priority_keyword"]
|
||||
target_label = "level"
|
||||
}
|
||||
rule {
|
||||
source_labels = ["__journal__transport"]
|
||||
target_label = "transport"
|
||||
}
|
||||
}
|
||||
|
||||
// Forward warning+ journal entries (priority 0-4: emerg, alert, crit, err, warning)
|
||||
// Also forwards kernel transport entries regardless of priority for OOM/panic detection.
|
||||
loki.process "journal" {
|
||||
stage.static_labels {
|
||||
values = {
|
||||
cluster = "default",
|
||||
}
|
||||
}
|
||||
|
||||
// Drop info/debug/notice entries that aren't from the kernel transport
|
||||
stage.match {
|
||||
selector = "{job=\"node-journal\", level=~\"info|notice|debug\", transport!=\"kernel\"}"
|
||||
action = "drop"
|
||||
}
|
||||
|
||||
forward_to = [loki.write.default.receiver]
|
||||
}
|
||||
|
||||
// Kubernetes audit log collection from /var/log/kubernetes/audit.log
|
||||
// Requires alloy.mounts.varlog=true to mount /var/log from the host
|
||||
local.file_match "audit_logs" {
|
||||
|
|
@ -117,6 +167,33 @@ alloy:
|
|||
# Mount /var/log from the host for file-based log collection (audit logs)
|
||||
mounts:
|
||||
varlog: true
|
||||
# Mount journal directories for loki.source.journal
|
||||
extra:
|
||||
- name: journal-run
|
||||
mountPath: /run/log/journal
|
||||
readOnly: true
|
||||
- name: journal-var
|
||||
mountPath: /var/log/journal
|
||||
readOnly: true
|
||||
- name: machine-id
|
||||
mountPath: /etc/machine-id
|
||||
readOnly: true
|
||||
|
||||
controller:
|
||||
volumes:
|
||||
extra:
|
||||
- name: journal-run
|
||||
hostPath:
|
||||
path: /run/log/journal
|
||||
type: DirectoryOrCreate
|
||||
- name: journal-var
|
||||
hostPath:
|
||||
path: /var/log/journal
|
||||
type: DirectoryOrCreate
|
||||
- name: machine-id
|
||||
hostPath:
|
||||
path: /etc/machine-id
|
||||
type: File
|
||||
|
||||
# Resource limits for DaemonSet pods
|
||||
# Alloy tails logs from all containers on the node via K8s API and batches
|
||||
|
|
|
|||
|
|
@ -14,6 +14,18 @@ resource "helm_release" "caretta" {
|
|||
victoria-metrics-single = {
|
||||
enabled = false
|
||||
}
|
||||
tolerations = [
|
||||
{
|
||||
key = "node-role.kubernetes.io/control-plane"
|
||||
operator = "Exists"
|
||||
effect = "NoSchedule"
|
||||
},
|
||||
{
|
||||
key = "nvidia.com/gpu"
|
||||
operator = "Exists"
|
||||
effect = "NoSchedule"
|
||||
}
|
||||
]
|
||||
resources = {
|
||||
requests = {
|
||||
cpu = "10m"
|
||||
|
|
|
|||
|
|
@ -4202,7 +4202,7 @@
|
|||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 68
|
||||
"y": 72
|
||||
},
|
||||
"id": 35,
|
||||
"options": {
|
||||
|
|
@ -4233,6 +4233,405 @@
|
|||
"title": "Restart Rate (24h)",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"max": 100,
|
||||
"min": 0,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "orange",
|
||||
"value": 80
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 95
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 4,
|
||||
"x": 12,
|
||||
"y": 68
|
||||
},
|
||||
"id": 112,
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "sum(kube_pod_status_phase{phase=\"Running\"}) / count(kube_pod_info) * 100",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Healthy Pods %",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "orange",
|
||||
"value": 1
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 5
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 4,
|
||||
"x": 16,
|
||||
"y": 68
|
||||
},
|
||||
"id": 113,
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "sum(kube_pod_status_phase{phase=~\"Failed|Pending|Unknown\"}) OR vector(0)",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Unhealthy Pods",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"max": 100,
|
||||
"min": 0,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "orange",
|
||||
"value": 80
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 95
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 4,
|
||||
"x": 20,
|
||||
"y": 68
|
||||
},
|
||||
"id": 114,
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "sum(kube_deployment_status_replicas_available) / sum(kube_deployment_spec_replicas) * 100",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Deployment Readiness %",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"custom": {
|
||||
"align": "auto",
|
||||
"cellOptions": {
|
||||
"type": "auto"
|
||||
},
|
||||
"inspect": false
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "Ready Replicas"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "custom.cellOptions",
|
||||
"value": {
|
||||
"mode": "gradient",
|
||||
"type": "gauge"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 72
|
||||
},
|
||||
"id": 115,
|
||||
"options": {
|
||||
"cellHeight": "sm",
|
||||
"footer": {
|
||||
"countRows": false,
|
||||
"fields": "",
|
||||
"reducer": ["sum"],
|
||||
"show": false
|
||||
},
|
||||
"showHeader": true,
|
||||
"sortBy": [
|
||||
{
|
||||
"desc": false,
|
||||
"displayName": "namespace"
|
||||
}
|
||||
]
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "sum by (namespace) (kube_pod_status_phase{phase=\"Running\"})",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "count by (namespace) (kube_pod_info)",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"legendFormat": "",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "sum by (namespace) (kube_deployment_status_replicas_available)",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"legendFormat": "",
|
||||
"refId": "C"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "sum by (namespace) (kube_deployment_spec_replicas)",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"legendFormat": "",
|
||||
"refId": "D"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "count by (namespace) (kube_deployment_spec_replicas)",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"legendFormat": "",
|
||||
"refId": "E"
|
||||
}
|
||||
],
|
||||
"title": "Pod & Deployment Health by Namespace",
|
||||
"transformations": [
|
||||
{
|
||||
"id": "merge",
|
||||
"options": {}
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"Time": true
|
||||
},
|
||||
"renameByName": {
|
||||
"Value #A": "Running Pods",
|
||||
"Value #B": "Total Pods",
|
||||
"Value #C": "Ready Replicas",
|
||||
"Value #D": "Desired Replicas",
|
||||
"Value #E": "Deployments"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"type": "table"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "loki",
|
||||
"uid": "${loki_datasource}"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 80
|
||||
},
|
||||
"id": 116,
|
||||
"options": {
|
||||
"dedupStrategy": "exact",
|
||||
"enableLogDetails": true,
|
||||
"prettifyLogMessage": false,
|
||||
"showCommonLabels": false,
|
||||
"showLabels": false,
|
||||
"showTime": true,
|
||||
"sortOrder": "Descending",
|
||||
"wrapLogMessage": true
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "loki",
|
||||
"uid": "${loki_datasource}"
|
||||
},
|
||||
"expr": "{namespace=~\".+\"} |~ \"(?i)(error|panic|OOMKilled|CrashLoopBackOff|fatal)\"",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Failing Pod Logs",
|
||||
"type": "logs"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
|
|
@ -4314,7 +4713,7 @@
|
|||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 72
|
||||
"y": 88
|
||||
},
|
||||
"id": 36,
|
||||
"options": {
|
||||
|
|
@ -4341,7 +4740,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "topk(15, kube_pod_container_status_restarts_total)",
|
||||
"expr": "topk(15, round(increase(kube_pod_container_status_restarts_total[$__range])) > 0)",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"legendFormat": "",
|
||||
|
|
@ -4379,7 +4778,7 @@
|
|||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 80
|
||||
"y": 97
|
||||
},
|
||||
"id": 50,
|
||||
"panels": [],
|
||||
|
|
@ -4499,7 +4898,7 @@
|
|||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 81
|
||||
"y": 98
|
||||
},
|
||||
"id": 51,
|
||||
"options": {
|
||||
|
|
@ -4642,7 +5041,7 @@
|
|||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 81
|
||||
"y": 98
|
||||
},
|
||||
"id": 52,
|
||||
"options": {
|
||||
|
|
@ -4766,7 +5165,7 @@
|
|||
"h": 8,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 89
|
||||
"y": 106
|
||||
},
|
||||
"id": 53,
|
||||
"options": {
|
||||
|
|
@ -4813,7 +5212,7 @@
|
|||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 97
|
||||
"y": 114
|
||||
},
|
||||
"id": 60,
|
||||
"panels": [],
|
||||
|
|
@ -4893,7 +5292,7 @@
|
|||
"h": 8,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 98
|
||||
"y": 115
|
||||
},
|
||||
"id": 61,
|
||||
"options": {
|
||||
|
|
@ -4985,6 +5384,19 @@
|
|||
"refresh": 1,
|
||||
"regex": "",
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"current": {
|
||||
"text": "Loki",
|
||||
"value": "P8E80F9AEF21F6940"
|
||||
},
|
||||
"includeAll": false,
|
||||
"name": "loki_datasource",
|
||||
"options": [],
|
||||
"query": "loki",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"type": "datasource"
|
||||
}
|
||||
]
|
||||
},
|
||||
|
|
|
|||
|
|
@ -33,7 +33,7 @@ loki:
|
|||
storage:
|
||||
type: local
|
||||
local:
|
||||
directory: /loki/rules
|
||||
directory: /var/loki/rules
|
||||
alertmanager_url: http://prometheus-alertmanager.monitoring.svc.cluster.local:9093
|
||||
ring:
|
||||
kvstore:
|
||||
|
|
@ -66,7 +66,7 @@ singleBinary:
|
|||
- name: wal
|
||||
mountPath: /loki-wal
|
||||
- name: rules
|
||||
mountPath: /loki/rules/fake
|
||||
mountPath: /var/loki/rules/fake
|
||||
resources:
|
||||
requests:
|
||||
cpu: 250m
|
||||
|
|
|
|||
|
|
@ -43,7 +43,7 @@ resource "kubernetes_deployment" "printer" {
|
|||
}
|
||||
}
|
||||
spec {
|
||||
replicas = 1
|
||||
replicas = 0 # Scaled down — browserless chromium causes node OOM
|
||||
selector {
|
||||
match_labels = {
|
||||
app = "printer"
|
||||
|
|
@ -147,7 +147,7 @@ resource "kubernetes_deployment" "resume" {
|
|||
}
|
||||
}
|
||||
spec {
|
||||
replicas = 1
|
||||
replicas = 0 # Scaled down with printer — depends on browserless chromium
|
||||
selector {
|
||||
match_labels = {
|
||||
app = "resume"
|
||||
|
|
|
|||
|
|
@ -14,7 +14,7 @@ resource "kubernetes_deployment" "flaresolverr" {
|
|||
}
|
||||
}
|
||||
spec {
|
||||
replicas = 1
|
||||
replicas = 0 # Scaled down — headless Chrome with no effective resource limits causes node OOM
|
||||
selector {
|
||||
match_labels = {
|
||||
app = "flaresolverr"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue