- Add journald collection to Alloy (loki.source.journal) for kernel OOM, panic, hung task, and soft lockup detection — ships system logs off-node so they survive hard resets
- Add 5 Loki alerting rules (KernelOOMKiller, KernelPanic, KernelHungTask, KernelSoftLockup, ContainerdDown) evaluating against node-journal logs
- Fix Loki ruler config: correct rules mount path (/var/loki/rules/fake), add alertmanager_url and enable_api
- Add Prometheus alerts: NodeMemoryPressureTrending (>85%), NodeExporterDown, NodeHighIOWait (>30%)
- Add caretta tolerations for control-plane and GPU nodes
- Scale down chromium-based services to 0 for cluster stability: f1-stream, flaresolverr, changedetection, resume/printer
110 lines
2 KiB
YAML
110 lines
2 KiB
YAML
# Loki application settings for a single-replica, filesystem-backed install.
loki:
  commonConfig:
    replication_factor: 1

  schemaConfig:
    configs:
      - from: "2025-04-01"
        store: tsdb
        object_store: filesystem
        schema: v13
        index:
          prefix: loki_index_
          period: 24h

  ingester:
    chunk_idle_period: 12h
    max_chunk_age: 24h
    chunk_retain_period: 1m
    chunk_target_size: 1572864  # 1.5 MiB
    wal:
      # Same path as the `wal` volume mount under singleBinary.extraVolumeMounts.
      dir: /loki-wal

  pattern_ingester:
    enabled: true

  limits_config:
    allow_structured_metadata: true
    volume_enabled: true
    retention_period: 720h  # 30 days

  compactor:
    retention_enabled: true
    working_directory: /var/loki/compactor
    compaction_interval: 1h
    delete_request_store: filesystem

  # NOTE(review): the upstream chart documents `loki.rulerConfig` for ruler
  # settings — confirm that this chart version actually renders `loki.ruler`.
  ruler:
    enable_api: true
    storage:
      type: local
      local:
        # Rule files are mounted one level below, at /var/loki/rules/fake
        # (see singleBinary.extraVolumeMounts); presumably `fake` is the
        # tenant directory used while auth_enabled is false — TODO confirm.
        directory: /var/loki/rules
    alertmanager_url: http://prometheus-alertmanager.monitoring.svc.cluster.local:9093
    ring:
      kvstore:
        store: inmemory
    rule_path: /var/loki/scratch

  storage:
    type: "filesystem"

  auth_enabled: false

# MinIO is switched off; loki.storage.type is "filesystem".
minio:
  enabled: false

# Run every Loki component in one process/pod.
deploymentMode: SingleBinary

singleBinary:
  replicas: 1

  persistence:
    enabled: true
    size: 50Gi
    storageClass: "iscsi-truenas"

  extraVolumes:
    # Memory-backed scratch space for the ingester WAL (loki.ingester.wal.dir).
    # NOTE(review): a Memory-medium emptyDir is tmpfs — its usage counts
    # against the container memory limit and it is emptied when the pod is
    # removed; confirm that trade-off is intended for the WAL.
    - name: wal
      emptyDir:
        medium: Memory
        sizeLimit: 2Gi
    # Alerting rules supplied by the loki-alert-rules ConfigMap.
    - name: rules
      configMap:
        name: loki-alert-rules

  extraVolumeMounts:
    - name: wal
      mountPath: /loki-wal
    # Sits under the ruler's local rule directory (/var/loki/rules).
    - name: rules
      mountPath: /var/loki/rules/fake

  resources:
    requests:
      cpu: 250m
      memory: 2Gi
    limits:
      cpu: "1"
      memory: 4Gi

# Zero out replica counts of other deployment modes
backend:
  replicas: 0
read:
  replicas: 0
write:
  replicas: 0
ingester:
  replicas: 0
querier:
  replicas: 0
queryFrontend:
  replicas: 0
queryScheduler:
  replicas: 0
distributor:
  replicas: 0
compactor:
  replicas: 0
indexGateway:
  replicas: 0
bloomCompactor:
  replicas: 0
bloomGateway:
  replicas: 0

# Disable optional components for single binary mode
gateway:
  enabled: false
chunksCache:
  enabled: false
resultsCache:
  enabled: false