fix alerts and reduce Prometheus disk write rate

- linkwarden: add Reloader match annotation to DB secret so pods
  auto-restart on Vault credential rotation (was causing 100% 5xx)
- authentik: increase memory limits (server 1Gi→1.5Gi, worker 896Mi→1Gi)
  to prevent OOM kills
- prometheus: drop 113k high-cardinality series to reduce HDD write rate
  from ~8.8 to ~6.0 MB/s (31% reduction):
  - drop all traefik/apiserver/etcd histogram bucket metrics
  - drop goflow2_flow_process_nf_templates_total (9.3k series)
  - drop container_tasks_state and container_memory_failures_total
  - rewrite HighServiceLatency alert to use avg latency (_sum/_count)
  - update cluster_health dashboard to match
- raise KubeletRuntimeOperationsLatency threshold from 30s to 60s
This commit is contained in:
Viktor Barzin 2026-03-28 15:42:14 +02:00
parent 7267e53e2f
commit 8a5a53a832
4 changed files with 334 additions and 10 deletions

View file

@ -25,9 +25,9 @@ server:
resources:
requests:
cpu: 100m
memory: 1Gi
memory: 1.5Gi
limits:
memory: 1Gi
memory: 1.5Gi
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
@ -58,9 +58,9 @@ worker:
resources:
requests:
cpu: 100m
memory: 896Mi
memory: 1Gi
limits:
memory: 896Mi
memory: 1Gi
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname

View file

@ -68,6 +68,11 @@ resource "kubernetes_manifest" "db_external_secret" {
target = {
name = "linkwarden-db-creds"
template = {
metadata = {
annotations = {
"reloader.stakater.com/match" = "true"
}
}
data = {
DATABASE_URL = "postgresql://linkwarden:{{ .password }}@${var.postgresql_host}:5432/linkwarden"
DB_PASSWORD = "{{ .password }}"

View file

@ -3134,7 +3134,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "histogram_quantile(0.99, sum(rate(traefik_entrypoint_request_duration_seconds_bucket[5m])) by (le))",
"expr": "sum(rate(traefik_entrypoint_request_duration_seconds_sum[5m])) / sum(rate(traefik_entrypoint_request_duration_seconds_count[5m]))",
"legendFormat": "",
"refId": "A"
}

View file

@ -281,6 +281,315 @@ server:
insecure_skip_verify: true
serverFiles:
prometheus.yml:
scrape_configs:
- job_name: prometheus
static_configs:
- targets:
- localhost:9090
- job_name: kubernetes-apiservers
scheme: https
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- action: keep
regex: default;kubernetes;https
source_labels:
- __meta_kubernetes_namespace
- __meta_kubernetes_service_name
- __meta_kubernetes_endpoint_port_name
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
metric_relabel_configs:
- source_labels: [__name__]
regex: '(apiserver_request_duration_seconds|apiserver_request_sli_duration_seconds|apiserver_request_body_size_bytes|etcd_request_duration_seconds)_bucket'
action: drop
- job_name: kubernetes-nodes
scheme: https
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
kubernetes_sd_configs:
- role: node
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- replacement: kubernetes.default.svc:443
target_label: __address__
- regex: (.+)
replacement: /api/v1/nodes/$1/proxy/metrics
source_labels:
- __meta_kubernetes_node_name
target_label: __metrics_path__
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
- job_name: kubernetes-nodes-cadvisor
scheme: https
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
kubernetes_sd_configs:
- role: node
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- replacement: kubernetes.default.svc:443
target_label: __address__
- regex: (.+)
replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor
source_labels:
- __meta_kubernetes_node_name
target_label: __metrics_path__
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
metric_relabel_configs:
- source_labels: [__name__]
regex: 'container_tasks_state|container_memory_failures_total'
action: drop
- job_name: kubernetes-service-endpoints
honor_labels: true
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- action: keep
regex: true
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_scrape
- action: drop
regex: true
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_scrape_slow
- action: replace
regex: (https?)
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_scheme
target_label: __scheme__
- action: replace
regex: (.+)
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_path
target_label: __metrics_path__
- action: replace
regex: (.+?)(?::\d+)?;(\d+)
replacement: $1:$2
source_labels:
- __address__
- __meta_kubernetes_service_annotation_prometheus_io_port
target_label: __address__
- action: labelmap
regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+)
replacement: __param_$1
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- action: replace
source_labels:
- __meta_kubernetes_namespace
target_label: namespace
- action: replace
source_labels:
- __meta_kubernetes_service_name
target_label: service
- action: replace
source_labels:
- __meta_kubernetes_pod_node_name
target_label: node
- job_name: kubernetes-service-endpoints-slow
honor_labels: true
scrape_interval: 5m
scrape_timeout: 30s
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- action: keep
regex: true
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_scrape_slow
- action: replace
regex: (https?)
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_scheme
target_label: __scheme__
- action: replace
regex: (.+)
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_path
target_label: __metrics_path__
- action: replace
regex: (.+?)(?::\d+)?;(\d+)
replacement: $1:$2
source_labels:
- __address__
- __meta_kubernetes_service_annotation_prometheus_io_port
target_label: __address__
- action: labelmap
regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+)
replacement: __param_$1
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- action: replace
source_labels:
- __meta_kubernetes_namespace
target_label: namespace
- action: replace
source_labels:
- __meta_kubernetes_service_name
target_label: service
- action: replace
source_labels:
- __meta_kubernetes_pod_node_name
target_label: node
- job_name: prometheus-pushgateway
honor_labels: true
kubernetes_sd_configs:
- role: service
relabel_configs:
- action: keep
regex: pushgateway
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_probe
- job_name: kubernetes-services
honor_labels: true
metrics_path: /probe
params:
module:
- http_2xx
kubernetes_sd_configs:
- role: service
relabel_configs:
- action: keep
regex: true
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_probe
- source_labels:
- __address__
target_label: __param_target
- replacement: blackbox
target_label: __address__
- source_labels:
- __param_target
target_label: instance
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- source_labels:
- __meta_kubernetes_namespace
target_label: namespace
- source_labels:
- __meta_kubernetes_service_name
target_label: service
- job_name: kubernetes-pods
honor_labels: true
kubernetes_sd_configs:
- role: pod
relabel_configs:
- action: keep
regex: true
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_scrape
- action: drop
regex: true
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow
- action: replace
regex: (https?)
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_scheme
target_label: __scheme__
- action: replace
regex: (.+)
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_path
target_label: __metrics_path__
- action: replace
regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})
replacement: '[$2]:$1'
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_port
- __meta_kubernetes_pod_ip
target_label: __address__
- action: replace
regex: (\d+);((([0-9]+?)(\.|$)){4})
replacement: $2:$1
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_port
- __meta_kubernetes_pod_ip
target_label: __address__
- action: labelmap
regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+)
replacement: __param_$1
- action: labelmap
regex: __meta_kubernetes_pod_label_(.+)
- action: replace
source_labels:
- __meta_kubernetes_namespace
target_label: namespace
- action: replace
source_labels:
- __meta_kubernetes_pod_name
target_label: pod
- action: drop
regex: Pending|Succeeded|Failed|Completed
source_labels:
- __meta_kubernetes_pod_phase
- action: replace
source_labels:
- __meta_kubernetes_pod_node_name
target_label: node
- job_name: kubernetes-pods-slow
honor_labels: true
scrape_interval: 5m
scrape_timeout: 30s
kubernetes_sd_configs:
- role: pod
relabel_configs:
- action: keep
regex: true
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow
- action: replace
regex: (https?)
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_scheme
target_label: __scheme__
- action: replace
regex: (.+)
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_path
target_label: __metrics_path__
- action: replace
regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})
replacement: '[$2]:$1'
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_port
- __meta_kubernetes_pod_ip
target_label: __address__
- action: replace
regex: (\d+);((([0-9]+?)(\.|$)){4})
replacement: $2:$1
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_port
- __meta_kubernetes_pod_ip
target_label: __address__
- action: labelmap
regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+)
replacement: __param_$1
- action: labelmap
regex: __meta_kubernetes_pod_label_(.+)
- action: replace
source_labels:
- __meta_kubernetes_namespace
target_label: namespace
- action: replace
source_labels:
- __meta_kubernetes_pod_name
target_label: pod
- action: drop
regex: Pending|Succeeded|Failed|Completed
source_labels:
- __meta_kubernetes_pod_phase
- action: replace
source_labels:
- __meta_kubernetes_pod_node_name
target_label: node
# prometheus.yml:
# storage:
# tsdb:
@ -953,7 +1262,7 @@ serverFiles:
annotations:
summary: "{{ $value | printf \"%.0f\" }} pods stuck in ContainerCreating on {{ $labels.node }}"
- alert: KubeletRuntimeOperationsLatency
expr: histogram_quantile(0.99, sum by (instance, operation_type, le) (rate(kubelet_runtime_operations_duration_seconds_bucket[10m]))) > 30
expr: histogram_quantile(0.99, sum by (instance, operation_type, le) (rate(kubelet_runtime_operations_duration_seconds_bucket[10m]))) > 60
for: 10m
labels:
severity: warning
@ -1012,15 +1321,17 @@ serverFiles:
summary: "4xx rate on {{ $labels.service }}: {{ $value | printf \"%.1f\" }}% (threshold: 30%)"
- alert: HighServiceLatency
expr: |
histogram_quantile(0.99,
sum(rate(traefik_service_request_duration_seconds_bucket{service!~".*idrac.*"}[5m])) by (service, le)
) > 30
(
sum(rate(traefik_service_request_duration_seconds_sum{service!~".*idrac.*"}[5m])) by (service)
/ sum(rate(traefik_service_request_duration_seconds_count{service!~".*idrac.*"}[5m])) by (service)
) > 10
and sum(rate(traefik_service_request_duration_seconds_count{service!~".*idrac.*"}[5m])) by (service) > 0.01
and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
for: 5m
labels:
severity: warning
annotations:
summary: "p99 latency on {{ $labels.service }}: {{ $value | printf \"%.1f\" }}s (threshold: 30s)"
summary: "Avg latency on {{ $labels.service }}: {{ $value | printf \"%.1f\" }}s (threshold: 10s)"
- alert: TLSCertExpiringSoon
expr: (traefik_tls_certs_not_after - time()) / 86400 < 7
for: 1h
@ -1445,6 +1756,10 @@ extraScrapeConfigs: |
regex: metrics
- source_labels: [__meta_kubernetes_pod_name]
target_label: instance
metric_relabel_configs:
- source_labels: [__name__]
regex: 'traefik_(router|service|entrypoint)_request_duration_seconds_bucket'
action: drop
- job_name: 'realestate-crawler-api'
kubernetes_sd_configs:
- role: endpoints
@ -1481,4 +1796,8 @@ extraScrapeConfigs: |
- targets:
- "goflow2.monitoring.svc.cluster.local:8080"
metrics_path: '/metrics'
metric_relabel_configs:
- source_labels: [__name__]
regex: 'goflow2_flow_process_nf_templates_total'
action: drop