fix alerts and reduce Prometheus disk write rate
- linkwarden: add Reloader match annotation to DB secret so pods auto-restart on Vault credential rotation (was causing 100% 5xx) - authentik: increase memory limits (server 1Gi→1.5Gi, worker 896Mi→1Gi) to prevent OOM kills - prometheus: drop 113k high-cardinality series to reduce HDD write rate from ~8.8 to ~6.0 MB/s (31% reduction): - drop all traefik/apiserver/etcd histogram bucket metrics - drop goflow2_flow_process_nf_templates_total (9.3k series) - drop container_tasks_state and container_memory_failures_total - rewrite HighServiceLatency alert to use avg latency (_sum/_count) - update cluster_health dashboard to match - raise KubeletRuntimeOperationsLatency threshold from 30s to 60s
This commit is contained in:
parent
7267e53e2f
commit
8a5a53a832
4 changed files with 334 additions and 10 deletions
|
|
@ -25,9 +25,9 @@ server:
|
||||||
resources:
|
resources:
|
||||||
requests:
|
requests:
|
||||||
cpu: 100m
|
cpu: 100m
|
||||||
memory: 1Gi
|
memory: 1.5Gi
|
||||||
limits:
|
limits:
|
||||||
memory: 1Gi
|
memory: 1.5Gi
|
||||||
topologySpreadConstraints:
|
topologySpreadConstraints:
|
||||||
- maxSkew: 1
|
- maxSkew: 1
|
||||||
topologyKey: kubernetes.io/hostname
|
topologyKey: kubernetes.io/hostname
|
||||||
|
|
@ -58,9 +58,9 @@ worker:
|
||||||
resources:
|
resources:
|
||||||
requests:
|
requests:
|
||||||
cpu: 100m
|
cpu: 100m
|
||||||
memory: 896Mi
|
memory: 1Gi
|
||||||
limits:
|
limits:
|
||||||
memory: 896Mi
|
memory: 1Gi
|
||||||
topologySpreadConstraints:
|
topologySpreadConstraints:
|
||||||
- maxSkew: 1
|
- maxSkew: 1
|
||||||
topologyKey: kubernetes.io/hostname
|
topologyKey: kubernetes.io/hostname
|
||||||
|
|
|
||||||
|
|
@ -68,6 +68,11 @@ resource "kubernetes_manifest" "db_external_secret" {
|
||||||
target = {
|
target = {
|
||||||
name = "linkwarden-db-creds"
|
name = "linkwarden-db-creds"
|
||||||
template = {
|
template = {
|
||||||
|
metadata = {
|
||||||
|
annotations = {
|
||||||
|
"reloader.stakater.com/match" = "true"
|
||||||
|
}
|
||||||
|
}
|
||||||
data = {
|
data = {
|
||||||
DATABASE_URL = "postgresql://linkwarden:{{ .password }}@${var.postgresql_host}:5432/linkwarden"
|
DATABASE_URL = "postgresql://linkwarden:{{ .password }}@${var.postgresql_host}:5432/linkwarden"
|
||||||
DB_PASSWORD = "{{ .password }}"
|
DB_PASSWORD = "{{ .password }}"
|
||||||
|
|
|
||||||
|
|
@ -3134,7 +3134,7 @@
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "${datasource}"
|
"uid": "${datasource}"
|
||||||
},
|
},
|
||||||
"expr": "histogram_quantile(0.99, sum(rate(traefik_entrypoint_request_duration_seconds_bucket[5m])) by (le))",
|
"expr": "sum(rate(traefik_entrypoint_request_duration_seconds_sum[5m])) / sum(rate(traefik_entrypoint_request_duration_seconds_count[5m]))",
|
||||||
"legendFormat": "",
|
"legendFormat": "",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -281,6 +281,315 @@ server:
|
||||||
insecure_skip_verify: true
|
insecure_skip_verify: true
|
||||||
|
|
||||||
serverFiles:
|
serverFiles:
|
||||||
|
prometheus.yml:
|
||||||
|
scrape_configs:
|
||||||
|
- job_name: prometheus
|
||||||
|
static_configs:
|
||||||
|
- targets:
|
||||||
|
- localhost:9090
|
||||||
|
- job_name: kubernetes-apiservers
|
||||||
|
scheme: https
|
||||||
|
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||||
|
kubernetes_sd_configs:
|
||||||
|
- role: endpoints
|
||||||
|
relabel_configs:
|
||||||
|
- action: keep
|
||||||
|
regex: default;kubernetes;https
|
||||||
|
source_labels:
|
||||||
|
- __meta_kubernetes_namespace
|
||||||
|
- __meta_kubernetes_service_name
|
||||||
|
- __meta_kubernetes_endpoint_port_name
|
||||||
|
tls_config:
|
||||||
|
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
|
||||||
|
insecure_skip_verify: true
|
||||||
|
metric_relabel_configs:
|
||||||
|
- source_labels: [__name__]
|
||||||
|
regex: '(apiserver_request_duration_seconds|apiserver_request_sli_duration_seconds|apiserver_request_body_size_bytes|etcd_request_duration_seconds)_bucket'
|
||||||
|
action: drop
|
||||||
|
- job_name: kubernetes-nodes
|
||||||
|
scheme: https
|
||||||
|
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||||
|
kubernetes_sd_configs:
|
||||||
|
- role: node
|
||||||
|
relabel_configs:
|
||||||
|
- action: labelmap
|
||||||
|
regex: __meta_kubernetes_node_label_(.+)
|
||||||
|
- replacement: kubernetes.default.svc:443
|
||||||
|
target_label: __address__
|
||||||
|
- regex: (.+)
|
||||||
|
replacement: /api/v1/nodes/$1/proxy/metrics
|
||||||
|
source_labels:
|
||||||
|
- __meta_kubernetes_node_name
|
||||||
|
target_label: __metrics_path__
|
||||||
|
tls_config:
|
||||||
|
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
|
||||||
|
insecure_skip_verify: true
|
||||||
|
- job_name: kubernetes-nodes-cadvisor
|
||||||
|
scheme: https
|
||||||
|
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||||
|
kubernetes_sd_configs:
|
||||||
|
- role: node
|
||||||
|
relabel_configs:
|
||||||
|
- action: labelmap
|
||||||
|
regex: __meta_kubernetes_node_label_(.+)
|
||||||
|
- replacement: kubernetes.default.svc:443
|
||||||
|
target_label: __address__
|
||||||
|
- regex: (.+)
|
||||||
|
replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor
|
||||||
|
source_labels:
|
||||||
|
- __meta_kubernetes_node_name
|
||||||
|
target_label: __metrics_path__
|
||||||
|
tls_config:
|
||||||
|
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
|
||||||
|
insecure_skip_verify: true
|
||||||
|
metric_relabel_configs:
|
||||||
|
- source_labels: [__name__]
|
||||||
|
regex: 'container_tasks_state|container_memory_failures_total'
|
||||||
|
action: drop
|
||||||
|
- job_name: kubernetes-service-endpoints
|
||||||
|
honor_labels: true
|
||||||
|
kubernetes_sd_configs:
|
||||||
|
- role: endpoints
|
||||||
|
relabel_configs:
|
||||||
|
- action: keep
|
||||||
|
regex: true
|
||||||
|
source_labels:
|
||||||
|
- __meta_kubernetes_service_annotation_prometheus_io_scrape
|
||||||
|
- action: drop
|
||||||
|
regex: true
|
||||||
|
source_labels:
|
||||||
|
- __meta_kubernetes_service_annotation_prometheus_io_scrape_slow
|
||||||
|
- action: replace
|
||||||
|
regex: (https?)
|
||||||
|
source_labels:
|
||||||
|
- __meta_kubernetes_service_annotation_prometheus_io_scheme
|
||||||
|
target_label: __scheme__
|
||||||
|
- action: replace
|
||||||
|
regex: (.+)
|
||||||
|
source_labels:
|
||||||
|
- __meta_kubernetes_service_annotation_prometheus_io_path
|
||||||
|
target_label: __metrics_path__
|
||||||
|
- action: replace
|
||||||
|
regex: (.+?)(?::\d+)?;(\d+)
|
||||||
|
replacement: $1:$2
|
||||||
|
source_labels:
|
||||||
|
- __address__
|
||||||
|
- __meta_kubernetes_service_annotation_prometheus_io_port
|
||||||
|
target_label: __address__
|
||||||
|
- action: labelmap
|
||||||
|
regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+)
|
||||||
|
replacement: __param_$1
|
||||||
|
- action: labelmap
|
||||||
|
regex: __meta_kubernetes_service_label_(.+)
|
||||||
|
- action: replace
|
||||||
|
source_labels:
|
||||||
|
- __meta_kubernetes_namespace
|
||||||
|
target_label: namespace
|
||||||
|
- action: replace
|
||||||
|
source_labels:
|
||||||
|
- __meta_kubernetes_service_name
|
||||||
|
target_label: service
|
||||||
|
- action: replace
|
||||||
|
source_labels:
|
||||||
|
- __meta_kubernetes_pod_node_name
|
||||||
|
target_label: node
|
||||||
|
- job_name: kubernetes-service-endpoints-slow
|
||||||
|
honor_labels: true
|
||||||
|
scrape_interval: 5m
|
||||||
|
scrape_timeout: 30s
|
||||||
|
kubernetes_sd_configs:
|
||||||
|
- role: endpoints
|
||||||
|
relabel_configs:
|
||||||
|
- action: keep
|
||||||
|
regex: true
|
||||||
|
source_labels:
|
||||||
|
- __meta_kubernetes_service_annotation_prometheus_io_scrape_slow
|
||||||
|
- action: replace
|
||||||
|
regex: (https?)
|
||||||
|
source_labels:
|
||||||
|
- __meta_kubernetes_service_annotation_prometheus_io_scheme
|
||||||
|
target_label: __scheme__
|
||||||
|
- action: replace
|
||||||
|
regex: (.+)
|
||||||
|
source_labels:
|
||||||
|
- __meta_kubernetes_service_annotation_prometheus_io_path
|
||||||
|
target_label: __metrics_path__
|
||||||
|
- action: replace
|
||||||
|
regex: (.+?)(?::\d+)?;(\d+)
|
||||||
|
replacement: $1:$2
|
||||||
|
source_labels:
|
||||||
|
- __address__
|
||||||
|
- __meta_kubernetes_service_annotation_prometheus_io_port
|
||||||
|
target_label: __address__
|
||||||
|
- action: labelmap
|
||||||
|
regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+)
|
||||||
|
replacement: __param_$1
|
||||||
|
- action: labelmap
|
||||||
|
regex: __meta_kubernetes_service_label_(.+)
|
||||||
|
- action: replace
|
||||||
|
source_labels:
|
||||||
|
- __meta_kubernetes_namespace
|
||||||
|
target_label: namespace
|
||||||
|
- action: replace
|
||||||
|
source_labels:
|
||||||
|
- __meta_kubernetes_service_name
|
||||||
|
target_label: service
|
||||||
|
- action: replace
|
||||||
|
source_labels:
|
||||||
|
- __meta_kubernetes_pod_node_name
|
||||||
|
target_label: node
|
||||||
|
- job_name: prometheus-pushgateway
|
||||||
|
honor_labels: true
|
||||||
|
kubernetes_sd_configs:
|
||||||
|
- role: service
|
||||||
|
relabel_configs:
|
||||||
|
- action: keep
|
||||||
|
regex: pushgateway
|
||||||
|
source_labels:
|
||||||
|
- __meta_kubernetes_service_annotation_prometheus_io_probe
|
||||||
|
- job_name: kubernetes-services
|
||||||
|
honor_labels: true
|
||||||
|
metrics_path: /probe
|
||||||
|
params:
|
||||||
|
module:
|
||||||
|
- http_2xx
|
||||||
|
kubernetes_sd_configs:
|
||||||
|
- role: service
|
||||||
|
relabel_configs:
|
||||||
|
- action: keep
|
||||||
|
regex: true
|
||||||
|
source_labels:
|
||||||
|
- __meta_kubernetes_service_annotation_prometheus_io_probe
|
||||||
|
- source_labels:
|
||||||
|
- __address__
|
||||||
|
target_label: __param_target
|
||||||
|
- replacement: blackbox
|
||||||
|
target_label: __address__
|
||||||
|
- source_labels:
|
||||||
|
- __param_target
|
||||||
|
target_label: instance
|
||||||
|
- action: labelmap
|
||||||
|
regex: __meta_kubernetes_service_label_(.+)
|
||||||
|
- source_labels:
|
||||||
|
- __meta_kubernetes_namespace
|
||||||
|
target_label: namespace
|
||||||
|
- source_labels:
|
||||||
|
- __meta_kubernetes_service_name
|
||||||
|
target_label: service
|
||||||
|
- job_name: kubernetes-pods
|
||||||
|
honor_labels: true
|
||||||
|
kubernetes_sd_configs:
|
||||||
|
- role: pod
|
||||||
|
relabel_configs:
|
||||||
|
- action: keep
|
||||||
|
regex: true
|
||||||
|
source_labels:
|
||||||
|
- __meta_kubernetes_pod_annotation_prometheus_io_scrape
|
||||||
|
- action: drop
|
||||||
|
regex: true
|
||||||
|
source_labels:
|
||||||
|
- __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow
|
||||||
|
- action: replace
|
||||||
|
regex: (https?)
|
||||||
|
source_labels:
|
||||||
|
- __meta_kubernetes_pod_annotation_prometheus_io_scheme
|
||||||
|
target_label: __scheme__
|
||||||
|
- action: replace
|
||||||
|
regex: (.+)
|
||||||
|
source_labels:
|
||||||
|
- __meta_kubernetes_pod_annotation_prometheus_io_path
|
||||||
|
target_label: __metrics_path__
|
||||||
|
- action: replace
|
||||||
|
regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})
|
||||||
|
replacement: '[$2]:$1'
|
||||||
|
source_labels:
|
||||||
|
- __meta_kubernetes_pod_annotation_prometheus_io_port
|
||||||
|
- __meta_kubernetes_pod_ip
|
||||||
|
target_label: __address__
|
||||||
|
- action: replace
|
||||||
|
regex: (\d+);((([0-9]+?)(\.|$)){4})
|
||||||
|
replacement: $2:$1
|
||||||
|
source_labels:
|
||||||
|
- __meta_kubernetes_pod_annotation_prometheus_io_port
|
||||||
|
- __meta_kubernetes_pod_ip
|
||||||
|
target_label: __address__
|
||||||
|
- action: labelmap
|
||||||
|
regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+)
|
||||||
|
replacement: __param_$1
|
||||||
|
- action: labelmap
|
||||||
|
regex: __meta_kubernetes_pod_label_(.+)
|
||||||
|
- action: replace
|
||||||
|
source_labels:
|
||||||
|
- __meta_kubernetes_namespace
|
||||||
|
target_label: namespace
|
||||||
|
- action: replace
|
||||||
|
source_labels:
|
||||||
|
- __meta_kubernetes_pod_name
|
||||||
|
target_label: pod
|
||||||
|
- action: drop
|
||||||
|
regex: Pending|Succeeded|Failed|Completed
|
||||||
|
source_labels:
|
||||||
|
- __meta_kubernetes_pod_phase
|
||||||
|
- action: replace
|
||||||
|
source_labels:
|
||||||
|
- __meta_kubernetes_pod_node_name
|
||||||
|
target_label: node
|
||||||
|
- job_name: kubernetes-pods-slow
|
||||||
|
honor_labels: true
|
||||||
|
scrape_interval: 5m
|
||||||
|
scrape_timeout: 30s
|
||||||
|
kubernetes_sd_configs:
|
||||||
|
- role: pod
|
||||||
|
relabel_configs:
|
||||||
|
- action: keep
|
||||||
|
regex: true
|
||||||
|
source_labels:
|
||||||
|
- __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow
|
||||||
|
- action: replace
|
||||||
|
regex: (https?)
|
||||||
|
source_labels:
|
||||||
|
- __meta_kubernetes_pod_annotation_prometheus_io_scheme
|
||||||
|
target_label: __scheme__
|
||||||
|
- action: replace
|
||||||
|
regex: (.+)
|
||||||
|
source_labels:
|
||||||
|
- __meta_kubernetes_pod_annotation_prometheus_io_path
|
||||||
|
target_label: __metrics_path__
|
||||||
|
- action: replace
|
||||||
|
regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})
|
||||||
|
replacement: '[$2]:$1'
|
||||||
|
source_labels:
|
||||||
|
- __meta_kubernetes_pod_annotation_prometheus_io_port
|
||||||
|
- __meta_kubernetes_pod_ip
|
||||||
|
target_label: __address__
|
||||||
|
- action: replace
|
||||||
|
regex: (\d+);((([0-9]+?)(\.|$)){4})
|
||||||
|
replacement: $2:$1
|
||||||
|
source_labels:
|
||||||
|
- __meta_kubernetes_pod_annotation_prometheus_io_port
|
||||||
|
- __meta_kubernetes_pod_ip
|
||||||
|
target_label: __address__
|
||||||
|
- action: labelmap
|
||||||
|
regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+)
|
||||||
|
replacement: __param_$1
|
||||||
|
- action: labelmap
|
||||||
|
regex: __meta_kubernetes_pod_label_(.+)
|
||||||
|
- action: replace
|
||||||
|
source_labels:
|
||||||
|
- __meta_kubernetes_namespace
|
||||||
|
target_label: namespace
|
||||||
|
- action: replace
|
||||||
|
source_labels:
|
||||||
|
- __meta_kubernetes_pod_name
|
||||||
|
target_label: pod
|
||||||
|
- action: drop
|
||||||
|
regex: Pending|Succeeded|Failed|Completed
|
||||||
|
source_labels:
|
||||||
|
- __meta_kubernetes_pod_phase
|
||||||
|
- action: replace
|
||||||
|
source_labels:
|
||||||
|
- __meta_kubernetes_pod_node_name
|
||||||
|
target_label: node
|
||||||
# prometheus.yml:
|
# prometheus.yml:
|
||||||
# storage:
|
# storage:
|
||||||
# tsdb:
|
# tsdb:
|
||||||
|
|
@ -953,7 +1262,7 @@ serverFiles:
|
||||||
annotations:
|
annotations:
|
||||||
summary: "{{ $value | printf \"%.0f\" }} pods stuck in ContainerCreating on {{ $labels.node }}"
|
summary: "{{ $value | printf \"%.0f\" }} pods stuck in ContainerCreating on {{ $labels.node }}"
|
||||||
- alert: KubeletRuntimeOperationsLatency
|
- alert: KubeletRuntimeOperationsLatency
|
||||||
expr: histogram_quantile(0.99, sum by (instance, operation_type, le) (rate(kubelet_runtime_operations_duration_seconds_bucket[10m]))) > 30
|
expr: histogram_quantile(0.99, sum by (instance, operation_type, le) (rate(kubelet_runtime_operations_duration_seconds_bucket[10m]))) > 60
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
|
@ -1012,15 +1321,17 @@ serverFiles:
|
||||||
summary: "4xx rate on {{ $labels.service }}: {{ $value | printf \"%.1f\" }}% (threshold: 30%)"
|
summary: "4xx rate on {{ $labels.service }}: {{ $value | printf \"%.1f\" }}% (threshold: 30%)"
|
||||||
- alert: HighServiceLatency
|
- alert: HighServiceLatency
|
||||||
expr: |
|
expr: |
|
||||||
histogram_quantile(0.99,
|
(
|
||||||
sum(rate(traefik_service_request_duration_seconds_bucket{service!~".*idrac.*"}[5m])) by (service, le)
|
sum(rate(traefik_service_request_duration_seconds_sum{service!~".*idrac.*"}[5m])) by (service)
|
||||||
) > 30
|
/ sum(rate(traefik_service_request_duration_seconds_count{service!~".*idrac.*"}[5m])) by (service)
|
||||||
|
) > 10
|
||||||
|
and sum(rate(traefik_service_request_duration_seconds_count{service!~".*idrac.*"}[5m])) by (service) > 0.01
|
||||||
and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
|
and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "p99 latency on {{ $labels.service }}: {{ $value | printf \"%.1f\" }}s (threshold: 30s)"
|
summary: "Avg latency on {{ $labels.service }}: {{ $value | printf \"%.1f\" }}s (threshold: 10s)"
|
||||||
- alert: TLSCertExpiringSoon
|
- alert: TLSCertExpiringSoon
|
||||||
expr: (traefik_tls_certs_not_after - time()) / 86400 < 7
|
expr: (traefik_tls_certs_not_after - time()) / 86400 < 7
|
||||||
for: 1h
|
for: 1h
|
||||||
|
|
@ -1445,6 +1756,10 @@ extraScrapeConfigs: |
|
||||||
regex: metrics
|
regex: metrics
|
||||||
- source_labels: [__meta_kubernetes_pod_name]
|
- source_labels: [__meta_kubernetes_pod_name]
|
||||||
target_label: instance
|
target_label: instance
|
||||||
|
metric_relabel_configs:
|
||||||
|
- source_labels: [__name__]
|
||||||
|
regex: 'traefik_(router|service|entrypoint)_request_duration_seconds_bucket'
|
||||||
|
action: drop
|
||||||
- job_name: 'realestate-crawler-api'
|
- job_name: 'realestate-crawler-api'
|
||||||
kubernetes_sd_configs:
|
kubernetes_sd_configs:
|
||||||
- role: endpoints
|
- role: endpoints
|
||||||
|
|
@ -1481,4 +1796,8 @@ extraScrapeConfigs: |
|
||||||
- targets:
|
- targets:
|
||||||
- "goflow2.monitoring.svc.cluster.local:8080"
|
- "goflow2.monitoring.svc.cluster.local:8080"
|
||||||
metrics_path: '/metrics'
|
metrics_path: '/metrics'
|
||||||
|
metric_relabel_configs:
|
||||||
|
- source_labels: [__name__]
|
||||||
|
regex: 'goflow2_flow_process_nf_templates_total'
|
||||||
|
action: drop
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue