fix alerts and reduce Prometheus disk write rate
- linkwarden: add Reloader match annotation to DB secret so pods auto-restart on Vault credential rotation (was causing 100% 5xx) - authentik: increase memory limits (server 1Gi→1.5Gi, worker 896Mi→1Gi) to prevent OOM kills - prometheus: drop 113k high-cardinality series to reduce HDD write rate from ~8.8 to ~6.0 MB/s (31% reduction): - drop all traefik/apiserver/etcd histogram bucket metrics - drop goflow2_flow_process_nf_templates_total (9.3k series) - drop container_tasks_state and container_memory_failures_total - rewrite HighServiceLatency alert to use avg latency (_sum/_count) - update cluster_health dashboard to match - raise KubeletRuntimeOperationsLatency threshold from 30s to 60s
This commit is contained in:
parent
7267e53e2f
commit
8a5a53a832
4 changed files with 334 additions and 10 deletions
|
|
@ -25,9 +25,9 @@ server:
|
|||
resources:
|
||||
requests:
|
||||
cpu: 100m
|
||||
memory: 1Gi
|
||||
memory: 1.5Gi
|
||||
limits:
|
||||
memory: 1Gi
|
||||
memory: 1.5Gi
|
||||
topologySpreadConstraints:
|
||||
- maxSkew: 1
|
||||
topologyKey: kubernetes.io/hostname
|
||||
|
|
@ -58,9 +58,9 @@ worker:
|
|||
resources:
|
||||
requests:
|
||||
cpu: 100m
|
||||
memory: 896Mi
|
||||
memory: 1Gi
|
||||
limits:
|
||||
memory: 896Mi
|
||||
memory: 1Gi
|
||||
topologySpreadConstraints:
|
||||
- maxSkew: 1
|
||||
topologyKey: kubernetes.io/hostname
|
||||
|
|
|
|||
|
|
@ -68,6 +68,11 @@ resource "kubernetes_manifest" "db_external_secret" {
|
|||
target = {
|
||||
name = "linkwarden-db-creds"
|
||||
template = {
|
||||
metadata = {
|
||||
annotations = {
|
||||
"reloader.stakater.com/match" = "true"
|
||||
}
|
||||
}
|
||||
data = {
|
||||
DATABASE_URL = "postgresql://linkwarden:{{ .password }}@${var.postgresql_host}:5432/linkwarden"
|
||||
DB_PASSWORD = "{{ .password }}"
|
||||
|
|
|
|||
|
|
@ -3134,7 +3134,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "histogram_quantile(0.99, sum(rate(traefik_entrypoint_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"expr": "sum(rate(traefik_entrypoint_request_duration_seconds_sum[5m])) / sum(rate(traefik_entrypoint_request_duration_seconds_count[5m]))",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
|
|
|
|||
|
|
@ -281,6 +281,315 @@ server:
|
|||
insecure_skip_verify: true
|
||||
|
||||
serverFiles:
|
||||
prometheus.yml:
|
||||
scrape_configs:
|
||||
- job_name: prometheus
|
||||
static_configs:
|
||||
- targets:
|
||||
- localhost:9090
|
||||
- job_name: kubernetes-apiservers
|
||||
scheme: https
|
||||
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||
kubernetes_sd_configs:
|
||||
- role: endpoints
|
||||
relabel_configs:
|
||||
- action: keep
|
||||
regex: default;kubernetes;https
|
||||
source_labels:
|
||||
- __meta_kubernetes_namespace
|
||||
- __meta_kubernetes_service_name
|
||||
- __meta_kubernetes_endpoint_port_name
|
||||
tls_config:
|
||||
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
|
||||
insecure_skip_verify: true
|
||||
metric_relabel_configs:
|
||||
- source_labels: [__name__]
|
||||
regex: '(apiserver_request_duration_seconds|apiserver_request_sli_duration_seconds|apiserver_request_body_size_bytes|etcd_request_duration_seconds)_bucket'
|
||||
action: drop
|
||||
- job_name: kubernetes-nodes
|
||||
scheme: https
|
||||
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||
kubernetes_sd_configs:
|
||||
- role: node
|
||||
relabel_configs:
|
||||
- action: labelmap
|
||||
regex: __meta_kubernetes_node_label_(.+)
|
||||
- replacement: kubernetes.default.svc:443
|
||||
target_label: __address__
|
||||
- regex: (.+)
|
||||
replacement: /api/v1/nodes/$1/proxy/metrics
|
||||
source_labels:
|
||||
- __meta_kubernetes_node_name
|
||||
target_label: __metrics_path__
|
||||
tls_config:
|
||||
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
|
||||
insecure_skip_verify: true
|
||||
- job_name: kubernetes-nodes-cadvisor
|
||||
scheme: https
|
||||
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||
kubernetes_sd_configs:
|
||||
- role: node
|
||||
relabel_configs:
|
||||
- action: labelmap
|
||||
regex: __meta_kubernetes_node_label_(.+)
|
||||
- replacement: kubernetes.default.svc:443
|
||||
target_label: __address__
|
||||
- regex: (.+)
|
||||
replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor
|
||||
source_labels:
|
||||
- __meta_kubernetes_node_name
|
||||
target_label: __metrics_path__
|
||||
tls_config:
|
||||
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
|
||||
insecure_skip_verify: true
|
||||
metric_relabel_configs:
|
||||
- source_labels: [__name__]
|
||||
regex: 'container_tasks_state|container_memory_failures_total'
|
||||
action: drop
|
||||
- job_name: kubernetes-service-endpoints
|
||||
honor_labels: true
|
||||
kubernetes_sd_configs:
|
||||
- role: endpoints
|
||||
relabel_configs:
|
||||
- action: keep
|
||||
regex: true
|
||||
source_labels:
|
||||
- __meta_kubernetes_service_annotation_prometheus_io_scrape
|
||||
- action: drop
|
||||
regex: true
|
||||
source_labels:
|
||||
- __meta_kubernetes_service_annotation_prometheus_io_scrape_slow
|
||||
- action: replace
|
||||
regex: (https?)
|
||||
source_labels:
|
||||
- __meta_kubernetes_service_annotation_prometheus_io_scheme
|
||||
target_label: __scheme__
|
||||
- action: replace
|
||||
regex: (.+)
|
||||
source_labels:
|
||||
- __meta_kubernetes_service_annotation_prometheus_io_path
|
||||
target_label: __metrics_path__
|
||||
- action: replace
|
||||
regex: (.+?)(?::\d+)?;(\d+)
|
||||
replacement: $1:$2
|
||||
source_labels:
|
||||
- __address__
|
||||
- __meta_kubernetes_service_annotation_prometheus_io_port
|
||||
target_label: __address__
|
||||
- action: labelmap
|
||||
regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+)
|
||||
replacement: __param_$1
|
||||
- action: labelmap
|
||||
regex: __meta_kubernetes_service_label_(.+)
|
||||
- action: replace
|
||||
source_labels:
|
||||
- __meta_kubernetes_namespace
|
||||
target_label: namespace
|
||||
- action: replace
|
||||
source_labels:
|
||||
- __meta_kubernetes_service_name
|
||||
target_label: service
|
||||
- action: replace
|
||||
source_labels:
|
||||
- __meta_kubernetes_pod_node_name
|
||||
target_label: node
|
||||
- job_name: kubernetes-service-endpoints-slow
|
||||
honor_labels: true
|
||||
scrape_interval: 5m
|
||||
scrape_timeout: 30s
|
||||
kubernetes_sd_configs:
|
||||
- role: endpoints
|
||||
relabel_configs:
|
||||
- action: keep
|
||||
regex: true
|
||||
source_labels:
|
||||
- __meta_kubernetes_service_annotation_prometheus_io_scrape_slow
|
||||
- action: replace
|
||||
regex: (https?)
|
||||
source_labels:
|
||||
- __meta_kubernetes_service_annotation_prometheus_io_scheme
|
||||
target_label: __scheme__
|
||||
- action: replace
|
||||
regex: (.+)
|
||||
source_labels:
|
||||
- __meta_kubernetes_service_annotation_prometheus_io_path
|
||||
target_label: __metrics_path__
|
||||
- action: replace
|
||||
regex: (.+?)(?::\d+)?;(\d+)
|
||||
replacement: $1:$2
|
||||
source_labels:
|
||||
- __address__
|
||||
- __meta_kubernetes_service_annotation_prometheus_io_port
|
||||
target_label: __address__
|
||||
- action: labelmap
|
||||
regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+)
|
||||
replacement: __param_$1
|
||||
- action: labelmap
|
||||
regex: __meta_kubernetes_service_label_(.+)
|
||||
- action: replace
|
||||
source_labels:
|
||||
- __meta_kubernetes_namespace
|
||||
target_label: namespace
|
||||
- action: replace
|
||||
source_labels:
|
||||
- __meta_kubernetes_service_name
|
||||
target_label: service
|
||||
- action: replace
|
||||
source_labels:
|
||||
- __meta_kubernetes_pod_node_name
|
||||
target_label: node
|
||||
- job_name: prometheus-pushgateway
|
||||
honor_labels: true
|
||||
kubernetes_sd_configs:
|
||||
- role: service
|
||||
relabel_configs:
|
||||
- action: keep
|
||||
regex: pushgateway
|
||||
source_labels:
|
||||
- __meta_kubernetes_service_annotation_prometheus_io_probe
|
||||
- job_name: kubernetes-services
|
||||
honor_labels: true
|
||||
metrics_path: /probe
|
||||
params:
|
||||
module:
|
||||
- http_2xx
|
||||
kubernetes_sd_configs:
|
||||
- role: service
|
||||
relabel_configs:
|
||||
- action: keep
|
||||
regex: true
|
||||
source_labels:
|
||||
- __meta_kubernetes_service_annotation_prometheus_io_probe
|
||||
- source_labels:
|
||||
- __address__
|
||||
target_label: __param_target
|
||||
- replacement: blackbox
|
||||
target_label: __address__
|
||||
- source_labels:
|
||||
- __param_target
|
||||
target_label: instance
|
||||
- action: labelmap
|
||||
regex: __meta_kubernetes_service_label_(.+)
|
||||
- source_labels:
|
||||
- __meta_kubernetes_namespace
|
||||
target_label: namespace
|
||||
- source_labels:
|
||||
- __meta_kubernetes_service_name
|
||||
target_label: service
|
||||
- job_name: kubernetes-pods
|
||||
honor_labels: true
|
||||
kubernetes_sd_configs:
|
||||
- role: pod
|
||||
relabel_configs:
|
||||
- action: keep
|
||||
regex: true
|
||||
source_labels:
|
||||
- __meta_kubernetes_pod_annotation_prometheus_io_scrape
|
||||
- action: drop
|
||||
regex: true
|
||||
source_labels:
|
||||
- __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow
|
||||
- action: replace
|
||||
regex: (https?)
|
||||
source_labels:
|
||||
- __meta_kubernetes_pod_annotation_prometheus_io_scheme
|
||||
target_label: __scheme__
|
||||
- action: replace
|
||||
regex: (.+)
|
||||
source_labels:
|
||||
- __meta_kubernetes_pod_annotation_prometheus_io_path
|
||||
target_label: __metrics_path__
|
||||
- action: replace
|
||||
regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})
|
||||
replacement: '[$2]:$1'
|
||||
source_labels:
|
||||
- __meta_kubernetes_pod_annotation_prometheus_io_port
|
||||
- __meta_kubernetes_pod_ip
|
||||
target_label: __address__
|
||||
- action: replace
|
||||
regex: (\d+);((([0-9]+?)(\.|$)){4})
|
||||
replacement: $2:$1
|
||||
source_labels:
|
||||
- __meta_kubernetes_pod_annotation_prometheus_io_port
|
||||
- __meta_kubernetes_pod_ip
|
||||
target_label: __address__
|
||||
- action: labelmap
|
||||
regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+)
|
||||
replacement: __param_$1
|
||||
- action: labelmap
|
||||
regex: __meta_kubernetes_pod_label_(.+)
|
||||
- action: replace
|
||||
source_labels:
|
||||
- __meta_kubernetes_namespace
|
||||
target_label: namespace
|
||||
- action: replace
|
||||
source_labels:
|
||||
- __meta_kubernetes_pod_name
|
||||
target_label: pod
|
||||
- action: drop
|
||||
regex: Pending|Succeeded|Failed|Completed
|
||||
source_labels:
|
||||
- __meta_kubernetes_pod_phase
|
||||
- action: replace
|
||||
source_labels:
|
||||
- __meta_kubernetes_pod_node_name
|
||||
target_label: node
|
||||
- job_name: kubernetes-pods-slow
|
||||
honor_labels: true
|
||||
scrape_interval: 5m
|
||||
scrape_timeout: 30s
|
||||
kubernetes_sd_configs:
|
||||
- role: pod
|
||||
relabel_configs:
|
||||
- action: keep
|
||||
regex: true
|
||||
source_labels:
|
||||
- __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow
|
||||
- action: replace
|
||||
regex: (https?)
|
||||
source_labels:
|
||||
- __meta_kubernetes_pod_annotation_prometheus_io_scheme
|
||||
target_label: __scheme__
|
||||
- action: replace
|
||||
regex: (.+)
|
||||
source_labels:
|
||||
- __meta_kubernetes_pod_annotation_prometheus_io_path
|
||||
target_label: __metrics_path__
|
||||
- action: replace
|
||||
regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})
|
||||
replacement: '[$2]:$1'
|
||||
source_labels:
|
||||
- __meta_kubernetes_pod_annotation_prometheus_io_port
|
||||
- __meta_kubernetes_pod_ip
|
||||
target_label: __address__
|
||||
- action: replace
|
||||
regex: (\d+);((([0-9]+?)(\.|$)){4})
|
||||
replacement: $2:$1
|
||||
source_labels:
|
||||
- __meta_kubernetes_pod_annotation_prometheus_io_port
|
||||
- __meta_kubernetes_pod_ip
|
||||
target_label: __address__
|
||||
- action: labelmap
|
||||
regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+)
|
||||
replacement: __param_$1
|
||||
- action: labelmap
|
||||
regex: __meta_kubernetes_pod_label_(.+)
|
||||
- action: replace
|
||||
source_labels:
|
||||
- __meta_kubernetes_namespace
|
||||
target_label: namespace
|
||||
- action: replace
|
||||
source_labels:
|
||||
- __meta_kubernetes_pod_name
|
||||
target_label: pod
|
||||
- action: drop
|
||||
regex: Pending|Succeeded|Failed|Completed
|
||||
source_labels:
|
||||
- __meta_kubernetes_pod_phase
|
||||
- action: replace
|
||||
source_labels:
|
||||
- __meta_kubernetes_pod_node_name
|
||||
target_label: node
|
||||
# prometheus.yml:
|
||||
# storage:
|
||||
# tsdb:
|
||||
|
|
@ -953,7 +1262,7 @@ serverFiles:
|
|||
annotations:
|
||||
summary: "{{ $value | printf \"%.0f\" }} pods stuck in ContainerCreating on {{ $labels.node }}"
|
||||
- alert: KubeletRuntimeOperationsLatency
|
||||
expr: histogram_quantile(0.99, sum by (instance, operation_type, le) (rate(kubelet_runtime_operations_duration_seconds_bucket[10m]))) > 30
|
||||
expr: histogram_quantile(0.99, sum by (instance, operation_type, le) (rate(kubelet_runtime_operations_duration_seconds_bucket[10m]))) > 60
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -1012,15 +1321,17 @@ serverFiles:
|
|||
summary: "4xx rate on {{ $labels.service }}: {{ $value | printf \"%.1f\" }}% (threshold: 30%)"
|
||||
- alert: HighServiceLatency
|
||||
expr: |
|
||||
histogram_quantile(0.99,
|
||||
sum(rate(traefik_service_request_duration_seconds_bucket{service!~".*idrac.*"}[5m])) by (service, le)
|
||||
) > 30
|
||||
(
|
||||
sum(rate(traefik_service_request_duration_seconds_sum{service!~".*idrac.*"}[5m])) by (service)
|
||||
/ sum(rate(traefik_service_request_duration_seconds_count{service!~".*idrac.*"}[5m])) by (service)
|
||||
) > 10
|
||||
and sum(rate(traefik_service_request_duration_seconds_count{service!~".*idrac.*"}[5m])) by (service) > 0.01
|
||||
and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "p99 latency on {{ $labels.service }}: {{ $value | printf \"%.1f\" }}s (threshold: 30s)"
|
||||
summary: "Avg latency on {{ $labels.service }}: {{ $value | printf \"%.1f\" }}s (threshold: 10s)"
|
||||
- alert: TLSCertExpiringSoon
|
||||
expr: (traefik_tls_certs_not_after - time()) / 86400 < 7
|
||||
for: 1h
|
||||
|
|
@ -1445,6 +1756,10 @@ extraScrapeConfigs: |
|
|||
regex: metrics
|
||||
- source_labels: [__meta_kubernetes_pod_name]
|
||||
target_label: instance
|
||||
metric_relabel_configs:
|
||||
- source_labels: [__name__]
|
||||
regex: 'traefik_(router|service|entrypoint)_request_duration_seconds_bucket'
|
||||
action: drop
|
||||
- job_name: 'realestate-crawler-api'
|
||||
kubernetes_sd_configs:
|
||||
- role: endpoints
|
||||
|
|
@ -1481,4 +1796,8 @@ extraScrapeConfigs: |
|
|||
- targets:
|
||||
- "goflow2.monitoring.svc.cluster.local:8080"
|
||||
metrics_path: '/metrics'
|
||||
metric_relabel_configs:
|
||||
- source_labels: [__name__]
|
||||
regex: 'goflow2_flow_process_nf_templates_total'
|
||||
action: drop
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue