reduce Prometheus cardinality round 2: drop 137k more series

- fix traefik double-scrape: kubernetes-pods job was scraping traefik
  pods again (43k duplicate series). Added namespace drop rule.
- drop unused cadvisor metrics: container_fs_*, container_blkio_*,
  container_pressure_*, container_spec_*, plus assorted other
  container_* and machine_* metrics (30k series)
- drop more histogram buckets: apiserver watch_list, watch_cache,
  response_sizes, watch_events, admission_controller, plus workqueue
  queue/work durations (11k series)
- drop unused kube-state-metrics: replicaset_*, pod_tolerations,
  pod_labels, endpoint_*, service_*, configmap_*, etc (53k series)

Post-relabel samples: 332k → 142k (-57%)
Ingestion rate: 5,480 → 3,239 samples/sec (-41%)
This commit is contained in:
Viktor Barzin 2026-03-28 23:51:24 +02:00
parent aceea7db94
commit a9ca65bc31

View file

@ -304,7 +304,7 @@ serverFiles:
insecure_skip_verify: true
# Post-scrape filter: discards histogram bucket series whose metric name is
# one of the listed prefixes followed by `_bucket`. Prometheus anchors relabel
# regexes at both ends, so only the `_bucket` series match; the `_sum` and
# `_count` series of the same histograms are not dropped by this rule.
metric_relabel_configs:
- source_labels: [__name__]
# NOTE(review): this is a diff view, so two `regex` lines are shown. The first
# appears to be the pre-change value and the second its replacement; only one
# of them exists in the real file.
regex: '(apiserver_request_duration_seconds|apiserver_request_sli_duration_seconds|apiserver_request_body_size_bytes|etcd_request_duration_seconds)_bucket'
# Superset of the pattern above: keeps the same four prefixes and adds the
# apiserver watch_list / watch_cache / response_sizes / watch_events /
# admission_controller histograms and the two workqueue duration histograms.
regex: '(apiserver_request_duration_seconds|apiserver_request_sli_duration_seconds|apiserver_request_body_size_bytes|etcd_request_duration_seconds|apiserver_watch_list_duration_seconds|apiserver_watch_cache_read_wait_seconds|apiserver_response_sizes|apiserver_watch_events_sizes|apiserver_admission_controller_admission_duration_seconds|workqueue_queue_duration_seconds|workqueue_work_duration_seconds)_bucket'
action: drop
- job_name: kubernetes-nodes
scheme: https
@ -346,6 +346,9 @@ serverFiles:
# Drop rule: discards the two metrics named exactly `container_tasks_state`
# and `container_memory_failures_total`.
- source_labels: [__name__]
regex: 'container_tasks_state|container_memory_failures_total'
action: drop
# Drop rule: discards the metric families the commit message describes as
# unused cadvisor metrics. Entries ending in `.*` remove every metric with
# that prefix (container_fs_, container_blkio_, container_pressure_,
# container_spec_, machine_nvm_); the remaining entries are exact metric
# names. Besides container_* metrics, the list also covers several machine_*
# metrics.
- source_labels: [__name__]
regex: 'container_fs_.*|container_blkio_.*|container_pressure_.*|container_spec_.*|container_ulimits_soft|container_file_descriptors|container_threads|container_threads_max|container_sockets|container_processes|container_last_seen|machine_nvm_.*|machine_swap_bytes|machine_cpu_physical_cores|machine_cpu_sockets'
action: drop
- job_name: kubernetes-service-endpoints
honor_labels: true
kubernetes_sd_configs:
@ -393,6 +396,10 @@ serverFiles:
source_labels:
- __meta_kubernetes_pod_node_name
target_label: node
# Post-scrape filter: discards the kube_* metric families the commit message
# describes as unused kube-state-metrics series.
# NOTE(review): presumably this belongs to the kubernetes-service-endpoints
# job; the job header is outside this hunk, so confirm. As a job-level rule it
# applies to every target that job scrapes, not only kube-state-metrics.
metric_relabel_configs:
- source_labels: [__name__]
# Entries ending in `_.*` drop a whole metric family; bare names drop one
# metric. `kube_clusterrole.*` and `kube_role.*` have no underscore before the
# wildcard, so they also match names that merely start with those strings
# (for example kube_clusterrolebinding_* and kube_rolebinding_*).
regex: 'kube_replicaset_.*|kube_pod_tolerations|kube_pod_status_scheduled|kube_deployment_status_condition|kube_pod_labels|kube_pod_created|kube_pod_owner|kube_pod_container_info|kube_pod_init_container_.*|kube_endpoint_.*|kube_service_.*|kube_configmap_.*|kube_secret_.*|kube_lease_.*|kube_ingress_.*|kube_networkpolicy_.*|kube_certificatesigningrequest_.*|kube_limitrange_.*|kube_mutatingwebhookconfiguration_.*|kube_validatingwebhookconfiguration_.*|kube_verticalpodautoscaler_.*|kube_clusterrole.*|kube_role.*|kube_poddisruptionbudget_.*'
action: drop
- job_name: kubernetes-service-endpoints-slow
honor_labels: true
scrape_interval: 5m
@ -485,6 +492,10 @@ serverFiles:
regex: true
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_scrape
# Target-level drop rule: skips every discovered pod whose Kubernetes
# namespace is exactly `traefik`. Per the commit message, the kubernetes-pods
# job was scraping the traefik pods a second time and producing duplicate
# series. The match is on namespace alone, so any other pod deployed into the
# `traefik` namespace is excluded from this job as well.
- action: drop
regex: traefik
source_labels:
- __meta_kubernetes_namespace
- action: drop
regex: true
source_labels: