diff --git a/stacks/authentik/modules/authentik/values.yaml b/stacks/authentik/modules/authentik/values.yaml index b51f6221..55624c35 100644 --- a/stacks/authentik/modules/authentik/values.yaml +++ b/stacks/authentik/modules/authentik/values.yaml @@ -25,9 +25,9 @@ server: resources: requests: cpu: 100m - memory: 1Gi + memory: 1.5Gi limits: - memory: 1Gi + memory: 1.5Gi topologySpreadConstraints: - maxSkew: 1 topologyKey: kubernetes.io/hostname @@ -58,9 +58,9 @@ worker: resources: requests: cpu: 100m - memory: 896Mi + memory: 1Gi limits: - memory: 896Mi + memory: 1Gi topologySpreadConstraints: - maxSkew: 1 topologyKey: kubernetes.io/hostname diff --git a/stacks/linkwarden/main.tf b/stacks/linkwarden/main.tf index c7741eb2..b836a9a9 100644 --- a/stacks/linkwarden/main.tf +++ b/stacks/linkwarden/main.tf @@ -68,6 +68,11 @@ resource "kubernetes_manifest" "db_external_secret" { target = { name = "linkwarden-db-creds" template = { + metadata = { + annotations = { + "reloader.stakater.com/match" = "true" + } + } data = { DATABASE_URL = "postgresql://linkwarden:{{ .password }}@${var.postgresql_host}:5432/linkwarden" DB_PASSWORD = "{{ .password }}" diff --git a/stacks/monitoring/modules/monitoring/dashboards/cluster_health.json b/stacks/monitoring/modules/monitoring/dashboards/cluster_health.json index 3d8dbdea..8d3e14b2 100644 --- a/stacks/monitoring/modules/monitoring/dashboards/cluster_health.json +++ b/stacks/monitoring/modules/monitoring/dashboards/cluster_health.json @@ -3134,7 +3134,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "histogram_quantile(0.99, sum(rate(traefik_entrypoint_request_duration_seconds_bucket[5m])) by (le))", + "expr": "sum(rate(traefik_entrypoint_request_duration_seconds_sum[5m])) / sum(rate(traefik_entrypoint_request_duration_seconds_count[5m]))", "legendFormat": "", "refId": "A" } diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl index 9f5b0f0d..4548e87d 100755 --- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl +++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl @@ -281,6 +281,315 @@ server: insecure_skip_verify: true serverFiles: + prometheus.yml: + scrape_configs: + - job_name: prometheus + static_configs: + - targets: + - localhost:9090 + - job_name: kubernetes-apiservers + scheme: https + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: default;kubernetes;https + source_labels: + - __meta_kubernetes_namespace + - __meta_kubernetes_service_name + - __meta_kubernetes_endpoint_port_name + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + metric_relabel_configs: + - source_labels: [__name__] + regex: '(apiserver_request_duration_seconds|apiserver_request_sli_duration_seconds|apiserver_request_body_size_bytes|etcd_request_duration_seconds)_bucket' + action: drop + - job_name: kubernetes-nodes + scheme: https + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - replacement: kubernetes.default.svc:443 + target_label: __address__ + - regex: (.+) + replacement: /api/v1/nodes/$1/proxy/metrics + source_labels: + - __meta_kubernetes_node_name + target_label: __metrics_path__ + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - job_name: kubernetes-nodes-cadvisor + scheme: https + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - replacement: kubernetes.default.svc:443 + target_label: __address__ + - regex: (.+) + replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor + source_labels: + - __meta_kubernetes_node_name + target_label: __metrics_path__ + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + metric_relabel_configs: + - source_labels: [__name__] + regex: 'container_tasks_state|container_memory_failures_total' + action: drop + - job_name: kubernetes-service-endpoints + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape + - action: drop + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (.+?)(?::\d+)?;(\d+) + replacement: $1:$2 + source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_service_name + target_label: service + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + - job_name: kubernetes-service-endpoints-slow + honor_labels: true + scrape_interval: 5m + scrape_timeout: 30s + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (.+?)(?::\d+)?;(\d+) + replacement: $1:$2 + source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_service_name + target_label: service + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + - job_name: prometheus-pushgateway + honor_labels: true + kubernetes_sd_configs: + - role: service + relabel_configs: + - action: keep + regex: pushgateway + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_probe + - job_name: kubernetes-services + honor_labels: true + metrics_path: /probe + params: + module: + - http_2xx + kubernetes_sd_configs: + - role: service + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_probe + - source_labels: + - __address__ + target_label: __param_target + - replacement: blackbox + target_label: __address__ + - source_labels: + - __param_target + target_label: instance + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - source_labels: + - __meta_kubernetes_service_name + target_label: service + - job_name: kubernetes-pods + honor_labels: true + kubernetes_sd_configs: + - role: pod + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape + - action: drop + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) + replacement: '[$2]:$1' + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: replace + regex: (\d+);((([0-9]+?)(\.|$)){4}) + replacement: $2:$1 + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - action: drop + regex: Pending|Succeeded|Failed|Completed + source_labels: + - __meta_kubernetes_pod_phase + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + - job_name: kubernetes-pods-slow + honor_labels: true + scrape_interval: 5m + scrape_timeout: 30s + kubernetes_sd_configs: + - role: pod + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) + replacement: '[$2]:$1' + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: replace + regex: (\d+);((([0-9]+?)(\.|$)){4}) + replacement: $2:$1 + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - action: drop + regex: Pending|Succeeded|Failed|Completed + source_labels: + - __meta_kubernetes_pod_phase + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node # prometheus.yml: # storage: # tsdb: @@ -953,7 +1262,7 @@ serverFiles: annotations: summary: "{{ $value | printf \"%.0f\" }} pods stuck in ContainerCreating on {{ $labels.node }}" - alert: KubeletRuntimeOperationsLatency - expr: histogram_quantile(0.99, sum by (instance, operation_type, le) (rate(kubelet_runtime_operations_duration_seconds_bucket[10m]))) > 30 + expr: histogram_quantile(0.99, sum by (instance, operation_type, le) (rate(kubelet_runtime_operations_duration_seconds_bucket[10m]))) > 60 for: 10m labels: severity: warning @@ -1012,15 +1321,17 @@ serverFiles: summary: "4xx rate on {{ $labels.service }}: {{ $value | printf \"%.1f\" }}% (threshold: 30%)" - alert: HighServiceLatency expr: | - histogram_quantile(0.99, - sum(rate(traefik_service_request_duration_seconds_bucket{service!~".*idrac.*"}[5m])) by (service, le) - ) > 30 + ( + sum(rate(traefik_service_request_duration_seconds_sum{service!~".*idrac.*"}[5m])) by (service) + / sum(rate(traefik_service_request_duration_seconds_count{service!~".*idrac.*"}[5m])) by (service) + ) > 10 + and sum(rate(traefik_service_request_duration_seconds_count{service!~".*idrac.*"}[5m])) by (service) > 0.01 and on() (time() - process_start_time_seconds{job="prometheus"}) > 900 for: 5m labels: severity: warning annotations: - summary: "p99 latency on {{ $labels.service }}: {{ $value | printf \"%.1f\" }}s (threshold: 30s)" + summary: "Avg latency on {{ $labels.service }}: {{ $value | printf \"%.1f\" }}s (threshold: 10s)" - alert: TLSCertExpiringSoon expr: (traefik_tls_certs_not_after - time()) / 86400 < 7 for: 1h @@ -1445,6 +1756,10 @@ extraScrapeConfigs: | regex: metrics - source_labels: [__meta_kubernetes_pod_name] target_label: instance + metric_relabel_configs: + - source_labels: [__name__] + regex: 'traefik_(router|service|entrypoint)_request_duration_seconds_bucket' + action: drop - job_name: 'realestate-crawler-api' kubernetes_sd_configs: - role: endpoints @@ -1481,4 +1796,8 @@ extraScrapeConfigs: | - targets: - "goflow2.monitoring.svc.cluster.local:8080" metrics_path: '/metrics' + metric_relabel_configs: + - source_labels: [__name__] + regex: 'goflow2_flow_process_nf_templates_total' + action: drop