From 6fb2c1c7ba0607fa22fa5f08456150a6a0b56282 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Mon, 11 May 2026 19:20:54 +0000 Subject: [PATCH] dbaas+monitoring: bump PG max_connections to 200, add scrape + alerts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cluster grew past the 100-conn default — steady-state idle was 90/100, leaving zero headroom for terragrunt applies or transient surges. The ceiling was being discovered by Terraform crashing (pq: "remaining connection slots are reserved for roles with the SUPERUSER attribute"), not by alerting, because we had no PG scrape config at all. dbaas (Tier 0): * max_connections: 100 → 200 * shared_buffers: 512MB → 1GB (Postgres suggests ~25% of memory as a starting point; 1GB is ~33% of the new 3Gi pod) * effective_cache_size: 1536MB → 2560MB (scaled with pod memory) * pod memory: 2Gi → 3Gi (rough rule of thumb: enough for shared_buffers + ~16MB work_mem * concurrent sorts + OS cache + overhead) * Triggers bump on null_resource.pg_cluster forces CNPG to re-apply, which rolls the cluster (standby first, then primary failover). monitoring: * New scrape job 'cnpg' on dbaas namespace pods labeled cnpg.io/podRole=instance, port name=metrics (9187). Relabels add cnpg_cluster + cnpg_role labels for alert grouping. * PGConnectionsHigh (warning, >85% for 10m) — heads-up before exhaustion. * PGConnectionsCritical (critical, >95% for 3m) — last call before refusing connections. Verified: cnpg targets up, sum(cnpg_backends_total)=84, max_connections metric=200, alert ratio 0.42 → both alerts inactive. 
--- stacks/dbaas/modules/dbaas/main.tf | 19 +++++++--- .../monitoring/prometheus_chart_values.tpl | 37 +++++++++++++++++++ 2 files changed, 50 insertions(+), 6 deletions(-) diff --git a/stacks/dbaas/modules/dbaas/main.tf b/stacks/dbaas/modules/dbaas/main.tf index f4537634..bc173513 100644 --- a/stacks/dbaas/modules/dbaas/main.tf +++ b/stacks/dbaas/modules/dbaas/main.tf @@ -1054,8 +1054,8 @@ resource "null_resource" "pg_cluster" { image = "ghcr.io/cloudnative-pg/postgis:16" storage_size = "20Gi" storage_class = "proxmox-lvm-encrypted" - memory_limit = "2Gi" - pg_params = "v2-shared512-walcomp-workmem16" + memory_limit = "3Gi" + pg_params = "v3-shared1024-walcomp-workmem16-max200" } provisioner "local-exec" { @@ -1072,8 +1072,15 @@ resource "null_resource" "pg_cluster" { postgresql: parameters: search_path: '"$user", public' - shared_buffers: "512MB" - effective_cache_size: "1536MB" + # Cluster grew past the 100-conn default ceiling (~90/100 idle + # steady-state in May 2026; authentik+matrix alone hold ~55). + # Bumped to 200 with shared_buffers/effective_cache_size/memory + # scaled proportionally. work_mem stays at 16MB — it is a per + # sort/hash-op limit, not a per-connection reservation, so idle + # connections use none (a busy query may use several x 16MB). 
+ max_connections: "200" + shared_buffers: "1024MB" + effective_cache_size: "2560MB" work_mem: "16MB" wal_compression: "on" random_page_cost: "4" @@ -1093,9 +1100,9 @@ resource "null_resource" "pg_cluster" { resources: requests: cpu: "50m" - memory: "2Gi" + memory: "3Gi" limits: - memory: "2Gi" + memory: "3Gi" EOF EOT } diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl index 305dfd5c..be600107 100755 --- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl +++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl @@ -1632,6 +1632,23 @@ serverFiles: severity: warning annotations: summary: "CNPG operator down — PostgreSQL failover/management degraded" + - alert: PGConnectionsHigh + # Per-cluster connection utilisation. Sums per-database backend + # counts on each pod, then takes max across pods (primary holds + # the real workload; replicas only have streaming_replica conns). + expr: (max by (cnpg_cluster) (sum by (cnpg_cluster, instance) (cnpg_backends_total))) / (max by (cnpg_cluster) (cnpg_pg_settings_setting{name="max_connections"})) > 0.85 + for: 10m + labels: + severity: warning + annotations: + summary: "PostgreSQL {{ $labels.cnpg_cluster }}: connection utilisation {{ $value | humanizePercentage }} — approaching max_connections ceiling" + - alert: PGConnectionsCritical + expr: (max by (cnpg_cluster) (sum by (cnpg_cluster, instance) (cnpg_backends_total))) / (max by (cnpg_cluster) (cnpg_pg_settings_setting{name="max_connections"})) > 0.95 + for: 3m + labels: + severity: critical + annotations: + summary: "PostgreSQL {{ $labels.cnpg_cluster }}: connection utilisation {{ $value | humanizePercentage }} — new client connections will be refused soon. Bump max_connections or reap idle backends." 
- name: Cluster rules: - alert: NodeDown @@ -2602,6 +2619,26 @@ extraScrapeConfigs: | action: keep regex: '.*-envoy-prom' + - job_name: 'cnpg' + # Scrapes the CNPG built-in postgres exporter (port 9187, named "metrics") + # on every cluster instance pod. Adds cnpg_cluster + cnpg_role labels so + # alerts (PGConnectionsHigh/Critical) can group by cluster. + kubernetes_sd_configs: + - role: pod + namespaces: + names: + - dbaas + relabel_configs: + - source_labels: [__meta_kubernetes_pod_label_cnpg_io_podRole, __meta_kubernetes_pod_container_port_name] + action: keep + regex: 'instance;metrics' + - source_labels: [__meta_kubernetes_pod_label_cnpg_io_cluster] + target_label: cnpg_cluster + - source_labels: [__meta_kubernetes_pod_label_cnpg_io_instanceRole] + target_label: cnpg_role + - source_labels: [__meta_kubernetes_pod_name] + target_label: pod + - job_name: 'crowdsec' static_configs: - targets: