dbaas+monitoring: bump PG max_connections to 200, add scrape + alerts
Cluster grew past the 100-conn default — steady-state idle was 90/100,
leaving almost no headroom for terragrunt applies or transient surges. The
ceiling was being discovered by Terraform crashing (pq: "remaining
connection slots are reserved for roles with the SUPERUSER attribute"),
not by alerting, because we had no PG scrape config at all.
dbaas (Tier 0):
* max_connections: 100 → 200
* shared_buffers: 512MB → 1GB (Postgres suggests ~25% of system memory as a starting point; 1GB is ~33% of the new 3Gi pod)
* effective_cache_size: 1536MB → 2560MB (scaled with pod memory)
* pod memory: 2Gi → 3Gi (rough rule of thumb: enough for shared_buffers
+ ~16MB work_mem * concurrent sorts + OS cache + overhead)
* Bumping the triggers on null_resource.pg_cluster forces CNPG to re-apply,
which rolls the cluster (standby first, then primary failover).
monitoring:
* New scrape job 'cnpg' on dbaas namespace pods labeled
cnpg.io/podRole=instance, port name=metrics (9187). Relabels add
cnpg_cluster + cnpg_role labels for alert grouping.
* PGConnectionsHigh (warning, >85% for 10m) — heads-up before exhaustion.
* PGConnectionsCritical (critical, >95% for 3m) — last call before
refusing connections.
Verified: cnpg targets up, sum(cnpg_backends_total)=84, max_connections
metric=200, alert ratio 0.42 → both alerts inactive.
This commit is contained in:
parent
9e5a5fb0c7
commit
20774f794d
2 changed files with 50 additions and 6 deletions
|
|
@ -1054,8 +1054,8 @@ resource "null_resource" "pg_cluster" {
|
||||||
image = "ghcr.io/cloudnative-pg/postgis:16"
|
image = "ghcr.io/cloudnative-pg/postgis:16"
|
||||||
storage_size = "20Gi"
|
storage_size = "20Gi"
|
||||||
storage_class = "proxmox-lvm-encrypted"
|
storage_class = "proxmox-lvm-encrypted"
|
||||||
memory_limit = "2Gi"
|
memory_limit = "3Gi"
|
||||||
pg_params = "v2-shared512-walcomp-workmem16"
|
pg_params = "v3-shared1024-walcomp-workmem16-max200"
|
||||||
}
|
}
|
||||||
|
|
||||||
provisioner "local-exec" {
|
provisioner "local-exec" {
|
||||||
|
|
@ -1072,8 +1072,15 @@ resource "null_resource" "pg_cluster" {
|
||||||
postgresql:
|
postgresql:
|
||||||
parameters:
|
parameters:
|
||||||
search_path: '"$user", public'
|
search_path: '"$user", public'
|
||||||
shared_buffers: "512MB"
|
# Cluster grew past the 100-conn default ceiling (~90/100 idle
|
||||||
effective_cache_size: "1536MB"
|
# steady-state in May 2026; authentik+matrix alone hold ~55).
|
||||||
|
# Bumped to 200 with shared_buffers/effective_cache_size/memory
|
||||||
|
# scaled proportionally. work_mem stays at 16MB — that's per
|
||||||
|
# sort/hash op, not per connection, so 16MB * 200 isn't the
|
||||||
|
# worst case.
|
||||||
|
max_connections: "200"
|
||||||
|
shared_buffers: "1024MB"
|
||||||
|
effective_cache_size: "2560MB"
|
||||||
work_mem: "16MB"
|
work_mem: "16MB"
|
||||||
wal_compression: "on"
|
wal_compression: "on"
|
||||||
random_page_cost: "4"
|
random_page_cost: "4"
|
||||||
|
|
@ -1093,9 +1100,9 @@ resource "null_resource" "pg_cluster" {
|
||||||
resources:
|
resources:
|
||||||
requests:
|
requests:
|
||||||
cpu: "50m"
|
cpu: "50m"
|
||||||
memory: "2Gi"
|
memory: "3Gi"
|
||||||
limits:
|
limits:
|
||||||
memory: "2Gi"
|
memory: "3Gi"
|
||||||
EOF
|
EOF
|
||||||
EOT
|
EOT
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1632,6 +1632,23 @@ serverFiles:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "CNPG operator down — PostgreSQL failover/management degraded"
|
summary: "CNPG operator down — PostgreSQL failover/management degraded"
|
||||||
|
- alert: PGConnectionsHigh
|
||||||
|
# Per-cluster connection utilisation. Sums per-database backend
|
||||||
|
# counts on each pod, then takes max across pods (primary holds
|
||||||
|
# the real workload; replicas only have streaming_replica conns).
|
||||||
|
expr: (max by (cnpg_cluster) (sum by (cnpg_cluster, instance) (cnpg_backends_total))) / (max by (cnpg_cluster) (cnpg_pg_settings_setting{name="max_connections"})) > 0.85
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "PostgreSQL {{ $labels.cnpg_cluster }}: connection utilisation {{ $value | humanizePercentage }} — approaching max_connections ceiling"
|
||||||
|
- alert: PGConnectionsCritical
|
||||||
|
expr: (max by (cnpg_cluster) (sum by (cnpg_cluster, instance) (cnpg_backends_total))) / (max by (cnpg_cluster) (cnpg_pg_settings_setting{name="max_connections"})) > 0.95
|
||||||
|
for: 3m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: "PostgreSQL {{ $labels.cnpg_cluster }}: connection utilisation {{ $value | humanizePercentage }} — new client connections will be refused soon. Bump max_connections or reap idle backends."
|
||||||
- name: Cluster
|
- name: Cluster
|
||||||
rules:
|
rules:
|
||||||
- alert: NodeDown
|
- alert: NodeDown
|
||||||
|
|
@ -2602,6 +2619,26 @@ extraScrapeConfigs: |
|
||||||
action: keep
|
action: keep
|
||||||
regex: '.*-envoy-prom'
|
regex: '.*-envoy-prom'
|
||||||
|
|
||||||
|
- job_name: 'cnpg'
|
||||||
|
# Scrapes the CNPG built-in postgres exporter (port 9187, named "metrics")
|
||||||
|
# on every cluster instance pod. Adds cnpg_cluster + cnpg_role labels so
|
||||||
|
# alerts (PGConnectionsHigh/Critical) can group by cluster.
|
||||||
|
kubernetes_sd_configs:
|
||||||
|
- role: pod
|
||||||
|
namespaces:
|
||||||
|
names:
|
||||||
|
- dbaas
|
||||||
|
relabel_configs:
|
||||||
|
- source_labels: [__meta_kubernetes_pod_label_cnpg_io_podRole, __meta_kubernetes_pod_container_port_name]
|
||||||
|
action: keep
|
||||||
|
regex: 'instance;metrics'
|
||||||
|
- source_labels: [__meta_kubernetes_pod_label_cnpg_io_cluster]
|
||||||
|
target_label: cnpg_cluster
|
||||||
|
- source_labels: [__meta_kubernetes_pod_label_cnpg_io_instanceRole]
|
||||||
|
target_label: cnpg_role
|
||||||
|
- source_labels: [__meta_kubernetes_pod_name]
|
||||||
|
target_label: pod
|
||||||
|
|
||||||
- job_name: 'crowdsec'
|
- job_name: 'crowdsec'
|
||||||
static_configs:
|
static_configs:
|
||||||
- targets:
|
- targets:
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue