diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md index 496f30d7..bf51ef57 100755 --- a/.claude/CLAUDE.md +++ b/.claude/CLAUDE.md @@ -197,7 +197,7 @@ the workflow's built-in `GITHUB_TOKEN` (`packages: write`). **`postgresql_host`** in `config.tfvars` is `pg-cluster-rw.dbaas.svc.cluster.local` (the CNPG primary). The legacy `postgresql.dbaas` service is a live compatibility alias (selector `cnpg.io/instanceRole=primary`, so it also reaches the primary — authentik's PgBouncer still points at it) — but use `pg-cluster-rw` for anything new. This variable is shared by ~12 stacks. -**CNPG tuning** (in `stacks/dbaas/modules/dbaas/main.tf`): `shared_buffers=512MB`, `work_mem=16MB`, `wal_compression=on`, `effective_cache_size=1536MB`, pod memory 2Gi. +**CNPG tuning** (in `stacks/dbaas/modules/dbaas/main.tf`): `shared_buffers=1024MB`, `effective_cache_size=2560MB`, `work_mem=16MB`, `max_connections=200`, `wal_compression=on`, pod memory 3Gi. **Write-reduction (2026-06-29, code-oflt):** `checkpoint_timeout=15min`, `max_wal_size=4GB`, `min_wal_size=1GB` — checkpoints were 100% timer-driven at PostgreSQL's 5-min `checkpoint_timeout` default, each one bursting full-page writes onto the contended sdc HDD; all three settings are reloadable (SIGHUP, no restart). ## Networking & Resilience - **Critical path services scaled to 3**: Traefik, Authentik, CrowdSec LAPI, PgBouncer, Cloudflared. 
diff --git a/stacks/dbaas/modules/dbaas/main.tf b/stacks/dbaas/modules/dbaas/main.tf index d940f642..6f5aa79d 100644 --- a/stacks/dbaas/modules/dbaas/main.tf +++ b/stacks/dbaas/modules/dbaas/main.tf @@ -829,7 +829,7 @@ resource "kubernetes_deployment" "phpmyadmin" { metadata[0].annotations["keel.sh/trigger"], metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2 metadata[0].annotations["keel.sh/match-tag"], - spec[0].template[0].spec[0].container[0].image, # KEEL_IGNORE_IMAGE — Keel manages tag updates + spec[0].template[0].spec[0].container[0].image, # KEEL_IGNORE_IMAGE — Keel manages tag updates spec[0].template[0].metadata[0].annotations["keel.sh/update-time"], # KEEL_LIFECYCLE_V1 ] } @@ -1159,6 +1159,16 @@ resource "null_resource" "pg_cluster" { wal_compression: "on" random_page_cost: "4" checkpoint_completion_target: "0.9" + # Write-reduction (2026-06-29, code-oflt): checkpoints were 100% + # timer-driven at the 5-min PG default, each firing a full-page-write + # burst + flush onto the contended sdc HDD. Stretch the timer to 15min + # and raise max/min_wal_size so size-triggered checkpoints stay rare and + # WAL segments get recycled (not churned). All three are reloadable + # (sighup) -> CNPG applies them without a restart. Bounded recovery-time + # tradeoff; completion_target 0.9 still smears each checkpoint's IO. 
+ checkpoint_timeout: "15min" + max_wal_size: "4GB" + min_wal_size: "1GB" enableAlterSystem: true enableSuperuserAccess: true inheritedMetadata: @@ -1605,7 +1615,7 @@ resource "kubernetes_deployment" "pgadmin" { metadata[0].annotations["keel.sh/trigger"], metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2 metadata[0].annotations["keel.sh/match-tag"], - spec[0].template[0].spec[0].container[0].image, # KEEL_IGNORE_IMAGE — Keel manages tag updates + spec[0].template[0].spec[0].container[0].image, # KEEL_IGNORE_IMAGE — Keel manages tag updates spec[0].template[0].metadata[0].annotations["keel.sh/update-time"], # KEEL_LIFECYCLE_V1 ] }