Reduce disk write amplification across cluster (~200-350 GB/day savings) [ci skip]

- Prometheus: persist metric whitelist (keep rules) to Helm template, preventing regression from 33K to 250K samples/scrape on next apply. Reduce retention 52w→26w.
- MySQL InnoDB: aggressive write reduction — flush_log_at_trx_commit=0, sync_binlog=0, doublewrite=OFF, io_capacity=100/200, redo_log=1GB, flush_neighbors=1, reduced page cleaners.
- etcd: increase snapshot-count 10000→50000 to reduce WAL snapshot frequency.
- VM disks: enable TRIM/discard passthrough to LVM thin pool via create-vm module.
- Cloud-init: enable fstrim.timer, journald limits (500M/7d/compress).
- Kubelet: containerLogMaxSize=10Mi, containerLogMaxFiles=3.
- Technitium: DNS query log retention 0→30 days (was unlimited writes to MySQL).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 19:01:21 +00:00 · 2026-04-09 19:01:21 +00:00 · 6101fb99f9
commit 6101fb99f9
parent 98aaba98da
8 changed files with 127 additions and 8 deletions
--- a/stacks/platform/modules/dbaas/main.tf
+++ b/stacks/platform/modules/dbaas/main.tf
@@ -175,13 +175,30 @@ resource "helm_release" "mysql_cluster" {
        innodb_log_buffer_size=16777216
        # Limit connections (peak usage ~40, no need for 151)
        max_connections=80
-        # Reduce disk write amplification (defaults were SSD-tuned, we're on HDD/LVM thin)
-        innodb_io_capacity=200
-        innodb_io_capacity_max=400
-        innodb_flush_log_at_trx_commit=2
+        # --- Disk write reduction (HDD/LVM thin) ---
+        # Flush redo log once per second, not per commit. Up to 1s data loss on MySQL crash,
+        # but group replication provides redundancy across 3 nodes.
+        innodb_flush_log_at_trx_commit=0
+        # OS decides when to flush binlog (not per commit)
        sync_binlog=0
+        # HDD-tuned I/O capacity
+        innodb_io_capacity=100
+        innodb_io_capacity_max=200
+        # 1GB redo log capacity — larger log means less frequent checkpoint flushes
+        innodb_redo_log_capacity=1073741824
+        # 1GB buffer pool
        innodb_buffer_pool_size=1073741824
-        innodb_redo_log_capacity=536870912
+        # Disable doublewrite — halves page write volume but drops torn-page protection; rebuild a damaged node from the replication group
+        innodb_doublewrite=OFF
+        # Flush neighbors on HDD (coalesce adjacent dirty pages into single I/O)
+        innodb_flush_neighbors=1
+        # Reduce page cleaner aggressiveness
+        innodb_lru_scan_depth=256
+        innodb_page_cleaners=1
+        # Reduce adaptive flushing — let dirty pages accumulate longer before background flush
+        innodb_adaptive_flushing_lwm=10
+        innodb_max_dirty_pages_pct=90
+        innodb_max_dirty_pages_pct_lwm=10
      EOT
    }