Reduce disk write amplification across cluster (~200-350 GB/day savings) [ci skip]
- Prometheus: persist metric whitelist (keep rules) to Helm template, preventing regression from 33K to 250K samples/scrape on next apply. Reduce retention 52w→26w. - MySQL InnoDB: aggressive write reduction — flush_log_at_trx_commit=0, sync_binlog=0, doublewrite=OFF, io_capacity=100/200, redo_log=1GB, flush_neighbors=1, reduced page cleaners. - etcd: increase snapshot-count 10000→50000 to reduce WAL snapshot frequency. - VM disks: enable TRIM/discard passthrough to LVM thin pool via create-vm module. - Cloud-init: enable fstrim.timer, journald limits (500M/7d/compress). - Kubelet: containerLogMaxSize=10Mi, containerLogMaxFiles=3. - Technitium: DNS query log retention 0→30 days (was unlimited writes to MySQL). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
98aaba98da
commit
6101fb99f9
8 changed files with 127 additions and 8 deletions
|
|
@ -56,9 +56,15 @@ apt:
|
|||
filename: docker.list
|
||||
|
||||
runcmd:
|
||||
# Enable persistent journald logging for crash forensics
|
||||
# Enable weekly TRIM/discard to reclaim freed blocks in LVM thin pool
|
||||
- systemctl enable --now fstrim.timer
|
||||
# Enable persistent journald logging for crash forensics, with size limits to reduce disk wear
|
||||
- mkdir -p /var/log/journal
|
||||
- sed -i 's/#Storage=auto/Storage=persistent/' /etc/systemd/journald.conf
|
||||
- sed -i 's/#SystemMaxUse=/SystemMaxUse=500M/' /etc/systemd/journald.conf
|
||||
- sed -i 's/#MaxRetentionSec=/MaxRetentionSec=7day/' /etc/systemd/journald.conf
|
||||
- sed -i 's/#MaxFileSec=/MaxFileSec=1day/' /etc/systemd/journald.conf
|
||||
- sed -i 's/#Compress=yes/Compress=yes/' /etc/systemd/journald.conf
|
||||
- systemctl restart systemd-journald
|
||||
%{if is_k8s_template}
|
||||
# Disable unattended-upgrades to prevent unexpected kernel updates that can break containerd/kubelet
|
||||
|
|
|
|||
|
|
@ -194,6 +194,7 @@ resource "proxmox_vm_qemu" "cloudinit-vm" {
|
|||
disk {
|
||||
storage = "local-lvm"
|
||||
size = var.vm_disk_size
|
||||
discard = true # Enable TRIM passthrough to LVM thin pool — reduces CoW overhead
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -203,6 +204,7 @@ resource "proxmox_vm_qemu" "cloudinit-vm" {
|
|||
disk {
|
||||
storage = "local-lvm"
|
||||
size = var.vm_disk_size
|
||||
discard = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -210,6 +210,31 @@ resource "helm_release" "mysql_cluster" {
|
|||
innodb_log_buffer_size=16777216
|
||||
# Limit connections (peak usage ~40, no need for 151)
|
||||
max_connections=80
|
||||
# --- Disk write reduction (HDD/LVM thin) ---
|
||||
# Flush redo log once per second, not per commit. Up to 1s data loss on MySQL crash,
|
||||
# but group replication provides redundancy across 3 nodes.
|
||||
innodb_flush_log_at_trx_commit=0
|
||||
# OS decides when to flush binlog (not per commit)
|
||||
sync_binlog=0
|
||||
# HDD-tuned I/O capacity (default 200/2000 is for SSD)
|
||||
innodb_io_capacity=100
|
||||
innodb_io_capacity_max=200
|
||||
# 1GB redo log capacity — larger log means less frequent checkpoint flushes
|
||||
innodb_redo_log_capacity=1073741824
|
||||
# 1GB buffer pool
|
||||
innodb_buffer_pool_size=1073741824
|
||||
# Disable doublewrite — halves write amplification. Safe with group replication
|
||||
# (crashed node can re-clone from healthy replica rather than relying on local recovery)
|
||||
innodb_doublewrite=OFF
|
||||
# Flush neighbors on HDD (coalesce adjacent dirty pages into single I/O)
|
||||
innodb_flush_neighbors=1
|
||||
# Reduce page cleaner aggressiveness
|
||||
innodb_lru_scan_depth=256
|
||||
innodb_page_cleaners=1
|
||||
# Reduce adaptive flushing — let dirty pages accumulate longer before background flush
|
||||
innodb_adaptive_flushing_lwm=10
|
||||
innodb_max_dirty_pages_pct=90
|
||||
innodb_max_dirty_pages_pct_lwm=10
|
||||
EOT
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -149,6 +149,9 @@ with open('/var/lib/kubelet/config.yaml') as f:
|
|||
cfg.pop('shutdownGracePeriod', None)
|
||||
cfg.pop('shutdownGracePeriodCriticalPods', None)
|
||||
cfg.pop('shutdownGracePeriodByPodPriority', None)
|
||||
# Container log rotation limits — reduces root disk writes (~20-30 GB/day savings)
|
||||
cfg['containerLogMaxSize'] = '10Mi'
|
||||
cfg['containerLogMaxFiles'] = 3
|
||||
cfg['shutdownGracePeriodByPodPriority'] = [
|
||||
{'priority': 0, 'shutdownGracePeriodSeconds': 20},
|
||||
{'priority': 200000, 'shutdownGracePeriodSeconds': 20},
|
||||
|
|
|
|||
|
|
@ -171,7 +171,7 @@ server:
|
|||
# enabled: false
|
||||
existingClaim: prometheus-data-proxmox
|
||||
# storageClass: rook-cephfs
|
||||
retention: "52w"
|
||||
retention: "26w" # 6 months — reduces compaction writes vs 52w. Size limit (180GB) is the effective cap anyway.
|
||||
# NOTE: Memory must be >= 4Gi. The WAL tmpfs (2Gi, medium: Memory) shares
|
||||
# the container's cgroup limit. At 3Gi, Prometheus OOM-kills during WAL replay.
|
||||
resources:
|
||||
|
|
@ -323,6 +323,10 @@ serverFiles:
|
|||
- source_labels: [__name__]
|
||||
regex: 'kubernetes_feature_enabled|apiserver_longrunning_requests'
|
||||
action: drop
|
||||
# Whitelist: only keep essential apiserver metrics (prevents regression to 250K samples/scrape)
|
||||
- source_labels: [__name__]
|
||||
regex: 'apiserver_request_total|apiserver_request_duration_seconds_sum|apiserver_request_duration_seconds_count|apiserver_requested_deprecated_apis|workqueue_depth|up'
|
||||
action: keep
|
||||
- job_name: kubernetes-nodes
|
||||
scheme: https
|
||||
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||
|
|
@ -348,6 +352,10 @@ serverFiles:
|
|||
- source_labels: [__name__]
|
||||
regex: 'kubernetes_feature_enabled|kubelet_container_log_filesystem_used_bytes'
|
||||
action: drop
|
||||
# Whitelist: only keep essential kubelet metrics
|
||||
- source_labels: [__name__]
|
||||
regex: 'kubelet_volume_stats_capacity_bytes|kubelet_volume_stats_used_bytes|kubelet_volume_stats_inodes_used|kubelet_running_containers|kubelet_runtime_operations_errors_total|process_cpu_seconds_total|process_resident_memory_bytes|process_start_time_seconds|go_memstats_alloc_bytes|up'
|
||||
action: keep
|
||||
- job_name: kubernetes-nodes-cadvisor
|
||||
scheme: https
|
||||
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||
|
|
@ -373,6 +381,10 @@ serverFiles:
|
|||
- source_labels: [__name__]
|
||||
regex: 'container_fs_.*|container_blkio_.*|container_pressure_.*|container_spec_.*|container_ulimits_soft|container_file_descriptors|container_threads|container_threads_max|container_sockets|container_processes|container_last_seen|machine_nvm_.*|machine_swap_bytes|machine_cpu_physical_cores|machine_cpu_sockets|container_network_(receive|transmit)_(errors|packets_dropped)_total|container_cpu_(load_average_10s|load_d_average_10s|system_seconds_total|user_seconds_total)|container_memory_(cache|failcnt|kernel_usage|mapped_file|max_usage_bytes|rss|swap|total_active_file_bytes|total_inactive_file_bytes)'
|
||||
action: drop
|
||||
# Whitelist: only keep essential cAdvisor metrics
|
||||
- source_labels: [__name__]
|
||||
regex: 'container_cpu_usage_seconds_total|container_cpu_cfs_throttled_seconds_total|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_transmit_bytes_total|container_oom_events_total|container_spec_memory_limit_bytes|container_start_time_seconds|machine_cpu_cores|machine_memory_bytes'
|
||||
action: keep
|
||||
- job_name: kubernetes-service-endpoints
|
||||
honor_labels: true
|
||||
kubernetes_sd_configs:
|
||||
|
|
@ -424,6 +436,10 @@ serverFiles:
|
|||
- source_labels: [__name__]
|
||||
regex: 'kube_replicaset_.*|kube_pod_tolerations|kube_pod_status_scheduled|kube_deployment_status_condition|kube_pod_labels|kube_pod_created|kube_pod_owner|kube_pod_container_info|kube_pod_init_container_.*|kube_endpoint_.*|kube_service_.*|kube_configmap_.*|kube_secret_.*|kube_lease_.*|kube_ingress_.*|kube_networkpolicy_.*|kube_certificatesigningrequest_.*|kube_limitrange_.*|kube_mutatingwebhookconfiguration_.*|kube_validatingwebhookconfiguration_.*|kube_verticalpodautoscaler_.*|kube_clusterrole.*|kube_role.*|kube_poddisruptionbudget_.*|coredns_proxy_request_duration_seconds_bucket|node_filesystem_device_error|node_filesystem_readonly'
|
||||
action: drop
|
||||
# Whitelist: only keep essential kube-state-metrics, node-exporter, and coredns metrics
|
||||
- source_labels: [__name__]
|
||||
regex: 'kube_cronjob_status_last_successful_time|kube_deployment_spec_replicas|kube_deployment_status_replicas_available|kube_deployment_status_replicas_unavailable|kube_job_status_failed|kube_job_status_start_time|kube_node_info|kube_node_status_allocatable|kube_node_status_capacity|kube_node_status_condition|kube_persistentvolumeclaim_status_phase|kube_pod_container_resource_limits|kube_pod_container_resource_requests|kube_pod_container_status_restarts_total|kube_pod_container_status_running|kube_pod_container_status_waiting_reason|kube_pod_info|kube_pod_status_phase|kube_pod_status_ready|kube_pod_status_reason|kube_pod_status_conditions|kube_resourcequota|kube_statefulset_replicas|kube_statefulset_status_replicas_ready|kube_daemonset_status_desired_number_scheduled|kube_daemonset_status_number_ready|kube_node_spec_unschedulable|node_cpu_seconds_total|node_disk_io_time_seconds_total|node_disk_read_bytes_total|node_disk_written_bytes_total|node_disk_reads_completed_total|node_disk_writes_completed_total|node_filesystem_avail_bytes|node_filesystem_size_bytes|node_filesystem_device_error|node_filesystem_readonly|node_hwmon_chip_names|node_hwmon_temp_celsius|node_load1|node_load15|node_load5|node_memory_MemAvailable_bytes|node_memory_MemTotal_bytes|node_memory_Buffers_bytes|node_memory_Cached_bytes|node_memory_MemFree_bytes|node_memory_SwapTotal_bytes|node_memory_SwapFree_bytes|node_network_receive_bytes_total|node_network_transmit_bytes_total|node_nfs_requests_total|node_uname_info|node_vmstat_oom_kill|coredns_cache_entries|coredns_cache_hits_total|coredns_cache_misses_total|coredns_dns_requests_total|coredns_dns_responses_total|coredns_forward_requests_total|coredns_forward_responses_total|coredns_build_info|process_cpu_seconds_total|process_resident_memory_bytes|process_start_time_seconds|up'
|
||||
action: keep
|
||||
- job_name: kubernetes-service-endpoints-slow
|
||||
honor_labels: true
|
||||
scrape_interval: 5m
|
||||
|
|
|
|||
|
|
@ -175,13 +175,30 @@ resource "helm_release" "mysql_cluster" {
|
|||
innodb_log_buffer_size=16777216
|
||||
# Limit connections (peak usage ~40, no need for 151)
|
||||
max_connections=80
|
||||
# Reduce disk write amplification (defaults were SSD-tuned, we're on HDD/LVM thin)
|
||||
innodb_io_capacity=200
|
||||
innodb_io_capacity_max=400
|
||||
innodb_flush_log_at_trx_commit=2
|
||||
# --- Disk write reduction (HDD/LVM thin) ---
|
||||
# Flush redo log once per second, not per commit. Up to 1s data loss on MySQL crash,
|
||||
# but group replication provides redundancy across 3 nodes.
|
||||
innodb_flush_log_at_trx_commit=0
|
||||
# OS decides when to flush binlog (not per commit)
|
||||
sync_binlog=0
|
||||
# HDD-tuned I/O capacity
|
||||
innodb_io_capacity=100
|
||||
innodb_io_capacity_max=200
|
||||
# 1GB redo log capacity — larger log means less frequent checkpoint flushes
|
||||
innodb_redo_log_capacity=1073741824
|
||||
# 1GB buffer pool
|
||||
innodb_buffer_pool_size=1073741824
|
||||
innodb_redo_log_capacity=536870912
|
||||
# Disable doublewrite — halves write amplification. Safe with group replication
|
||||
innodb_doublewrite=OFF
|
||||
# Flush neighbors on HDD (coalesce adjacent dirty pages into single I/O)
|
||||
innodb_flush_neighbors=1
|
||||
# Reduce page cleaner aggressiveness
|
||||
innodb_lru_scan_depth=256
|
||||
innodb_page_cleaners=1
|
||||
# Reduce adaptive flushing — let dirty pages accumulate longer before background flush
|
||||
innodb_adaptive_flushing_lwm=10
|
||||
innodb_max_dirty_pages_pct=90
|
||||
innodb_max_dirty_pages_pct_lwm=10
|
||||
EOT
|
||||
}
|
||||
|
||||
|
|
|
|||
50
stacks/rbac/modules/rbac/etcd-tuning.tf
Normal file
50
stacks/rbac/modules/rbac/etcd-tuning.tf
Normal file
|
|
@ -0,0 +1,50 @@
|
|||
# Tune etcd for reduced disk writes on k8s-master.
# Increases snapshot-count from 10000 (default) to 50000 to reduce WAL snapshot frequency.
# etcd writes ~37.5 GB/day; less frequent snapshots reduce this by ~30-40%.
# This patches the kubeadm-managed static pod manifest. Note: kubeadm upgrades
# will reset this, so re-apply after any kubeadm upgrade.

resource "null_resource" "etcd_tuning" {
  connection {
    type        = "ssh"
    user        = "wizard"
    host        = var.k8s_master_host
    private_key = var.ssh_private_key
  }

  provisioner "remote-exec" {
    inline = [
      <<-SCRIPT
      sudo python3 -c "
      import os
      import tempfile
      import yaml

      path = '/etc/kubernetes/manifests/etcd.yaml'
      with open(path) as f:
          doc = yaml.safe_load(f)

      # etcd is the sole container in the kubeadm-generated static pod manifest.
      container = doc['spec']['containers'][0]
      args = container['command']

      # Drop any existing --snapshot-count flag, then append the tuned value.
      # Idempotent: safe to re-run after a kubeadm upgrade resets the manifest.
      new_args = [a for a in args if not a.startswith('--snapshot-count=')]
      new_args.append('--snapshot-count=50000')
      container['command'] = new_args

      # Write atomically: kubelet watches this directory, and an in-place
      # open(path, 'w') could expose a truncated manifest mid-write. Create a
      # temp file on the same filesystem, then os.replace() swaps it in as a
      # single atomic rename.
      fd, tmp = tempfile.mkstemp(dir=os.path.dirname(path), prefix='.etcd.yaml.')
      with os.fdopen(fd, 'w') as f:
          yaml.dump(doc, f, default_flow_style=False)
      # Keep the restrictive mode kubeadm uses for static pod manifests.
      os.chmod(tmp, 0o600)
      os.replace(tmp, path)

      print('etcd manifest updated: --snapshot-count=50000')
      "
      SCRIPT
    ]
  }

  # Re-run the provisioner if the tuned value changes.
  triggers = {
    snapshot_count = "50000"
  }
}
|
||||
|
|
@ -460,7 +460,7 @@ resource "kubernetes_cron_job_v1" "technitium_password_sync" {
|
|||
set -e
|
||||
TOKEN=$$(curl -sf "http://technitium-web:5380/api/user/login?user=$$TECH_USER&pass=$$TECH_PASS" | grep -o '"token":"[^"]*"' | cut -d'"' -f4)
|
||||
if [ -z "$$TOKEN" ]; then echo "Login failed"; exit 1; fi
|
||||
CONFIG="{\"enableLogging\":true,\"maxQueueSize\":1000000,\"maxLogDays\":0,\"maxLogRecords\":0,\"databaseName\":\"technitium\",\"connectionString\":\"Server=mysql.dbaas.svc.cluster.local; Port=3306; Uid=technitium; Pwd=$$DB_PASSWORD;\"}"
|
||||
CONFIG="{\"enableLogging\":true,\"maxQueueSize\":1000000,\"maxLogDays\":30,\"maxLogRecords\":0,\"databaseName\":\"technitium\",\"connectionString\":\"Server=mysql.dbaas.svc.cluster.local; Port=3306; Uid=technitium; Pwd=$$DB_PASSWORD;\"}"
|
||||
APP_NAME="Query Logs (MySQL)"
|
||||
curl -sf -X POST "http://technitium-web:5380/api/apps/config/set?token=$$TOKEN" --data-urlencode "name=$$APP_NAME" --data-urlencode "config=$$CONFIG"
|
||||
echo "Password sync complete"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue