#cloud-config
# cloud-init user-data, rendered as a Terraform template before first boot.
#
# Template inputs referenced in this file:
#   authorized_ssh_key                - SSH public key authorized for user "wizard"
#   passwd                            - password value for user "wizard"
#                                       (presumably a hash; confirm against caller)
#   is_k8s_template                   - when true, adds Kubernetes, iSCSI and swap setup
#   containerd_config_update_command  - command run right after the default
#                                       containerd config has been written
#   k8s_join_command                  - command run just before kubelet is enabled
#   provision_cmds                    - extra commands appended to runcmd
#
# NOTE: template directives are indented to the level of the items they wrap.
# The trailing strip marker on the "for" directive removes the newline AND the
# indentation that follow it, so the directive itself has to supply the indent.

hostname: terraform-vm

users:
  - name: wizard
    sudo: 'ALL=(ALL) NOPASSWD:ALL'
    ssh_authorized_keys:
      - ${authorized_ssh_key}
    passwd: ${passwd}
    lock_passwd: false  # enable passwd login
    shell: /bin/bash

package_update: true
package_upgrade: true

packages:
  - htop
  - vim
  - curl
  - jq
  - tcpdump
  - tree
  - tmux
  - wget
  - net-tools
  - zsh
  - apt-transport-https
  - ca-certificates
  - gpg
  - isc-dhcp-client
  - cloud-guest-utils  # to enable resizing of disk via growpart
  - qemu-guest-agent
  - nginx
  # docker
  - docker-ce
  - docker-ce-cli
  - containerd.io
  - docker-buildx-plugin
  - docker-compose-plugin
  %{if is_k8s_template}
  # kubernetes
  - kubeadm
  - kubelet
  # kubectl is pinned by "apt-mark hold" in runcmd, so install it explicitly
  - kubectl
  # iSCSI client for CSI-backed database storage
  - open-iscsi
  %{endif}

apt:
  sources:
    %{if is_k8s_template}
    kubernetes:
      source: "deb https://pkgs.k8s.io/core:/stable:/v1.32/deb/ /"
      keyid: "DE15B14486CD377B9E876E1A234654DA9A296436"
      filename: kubernetes.list
    %{endif}
    docker:
      source: "deb https://download.docker.com/linux/ubuntu noble stable"
      keyid: "9DC858229FC7DD38854AE2D88D81803C0EBFCD88"
      filename: docker.list

runcmd:
  # Enable weekly TRIM/discard to reclaim freed blocks in LVM thin pool
  - systemctl enable --now fstrim.timer
  # Enable persistent journald logging for crash forensics, with size limits
  # to reduce disk wear.
  # Each substitution replaces the WHOLE line (commented or not). Replacing
  # only the "#Key=" prefix would keep the shipped default after it, e.g.
  # "#MaxFileSec=1month" would turn into "MaxFileSec=1day1month".
  - mkdir -p /var/log/journal
  - sed -i -E 's/^#?Storage=.*/Storage=persistent/' /etc/systemd/journald.conf
  - sed -i -E 's/^#?SystemMaxUse=.*/SystemMaxUse=500M/' /etc/systemd/journald.conf
  - sed -i -E 's/^#?MaxRetentionSec=.*/MaxRetentionSec=7day/' /etc/systemd/journald.conf
  - sed -i -E 's/^#?MaxFileSec=.*/MaxFileSec=1day/' /etc/systemd/journald.conf
  - sed -i -E 's/^#?Compress=.*/Compress=yes/' /etc/systemd/journald.conf
  - systemctl restart systemd-journald
  %{if is_k8s_template}
  # Disable unattended-upgrades to prevent unexpected kernel updates that can break containerd/kubelet
  # (Root cause of 26h cluster outage: unattended-upgrades → kernel update → containerd failure)
  - systemctl disable --now unattended-upgrades || true
  - apt-get remove -y unattended-upgrades || true
  # Pin the Kubernetes packages so routine apt upgrades cannot move them
  - apt-mark hold kubelet kubeadm kubectl
  # Keep kubelet stopped while the container runtime is being reconfigured
  - systemctl stop kubelet
  # Write the stock containerd config, then apply the caller-supplied tweak.
  # runcmd already runs as root, so a plain redirect is enough (no sudo/tee,
  # and the config is not echoed into the cloud-init output log).
  - containerd config default > /etc/containerd/config.toml
  - ${containerd_config_update_command}
  - systemctl restart containerd
  - systemctl enable --now iscsid
  # Harden iSCSI: increase recovery timeout (300s vs 120s default) and enable
  # CRC32C data/header digests to detect bit flips over the network.
  # Prevents SQLite corruption from transient iSCSI session drops.
  - sed -i 's/^node.session.timeo.replacement_timeout = .*/node.session.timeo.replacement_timeout = 300/' /etc/iscsi/iscsid.conf
  - sed -i 's/^node.conn\[0\].timeo.noop_out_interval = .*/node.conn[0].timeo.noop_out_interval = 10/' /etc/iscsi/iscsid.conf
  - sed -i 's/^node.conn\[0\].timeo.noop_out_timeout = .*/node.conn[0].timeo.noop_out_timeout = 15/' /etc/iscsi/iscsid.conf
  # Append the digest settings only when no active HeaderDigest line exists
  - |
    if ! grep -q '^node.conn\[0\].iscsi.HeaderDigest' /etc/iscsi/iscsid.conf; then
      echo 'node.conn[0].iscsi.HeaderDigest = CRC32C,None' >> /etc/iscsi/iscsid.conf
      echo 'node.conn[0].iscsi.DataDigest = CRC32C,None' >> /etc/iscsi/iscsid.conf
    fi
  - systemctl restart iscsid
  # Create /sentinel directory for kured reboot gating (sentinel gate DaemonSet)
  - mkdir -p /sentinel
  # Create 4Gi swap file for worker node memory pressure relief (NOT for master — etcd is latency-critical)
  # NOTE(review): these steps run for every node rendered with is_k8s_template;
  # nothing here distinguishes workers from masters — confirm that
  # control-plane nodes are not provisioned from this template.
  - fallocate -l 4G /swapfile
  - chmod 600 /swapfile
  - mkswap /swapfile
  - swapon /swapfile
  - echo '/swapfile none swap sw 0 0' >> /etc/fstab
  - sysctl -w vm.swappiness=10
  - echo 'vm.swappiness=10' >> /etc/sysctl.d/99-swap.conf
  # Join the cluster, then hand kubelet back to systemd
  - ${k8s_join_command}
  - systemctl enable kubelet
  - systemctl start kubelet
  %{ endif }
  %{ for provision_cmd in provision_cmds ~}
  - ${provision_cmd}
  %{ endfor ~}