#cloud-config
# cloud-init user-data for the "terraform-vm" image, written as a Terraform
# template. Values and directives filled in at render time:
#   authorized_ssh_key               - public key installed for user "wizard"
#   passwd                           - password value for user "wizard"
#                                      (presumably a crypt hash - TODO confirm)
#   is_k8s_template                  - when true, adds the Kubernetes packages,
#                                      apt source and node-preparation commands
#   containerd_config_update_command - command run after the default containerd
#                                      config has been written
#   k8s_join_command                 - command run just before kubelet is enabled
#   provision_cmds                   - extra commands appended to runcmd
# NOTE: template sequences are expanded even inside comments, so comments in
# this file must never contain interpolation or directive syntax.

hostname: terraform-vm

users:
  - name: wizard
    # Passwordless sudo for every command.
    sudo: "ALL=(ALL) NOPASSWD:ALL"
    ssh_authorized_keys:
      - ${authorized_ssh_key}
    passwd: ${passwd}
    lock_passwd: false  # enable passwd login
    shell: /bin/bash

package_update: true
package_upgrade: true

packages:
  - htop
  - vim
  - curl
  - jq
  - tcpdump
  - tree
  - tmux
  - wget
  - net-tools
  - zsh
  - apt-transport-https
  - ca-certificates
  - gpg
  - isc-dhcp-client
  - cloud-guest-utils  # to enable resizing of disk via growpart
  - qemu-guest-agent
  - nginx
  # docker
  - docker-ce
  - docker-ce-cli
  - containerd.io
  - docker-buildx-plugin
  - docker-compose-plugin
%{if is_k8s_template}
  # kubernetes
  - kubeadm
  - kubelet
  # iSCSI client for CSI-backed database storage
  - open-iscsi
%{endif}

# Third-party apt repositories; each entry is identified by its signing key
# fingerprint (keyid) and written to its own list file.
apt:
  sources:
%{if is_k8s_template}
    kubernetes:
      source: "deb https://pkgs.k8s.io/core:/stable:/v1.32/deb/ /"
      keyid: "DE15B14486CD377B9E876E1A234654DA9A296436"
      filename: kubernetes.list
%{endif}
    docker:
      source: "deb https://download.docker.com/linux/ubuntu noble stable"
      keyid: "9DC858229FC7DD38854AE2D88D81803C0EBFCD88"
      filename: docker.list

runcmd:
  # Enable weekly TRIM/discard to reclaim freed blocks in LVM thin pool
  - systemctl enable --now fstrim.timer
  # Enable persistent journald logging for crash forensics, with size limits to reduce disk wear.
  # Each sed only rewrites the stock commented-out default, so a line that has
  # already been customised is left untouched.
  - mkdir -p /var/log/journal
  - sed -i 's/#Storage=auto/Storage=persistent/' /etc/systemd/journald.conf
  - sed -i 's/#SystemMaxUse=/SystemMaxUse=500M/' /etc/systemd/journald.conf
  - sed -i 's/#MaxRetentionSec=/MaxRetentionSec=7day/' /etc/systemd/journald.conf
  - sed -i 's/#MaxFileSec=/MaxFileSec=1day/' /etc/systemd/journald.conf
  - sed -i 's/#Compress=yes/Compress=yes/' /etc/systemd/journald.conf
  - systemctl restart systemd-journald
%{if is_k8s_template}
  # Re-enabled 2026-05-10: unattended-upgrades is back on, but with a tight
  # Allowed-Origins list, a Package-Blacklist for k8s/containerd/runc/calico,
  # and Automatic-Reboot disabled (kured + sentinel-gate handles reboots in a
  # 24h-soaked rolling window, gated by Prometheus alerts).
  # Original outage (March 2026) was kernel update → containerd overlayfs corruption.
  # Mitigations: 24h cool-down between node reboots, Prometheus halt-on-alert,
  # apt-mark hold on k8s components, Package-Blacklist for runtime components.
  - apt-get install -y unattended-upgrades update-notifier-common
  # The heredoc delimiter is quoted, so the shell writes the body verbatim.
  # The distro_id/distro_codename placeholders are escaped for the template
  # engine so that they reach apt unexpanded. The regex end anchors use a
  # single dollar sign: a doubled one is only an escape when it precedes an
  # opening brace and would otherwise be emitted twice.
  - |
    cat > /etc/apt/apt.conf.d/52unattended-upgrades-k8s <<'EOF'
    Unattended-Upgrade::Allowed-Origins {
        "$${distro_id}:$${distro_codename}";
        "$${distro_id}:$${distro_codename}-security";
        "$${distro_id}:$${distro_codename}-updates";
        "$${distro_id}ESMApps:$${distro_codename}-apps-security";
        "$${distro_id}ESM:$${distro_codename}-infra-security";
    };
    Unattended-Upgrade::Package-Blacklist {
        "^containerd(\.io)?$";
        "^runc$";
        "^cri-tools$";
        "^kubernetes-cni$";
        "^calico-.*";
        "^cni-plugins-.*";
        "^docker-ce$";
    };
    Unattended-Upgrade::DevRelease "false";
    Unattended-Upgrade::Automatic-Reboot "false";
    EOF
  # Turn on the daily package-list refresh and the unattended-upgrade run.
  - |
    cat > /etc/apt/apt.conf.d/20auto-upgrades <<'EOF'
    APT::Periodic::Update-Package-Lists "1";
    APT::Periodic::Unattended-Upgrade "1";
    EOF
  # Unmask is best-effort: errors are silenced and ignored.
  - systemctl unmask unattended-upgrades 2>/dev/null || true
  - systemctl enable --now unattended-upgrades
  # Pin the Kubernetes and container-runtime packages at their installed
  # versions. The second hold is best-effort because not every listed package
  # name is necessarily installed.
  - apt-mark hold kubelet kubeadm kubectl
  - apt-mark hold containerd containerd.io runc 2>/dev/null || true
  # Stop kubelet while the container runtime is reconfigured; it is enabled
  # and started again at the end of this section.
  - systemctl stop kubelet
  # Write containerd's built-in default config, then apply the caller-supplied
  # adjustment before restarting the runtime.
  - containerd config default | sudo tee /etc/containerd/config.toml
  - ${containerd_config_update_command}
  - systemctl restart containerd
  - systemctl enable --now iscsid
  # Harden iSCSI: increase recovery timeout (300s vs 120s default) and enable
  # CRC32C data/header digests to detect bit flips over the network.
  # Prevents SQLite corruption from transient iSCSI session drops.
  - sed -i 's/^node.session.timeo.replacement_timeout = .*/node.session.timeo.replacement_timeout = 300/' /etc/iscsi/iscsid.conf
  - sed -i 's/^node.conn\[0\].timeo.noop_out_interval = .*/node.conn[0].timeo.noop_out_interval = 10/' /etc/iscsi/iscsid.conf
  - sed -i 's/^node.conn\[0\].timeo.noop_out_timeout = .*/node.conn[0].timeo.noop_out_timeout = 15/' /etc/iscsi/iscsid.conf
  # Append the digest settings only when no active HeaderDigest line exists,
  # so the settings are never added twice.
  - |
    if ! grep -q '^node.conn\[0\].iscsi.HeaderDigest' /etc/iscsi/iscsid.conf; then
      echo 'node.conn[0].iscsi.HeaderDigest = CRC32C,None' >> /etc/iscsi/iscsid.conf
      echo 'node.conn[0].iscsi.DataDigest = CRC32C,None' >> /etc/iscsi/iscsid.conf
    fi
  - systemctl restart iscsid
  # Create /sentinel directory for kured reboot gating (sentinel gate DaemonSet)
  - mkdir -p /sentinel
  # Create 4Gi swap file for worker node memory pressure relief (NOT for master — etcd is latency-critical)
  # NOTE(review): swap is switched on before kubelet starts; confirm that the
  # kubelet configuration delivered by the join step permits running with swap
  # (failSwapOn) - that configuration is not visible in this template.
  - fallocate -l 4G /swapfile
  - chmod 600 /swapfile
  - mkswap /swapfile
  - swapon /swapfile
  # Persist the swap file and the swappiness setting across reboots.
  - echo '/swapfile none swap sw 0 0' >> /etc/fstab
  - sysctl -w vm.swappiness=10
  - echo 'vm.swappiness=10' >> /etc/sysctl.d/99-swap.conf
  - ${k8s_join_command}
  - systemctl enable kubelet
  - systemctl start kubelet
%{ endif }
  # Caller-supplied provisioning commands, one list item per entry. The loop
  # directives sit at the list-item column on purpose: the trailing strip
  # marker swallows the newline and indentation that follow it, so every
  # rendered item starts at the column of the directive itself.
  %{ for provision_cmd in provision_cmds ~}
  - ${provision_cmd}
  %{ endfor ~}