infra/scripts/provision-k8s-worker

#!/usr/bin/env bash
# provision-k8s-worker NAME VMID IP[/CIDR]
#
# Clone PVE template 2000 (ubuntu-2404-cloudinit-k8s-template) into a new
# VM, configure resources to match k8s-node3/4 (32G RAM, 8 vCPU, host CPU,
# 256G disk, VLAN 20 on vmbr1), attach the shared cicustom snippet
# (/var/lib/vz/snippets/k8s_cloud_init.yaml), and start it. Cloud-init
# inside the VM installs containerd + kubelet, applies the bundled
# setup script, and runs the kubeadm join. No manual steps after this.
#
# Hostname is supplied by a per-node meta-data snippet
# (/var/lib/vz/snippets/NAME-meta.yaml) that this script writes and
# attaches via cicustom meta=… — DO NOT hard-code it in the shared
# user-data snippet.
#
# Non-destructive: aborts if VMID already exists or IP already answers ping.
#
# Usage:
#   ssh root@192.168.1.127 bash -s -- k8s-node6 206 10.0.20.106 < provision-k8s-worker
# or, if the script lives on the PVE host:
#   provision-k8s-worker k8s-node6 206 10.0.20.106
#
# Run on the PVE host (needs qm + /var/lib/vz/snippets access).
# Strict mode: stop on the first failing command, on use of an unset
# variable, and on a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Exactly three positional arguments are required; anything else prints
# the usage text on stderr and exits with status 2.
if (( $# != 3 )); then
    {
        echo "usage: $0 NAME VMID IP"
        echo "  e.g. $0 k8s-node6 206 10.0.20.106"
    } >&2
    exit 2
fi

# with_default_prefix ADDR[/PREFIX] DEFAULT_PREFIX
# Prints the address in ADDR/PREFIX form: unchanged when the argument
# already carries a prefix length, otherwise with /DEFAULT_PREFIX appended.
with_default_prefix() {
    case $1 in
        */*) printf '%s\n' "$1" ;;
        *)   printf '%s/%s\n' "$1" "$2" ;;
    esac
}

# Positional arguments: VM name, Proxmox VMID, and the node address.
NAME=$1
VMID=$2
# The address may be given bare or as ADDR/PREFIX. IP always holds the
# bare address (it is what gets pinged); CIDR_IP always holds ADDR/PREFIX
# (it is what goes into ipconfig0). A bare address gets the /22 default.
IP=${3%%/*}
CIDR_IP=$(with_default_prefix "$3" 22)

# Fixed network settings handed to cloud-init through qm.
GW="10.0.20.1"
DNS="10.0.20.201"
SEARCH="viktorbarzin.lan"

# Source template, target storage, and the shared user-data snippet.
TEMPLATE_ID=2000
STORAGE="local-lvm"
USER_SNIPPET="local:snippets/k8s_cloud_init.yaml"
# Per-node meta-data snippet — written below — supplies local-hostname.
# Proxmox's auto-generated metadata DOESN'T include hostname when
# cicustom user=… is set, so the shared user-data snippet alone leaves
# nodes joining as "ubuntu" (image default). Per-node meta-data is the
# clean fix.
# META_SNIPPET_FILE is the filesystem path; META_SNIPPET is the same file
# in the storage:path form that cicustom takes.
META_SNIPPET_FILE="/var/lib/vz/snippets/${NAME}-meta.yaml"
META_SNIPPET="local:snippets/${NAME}-meta.yaml"

# NIC placement: bridge and VLAN tag used for net0.
BRIDGE="vmbr1"
VLAN=20

# Pre-flight checks. Each one prints an error on stderr and exits 1, so
# nothing on the host is created or modified when a check fails.

# Sanity: VMID must be free. A successful `qm status` is taken to mean
# the VM already exists.
if qm status "$VMID" >/dev/null 2>&1; then
    echo "ERROR: VM $VMID already exists. Refusing to clobber." >&2
    qm status "$VMID" >&2
    exit 1
fi

# Sanity: IP must not be pingable. One probe with a 1-second timeout;
# best-effort only — a host that does not answer ping passes this check.
if ping -c 1 -W 1 "$IP" >/dev/null 2>&1; then
    echo "ERROR: $IP is already responding to ping. Refusing to assign." >&2
    exit 1
fi

# Sanity: the shared user-data snippet must exist.
user_snippet_file="/var/lib/vz/snippets/k8s_cloud_init.yaml"
if [ ! -f "$user_snippet_file" ]; then
    echo "ERROR: $user_snippet_file missing." >&2
    # Single quotes keep the backticks literal; inside double quotes bash
    # would try to execute `tg apply` as a command substitution.
    echo '  Run `tg apply` in infra/stacks/infra/ to regenerate it.' >&2
    exit 1
fi

# Sanity: template must be a template. The config is captured before it
# is searched: under pipefail, `qm config | grep -q` could report a false
# failure if grep exits on its first match while qm is still writing.
# A failing `qm config` is treated the same as "no template line".
template_cfg=$(qm config "$TEMPLATE_ID") || template_cfg=""
if ! grep -q '^template: 1' <<<"$template_cfg"; then
    echo "ERROR: VMID $TEMPLATE_ID is not a template." >&2
    exit 1
fi

# Step 1: write the per-node meta-data file. It has two keys:
# local-hostname (the VM name) and instance-id (the VM name plus an
# epoch-seconds suffix, so each run of this script emits a different id).
echo "[1/6] write per-node meta-data snippet ($META_SNIPPET_FILE)"
{
    printf 'local-hostname: %s\n' "$NAME"
    printf 'instance-id: %s-%s\n' "$NAME" "$(date +%s)"
} > "$META_SNIPPET_FILE"

# Step 2: full clone of the template into the new VMID on $STORAGE.
printf '%s\n' "[2/6] qm clone $TEMPLATE_ID -> $VMID ($NAME)"
clone_opts=(--name "$NAME" --full true --storage "$STORAGE")
qm clone "$TEMPLATE_ID" "$VMID" "${clone_opts[@]}"

# Step 3: apply resources, network, boot behaviour and the cloud-init
# snippets in a single `qm set` call.
printf '%s\n' "[3/6] qm set $VMID — VM resources + network + cicustom"
vm_opts=(
    # Guest agent on; 8 cores with the host CPU type; memory and balloon
    # are both set to 32768.
    --agent 1
    --balloon 32768
    --cores 8
    --cpu host
    --memory 32768
    # virtio NIC on the configured bridge and VLAN tag, static address.
    --net0 "virtio,bridge=$BRIDGE,tag=$VLAN"
    --ipconfig0 "ip=$CIDR_IP,gw=$GW"
    --nameserver "$DNS"
    --searchdomain "$SEARCH"
    # Start with the host, using the given startup order and delays.
    --onboot 1
    --startup 'order=5,up=45,down=420'
    # Shared user-data plus the per-node meta-data snippet.
    --cicustom "user=$USER_SNIPPET,meta=$META_SNIPPET"
)
qm set "$VMID" "${vm_opts[@]}"

# Step 4: resize the scsi0 disk to 256G.
printf '%s\n' "[4/6] qm resize $VMID scsi0 256G"
qm resize "$VMID" scsi0 256G

# Step 5: boot the VM.
printf '%s\n' "[5/6] qm start $VMID"
qm start "$VMID"

# Step 6: tell the operator what to expect, then print the config keys
# that this script set.
printf '%s\n' \
    "[6/6] Done. Cloud-init runs now; node should appear in 'kubectl get nodes' within ~6-10 min." \
    "  Tail cloud-init: socat -u UNIX-CONNECT:/var/run/qemu-server/$VMID.serial0 STDOUT | strings" \
    "  Final config:"
qm config "$VMID" \
    | grep -E '^(name|cores|memory|net0|ipconfig0|cicustom|scsi0|onboot):'