#!/usr/bin/env bash
# Extend disk storage on a Kubernetes node VM.
# Drains the node, shuts down the VM, resizes the disk in Proxmox,
# boots the VM, expands the filesystem, and uncordons the node.
#
# Usage:   ./scripts/extend_vm_storage.sh <node-name> <size-increment>
# Example: ./scripts/extend_vm_storage.sh k8s-node2 +64G

# --- Constants ---
readonly PROXMOX_HOST="root@192.168.1.127"
readonly VM_SSH_USER="wizard"
# KUBECTL holds "command + flags" in a single string and is expanded unquoted
# ($KUBECTL ...) so that it word-splits. The kubeconfig is looked up in the
# current working directory, which therefore must not contain whitespace.
KUBECTL="kubectl --kubeconfig $(pwd)/config"
readonly KUBECTL
readonly SHUTDOWN_TIMEOUT=300   # seconds to wait for the VM to report "stopped"
readonly SSH_WAIT_TIMEOUT=300   # seconds to wait for the guest's SSH after boot
readonly POLL_INTERVAL=5        # seconds between status polls

# --- Colors ---
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[0;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m'

# Log helpers. Progress goes to stdout; warnings and errors go to stderr so
# they stay visible when stdout is redirected.
info()  { echo -e "${BLUE}[INFO]${NC} $*"; }
ok()    { echo -e "${GREEN}[OK]${NC} $*"; }
warn()  { echo -e "${YELLOW}[WARN]${NC} $*" >&2; }
error() { echo -e "${RED}[ERROR]${NC} $*" >&2; }

# --- Node-to-VMID mapping ---
declare -A NODE_VMID=(
  [k8s-master]=200
  [k8s-node1]=201
  [k8s-node2]=202
  [k8s-node3]=203
  [k8s-node4]=204
)

# --- Cleanup trap ---
# DRAINED_NODE is non-empty from just before the drain starts until the node
# has been uncordoned again. If the script exits inside that window, the trap
# prints manual recovery instructions.
DRAINED_NODE=""

cleanup() {
  if [[ -n "$DRAINED_NODE" ]]; then
    echo "" >&2
    error "Script exited unexpectedly!"
    warn "The node '$DRAINED_NODE' may still be cordoned/drained."
    warn "Recovery steps:"
    warn "  1. Check VM status:     ssh $PROXMOX_HOST 'qm status ${NODE_VMID[$DRAINED_NODE]}'"
    warn "  2. Start VM if stopped: ssh $PROXMOX_HOST 'qm start ${NODE_VMID[$DRAINED_NODE]}'"
    warn "  3. Uncordon node:       $KUBECTL uncordon $DRAINED_NODE"
  fi
}
trap cleanup EXIT

# --- Input validation ---
# Print the command-line help to stderr and exit with status 1.
usage() {
  cat >&2 <<EOF
Usage: $0 <node-name> <size-increment>

Arguments:
  node-name        One of: ${!NODE_VMID[*]}
  size-increment   Disk size increase, e.g. +64G, +128G

Example:
  $0 k8s-node2 +64G
EOF
  exit 1
}

if [[ $# -ne 2 ]]; then
  usage
fi

NODE_NAME="$1"
SIZE_INCREMENT="$2"

# "+x" expands to "x" only when the key exists, so this distinguishes a
# missing key from a key with an empty value.
if [[ -z "${NODE_VMID[$NODE_NAME]+x}" ]]; then
  error "Unknown node: '$NODE_NAME'"
  echo "Valid nodes: ${!NODE_VMID[*]}" >&2
  exit 1
fi

if [[ ! "$SIZE_INCREMENT" =~ ^\+[0-9]+G$ ]]; then
  error "Invalid size increment: '$SIZE_INCREMENT'"
  echo "Must match pattern +<N>G, e.g. +64G" >&2
  exit 1
fi

VMID="${NODE_VMID[$NODE_NAME]}"

# --- Resolve node IP via kubectl ---
info "Resolving IP for node '$NODE_NAME'..."
NODE_IP=$($KUBECTL get node "$NODE_NAME" \
  -o jsonpath='{.status.addresses[?(@.type=="InternalIP")].address}' 2>/dev/null)
if [[ -z "$NODE_IP" ]]; then
  error "Could not resolve IP for node '$NODE_NAME'. Is the cluster reachable?"
  exit 1
fi
ok "Node IP: $NODE_IP"

# --- Query current disk size ---
info "Querying current disk size for VM $VMID..."
SCSI0_LINE=$(ssh "$PROXMOX_HOST" "qm config $VMID" 2>/dev/null | grep '^scsi0:')
if [[ -z "$SCSI0_LINE" ]]; then
  error "Could not read scsi0 config for VM $VMID."
  exit 1
fi

# Extract the numeric part of "size=<N>G" from the config line. Only whole
# gigabyte sizes are supported; anything else is reported as unparsable.
if [[ "$SCSI0_LINE" =~ (^|[[:space:],])size=([0-9]+)G(,|$) ]]; then
  CURRENT_SIZE="${BASH_REMATCH[2]}G"
else
  error "Could not parse current disk size from: $SCSI0_LINE"
  exit 1
fi

# Force base 10: a leading zero (e.g. "+08G") would otherwise be read as an
# octal literal by shell arithmetic and abort the calculation.
CURRENT_SIZE_NUM=$((10#${CURRENT_SIZE%G}))
INCREMENT_NUM=${SIZE_INCREMENT//[+G]/}
INCREMENT_NUM=$((10#$INCREMENT_NUM))
NEW_SIZE_NUM=$((CURRENT_SIZE_NUM + INCREMENT_NUM))

# Reject a zero increment before reporting the planned change.
if [[ $NEW_SIZE_NUM -le $CURRENT_SIZE_NUM ]]; then
  error "New size (${NEW_SIZE_NUM}G) must be greater than current size (${CURRENT_SIZE_NUM}G)."
  exit 1
fi
ok "Current disk size: ${CURRENT_SIZE_NUM}G → New size: ${NEW_SIZE_NUM}G (${SIZE_INCREMENT})"

# --- Confirmation ---
echo ""
echo "========================================="
echo "  Extend VM Storage"
echo "========================================="
echo "  Node:       $NODE_NAME"
echo "  VMID:       $VMID"
echo "  Node IP:    $NODE_IP"
echo "  Current:    ${CURRENT_SIZE_NUM}G"
echo "  Increment:  $SIZE_INCREMENT"
echo "  New size:   ${NEW_SIZE_NUM}G"
echo "  Proxmox:    $PROXMOX_HOST"
echo "========================================="
echo ""
echo "This will:"
echo "  1. Drain the node (evict pods)"
echo "  2. Shut down the VM"
echo "  3. Resize disk (scsi0) from ${CURRENT_SIZE_NUM}G to ${NEW_SIZE_NUM}G"
echo "  4. Start the VM"
echo "  5. Expand the filesystem inside the guest"
echo "  6. Uncordon the node"
echo "  7. Verify disk usage and node status"
echo ""
read -rp "Proceed? [y/N] " confirm
if [[ ! "$confirm" =~ ^[yY]$ ]]; then
  echo "Aborted."
  exit 0
fi

# --- Step 1: Drain node ---
info "Step 1/7: Draining node '$NODE_NAME'..."
# Set before draining: a failed or interrupted drain can still leave the node
# cordoned, and the EXIT trap should then print the recovery steps.
DRAINED_NODE="$NODE_NAME"
if ! $KUBECTL drain "$NODE_NAME" --ignore-daemonsets --delete-emptydir-data --force --timeout=300s; then
  error "Failed to drain node '$NODE_NAME'."
  exit 1
fi
ok "Node drained."

# --- Step 2: Shutdown VM ---
info "Step 2/7: Shutting down VM $VMID..."
if ! ssh "$PROXMOX_HOST" "qm shutdown $VMID"; then
  error "Failed to send shutdown command to VM $VMID."
  exit 1
fi

info "Waiting for VM to stop (timeout: ${SHUTDOWN_TIMEOUT}s)..."
# SECONDS is bash's wall-clock counter, so time spent inside each ssh call
# counts towards the timeout as well as the sleeps.
deadline=$((SECONDS + SHUTDOWN_TIMEOUT))
while true; do
  status=$(ssh "$PROXMOX_HOST" "qm status $VMID" 2>/dev/null)
  if [[ "$status" == *"stopped"* ]]; then
    break
  fi
  if ((SECONDS >= deadline)); then
    error "VM $VMID did not stop within ${SHUTDOWN_TIMEOUT}s. Current status: $status"
    exit 1
  fi
  sleep "$POLL_INTERVAL"
done
ok "VM stopped."

# --- Step 3: Resize disk ---
info "Step 3/7: Resizing disk scsi0 by $SIZE_INCREMENT..."
if ! ssh "$PROXMOX_HOST" "qm resize $VMID scsi0 $SIZE_INCREMENT"; then
  error "Failed to resize disk on VM $VMID."
  exit 1
fi
ok "Disk resized."

# --- Step 4: Start VM ---
info "Step 4/7: Starting VM $VMID..."
if ! ssh "$PROXMOX_HOST" "qm start $VMID"; then
  error "Failed to start VM $VMID."
  exit 1
fi

info "Waiting for SSH to become available at $NODE_IP (timeout: ${SSH_WAIT_TIMEOUT}s)..."
deadline=$((SECONDS + SSH_WAIT_TIMEOUT))
while true; do
  # Connection errors are expected while the guest boots, hence 2>/dev/null.
  if ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "$VM_SSH_USER@$NODE_IP" "true" 2>/dev/null; then
    break
  fi
  if ((SECONDS >= deadline)); then
    error "SSH not reachable on $NODE_IP within ${SSH_WAIT_TIMEOUT}s."
    exit 1
  fi
  sleep "$POLL_INTERVAL"
done
ok "VM is up and SSH is reachable."

info "Waiting 10s for system stabilization..."
sleep 10

# --- Step 5: Expand filesystem ---
info "Step 5/7: Expanding filesystem inside the guest..."
# The quoted heredoc delimiter means nothing below is expanded locally: the
# text is sent verbatim and executed by `bash -s` on the guest. The remote
# script therefore re-declares its own colours and log helpers.
ssh -o StrictHostKeyChecking=no "$VM_SSH_USER@$NODE_IP" 'bash -s' <<'REMOTE_SCRIPT'
set -o pipefail

RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
BLUE='\033[0;34m'
NC='\033[0m'

info()  { echo -e "${BLUE}[INFO]${NC} $*"; }
ok()    { echo -e "${GREEN}[OK]${NC} $*"; }
warn()  { echo -e "${YELLOW}[WARN]${NC} $*"; }
error() { echo -e "${RED}[ERROR]${NC} $*"; }

ROOT_DEV=$(findmnt -n -o SOURCE /)
ROOT_FSTYPE=$(findmnt -n -o FSTYPE /)

info "Root device: $ROOT_DEV"
info "Root filesystem: $ROOT_FSTYPE"

# Ensure growpart is available.
if ! command -v growpart &>/dev/null; then
  info "Installing growpart (cloud-guest-utils)..."
  # stdin is this very script (bash -s); redirect it away so the package
  # tools cannot swallow the remaining script text.
  if ! { sudo apt-get update -qq && sudo apt-get install -y -qq cloud-guest-utils; } </dev/null; then
    error "Could not install cloud-guest-utils (growpart)."
    exit 1
  fi
fi

# Split a partition device path into the globals DISK and PARTNUM.
# Handles /dev/sdaX-style and /dev/nvmeXnYpZ-style names.
# Arguments: $1 - partition device path
# Returns:   0 when parsed, 1 when the path matches neither naming scheme
split_partition() {
  local part="$1"
  if [[ "$part" =~ ^(/dev/nvme[0-9]+n[0-9]+)p([0-9]+)$ ]] \
    || [[ "$part" =~ ^(/dev/[a-z]+)([0-9]+)$ ]]; then
    DISK="${BASH_REMATCH[1]}"
    PARTNUM="${BASH_REMATCH[2]}"
    return 0
  fi
  return 1
}

# Grow a partition to fill the free space behind it.
# Per the growpart manual, exit status 1 means the partition could not be
# grown for lack of available space (tolerated, e.g. on a re-run), while any
# higher status is a real error.
# Arguments: $1 - disk device, $2 - partition number
# Returns:   0 when grown or already at max size, 1 on a real growpart error
grow_partition() {
  local disk="$1"
  local partnum="$2"
  local rc=0
  info "Growing partition $disk partition $partnum..."
  sudo growpart "$disk" "$partnum" </dev/null || rc=$?
  case "$rc" in
    0) return 0 ;;
    1)
      echo "(growpart: partition may already be at max size)"
      return 0
      ;;
    *)
      error "growpart failed on $disk partition $partnum (exit code $rc)"
      return 1
      ;;
  esac
}

# Grow the filesystem to fill its underlying device.
# Arguments: $1 - block device holding the filesystem, $2 - filesystem type
# Returns:   0 on success, 1 on failure or unsupported filesystem type
resize_fs() {
  local dev="$1"
  local fstype="$2"
  case "$fstype" in
    ext2 | ext3 | ext4)
      info "Running resize2fs on $dev..."
      if ! sudo resize2fs "$dev"; then
        error "resize2fs failed on $dev"
        return 1
      fi
      ;;
    xfs)
      # xfs_growfs operates on the mount point, not the device.
      info "Running xfs_growfs on /..."
      if ! sudo xfs_growfs /; then
        error "xfs_growfs failed"
        return 1
      fi
      ;;
    *)
      error "Unsupported filesystem type: $fstype"
      return 1
      ;;
  esac
  return 0
}

# Print the LV path to use for the root device.
# A /dev/mapper/* path is used as-is. A /dev/dm-N node is mapped back to the
# /dev/mapper entry that resolves to it; if none is found, fall back to the
# first LV reported by lvs.
# Arguments: $1 - root device as reported by findmnt
resolve_lv_path() {
  local root_dev="$1"
  local candidate
  if [[ "$root_dev" != /dev/dm-* ]]; then
    printf '%s\n' "$root_dev"
    return 0
  fi
  for candidate in /dev/mapper/*; do
    if [[ "$(readlink -f "$candidate")" == "$root_dev" ]]; then
      printf '%s\n' "$candidate"
      return 0
    fi
  done
  sudo lvs --noheadings -o lv_path | head -1 | tr -d ' '
}

# Check if root is on LVM (device-mapper).
if [[ "$ROOT_DEV" == /dev/mapper/* || "$ROOT_DEV" == /dev/dm-* ]]; then
  info "LVM layout detected."

  LV_PATH=$(resolve_lv_path "$ROOT_DEV")
  if [[ -z "$LV_PATH" ]]; then
    error "Could not determine LV path for $ROOT_DEV."
    exit 1
  fi
  info "LV path: $LV_PATH"

  # Prefer a PV that belongs to the root LV's volume group; if the VG cannot
  # be determined, fall back to the first PV on the system.
  ROOT_VG=$(sudo lvs --noheadings -o vg_name "$LV_PATH" 2>/dev/null | head -1 | tr -d ' ')
  PV_DEV=""
  if [[ -n "$ROOT_VG" ]]; then
    PV_DEV=$(sudo pvs --noheadings -o pv_name -S "vg_name=$ROOT_VG" 2>/dev/null | head -1 | tr -d ' ')
  fi
  if [[ -z "$PV_DEV" ]]; then
    PV_DEV=$(sudo pvs --noheadings -o pv_name | head -1 | tr -d ' ')
  fi
  if [[ -z "$PV_DEV" ]]; then
    error "Could not determine PV device."
    exit 1
  fi
  info "PV device: $PV_DEV"

  if ! split_partition "$PV_DEV"; then
    error "Could not parse disk/partition from PV: $PV_DEV"
    exit 1
  fi
  info "Disk: $DISK, Partition: $PARTNUM"

  grow_partition "$DISK" "$PARTNUM" || exit 1

  # Resize PV
  info "Resizing PV $PV_DEV..."
  if ! sudo pvresize "$PV_DEV"; then
    error "pvresize failed on $PV_DEV"
    exit 1
  fi

  # Extend LV. A failure here is only a warning: the filesystem resize below
  # still runs against the LV as it is.
  info "Extending LV $LV_PATH to use all free space..."
  if ! sudo lvextend -l +100%FREE "$LV_PATH"; then
    warn "lvextend reported no change (LV may already use all space)."
  fi

  resize_fs "$LV_PATH" "$ROOT_FSTYPE" || exit 1
else
  info "Direct partition layout detected."

  if ! split_partition "$ROOT_DEV"; then
    error "Could not parse disk/partition from: $ROOT_DEV"
    exit 1
  fi
  info "Disk: $DISK, Partition: $PARTNUM"

  grow_partition "$DISK" "$PARTNUM" || exit 1

  resize_fs "$ROOT_DEV" "$ROOT_FSTYPE" || exit 1
fi

ok "Filesystem expansion complete."
df -h /
REMOTE_SCRIPT

# $? here is the exit status of the ssh command above, i.e. of the remote
# script (or of ssh itself if the connection failed).
if [[ $? -ne 0 ]]; then
  error "Filesystem expansion failed on the guest."
  exit 1
fi
ok "Filesystem expanded."

# --- Step 6: Uncordon node ---
info "Step 6/7: Uncordoning node '$NODE_NAME'..."
# KUBECTL is expanded unquoted on purpose: it holds the command and its flags.
if ! $KUBECTL uncordon "$NODE_NAME"; then
  error "Failed to uncordon node '$NODE_NAME'."
  exit 1
fi
# The node is schedulable again; clear the marker so the EXIT trap no longer
# prints recovery instructions.
DRAINED_NODE=""
ok "Node uncordoned."
# --- Step 7: Verify ---
# Report only: shows root filesystem usage as seen from inside the guest,
# then the node as listed by kubectl. Neither command's exit status is
# checked, so a failure here does not change the script's result.
info "Step 7/7: Verification"

echo
info "Disk usage on ${NODE_NAME}:"
guest_target="${VM_SSH_USER}@${NODE_IP}"
ssh -o StrictHostKeyChecking=no "${guest_target}" 'df -h /'

echo
info "Node status:"
# KUBECTL carries the command plus its flags in one string, so it must be
# expanded unquoted to word-split.
${KUBECTL} get node "${NODE_NAME}"

echo
ok "Storage extension complete for ${NODE_NAME}."