Proxmox k3s create script #proxmox #shell #scripts
#!/usr/bin/env bash
# ====================================================
# K3S VM Provisioning Script
# ====================================================
#
# This script automates the provisioning of K3s VMs on a Proxmox VE cluster.
# It performs the following operations:
# 1. Creates server, agent (worker), and storage VMs by cloning a template VM
# 2. Configures resources (RAM, CPU, disk) for each VM type
# 3. Migrates VMs to specified Proxmox nodes for distributed deployment
#
# Requirements:
# - Proxmox VE with qm and pvesh commands available
# - Template VM with ID 1000 must exist
# - Sufficient resources on target Proxmox nodes
# - Valid config.json file with VM definitions
#
# Usage: ./create_vms.sh --config /path/to/config.json [--cleanup] [--vmid <id1,id2,...>] [--disable-auto-start] [--nocloud-iso <iso-file>]
#
# Output: Creates a log file (vm_creation_output.log) with detailed operation logs
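#
# Example config.json (illustrative only; the layout is inferred from the jq
# queries used in this script: a top-level "nodes" array whose entries carry a
# numeric "vmid", a "role" of server/agent/storage, and a target "node".
# The IDs and node names below are placeholders):
#
# {
#   "nodes": [
#     { "vmid": 101, "role": "server",  "node": "pve1" },
#     { "vmid": 102, "role": "agent",   "node": "pve2" },
#     { "vmid": 103, "role": "storage", "node": "pve3" }
#   ]
# }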
# ====================================================
# Function to display usage information
usage() {
echo "Usage: $0 --config <config.json> [--cleanup] [--vmid <id1,id2,...>] [--disable-auto-start] [--nocloud-iso <iso-file>]"
echo ""
echo " --config Path to the JSON configuration file containing VM definitions"
echo " --cleanup Remove stray VM configuration files if they exist but VM is not registered"
echo " --vmid Comma-separated list of VM IDs to process (optional, defaults to all VMs in config)"
echo " --disable-auto-start Disable automatic start on boot for VMs (default: auto-start enabled)"
echo " --nocloud-iso ISO file name from NFS storage to use instead of cloud-init CD-ROM"
echo ""
echo "Example:"
echo " $0 --config ./config.json"
echo " $0 --config ./config.json --cleanup"
echo " $0 --config ./config.json --vmid 101,102,103"
echo " $0 --config ./config.json --disable-auto-start"
echo " $0 --config ./config.json --nocloud-iso nocloud-amd64.iso"
exit 1
}
# Initialize variables to prevent unbound variable errors
CONFIG_FILE=""
CLEANUP=""
VM_IDS=""
DISABLE_AUTO_START=""
NOCLOUD_ISO=""
# Parse and validate command-line arguments
while [[ $# -gt 0 ]]; do
case $1 in
--config)
if [[ -z "$2" ]]; then
echo "Error: --config flag requires a value" >&2
exit 1
fi
CONFIG_FILE="$2"
shift 2
;;
--cleanup)
CLEANUP="cleanup"
shift
;;
--vmid)
if [[ -z "$2" ]]; then
echo "Error: --vmid flag requires a comma-separated list of VM IDs" >&2
exit 1
fi
VM_IDS="$2"
shift 2
;;
--disable-auto-start)
DISABLE_AUTO_START="true"
shift
;;
--nocloud-iso)
if [[ -z "$2" ]]; then
echo "Error: --nocloud-iso flag requires an ISO filename" >&2
exit 1
fi
NOCLOUD_ISO="$2"
shift 2
;;
-h|--help)
usage
;;
*)
echo "Unknown option: $1" >&2
usage
;;
esac
done
# Validate required arguments
if [[ -z "$CONFIG_FILE" ]]; then
echo "Error: --config flag is required" >&2
usage
fi
if [[ ! -f "$CONFIG_FILE" ]]; then
echo "Error: Config file '$CONFIG_FILE' does not exist" >&2
exit 1
fi
# NOW enable strict error handling after argument validation
# Enable strict error handling:
# - 'set -e': Exit immediately if a command exits with a non-zero status.
# - 'set -u': Treat unset variables as an error and exit immediately.
# - 'set -o pipefail': Return the exit code of the last command in the pipeline that failed.
set -euo pipefail
# Load shared library
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
LIB_DIR="$SCRIPT_DIR/lib"
if [[ -f "$LIB_DIR/proxmox-common.sh" ]]; then
source "$LIB_DIR/proxmox-common.sh"
else
echo "Error: Cannot find shared library at $LIB_DIR/proxmox-common.sh" >&2
exit 1
fi
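# Functions and variables this script expects the shared library to provide
# (based on how they are used below): init_script_env, init_log_file,
# resource_exists, run_with_spinner, spinner, filter_ids, configure_autostart,
# plus the LOG_FILE / LAST_COMMAND / LAST_COMMAND_CONTEXT error-context variables.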
echo "=== K3S VM Provisioning Script ==="
echo "Config file: $CONFIG_FILE"
if [[ -n "$NOCLOUD_ISO" ]]; then
echo "NoCloud ISO: $NOCLOUD_ISO (will replace cloud-init CD-ROM)"
fi
echo "This script will provision K3s VMs on Proxmox nodes"
echo "Script will exit on any error"
echo "==============================="
# Initialize script environment with enhanced error handling
init_script_env
# Function for preflight checks with enhanced error context
#
# Verifies that required tools (qm, pvesh, and jq) are available
# before proceeding with VM provisioning.
#
# Arguments: None
# Returns: Exits with status 1 if required tools are not found
preflight_checks() {
LAST_COMMAND_CONTEXT="Performing preflight checks for VM provisioning"
echo "Performing preflight checks..."
local required_tools="qm pvesh jq"
for tool in $required_tools; do
LAST_COMMAND="command -v $tool"
if ! command -v "$tool" &> /dev/null; then
case "$tool" in
"qm")
echo "Error: 'qm' command not found. Please ensure Proxmox VE CLI tools are installed." >&2
echo "This command is required for VM management operations." >&2
;;
"pvesh")
echo "Error: 'pvesh' command not found. Please ensure Proxmox VE API client is installed." >&2
echo "This command is required for querying Proxmox cluster information." >&2
;;
"jq")
echo "Error: 'jq' command not found. Please install jq for JSON parsing." >&2
echo "Install with: apt-get install jq" >&2
;;
esac
exit 1
fi
done
echo "✓ Required tools are available"
LAST_COMMAND_CONTEXT=""
LAST_COMMAND=""
}
# Function to restore cursor visibility
#
# Ensures the terminal cursor is visible, typically called
# after operations that might hide the cursor.
#
# Arguments: None
# Returns: None
restore_cursor() {
printf "\033[?25h" # Show cursor
}
# Ensure cursor is restored on script exit
trap restore_cursor EXIT
# Function to safely clone a VM with enhanced error reporting
#
# Creates a new VM by cloning a template VM, with safety checks to prevent
# overwriting existing VMs with the same ID.
#
# Arguments:
# $1 - Source VM ID to clone from
# $2 - Target VM ID to create
# $3 - Name for the new VM
# $4 - Storage location for the VM
# $5 - (Optional) "cleanup" to remove stray config files
#
# Returns: 0 if successful or VM already exists
# Exits with status 1 if cloning fails
clone_vm_with_check() {
local source_id=$1
local target_id=$2
local vm_name=$3
local storage=$4
local cleanup=${5:-""}
local desc="Creating ${vm_name}..."
local config_file="/etc/pve/nodes/$(hostname)/qemu-server/${target_id}.conf"
# Set context for error handling
LAST_COMMAND_CONTEXT="$desc (VM ID: $target_id)"
printf "%-50s" "$desc"
echo "$(date): $desc - Checking if VM $target_id already exists" >> "$LOG_FILE"
# Check if VM exists (using shared library)
LAST_COMMAND="qm list | grep \"^ *$target_id \""
if resource_exists "$target_id" "vm"; then
echo "[WARNING] VM $target_id already exists. Skipping."
echo "$(date): $desc - VM $target_id already exists. Skipping." >> "$LOG_FILE"
LAST_COMMAND_CONTEXT=""
return 0
fi
# Check if config file exists but VM is not registered
if [[ -f "$config_file" ]]; then
if [[ "$cleanup" == "cleanup" ]]; then
echo "[WARNING] VM $target_id config exists but VM not registered. Removing config file."
echo "$(date): $desc - VM $target_id config exists but VM not registered. Removing config file." >> "$LOG_FILE"
LAST_COMMAND="rm -f $config_file"
rm -f "$config_file"
else
echo "[WARNING] VM $target_id config exists but VM not registered. Use --cleanup to remove."
echo "$(date): $desc - VM $target_id config exists but VM not registered. Skipping." >> "$LOG_FILE"
LAST_COMMAND_CONTEXT=""
return 0
fi
fi
# Verify source VM exists before attempting clone
LAST_COMMAND="qm list | grep \"^ *$source_id \""
if ! qm list | grep -q "^ *$source_id "; then
echo "[FAILED]"
echo "Error: Source VM ID $source_id does not exist. Cannot clone." >&2
echo "Available VMs:" >&2
qm list >&2
exit 1
fi
local cmd="qm clone $source_id $target_id --name $vm_name --full true --storage $storage"
run_with_spinner "$desc" "$cmd"
LAST_COMMAND_CONTEXT=""
}
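# Illustrative example of the clone command this function ends up running for a
# server VM (the target ID and name are placeholders):
#   qm clone 1000 101 --name k3s-server-001 --full true --storage vmdata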
# Function to check node availability
#
# Verifies that a Proxmox node exists and is online before VM migration.
# (The memory/disk capacity check further down is currently commented out.)
#
# Arguments:
# $1 - Name of the Proxmox node to check
#
# Returns: Exits with status 1 if the node is offline or doesn't exist
check_node() {
local node=$1
echo "Checking availability and capacity for node: $node..."
# Check if node exists and is online
if ! pvesh get /nodes --output-format=json | jq -e ".[] | select(.node == \"$node\" and .status == \"online\")" > /dev/null; then
echo "Error: Node $node is not online or doesn't exist. Aborting migration." >&2
exit 1
fi
# Get free memory and disk space
local free_memory=$(pvesh get /nodes/$node/status --output-format=json | jq '.memory.free' | tr -d '"')
local free_disk=$(pvesh get /nodes/$node/status --output-format=json | jq '.rootfs.free' | tr -d '"')
## Check if resources are sufficient (16GB RAM and 250GB disk minimum)
# if [[ ${free_memory:-0} -lt 17179869184 || ${free_disk:-0} -lt 268435456000 ]]; then
# echo "Error: Node $node does not have sufficient resources. Aborting migration." >&2
# exit 1
# fi
echo "✓ Node $node is online and has sufficient resources"
}
# Function to resize a VM disk if it exists
#
# Resizes the specified disk of a VM after checking if the disk exists.
# Executes the command on the remote Proxmox node via SSH.
#
# Arguments:
# $1 - VM ID to modify
# $2 - Disk identifier (e.g., scsi0)
# $3 - New size for the disk (e.g., 256G)
# $4 - Proxmox node name where the VM is located
#
# Returns: None, but outputs error message if disk doesn't exist
resize_disk() {
local vm_id=$1
local disk=$2
local size=$3
local node=$4
local desc="Resizing disk $disk on VM $vm_id to $size on node $node"
# Check if the disk exists via SSH
if ssh -o StrictHostKeyChecking=no $node "qm config $vm_id | grep -q '$disk'"; then
local cmd="ssh -o StrictHostKeyChecking=no $node 'qm resize $vm_id $disk $size'"
run_with_spinner "$desc" "$cmd"
else
echo "$(date): Error: Disk $disk does not exist for VM $vm_id on node $node. Skipping resize." >> "$LOG_FILE"
printf "%-50s[SKIPPED] - Disk does not exist\n" "$desc"
fi
}
# Function to get VM configuration from JSON
#
# Extracts VM configuration details from the JSON config file.
#
# Arguments:
# $1 - VM ID to look up
# $2 - Property to extract (role, node, etc.)
#
# Returns: Outputs the requested property value
get_vm_config() {
local vm_id=$1
local property=$2
jq -r ".nodes[] | select(.vmid == $vm_id) | .$property" "$CONFIG_FILE"
}
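# Example usage (assuming the illustrative config.json shown in the header):
#   role=$(get_vm_config 101 "role")   # -> "server"
#   node=$(get_vm_config 101 "node")   # -> "pve1"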
# Function to get all VM IDs from config
#
# Returns a list of all VM IDs defined in the configuration.
#
# Arguments: None
# Returns: Outputs space-separated list of VM IDs
get_all_vm_ids() {
jq -r '.nodes[].vmid' "$CONFIG_FILE" | tr '\n' ' '
}
# Function to get VM IDs by role
#
# Returns a list of VM IDs that match the specified role.
#
# Arguments:
# $1 - Role to filter by (server, agent, storage)
#
# Returns: Outputs space-separated list of VM IDs
get_vm_ids_by_role() {
local role=$1
jq -r ".nodes[] | select(.role == \"$role\") | .vmid" "$CONFIG_FILE" | tr '\n' ' '
}
# Function to configure VM resources based on role
#
# Applies role-specific resource configurations (RAM, CPU, disk).
#
# Arguments:
# $1 - VM ID to configure
# $2 - VM role (server, agent, storage)
#
# Returns: None
configure_vm_resources() {
local vm_id=$1
local role=$2
case $role in
"agent")
echo "Configuring agent VM $vm_id (16GB RAM, 4 vCPUs)..."
qm set "$vm_id" --memory 16384 --balloon 0 >> "$LOG_FILE" 2>&1
qm set "$vm_id" --sockets 1 --cores 4 >> "$LOG_FILE" 2>&1
;;
"storage")
echo "Configuring storage VM $vm_id (4 vCPUs)..."
qm set "$vm_id" --sockets 1 --cores 4 >> "$LOG_FILE" 2>&1
;;
"server")
echo "Server VM $vm_id using default configuration..."
;;
*)
echo "Warning: Unknown role '$role' for VM $vm_id. Using default configuration." >&2
;;
esac
}
# Function to tag a VM with its role
#
# Applies a role-based tag to a VM for easier identification and management.
#
# Arguments:
# $1 - VM ID to tag
# $2 - VM role (server, agent, storage)
#
# Returns: None
tag_vm() {
local vm_id=$1
local role=$2
local tag="k3s-${role}"
local desc="Tagging VM $vm_id with role '$tag'"
printf "%-50s" "$desc"
echo "$(date): $desc" >> "$LOG_FILE"
qm set "$vm_id" --tags "$tag" >> "$LOG_FILE" 2>&1 &
local pid=$!
spinner $pid
local status=0
wait $pid || status=$?  # use '||' so 'set -e' does not abort before we can report [FAILED]
if [ $status -eq 0 ]; then
echo "[DONE]"
else
echo "[FAILED] - Check $LOG_FILE for details"
fi
}
# Function to filter VM IDs based on user-provided list
#
# When the --vmid option is used, this function filters the complete VM ID list
# to only include the specified VMs. (Note: the main flow currently calls
# filter_ids from the shared library instead of this local helper.)
#
# Arguments:
# $1 - Space-separated list of all VM IDs from config
# $2 - Comma-separated list of VM IDs to filter (from --vmid option)
#
# Returns: Outputs space-separated list of filtered VM IDs
filter_vm_ids() {
local all_ids=($1)
local filter_ids_str=$2
# If no filter is provided, return all IDs
if [[ -z "$filter_ids_str" ]]; then
echo "$1"
return
fi
# Convert comma-separated list to space-separated for easier processing
local filter_ids=(${filter_ids_str//,/ })
local result=""
# Loop through all IDs and check if they're in the filter list
for id in "${all_ids[@]}"; do
for filter_id in "${filter_ids[@]}"; do
if [[ "$id" == "$filter_id" ]]; then
result="$result $id"
break
fi
done
done
echo "$result"
}
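# Example:
#   filter_vm_ids "101 102 103" "101,103"   # -> " 101 103"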
# Function to configure VM auto-start behavior
#
# Configures whether a VM should automatically start on boot.
# By default, VMs are set to auto-start unless --disable-auto-start is specified.
# (Note: the main flow currently calls configure_autostart from the shared library
# instead of this local helper.)
#
# Arguments:
# $1 - VM ID to configure
# $2 - VM role (for logging purposes)
# $3 - Target node where the VM is located
#
# Returns: None
configure_vm_autostart() {
local vm_id=$1
local role=$2
local target_node=$3
if [[ "$DISABLE_AUTO_START" == "true" ]]; then
local desc="Disabling auto-start for VM $vm_id on $target_node"
local onboot_value=0
else
local desc="Enabling auto-start for VM $vm_id on $target_node"
local onboot_value=1
fi
printf "%-50s" "$desc"
echo "$(date): $desc" >> "$LOG_FILE"
# Execute the command on the target node via SSH if it's remote, or locally if it's the current node
local current_node=$(hostname)
if [[ "$target_node" == "$current_node" ]]; then
qm set "$vm_id" --onboot "$onboot_value" >> "$LOG_FILE" 2>&1 &
else
ssh -o StrictHostKeyChecking=no "$target_node" "qm set $vm_id --onboot $onboot_value" >> "$LOG_FILE" 2>&1 &
fi
local pid=$!
spinner $pid
local status=0
wait $pid || status=$?  # use '||' so 'set -e' does not abort before we can report [FAILED]
if [ $status -eq 0 ]; then
echo "[DONE]"
else
echo "[FAILED] - Check $LOG_FILE for details"
fi
}
# Function to detect ISO storage in Proxmox
#
# Queries the Proxmox configuration to find available ISO storage locations.
#
# Arguments: None
# Returns: Outputs the first available ISO storage name
detect_iso_storage() {
echo "Detecting Proxmox ISO storage..." >&2
# Get all storage with ISO content type
local iso_storages=$(pvesm status --content iso 2>/dev/null | tail -n +2 | awk '{print $1}' || echo "")
if [[ -z "$iso_storages" ]]; then
echo "Error: No ISO storage found in Proxmox configuration." >&2
echo "Please configure at least one storage with 'iso' content type." >&2
exit 1
fi
# Use the first available ISO storage
local selected_storage=$(echo "$iso_storages" | head -n 1)
echo "✓ Using ISO storage: $selected_storage" >&2
echo "$selected_storage"
}
# Function to get storage path on a specific node
#
# Retrieves the filesystem path for a storage location on a Proxmox node.
# Can execute either locally or remotely via SSH depending on the target node.
#
# Arguments:
# $1 - Storage name to query
# $2 - Target Proxmox node name (optional, defaults to current node)
#
# Returns: Outputs the full path to the ISO directory for the storage
get_storage_path() {
local storage_name=$1
local target_node=${2:-$(hostname)}
local current_node=$(hostname)
echo "Getting storage path for $storage_name on node $target_node..." >&2
# Get storage config and extract path
local storage_path=""
if [[ "$target_node" == "$current_node" ]]; then
# Execute locally
storage_path=$(pvesh get /storage/$storage_name --output-format=json 2>/dev/null | jq -r '.path // .export' 2>/dev/null || echo "")
else
# Execute remotely via SSH
storage_path=$(ssh -o StrictHostKeyChecking=no "$target_node" "pvesh get /storage/$storage_name --output-format=json 2>/dev/null | jq -r '.path // .export' 2>/dev/null" || echo "")
fi
if [[ -z "$storage_path" ]]; then
echo "Error: Could not determine path for storage '$storage_name' on node '$target_node'." >&2
echo "Please check storage configuration with: pvesh get /storage/$storage_name" >&2
exit 1
fi
# Ensure the iso subdirectory exists
local iso_path="${storage_path}/template/iso"
if [[ "$target_node" == "$current_node" ]]; then
# Check locally
if [[ ! -d "$iso_path" ]]; then
echo "Creating ISO directory: $iso_path" >&2
mkdir -p "$iso_path" 2>/dev/null || {
echo "Error: Cannot create ISO directory. Check permissions." >&2
echo "Attempted path: $iso_path" >&2
exit 1
}
fi
else
# Check remotely via SSH
if ! ssh -o StrictHostKeyChecking=no "$target_node" "[[ -d '$iso_path' ]]" 2>/dev/null; then
echo "Creating ISO directory: $iso_path on $target_node" >&2
ssh -o StrictHostKeyChecking=no "$target_node" "mkdir -p '$iso_path'" 2>/dev/null || {
echo "Error: Cannot create ISO directory on $target_node. Check permissions." >&2
echo "Attempted path: $iso_path" >&2
exit 1
}
fi
fi
echo "✓ Storage path: $iso_path" >&2
echo "$iso_path"
}
# Function to list available ISOs on a storage
#
# Lists all ISO files available in the specified storage location.
# Can execute either locally or remotely via SSH depending on the target node.
#
# Arguments:
# $1 - Storage name to query
# $2 - Target Proxmox node name (optional, defaults to current node)
#
# Returns: Outputs list of ISO files available in the storage
list_available_isos() {
local storage_name=$1
local target_node=${2:-$(hostname)}
local current_node=$(hostname)
echo "Listing available ISOs in storage $storage_name on node $target_node..." >&2
# List ISOs using pvesm
local iso_list=""
if [[ "$target_node" == "$current_node" ]]; then
# Execute locally
iso_list=$(pvesm list "$storage_name" --content iso 2>/dev/null | tail -n +2 | awk '{print $1}' | sed "s|^$storage_name:iso/||" || echo "")
else
# Execute remotely via SSH
iso_list=$(ssh -o StrictHostKeyChecking=no "$target_node" "pvesm list $storage_name --content iso 2>/dev/null | tail -n +2 | awk '{print \$1}' | sed 's|^$storage_name:iso/||'" || echo "")
fi
if [[ -z "$iso_list" ]]; then
echo "No ISOs found in storage $storage_name on node $target_node" >&2
return 0
fi
echo "Available ISOs:" >&2
echo "$iso_list" | while read -r iso; do
echo " - $iso" >&2
done
echo "$iso_list"
}
# Function to check if a specific ISO exists in storage
#
# Checks whether a specific ISO file exists in the given storage location.
#
# Arguments:
# $1 - Storage name to check
# $2 - ISO filename to look for
# $3 - Target Proxmox node name (optional, defaults to current node)
#
# Returns: 0 if ISO exists, 1 if not found
check_iso_exists() {
local storage_name=$1
local iso_filename=$2
local target_node=${3:-$(hostname)}
local available_isos=$(list_available_isos "$storage_name" "$target_node")
if echo "$available_isos" | grep -q "^$iso_filename$"; then
echo "✓ ISO $iso_filename found in storage $storage_name on node $target_node" >&2
return 0
else
echo "✗ ISO $iso_filename not found in storage $storage_name on node $target_node" >&2
return 1
fi
}
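# Example usage (the storage name and ISO file are placeholders):
#   if check_iso_exists "nfs" "nocloud-amd64.iso" "pve2"; then
#       echo "ISO is available on pve2"
#   fi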
# Function to configure VM with NoCloud ISO
#
# Replaces the cloud-init CD-ROM with a custom NoCloud ISO from NFS storage.
# This allows using custom cloud-init configurations or Talos ISOs.
# Also replaces the existing Ubuntu cloud image disk with an empty disk to prevent
# Ubuntu from booting after reboot, configures boot order to prioritize disk first,
# and sets CPU arguments for Talos v1.0+ compatibility (x86-64-v2 microarchitecture).
# Properly cleans up the old disk storage to avoid leaving unused disk images.
#
# Arguments:
# $1 - VM ID to configure
# $2 - ISO filename (without storage prefix)
# $3 - Target node where the VM is located
#
# Returns: None
configure_nocloud_iso() {
local vm_id=$1
local iso_filename=$2
local target_node=$3
local current_node=$(hostname)
# Determine disk size based on VM role
local role=$(get_vm_config "$vm_id" "role")
local disk_size="10" # Default size
if [[ "$role" == "storage" ]]; then
disk_size="256" # Storage VMs need larger disks
fi
local desc="Configuring NoCloud ISO for VM $vm_id (${disk_size}G disk)"
printf "%-50s" "$desc"
echo "$(date): $desc - ISO: $iso_filename, Role: $role, Disk size: ${disk_size}G on node $target_node" >> "$LOG_FILE"
# First, verify the ISO exists in NFS storage on the target node
if ! check_iso_exists "nfs" "$iso_filename" "$target_node"; then
echo "[FAILED] - ISO $iso_filename not found in NFS storage on $target_node"
echo "$(date): Error: ISO $iso_filename not found in NFS storage on $target_node" >> "$LOG_FILE"
exit 1
fi
# Get current disk configuration before making changes
local current_disk_config=""
if [[ "$target_node" == "$current_node" ]]; then
current_disk_config=$(qm config "$vm_id" | grep '^scsi0:' || echo "")
else
current_disk_config=$(ssh -o StrictHostKeyChecking=no "$target_node" "qm config $vm_id | grep '^scsi0:'" || echo "")
fi
echo "$(date): $desc - Current scsi0 config: $current_disk_config" >> "$LOG_FILE"
# Extract the current disk identifier (e.g., vmdata:vm-1211-disk-0)
local old_disk_id=""
if [[ -n "$current_disk_config" ]]; then
# Extract disk ID from config like "scsi0: vmdata:vm-1211-disk-0,size=10G"
old_disk_id=$(echo "$current_disk_config" | sed -n 's/scsi0: \([^,]*\).*/\1/p')
echo "$(date): $desc - Old disk ID: $old_disk_id" >> "$LOG_FILE"
fi
# Configure the CD-ROM with the NoCloud ISO, boot order, and CPU args for Talos compatibility
local iso_path="nfs:iso/$iso_filename"
local cpu_args="-cpu kvm64,+cx16,+lahf_lm,+popcnt,+sse3,+ssse3,+sse4.1,+sse4.2"
# Step 1: Delete the existing disk
echo "$(date): $desc - Step 1: Deleting existing scsi0 disk" >> "$LOG_FILE"
local delete_cmd=""
if [[ "$target_node" == "$current_node" ]]; then
delete_cmd="qm set $vm_id --delete scsi0"
else
delete_cmd="ssh -o StrictHostKeyChecking=no $target_node 'qm set $vm_id --delete scsi0'"
fi
echo "$(date): $desc - DELETE COMMAND: $delete_cmd" >> "$LOG_FILE"
if ! bash -c "$delete_cmd" >> "$LOG_FILE" 2>&1; then
echo "[FAILED] - Could not delete existing disk"
echo "$(date): $desc - Failed to delete existing scsi0 disk" >> "$LOG_FILE"
exit 1
fi
# Step 2: Clean up the old disk storage immediately after deletion
if [[ -n "$old_disk_id" && "$old_disk_id" =~ ^[^:]+:vm-[0-9]+-disk-[0-9]+$ ]]; then
echo "$(date): $desc - Step 2: Cleaning up old disk storage: $old_disk_id" >> "$LOG_FILE"
local cleanup_cmd=""
if [[ "$target_node" == "$current_node" ]]; then
cleanup_cmd="pvesm free $old_disk_id"
else
cleanup_cmd="ssh -o StrictHostKeyChecking=no $target_node 'pvesm free $old_disk_id'"
fi
echo "$(date): $desc - CLEANUP COMMAND: $cleanup_cmd" >> "$LOG_FILE"
if ! bash -c "$cleanup_cmd" >> "$LOG_FILE" 2>&1; then
echo "$(date): $desc - Warning: Could not clean up old disk $old_disk_id (may have been already removed)" >> "$LOG_FILE"
else
echo "$(date): $desc - Successfully cleaned up old disk $old_disk_id" >> "$LOG_FILE"
fi
# Wait a moment for storage cleanup to complete
echo "$(date): $desc - Waiting for storage cleanup to complete..." >> "$LOG_FILE"
sleep 2
else
echo "$(date): $desc - Old disk ID '$old_disk_id' is not valid for cleanup (expected format: storage:vm-id-disk-number)" >> "$LOG_FILE"
fi
# Step 3: Configure new disk, ISO, boot order, and CPU args
echo "$(date): $desc - Step 3: Creating new ${disk_size}G disk and configuring VM" >> "$LOG_FILE"
local config_cmd=""
if [[ "$target_node" == "$current_node" ]]; then
config_cmd="qm set $vm_id --scsi0 vmdata:$disk_size --ide2 $iso_path,media=cdrom --boot order=scsi0\\;ide2 --args '$cpu_args'"
else
config_cmd="ssh -o StrictHostKeyChecking=no $target_node 'qm set $vm_id --scsi0 vmdata:$disk_size --ide2 $iso_path,media=cdrom --boot order=scsi0\\;ide2 --args \"$cpu_args\"'"
fi
echo "$(date): $desc - CONFIG COMMAND: $config_cmd" >> "$LOG_FILE"
bash -c "$config_cmd" >> "$LOG_FILE" 2>&1 &
local pid=$!
spinner $pid
local status=0
wait $pid || status=$?  # use '||' so 'set -e' does not abort before the failure is handled below
if [ $status -ne 0 ]; then
echo "[FAILED] - Check $LOG_FILE for details"
echo "$(date): $desc - Failed to configure VM with new disk and ISO" >> "$LOG_FILE"
exit 1
fi
echo "[DONE]"
echo "$(date): VM $vm_id configured with NoCloud ISO: $iso_filename, new ${disk_size}G disk, boot order: scsi0,ide2, and CPU args for Talos compatibility" >> "$LOG_FILE"
# Verify the new configuration
local new_disk_config=""
if [[ "$target_node" == "$current_node" ]]; then
new_disk_config=$(qm config "$vm_id" | grep '^scsi0:' || echo "")
else
new_disk_config=$(ssh -o StrictHostKeyChecking=no "$target_node" "qm config $vm_id | grep '^scsi0:'" || echo "")
fi
echo "$(date): $desc - New scsi0 config: $new_disk_config" >> "$LOG_FILE"
# Check if the new disk reused the old identifier
if [[ -n "$old_disk_id" && "$new_disk_config" =~ $old_disk_id ]]; then
echo "$(date): $desc - Successfully reused old disk identifier: $old_disk_id" >> "$LOG_FILE"
else
echo "$(date): $desc - New disk created with different identifier (this is normal)" >> "$LOG_FILE"
fi
}
# Function to verify NoCloud ISO availability
#
# Checks if the specified NoCloud ISO exists in NFS storage on all target nodes
# before proceeding with VM configuration.
#
# Arguments:
# $1 - ISO filename to verify
# $2 - Array of VM IDs to check target nodes for
#
# Returns: Exits with status 1 if ISO not found on any required node
verify_nocloud_iso_availability() {
local iso_filename=$1
local vm_ids=("${@:2}")
echo "Verifying NoCloud ISO availability..."
# Get unique target nodes from the VM configuration
local target_nodes=()
for vm_id in "${vm_ids[@]}"; do
if [[ -n "$vm_id" ]]; then
local target_node=$(get_vm_config "$vm_id" "node")
if [[ -n "$target_node" ]]; then
# Add to array if not already present
if [[ ! " ${target_nodes[@]} " =~ " ${target_node} " ]]; then
target_nodes+=("$target_node")
fi
fi
fi
done
# Check ISO availability on each target node
local missing_nodes=()
for node in "${target_nodes[@]}"; do
echo "Checking ISO $iso_filename on node $node..."
if ! check_iso_exists "nfs" "$iso_filename" "$node"; then
missing_nodes+=("$node")
fi
done
if [[ ${#missing_nodes[@]} -gt 0 ]]; then
echo "Error: NoCloud ISO '$iso_filename' not found on the following nodes:" >&2
for node in "${missing_nodes[@]}"; do
echo " - $node" >&2
done
echo "" >&2
echo "Please ensure the ISO is available in NFS storage on all target nodes." >&2
echo "You can use the talos_download.sh script to download ISOs to storage." >&2
exit 1
fi
echo "✓ NoCloud ISO $iso_filename verified on all target nodes"
}
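# Run the preflight checks defined above before starting the main provisioning flow.
# (Assumption: init_script_env from the shared library does not already perform
# these checks; if it does, this call is redundant but harmless.)
preflight_checks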
# Create output log file with enhanced error handling
init_log_file "vm_creation_output.log" "Starting VM provisioning from config: $CONFIG_FILE"
# Check if template VM exists with better error reporting
LAST_COMMAND_CONTEXT="Verifying template VM availability"
LAST_COMMAND="qm list | grep \"^ *1000 \""
if ! qm list | grep -q "^ *1000 "; then
echo "Error: Source VM ID 1000 does not exist. Cannot proceed with VM provisioning." >&2
echo "" >&2
echo "Available VMs:" >&2
qm list >&2
echo "" >&2
echo "Please ensure template VM 1000 exists before running this script." >&2
exit 1
fi
echo "✓ Template VM 1000 found"
LAST_COMMAND_CONTEXT=""
LAST_COMMAND=""
# Get all VM IDs from config
all_vm_ids_str=$(get_all_vm_ids)
all_vm_ids=($(echo "$all_vm_ids_str"))
# Filter VM IDs if --vmid is provided (using shared library)
if [[ -n "$VM_IDS" ]]; then
filtered_vm_ids_str=$(filter_ids "$all_vm_ids_str" "$VM_IDS")
filtered_vm_ids=($(echo "$filtered_vm_ids_str"))
echo "Filtering VMs to provision: ${filtered_vm_ids[*]} (from list of ${#all_vm_ids[@]} total VMs)"
all_vm_ids=("${filtered_vm_ids[@]}")
else
echo "Found ${#all_vm_ids[@]} VMs to provision: ${all_vm_ids[*]}"
fi
# Verify NoCloud ISO availability if specified
if [[ -n "$NOCLOUD_ISO" ]]; then
verify_nocloud_iso_availability "$NOCLOUD_ISO" "${all_vm_ids[@]}"
fi
# Create all VMs
echo "[1/6] Creating VMs from template..."
for vm_id in "${all_vm_ids[@]}"; do
if [[ -n "$vm_id" ]]; then
role=$(get_vm_config "$vm_id" "role")
vm_name="k3s-${role}-$(printf "%03d" $((vm_id % 10)))"
clone_vm_with_check 1000 "$vm_id" "$vm_name" "vmdata" "$CLEANUP"
fi
done
# Tag VMs with their roles
echo "[2/6] Tagging VMs with roles..."
for vm_id in "${all_vm_ids[@]}"; do
if [[ -n "$vm_id" ]]; then
role=$(get_vm_config "$vm_id" "role")
tag_vm "$vm_id" "$role"
fi
done
# Configure VMs by role
echo "[3/6] Configuring VM resources..."
for vm_id in "${all_vm_ids[@]}"; do
if [[ -n "$vm_id" ]]; then
role=$(get_vm_config "$vm_id" "role")
configure_vm_resources "$vm_id" "$role"
fi
done
# Migrate VMs to target nodes
echo "[4/6] Migrating VMs to target nodes..."
for vm_id in "${all_vm_ids[@]}"; do
if [[ -n "$vm_id" ]]; then
target_node=$(get_vm_config "$vm_id" "node")
vm_name="k3s-$(get_vm_config "$vm_id" "role")-$(printf "%03d" $((vm_id % 10)))"
# Get current hostname instead of assuming we're on pve1
current_node=$(hostname)
if [[ "$target_node" != "$current_node" ]]; then
check_node "$target_node"
# Check if VM already exists on target node
printf "%-50s" "Checking if $vm_name exists on $target_node..."
if ssh -o StrictHostKeyChecking=no "$target_node" "qm list | grep -q \"^ *$vm_id \"" 2>/dev/null; then
echo "[EXISTS] - VM $vm_id already exists on $target_node, skipping migration"
echo "$(date): VM $vm_id already exists on $target_node, skipping migration" >> "$LOG_FILE"
continue
else
echo "[NOT FOUND] - Proceeding with migration"
fi
cmd="qm migrate $vm_id $target_node --with-local-disks"
printf "%-50s" "Migrating $vm_name to $target_node"
echo "$(date): Migrating $vm_name to $target_node - COMMAND: $cmd" >> "$LOG_FILE"
bash -c "$cmd" >> "$LOG_FILE" 2>&1 &
pid=$!
spinner $pid
status=0
wait $pid || status=$?  # use '||' so 'set -e' does not abort before the failure is handled below
if [ $status -eq 0 ]; then
echo "[DONE]"
else
echo "[FAILED] - Migration of $vm_name to $target_node failed. Check $LOG_FILE for details."
echo "$(date): Migration of $vm_name to $target_node failed." >> "$LOG_FILE"
# Don't exit with failure if the error might be that the VM already exists
if grep -q "File exists" "$LOG_FILE"; then
echo "It appears the VM might already exist on the target node. Continuing..."
echo "$(date): VM might already exist on target node. Continuing despite migration error." >> "$LOG_FILE"
else
exit 1
fi
fi
else
echo "VM $vm_name is already on node $current_node - skipping migration"
echo "$(date): VM $vm_name is already on node $current_node - skipping migration" >> "$LOG_FILE"
fi
fi
done
# Resize storage VM disks
echo "[5/6] Configuring storage disks..."
# Skip this step if using NoCloud ISO - disk sizing will be handled in NoCloud configuration
if [[ -z "$NOCLOUD_ISO" ]]; then
# Use filtered VM list instead of getting all storage VMs from config
storage_vm_ids=()
for vm_id in "${all_vm_ids[@]}"; do
if [[ -n "$vm_id" ]]; then
role=$(get_vm_config "$vm_id" "role")
if [[ "$role" == "storage" ]]; then
storage_vm_ids+=("$vm_id")
fi
fi
done
for vm_id in "${storage_vm_ids[@]}"; do
if [[ -n "$vm_id" ]]; then
target_node=$(get_vm_config "$vm_id" "node")
resize_disk "$vm_id" scsi0 256G "$target_node"
fi
done
else
echo "Skipping storage disk resize - will be handled in NoCloud ISO configuration"
fi
# Configure VM auto-start behavior (using shared library)
echo "[6/7] Configuring VM auto-start behavior..."
for vm_id in "${all_vm_ids[@]}"; do
if [[ -n "$vm_id" ]]; then
role=$(get_vm_config "$vm_id" "role")
target_node=$(get_vm_config "$vm_id" "node")
configure_autostart "$vm_id" "vm" "$target_node" "$DISABLE_AUTO_START"
fi
done
# Configure NoCloud ISO if specified
if [[ -n "$NOCLOUD_ISO" ]]; then
echo "[7/7] Configuring NoCloud ISO for VMs..."
for vm_id in "${all_vm_ids[@]}"; do
if [[ -n "$vm_id" ]]; then
target_node=$(get_vm_config "$vm_id" "node")
configure_nocloud_iso "$vm_id" "$NOCLOUD_ISO" "$target_node"
fi
done
else
echo "[7/7] Skipping NoCloud ISO configuration (not specified)"
fi
echo "===== VM Provisioning Complete ====="
echo "All K3s VMs have been provisioned successfully."
if [[ -n "$NOCLOUD_ISO" ]]; then
echo "All VMs have been configured with NoCloud ISO: $NOCLOUD_ISO"
fi
if [[ "$DISABLE_AUTO_START" == "true" ]]; then
echo "Auto-start on boot has been disabled for all VMs."
else
echo "Auto-start on boot has been enabled for all VMs."
fi
echo "$(date): VM Provisioning Complete - All K3s VMs have been provisioned successfully." >> "$LOG_FILE"