#!/bin/bash
set -e -o errexit -o pipefail -o nounset

###################################
# This script can be used by itself, but it's recommended that you read
# a tutorial on the Proxmox forum first: https://forum.proxmox.com/threads/hey-proxmox-community-lets-talk-about-resources-isolation.124256/
###################################

# Do not modify these variables (set by Proxmox when calling the script)
vmId="$1"
runPhase="$2"

echo "Running $runPhase on VM=$vmId"

# vCPU pinning should be done 1:1 between guest and host, especially on systems using NUMA and/or CCDs.
# On 5900x the core config, as seen in lscpu -e, looks like the following:
#   CCX #0:
#     - NUMA: node 0
#     - CPU: 0-5, 12-17 (SMT threads/host CPU#)
#     - CORE: 0-5
#   CCX #1:
#     - NUMA: node 1
#     - CPU: 6-11, 18-23
#     - CORE: 6-11
# "lstopo" shouldn't be used here, as it has a bug when RAM is not NUMA but L3 is: https://github.com/open-mpi/hwloc/issues/430
#
# This can be semi-automated with scripts taking NUMA etc. into account, but every system is different,
# so it's better to consciously tune it. Some scripts are here: https://github.com/64kramsystem/qemu-pinning#one-vcpus-per-corethread-except-one-core
# There are some unexplored ideas also at https://github.com/rokups/rokups.github.io/blob/master/pages/gaming-vm-performance.md
#
# Useful commands while debugging this code:
#   List running tasks with their current affinity (the "]" filters out kthreads):
#     ps -T -e -o psr,pid,ppid,pgid,sid,comm,cmd | grep -P '^\s+(6|7|8|9|10|11|18|19|20|21|22|23)' | grep -v -P '\]$' | sort | cut -c-$COLUMNS
#   Track cgroups resources usage: systemd-cgtop
#   See tree of cgroups: systemd-cgls

# Gets QEMU parent process PID for the current VM
getQemuPID () {
    local qemuParentPid=$(cat /run/qemu-server/$vmId.pid)
    if [[ -z $qemuParentPid ]]; then
        echo "ERROR: failed to get QEMU parent PID for VM=$vmId"
        return 1
    fi

    echo $qemuParentPid
}

# Gets the last logical CPU (thread) of the system
getLastCpu () {
    echo $(( $(nproc --all) - 1 ))
}

# Pin vCPU to a host logical CPU (thread)
# The thread SHOULD be a single one, but it can be any taskset list
#
# Since cgroups v2 (used by Proxmox) does NOT allow moving tasks/threads to
# sub-scopes, affinity has to be set per-process with taskset here.
#
# Params: vCPU# hostThread#orList
pinVCpu () {
    local vCpuNum=$1
    local hostThreadNum="$2"

    local qemuParentPid=$(getQemuPID)
    local vCpuTaskPid=$(grep "^CPU $vCpuNum/KVM\$" /proc/$qemuParentPid/task/*/comm | cut -d '/' -f5)
    if [[ -z $vCpuTaskPid ]]; then
        echo "ERROR: failed to get Task PID for vCPU $vCpuNum"
        return 1
    fi

    echo "Pinning VM $vmId (PPID=$qemuParentPid) vCPU $vCpuNum (TPID=$vCpuTaskPid) to host thread(s) $hostThreadNum"
    taskset --cpu-list --pid "$hostThreadNum" $vCpuTaskPid
}

# Pins all non-vCPU QEMU threads (io, emulator, rcu) to host logical CPU(s)
# Here the host thread parameter SHOULD probably be a list, unlike in pinVCpu
#
# Since cgroups v2 (used by Proxmox) does NOT allow moving tasks/threads to
# sub-scopes, affinity has to be set per-process with taskset here.
#
# Params: hostThread#orList
pinNonVCpuTasks () {
    local hostThreadNum="$1"

    local qemuParentPid=$(getQemuPID)
    local nonVCpuTaskPids=$(grep -v -P "^CPU \d" /proc/$qemuParentPid/task/*/comm | cut -d '/' -f5)
    while IFS= read -r tpid; do
        local taskComm=$(cat /proc/$qemuParentPid/task/$tpid/comm)
        echo "Pinning VM $vmId (PPID=$qemuParentPid) non-vCPU task \"$taskComm\" (TPID=$tpid) to host thread(s) $hostThreadNum"
        taskset --cpu-list --pid "$hostThreadNum" $tpid
    done <<< "$nonVCpuTaskPids"
}

# Kernel threads (so-called "kthreads") aren't grouped under any of the cgroups. Thus,
# to control their affinity, manual pinning is needed.
# There are hacky ways to identify kthreads, like parsing "ps", but the proper way to do
# that is to actually check the thread flags. All kernel threads are marked with the PF_KTHREAD
# mask (see https://elixir.bootlin.com/linux/v6.3-rc6/source/include/linux/sched.h#L1740)
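#
# For reference, the same check can be done by hand for a single PID; this is only an
# illustrative one-liner (field 9 of /proc/<pid>/stat is the flags bitmask), not part of
# the script's logic:
#   flags=$(awk '{print $9}' /proc/2/stat); (( flags & 0x00200000 )) && echo "PID 2 is a kthread"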
#
# Params: hostThread#orList
pinKthreads () {
    local hostThreadNum="$1"

    echo "Attempting to pin all kthreads to $hostThreadNum..."
    local procStat=""
    local pid=""
    local comm=""
    for statFile in /proc/[0-9]*/stat; do
        # This CAN sometimes fail due to TOCTOU (the process may be gone by the time we read it)
        procStat=""
        2>/dev/null read -a procStat < $statFile || true
        if [[ -z "${procStat[0]}" ]]; then continue; fi

        # Skip everything that isn't a kthread
        flags="${procStat[8]}"
        if (( ($flags & 0x00200000) != 0x00200000 )); then continue; fi

        pid="${procStat[0]}"
        comm="${procStat[1]:1:-1}"

        # This CAN fail for some kthreads that are needed on specific CPUs
        if taskset --cpu-list --pid "$hostThreadNum" $pid > /dev/null 2>&1; then
            echo "Pinned kthread \"$comm\" (PID=$pid) to host thread(s) $hostThreadNum"
        fi
    done
}

# Most IRQs can be moved away from the threads running vCPUs, as IRQ handling on those
# threads can cause jitter. This function is not perfect as it doesn't set a mask
# for not-yet-triggered IRQs (/proc/irq/default_smp_affinity). However, this shouldn't
# be needed: unless the VM is started on boot, most if not all busy IRQs will have
# been registered by now and are covered by the loop below.
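#
# If you did want to also cover not-yet-requested IRQs, the default mask could be set as
# well. Note it is a hex bitmask rather than a CPU list; e.g. CPUs 0-5,12-17 translate to
# 0x3f03f. This is only a sketch and is intentionally not done by the script:
#   echo 3f03f > /proc/irq/default_smp_affinity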
#
# Params: hostThread#orList
pinIrqs () {
    local hostThreadNum="$1"

    echo "Pinning IRQs to host thread(s) $hostThreadNum..."
    for irqAffLst in /proc/irq/*/smp_affinity_list; do
        local irqNum=$(echo "$irqAffLst" | grep -o -E '[0-9]+')
        if echo "$hostThreadNum" > $irqAffLst 2> /dev/null; then
            echo "Pinned IRQ $irqNum to host thread(s) $hostThreadNum"
        fi
    done
}

# Set governor/scaling for a host logical CPU (thread)
# Params: hostThread# desiredGovernor
setGovernor () {
    local hostCpu=$1
    local reqGov="$2"

    local curGov=$(cat /sys/devices/system/cpu/cpu$hostCpu/cpufreq/scaling_governor)
    if [[ -z "$curGov" ]]; then
        echo "ERROR: failed to query governor for CPU $hostCpu"
        return 1
    fi

    if [[ "$reqGov" == "$curGov" ]]; then
        echo "CPU $hostCpu: requested governor $reqGov - it is already set"
        return
    fi

    echo "CPU $hostCpu: changing governor from $curGov to $reqGov"
    echo "$reqGov" > /sys/devices/system/cpu/cpu$hostCpu/cpufreq/scaling_governor
}

# Sets governor/scaling on a range of host CPUs (threads). Range is inclusive.
# Params: hostThreadFrom# hostThreadTo# desiredGovernor
setGovernorRange () {
    for (( i=$1; i<=$2; i++ )); do
        setGovernor $i "$3"
    done
}

# Resets governor/scaling to default state
resetGovernor () {
    echo "Resetting CPU governor to default"
    service cpufrequtils restart
}

# Put host CPU (thread) into offline or online state
# Params: hostThread# desiredState{0,1}
setCpuState () {
    local hostCpu=$1
    local reqState=$2

    local curState=$(cat /sys/devices/system/cpu/cpu$hostCpu/online)
    if [[ -z "$curState" ]]; then
        echo "ERROR: failed to get online status for CPU $hostCpu"
        return 1
    fi

    if [[ "$reqState" == "$curState" ]]; then
        echo "CPU $hostCpu: requested state $reqState - it is already set"
        return
    fi

    echo -n "CPU $hostCpu: changing state from $curState to $reqState... "
    # The write is guarded by "if" so a refused state change doesn't trip errexit before we can report it
    if echo $reqState > /sys/devices/system/cpu/cpu$hostCpu/online; then
        echo "[OK]"
    else
        echo "[FAILED]"
        return 1
    fi
}

# Put host CPU (thread) range into offline or online state. Range is inclusive.
# Params: hostThreadFrom# hostThreadTo# desiredState{0,1}
setCpuStateRange () {
    for (( i=$1; i<=$2; i++ )); do
        setCpuState $i $3
    done
}
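
# Drops caches and compacts memory; as noted in the pre-start phase below, this is done to
# improve the odds of getting contiguous memory (e.g. for hugepages) before the VM starts
# Params: <none>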
tidyCaches () {
    echo -n "Tidying caches... "
    sync
    echo 3 > /proc/sys/vm/drop_caches
    echo 1 > /proc/sys/vm/compact_memory
    echo "[OK]"
}

# Sets cgroup slice or scope CPU isolation
# Params: sliceOrScopeName hostThreadsList (e.g. 11,12,13-19)
setCgroupAllowedCpus () {
    local entity="$1"
    local allowedCpus="$2"

    echo "Forcing \"$entity\" cgroup to only use CPU(s) $allowedCpus"
    systemctl set-property --runtime -- "$entity" "AllowedCPUs=$allowedCpus"
}

# Sets logical CPUs (threads) which can be used by processes on the host
# Params: hostThreadsList (e.g. 11,12,13-19)
setHostAllowedCpus () {
    echo "Setting host userland CPU constraint to $1"
    setCgroupAllowedCpus "init.scope" "$1"
    setCgroupAllowedCpus "system.slice" "$1"
    setCgroupAllowedCpus "user.slice" "$1"
}

# Sets logical CPUs (threads) which can be used by QEMU processes
# Params: hostThreadsList (e.g. 11,12,13-19)
setQemuAllowedCpus () {
    echo "Setting QEMU default CPU constraint to $1"
    setCgroupAllowedCpus "qemu.slice" "$1"
}

# Makes sure that a decoupled slice for some QEMU VMs exists
# This will only do something the first time a VM starts
# Params: <none>
ensureQemuDecoupledSlice () {
    if [[ -d "/sys/fs/cgroup/qemu-decoupled.slice" ]]; then
        return 0
    fi

    echo "Creating decoupled QEMU cgroup"
    mkdir /sys/fs/cgroup/qemu-decoupled.slice

    # The slice itself MUST be allowed to run on ALL CPUs. The reason
    # for that is we will move vCPUs to an isolated set of cores BUT
    # put emulator and iothread(s) on the shared CPUs. Since cgroups v2
    # doesn't allow a thread/task to be in a different cgroup than the
    # parent, these tasks must stay in the qemu-decoupled.slice but with
    # different affinity
    local lastCPU=$(getLastCpu)
    setCgroupAllowedCpus "qemu-decoupled.slice" "0-$lastCPU"
}

# Moves the VM to an isolated cgroup, outside of the OS user/system/init groups, as well
# as away from the standard qemu.slice used by Proxmox; see systemd-cgls
#
# All processes from the host run under system.slice and user.slice, while all QEMU machines run
# under qemu.slice. Proxmox actually hardcodes that slice in their startup code:
# https://github.com/proxmox/qemu-server/blob/79f5ca393ab3608ff2e82c929167f079f964a505/PVE/QemuServer.pm#L5892-L5893
# This means that setting "setQemuAllowedCpus" to the 1st CCX makes it impossible to pin vCPU
# threads to the 2nd CCX (taskset will fail), as the parent slice where the thread/service is
# running will enforce the 1st-CCX-only AllowedCPUs. The only way around this I found is to migrate
# the VM scope (each one gets a separate one named <VMID>.scope) to a different scope which isn't
# under any of the standard slices. However, this is not supported by systemd, as confirmed by one
# of the systemd authors: https://www.spinics.net/lists/systemd-devel/msg04072.html but cgroups can
# be used directly (albeit without warranties).
#
# Params: <none>
decoupleQemuVm () {
    ensureQemuDecoupledSlice

    local vmScope="/sys/fs/cgroup/qemu-decoupled.slice/$vmId.scope"
    if [[ ! -d "$vmScope" ]]; then
        echo "Creating cgroups scope for VMID=$vmId at $vmScope"
        mkdir "$vmScope"
    fi

    local qemuParentPid=$(getQemuPID)
    echo "Migrating VMID=$vmId PPID=$qemuParentPid to scope $vmScope"
    echo $qemuParentPid > "$vmScope/cgroup.procs"
}
# Starts/stops the "idle" windows VM to force very low GPU power states
setIdleVm () {
    echo "Setting idle VM to $1"
    qm "$1" 107
}

# Since updates around 2023/03/20-22, GPUs and some other PCIe devices will only work once.
# When the VM is turned off and on, it will just black-screen and the VM never boots. This is a
# workaround for that issue.
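#
# The devices to reset are discovered from the VM's own config: any "hostpciN:" entry in
# /etc/pve/qemu-server/<vmid>.conf is picked up by the grep below, so nothing has to be
# hardcoded here. An illustrative (made-up) entry would look like:
#   hostpci0: 0000:0b:00,pcie=1,x-vga=1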
#
# Params: <none>
resetVmPciDevices () {
    echo "Resetting VM PCI devices..."
    local pciAddrFun=''
    local vmPciDevices=$(grep -E '^hostpci[0-9]+:' "/etc/pve/qemu-server/$vmId.conf" | grep -o -E '[0-9a-f]+:[0-9a-f]+:[0-9a-f]+(\.[0-9]*)?')

    while IFS= read -r pciAddr; do
        # Single function (mostly SR-IOV or vGPU) device
        if echo "$pciAddr" | grep -F '.' > /dev/null; then
            echo "Removing PCI device function at $pciAddr"
            echo 1 > "/sys/bus/pci/devices/$pciAddr/remove" || true
            continue
        fi

        # Whole device specified => remove all functions
        for pciAddrFunRm in /sys/bus/pci/devices/$pciAddr.*/remove; do
            pciAddrFun=$(echo $pciAddrFunRm | grep -o -E '\.[0-9]*')
            echo "Removing PCI device $pciAddr function $pciAddrFun"
            echo 1 > "$pciAddrFunRm" || true

            # This is absolutely required. Attempting to remove one function CAN
            # remove all of them, but it's not instantaneous. However, if you hit
            # such a case and try to manually do /remove on another function while
            # the first is being removed, a "general protection fault" will happen
            # in the subsequent "pci_stop_and_remove_bus_device_locked()"
            while [[ -f "$pciAddrFunRm" ]]; do
                sleep 1
                echo "Still waiting for $pciAddrFunRm..."
            done
        done
    done <<< "$vmPciDevices"

    echo "Re-scanning PCI devices..."
    echo 1 > /sys/bus/pci/rescan

    # The rescan is asynchronous; if we wanted to be 100% correct here we should wait
    # for /sys entries to appear, but a 2-second delay is good enough
    sleep 2
}

| case "$runPhase" in | |
| pre-start) | |
| # Stop idle VM, drop caches & compact memory for hugepages | |
| setIdleVm shutdown | |
| tidyCaches | |
| resetVmPciDevices | |
| ;; | |
    # Designate the 2nd CCD (cores 6-11, threads 6-11+18-23) to the VM and the 1st CCD to host/housekeeping stuff.
    # All modifications should be done in post-start, as doing them in pre-start would execute them even
    # if the VM fails to start (and thus post-stop would never be called)
    post-start)
        # This will inform cgroups via systemd to not use the 2nd CCX, effectively constraining the host to the 1st CCX.
        # This isn't perfect as it will not stop kthreads. "cset" used to mostly work for kthreads (except e.g. docker &
        # ZFS), but it doesn't work with cgroups v2: https://forum.proxmox.com/threads/cset-failing-pve7.95613/
        # I have no idea about any alternatives besides the CPU hotplug hack (see below)
        # WARNING: THIS MUST BE DONE BEFORE ANY OTHER PINNING. Manipulating slice/scope CPU lists will reset
        # any manual pinning due to a systemd bug/design choice: https://github.com/systemd/systemd/issues/23748
        # The "setQemuAllowedCpus" will be overwritten for just this VM by "decoupleQemuVm" later.
        setHostAllowedCpus "0-5,12-17"
        setQemuAllowedCpus "0-5,12-17"

        # Forcefully move all tasks (user space & kthreads) off the 2nd CCX by offlining its CPUs temporarily
        echo "Offlining to-be-pinned CPUs to move tasks away..."
        setCpuStateRange 6 11 0
        setCpuStateRange 18 23 0

        # Move kernel threads & IRQs away from vCPU threads.
        # Doing this while the CPUs are offline makes it easier, as
        # nothing is actively running on them
        pinIrqs "0-5,12-17"
        pinKthreads "0-5,12-17"

        # Bring the second CCX online - nothing should be scheduled on it due to the host & QEMU constraints from above
        echo "Onlining to-be-pinned CPUs..."
        setCpuStateRange 6 11 1
        setCpuStateRange 18 23 1

        # Set frequency scaling to performance mode
        setGovernorRange 6 11 performance
        setGovernorRange 18 23 performance

        # Stats generation causes jitter in VR
        sysctl vm.stat_interval=120

        # Migrate this VM to a separate isolation group (TLDR: see systemd-cgls)
        # An alternative hacky way to do that would be to iterate over all currently running VMs and
        # taskset their affinity to the 1st CCX, but a new VM starting while this one is running would
        # break this. So, it's better to isolate the whole qemu.slice with the exception of this VM. That
        # requires the VM process to be moved to a non-qemu.slice scope
        decoupleQemuVm

        # Pin vCPUs to the correct threads - this is crucial.
        # Since SMT/HT is enabled and proper SMT topology is passed to the guest, the vCPUs need to be pinned
        # to the correct host logical CPUs. QEMU assigns vCPUs sequentially; i.e. vCPU0 == 1st thread of the
        # first guest core, vCPU1 == 2nd thread of the first guest core, vCPU2 == 1st thread of the second
        # guest core, etc.
        # In Linux (at least here, according to lscpu -e) CPU0 is the 1st thread of the first core, with
        # CPU12 being the 2nd/SMT thread of the first core. For the 2nd CCX it's a 6+18, 7+19, 8+20, etc.
        # mapping.
        pinVCpu 0 6
        pinVCpu 1 18
        pinVCpu 2 7
        pinVCpu 3 19
        pinVCpu 4 8
        pinVCpu 5 20
        pinVCpu 6 9
        pinVCpu 7 21
        pinVCpu 8 10
        pinVCpu 9 22
        pinVCpu 10 11
        pinVCpu 11 23

        # Move all QEMU threads (emulator, iothread) of this VM to the 1st CCX. This is pretty dumb. The IOThread should
        # probably be pinned to a single core, but we're counting on the host scheduler being smart.
        # To do static pinning here, QMP needs to be used to query the types of threads:
        # https://wiki.qemu.org/Documentation/QMP
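        # For instance, the iothread TIDs could in principle be read via the VM's QMP socket.
        # This is an illustrative sketch only (Proxmox exposes a socket at /var/run/qemu-server/<vmid>.qmp,
        # and QMP requires the "qmp_capabilities" handshake before "query-iothreads"):
        #   echo '{"execute":"qmp_capabilities"} {"execute":"query-iothreads"}' | \
        #     socat - "UNIX-CONNECT:/var/run/qemu-server/<vmid>.qmp"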
        pinNonVCpuTasks "0-5,12-17"
        ;;

    pre-stop)
        ;;

    post-stop)
        lastCpu=$(getLastCpu)

        # Allow kthreads, IRQs, host & QEMU to use all CPUs again
        pinKthreads "0-$lastCpu"
        pinIrqs "0-$lastCpu"
        setHostAllowedCpus "0-$lastCpu"
        setQemuAllowedCpus "0-$lastCpu"

        # Restore default scaling
        resetGovernor

        # Restore default virtual mem stats frequency
        sysctl vm.stat_interval=1

        # Start idle VM
        resetVmPciDevices
        setIdleVm start
        ;;

    *)
        echo "Unknown run phase \"$runPhase\"!"
        ;;
esac

echo "Finished $runPhase on VM=$vmId"

After having it work a few times, it stopped working for some reason and now gives this error:

Migrating VMID=301 PPID=9671 to scope /sys/fs/cgroup/qemu-decoupled.slice/301.scope
/var/lib/vz/snippets/proxmox-hook.sh: line 297: echo: write error: Invalid argument

Line 297 is: echo $qemuParentPid > "$vmScope/cgroup.procs"
I'm having the same issue. It's something related to cgroup v2 inner workings.
I did a lot of research but couldn't find any workarounds yet.
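For anyone debugging this: "Invalid argument" (EINVAL) on the cgroup.procs write means the kernel rejected the write itself, so it is worth checking both what is actually being written and what the target scope looks like at that moment. A rough diagnostic sketch, assuming the stock paths from the script and VM 301:

    cat /run/qemu-server/301.pid                                   # what PID is being written
    cat /proc/$(cat /run/qemu-server/301.pid)/cgroup               # where that process currently lives
    cat /sys/fs/cgroup/qemu-decoupled.slice/301.scope/cgroup.type  # type of the target cgroup
    cat /sys/fs/cgroup/qemu-decoupled.slice/cgroup.subtree_control # controllers enabled on the parent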
Hi, quite a complete script... I'm not fully sure about one thing, though. In the "Resetting VM PCI devices" section, does it look up which card is being passed through, find it, and then remove/reset/rescan it? I see a two-liner where you have to define the card, but if your script can detect it and do it directly, that's very nice. Thanks for the clarification.