#!/bin/bash
set -e -o errexit -o pipefail -o nounset

###################################
# This script can be used by itself, but it's recommended that you read
# a tutorial on the Proxmox forum first: https://forum.proxmox.com/threads/hey-proxmox-community-lets-talk-about-resources-isolation.124256/
###################################

# Do not modify these variables (set by Proxmox when calling the script)
vmId="$1"
runPhase="$2"

echo "Running $runPhase on VM=$vmId"

# vCPU pinning should be done 1:1 between guest and host, especially on systems using NUMA and/or CCDs.
# On 5900x the core config, as seen in lscpu -e, looks like the following:
#   CCX #0:
#     - NUMA: node 0
#     - CPU: 0-5, 12-17 (SMT threads/host CPU#)
#     - CORE: 0-5
#   CCX #1:
#     - NUMA: node 1
#     - CPU: 6-11, 18-23
#     - CORE: 6-11
# "lstopo" shouldn't be used here, as it has a bug when RAM is not NUMA but L3 is: https://github.com/open-mpi/hwloc/issues/430
#
# This can be semi-automated with scripts taking NUMA etc. into account, but every system is different,
# so it's better to consciously tune it. Some scripts are here: https://github.com/64kramsystem/qemu-pinning#one-vcpus-per-corethread-except-one-core
# There are some unexplored ideas also at https://github.com/rokups/rokups.github.io/blob/master/pages/gaming-vm-performance.md
#
# Useful commands while debugging this code:
#   List running tasks with their current affinity (the "]" filters out kthreads):
#     ps -T -e -o psr,pid,ppid,pgid,sid,comm,cmd | grep -P '^\s+(6|7|8|9|10|11|18|19|20|21|22|23)' | grep -v -P '\]$' | sort | cut -c-$COLUMNS
#   Track cgroups resources usage: systemd-cgtop
#   See tree of cgroups: systemd-cgls

# Gets QEMU parent process PID for the current VM
getQemuPID () {
    local qemuParentPid=$(cat /run/qemu-server/$vmId.pid)
    if [[ -z $qemuParentPid ]]; then
        echo "ERROR: failed to get QEMU parent PID for VM=$vmId"
        return 1
    fi

    echo $qemuParentPid
}

# Gets the last logical CPU (thread) of the system
getLastCpu () {
    echo $(( $(nproc --all) - 1 ))
}

# Pin vCPU to a host logical CPU (thread)
# The thread SHOULD be a single one, but it can be any taskset list
#
# Since cgroups v2 (used by Proxmox) does NOT allow moving tasks/threads to
# sub-scopes, affinity has to be set per-process with taskset here.
#
# Params: vCPU# hostThread#orList
pinVCpu () {
    local vCpuNum=$1
    local hostThreadNum="$2"

    local qemuParentPid=$(getQemuPID)
    local vCpuTaskPid=$(grep "^CPU $vCpuNum/KVM\$" /proc/$qemuParentPid/task/*/comm | cut -d '/' -f5)
    if [[ -z $vCpuTaskPid ]]; then
        echo "ERROR: failed to get Task PID for vCPU $vCpuNum"
        return 1
    fi

    echo "Pinning VM $vmId (PPID=$qemuParentPid) vCPU $vCpuNum (TPID=$vCpuTaskPid) to host thread(s) $hostThreadNum"
    taskset --cpu-list --pid "$hostThreadNum" $vCpuTaskPid
}

# Pins all non-vCPU QEMU threads (io, emulator, rcu) to host logical CPU(s)
# Here the host thread parameter SHOULD probably be a list, unlike in pinVCpu
#
# Since cgroups v2 (used by Proxmox) does NOT allow moving tasks/threads to
# sub-scopes, affinity has to be set per-process with taskset here.
#
# Params: hostThread#orList
pinNonVCpuTasks () {
    local hostThreadNum="$1"

    local qemuParentPid=$(getQemuPID)
    local nonVCpuTaskPids=$(grep -v -P "^CPU \d" /proc/$qemuParentPid/task/*/comm | cut -d '/' -f5)
    while IFS= read -r tpid; do
        local taskComm=$(cat /proc/$qemuParentPid/task/$tpid/comm)
        echo "Pinning VM $vmId (PPID=$qemuParentPid) non-vCPU task \"$taskComm\" (TPID=$tpid) to host thread(s) $hostThreadNum"
        taskset --cpu-list --pid "$hostThreadNum" $tpid
    done <<< "$nonVCpuTaskPids"
}

# Kernel threads (so-called "kthreads") aren't grouped under any of the cgroups. Thus,
# to control their affinity, manual pinning is needed.
# There are hacky ways to identify kthreads, like parsing "ps", but the proper way to do
# that is to actually check the thread flags. All kernel threads are marked with the PF_KTHREAD
# mask (see https://elixir.bootlin.com/linux/v6.3-rc6/source/include/linux/sched.h#L1740)
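#
# For reference, the same check can be done by hand for a single PID; this is only an
# illustrative one-liner (field 9 of /proc/<pid>/stat is the flags bitmask), not part of
# the script's logic:
#   flags=$(awk '{print $9}' /proc/2/stat); (( flags & 0x00200000 )) && echo "PID 2 is a kthread"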
#
# Params: hostThread#orList
pinKthreads () {
    local hostThreadNum="$1"

    echo "Attempting to pin all kthreads to $hostThreadNum..."
    local procStat=""
    local pid=""
    local comm=""
    for statFile in /proc/[0-9]*/stat; do
        # This CAN sometimes fail due to TOCTOU (the process may be gone by the time we read it)
        procStat=""
        2>/dev/null read -a procStat < $statFile || true
        if [[ -z "${procStat[0]}" ]]; then continue; fi

        # Skip everything that isn't a kthread
        flags="${procStat[8]}"
        if (( ($flags & 0x00200000) != 0x00200000 )); then continue; fi

        pid="${procStat[0]}"
        comm="${procStat[1]:1:-1}"

        # This CAN fail for some kthreads that are needed on specific CPUs
        if taskset --cpu-list --pid "$hostThreadNum" $pid > /dev/null 2>&1; then
            echo "Pinned kthread \"$comm\" (PID=$pid) to host thread(s) $hostThreadNum"
        fi
    done
}

# Most IRQs can be moved away from the threads running vCPUs, as IRQ handling on those
# threads can cause jitter. This function is not perfect as it doesn't set a mask
# for not-yet-triggered IRQs (/proc/irq/default_smp_affinity). However, this shouldn't
# be needed: unless the VM is started on boot, most if not all busy IRQs will have
# been registered by now and are covered by the loop below.
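#
# If you did want to also cover not-yet-requested IRQs, the default mask could be set as
# well. Note it is a hex bitmask rather than a CPU list; e.g. CPUs 0-5,12-17 translate to
# 0x3f03f. This is only a sketch and is intentionally not done by the script:
#   echo 3f03f > /proc/irq/default_smp_affinity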
#
# Params: hostThread#orList
pinIrqs () {
    local hostThreadNum="$1"

    echo "Pinning IRQs to host thread(s) $hostThreadNum..."
    for irqAffLst in /proc/irq/*/smp_affinity_list; do
        local irqNum=$(echo "$irqAffLst" | grep -o -E '[0-9]+')
        if echo "$hostThreadNum" > $irqAffLst 2> /dev/null; then
            echo "Pinned IRQ $irqNum to host thread(s) $hostThreadNum"
        fi
    done
}

# Set governor/scaling for a host logical CPU (thread)
# Params: hostThread# desiredGovernor
setGovernor () {
    local hostCpu=$1
    local reqGov="$2"

    local curGov=$(cat /sys/devices/system/cpu/cpu$hostCpu/cpufreq/scaling_governor)
    if [[ -z "$curGov" ]]; then
        echo "ERROR: failed to query governor for CPU $hostCpu"
        return 1
    fi

    if [[ "$reqGov" == "$curGov" ]]; then
        echo "CPU $hostCpu: requested governor $reqGov - it is already set"
        return
    fi

    echo "CPU $hostCpu: changing governor from $curGov to $reqGov"
    echo "$reqGov" > /sys/devices/system/cpu/cpu$hostCpu/cpufreq/scaling_governor
}

# Sets governor/scaling on a range of host CPUs (threads). Range is inclusive.
# Params: hostThreadFrom# hostThreadTo# desiredGovernor
setGovernorRange () {
    for (( i=$1; i<=$2; i++ )); do
        setGovernor $i "$3"
    done
}

# Resets governor/scaling to default state
resetGovernor () {
    echo "Resetting CPU governor to default"
    service cpufrequtils restart
}

# Put host CPU (thread) into offline or online state
# Params: hostThread# desiredState{0,1}
setCpuState () {
    local hostCpu=$1
    local reqState=$2

    local curState=$(cat /sys/devices/system/cpu/cpu$hostCpu/online)
    if [[ -z "$curState" ]]; then
        echo "ERROR: failed to get online status for CPU $hostCpu"
        return 1
    fi

    if [[ "$reqState" == "$curState" ]]; then
        echo "CPU $hostCpu: requested state $reqState - it is already set"
        return
    fi

    echo -n "CPU $hostCpu: changing state from $curState to $reqState... "
    # The write is guarded by "if" so a refused state change doesn't trip errexit before we can report it
    if echo $reqState > /sys/devices/system/cpu/cpu$hostCpu/online; then
        echo "[OK]"
    else
        echo "[FAILED]"
        return 1
    fi
}

# Put host CPU (thread) range into offline or online state. Range is inclusive.
# Params: hostThreadFrom# hostThreadTo# desiredState{0,1}
setCpuStateRange () {
    for (( i=$1; i<=$2; i++ )); do
        setCpuState $i $3
    done
}
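
# Drops caches and compacts memory; as noted in the pre-start phase below, this is done to
# improve the odds of getting contiguous memory (e.g. for hugepages) before the VM starts
# Params: <none>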
tidyCaches () {
    echo -n "Tidying caches... "
    sync
    echo 3 > /proc/sys/vm/drop_caches
    echo 1 > /proc/sys/vm/compact_memory
    echo "[OK]"
}

# Sets cgroup slice or scope CPU isolation
# Params: sliceOrScopeName hostThreadsList (e.g. 11,12,13-19)
setCgroupAllowedCpus () {
    local entity="$1"
    local allowedCpus="$2"

    echo "Forcing \"$entity\" cgroup to only use CPU(s) $allowedCpus"
    systemctl set-property --runtime -- "$entity" "AllowedCPUs=$allowedCpus"
}

# Sets logical CPUs (threads) which can be used by processes on the host
# Params: hostThreadsList (e.g. 11,12,13-19)
setHostAllowedCpus () {
    echo "Setting host userland CPU constraint to $1"
    setCgroupAllowedCpus "init.scope" "$1"
    setCgroupAllowedCpus "system.slice" "$1"
    setCgroupAllowedCpus "user.slice" "$1"
}

# Sets logical CPUs (threads) which can be used by QEMU processes
# Params: hostThreadsList (e.g. 11,12,13-19)
setQemuAllowedCpus () {
    echo "Setting QEMU default CPU constraint to $1"
    setCgroupAllowedCpus "qemu.slice" "$1"
}

# Makes sure that a decoupled slice for some QEMU VMs exists
# This will only do something the first time a VM starts
# Params: <none>
ensureQemuDecoupledSlice () {
    if [[ -d "/sys/fs/cgroup/qemu-decoupled.slice" ]]; then
        return 0
    fi

    echo "Creating decoupled QEMU cgroup"
    mkdir /sys/fs/cgroup/qemu-decoupled.slice

    # The slice itself MUST be allowed to run on ALL CPUs. The reason
    # for that is we will move vCPUs to an isolated set of cores BUT
    # put emulator and iothread(s) on the shared CPUs. Since cgroups v2
    # doesn't allow a thread/task to be in a different cgroup than the
    # parent, these tasks must stay in the qemu-decoupled.slice but with
    # different affinity
    local lastCPU=$(getLastCpu)
    setCgroupAllowedCpus "qemu-decoupled.slice" "0-$lastCPU"
}

# Moves the VM to an isolated cgroup, outside of the OS user/system/init groups, as well
# as away from the standard qemu.slice used by Proxmox; see systemd-cgls
#
# All processes from the host run under system.slice and user.slice, while all QEMU machines run
# under qemu.slice. Proxmox actually hardcodes that slice in their startup code:
# https://github.com/proxmox/qemu-server/blob/79f5ca393ab3608ff2e82c929167f079f964a505/PVE/QemuServer.pm#L5892-L5893
# This means that setting "setQemuAllowedCpus" to the 1st CCX makes it impossible to pin vCPU
# threads to the 2nd CCX (taskset will fail), as the parent slice where the thread/service is
# running will enforce the 1st-CCX-only AllowedCPUs. The only way around this I found is to migrate
# the VM scope (each one gets a separate one named <VMID>.scope) to a different scope which isn't
# under any of the standard slices. However, this is not supported by systemd, as confirmed by one
# of the systemd authors: https://www.spinics.net/lists/systemd-devel/msg04072.html but cgroups can
# be used directly (albeit without warranties).
#
# Params: <none>
decoupleQemuVm () {
    ensureQemuDecoupledSlice

    local vmScope="/sys/fs/cgroup/qemu-decoupled.slice/$vmId.scope"
    if [[ ! -d "$vmScope" ]]; then
        echo "Creating cgroups scope for VMID=$vmId at $vmScope"
        mkdir "$vmScope"
    fi

    local qemuParentPid=$(getQemuPID)
    echo "Migrating VMID=$vmId PPID=$qemuParentPid to scope $vmScope"
    echo $qemuParentPid > "$vmScope/cgroup.procs"
}
# Starts/stops the "idle" windows VM to force very low GPU power states
setIdleVm () {
    echo "Setting idle VM to $1"
    qm "$1" 107
}

# Since updates around 2023/03/20-22, GPUs and some other PCIe devices will only work once.
# When the VM is turned off and on, it will just black-screen and the VM never boots. This is a
# workaround for that issue.
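#
# The devices to reset are discovered from the VM's own config: any "hostpciN:" entry in
# /etc/pve/qemu-server/<vmid>.conf is picked up by the grep below, so nothing has to be
# hardcoded here. An illustrative (made-up) entry would look like:
#   hostpci0: 0000:0b:00,pcie=1,x-vga=1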
#
# Params: <none>
resetVmPciDevices () {
    echo "Resetting VM PCI devices..."
    local pciAddrFun=''
    local vmPciDevices=$(grep -E '^hostpci[0-9]+:' "/etc/pve/qemu-server/$vmId.conf" | grep -o -E '[0-9a-f]+:[0-9a-f]+:[0-9a-f]+(\.[0-9]*)?')

    while IFS= read -r pciAddr; do
        # Single function (mostly SR-IOV or vGPU) device
        if echo "$pciAddr" | grep -F '.' > /dev/null; then
            echo "Removing PCI device function at $pciAddr"
            echo 1 > "/sys/bus/pci/devices/$pciAddr/remove" || true
            continue
        fi

        # Whole device specified => remove all functions
        for pciAddrFunRm in /sys/bus/pci/devices/$pciAddr.*/remove; do
            pciAddrFun=$(echo $pciAddrFunRm | grep -o -E '\.[0-9]*')
            echo "Removing PCI device $pciAddr function $pciAddrFun"
            echo 1 > "$pciAddrFunRm" || true

            # This is absolutely required. Attempting to remove one function CAN
            # remove all of them, but it's not instantaneous. However, if you hit
            # such a case and try to manually do /remove on another function while
            # the first is being removed, a "general protection fault" will happen
            # in the subsequent "pci_stop_and_remove_bus_device_locked()"
            while [[ -f "$pciAddrFunRm" ]]; do
                sleep 1
                echo "Still waiting for $pciAddrFunRm..."
            done
        done
    done <<< "$vmPciDevices"

    echo "Re-scanning PCI devices..."
    echo 1 > /sys/bus/pci/rescan

    # The rescan is asynchronous; if we wanted to be 100% correct here we should wait
    # for /sys entries to appear, but a 2-second delay is good enough
    sleep 2
}

| case "$runPhase" in | |
| pre-start) | |
| # Stop idle VM, drop caches & compact memory for hugepages | |
| setIdleVm shutdown | |
| tidyCaches | |
| resetVmPciDevices | |
| ;; | |
    # Designate the 2nd CCD (cores 6-11, threads 6-11+18-23) to the VM and the 1st CCD to host/housekeeping stuff.
    # All modifications should be done in post-start, as doing them in pre-start would execute them even
    # if the VM fails to start (and thus post-stop would never be called)
    post-start)
        # This will inform cgroups via systemd to not use the 2nd CCX, effectively constraining the host to the 1st CCX.
        # This isn't perfect as it will not stop kthreads. "cset" used to mostly work for kthreads (except e.g. docker &
        # ZFS), but it doesn't work with cgroups v2: https://forum.proxmox.com/threads/cset-failing-pve7.95613/
        # I have no idea about any alternatives besides the CPU hotplug hack (see below)
        # WARNING: THIS MUST BE DONE BEFORE ANY OTHER PINNING. Manipulating slice/scope CPU lists will reset
        # any manual pinning due to a systemd bug/design choice: https://github.com/systemd/systemd/issues/23748
        # The "setQemuAllowedCpus" will be overwritten for just this VM by "decoupleQemuVm" later.
        setHostAllowedCpus "0-5,12-17"
        setQemuAllowedCpus "0-5,12-17"

        # Forcefully move all tasks (user space & kthreads) off the 2nd CCX by offlining its CPUs temporarily
        echo "Offlining to-be-pinned CPUs to move tasks away..."
        setCpuStateRange 6 11 0
        setCpuStateRange 18 23 0

        # Move kernel threads & IRQs away from vCPU threads.
        # Doing this while the CPUs are offline makes it easier, as
        # nothing is actively running on them
        pinIrqs "0-5,12-17"
        pinKthreads "0-5,12-17"

        # Bring the second CCX online - nothing should be scheduled on it due to the host & QEMU constraints from above
        echo "Onlining to-be-pinned CPUs..."
        setCpuStateRange 6 11 1
        setCpuStateRange 18 23 1

        # Set frequency scaling to performance mode
        setGovernorRange 6 11 performance
        setGovernorRange 18 23 performance

        # Stats generation causes jitter in VR
        sysctl vm.stat_interval=120

        # Migrate this VM to a separate isolation group (TLDR: see systemd-cgls)
        # An alternative hacky way to do that would be to iterate over all currently running VMs and
        # taskset their affinity to the 1st CCX, but a new VM starting while this one is running would
        # break this. So, it's better to isolate the whole qemu.slice with the exception of this VM. That
        # requires the VM process to be moved to a non-qemu.slice scope
        decoupleQemuVm

        # Pin vCPUs to the correct threads - this is crucial.
        # Since SMT/HT is enabled and proper SMT topology is passed to the guest, the vCPUs need to be pinned
        # to the correct host logical CPUs. QEMU assigns vCPUs sequentially; i.e. vCPU0 == 1st thread of the
        # first guest core, vCPU1 == 2nd thread of the first guest core, vCPU2 == 1st thread of the second
        # guest core, etc.
        # In Linux (at least here, according to lscpu -e) CPU0 is the 1st thread of the first core, with
        # CPU12 being the 2nd/SMT thread of the first core. For the 2nd CCX it's a 6+18, 7+19, 8+20, etc.
        # mapping.
        pinVCpu 0 6
        pinVCpu 1 18
        pinVCpu 2 7
        pinVCpu 3 19
        pinVCpu 4 8
        pinVCpu 5 20
        pinVCpu 6 9
        pinVCpu 7 21
        pinVCpu 8 10
        pinVCpu 9 22
        pinVCpu 10 11
        pinVCpu 11 23

        # Move all QEMU threads (emulator, iothread) of this VM to the 1st CCX. This is pretty dumb. The IOThread should
        # probably be pinned to a single core, but we're counting on the host scheduler being smart.
        # To do static pinning here, QMP needs to be used to query the types of threads:
        # https://wiki.qemu.org/Documentation/QMP
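        # For instance, the iothread TIDs could in principle be read via the VM's QMP socket.
        # This is an illustrative sketch only (Proxmox exposes a socket at /var/run/qemu-server/<vmid>.qmp,
        # and QMP requires the "qmp_capabilities" handshake before "query-iothreads"):
        #   echo '{"execute":"qmp_capabilities"} {"execute":"query-iothreads"}' | \
        #     socat - "UNIX-CONNECT:/var/run/qemu-server/<vmid>.qmp"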
        pinNonVCpuTasks "0-5,12-17"
        ;;

    pre-stop)
        ;;

    post-stop)
        lastCpu=$(getLastCpu)

        # Allow kthreads, IRQs, host & QEMU to use all CPUs again
        pinKthreads "0-$lastCpu"
        pinIrqs "0-$lastCpu"
        setHostAllowedCpus "0-$lastCpu"
        setQemuAllowedCpus "0-$lastCpu"

        # Restore default scaling
        resetGovernor

        # Restore default virtual mem stats frequency
        sysctl vm.stat_interval=1

        # Start idle VM
        resetVmPciDevices
        setIdleVm start
        ;;

    *)
        echo "Unknown run phase \"$runPhase\"!"
        ;;
esac

echo "Finished $runPhase on VM=$vmId"

After having it work a few times, it stopped working for some reason and now gives this error:

Migrating VMID=301 PPID=9671 to scope /sys/fs/cgroup/qemu-decoupled.slice/301.scope
/var/lib/vz/snippets/proxmox-hook.sh: line 297: echo: write error: Invalid argument

Line 297 is: echo $qemuParentPid > "$vmScope/cgroup.procs"
I'm having the same issue. It's something related to cgroup v2 inner workings.
I did a lot of research but couldn't find any workarounds yet.
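For anyone debugging this: "Invalid argument" (EINVAL) on the cgroup.procs write means the kernel rejected the write itself, so it is worth checking both what is actually being written and what the target scope looks like at that moment. A rough diagnostic sketch, assuming the stock paths from the script and VM 301:

    cat /run/qemu-server/301.pid                                   # what PID is being written
    cat /proc/$(cat /run/qemu-server/301.pid)/cgroup               # where that process currently lives
    cat /sys/fs/cgroup/qemu-decoupled.slice/301.scope/cgroup.type  # type of the target cgroup
    cat /sys/fs/cgroup/qemu-decoupled.slice/cgroup.subtree_control # controllers enabled on the parent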
Hi, quite a complete script... I'm not fully sure about one thing, though. In the "Resetting VM PCI devices" section, does it look up which card is being passed through, find it, and then remove/reset/rescan it? I see a two-liner where you have to define the card, but if your script can detect it and do it directly, that's very nice. Thanks for the clarification.