Skip to content

Instantly share code, notes, and snippets.

@legraphista
Last active April 3, 2025 03:14
Show Gist options
  • Save legraphista/c7f11c29dcc415a309406ae6da941e6e to your computer and use it in GitHub Desktop.
Save legraphista/c7f11c29dcc415a309406ae6da941e6e to your computer and use it in GitHub Desktop.
nvidia driver with p2p support for rtx 4090
# step 0 - cleanup your existing drivers
# Purge every installed package whose name matches the NVIDIA / CUDA globs below.
sudo apt-get --purge remove "*nvidia*"
sudo apt-get --purge remove "*cuda*" "*cudnn*" "*cublas*" "*cufft*" "*cufile*" "*curand*" "*cusolver*" "*cusparse*" "*gds-tools*" "*npp*" "*nvjpeg*" "nsight*" "*nvvm*" "*libnccl*"

# step 0.1 - disable iommu
# `ls -A` prints nothing when the directory is empty. It is used instead of `ll`,
# which is only a shell alias and is not defined in every shell.
ls -A /sys/class/iommu/
# if this folder is empty (no output), continue
# if the folder is not empty, see https://docs.dolphinics.com/latest/guides/iommu.html

sudo reboot

# step 1 - install drivers
# Add the graphics-drivers PPA as a package source, then refresh the package index.
sudo add-apt-repository ppa:graphics-drivers/ppa
sudo apt-get update

# Search for available Nvidia drivers and install the latest version.
# Note the branch number of the package you install (560 in this example):
# step 5 removes the driver by package name and step 6.1 rebases onto the matching version.
apt search --names-only nvidia-driver
sudo apt install nvidia-driver-560-open # or latest

sudo reboot

# step 1.1 - verify driver
nvidia-smi

# --- 

# step 2 - install cuda & cudnn
# Download and install NVIDIA's cuda-keyring package, then refresh the package index.
# NOTE(review): the URL path is hard-coded to ubuntu2204 / x86_64 - presumably it has to be
# adjusted for other releases or architectures; confirm before use.
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update

# List the available CUDA toolkit packages, then install one of them.
apt search --names-only cuda-toolkit
sudo apt install cuda-toolkit-12-6 # or latest

# List the available cuDNN packages, then install one of them.
# The package name below carries the same 12-6 suffix as the toolkit package above.
apt search --names-only cudnn
sudo apt install cudnn9-cuda-12-6 # or latest

sudo reboot

# step 2.1 - verify driver
nvidia-smi

# --- 

# step 3 - make cuda samples
sudo apt install git cmake

# Clone the Nvidia CUDA samples repository to test CUDA installation.
cd ~
git clone https://github.com/nvidia/cuda-samples
cd cuda-samples

# Build from the repository root, passing the CPU count reported by nproc as the job count.
# NOTE(review): cmake is installed above but never invoked; this assumes the checked-out
# revision still has a top-level Makefile - confirm against the repository README.
make -j `nproc`

# --- 

# step 4 - test p2p
# Run the deviceQuery sample; the path is relative to ~/cuda-samples, the directory entered in step 3.
./bin/x86_64/linux/release/deviceQuery
# look for "> Peer access from NVIDIA GeForce RTX 4090 (GPU0) -> NVIDIA GeForce RTX 4090 (GPU1) : {Yes/No}"
# if you see Yes - stop, you already have p2p
# if you see No - continue

# --- 

# step 5 - uninstall official driver (just the driver)
# Set DRIVER_BRANCH to the branch you installed in step 1 (560 for nvidia-driver-560-open).
# Removing packages of a different branch would leave the installed driver in place.
DRIVER_BRANCH=560
# Each variant is removed with its own command so that a package that is not
# installed does not stop the remaining removals.
sudo apt remove "nvidia-driver-${DRIVER_BRANCH}-open"
sudo apt remove "nvidia-dkms-${DRIVER_BRANCH}-open"
sudo apt remove "nvidia-driver-${DRIVER_BRANCH}-server-open"
sudo apt remove "nvidia-dkms-${DRIVER_BRANCH}-server-open"

# step 5.1 - double check
sudo dpkg -l | grep nvidia
# verify that all nvidia drivers / dkms are uninstalled

# --- 

# step 6 - install patched driver
cd ~
# SSH remote: requires an SSH key registered with GitHub.
# The HTTPS form https://github.com/tinygrad/open-gpu-kernel-modules.git works without one.
git clone git@github.com:tinygrad/open-gpu-kernel-modules.git
cd open-gpu-kernel-modules

# step 6.1 - Building your driver
nvidia-smi # note down driver version - you've uninstalled the driver, but it should still be loaded in memory, if not, use `dpkg -l | grep nvidia` to grab the version `560.35.03-0ubuntu1` => `560.35.03`

# Add NVIDIA's repository as a second remote and fetch from all remotes,
# so the patch can be rebased onto the release that matches your driver.
git remote add upstream git@github.com:NVIDIA/open-gpu-kernel-modules.git
git fetch --all

# rebase patch with git or your preferred GUI
# NOTE(review): if git reports `upstream/560.35.03` as an unknown revision, the release is
# presumably only published as a tag - try the bare tag name (`560.35.03`) instead; confirm.
git rebase -Xignore-space-change -i upstream/560.35.03 # your driver version - make sure tag exists - you might have to deal with a conflicting README


# step 6.2 - make driver
# Run install.sh from the current directory (open-gpu-kernel-modules, entered in step 6).
./install.sh
# Disabled alternative, kept for reference: build the modules manually and install them
# through checkinstall, using the package name / version noted below.
# make modules -j$(nproc)
# sudo checkinstall make modules_install -j$(nproc)
# name = nvidia-driver-550-open-patch-tinygrad
# version = {driver-version}-p2p

# Refresh the kernel module dependency information, then reboot.
sudo depmod

sudo reboot

# step 6.3 verify driver
nvidia-smi

# --- 

# step 7 - test p2p
# Re-run the deviceQuery binary built in step 3.
cd ~/cuda-samples
./bin/x86_64/linux/release/deviceQuery
# look for "> Peer access from NVIDIA GeForce RTX 4090 (GPU0) -> NVIDIA GeForce RTX 4090 (GPU1) : {Yes/No}"
# if you see Yes - stop, you're done
# if you see No - go to troubleshooting

# --- 

# troubleshooting
# (references only - kept as comments so the shell does not try to run them as commands)
# see https://morgangiraud.medium.com/multi-gpu-nvidia-p2p-capabilities-and-debugging-tips-fb7597b4e2b5
# see https://morgangiraud.medium.com/multi-gpu-tinygrad-patch-4904a75f8e16
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!pip install torch plotly tqdm numpy matplotlib seaborn"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"import gc\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[device(type='cuda', index=0),\n",
" device(type='cuda', index=1),\n",
" device(type='cuda', index=2),\n",
" device(type='cuda', index=3),\n",
" device(type='cuda', index=4),\n",
" device(type='cuda', index=5)]"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"list_of_gpus = [torch.device(f\"cuda:{i}\") for i in range(torch.cuda.device_count())]\n",
"list_of_gpus"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"cuda:0 -> cuda:3\n",
"cuda:1 -> cuda:4\n",
"cuda:5 -> cuda:3\n",
"cuda:0 -> cuda:1\n",
"cuda:4 -> cuda:5\n",
"cuda:2 -> cuda:1\n",
"cuda:4 -> cuda:0\n",
"cuda:5 -> cuda:1\n",
"cuda:3 -> cuda:0\n",
"cuda:1 -> cuda:5\n",
"cuda:3 -> cuda:2\n",
"cuda:4 -> cuda:1\n",
"cuda:5 -> cuda:0\n",
"cuda:4 -> cuda:2\n",
"cuda:0 -> cuda:5\n",
"cuda:3 -> cuda:1\n",
"cuda:2 -> cuda:0\n",
"cuda:4 -> cuda:3\n",
"cuda:1 -> cuda:0\n",
"cuda:2 -> cuda:4\n",
"cuda:3 -> cuda:5\n",
"cuda:1 -> cuda:2\n",
"cuda:0 -> cuda:4\n",
"cuda:2 -> cuda:5\n",
"cuda:3 -> cuda:4\n",
"cuda:5 -> cuda:2\n",
"cuda:1 -> cuda:3\n",
"cuda:5 -> cuda:4\n",
"cuda:2 -> cuda:3\n",
"cuda:0 -> cuda:2\n",
"All 30 GPU transfer combinations are present\n"
]
}
],
"source": [
"\n",
"speed_matrix = np.zeros((len(list_of_gpus), len(list_of_gpus)))\n",
"\n",
"# Every ordered (source, destination) pair of distinct GPUs\n",
"test_pattern = []\n",
"for gpu1 in list_of_gpus:\n",
"    for gpu2 in list_of_gpus:\n",
"        if gpu1 == gpu2:\n",
"            continue\n",
"        test_pattern.append((gpu1, gpu2))\n",
"\n",
"# Shuffle, then try to keep adjacent pairs from sharing a GPU (best effort:\n",
"# a conflict is left in place when no later pair is free of both GPUs)\n",
"np.random.shuffle(test_pattern)\n",
"i = 0\n",
"while i < len(test_pattern)-1:\n",
"    gpu1, gpu2 = test_pattern[i]\n",
"    next_gpu1, next_gpu2 = test_pattern[i+1]\n",
"\n",
"    # If adjacent pairs share any GPU, swap with next non-conflicting pair\n",
"    if gpu1 in (next_gpu1, next_gpu2) or gpu2 in (next_gpu1, next_gpu2):\n",
"        for j in range(i+2, len(test_pattern)):\n",
"            candidate_gpu1, candidate_gpu2 = test_pattern[j]\n",
"            if gpu1 not in (candidate_gpu1, candidate_gpu2) and gpu2 not in (candidate_gpu1, candidate_gpu2):\n",
"                test_pattern[i+1], test_pattern[j] = test_pattern[j], test_pattern[i+1]\n",
"                break\n",
"    i += 1\n",
"\n",
"\n",
"for gpu1, gpu2 in test_pattern:\n",
"    print(f\"{gpu1} -> {gpu2}\")\n",
"\n",
"# Verify all GPU combinations are present\n",
"expected_combinations = set()\n",
"for gpu1 in list_of_gpus:\n",
"    for gpu2 in list_of_gpus:\n",
"        if gpu1 != gpu2:\n",
"            expected_combinations.add((gpu1, gpu2))\n",
"\n",
"test_pattern_set = set(test_pattern)\n",
"\n",
"assert len(expected_combinations) == len(test_pattern_set), \"Some GPU combinations are missing\"\n",
"assert expected_combinations == test_pattern_set, \"Test pattern doesn't match expected GPU combinations\"\n",
"\n",
"print(f\"All {len(test_pattern)} GPU transfer combinations are present\")\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"GPU cuda:0 can access: GPU cuda:1: ✓, GPU cuda:2: ✓, GPU cuda:3: ✓, GPU cuda:4: ✓, GPU cuda:5: ✓, \n",
"GPU cuda:1 can access: GPU cuda:0: ✓, GPU cuda:2: ✓, GPU cuda:3: ✓, GPU cuda:4: ✓, GPU cuda:5: ✓, \n",
"GPU cuda:2 can access: GPU cuda:0: ✓, GPU cuda:1: ✓, GPU cuda:3: ✓, GPU cuda:4: ✓, GPU cuda:5: ✓, \n",
"GPU cuda:3 can access: GPU cuda:0: ✓, GPU cuda:1: ✓, GPU cuda:2: ✓, GPU cuda:4: ✓, GPU cuda:5: ✓, \n",
"GPU cuda:4 can access: GPU cuda:0: ✓, GPU cuda:1: ✓, GPU cuda:2: ✓, GPU cuda:3: ✓, GPU cuda:5: ✓, \n",
"GPU cuda:5 can access: GPU cuda:0: ✓, GPU cuda:1: ✓, GPU cuda:2: ✓, GPU cuda:3: ✓, GPU cuda:4: ✓, \n"
]
}
],
"source": [
"# For each GPU, print whether it reports peer access to every other GPU\n",
"for gpu1 in list_of_gpus:\n",
"    print(f\"GPU {gpu1} can access: \", end=\"\")\n",
"    for gpu2 in list_of_gpus:\n",
"        if gpu1 == gpu2:\n",
"            continue\n",
"        can_access = torch.cuda.can_device_access_peer(gpu1, gpu2)\n",
"        print(f\"GPU {gpu2}: {'✓' if can_access else '✗'}, \", end=\" \")\n",
"    print()\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 0/30 [00:00<?, ?it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Testing cuda:0 to cuda:3\n",
" Generating random tensor for cuda:0\n",
" Copying to cuda:0\n",
" Copying from cuda:0 to cuda:3\n",
" Done in 0.08 seconds at 26.16 GB/s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 3%|▎ | 1/30 [00:04<02:22, 4.92s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Testing cuda:1 to cuda:4\n",
" Generating random tensor for cuda:1\n",
" Copying to cuda:1\n",
" Copying from cuda:1 to cuda:4\n",
" Done in 0.08 seconds at 26.15 GB/s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 7%|▋ | 2/30 [00:09<02:20, 5.00s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Testing cuda:5 to cuda:3\n",
" Generating random tensor for cuda:5\n",
" Copying to cuda:5\n",
" Copying from cuda:5 to cuda:3\n",
" Done in 0.08 seconds at 26.12 GB/s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 10%|█ | 3/30 [00:14<02:13, 4.95s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Testing cuda:0 to cuda:1\n",
" Generating random tensor for cuda:0\n",
" Copying to cuda:0\n",
" Copying from cuda:0 to cuda:1\n",
" Done in 0.08 seconds at 26.16 GB/s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 13%|█▎ | 4/30 [00:19<02:06, 4.88s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Testing cuda:4 to cuda:5\n",
" Generating random tensor for cuda:4\n",
" Copying to cuda:4\n",
" Copying from cuda:4 to cuda:5\n",
" Done in 0.08 seconds at 26.13 GB/s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 17%|█▋ | 5/30 [00:24<02:01, 4.86s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Testing cuda:2 to cuda:1\n",
" Generating random tensor for cuda:2\n",
" Copying to cuda:2\n",
" Copying from cuda:2 to cuda:1\n",
" Done in 0.08 seconds at 26.11 GB/s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 20%|██ | 6/30 [00:29<01:59, 4.97s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Testing cuda:4 to cuda:0\n",
" Generating random tensor for cuda:4\n",
" Copying to cuda:4\n",
" Copying from cuda:4 to cuda:0\n",
" Done in 0.08 seconds at 26.15 GB/s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 23%|██▎ | 7/30 [00:34<01:55, 5.01s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Testing cuda:5 to cuda:1\n",
" Generating random tensor for cuda:5\n",
" Copying to cuda:5\n",
" Copying from cuda:5 to cuda:1\n",
" Done in 0.08 seconds at 26.08 GB/s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 27%|██▋ | 8/30 [00:39<01:50, 5.01s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Testing cuda:3 to cuda:0\n",
" Generating random tensor for cuda:3\n",
" Copying to cuda:3\n",
" Copying from cuda:3 to cuda:0\n",
" Done in 0.08 seconds at 26.08 GB/s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 30%|███ | 9/30 [00:44<01:45, 5.01s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Testing cuda:1 to cuda:5\n",
" Generating random tensor for cuda:1\n",
" Copying to cuda:1\n",
" Copying from cuda:1 to cuda:5\n",
" Done in 0.08 seconds at 26.12 GB/s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 33%|███▎ | 10/30 [00:49<01:40, 5.01s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Testing cuda:3 to cuda:2\n",
" Generating random tensor for cuda:3\n",
" Copying to cuda:3\n",
" Copying from cuda:3 to cuda:2\n",
" Done in 0.08 seconds at 26.15 GB/s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 37%|███▋ | 11/30 [00:54<01:35, 5.01s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Testing cuda:4 to cuda:1\n",
" Generating random tensor for cuda:4\n",
" Copying to cuda:4\n",
" Copying from cuda:4 to cuda:1\n",
" Done in 0.08 seconds at 26.04 GB/s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 40%|████ | 12/30 [00:59<01:30, 5.01s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Testing cuda:5 to cuda:0\n",
" Generating random tensor for cuda:5\n",
" Copying to cuda:5\n",
" Copying from cuda:5 to cuda:0\n",
" Done in 0.08 seconds at 26.06 GB/s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 43%|████▎ | 13/30 [01:04<01:25, 5.00s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Testing cuda:4 to cuda:2\n",
" Generating random tensor for cuda:4\n",
" Copying to cuda:4\n",
" Copying from cuda:4 to cuda:2\n",
" Done in 0.08 seconds at 26.13 GB/s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 47%|████▋ | 14/30 [01:09<01:20, 5.00s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Testing cuda:0 to cuda:5\n",
" Generating random tensor for cuda:0\n",
" Copying to cuda:0\n",
" Copying from cuda:0 to cuda:5\n",
" Done in 0.08 seconds at 26.08 GB/s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 50%|█████ | 15/30 [01:14<01:15, 5.00s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Testing cuda:3 to cuda:1\n",
" Generating random tensor for cuda:3\n",
" Copying to cuda:3\n",
" Copying from cuda:3 to cuda:1\n",
" Done in 0.08 seconds at 26.01 GB/s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 53%|█████▎ | 16/30 [01:19<01:09, 4.99s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Testing cuda:2 to cuda:0\n",
" Generating random tensor for cuda:2\n",
" Copying to cuda:2\n",
" Copying from cuda:2 to cuda:0\n",
" Done in 0.08 seconds at 26.04 GB/s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 57%|█████▋ | 17/30 [01:24<01:04, 4.99s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Testing cuda:4 to cuda:3\n",
" Generating random tensor for cuda:4\n",
" Copying to cuda:4\n",
" Copying from cuda:4 to cuda:3\n",
" Done in 0.08 seconds at 26.08 GB/s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 60%|██████ | 18/30 [01:29<00:59, 4.98s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Testing cuda:1 to cuda:0\n",
" Generating random tensor for cuda:1\n",
" Copying to cuda:1\n",
" Copying from cuda:1 to cuda:0\n",
" Done in 0.08 seconds at 25.94 GB/s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 63%|██████▎ | 19/30 [01:34<00:54, 4.99s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Testing cuda:2 to cuda:4\n",
" Generating random tensor for cuda:2\n",
" Copying to cuda:2\n",
" Copying from cuda:2 to cuda:4\n",
" Done in 0.08 seconds at 26.12 GB/s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 67%|██████▋ | 20/30 [01:39<00:49, 4.99s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Testing cuda:3 to cuda:5\n",
" Generating random tensor for cuda:3\n",
" Copying to cuda:3\n",
" Copying from cuda:3 to cuda:5\n",
" Done in 0.08 seconds at 25.99 GB/s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 70%|███████ | 21/30 [01:44<00:45, 5.02s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Testing cuda:1 to cuda:2\n",
" Generating random tensor for cuda:1\n",
" Copying to cuda:1\n",
" Copying from cuda:1 to cuda:2\n",
" Done in 0.08 seconds at 26.08 GB/s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 73%|███████▎ | 22/30 [01:49<00:40, 5.02s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Testing cuda:0 to cuda:4\n",
" Generating random tensor for cuda:0\n",
" Copying to cuda:0\n",
" Copying from cuda:0 to cuda:4\n",
" Done in 0.08 seconds at 26.08 GB/s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 77%|███████▋ | 23/30 [01:54<00:35, 5.03s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Testing cuda:2 to cuda:5\n",
" Generating random tensor for cuda:2\n",
" Copying to cuda:2\n",
" Copying from cuda:2 to cuda:5\n",
" Done in 0.08 seconds at 26.01 GB/s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 80%|████████ | 24/30 [01:59<00:30, 5.04s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Testing cuda:3 to cuda:4\n",
" Generating random tensor for cuda:3\n",
" Copying to cuda:3\n",
" Copying from cuda:3 to cuda:4\n",
" Done in 0.08 seconds at 26.04 GB/s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 83%|████████▎ | 25/30 [02:04<00:25, 5.03s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Testing cuda:5 to cuda:2\n",
" Generating random tensor for cuda:5\n",
" Copying to cuda:5\n",
" Copying from cuda:5 to cuda:2\n",
" Done in 0.08 seconds at 26.04 GB/s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 87%|████████▋ | 26/30 [02:09<00:20, 5.03s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Testing cuda:1 to cuda:3\n",
" Generating random tensor for cuda:1\n",
" Copying to cuda:1\n",
" Copying from cuda:1 to cuda:3\n",
" Done in 0.08 seconds at 25.97 GB/s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 90%|█████████ | 27/30 [02:15<00:15, 5.04s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Testing cuda:5 to cuda:4\n",
" Generating random tensor for cuda:5\n",
" Copying to cuda:5\n",
" Copying from cuda:5 to cuda:4\n",
" Done in 0.08 seconds at 25.94 GB/s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 93%|█████████▎| 28/30 [02:20<00:10, 5.03s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Testing cuda:2 to cuda:3\n",
" Generating random tensor for cuda:2\n",
" Copying to cuda:2\n",
" Copying from cuda:2 to cuda:3\n",
" Done in 0.09 seconds at 25.11 GB/s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 97%|█████████▋| 29/30 [02:25<00:05, 5.02s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Testing cuda:0 to cuda:2\n",
" Generating random tensor for cuda:0\n",
" Copying to cuda:0\n",
" Copying from cuda:0 to cuda:2\n",
" Done in 0.08 seconds at 25.92 GB/s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 30/30 [02:30<00:00, 5.00s/it]\n"
]
}
],
"source": [
"\n",
"import time\n",
"from tqdm import tqdm\n",
"\n",
"\n",
"for gpu1, gpu2 in tqdm(test_pattern):\n",
"    print(f\"Testing {gpu1} to {gpu2}\")\n",
"\n",
"    gc.collect()\n",
"    torch.cuda.empty_cache()\n",
"\n",
"    # Warm up transfer\n",
"    warm_up = torch.randn(1024, 1024, device=gpu1)\n",
"    _ = warm_up.to(gpu2)\n",
"    torch.cuda.synchronize(gpu1)\n",
"    torch.cuda.synchronize(gpu2)\n",
"    del warm_up\n",
"\n",
"    # 2 GiB payload: 1024**3 int16 elements at 2 bytes each\n",
"    print(f\" Generating random tensor for {gpu1}\")\n",
"    random_tensor = torch.tensor(np.random.randint(0, 100, (1024, 1024, 1024), dtype=np.int16))\n",
"\n",
"    # copy from CPU to GPU 1\n",
"    print(f\" Copying to {gpu1}\")\n",
"    gpu1_tensor = random_tensor.to(gpu1)\n",
"\n",
"    # copy from GPU 1 to GPU 2\n",
"    print(f\" Copying from {gpu1} to {gpu2}\")\n",
"    # finish the upload and keep the print out of the timed region\n",
"    torch.cuda.synchronize(gpu1)\n",
"    _start = time.perf_counter()\n",
"    gpu2_tensor = gpu1_tensor.to(gpu2)\n",
"    # wait on both devices so the timer covers the whole transfer\n",
"    torch.cuda.synchronize(gpu1)\n",
"    torch.cuda.synchronize(gpu2)\n",
"    _end = time.perf_counter()\n",
"\n",
"    _total_time = _end - _start\n",
"    _bytes_moved = random_tensor.numel() * random_tensor.element_size()\n",
"    _speed_GBs = _bytes_moved / _total_time / 1e9\n",
"    print(f\" Done in {_total_time:.2f} seconds at {_speed_GBs:.2f} GB/s\")\n",
"\n",
"    speed_matrix[gpu1.index, gpu2.index] = _speed_GBs\n",
"\n",
"    # pull tensor back to CPU\n",
"    cpu_tensor = gpu2_tensor.to(\"cpu\")\n",
"\n",
"    # check if the tensors are equal\n",
"    if not torch.equal(random_tensor, cpu_tensor):\n",
"        print(f\"GPU tensors are not equal\")\n",
"\n",
"    # drop the large tensors before the next pair is tested\n",
"    del gpu2_tensor\n",
"    del gpu1_tensor\n",
"    del random_tensor\n",
"\n",
"gc.collect()\n",
"torch.cuda.empty_cache()\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 800x600 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"\n",
"# Create a copy of speed_matrix and set the diagonal to NaN to ignore same-GPU transfers\n",
"masked_matrix = speed_matrix.copy()\n",
"for i in range(len(list_of_gpus)):\n",
" masked_matrix[i,i] = np.nan\n",
"\n",
"fig, ax = plt.subplots(figsize=(8, 6))\n",
"im = ax.imshow(masked_matrix, cmap='viridis')\n",
"\n",
"# Add labels\n",
"ax.set_xticks(np.arange(len(list_of_gpus)))\n",
"ax.set_yticks(np.arange(len(list_of_gpus)))\n",
"ax.set_xticklabels([f'GPU {i}' for i in range(len(list_of_gpus))])\n",
"ax.set_yticklabels([f'GPU {i}' for i in range(len(list_of_gpus))])\n",
"\n",
"# Add colorbar\n",
"cbar = plt.colorbar(im)\n",
"cbar.set_label('GB/s')\n",
"\n",
"# Add title and axis labels\n",
"plt.title('GPU to GPU Transfer Speeds')\n",
"plt.xlabel('Destination GPU')\n",
"plt.ylabel('Source GPU')\n",
"\n",
"# Rotate x-axis labels for better readability\n",
"plt.setp(ax.get_xticklabels(), rotation=45, ha=\"right\")\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.15"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
@liunn5
Copy link

liunn5 commented Dec 20, 2024

driver565.57.1 enabled resize-bar, question:CUDA error at simpleP2P.cu:129 code=205(cudaErrorMapBufferObjectFailed) "cudaDeviceEnablePeerAccess(gpuid[1], 0)"。 why this

@legraphista
Copy link
Author

driver565.57.1 enabled resize-bar, question:CUDA error at simpleP2P.cu:129 code=205(cudaErrorMapBufferObjectFailed) "cudaDeviceEnablePeerAccess(gpuid[1], 0)"。 why this

You might have to enable Resizable BAR in your BIOS; it should be under the PCIe settings.

@liunn5
Copy link

liunn5 commented Dec 20, 2024 via email

@legraphista
Copy link
Author

I've never had this issue. Try posting in https://github.com/tinygrad/open-gpu-kernel-modules and asking for help there; they most likely know better.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment