Skip to content

Instantly share code, notes, and snippets.

@thesues
Created October 11, 2024 20:33
Show Gist options
  • Save thesues/bf37d783bf00e4ad0e0791df9724d100 to your computer and use it in GitHub Desktop.
Save thesues/bf37d783bf00e4ad0e0791df9724d100 to your computer and use it in GitHub Desktop.
#include <cuda_runtime.h>
#include <infiniband/verbs.h>
#include <cerrno>
#include <cstring>
#include <iostream>
int main() {
// 1. 分配 GPU 内存
void* gpu_ptr1;
void* gpu_ptr2;
// 64MB * 2 => pass
// 110MB * 2 => pass
// 128MB * 2 => failed
// 单次MR注册或者总量注册不能超过220MB?
size_t size = 110<<20;
cudaError_t err = cudaMalloc(&gpu_ptr1, size);
if (err != cudaSuccess) {
std::cerr << "Failed to allocate GPU memory: " << cudaGetErrorString(err) << std::endl;
return -1;
}
err = cudaMalloc(&gpu_ptr2, size);
if (err != cudaSuccess) {
std::cerr << "Failed to allocate GPU memory: " << cudaGetErrorString(err) << std::endl;
return -1;
}
// 2. 初始化 RDMA 相关资源
struct ibv_context* context;
struct ibv_device** dev_list = ibv_get_device_list(nullptr);
if (!dev_list) {
std::cerr << "Failed to get RDMA devices list" << std::endl;
return -1;
}
context = ibv_open_device(dev_list[0]);
if (!context) {
std::cerr << "Failed to open RDMA device" << std::endl;
return -1;
}
ibv_free_device_list(dev_list);
struct ibv_pd* pd = ibv_alloc_pd(context); // 保护域 (protection domain)
if (!pd) {
std::cerr << "Failed to allocate protection domain" << std::endl;
return -1;
}
// 3. 注册 GPU 内存
struct ibv_mr* mr = ibv_reg_mr(pd, gpu_ptr1, size, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ);
// print error message
if (!mr) {
std::cerr << "1Failed to register memory region with RDMA " << strerror(errno) << std::endl;
return -1;
}
struct ibv_mr* mr2 = ibv_reg_mr(pd, gpu_ptr2, size, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ);
// print error message
if (!mr2) {
std::cerr << "2Failed to register memory region with RDMA " << strerror(errno) << std::endl;
return -1;
}
std::cout << "Successfully registered GPU memory for RDMA." << std::endl;
// 4. 进行 RDMA 操作
// 在这里可以执行 RDMA 读/写操作,使用 mr->rkey 进行远程访问
// 5. 清理资源
ibv_dereg_mr(mr);
cudaFree(gpu_ptr1);
cudaFree(gpu_ptr2);
ibv_dealloc_pd(pd);
return 0;
}
// Build: g++ -L/usr/local/cuda/lib64 -I/usr/local/cuda/include bug.c -libverbs -lcudart
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment