Created
October 11, 2024 20:33
-
-
Save thesues/bf37d783bf00e4ad0e0791df9724d100 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cuda_runtime.h>
#include <infiniband/verbs.h>

#include <cerrno>
#include <cstring>
#include <iostream>
int main() { | |
// 1. 分配 GPU 内存 | |
void* gpu_ptr1; | |
void* gpu_ptr2; | |
// 64MB * 2 => pass | |
// 110MB * 2 => pass | |
// 128MB * 2 => failed | |
// 单次MR注册或者总量注册不能超过220MB? | |
size_t size = 110<<20; | |
cudaError_t err = cudaMalloc(&gpu_ptr1, size); | |
if (err != cudaSuccess) { | |
std::cerr << "Failed to allocate GPU memory: " << cudaGetErrorString(err) << std::endl; | |
return -1; | |
} | |
err = cudaMalloc(&gpu_ptr2, size); | |
if (err != cudaSuccess) { | |
std::cerr << "Failed to allocate GPU memory: " << cudaGetErrorString(err) << std::endl; | |
return -1; | |
} | |
// 2. 初始化 RDMA 相关资源 | |
struct ibv_context* context; | |
struct ibv_device** dev_list = ibv_get_device_list(nullptr); | |
if (!dev_list) { | |
std::cerr << "Failed to get RDMA devices list" << std::endl; | |
return -1; | |
} | |
context = ibv_open_device(dev_list[0]); | |
if (!context) { | |
std::cerr << "Failed to open RDMA device" << std::endl; | |
return -1; | |
} | |
ibv_free_device_list(dev_list); | |
struct ibv_pd* pd = ibv_alloc_pd(context); // 保护域 (protection domain) | |
if (!pd) { | |
std::cerr << "Failed to allocate protection domain" << std::endl; | |
return -1; | |
} | |
// 3. 注册 GPU 内存 | |
struct ibv_mr* mr = ibv_reg_mr(pd, gpu_ptr1, size, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ); | |
// print error message | |
if (!mr) { | |
std::cerr << "1Failed to register memory region with RDMA " << strerror(errno) << std::endl; | |
return -1; | |
} | |
struct ibv_mr* mr2 = ibv_reg_mr(pd, gpu_ptr2, size, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ); | |
// print error message | |
if (!mr2) { | |
std::cerr << "2Failed to register memory region with RDMA " << strerror(errno) << std::endl; | |
return -1; | |
} | |
std::cout << "Successfully registered GPU memory for RDMA." << std::endl; | |
// 4. 进行 RDMA 操作 | |
// 在这里可以执行 RDMA 读/写操作,使用 mr->rkey 进行远程访问 | |
// 5. 清理资源 | |
ibv_dereg_mr(mr); | |
cudaFree(gpu_ptr1); | |
cudaFree(gpu_ptr2); | |
ibv_dealloc_pd(pd); | |
return 0; | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
g++ -L/usr/local/cuda/lib64 -I/usr/local/cuda/include bug.c -libverbs -lcudart
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment