import os
from typing import List
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import torch
import torch.distributed as dist

dist.init_process_group(backend="gloo")
rank = local_rank = dist.get_rank()
world_size = dist.get_world_size()
torch.cuda.set_device(local_rank)


def share_tensor(A: torch.Tensor, group=None) -> List[torch.Tensor]:
    # Share a CUDA tensor with every rank via CUDA IPC, reusing the same
    # reduction machinery that torch.multiprocessing uses.
    from torch.multiprocessing.reductions import reduce_tensor
    A_meta = reduce_tensor(A)
    tensor_metas = [None] * world_size
    dist.all_gather_object(tensor_metas, A_meta, group=group)
    rank = dist.get_rank(group)
    all_tensors = []
    for i, obj in enumerate(tensor_metas):
        func = obj[0]
        args = list(obj[1])
        # args[6] is the device index in the rebuild args; point it at the local device
        args[6] = A.device.index
        if i != rank:
            all_tensors.append(func(*args))
        else:
            all_tensors.append(A)
    return all_tensors


A = torch.ones((10,), device=local_rank) * rank
all_tensors = share_tensor(A)
dist.barrier()
torch.cuda.synchronize()

if rank == 0:
    for x in all_tensors:
        x.zero_()

dist.barrier()
torch.cuda.synchronize()

for i, x in enumerate(all_tensors):
    print(f"{rank=}, {i=}, {x=}")
IPC of expandable segments was introduced in PyTorch 2.5. However, I find that some Linux OSes do not allow this feature.
Example code to manually test the IPC functionality:
// sender.cpp
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <cstring>
#include <unistd.h>
#include <sys/syscall.h>
// Define syscall numbers if not available
#ifndef SYS_pidfd_open
#define SYS_pidfd_open 434
#endif
struct ShareHeader {
pid_t pid;
size_t segment_size;
size_t num_handles;
};
// Helper function to get CUDA error string
const char* getCudaErrorString(CUresult error) {
const char* errorString;
cuGetErrorString(error, &errorString);
return errorString;
}
int main() {
// Initialize CUDA
CUresult result = cuInit(0);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to initialize CUDA: " << getCudaErrorString(result) << std::endl;
return 1;
}
// Get CUDA device
CUdevice device;
result = cuDeviceGet(&device, 0);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to get CUDA device: " << getCudaErrorString(result) << std::endl;
return 1;
}
// Create CUDA context
CUcontext context;
result = cuCtxCreate(&context, 0, device);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to create CUDA context: " << getCudaErrorString(result) << std::endl;
return 1;
}
// Allocate memory using VMM API
const size_t size = 20 * 1024 * 1024; // 20MB
CUmemGenericAllocationHandle handle;
// Set up memory allocation properties
CUmemAllocationProp prop = {};
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
prop.location.id = 0; // Use device 0
prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; // Specify handle type for export
prop.win32HandleMetaData = nullptr;
// Get the minimum granularity supported for allocation
size_t granularity = 0;
result = cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to get allocation granularity: " << getCudaErrorString(result) << std::endl;
return 1;
}
// Ensure size is a multiple of granularity
if (size % granularity) {
std::cerr << "Allocation size is not a multiple of minimum supported granularity" << std::endl;
return 1;
}
std::cout << "Creating memory handle with size: " << size << " bytes" << std::endl;
result = cuMemCreate(&handle, size, &prop, 0);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to create memory handle: " << getCudaErrorString(result) << std::endl;
return 1;
}
std::cout << "Successfully created memory handle" << std::endl;
// Reserve address range
CUdeviceptr ptr;
std::cout << "Reserving address range" << std::endl;
result = cuMemAddressReserve(&ptr, size, 0, 0, 0);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to reserve address range: " << getCudaErrorString(result) << std::endl;
cuMemRelease(handle);
return 1;
}
std::cout << "Successfully reserved address range at: " << ptr << std::endl;
// Map the memory
std::cout << "Mapping memory" << std::endl;
result = cuMemMap(ptr, size, 0, handle, 0);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to map memory: " << getCudaErrorString(result) << std::endl;
cuMemAddressFree(ptr, size);
cuMemRelease(handle);
return 1;
}
std::cout << "Successfully mapped memory" << std::endl;
// Set access properties
CUmemAccessDesc accessDesc = {};
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
accessDesc.location.id = 0; // Use device 0
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
std::cout << "Setting memory access properties" << std::endl;
result = cuMemSetAccess(ptr, size, &accessDesc, 1);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to set memory access: " << getCudaErrorString(result) << std::endl;
cuMemUnmap(ptr, size);
cuMemAddressFree(ptr, size);
cuMemRelease(handle);
return 1;
}
std::cout << "Successfully set memory access properties" << std::endl;
// Export handle to file descriptor
int fd = 0;
std::cout << "Exporting handle to file descriptor" << std::endl;
result = cuMemExportToShareableHandle(&fd, handle, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to export handle: " << getCudaErrorString(result) << std::endl;
std::cerr << "Handle value: " << handle << std::endl;
std::cerr << "Handle type: CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR" << std::endl;
cuMemUnmap(ptr, size);
cuMemAddressFree(ptr, size);
cuMemRelease(handle);
return 1;
}
std::cout << "Successfully exported handle to fd: " << fd << std::endl;
// Write to file
std::ofstream outfile("data.bin", std::ios::binary);
if (!outfile) {
std::cerr << "Failed to open output file: " << strerror(errno) << std::endl;
cuMemUnmap(ptr, size);
cuMemAddressFree(ptr, size);
cuMemRelease(handle);
return 1;
}
// Write header
ShareHeader header{getpid(), size, 1};
outfile.write(reinterpret_cast<const char*>(&header), sizeof(ShareHeader));
// Write file descriptor
outfile.write(reinterpret_cast<const char*>(&fd), sizeof(int));
outfile.close();
std::cout << "Data written to data.bin. Press Enter to continue..." << std::endl;
std::cin.get();
// Cleanup
result = cuMemUnmap(ptr, size);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to unmap memory: " << getCudaErrorString(result) << std::endl;
}
result = cuMemAddressFree(ptr, size);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to free address range: " << getCudaErrorString(result) << std::endl;
}
result = cuMemRelease(handle);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to release memory handle: " << getCudaErrorString(result) << std::endl;
}
result = cuCtxDestroy(context);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to destroy CUDA context: " << getCudaErrorString(result) << std::endl;
}
return 0;
}
// receiver.cpp
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <cstring>
#include <unistd.h>
#include <sys/syscall.h>
// Define syscall numbers if not available
#ifndef SYS_pidfd_open
#define SYS_pidfd_open 434
#endif
#ifndef SYS_pidfd_getfd
#define SYS_pidfd_getfd 438
#endif
struct ShareHeader {
pid_t pid;
size_t segment_size;
size_t num_handles;
};
// Helper function to get CUDA error string
const char* getCudaErrorString(CUresult error) {
const char* errorString;
cuGetErrorString(error, &errorString);
return errorString;
}
int main() {
// Initialize CUDA
CUresult result = cuInit(0);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to initialize CUDA: " << getCudaErrorString(result) << std::endl;
return 1;
}
// Get CUDA device
CUdevice device;
result = cuDeviceGet(&device, 0);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to get CUDA device: " << getCudaErrorString(result) << std::endl;
return 1;
}
// Create CUDA context
CUcontext context;
result = cuCtxCreate(&context, 0, device);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to create CUDA context: " << getCudaErrorString(result) << std::endl;
return 1;
}
// Read from file
std::ifstream infile("data.bin", std::ios::binary);
if (!infile) {
std::cerr << "Failed to open input file: " << strerror(errno) << std::endl;
return 1;
}
// Read header
ShareHeader header;
infile.read(reinterpret_cast<char*>(&header), sizeof(ShareHeader));
// Open pidfd
auto pidfd = syscall(SYS_pidfd_open, header.pid, 0);
if (pidfd == -1) {
std::cerr << "pidfd_open failed: " << strerror(errno) << std::endl;
return 1;
}
// Read file descriptor
int fd = 0;
infile.read(reinterpret_cast<char*>(&fd), sizeof(int));
infile.close();
// Get our own file descriptor
auto myfd = syscall(SYS_pidfd_getfd, pidfd, fd, 0);
if (myfd == -1) {
std::cerr << "pidfd_getfd failed: " << strerror(errno) << std::endl;
close(pidfd);
return 1;
}
// Import handle
CUmemGenericAllocationHandle handle;
result = cuMemImportFromShareableHandle(
&handle,
reinterpret_cast<void*>(static_cast<uintptr_t>(myfd)),
CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to import handle: " << getCudaErrorString(result) << std::endl;
close(myfd);
close(pidfd);
return 1;
}
// Reserve address range
CUdeviceptr ptr;
result = cuMemAddressReserve(&ptr, header.segment_size, 0, 0, 0);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to reserve address range: " << getCudaErrorString(result) << std::endl;
close(myfd);
close(pidfd);
return 1;
}
// Map the memory
result = cuMemMap(ptr, header.segment_size, 0, handle, 0);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to map memory: " << getCudaErrorString(result) << std::endl;
cuMemAddressFree(ptr, header.segment_size);
close(myfd);
close(pidfd);
return 1;
}
// Set access properties
CUmemAccessDesc accessDesc = {};
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
accessDesc.location.id = 0; // Use device 0
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
result = cuMemSetAccess(ptr, header.segment_size, &accessDesc, 1);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to set memory access: " << getCudaErrorString(result) << std::endl;
cuMemUnmap(ptr, header.segment_size);
cuMemAddressFree(ptr, header.segment_size);
close(myfd);
close(pidfd);
return 1;
}
std::cout << "Successfully imported and mapped memory at address: " << ptr << std::endl;
std::cout << "Press Enter to continue..." << std::endl;
std::cin.get();
// Cleanup
result = cuMemUnmap(ptr, header.segment_size);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to unmap memory: " << getCudaErrorString(result) << std::endl;
}
result = cuMemAddressFree(ptr, header.segment_size);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to free address range: " << getCudaErrorString(result) << std::endl;
}
result = cuMemRelease(handle);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to release memory handle: " << getCudaErrorString(result) << std::endl;
}
close(myfd);
close(pidfd);
result = cuCtxDestroy(context);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to destroy CUDA context: " << getCudaErrorString(result) << std::endl;
}
return 0;
}
Compile with
$ nvcc receiver.cpp -o receiver -lcuda
$ nvcc sender.cpp -o sender -lcuda
In one shell, execute ./sender, and in another shell, execute ./receiver.
On some nodes it succeeds, but on others it does not. When it fails with Operation not permitted, running the receiver with sudo works.
It seems that if the sender process allows any process to ptrace it, then it works. Add the following code to the sender:
#include <sys/prctl.h>
prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY);
or the Python equivalent:
import ctypes
# Constants from prctl.h
PR_SET_PTRACER = 0x59616d61
PR_SET_PTRACER_ANY = -1 # Allow any process with the same UID to ptrace
libc = ctypes.CDLL("libc.so.6", use_errno=True)
result = libc.prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, 0, 0, 0)
if result != 0:
errno = ctypes.get_errno()
raise OSError(errno, f"prctl(PR_SET_PTRACER, ANY) failed: {ctypes.cast(libc.strerror(errno), ctypes.c_char_p).value.decode()}")
else:
print("✅ Allowed ptrace from any same-UID process (PR_SET_PTRACER_ANY)")
Then it works.
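For reference, my reading of the symptom (an assumption, not verified on every affected node) is that the Yama LSM is involved: pidfd_getfd requires ptrace-attach permission on the sender, and Yama's ptrace_scope setting restricts that to descendant processes by default on many distros, which is exactly what PR_SET_PTRACER relaxes. You can inspect and, for testing, loosen the setting with:
$ cat /proc/sys/kernel/yama/ptrace_scope
$ sudo sysctl -w kernel.yama.ptrace_scope=0
A value of 1 means restricted ptrace (the default on e.g. Ubuntu); 0 restores the classic same-UID ptrace rules.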
To use the fabric handle, we should follow https://docs.nvidia.com/multi-node-nvlink-systems/imex-guide/imexchannels.html to create IMEX channels (read access is required). Here is a working example of IPC through the fabric handle:
// sender.cpp
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <cstring>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/prctl.h>
// Define syscall numbers if not available
#ifndef SYS_pidfd_open
#define SYS_pidfd_open 434
#endif
// Helper function to get CUDA error string
const char* getCudaErrorString(CUresult error) {
const char* errorString;
cuGetErrorString(error, &errorString);
return errorString;
}
int main() {
prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY);
// Initialize CUDA
CUresult result = cuInit(0);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to initialize CUDA: " << getCudaErrorString(result) << std::endl;
return 1;
}
// Get CUDA device
CUdevice device;
result = cuDeviceGet(&device, 0);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to get CUDA device: " << getCudaErrorString(result) << std::endl;
return 1;
}
// Create CUDA context
CUcontext context;
result = cuCtxCreate(&context, 0, device);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to create CUDA context: " << getCudaErrorString(result) << std::endl;
return 1;
}
// Allocate memory using VMM API
const size_t size = 20 * 1024 * 1024; // 20MB
CUmemGenericAllocationHandle handle;
// Set up memory allocation properties
CUmemAllocationProp prop = {};
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
prop.location.id = 0; // Use device 0
prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC; // Use fabric handle type for IPC
prop.win32HandleMetaData = nullptr;
// Get the minimum granularity supported for allocation
size_t granularity = 0;
result = cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to get allocation granularity: " << getCudaErrorString(result) << std::endl;
return 1;
}
// Ensure size is a multiple of granularity
if (size % granularity) {
std::cerr << "Allocation size is not a multiple of minimum supported granularity" << std::endl;
return 1;
}
std::cout << "Creating memory handle with size: " << size << " bytes" << std::endl;
result = cuMemCreate(&handle, size, &prop, 0);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to create memory handle: " << getCudaErrorString(result) << std::endl;
return 1;
}
std::cout << "Successfully created memory handle" << std::endl;
// Reserve address range
CUdeviceptr ptr;
std::cout << "Reserving address range" << std::endl;
result = cuMemAddressReserve(&ptr, size, 0, 0, 0);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to reserve address range: " << getCudaErrorString(result) << std::endl;
cuMemRelease(handle);
return 1;
}
std::cout << "Successfully reserved address range at: " << ptr << std::endl;
// Map the memory
std::cout << "Mapping memory" << std::endl;
result = cuMemMap(ptr, size, 0, handle, 0);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to map memory: " << getCudaErrorString(result) << std::endl;
cuMemAddressFree(ptr, size);
cuMemRelease(handle);
return 1;
}
std::cout << "Successfully mapped memory" << std::endl;
// Set access properties
CUmemAccessDesc accessDesc = {};
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
accessDesc.location.id = 0; // Use device 0
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
std::cout << "Setting memory access properties" << std::endl;
result = cuMemSetAccess(ptr, size, &accessDesc, 1);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to set memory access: " << getCudaErrorString(result) << std::endl;
cuMemUnmap(ptr, size);
cuMemAddressFree(ptr, size);
cuMemRelease(handle);
return 1;
}
std::cout << "Successfully set memory access properties" << std::endl;
// Export handle to fabric handle
CUmemFabricHandle_v1 fabricHandle;
std::cout << "Exporting handle to fabric handle" << std::endl;
std::cout << "Original handle value: " << handle << std::endl;
std::cout << "Allocation size: " << size << " bytes" << std::endl;
result = cuMemExportToShareableHandle(&fabricHandle, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to export handle: " << getCudaErrorString(result) << std::endl;
std::cerr << "Handle value: " << handle << std::endl;
std::cerr << "Handle type: CU_MEM_HANDLE_TYPE_FABRIC" << std::endl;
cuMemUnmap(ptr, size);
cuMemAddressFree(ptr, size);
cuMemRelease(handle);
return 1;
}
std::cout << "Successfully exported handle to fabric handle" << std::endl;
std::cout << "Fabric handle value: " << reinterpret_cast<uintptr_t>(&fabricHandle) << std::endl;
// Write to file
std::ofstream outfile("data.bin", std::ios::binary);
if (!outfile) {
std::cerr << "Failed to open output file: " << strerror(errno) << std::endl;
cuMemUnmap(ptr, size);
cuMemAddressFree(ptr, size);
cuMemRelease(handle);
return 1;
}
// Write 8-byte size header
outfile.write(reinterpret_cast<const char*>(&size), 8);
// Write fabric handle
outfile.write(reinterpret_cast<const char*>(&fabricHandle), sizeof(CUmemFabricHandle_v1));
outfile.close();
std::cout << "Data written to data.bin. Press Enter to continue..." << std::endl;
std::cin.get();
// Cleanup
result = cuMemUnmap(ptr, size);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to unmap memory: " << getCudaErrorString(result) << std::endl;
}
result = cuMemAddressFree(ptr, size);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to free address range: " << getCudaErrorString(result) << std::endl;
}
result = cuMemRelease(handle);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to release memory handle: " << getCudaErrorString(result) << std::endl;
}
result = cuCtxDestroy(context);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to destroy CUDA context: " << getCudaErrorString(result) << std::endl;
}
return 0;
}
// receiver.cpp
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <cstring>
// Helper function to get CUDA error string
const char* getCudaErrorString(CUresult error) {
const char* errorString;
cuGetErrorString(error, &errorString);
return errorString;
}
int main() {
// Initialize CUDA
CUresult result = cuInit(0);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to initialize CUDA: " << getCudaErrorString(result) << std::endl;
return 1;
}
// Get CUDA device
CUdevice device;
result = cuDeviceGet(&device, 0);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to get CUDA device: " << getCudaErrorString(result) << std::endl;
return 1;
}
// Create CUDA context
CUcontext context;
result = cuCtxCreate(&context, 0, device);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to create CUDA context: " << getCudaErrorString(result) << std::endl;
return 1;
}
// Read from file
std::ifstream infile("data.bin", std::ios::binary);
if (!infile) {
std::cerr << "Failed to open input file: " << strerror(errno) << std::endl;
return 1;
}
// Read 8-byte size header
size_t size;
infile.read(reinterpret_cast<char*>(&size), 8);
std::cout << "Read allocation size: " << size << " bytes" << std::endl;
// Read fabric handle
CUmemFabricHandle_v1 fabricHandle;
infile.read(reinterpret_cast<char*>(&fabricHandle), sizeof(CUmemFabricHandle_v1));
std::cout << "Read fabric handle value: " << reinterpret_cast<uintptr_t>(&fabricHandle) << std::endl;
infile.close();
// Import handle
CUmemGenericAllocationHandle handle;
std::cout << "Importing handle..." << std::endl;
result = cuMemImportFromShareableHandle(
&handle,
reinterpret_cast<void*>(&fabricHandle),
CU_MEM_HANDLE_TYPE_FABRIC
);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to import handle: " << getCudaErrorString(result) << std::endl;
return 1;
}
std::cout << "Successfully imported handle: " << handle << std::endl;
// Reserve address range
CUdeviceptr ptr;
std::cout << "Reserving address range of size: " << size << std::endl;
result = cuMemAddressReserve(&ptr, size, 0, 0, 0);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to reserve address range: " << getCudaErrorString(result) << std::endl;
return 1;
}
std::cout << "Successfully reserved address range at: " << ptr << std::endl;
// Map the memory
std::cout << "Mapping memory..." << std::endl;
result = cuMemMap(ptr, size, 0, handle, 0);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to map memory: " << getCudaErrorString(result) << std::endl;
cuMemAddressFree(ptr, size);
return 1;
}
std::cout << "Successfully mapped memory" << std::endl;
// Set access properties
CUmemAccessDesc accessDesc = {};
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
accessDesc.location.id = 0; // Use device 0
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
std::cout << "Setting memory access properties..." << std::endl;
result = cuMemSetAccess(ptr, size, &accessDesc, 1);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to set memory access: " << getCudaErrorString(result) << std::endl;
cuMemUnmap(ptr, size);
cuMemAddressFree(ptr, size);
return 1;
}
std::cout << "Successfully set memory access properties" << std::endl;
std::cout << "Successfully imported and mapped memory at address: " << ptr << std::endl;
std::cout << "Press Enter to continue..." << std::endl;
std::cin.get();
// Cleanup
result = cuMemUnmap(ptr, size);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to unmap memory: " << getCudaErrorString(result) << std::endl;
}
result = cuMemAddressFree(ptr, size);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to free address range: " << getCudaErrorString(result) << std::endl;
}
result = cuMemRelease(handle);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to release memory handle: " << getCudaErrorString(result) << std::endl;
}
result = cuCtxDestroy(context);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to destroy CUDA context: " << getCudaErrorString(result) << std::endl;
}
return 0;
}
Compile with
$ nvcc receiver.cpp -o receiver -lcuda
$ nvcc sender.cpp -o sender -lcuda
In one shell, execute ./sender, and in another shell, execute ./receiver.
We can see:
(py310) ➜ test_pidfd ./sender
Creating memory handle with size: 20971520 bytes
Successfully created memory handle
Reserving address range
Successfully reserved address range at: 139849545809920
Mapping memory
Successfully mapped memory
Setting memory access properties
Successfully set memory access properties
Exporting handle to fabric handle
Original handle value: 94556404496384
Allocation size: 20971520 bytes
Successfully exported handle to fabric handle
Fabric handle value: 140735970184032
Data written to data.bin. Press Enter to continue...
(py310) ➜ test_pidfd ./receiver
Read allocation size: 20971520 bytes
Read fabric handle value: 140734345409808
Importing handle...
Successfully imported handle: 94798849087488
Reserving address range of size: 20971520
Successfully reserved address range at: 140063992184832
Mapping memory...
Successfully mapped memory
Setting memory access properties...
Successfully set memory access properties
Successfully imported and mapped memory at address: 140063992184832
Press Enter to continue...
The data file is 72 bytes: an 8-byte header (for the size) and 64 bytes for the fabric handle.
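If the fabric-handle path fails on a given node, one extra sanity check (a sketch on my side; it assumes a recent CUDA 12.x toolkit whose cuda.h defines CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED) is to ask the driver whether the device supports fabric handles at all:
// check_fabric.cpp (hypothetical helper): query fabric handle support on device 0
#include <cuda.h>
#include <iostream>
int main() {
    if (cuInit(0) != CUDA_SUCCESS) {
        std::cerr << "cuInit failed" << std::endl;
        return 1;
    }
    CUdevice device;
    if (cuDeviceGet(&device, 0) != CUDA_SUCCESS) {
        std::cerr << "cuDeviceGet failed" << std::endl;
        return 1;
    }
    int supported = 0;
    // 1 means CU_MEM_HANDLE_TYPE_FABRIC handles can be created/exported on this device
    CUresult result = cuDeviceGetAttribute(
        &supported, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Attribute query failed (driver or toolkit too old?)" << std::endl;
        return 1;
    }
    std::cout << "Fabric handle supported: " << supported << std::endl;
    return 0;
}
Compile it the same way: $ nvcc check_fabric.cpp -o check_fabric -lcuda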
Running on 2.6.0.dev20241112+cu124, I still get the same error: RuntimeError: pidfd_getfd: Operation not permitted.