youkaichao/ipc.py

youkaichao · 2024-11-05T00:24:14Z

the following code can compile and run actually:

#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/syscall.h>
#include <errno.h>

int main() {
    // Step 1: Obtain a pidfd for the current process
    int pidfd = syscall(SYS_pidfd_open, getpid(), 0);
    if (pidfd == -1) {
        perror("pidfd_open failed");
        return 1;
    }

    // Step 2: Open a valid file descriptor, e.g., /proc/self/status
    int fd = open("/proc/self/status", O_RDONLY);
    if (fd == -1) {
        perror("open failed");
        close(pidfd);
        return 1;
    }

    // Step 3: Try to duplicate fd using pidfd_getfd
    int new_fd = syscall(SYS_pidfd_getfd, pidfd, fd, 0);
    if (new_fd == -1) {
        perror("pidfd_getfd failed");
    } else {
        printf("pidfd_getfd succeeded, new_fd: %d\n", new_fd);
        close(new_fd);  // Close the duplicated fd if successful
    }

    // Clean up
    close(fd);
    close(pidfd);
    return 0;
}

compile : gcc test.c -o test

run: ./test

output: pidfd_getfd succeeded, new_fd: 5

youkaichao · 2024-11-13T00:40:40Z

running on 2.6.0.dev20241112+cu124 , still get the same error RuntimeError: pidfd_getfd: Operation not permitted .

youkaichao · 2025-06-12T04:58:41Z

IPC of expandable segment is introduced in pytorch 2.5. However, I find that some linux os does not allow this feature.

Example test code to manually test the IPC functionality:

// sender.cpp
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <cstring>
#include <unistd.h>
#include <sys/syscall.h>

// Define syscall numbers if not available
#ifndef SYS_pidfd_open
#define SYS_pidfd_open 434
#endif

struct ShareHeader {
    pid_t pid;
    size_t segment_size;
    size_t num_handles;
};

// Helper function to get CUDA error string
const char* getCudaErrorString(CUresult error) {
    const char* errorString;
    cuGetErrorString(error, &errorString);
    return errorString;
}

int main() {
    // Initialize CUDA
    CUresult result = cuInit(0);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to initialize CUDA: " << getCudaErrorString(result) << std::endl;
        return 1;
    }

    // Get CUDA device
    CUdevice device;
    result = cuDeviceGet(&device, 0);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to get CUDA device: " << getCudaErrorString(result) << std::endl;
        return 1;
    }

    // Create CUDA context
    CUcontext context;
    result = cuCtxCreate(&context, 0, device);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to create CUDA context: " << getCudaErrorString(result) << std::endl;
        return 1;
    }

    // Allocate memory using VMM API
    const size_t size = 20 * 1024 * 1024; // 20MB
    CUmemGenericAllocationHandle handle;

    // Set up memory allocation properties
    CUmemAllocationProp prop = {};
    prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    prop.location.id = 0;  // Use device 0
    prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;  // Specify handle type for export
    prop.win32HandleMetaData = nullptr;

    // Get the minimum granularity supported for allocation
    size_t granularity = 0;
    result = cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to get allocation granularity: " << getCudaErrorString(result) << std::endl;
        return 1;
    }

    // Ensure size is a multiple of granularity
    if (size % granularity) {
        std::cerr << "Allocation size is not a multiple of minimum supported granularity" << std::endl;
        return 1;
    }

    std::cout << "Creating memory handle with size: " << size << " bytes" << std::endl;
    result = cuMemCreate(&handle, size, &prop, 0);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to create memory handle: " << getCudaErrorString(result) << std::endl;
        return 1;
    }
    std::cout << "Successfully created memory handle" << std::endl;

    // Reserve address range
    CUdeviceptr ptr;
    std::cout << "Reserving address range" << std::endl;
    result = cuMemAddressReserve(&ptr, size, 0, 0, 0);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to reserve address range: " << getCudaErrorString(result) << std::endl;
        cuMemRelease(handle);
        return 1;
    }
    std::cout << "Successfully reserved address range at: " << ptr << std::endl;

    // Map the memory
    std::cout << "Mapping memory" << std::endl;
    result = cuMemMap(ptr, size, 0, handle, 0);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to map memory: " << getCudaErrorString(result) << std::endl;
        cuMemAddressFree(ptr, size);
        cuMemRelease(handle);
        return 1;
    }
    std::cout << "Successfully mapped memory" << std::endl;

    // Set access properties
    CUmemAccessDesc accessDesc = {};
    accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    accessDesc.location.id = 0;  // Use device 0
    accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;

    std::cout << "Setting memory access properties" << std::endl;
    result = cuMemSetAccess(ptr, size, &accessDesc, 1);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to set memory access: " << getCudaErrorString(result) << std::endl;
        cuMemUnmap(ptr, size);
        cuMemAddressFree(ptr, size);
        cuMemRelease(handle);
        return 1;
    }
    std::cout << "Successfully set memory access properties" << std::endl;

    // Export handle to file descriptor
    int fd = 0;
    std::cout << "Exporting handle to file descriptor" << std::endl;
    result = cuMemExportToShareableHandle(&fd, handle, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to export handle: " << getCudaErrorString(result) << std::endl;
        std::cerr << "Handle value: " << handle << std::endl;
        std::cerr << "Handle type: CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR" << std::endl;
        cuMemUnmap(ptr, size);
        cuMemAddressFree(ptr, size);
        cuMemRelease(handle);
        return 1;
    }
    std::cout << "Successfully exported handle to fd: " << fd << std::endl;

    // Write to file
    std::ofstream outfile("data.bin", std::ios::binary);
    if (!outfile) {
        std::cerr << "Failed to open output file: " << strerror(errno) << std::endl;
        cuMemUnmap(ptr, size);
        cuMemAddressFree(ptr, size);
        cuMemRelease(handle);
        return 1;
    }

    // Write header
    ShareHeader header{getpid(), size, 1};
    outfile.write(reinterpret_cast<const char*>(&header), sizeof(ShareHeader));

    // Write file descriptor
    outfile.write(reinterpret_cast<const char*>(&fd), sizeof(int));
    outfile.close();

    std::cout << "Data written to data.bin. Press Enter to continue..." << std::endl;
    std::cin.get();

    // Cleanup
    result = cuMemUnmap(ptr, size);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to unmap memory: " << getCudaErrorString(result) << std::endl;
    }

    result = cuMemAddressFree(ptr, size);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to free address range: " << getCudaErrorString(result) << std::endl;
    }

    result = cuMemRelease(handle);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to release memory handle: " << getCudaErrorString(result) << std::endl;
    }
    
    result = cuCtxDestroy(context);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to destroy CUDA context: " << getCudaErrorString(result) << std::endl;
    }

    return 0;
}

// receiver.cpp
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <cstring>
#include <unistd.h>
#include <sys/syscall.h>

// Define syscall numbers if not available
#ifndef SYS_pidfd_open
#define SYS_pidfd_open 434
#endif
#ifndef SYS_pidfd_getfd
#define SYS_pidfd_getfd 438
#endif

struct ShareHeader {
    pid_t pid;
    size_t segment_size;
    size_t num_handles;
};

// Helper function to get CUDA error string
const char* getCudaErrorString(CUresult error) {
    const char* errorString;
    cuGetErrorString(error, &errorString);
    return errorString;
}

int main() {
    // Initialize CUDA
    CUresult result = cuInit(0);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to initialize CUDA: " << getCudaErrorString(result) << std::endl;
        return 1;
    }

    // Get CUDA device
    CUdevice device;
    result = cuDeviceGet(&device, 0);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to get CUDA device: " << getCudaErrorString(result) << std::endl;
        return 1;
    }

    // Create CUDA context
    CUcontext context;
    result = cuCtxCreate(&context, 0, device);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to create CUDA context: " << getCudaErrorString(result) << std::endl;
        return 1;
    }

    // Read from file
    std::ifstream infile("data.bin", std::ios::binary);
    if (!infile) {
        std::cerr << "Failed to open input file: " << strerror(errno) << std::endl;
        return 1;
    }

    // Read header
    ShareHeader header;
    infile.read(reinterpret_cast<char*>(&header), sizeof(ShareHeader));

    // Open pidfd
    auto pidfd = syscall(SYS_pidfd_open, header.pid, 0);
    if (pidfd == -1) {
        std::cerr << "pidfd_open failed: " << strerror(errno) << std::endl;
        return 1;
    }

    // Read file descriptor
    int fd = 0;
    infile.read(reinterpret_cast<char*>(&fd), sizeof(int));
    infile.close();

    // Get our own file descriptor
    auto myfd = syscall(SYS_pidfd_getfd, pidfd, fd, 0);
    if (myfd == -1) {
        std::cerr << "pidfd_getfd failed: " << strerror(errno) << std::endl;
        close(pidfd);
        return 1;
    }

    // Import handle
    CUmemGenericAllocationHandle handle;
    result = cuMemImportFromShareableHandle(
        &handle,
        reinterpret_cast<void*>(static_cast<uintptr_t>(myfd)),
        CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
    );
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to import handle: " << getCudaErrorString(result) << std::endl;
        close(myfd);
        close(pidfd);
        return 1;
    }

    // Reserve address range
    CUdeviceptr ptr;
    result = cuMemAddressReserve(&ptr, header.segment_size, 0, 0, 0);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to reserve address range: " << getCudaErrorString(result) << std::endl;
        close(myfd);
        close(pidfd);
        return 1;
    }

    // Map the memory
    result = cuMemMap(ptr, header.segment_size, 0, handle, 0);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to map memory: " << getCudaErrorString(result) << std::endl;
        cuMemAddressFree(ptr, header.segment_size);
        close(myfd);
        close(pidfd);
        return 1;
    }

    // Set access properties
    CUmemAccessDesc accessDesc = {};
    accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    accessDesc.location.id = 0;  // Use device 0
    accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;

    result = cuMemSetAccess(ptr, header.segment_size, &accessDesc, 1);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to set memory access: " << getCudaErrorString(result) << std::endl;
        cuMemUnmap(ptr, header.segment_size);
        cuMemAddressFree(ptr, header.segment_size);
        close(myfd);
        close(pidfd);
        return 1;
    }

    std::cout << "Successfully imported and mapped memory at address: " << ptr << std::endl;
    std::cout << "Press Enter to continue..." << std::endl;
    std::cin.get();

    // Cleanup
    result = cuMemUnmap(ptr, header.segment_size);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to unmap memory: " << getCudaErrorString(result) << std::endl;
    }

    result = cuMemAddressFree(ptr, header.segment_size);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to free address range: " << getCudaErrorString(result) << std::endl;
    }

    result = cuMemRelease(handle);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to release memory handle: " << getCudaErrorString(result) << std::endl;
    }

    close(myfd);
    close(pidfd);

    result = cuCtxDestroy(context);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to destroy CUDA context: " << getCudaErrorString(result) << std::endl;
    }

    return 0;
}

Compile with

$ nvcc receiver.cpp -o receiver -lcuda
$ nvcc sender.cpp -o sender -lcuda

In one shell, execute ./sender , and in another shell, execute ./receiver .

In some nodes, it succeeds; but in some nodes, it does not.

When it fails with Operation not permitted, execute the receiver with sudo access works.

youkaichao · 2025-06-12T06:12:34Z

it seems, if the sender process let any process ptrace it, then it works.

add the following code to sender:

#include <sys/prctl.h>
prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY);

or python equivalent:

import ctypes

# Constants from prctl.h
PR_SET_PTRACER = 0x59616d61
PR_SET_PTRACER_ANY = -1  # Allow any process with the same UID to ptrace

libc = ctypes.CDLL("libc.so.6", use_errno=True)

result = libc.prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, 0, 0, 0)
if result != 0:
    errno = ctypes.get_errno()
    raise OSError(errno, f"prctl(PR_SET_PTRACER, ANY) failed: {ctypes.cast(libc.strerror(errno), ctypes.c_char_p).value.decode()}")
else:
    print("✅ Allowed ptrace from any same-UID process (PR_SET_PTRACER_ANY)")

then it works.

youkaichao · 2025-06-15T07:03:32Z

to use the fabric handle, we should follow https://docs.nvidia.com/multi-node-nvlink-systems/imex-guide/imexchannels.html to create imex channels (read access is required), and then here is a working example of ipc through the fabric handle:

// sender.cpp
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <cstring>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/prctl.h>

// Define syscall numbers if not available
#ifndef SYS_pidfd_open
#define SYS_pidfd_open 434
#endif

// Helper function to get CUDA error string
const char* getCudaErrorString(CUresult error) {
    const char* errorString;
    cuGetErrorString(error, &errorString);
    return errorString;
}

int main() {
    prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY);
    // Initialize CUDA
    CUresult result = cuInit(0);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to initialize CUDA: " << getCudaErrorString(result) << std::endl;
        return 1;
    }

    // Get CUDA device
    CUdevice device;
    result = cuDeviceGet(&device, 0);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to get CUDA device: " << getCudaErrorString(result) << std::endl;
        return 1;
    }

    // Create CUDA context
    CUcontext context;
    result = cuCtxCreate(&context, 0, device);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to create CUDA context: " << getCudaErrorString(result) << std::endl;
        return 1;
    }

    // Allocate memory using VMM API
    const size_t size = 20 * 1024 * 1024; // 20MB
    CUmemGenericAllocationHandle handle;

    // Set up memory allocation properties
    CUmemAllocationProp prop = {};
    prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    prop.location.id = 0;  // Use device 0
    prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;  // Use fabric handle type for IPC
    prop.win32HandleMetaData = nullptr;

    // Get the minimum granularity supported for allocation
    size_t granularity = 0;
    result = cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to get allocation granularity: " << getCudaErrorString(result) << std::endl;
        return 1;
    }

    // Ensure size is a multiple of granularity
    if (size % granularity) {
        std::cerr << "Allocation size is not a multiple of minimum supported granularity" << std::endl;
        return 1;
    }

    std::cout << "Creating memory handle with size: " << size << " bytes" << std::endl;
    result = cuMemCreate(&handle, size, &prop, 0);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to create memory handle: " << getCudaErrorString(result) << std::endl;
        return 1;
    }
    std::cout << "Successfully created memory handle" << std::endl;

    // Reserve address range
    CUdeviceptr ptr;
    std::cout << "Reserving address range" << std::endl;
    result = cuMemAddressReserve(&ptr, size, 0, 0, 0);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to reserve address range: " << getCudaErrorString(result) << std::endl;
        cuMemRelease(handle);
        return 1;
    }
    std::cout << "Successfully reserved address range at: " << ptr << std::endl;

    // Map the memory
    std::cout << "Mapping memory" << std::endl;
    result = cuMemMap(ptr, size, 0, handle, 0);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to map memory: " << getCudaErrorString(result) << std::endl;
        cuMemAddressFree(ptr, size);
        cuMemRelease(handle);
        return 1;
    }
    std::cout << "Successfully mapped memory" << std::endl;

    // Set access properties
    CUmemAccessDesc accessDesc = {};
    accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    accessDesc.location.id = 0;  // Use device 0
    accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;

    std::cout << "Setting memory access properties" << std::endl;
    result = cuMemSetAccess(ptr, size, &accessDesc, 1);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to set memory access: " << getCudaErrorString(result) << std::endl;
        cuMemUnmap(ptr, size);
        cuMemAddressFree(ptr, size);
        cuMemRelease(handle);
        return 1;
    }
    std::cout << "Successfully set memory access properties" << std::endl;

    // Export handle to fabric handle
    CUmemFabricHandle_v1 fabricHandle;
    std::cout << "Exporting handle to fabric handle" << std::endl;
    std::cout << "Original handle value: " << handle << std::endl;
    std::cout << "Allocation size: " << size << " bytes" << std::endl;
    result = cuMemExportToShareableHandle(&fabricHandle, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to export handle: " << getCudaErrorString(result) << std::endl;
        std::cerr << "Handle value: " << handle << std::endl;
        std::cerr << "Handle type: CU_MEM_HANDLE_TYPE_FABRIC" << std::endl;
        cuMemUnmap(ptr, size);
        cuMemAddressFree(ptr, size);
        cuMemRelease(handle);
        return 1;
    }
    std::cout << "Successfully exported handle to fabric handle" << std::endl;
    std::cout << "Fabric handle value: " << reinterpret_cast<uintptr_t>(&fabricHandle) << std::endl;

    // Write to file
    std::ofstream outfile("data.bin", std::ios::binary);
    if (!outfile) {
        std::cerr << "Failed to open output file: " << strerror(errno) << std::endl;
        cuMemUnmap(ptr, size);
        cuMemAddressFree(ptr, size);
        cuMemRelease(handle);
        return 1;
    }

    // Write 8-byte size header
    outfile.write(reinterpret_cast<const char*>(&size), 8);
    // Write fabric handle
    outfile.write(reinterpret_cast<const char*>(&fabricHandle), sizeof(CUmemFabricHandle_v1));
    outfile.close();

    std::cout << "Data written to data.bin. Press Enter to continue..." << std::endl;
    std::cin.get();

    // Cleanup
    result = cuMemUnmap(ptr, size);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to unmap memory: " << getCudaErrorString(result) << std::endl;
    }

    result = cuMemAddressFree(ptr, size);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to free address range: " << getCudaErrorString(result) << std::endl;
    }

    result = cuMemRelease(handle);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to release memory handle: " << getCudaErrorString(result) << std::endl;
    }
    
    result = cuCtxDestroy(context);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to destroy CUDA context: " << getCudaErrorString(result) << std::endl;
    }

    return 0;
}

// receiver.cpp
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <cstring>

// Helper function to get CUDA error string
const char* getCudaErrorString(CUresult error) {
    const char* errorString;
    cuGetErrorString(error, &errorString);
    return errorString;
}

int main() {
    // Initialize CUDA
    CUresult result = cuInit(0);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to initialize CUDA: " << getCudaErrorString(result) << std::endl;
        return 1;
    }

    // Get CUDA device
    CUdevice device;
    result = cuDeviceGet(&device, 0);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to get CUDA device: " << getCudaErrorString(result) << std::endl;
        return 1;
    }

    // Create CUDA context
    CUcontext context;
    result = cuCtxCreate(&context, 0, device);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to create CUDA context: " << getCudaErrorString(result) << std::endl;
        return 1;
    }

    // Read from file
    std::ifstream infile("data.bin", std::ios::binary);
    if (!infile) {
        std::cerr << "Failed to open input file: " << strerror(errno) << std::endl;
        return 1;
    }

    // Read 8-byte size header
    size_t size;
    infile.read(reinterpret_cast<char*>(&size), 8);
    std::cout << "Read allocation size: " << size << " bytes" << std::endl;

    // Read fabric handle
    CUmemFabricHandle_v1 fabricHandle;
    infile.read(reinterpret_cast<char*>(&fabricHandle), sizeof(CUmemFabricHandle_v1));
    std::cout << "Read fabric handle value: " << reinterpret_cast<uintptr_t>(&fabricHandle) << std::endl;
    infile.close();

    // Import handle
    CUmemGenericAllocationHandle handle;
    std::cout << "Importing handle..." << std::endl;
    result = cuMemImportFromShareableHandle(
        &handle,
        reinterpret_cast<void*>(&fabricHandle),
        CU_MEM_HANDLE_TYPE_FABRIC
    );
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to import handle: " << getCudaErrorString(result) << std::endl;
        return 1;
    }
    std::cout << "Successfully imported handle: " << handle << std::endl;

    // Reserve address range
    CUdeviceptr ptr;
    std::cout << "Reserving address range of size: " << size << std::endl;
    result = cuMemAddressReserve(&ptr, size, 0, 0, 0);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to reserve address range: " << getCudaErrorString(result) << std::endl;
        return 1;
    }
    std::cout << "Successfully reserved address range at: " << ptr << std::endl;

    // Map the memory
    std::cout << "Mapping memory..." << std::endl;
    result = cuMemMap(ptr, size, 0, handle, 0);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to map memory: " << getCudaErrorString(result) << std::endl;
        cuMemAddressFree(ptr, size);
        return 1;
    }
    std::cout << "Successfully mapped memory" << std::endl;

    // Set access properties
    CUmemAccessDesc accessDesc = {};
    accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    accessDesc.location.id = 0;  // Use device 0
    accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;

    std::cout << "Setting memory access properties..." << std::endl;
    result = cuMemSetAccess(ptr, size, &accessDesc, 1);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to set memory access: " << getCudaErrorString(result) << std::endl;
        cuMemUnmap(ptr, size);
        cuMemAddressFree(ptr, size);
        return 1;
    }
    std::cout << "Successfully set memory access properties" << std::endl;

    std::cout << "Successfully imported and mapped memory at address: " << ptr << std::endl;
    std::cout << "Press Enter to continue..." << std::endl;
    std::cin.get();

    // Cleanup
    result = cuMemUnmap(ptr, size);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to unmap memory: " << getCudaErrorString(result) << std::endl;
    }

    result = cuMemAddressFree(ptr, size);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to free address range: " << getCudaErrorString(result) << std::endl;
    }

    result = cuMemRelease(handle);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to release memory handle: " << getCudaErrorString(result) << std::endl;
    }

    result = cuCtxDestroy(context);
    if (result != CUDA_SUCCESS) {
        std::cerr << "Failed to destroy CUDA context: " << getCudaErrorString(result) << std::endl;
    }

    return 0;
}

Compile with

$ nvcc receiver.cpp -o receiver -lcuda
$ nvcc sender.cpp -o sender -lcuda

In one shell, execute ./sender , and in another shell, execute ./receiver .

We can see:

(py310) ➜  test_pidfd ./sender
Creating memory handle with size: 20971520 bytes
Successfully created memory handle
Reserving address range
Successfully reserved address range at: 139849545809920
Mapping memory
Successfully mapped memory
Setting memory access properties
Successfully set memory access properties
Exporting handle to fabric handle
Original handle value: 94556404496384
Allocation size: 20971520 bytes
Successfully exported handle to fabric handle
Fabric handle value: 140735970184032
Data written to data.bin. Press Enter to continue...

(py310) ➜  test_pidfd ./receiver
Read allocation size: 20971520 bytes
Read fabric handle value: 140734345409808
Importing handle...
Successfully imported handle: 94798849087488
Reserving address range of size: 20971520
Successfully reserved address range at: 140063992184832
Mapping memory...
Successfully mapped memory
Setting memory access properties...
Successfully set memory access properties
Successfully imported and mapped memory at address: 140063992184832
Press Enter to continue...

The data is 72 bytes, 8 byte header (for size) and 64 byte for the fabric handle.

	import os
	from typing import List
	# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
	import torch
	import torch.distributed as dist

	dist.init_process_group(backend="gloo")
	rank = local_rank = dist.get_rank()
	world_size = dist.get_world_size()
	torch.cuda.set_device(local_rank)

	def share_tensor(A: torch.Tensor, group=None) -> List[torch.Tensor]:
	from torch.multiprocessing.reductions import reduce_tensor
	A_meta = reduce_tensor(A)
	tensor_metas = [None] * world_size
	dist.all_gather_object(tensor_metas, A_meta, group=group)

	rank = dist.get_rank(group)

	all_tensors = []
	for i, obj in enumerate(tensor_metas):
	func = obj[0]
	args = list(obj[1])
	args[6] = A.device.index
	if i != rank:
	all_tensors.append(func(*args))
	else:
	all_tensors.append(A)

	return all_tensors

	A = torch.ones((10,), device=local_rank) * rank

	all_tensors = share_tensor(A)

	dist.barrier()
	torch.cuda.synchronize()

	if rank == 0:
	for x in all_tensors:
	x.zero_()

	dist.barrier()
	torch.cuda.synchronize()

	for i, x in enumerate(all_tensors):
	print(f"{rank=}, {i=}, {x=}")

youkaichao/ipc.py

youkaichao commented Nov 5, 2024

Uh oh!

youkaichao commented Nov 13, 2024

Uh oh!

youkaichao commented Jun 12, 2025

Uh oh!

youkaichao commented Jun 12, 2025

Uh oh!

youkaichao commented Jun 15, 2025 •

edited

Loading

Uh oh!

youkaichao/ipc.py

youkaichao commented Nov 5, 2024

Uh oh!

youkaichao commented Nov 13, 2024

Uh oh!

youkaichao commented Jun 12, 2025

Uh oh!

youkaichao commented Jun 12, 2025

Uh oh!

youkaichao commented Jun 15, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

youkaichao commented Jun 15, 2025 •

edited

Loading