Skip to content

Instantly share code, notes, and snippets.

@nicolasnoble
Last active May 14, 2026 16:02
Show Gist options
  • Select an option

  • Save nicolasnoble/e0e57eb5a1b902057ae3d1df59c039cf to your computer and use it in GitHub Desktop.

Select an option

Save nicolasnoble/e0e57eb5a1b902057ae3d1df59c039cf to your computer and use it in GitHub Desktop.
Minimal reproducer: UCX cuda_copy MD silently truncates ucp_mem_map registration extent for multi-handle CUDA VMM allocations
/*
* Minimal reproducer for the UCX cuda_copy MD multi-handle VMM registration
* truncation bug.
*
* SYMPTOM
* ucp_put_nbx on a CUDA buffer composed of multiple cuMemCreate handles
* mapped contiguously into one VA range fails with IBV_WC_LOC_PROT_ERR
* (CQE syndrome 0x4) as soon as the put crosses a handle boundary.
*
* [host:pid:0:tid] ib_mlx5_log.c:179 Local protection error on
* mlx5_0:1/IB (synd 0x4 vend 0x53 hw_synd 0/157)
* [host:pid:0:tid] ib_mlx5_log.c:179 RC QP ... wqe[0]: RDMA_WRITE
* [rva ... rkey ...] [va ... len 4194304 lkey ...]
*
* The MR returned by ucp_mem_map covers only the first cuMem handle (one
* chunk worth of physical pages), not the full mapped VA range.
*
* ROOT CAUSE
* uct_cuda_copy_md_sync_memops_get_address_range() in
* src/uct/cuda/cuda_copy/cuda_copy_md.c calls cuMemGetAddressRange() on
* the base pointer and overwrites mem_info->alloc_length with its
* result. For multi-handle VMM, cuMemGetAddressRange() returns only the
* bounds of the cuMem handle containing the base pointer, not the full
* mapped range. That truncated length flows down to
* ibv_reg_dmabuf_mr(len=chunk_size).
*
* WORKAROUND
* UCX_CUDA_COPY_REG_WHOLE_ALLOC=off
*
* BUILD
* gcc -O2 -Wall -g \
* -I/path/to/cuda/include \
* -o ucx_vmm_minimal ucx_vmm_minimal.c \
* -lucp -lucs -lcuda
*
* USAGE
* server (recv): UCX_NET_DEVICES=mlx5_0:1 ./ucx_vmm_minimal
* client (send): UCX_NET_DEVICES=mlx5_0:1 ./ucx_vmm_minimal <server_ip>
*
* Expected with vanilla UCX: client crashes with Local protection error.
* Expected with UCX_CUDA_COPY_REG_WHOLE_ALLOC=off: clean transfer.
*
* TESTED AGAINST
* UCX 1.20.0, CUDA 12.x, NVIDIA driver supporting multi-handle VMM
* (cuMemCreate + cuMemMap + cuMemAddressReserve), an HCA that supports
* GPUDirect RDMA via dmabuf (e.g. ConnectX-6/7 with nvidia-peermem or
* newer drivers).
*
* FLOW (both ends do steps 1-8, then split at step 9 for the actual put)
* 1. Initialise CUDA, retain primary context on device 0.
* 2. Build a multi-handle VMM buffer: N_CHUNKS cuMemCreate handles,
* mapped contiguously into a single cuMemAddressReserve VA range.
* 3. ucp_init + ucp_worker_create with RMA + 64-bit atomics.
* 4. ucp_worker_get_address - produce a wireup blob for the peer.
* 5. ucp_mem_map(addr, length=TOTAL_BYTES, UCS_MEMORY_TYPE_CUDA) -
* THIS IS WHERE THE BUG LIVES. UCX shrinks the registration to
* one cuMem handle internally; the returned memh's lkey only
* covers the first chunk.
* 6. ucp_rkey_pack - serialise the memh for the peer.
* 7. Trivial TCP OOB exchange:
* a. swap worker addresses
* b. swap remote VA (so each side knows where to put/get)
* c. swap packed rkey
* 8. ucp_ep_create with the peer's worker address; rkey_unpack the
* peer's packed rkey.
* 9. Client only: ucp_put_nbx the full TOTAL_BYTES from local buf
* to remote VA. Vanilla UCX submits an RC RDMA_WRITE WQE with
* len=TOTAL_BYTES but lkey covers only CHUNK_BYTES; HCA returns
* IBV_WC_LOC_PROT_ERR as soon as the translation crosses the
* chunk boundary. Server only: block on the OOB socket until the
* client signals completion (the put itself is one-sided).
*/
#define _GNU_SOURCE
#include <arpa/inet.h>
#include <errno.h>
#include <inttypes.h>
#include <netinet/in.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>
#include <cuda.h>
#include <ucp/api/ucp.h>
#define OOB_PORT 19292
#define N_CHUNKS 2
#define CHUNK_BYTES (2ULL * 1024 * 1024)
#define TOTAL_BYTES ((size_t)N_CHUNKS * CHUNK_BYTES)
/* Evaluate a CUDA driver API call exactly once. If the result is anything
 * other than CUDA_SUCCESS, print file:line, the call text, the numeric
 * result and the driver's error string ("?" when none is provided) to
 * stderr, then terminate the process with exit(1). */
#define CHECK_CU(call) \
    do { \
        CUresult cu_rc_ = (call); \
        if (cu_rc_ != CUDA_SUCCESS) { \
            const char *cu_msg_ = NULL; \
            cuGetErrorString(cu_rc_, &cu_msg_); \
            if (cu_msg_ == NULL) { \
                cu_msg_ = "?"; \
            } \
            fprintf(stderr, "%s:%d: %s -> %d %s\n", \
                    __FILE__, __LINE__, #call, cu_rc_, cu_msg_); \
            exit(1); \
        } \
    } while (0)
/* Evaluate a UCX call exactly once. If it does not return UCS_OK, print
 * file:line, the call text and ucs_status_string() of the result to stderr,
 * then terminate the process with exit(1). */
#define CHECK_UCS(call) \
    do { \
        ucs_status_t ucs_rc_ = (call); \
        if (ucs_rc_ != UCS_OK) { \
            fprintf(stderr, "%s:%d: %s -> %s\n", \
                    __FILE__, __LINE__, #call, ucs_status_string(ucs_rc_)); \
            exit(1); \
        } \
    } while (0)
/*
 * Server side of the out-of-band TCP channel.
 *
 * Listens on OOB_PORT on all IPv4 interfaces, accepts exactly one peer,
 * closes the listening socket and returns the connected socket.
 *
 * Every failure is fatal (message on stderr, exit(1)), so the returned
 * descriptor is always a valid connected socket. The caller owns it.
 */
static int oob_listen_accept(void) {
    int lsock = socket(AF_INET, SOCK_STREAM, 0);
    if (lsock < 0) {
        perror("socket");
        exit(1);
    }

    /* Best effort only: lets the reproducer be restarted immediately
     * without waiting for the old port binding to expire. */
    int one = 1;
    if (setsockopt(lsock, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)) < 0) {
        perror("setsockopt(SO_REUSEADDR)");
    }

    struct sockaddr_in a = {
        .sin_family      = AF_INET,
        .sin_addr.s_addr = htonl(INADDR_ANY),
        .sin_port        = htons(OOB_PORT),
    };
    if (bind(lsock, (struct sockaddr *)&a, sizeof(a)) < 0) {
        perror("bind");
        exit(1);
    }
    if (listen(lsock, 1) < 0) {
        perror("listen");
        exit(1);
    }
    printf("listening on %d\n", OOB_PORT);

    int c;
    do {
        c = accept(lsock, NULL, NULL);
    } while (c < 0 && errno == EINTR); /* retry if interrupted by a signal */
    if (c < 0) {
        perror("accept");
        exit(1);
    }

    close(lsock);
    return c;
}
/*
 * Client side of the out-of-band TCP channel.
 *
 * host: dotted-quad IPv4 address of the server (no name resolution).
 *
 * Tries to connect to host:OOB_PORT up to 30 times, one second apart, so
 * the client may be started before the server is listening. Returns the
 * connected socket; any failure is fatal (message on stderr, exit(1)).
 */
static int oob_connect(const char *host) {
    enum { MAX_ATTEMPTS = 30, RETRY_DELAY_SEC = 1 };

    struct sockaddr_in a = { .sin_family = AF_INET, .sin_port = htons(OOB_PORT) };
    if (inet_pton(AF_INET, host, &a.sin_addr) != 1) {
        fprintf(stderr, "need IPv4: %s\n", host);
        exit(1);
    }

    int last_err = 0;
    for (int t = 0; t < MAX_ATTEMPTS; t++) {
        /* A fresh socket per attempt: POSIX leaves the state of a socket
         * unspecified after a failed connect(), so it must not be reused. */
        int s = socket(AF_INET, SOCK_STREAM, 0);
        if (s < 0) {
            perror("socket");
            exit(1);
        }
        if (connect(s, (struct sockaddr *)&a, sizeof(a)) == 0) {
            return s;
        }
        last_err = errno; /* close()/sleep() below may overwrite errno */
        close(s);
        sleep(RETRY_DELAY_SEC);
    }

    errno = last_err;
    perror("connect");
    exit(1);
}
/* Write exactly len bytes from buf to fd, resuming after short writes and
 * EINTR. Any other failure is fatal (exit(1)). */
static void oob_write_all(int fd, const void *buf, size_t len) {
    const unsigned char *cur = buf;
    while (len > 0) {
        ssize_t w = write(fd, cur, len);
        if (w < 0) {
            if (errno == EINTR) {
                continue;
            }
            perror("oob");
            exit(1);
        }
        if (w == 0) {
            /* Should not happen for len > 0; bail out rather than spin. */
            fprintf(stderr, "oob: write made no progress\n");
            exit(1);
        }
        cur += w;
        len -= (size_t)w;
    }
}

/*
 * Send one length-prefixed blob on the OOB socket.
 *
 * s: connected socket
 * p: payload (may be NULL when n == 0)
 * n: payload size in bytes
 *
 * Wire format: a native-endian size_t length followed by n payload bytes,
 * so both peers must agree on size_t width and byte order. Errors are
 * fatal (exit(1)).
 */
static void oob_send(int s, const void *p, size_t n) {
    oob_write_all(s, &n, sizeof(n));
    if (n > 0) {
        oob_write_all(s, p, n);
    }
}
/* Read exactly len bytes from fd into buf, resuming after short reads and
 * EINTR. EOF before len bytes, or any other failure, is fatal (exit(1)). */
static void oob_read_all(int fd, void *buf, size_t len) {
    unsigned char *cur = buf;
    while (len > 0) {
        ssize_t r = read(fd, cur, len);
        if (r < 0) {
            if (errno == EINTR) {
                continue;
            }
            perror("oob");
            exit(1);
        }
        if (r == 0) {
            /* errno is not set on EOF, so perror() would be misleading. */
            fprintf(stderr, "oob: peer closed connection (%zu bytes missing)\n", len);
            exit(1);
        }
        cur += r;
        len -= (size_t)r;
    }
}

/*
 * Receive one length-prefixed blob (the counterpart of oob_send).
 *
 * s:     connected socket
 * out_n: receives the payload size in bytes
 *
 * Returns a malloc'd buffer holding the payload; the caller must free() it.
 * The result is never NULL, even for a zero-length payload. A TCP stream
 * may deliver a message in several pieces, so reads are looped until the
 * full header and payload have arrived. Errors are fatal (exit(1)).
 */
static void *oob_recv(int s, size_t *out_n) {
    size_t n = 0;
    oob_read_all(s, &n, sizeof(n));
    *out_n = n;

    /* malloc(0) may legally return NULL; allocate at least one byte so
     * NULL unambiguously means out-of-memory. */
    void *p = malloc(n > 0 ? n : 1);
    if (p == NULL) {
        fprintf(stderr, "oob: cannot allocate %zu bytes\n", n);
        exit(1);
    }
    oob_read_all(s, p, n);
    return p;
}
/* Build a multi-handle VMM CUDA buffer: N_CHUNKS separate cuMemCreate
* handles, all mapped into one contiguous VA range. */
static CUdeviceptr alloc_multi_handle_vmm(void) {
CUdeviceptr base = 0;
CHECK_CU(cuMemAddressReserve(&base, TOTAL_BYTES, 0, 0, 0));
CUmemAllocationProp prop = {0};
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
prop.location.id = 0;
prop.allocFlags.gpuDirectRDMACapable = 1;
prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
for (int i = 0; i < N_CHUNKS; i++) {
CUmemGenericAllocationHandle h;
CHECK_CU(cuMemCreate(&h, CHUNK_BYTES, &prop, 0));
CHECK_CU(cuMemMap(base + (CUdeviceptr)((size_t)i * CHUNK_BYTES),
CHUNK_BYTES, 0, h, 0));
CHECK_CU(cuMemRelease(h));
}
CUmemAccessDesc access = {0};
access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
access.location.id = 0;
access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
CHECK_CU(cuMemSetAccess(base, TOTAL_BYTES, &access, 1));
return base;
}
/* Completion record shared between a request callback and its waiter. */
struct req_state {
    int          done;   /* set to 1 by on_send once the request completes */
    ucs_status_t status; /* completion status reported to on_send */
};

/* Send-completion callback: stores the completion status in the
 * struct req_state passed as user data (u) and marks it done. The request
 * handle itself is not used here. */
static void on_send(void *req, ucs_status_t s, void *u) {
    struct req_state *state = (struct req_state *)u;

    (void)req;
    state->status = s;
    state->done   = 1;
}
/*
 * Block until a non-blocking UCX operation has finished.
 *
 * worker: worker to progress while waiting
 * req:    value returned by the *_nbx call; NULL means nothing to wait for
 * st:     state that the completion callback fills in
 * what:   operation name used as a prefix in error messages
 *
 * An error pointer in req, or a final status other than UCS_OK, is
 * reported on stderr and terminates the process with exit(1).
 */
static void wait_req(ucp_worker_h worker, ucs_status_ptr_t req, struct req_state *st, const char *what) {
    if (req == NULL) {
        return;
    }

    if (UCS_PTR_IS_ERR(req)) {
        fprintf(stderr, "%s: immediate %s\n", what, ucs_status_string(UCS_PTR_STATUS(req)));
        exit(1);
    }

    /* Spin on the worker until the callback flips the done flag. */
    for (;;) {
        if (st->done) {
            break;
        }
        ucp_worker_progress(worker);
    }

    /* Copy the status out before the request is handed back to UCX. */
    const ucs_status_t final_status = st->status;
    ucp_request_free(req);

    if (final_status == UCS_OK) {
        return;
    }
    fprintf(stderr, "%s: %s\n", what, ucs_status_string(final_status));
    exit(1);
}
int main(int argc, char **argv) {
setvbuf(stdout, NULL, _IOLBF, 0);
setvbuf(stderr, NULL, _IOLBF, 0);
int is_client = (argc == 2);
const char *server = is_client ? argv[1] : NULL;
/* --- 1. CUDA: retain primary context on device 0 ----------------- */
CHECK_CU(cuInit(0));
CUdevice dev; CHECK_CU(cuDeviceGet(&dev, 0));
CUcontext cuctx; CHECK_CU(cuDevicePrimaryCtxRetain(&cuctx, dev));
CHECK_CU(cuCtxPushCurrent(cuctx));
/* --- 2. Build the multi-handle VMM buffer ------------------------ *
* N_CHUNKS separate cuMemCreate handles, all mapped via cuMemMap
* into one contiguous VA range. This is the shape the bug triggers
* on (e.g. PyTorch arena allocators, fragmented VMM pools). */
CUdeviceptr buf = alloc_multi_handle_vmm();
printf("VMM allocated: base=0x%llx total=%zu (%d handles x %llu bytes)\n",
(unsigned long long)buf, TOTAL_BYTES, N_CHUNKS, (unsigned long long)CHUNK_BYTES);
/* --- 3. UCX context + worker ------------------------------------- *
* RMA + 64-bit atomics so the protocol layer wires up an RC lane
* with the dmabuf-registered MR (the path the bug lives on). */
ucp_config_t *cfg = NULL; CHECK_UCS(ucp_config_read(NULL, NULL, &cfg));
ucp_params_t up = { .field_mask = UCP_PARAM_FIELD_FEATURES,
.features = UCP_FEATURE_RMA | UCP_FEATURE_AMO64 };
ucp_context_h ctx; CHECK_UCS(ucp_init(&up, cfg, &ctx));
ucp_config_release(cfg);
ucp_worker_params_t wp = { .field_mask = UCP_WORKER_PARAM_FIELD_THREAD_MODE,
.thread_mode = UCS_THREAD_MODE_SINGLE };
ucp_worker_h worker; CHECK_UCS(ucp_worker_create(ctx, &wp, &worker));
/* --- 4. Wireup blob for the peer --------------------------------- */
ucp_address_t *local_addr; size_t local_addr_len;
CHECK_UCS(ucp_worker_get_address(worker, &local_addr, &local_addr_len));
/* --- 5. Register the VMM buffer ---------------------------------- *
* THIS IS WHERE THE BUG LIVES. We ask UCX to register the full
* TOTAL_BYTES (multi-handle VA range). Internally UCX calls
* cuMemGetAddressRange() on the base pointer to "expand to the
* whole allocation" - for multi-handle VMM this returns only one
* cuMem handle's bounds, and that becomes mem_info->alloc_length.
* The truncated length is then handed to cuMemGetHandleForAddressRange
* and ibv_reg_dmabuf_mr, so the resulting memh's lkey covers only
* CHUNK_BYTES rather than TOTAL_BYTES.
*
* Workaround: run with UCX_CUDA_COPY_REG_WHOLE_ALLOC=off, which
* skips the offending expansion and preserves the requested
* length. */
ucp_mem_map_params_t mp = {
.field_mask = UCP_MEM_MAP_PARAM_FIELD_ADDRESS |
UCP_MEM_MAP_PARAM_FIELD_LENGTH |
UCP_MEM_MAP_PARAM_FIELD_MEMORY_TYPE,
.address = (void*)(uintptr_t)buf,
.length = TOTAL_BYTES,
.memory_type = UCS_MEMORY_TYPE_CUDA,
};
ucp_mem_h memh; CHECK_UCS(ucp_mem_map(ctx, &mp, &memh));
/* --- 6. Serialise memh for the peer ------------------------------ */
void *rkey_buf = NULL; size_t rkey_len = 0;
CHECK_UCS(ucp_rkey_pack(ctx, memh, &rkey_buf, &rkey_len));
/* --- 7. OOB exchange over TCP ------------------------------------ *
* Trivial pairwise swap of: worker address, VA of the registered
* buffer, packed rkey. Length-prefixed blobs in each direction. */
int sock = is_client ? oob_connect(server) : oob_listen_accept();
oob_send(sock, local_addr, local_addr_len);
size_t remote_addr_len; void *remote_addr = oob_recv(sock, &remote_addr_len);
uint64_t my_va = (uint64_t)(uintptr_t)buf;
oob_send(sock, &my_va, sizeof(my_va));
uint64_t remote_va; { size_t n; void *p = oob_recv(sock, &n); memcpy(&remote_va, p, sizeof(remote_va)); free(p); }
oob_send(sock, rkey_buf, rkey_len);
size_t remote_rkey_len; void *remote_rkey = oob_recv(sock, &remote_rkey_len);
/* --- 8. Endpoint + remote rkey unpack ---------------------------- */
ucp_ep_params_t ep_params = { .field_mask = UCP_EP_PARAM_FIELD_REMOTE_ADDRESS,
.address = (ucp_address_t*)remote_addr };
ucp_ep_h ep; CHECK_UCS(ucp_ep_create(worker, &ep_params, &ep));
ucp_rkey_h rrkey; CHECK_UCS(ucp_ep_rkey_unpack(ep, remote_rkey, &rrkey));
/* --- 9. The actual transfer -------------------------------------- *
* Client posts a single ucp_put_nbx of the FULL TOTAL_BYTES. With
* vanilla UCX this issues an RDMA_WRITE WQE that walks off the end
* of the truncated MR at the first chunk boundary and the HCA
* returns IBV_WC_LOC_PROT_ERR (CQE syndrome 0x4). Server just
* progresses the worker until the client signals completion. */
if (is_client) {
struct req_state st = {0};
ucp_request_param_t prm = { .op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK |
UCP_OP_ATTR_FIELD_USER_DATA,
.cb.send = on_send,
.user_data = &st };
printf("issuing ucp_put_nbx %zu bytes...\n", TOTAL_BYTES);
ucs_status_ptr_t req = ucp_put_nbx(ep, (const void*)(uintptr_t)buf,
TOTAL_BYTES, remote_va, rrkey, &prm);
wait_req(worker, req, &st, "ucp_put_nbx");
printf("put completed OK\n");
/* Signal the server we're done so it can exit cleanly. */
uint32_t marker = 0xC0FFEE;
oob_send(sock, &marker, sizeof(marker));
} else {
/* Server doesn't actively progress here - the put is fully
* one-sided. We just wait on the OOB done marker. (In a
* realistic app you'd be progressing the worker too.) */
size_t n; void *p = oob_recv(sock, &n);
free(p);
printf("server: client signaled done\n");
}
/* Minimal reproducer: skip teardown - process exit handles cleanup.
* Real code should ucp_ep_close_nbx, ucp_rkey_destroy,
* ucp_rkey_buffer_release, ucp_mem_unmap, ucp_worker_destroy,
* ucp_cleanup, cuMemUnmap, cuMemAddressFree etc. */
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment