Last active
May 14, 2026 16:02
-
-
Save nicolasnoble/e0e57eb5a1b902057ae3d1df59c039cf to your computer and use it in GitHub Desktop.
Minimal reproducer: UCX cuda_copy MD silently truncates ucp_mem_map registration extent for multi-handle CUDA VMM allocations
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /* | |
| * Minimal reproducer for the UCX cuda_copy MD multi-handle VMM registration | |
| * truncation bug. | |
| * | |
| * SYMPTOM | |
| * ucp_put_nbx on a CUDA buffer composed of multiple cuMemCreate handles | |
| * mapped contiguously into one VA range fails with IBV_WC_LOC_PROT_ERR | |
| * (CQE syndrome 0x4) as soon as the put crosses a handle boundary. | |
| * | |
| * [host:pid:0:tid] ib_mlx5_log.c:179 Local protection error on | |
| * mlx5_0:1/IB (synd 0x4 vend 0x53 hw_synd 0/157) | |
| * [host:pid:0:tid] ib_mlx5_log.c:179 RC QP ... wqe[0]: RDMA_WRITE | |
| * [rva ... rkey ...] [va ... len 4194304 lkey ...] | |
| * | |
| * The MR returned by ucp_mem_map covers only the first cuMem handle (one | |
| * chunk worth of physical pages), not the full mapped VA range. | |
| * | |
| * ROOT CAUSE | |
| * uct_cuda_copy_md_sync_memops_get_address_range() in | |
| * src/uct/cuda/cuda_copy/cuda_copy_md.c calls cuMemGetAddressRange() on | |
| * the base pointer and overwrites mem_info->alloc_length with its | |
| * result. For multi-handle VMM, cuMemGetAddressRange() returns only the | |
| * bounds of the cuMem handle containing the base pointer, not the full | |
| * mapped range. That truncated length flows down to | |
| * ibv_reg_dmabuf_mr(len=chunk_size). | |
| * | |
| * WORKAROUND | |
| * UCX_CUDA_COPY_REG_WHOLE_ALLOC=off | |
| * | |
| * BUILD | |
| * gcc -O2 -Wall -g \ | |
| * -I/path/to/cuda/include \ | |
| * -o ucx_vmm_minimal ucx_vmm_minimal.c \ | |
| * -lucp -lucs -lcuda | |
| * | |
| * USAGE | |
| * server (recv): UCX_NET_DEVICES=mlx5_0:1 ./ucx_vmm_minimal | |
| * client (send): UCX_NET_DEVICES=mlx5_0:1 ./ucx_vmm_minimal <server_ip> | |
| * | |
| * Expected with vanilla UCX: client crashes with Local protection error. | |
| * Expected with UCX_CUDA_COPY_REG_WHOLE_ALLOC=off: clean transfer. | |
| * | |
| * TESTED AGAINST | |
| * UCX 1.20.0, CUDA 12.x, NVIDIA driver supporting multi-handle VMM | |
| * (cuMemCreate + cuMemMap + cuMemAddressReserve), an HCA that supports | |
| * GPUDirect RDMA via dmabuf (e.g. ConnectX-6/7 with nvidia-peermem or | |
| * newer drivers). | |
| * | |
| * FLOW (both ends do steps 1-7, then split for the actual put) | |
| * 1. Initialise CUDA, retain primary context on device 0. | |
| * 2. Build a multi-handle VMM buffer: N_CHUNKS cuMemCreate handles, | |
| * mapped contiguously into a single cuMemAddressReserve VA range. | |
| * 3. ucp_init + ucp_worker_create with RMA + 64-bit atomics. | |
| * 4. ucp_worker_get_address - produce a wireup blob for the peer. | |
| * 5. ucp_mem_map(addr, length=TOTAL_BYTES, UCS_MEMORY_TYPE_CUDA) - | |
| * THIS IS WHERE THE BUG LIVES. UCX shrinks the registration to | |
| * one cuMem handle internally; the returned memh's lkey only | |
| * covers the first chunk. | |
| * 6. ucp_rkey_pack - serialise the memh for the peer. | |
| * 7. Trivial TCP OOB exchange: | |
| * a. swap worker addresses | |
| * b. swap remote VA (so each side knows where to put/get) | |
| * c. swap packed rkey | |
| * 8. ucp_ep_create with the peer's worker address; rkey_unpack the | |
| * peer's packed rkey. | |
| * 9. Client only: ucp_put_nbx the full TOTAL_BYTES from local buf | |
| * to remote VA. Vanilla UCX submits an RC RDMA_WRITE WQE with | |
| * len=TOTAL_BYTES but lkey covers only CHUNK_BYTES; HCA returns | |
| * IBV_WC_LOC_PROT_ERR as soon as the translation crosses the | |
| * chunk boundary. Server only: block on the OOB socket until | |
| * the client signals completion (the put is one-sided). | |
| */ | |
| #define _GNU_SOURCE | |
| #include <arpa/inet.h> | |
| #include <inttypes.h> | |
| #include <netinet/in.h> | |
| #include <stdint.h> | |
| #include <stdio.h> | |
| #include <stdlib.h> | |
| #include <string.h> | |
| #include <sys/socket.h> | |
| #include <unistd.h> | |
| #include <cuda.h> | |
| #include <ucp/api/ucp.h> | |
/* TCP port used for the out-of-band (OOB) bootstrap exchange. */
#define OOB_PORT 19292
/* Number of separate cuMemCreate handles backing the buffer. */
#define N_CHUNKS 2
/* Size of each handle's physical allocation (2 MiB). */
#define CHUNK_BYTES (2ULL * 1024 * 1024)
/* Total mapped length in bytes. The cast wraps the whole product so the
 * expression really has type size_t: casting only N_CHUNKS leaves the
 * result as unsigned long long (inherited from CHUNK_BYTES), which does
 * not match the "%zu" conversions this value is printed with. */
#define TOTAL_BYTES ((size_t)(N_CHUNKS * CHUNK_BYTES))
/* Evaluate a CUDA driver API call exactly once. If the result is anything
 * other than CUDA_SUCCESS, print file:line, the call text, the numeric
 * result and the driver's error string ("?" when none is provided) to
 * stderr, then exit with status 1. */
#define CHECK_CU(call) \
    do { \
        CUresult cu_rc_ = (call); \
        if (cu_rc_ != CUDA_SUCCESS) { \
            const char *cu_msg_ = NULL; \
            cuGetErrorString(cu_rc_, &cu_msg_); \
            if (cu_msg_ == NULL) { \
                cu_msg_ = "?"; \
            } \
            fprintf(stderr, "%s:%d: %s -> %d %s\n", \
                    __FILE__, __LINE__, #call, cu_rc_, cu_msg_); \
            exit(1); \
        } \
    } while (0)
/* Evaluate a UCX call exactly once. If it returns anything other than
 * UCS_OK, print file:line, the call text and ucs_status_string() of the
 * result to stderr, then exit with status 1. */
#define CHECK_UCS(call) \
    do { \
        ucs_status_t ucs_rc_ = (call); \
        if (ucs_rc_ != UCS_OK) { \
            fprintf(stderr, "%s:%d: %s -> %s\n", \
                    __FILE__, __LINE__, #call, ucs_status_string(ucs_rc_)); \
            exit(1); \
        } \
    } while (0)
| static int oob_listen_accept(void) { | |
| int s = socket(AF_INET, SOCK_STREAM, 0); | |
| int one = 1; | |
| setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)); | |
| struct sockaddr_in a = { .sin_family = AF_INET, .sin_addr.s_addr = htonl(INADDR_ANY), .sin_port = htons(OOB_PORT) }; | |
| if (bind(s, (struct sockaddr*)&a, sizeof(a)) < 0) { perror("bind"); exit(1); } | |
| listen(s, 1); | |
| printf("listening on %d\n", OOB_PORT); | |
| int c = accept(s, NULL, NULL); | |
| close(s); | |
| return c; | |
| } | |
| static int oob_connect(const char *host) { | |
| int s = socket(AF_INET, SOCK_STREAM, 0); | |
| struct sockaddr_in a = { .sin_family = AF_INET, .sin_port = htons(OOB_PORT) }; | |
| if (inet_pton(AF_INET, host, &a.sin_addr) != 1) { fprintf(stderr, "need IPv4: %s\n", host); exit(1); } | |
| for (int t = 0; t < 30; t++) { | |
| if (connect(s, (struct sockaddr*)&a, sizeof(a)) == 0) return s; | |
| sleep(1); | |
| } | |
| perror("connect"); exit(1); | |
| } | |
/* Write exactly `n` bytes from `p` to descriptor `s`, continuing after
 * short writes. A write() result <= 0 prints "oob" via perror() and exits
 * with status 1. */
static void oob_write_all(int s, const void *p, size_t n) {
    const char *cur = p;
    while (n > 0) {
        ssize_t w = write(s, cur, n);
        if (w <= 0) {
            perror("oob");
            exit(1);
        }
        cur += w;
        n -= (size_t)w;
    }
}

/* Send one length-prefixed blob on the OOB descriptor `s`: first the
 * native size_t length `n`, then `n` payload bytes from `p`. `p` is not
 * touched when n == 0. Both parts are written in full even if the kernel
 * accepts them in several pieces; any write failure exits the process. */
static void oob_send(int s, const void *p, size_t n) {
    oob_write_all(s, &n, sizeof(n));
    oob_write_all(s, p, n);
}
/* Read exactly `n` bytes from descriptor `s` into `p`, continuing after
 * short reads (a stream socket may deliver a blob in several pieces).
 * A read error or an end-of-file before `n` bytes arrived prints a
 * diagnostic and exits with status 1. */
static void oob_read_all(int s, void *p, size_t n) {
    char *cur = p;
    while (n > 0) {
        ssize_t r = read(s, cur, n);
        if (r < 0) {
            perror("oob");
            exit(1);
        }
        if (r == 0) {
            /* EOF does not set errno, so perror() would be misleading. */
            fprintf(stderr, "oob: peer closed the connection\n");
            exit(1);
        }
        cur += r;
        n -= (size_t)r;
    }
}

/* Receive one length-prefixed blob from the OOB descriptor `s`: a native
 * size_t length followed by that many payload bytes.
 *
 * Returns a malloc'ed buffer holding the payload (never NULL, even for a
 * zero-length blob) and stores the payload length in *out_n. The caller
 * owns the buffer and must free() it. Any I/O or allocation failure exits
 * the process with status 1. */
static void *oob_recv(int s, size_t *out_n) {
    size_t n = 0;
    oob_read_all(s, &n, sizeof(n));

    /* malloc(0) may legally return NULL; ask for one byte instead. */
    void *p = malloc(n ? n : 1);
    if (p == NULL) {
        fprintf(stderr, "oob: cannot allocate %zu bytes\n", n);
        exit(1);
    }
    oob_read_all(s, p, n);

    *out_n = n;
    return p;
}
| /* Build a multi-handle VMM CUDA buffer: N_CHUNKS separate cuMemCreate | |
| * handles, all mapped into one contiguous VA range. */ | |
| static CUdeviceptr alloc_multi_handle_vmm(void) { | |
| CUdeviceptr base = 0; | |
| CHECK_CU(cuMemAddressReserve(&base, TOTAL_BYTES, 0, 0, 0)); | |
| CUmemAllocationProp prop = {0}; | |
| prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; | |
| prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; | |
| prop.location.id = 0; | |
| prop.allocFlags.gpuDirectRDMACapable = 1; | |
| prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; | |
| for (int i = 0; i < N_CHUNKS; i++) { | |
| CUmemGenericAllocationHandle h; | |
| CHECK_CU(cuMemCreate(&h, CHUNK_BYTES, &prop, 0)); | |
| CHECK_CU(cuMemMap(base + (CUdeviceptr)((size_t)i * CHUNK_BYTES), | |
| CHUNK_BYTES, 0, h, 0)); | |
| CHECK_CU(cuMemRelease(h)); | |
| } | |
| CUmemAccessDesc access = {0}; | |
| access.location.type = CU_MEM_LOCATION_TYPE_DEVICE; | |
| access.location.id = 0; | |
| access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; | |
| CHECK_CU(cuMemSetAccess(base, TOTAL_BYTES, &access, 1)); | |
| return base; | |
| } | |
| struct req_state { int done; ucs_status_t status; }; | |
| static void on_send(void *req, ucs_status_t s, void *u) { | |
| (void)req; struct req_state *r = u; r->status = s; r->done = 1; | |
| } | |
| static void wait_req(ucp_worker_h worker, ucs_status_ptr_t req, struct req_state *st, const char *what) { | |
| if (req == NULL) return; | |
| if (UCS_PTR_IS_ERR(req)) { | |
| fprintf(stderr, "%s: immediate %s\n", what, ucs_status_string(UCS_PTR_STATUS(req))); | |
| exit(1); | |
| } | |
| while (!st->done) ucp_worker_progress(worker); | |
| ucs_status_t s = st->status; | |
| ucp_request_free(req); | |
| if (s != UCS_OK) { fprintf(stderr, "%s: %s\n", what, ucs_status_string(s)); exit(1); } | |
| } | |
| int main(int argc, char **argv) { | |
| setvbuf(stdout, NULL, _IOLBF, 0); | |
| setvbuf(stderr, NULL, _IOLBF, 0); | |
| int is_client = (argc == 2); | |
| const char *server = is_client ? argv[1] : NULL; | |
| /* --- 1. CUDA: retain primary context on device 0 ----------------- */ | |
| CHECK_CU(cuInit(0)); | |
| CUdevice dev; CHECK_CU(cuDeviceGet(&dev, 0)); | |
| CUcontext cuctx; CHECK_CU(cuDevicePrimaryCtxRetain(&cuctx, dev)); | |
| CHECK_CU(cuCtxPushCurrent(cuctx)); | |
| /* --- 2. Build the multi-handle VMM buffer ------------------------ * | |
| * N_CHUNKS separate cuMemCreate handles, all mapped via cuMemMap | |
| * into one contiguous VA range. This is the shape the bug triggers | |
| * on (e.g. PyTorch arena allocators, fragmented VMM pools). */ | |
| CUdeviceptr buf = alloc_multi_handle_vmm(); | |
| printf("VMM allocated: base=0x%llx total=%zu (%d handles x %llu bytes)\n", | |
| (unsigned long long)buf, TOTAL_BYTES, N_CHUNKS, (unsigned long long)CHUNK_BYTES); | |
| /* --- 3. UCX context + worker ------------------------------------- * | |
| * RMA + 64-bit atomics so the protocol layer wires up an RC lane | |
| * with the dmabuf-registered MR (the path the bug lives on). */ | |
| ucp_config_t *cfg = NULL; CHECK_UCS(ucp_config_read(NULL, NULL, &cfg)); | |
| ucp_params_t up = { .field_mask = UCP_PARAM_FIELD_FEATURES, | |
| .features = UCP_FEATURE_RMA | UCP_FEATURE_AMO64 }; | |
| ucp_context_h ctx; CHECK_UCS(ucp_init(&up, cfg, &ctx)); | |
| ucp_config_release(cfg); | |
| ucp_worker_params_t wp = { .field_mask = UCP_WORKER_PARAM_FIELD_THREAD_MODE, | |
| .thread_mode = UCS_THREAD_MODE_SINGLE }; | |
| ucp_worker_h worker; CHECK_UCS(ucp_worker_create(ctx, &wp, &worker)); | |
| /* --- 4. Wireup blob for the peer --------------------------------- */ | |
| ucp_address_t *local_addr; size_t local_addr_len; | |
| CHECK_UCS(ucp_worker_get_address(worker, &local_addr, &local_addr_len)); | |
| /* --- 5. Register the VMM buffer ---------------------------------- * | |
| * THIS IS WHERE THE BUG LIVES. We ask UCX to register the full | |
| * TOTAL_BYTES (multi-handle VA range). Internally UCX calls | |
| * cuMemGetAddressRange() on the base pointer to "expand to the | |
| * whole allocation" - for multi-handle VMM this returns only one | |
| * cuMem handle's bounds, and that becomes mem_info->alloc_length. | |
| * The truncated length is then handed to cuMemGetHandleForAddressRange | |
| * and ibv_reg_dmabuf_mr, so the resulting memh's lkey covers only | |
| * CHUNK_BYTES rather than TOTAL_BYTES. | |
| * | |
| * Workaround: run with UCX_CUDA_COPY_REG_WHOLE_ALLOC=off, which | |
| * skips the offending expansion and preserves the requested | |
| * length. */ | |
| ucp_mem_map_params_t mp = { | |
| .field_mask = UCP_MEM_MAP_PARAM_FIELD_ADDRESS | | |
| UCP_MEM_MAP_PARAM_FIELD_LENGTH | | |
| UCP_MEM_MAP_PARAM_FIELD_MEMORY_TYPE, | |
| .address = (void*)(uintptr_t)buf, | |
| .length = TOTAL_BYTES, | |
| .memory_type = UCS_MEMORY_TYPE_CUDA, | |
| }; | |
| ucp_mem_h memh; CHECK_UCS(ucp_mem_map(ctx, &mp, &memh)); | |
| /* --- 6. Serialise memh for the peer ------------------------------ */ | |
| void *rkey_buf = NULL; size_t rkey_len = 0; | |
| CHECK_UCS(ucp_rkey_pack(ctx, memh, &rkey_buf, &rkey_len)); | |
| /* --- 7. OOB exchange over TCP ------------------------------------ * | |
| * Trivial pairwise swap of: worker address, VA of the registered | |
| * buffer, packed rkey. Length-prefixed blobs in each direction. */ | |
| int sock = is_client ? oob_connect(server) : oob_listen_accept(); | |
| oob_send(sock, local_addr, local_addr_len); | |
| size_t remote_addr_len; void *remote_addr = oob_recv(sock, &remote_addr_len); | |
| uint64_t my_va = (uint64_t)(uintptr_t)buf; | |
| oob_send(sock, &my_va, sizeof(my_va)); | |
| uint64_t remote_va; { size_t n; void *p = oob_recv(sock, &n); memcpy(&remote_va, p, sizeof(remote_va)); free(p); } | |
| oob_send(sock, rkey_buf, rkey_len); | |
| size_t remote_rkey_len; void *remote_rkey = oob_recv(sock, &remote_rkey_len); | |
| /* --- 8. Endpoint + remote rkey unpack ---------------------------- */ | |
| ucp_ep_params_t ep_params = { .field_mask = UCP_EP_PARAM_FIELD_REMOTE_ADDRESS, | |
| .address = (ucp_address_t*)remote_addr }; | |
| ucp_ep_h ep; CHECK_UCS(ucp_ep_create(worker, &ep_params, &ep)); | |
| ucp_rkey_h rrkey; CHECK_UCS(ucp_ep_rkey_unpack(ep, remote_rkey, &rrkey)); | |
| /* --- 9. The actual transfer -------------------------------------- * | |
| * Client posts a single ucp_put_nbx of the FULL TOTAL_BYTES. With | |
| * vanilla UCX this issues an RDMA_WRITE WQE that walks off the end | |
| * of the truncated MR at the first chunk boundary and the HCA | |
| * returns IBV_WC_LOC_PROT_ERR (CQE syndrome 0x4). Server just | |
| * progresses the worker until the client signals completion. */ | |
| if (is_client) { | |
| struct req_state st = {0}; | |
| ucp_request_param_t prm = { .op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK | | |
| UCP_OP_ATTR_FIELD_USER_DATA, | |
| .cb.send = on_send, | |
| .user_data = &st }; | |
| printf("issuing ucp_put_nbx %zu bytes...\n", TOTAL_BYTES); | |
| ucs_status_ptr_t req = ucp_put_nbx(ep, (const void*)(uintptr_t)buf, | |
| TOTAL_BYTES, remote_va, rrkey, &prm); | |
| wait_req(worker, req, &st, "ucp_put_nbx"); | |
| printf("put completed OK\n"); | |
| /* Signal the server we're done so it can exit cleanly. */ | |
| uint32_t marker = 0xC0FFEE; | |
| oob_send(sock, &marker, sizeof(marker)); | |
| } else { | |
| /* Server doesn't actively progress here - the put is fully | |
| * one-sided. We just wait on the OOB done marker. (In a | |
| * realistic app you'd be progressing the worker too.) */ | |
| size_t n; void *p = oob_recv(sock, &n); | |
| free(p); | |
| printf("server: client signaled done\n"); | |
| } | |
| /* Minimal reproducer: skip teardown - process exit handles cleanup. | |
| * Real code should ucp_ep_close_nbx, ucp_rkey_destroy, | |
| * ucp_rkey_buffer_release, ucp_mem_unmap, ucp_worker_destroy, | |
| * ucp_cleanup, cuMemUnmap, cuMemAddressFree etc. */ | |
| return 0; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment