nicolasnoble · May 14, 2026 16:02
diff --git a/ucx_vmm_minimal.c b/ucx_vmm_minimal.c
 /*
 * Minimal reproducer for the UCX cuda_copy MD multi-handle VMM registration
 * truncation bug.
 *
 * SYMPTOM
 *   ucp_put_nbx on a CUDA buffer composed of multiple cuMemCreate handles
 *   mapped contiguously into one VA range fails with IBV_WC_LOC_PROT_ERR
 *   (CQE syndrome 0x4) as soon as the put crosses a handle boundary.
 *
 *     [host:pid:0:tid] ib_mlx5_log.c:179  Local protection error on
 *       mlx5_0:1/IB (synd 0x4 vend 0x53 hw_synd 0/157)
 *     [host:pid:0:tid] ib_mlx5_log.c:179  RC QP ... wqe[0]: RDMA_WRITE
 *       [rva ... rkey ...] [va ... len 4194304 lkey ...]
 *
 *   The MR returned by ucp_mem_map covers only the first cuMem handle (one
 *   chunk worth of physical pages), not the full mapped VA range.
 *
 * ROOT CAUSE
 *   uct_cuda_copy_md_sync_memops_get_address_range() in
 *   src/uct/cuda/cuda_copy/cuda_copy_md.c calls cuMemGetAddressRange() on
 *   the base pointer and overwrites mem_info->alloc_length with its
 *   result. For multi-handle VMM, cuMemGetAddressRange() returns only the
 *   bounds of the cuMem handle containing the base pointer, not the full
 *   mapped range. That truncated length flows down to
 *   ibv_reg_dmabuf_mr(len=chunk_size).
 *
 * WORKAROUND
 *   UCX_CUDA_COPY_REG_WHOLE_ALLOC=off
 *
 * BUILD
 *   gcc -O2 -Wall -g \
 *     -I/path/to/cuda/include \
 *     -o ucx_vmm_minimal ucx_vmm_minimal.c \
 *     -lucp -lucs -lcuda
 *
 * USAGE
 *   server (recv): UCX_NET_DEVICES=mlx5_0:1 ./ucx_vmm_minimal
 *   client (send): UCX_NET_DEVICES=mlx5_0:1 ./ucx_vmm_minimal <server_ip>
 *
 *   Expected with vanilla UCX: client crashes with Local protection error.
 *   Expected with UCX_CUDA_COPY_REG_WHOLE_ALLOC=off: clean transfer.
 *
 * TESTED AGAINST
 *   UCX 1.20.0, CUDA 12.x, NVIDIA driver supporting multi-handle VMM
 *   (cuMemCreate + cuMemMap + cuMemAddressReserve), an HCA that supports
 *   GPUDirect RDMA via dmabuf (e.g. ConnectX-6/7 with nvidia-peermem or
 *   newer drivers).
 *
 * FLOW (both ends do steps 1-8, then split for the actual put)
 *   1. Initialise CUDA, retain primary context on device 0.
 *   2. Build a multi-handle VMM buffer: N_CHUNKS cuMemCreate handles,
 *      mapped contiguously into a single cuMemAddressReserve VA range.
 *   3. ucp_init + ucp_worker_create with RMA + 64-bit atomics.
 *   4. ucp_worker_get_address - produce a wireup blob for the peer.
 *   5. ucp_mem_map(addr, length=TOTAL_BYTES, UCS_MEMORY_TYPE_CUDA) -
 *      THIS IS WHERE THE BUG LIVES. UCX shrinks the registration to
 *      one cuMem handle internally; the returned memh's lkey only
 *      covers the first chunk.
 *   6. ucp_rkey_pack - serialise the memh for the peer.
 *   7. Trivial TCP OOB exchange:
 *        a. swap worker addresses
 *        b. swap remote VA (so each side knows where to put/get)
 *        c. swap packed rkey
 *   8. ucp_ep_create with the peer's worker address; rkey_unpack the
 *      peer's packed rkey.
 *   9. Client only: ucp_put_nbx the full TOTAL_BYTES from local buf
 *      to remote VA. Vanilla UCX submits an RC RDMA_WRITE WQE with
 *      len=TOTAL_BYTES but lkey covers only CHUNK_BYTES; HCA returns
 *      IBV_WC_LOC_PROT_ERR as soon as the translation crosses the
 *      chunk boundary. Server only: block on the OOB socket until
 *      the client signals completion (the worker is not progressed).
 */

 #define _GNU_SOURCE
 #include <arpa/inet.h>
 #include <inttypes.h>
 #include <netinet/in.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/socket.h>
 #include <unistd.h>

 #include <cuda.h>
 #include <ucp/api/ucp.h>

 #define OOB_PORT     19292
 #define N_CHUNKS     2
 #define CHUNK_BYTES  (2ULL * 1024 * 1024)
 #define TOTAL_BYTES  ((size_t)N_CHUNKS * CHUNK_BYTES)

 /* Evaluate a CUDA driver API call exactly once. If the result is not
  * CUDA_SUCCESS, print file:line, the call text, the numeric result and the
  * string from cuGetErrorString() ("?" if none) to stderr, then exit(1). */
 #define CHECK_CU(call)                                                      \
    do {                                                                    \
        CUresult cu_rc_ = (call);                                           \
        if (cu_rc_ != CUDA_SUCCESS) {                                       \
            const char *cu_msg_ = NULL;                                     \
            cuGetErrorString(cu_rc_, &cu_msg_);                             \
            fprintf(stderr, "%s:%d: %s -> %d %s\n", __FILE__, __LINE__,     \
                    #call, cu_rc_, cu_msg_ ? cu_msg_ : "?");                \
            exit(1);                                                        \
        }                                                                   \
    } while (0)

 /* Evaluate a UCX call exactly once. If the status is not UCS_OK, print
  * file:line, the call text and ucs_status_string() of the status to
  * stderr, then exit(1). */
 #define CHECK_UCS(call)                                                     \
    do {                                                                    \
        ucs_status_t ucs_rc_ = (call);                                      \
        if (ucs_rc_ != UCS_OK) {                                            \
            fprintf(stderr, "%s:%d: %s -> %s\n", __FILE__, __LINE__,        \
                    #call, ucs_status_string(ucs_rc_));                     \
            exit(1);                                                        \
        }                                                                   \
    } while (0)

 /*
  * Server side of the out-of-band channel: listen on OOB_PORT on all IPv4
  * interfaces, accept exactly one connection and return its socket. The
  * listening socket is closed before returning.
  *
  * Every failing socket call is reported with perror() under its own name
  * and terminates the process with exit(1); only the SO_REUSEADDR option
  * is best-effort.
  */
 static int oob_listen_accept(void) {
    int s = socket(AF_INET, SOCK_STREAM, 0);
    if (s < 0) { perror("socket"); exit(1); }

    /* Best effort only: failing to set the option is not fatal. */
    int one = 1;
    if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)) < 0) {
        perror("setsockopt(SO_REUSEADDR)");
    }

    struct sockaddr_in a = { .sin_family = AF_INET, .sin_addr.s_addr = htonl(INADDR_ANY), .sin_port = htons(OOB_PORT) };
    if (bind(s, (struct sockaddr*)&a, sizeof(a)) < 0) { perror("bind"); exit(1); }
    if (listen(s, 1) < 0) { perror("listen"); exit(1); }
    printf("listening on %d\n", OOB_PORT);

    int c = accept(s, NULL, NULL);
    if (c < 0) { perror("accept"); exit(1); }
    close(s);
    return c;
 }

 /*
  * Client side of the out-of-band channel: connect to `host` (dotted IPv4
  * text) on OOB_PORT and return the connected socket.
  *
  * Up to 30 attempts are made, one second apart, so the client may be
  * started before the server. A fresh socket is created for every attempt
  * because the state of a socket after a failed connect() is unspecified.
  * A non-IPv4 host string, a socket() failure or exhausting all attempts
  * terminates the process with exit(1).
  */
 static int oob_connect(const char *host) {
    enum { OOB_CONNECT_ATTEMPTS = 30 };

    struct sockaddr_in a = { .sin_family = AF_INET, .sin_port = htons(OOB_PORT) };
    if (inet_pton(AF_INET, host, &a.sin_addr) != 1) { fprintf(stderr, "need IPv4: %s\n", host); exit(1); }

    for (int t = 0; ; t++) {
        int s = socket(AF_INET, SOCK_STREAM, 0);
        if (s < 0) { perror("socket"); exit(1); }
        if (connect(s, (struct sockaddr*)&a, sizeof(a)) == 0) return s;

        /* Report the last failure right away, before close()/sleep() can
         * disturb errno. */
        if (t + 1 >= OOB_CONNECT_ATTEMPTS) { perror("connect"); exit(1); }
        close(s);
        sleep(1);
    }
 }

 /* Write exactly n bytes from p to descriptor s, continuing after short
  * writes. Any write() result <= 0 is fatal: perror("oob") then exit(1). */
 static void oob_write_all(int s, const void *p, size_t n) {
    const char *cur = p;
    while (n > 0) {
        ssize_t k = write(s, cur, n);
        if (k <= 0) { perror("oob"); exit(1); }
        cur += k;
        n   -= (size_t)k;
    }
 }

 /*
  * Send one length-prefixed blob on the OOB channel: first the byte count
  * as a raw host-format size_t, then n payload bytes from p. When n is 0
  * only the length is sent and p is not accessed. Failures terminate the
  * process with exit(1).
  */
 static void oob_send(int s, const void *p, size_t n) {
    oob_write_all(s, &n, sizeof(n));
    oob_write_all(s, p, n);
 }

 /* Read exactly n bytes from descriptor s into p, continuing after short
  * reads. A read error or end-of-stream before n bytes is fatal (exit(1)). */
 static void oob_read_all(int s, void *p, size_t n) {
    char *cur = p;
    while (n > 0) {
        ssize_t k = read(s, cur, n);
        if (k < 0)  { perror("oob"); exit(1); }
        if (k == 0) { fprintf(stderr, "oob: peer closed the connection\n"); exit(1); }
        cur += k;
        n   -= (size_t)k;
    }
 }

 /*
  * Receive one length-prefixed blob written by oob_send(): a raw size_t
  * byte count followed by that many payload bytes.
  *
  * Stores the payload length in *out_n and returns a malloc()ed buffer
  * holding the payload; the caller owns it and must free() it. The buffer
  * is non-NULL even for an empty payload. Any read or allocation failure
  * terminates the process with exit(1).
  */
 static void *oob_recv(int s, size_t *out_n) {
    size_t n;
    oob_read_all(s, &n, sizeof(n));
    *out_n = n;

    /* Never ask for 0 bytes, so NULL always means allocation failure. */
    void *p = malloc(n ? n : 1);
    if (p == NULL) { perror("oob: malloc"); exit(1); }
    oob_read_all(s, p, n);
    return p;
 }

 /* Build a multi-handle VMM CUDA buffer: N_CHUNKS separate cuMemCreate
 * handles, all mapped into one contiguous VA range. */
 static CUdeviceptr alloc_multi_handle_vmm(void) {
    CUdeviceptr base = 0;
    CHECK_CU(cuMemAddressReserve(&base, TOTAL_BYTES, 0, 0, 0));

    CUmemAllocationProp prop = {0};
    prop.type                       = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type              = CU_MEM_LOCATION_TYPE_DEVICE;
    prop.location.id                = 0;
    prop.allocFlags.gpuDirectRDMACapable = 1;
    prop.requestedHandleTypes       = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;

    for (int i = 0; i < N_CHUNKS; i++) {
        CUmemGenericAllocationHandle h;
        CHECK_CU(cuMemCreate(&h, CHUNK_BYTES, &prop, 0));
        CHECK_CU(cuMemMap(base + (CUdeviceptr)((size_t)i * CHUNK_BYTES),
                          CHUNK_BYTES, 0, h, 0));
        CHECK_CU(cuMemRelease(h));
    }

    CUmemAccessDesc access = {0};
    access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    access.location.id   = 0;
    access.flags         = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    CHECK_CU(cuMemSetAccess(base, TOTAL_BYTES, &access, 1));
    return base;
 }

 /* Completion record for one non-blocking UCP request. */
 struct req_state {
    int          done;    /* set to 1 by on_send() */
    ucs_status_t status;  /* status handed to on_send() */
 };

 /* Completion callback: `u` is the caller's struct req_state; store the
  * status `s` in it and raise the done flag. `req` is not used. */
 static void on_send(void *req, ucs_status_t s, void *u) {
    struct req_state *state = u;

    (void)req;
    state->status = s;
    state->done   = 1;
 }

 /*
  * Wait for the non-blocking operation behind `req` to finish.
  *
  *   worker  worker to progress while waiting
  *   req     value returned by the non-blocking call
  *   st      state that the operation's callback fills in
  *   what    label used as prefix in error messages
  *
  * Returns at once when req is NULL. If req encodes an error, or the
  * operation finishes with a status other than UCS_OK, the status string
  * is printed to stderr and the process exits with code 1. Otherwise the
  * worker is progressed until st->done is set and the request is freed.
  */
 static void wait_req(ucp_worker_h worker, ucs_status_ptr_t req, struct req_state *st, const char *what) {
    if (req == NULL) {
        return;
    }

    if (UCS_PTR_IS_ERR(req)) {
        fprintf(stderr, "%s: immediate %s\n", what, ucs_status_string(UCS_PTR_STATUS(req)));
        exit(1);
    }

    while (st->done == 0) {
        ucp_worker_progress(worker);
    }

    /* Copy the status out before the request is released. */
    const ucs_status_t final_status = st->status;
    ucp_request_free(req);
    if (final_status == UCS_OK) {
        return;
    }

    fprintf(stderr, "%s: %s\n", what, ucs_status_string(final_status));
    exit(1);
 }

 /*
  * Reproducer entry point.
  *
  * With no argument the process is the server (it listens on OOB_PORT);
  * with one argument, the server's IPv4 address, it is the client. Both
  * sides build the multi-handle VMM buffer, register it with UCX and swap
  * worker address, buffer VA and packed rkey over TCP. The client then
  * issues one ucp_put_nbx of TOTAL_BYTES, flushes the endpoint and tells
  * the server it is done.
  *
  * Returns 0 on success and 1 on a usage error. Any CUDA, UCX or OOB
  * failure terminates the process with exit(1) inside the helpers.
  */
 int main(int argc, char **argv) {
    setvbuf(stdout, NULL, _IOLBF, 0);
    setvbuf(stderr, NULL, _IOLBF, 0);
    if (argc > 2) {
        /* Extra arguments used to select server mode silently. */
        fprintf(stderr, "usage: %s [server_ip]\n", argv[0]);
        return 1;
    }
    int is_client = (argc == 2);
    const char *server = is_client ? argv[1] : NULL;

    /* --- 1. CUDA: retain primary context on device 0 ----------------- */
    CHECK_CU(cuInit(0));
    CUdevice dev; CHECK_CU(cuDeviceGet(&dev, 0));
    CUcontext cuctx; CHECK_CU(cuDevicePrimaryCtxRetain(&cuctx, dev));
    CHECK_CU(cuCtxPushCurrent(cuctx));

    /* --- 2. Build the multi-handle VMM buffer ------------------------ *
     * N_CHUNKS separate cuMemCreate handles, all mapped via cuMemMap
     * into one contiguous VA range. This is the shape the bug triggers
     * on (e.g. PyTorch arena allocators, fragmented VMM pools). */
    CUdeviceptr buf = alloc_multi_handle_vmm();
    printf("VMM allocated: base=0x%llx total=%zu (%d handles x %llu bytes)\n",
           (unsigned long long)buf, TOTAL_BYTES, N_CHUNKS, (unsigned long long)CHUNK_BYTES);

    /* --- 3. UCX context + worker ------------------------------------- *
     * RMA + 64-bit atomics so the protocol layer wires up an RC lane
     * with the dmabuf-registered MR (the path the bug lives on). */
    ucp_config_t *cfg = NULL; CHECK_UCS(ucp_config_read(NULL, NULL, &cfg));
    ucp_params_t up = { .field_mask = UCP_PARAM_FIELD_FEATURES,
                        .features   = UCP_FEATURE_RMA | UCP_FEATURE_AMO64 };
    ucp_context_h ctx; CHECK_UCS(ucp_init(&up, cfg, &ctx));
    ucp_config_release(cfg);

    ucp_worker_params_t wp = { .field_mask  = UCP_WORKER_PARAM_FIELD_THREAD_MODE,
                               .thread_mode = UCS_THREAD_MODE_SINGLE };
    ucp_worker_h worker; CHECK_UCS(ucp_worker_create(ctx, &wp, &worker));

    /* --- 4. Wireup blob for the peer --------------------------------- */
    ucp_address_t *local_addr; size_t local_addr_len;
    CHECK_UCS(ucp_worker_get_address(worker, &local_addr, &local_addr_len));

    /* --- 5. Register the VMM buffer ---------------------------------- *
     * THIS IS WHERE THE BUG LIVES. We ask UCX to register the full
     * TOTAL_BYTES (multi-handle VA range). Internally UCX calls
     * cuMemGetAddressRange() on the base pointer to "expand to the
     * whole allocation" - for multi-handle VMM this returns only one
     * cuMem handle's bounds, and that becomes mem_info->alloc_length.
     * The truncated length is then handed to cuMemGetHandleForAddressRange
     * and ibv_reg_dmabuf_mr, so the resulting memh's lkey covers only
     * CHUNK_BYTES rather than TOTAL_BYTES.
     *
     * Workaround: run with UCX_CUDA_COPY_REG_WHOLE_ALLOC=off, which
     * skips the offending expansion and preserves the requested
     * length. */
    ucp_mem_map_params_t mp = {
        .field_mask  = UCP_MEM_MAP_PARAM_FIELD_ADDRESS |
                       UCP_MEM_MAP_PARAM_FIELD_LENGTH  |
                       UCP_MEM_MAP_PARAM_FIELD_MEMORY_TYPE,
        .address     = (void*)(uintptr_t)buf,
        .length      = TOTAL_BYTES,
        .memory_type = UCS_MEMORY_TYPE_CUDA,
    };
    ucp_mem_h memh; CHECK_UCS(ucp_mem_map(ctx, &mp, &memh));

    /* --- 6. Serialise memh for the peer ------------------------------ */
    void *rkey_buf = NULL; size_t rkey_len = 0;
    CHECK_UCS(ucp_rkey_pack(ctx, memh, &rkey_buf, &rkey_len));

    /* --- 7. OOB exchange over TCP ------------------------------------ *
     * Trivial pairwise swap of: worker address, VA of the registered
     * buffer, packed rkey. Length-prefixed blobs in each direction. */
    int sock = is_client ? oob_connect(server) : oob_listen_accept();
    oob_send(sock, local_addr, local_addr_len);
    size_t remote_addr_len; void *remote_addr = oob_recv(sock, &remote_addr_len);
    uint64_t my_va = (uint64_t)(uintptr_t)buf;
    oob_send(sock, &my_va, sizeof(my_va));
    uint64_t remote_va;
    {
        size_t n; void *p = oob_recv(sock, &n);
        /* Never copy more bytes than the peer actually sent. */
        if (n != sizeof(remote_va)) {
            fprintf(stderr, "oob: unexpected VA blob size %zu\n", n);
            exit(1);
        }
        memcpy(&remote_va, p, sizeof(remote_va));
        free(p);
    }
    oob_send(sock, rkey_buf, rkey_len);
    size_t remote_rkey_len; void *remote_rkey = oob_recv(sock, &remote_rkey_len);

    /* --- 8. Endpoint + remote rkey unpack ---------------------------- */
    ucp_ep_params_t ep_params = { .field_mask = UCP_EP_PARAM_FIELD_REMOTE_ADDRESS,
                                  .address    = (ucp_address_t*)remote_addr };
    ucp_ep_h ep; CHECK_UCS(ucp_ep_create(worker, &ep_params, &ep));
    ucp_rkey_h rrkey; CHECK_UCS(ucp_ep_rkey_unpack(ep, remote_rkey, &rrkey));

    /* --- 9. The actual transfer -------------------------------------- *
     * Client posts a single ucp_put_nbx of the FULL TOTAL_BYTES. With
     * vanilla UCX this issues an RDMA_WRITE WQE that walks off the end
     * of the truncated MR at the first chunk boundary and the HCA
     * returns IBV_WC_LOC_PROT_ERR (CQE syndrome 0x4). Server just
     * waits for the client's completion marker on the OOB socket. */
    if (is_client) {
        struct req_state st = {0};
        ucp_request_param_t prm = { .op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK |
                                                    UCP_OP_ATTR_FIELD_USER_DATA,
                                    .cb.send      = on_send,
                                    .user_data    = &st };
        printf("issuing ucp_put_nbx %zu bytes...\n", TOTAL_BYTES);
        ucs_status_ptr_t req = ucp_put_nbx(ep, (const void*)(uintptr_t)buf,
                                            TOTAL_BYTES, remote_va, rrkey, &prm);
        wait_req(worker, req, &st, "ucp_put_nbx");

        /* Put completion is local only; flush the endpoint so the data
         * has reached the peer before the server is told to exit. */
        struct req_state flush_st = {0};
        ucp_request_param_t flush_prm = prm;
        flush_prm.user_data = &flush_st;
        ucs_status_ptr_t flush_req = ucp_ep_flush_nbx(ep, &flush_prm);
        wait_req(worker, flush_req, &flush_st, "ucp_ep_flush_nbx");
        printf("put completed OK\n");

        /* Signal the server we're done so it can exit cleanly. */
        uint32_t marker = 0xC0FFEE;
        oob_send(sock, &marker, sizeof(marker));
    } else {
        /* Server doesn't actively progress here - the put is fully
         * one-sided. We just wait on the OOB done marker. (In a
         * realistic app you'd be progressing the worker too.) */
        size_t n; void *p = oob_recv(sock, &n);
        free(p);
        printf("server: client signaled done\n");
    }

    /* Minimal reproducer: skip teardown - process exit handles cleanup.
     * Real code should ucp_ep_close_nbx, ucp_rkey_destroy,
     * ucp_rkey_buffer_release, ucp_mem_unmap, ucp_worker_destroy,
     * ucp_cleanup, cuMemUnmap, cuMemAddressFree etc. */
    return 0;
 }
	/*
	* Minimal reproducer for the UCX cuda_copy MD multi-handle VMM registration
	* truncation bug.
	*
	* SYMPTOM
	* ucp_put_nbx on a CUDA buffer composed of multiple cuMemCreate handles
	* mapped contiguously into one VA range fails with IBV_WC_LOC_PROT_ERR
	* (CQE syndrome 0x4) as soon as the put crosses a handle boundary.
	*
	* [host:pid:0:tid] ib_mlx5_log.c:179 Local protection error on
	* mlx5_0:1/IB (synd 0x4 vend 0x53 hw_synd 0/157)
	* [host:pid:0:tid] ib_mlx5_log.c:179 RC QP ... wqe[0]: RDMA_WRITE
	* [rva ... rkey ...] [va ... len 4194304 lkey ...]
	*
	* The MR returned by ucp_mem_map covers only the first cuMem handle (one
	* chunk worth of physical pages), not the full mapped VA range.
	*
	* ROOT CAUSE
	* uct_cuda_copy_md_sync_memops_get_address_range() in
	* src/uct/cuda/cuda_copy/cuda_copy_md.c calls cuMemGetAddressRange() on
	* the base pointer and overwrites mem_info->alloc_length with its
	* result. For multi-handle VMM, cuMemGetAddressRange() returns only the
	* bounds of the cuMem handle containing the base pointer, not the full
	* mapped range. That truncated length flows down to
	* ibv_reg_dmabuf_mr(len=chunk_size).
	*
	* WORKAROUND
	* UCX_CUDA_COPY_REG_WHOLE_ALLOC=off
	*
	* BUILD
	* gcc -O2 -Wall -g \
	* -I/path/to/cuda/include \
	* -o ucx_vmm_minimal ucx_vmm_minimal.c \
	* -lucp -lucs -lcuda
	*
	* USAGE
	* server (recv): UCX_NET_DEVICES=mlx5_0:1 ./ucx_vmm_minimal
	* client (send): UCX_NET_DEVICES=mlx5_0:1 ./ucx_vmm_minimal <server_ip>
	*
	* Expected with vanilla UCX: client crashes with Local protection error.
	* Expected with UCX_CUDA_COPY_REG_WHOLE_ALLOC=off: clean transfer.
	*
	* TESTED AGAINST
	* UCX 1.20.0, CUDA 12.x, NVIDIA driver supporting multi-handle VMM
	* (cuMemCreate + cuMemMap + cuMemAddressReserve), an HCA that supports
	* GPUDirect RDMA via dmabuf (e.g. ConnectX-6/7 with nvidia-peermem or
	* newer drivers).
	*
	* FLOW (both ends do steps 1-8, then split for the actual put)
	* 1. Initialise CUDA, retain primary context on device 0.
	* 2. Build a multi-handle VMM buffer: N_CHUNKS cuMemCreate handles,
	* mapped contiguously into a single cuMemAddressReserve VA range.
	* 3. ucp_init + ucp_worker_create with RMA + 64-bit atomics.
	* 4. ucp_worker_get_address - produce a wireup blob for the peer.
	* 5. ucp_mem_map(addr, length=TOTAL_BYTES, UCS_MEMORY_TYPE_CUDA) -
	* THIS IS WHERE THE BUG LIVES. UCX shrinks the registration to
	* one cuMem handle internally; the returned memh's lkey only
	* covers the first chunk.
	* 6. ucp_rkey_pack - serialise the memh for the peer.
	* 7. Trivial TCP OOB exchange:
	* a. swap worker addresses
	* b. swap remote VA (so each side knows where to put/get)
	* c. swap packed rkey
	* 8. ucp_ep_create with the peer's worker address; rkey_unpack the
	* peer's packed rkey.
	* 9. Client only: ucp_put_nbx the full TOTAL_BYTES from local buf
	* to remote VA. Vanilla UCX submits an RC RDMA_WRITE WQE with
	* len=TOTAL_BYTES but lkey covers only CHUNK_BYTES; HCA returns
	* IBV_WC_LOC_PROT_ERR as soon as the translation crosses the
	* chunk boundary. Server only: block on the OOB socket until
	* the client signals completion (the worker is not progressed).
	*/

	#define _GNU_SOURCE
	#include <arpa/inet.h>
	#include <inttypes.h>
	#include <netinet/in.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <sys/socket.h>
	#include <unistd.h>

	#include <cuda.h>
	#include <ucp/api/ucp.h>

	#define OOB_PORT 19292
	#define N_CHUNKS 2
	#define CHUNK_BYTES (2ULL * 1024 * 1024)
	#define TOTAL_BYTES ((size_t)N_CHUNKS * CHUNK_BYTES)

	/* Evaluate a CUDA driver API call exactly once. If the result is not
	 * CUDA_SUCCESS, print file:line, the call text, the numeric result and the
	 * string from cuGetErrorString() ("?" if none) to stderr, then exit(1). */
	#define CHECK_CU(call)                                                      \
	    do {                                                                    \
	        CUresult cu_rc_ = (call);                                           \
	        if (cu_rc_ != CUDA_SUCCESS) {                                       \
	            const char *cu_msg_ = NULL;                                     \
	            cuGetErrorString(cu_rc_, &cu_msg_);                             \
	            fprintf(stderr, "%s:%d: %s -> %d %s\n", __FILE__, __LINE__,     \
	                    #call, cu_rc_, cu_msg_ ? cu_msg_ : "?");                \
	            exit(1);                                                        \
	        }                                                                   \
	    } while (0)

	/* Evaluate a UCX call exactly once. If the status is not UCS_OK, print
	 * file:line, the call text and ucs_status_string() of the status to
	 * stderr, then exit(1). */
	#define CHECK_UCS(call)                                                     \
	    do {                                                                    \
	        ucs_status_t ucs_rc_ = (call);                                      \
	        if (ucs_rc_ != UCS_OK) {                                            \
	            fprintf(stderr, "%s:%d: %s -> %s\n", __FILE__, __LINE__,        \
	                    #call, ucs_status_string(ucs_rc_));                     \
	            exit(1);                                                        \
	        }                                                                   \
	    } while (0)

	/*
	 * Server side of the out-of-band channel: listen on OOB_PORT on all IPv4
	 * interfaces, accept exactly one connection and return its socket. The
	 * listening socket is closed before returning.
	 *
	 * Every failing socket call is reported with perror() under its own name
	 * and terminates the process with exit(1); only the SO_REUSEADDR option
	 * is best-effort.
	 */
	static int oob_listen_accept(void) {
	    int s = socket(AF_INET, SOCK_STREAM, 0);
	    if (s < 0) { perror("socket"); exit(1); }

	    /* Best effort only: failing to set the option is not fatal. */
	    int one = 1;
	    if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)) < 0) {
	        perror("setsockopt(SO_REUSEADDR)");
	    }

	    struct sockaddr_in a = { .sin_family = AF_INET, .sin_addr.s_addr = htonl(INADDR_ANY), .sin_port = htons(OOB_PORT) };
	    if (bind(s, (struct sockaddr*)&a, sizeof(a)) < 0) { perror("bind"); exit(1); }
	    if (listen(s, 1) < 0) { perror("listen"); exit(1); }
	    printf("listening on %d\n", OOB_PORT);

	    int c = accept(s, NULL, NULL);
	    if (c < 0) { perror("accept"); exit(1); }
	    close(s);
	    return c;
	}

	/*
	 * Client side of the out-of-band channel: connect to `host` (dotted IPv4
	 * text) on OOB_PORT and return the connected socket.
	 *
	 * Up to 30 attempts are made, one second apart, so the client may be
	 * started before the server. A fresh socket is created for every attempt
	 * because the state of a socket after a failed connect() is unspecified.
	 * A non-IPv4 host string, a socket() failure or exhausting all attempts
	 * terminates the process with exit(1).
	 */
	static int oob_connect(const char *host) {
	    enum { OOB_CONNECT_ATTEMPTS = 30 };

	    struct sockaddr_in a = { .sin_family = AF_INET, .sin_port = htons(OOB_PORT) };
	    if (inet_pton(AF_INET, host, &a.sin_addr) != 1) { fprintf(stderr, "need IPv4: %s\n", host); exit(1); }

	    for (int t = 0; ; t++) {
	        int s = socket(AF_INET, SOCK_STREAM, 0);
	        if (s < 0) { perror("socket"); exit(1); }
	        if (connect(s, (struct sockaddr*)&a, sizeof(a)) == 0) return s;

	        /* Report the last failure right away, before close()/sleep() can
	         * disturb errno. */
	        if (t + 1 >= OOB_CONNECT_ATTEMPTS) { perror("connect"); exit(1); }
	        close(s);
	        sleep(1);
	    }
	}

	/* Write exactly n bytes from p to descriptor s, continuing after short
	 * writes. Any write() result <= 0 is fatal: perror("oob") then exit(1). */
	static void oob_write_all(int s, const void *p, size_t n) {
	    const char *cur = p;
	    while (n > 0) {
	        ssize_t k = write(s, cur, n);
	        if (k <= 0) { perror("oob"); exit(1); }
	        cur += k;
	        n   -= (size_t)k;
	    }
	}

	/*
	 * Send one length-prefixed blob on the OOB channel: first the byte count
	 * as a raw host-format size_t, then n payload bytes from p. When n is 0
	 * only the length is sent and p is not accessed. Failures terminate the
	 * process with exit(1).
	 */
	static void oob_send(int s, const void *p, size_t n) {
	    oob_write_all(s, &n, sizeof(n));
	    oob_write_all(s, p, n);
	}

	/* Read exactly n bytes from descriptor s into p, continuing after short
	 * reads. A read error or end-of-stream before n bytes is fatal (exit(1)). */
	static void oob_read_all(int s, void *p, size_t n) {
	    char *cur = p;
	    while (n > 0) {
	        ssize_t k = read(s, cur, n);
	        if (k < 0)  { perror("oob"); exit(1); }
	        if (k == 0) { fprintf(stderr, "oob: peer closed the connection\n"); exit(1); }
	        cur += k;
	        n   -= (size_t)k;
	    }
	}

	/*
	 * Receive one length-prefixed blob written by oob_send(): a raw size_t
	 * byte count followed by that many payload bytes.
	 *
	 * The return type and out_n are pointers: the body stores through out_n
	 * and returns the buffer, and callers use the returned pointer.
	 *
	 * Stores the payload length in *out_n and returns a malloc()ed buffer
	 * holding the payload; the caller owns it and must free() it. The buffer
	 * is non-NULL even for an empty payload. Any read or allocation failure
	 * terminates the process with exit(1).
	 */
	static void *oob_recv(int s, size_t *out_n) {
	    size_t n;
	    oob_read_all(s, &n, sizeof(n));
	    *out_n = n;

	    /* Never ask for 0 bytes, so NULL always means allocation failure. */
	    void *p = malloc(n ? n : 1);
	    if (p == NULL) { perror("oob: malloc"); exit(1); }
	    oob_read_all(s, p, n);
	    return p;
	}

	/* Build a multi-handle VMM CUDA buffer: N_CHUNKS separate cuMemCreate
	* handles, all mapped into one contiguous VA range. */
	static CUdeviceptr alloc_multi_handle_vmm(void) {
	CUdeviceptr base = 0;
	CHECK_CU(cuMemAddressReserve(&base, TOTAL_BYTES, 0, 0, 0));

	CUmemAllocationProp prop = {0};
	prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
	prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
	prop.location.id = 0;
	prop.allocFlags.gpuDirectRDMACapable = 1;
	prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;

	for (int i = 0; i < N_CHUNKS; i++) {
	CUmemGenericAllocationHandle h;
	CHECK_CU(cuMemCreate(&h, CHUNK_BYTES, &prop, 0));
	CHECK_CU(cuMemMap(base + (CUdeviceptr)((size_t)i * CHUNK_BYTES),
	CHUNK_BYTES, 0, h, 0));
	CHECK_CU(cuMemRelease(h));
	}

	CUmemAccessDesc access = {0};
	access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
	access.location.id = 0;
	access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
	CHECK_CU(cuMemSetAccess(base, TOTAL_BYTES, &access, 1));
	return base;
	}

	/* Completion record for one non-blocking UCP request. */
	struct req_state {
	    int          done;    /* set to 1 by on_send() */
	    ucs_status_t status;  /* status handed to on_send() */
	};

	/* Completion callback: `u` is the caller's struct req_state; store the
	 * status `s` in it and raise the done flag. `req` is not used.
	 * Both `req` and `u` are `void *`: the body assigns `u` to a struct
	 * pointer, which requires a pointer type. */
	static void on_send(void *req, ucs_status_t s, void *u) {
	    struct req_state *state = u;

	    (void)req;
	    state->status = s;
	    state->done   = 1;
	}

	/*
	 * Wait for the non-blocking operation behind `req` to finish.
	 *
	 *   worker  worker to progress while waiting
	 *   req     value returned by the non-blocking call
	 *   st      state that the operation's callback fills in (a pointer:
	 *           the body reads st->done and st->status)
	 *   what    label used as prefix in error messages (a C string,
	 *           printed with %s)
	 *
	 * Returns at once when req is NULL. If req encodes an error, or the
	 * operation finishes with a status other than UCS_OK, the status string
	 * is printed to stderr and the process exits with code 1. Otherwise the
	 * worker is progressed until st->done is set and the request is freed.
	 */
	static void wait_req(ucp_worker_h worker, ucs_status_ptr_t req, struct req_state *st, const char *what) {
	    if (req == NULL) {
	        return;
	    }

	    if (UCS_PTR_IS_ERR(req)) {
	        fprintf(stderr, "%s: immediate %s\n", what, ucs_status_string(UCS_PTR_STATUS(req)));
	        exit(1);
	    }

	    while (st->done == 0) {
	        ucp_worker_progress(worker);
	    }

	    /* Copy the status out before the request is released. */
	    const ucs_status_t final_status = st->status;
	    ucp_request_free(req);
	    if (final_status == UCS_OK) {
	        return;
	    }

	    fprintf(stderr, "%s: %s\n", what, ucs_status_string(final_status));
	    exit(1);
	}

	/*
	 * Reproducer entry point.
	 *
	 * With no argument the process is the server (it listens on OOB_PORT);
	 * with one argument, the server's IPv4 address, it is the client. Both
	 * sides build the multi-handle VMM buffer, register it with UCX and swap
	 * worker address, buffer VA and packed rkey over TCP. The client then
	 * issues one ucp_put_nbx of TOTAL_BYTES, flushes the endpoint and tells
	 * the server it is done.
	 *
	 * Returns 0 on success and 1 on a usage error. Any CUDA, UCX or OOB
	 * failure terminates the process with exit(1) inside the helpers.
	 */
	int main(int argc, char **argv) {
	    setvbuf(stdout, NULL, _IOLBF, 0);
	    setvbuf(stderr, NULL, _IOLBF, 0);
	    if (argc > 2) {
	        /* Extra arguments used to select server mode silently. */
	        fprintf(stderr, "usage: %s [server_ip]\n", argv[0]);
	        return 1;
	    }
	    int is_client = (argc == 2);
	    const char *server = is_client ? argv[1] : NULL;

	    /* --- 1. CUDA: retain primary context on device 0 ----------------- */
	    CHECK_CU(cuInit(0));
	    CUdevice dev; CHECK_CU(cuDeviceGet(&dev, 0));
	    CUcontext cuctx; CHECK_CU(cuDevicePrimaryCtxRetain(&cuctx, dev));
	    CHECK_CU(cuCtxPushCurrent(cuctx));

	    /* --- 2. Build the multi-handle VMM buffer ------------------------ *
	     * N_CHUNKS separate cuMemCreate handles, all mapped via cuMemMap
	     * into one contiguous VA range. This is the shape the bug triggers
	     * on (e.g. PyTorch arena allocators, fragmented VMM pools). */
	    CUdeviceptr buf = alloc_multi_handle_vmm();
	    printf("VMM allocated: base=0x%llx total=%zu (%d handles x %llu bytes)\n",
	           (unsigned long long)buf, TOTAL_BYTES, N_CHUNKS, (unsigned long long)CHUNK_BYTES);

	    /* --- 3. UCX context + worker ------------------------------------- *
	     * RMA + 64-bit atomics so the protocol layer wires up an RC lane
	     * with the dmabuf-registered MR (the path the bug lives on). */
	    ucp_config_t *cfg = NULL; CHECK_UCS(ucp_config_read(NULL, NULL, &cfg));
	    ucp_params_t up = { .field_mask = UCP_PARAM_FIELD_FEATURES,
	                        .features   = UCP_FEATURE_RMA | UCP_FEATURE_AMO64 };
	    ucp_context_h ctx; CHECK_UCS(ucp_init(&up, cfg, &ctx));
	    ucp_config_release(cfg);

	    ucp_worker_params_t wp = { .field_mask  = UCP_WORKER_PARAM_FIELD_THREAD_MODE,
	                               .thread_mode = UCS_THREAD_MODE_SINGLE };
	    ucp_worker_h worker; CHECK_UCS(ucp_worker_create(ctx, &wp, &worker));

	    /* --- 4. Wireup blob for the peer --------------------------------- */
	    ucp_address_t *local_addr; size_t local_addr_len;
	    CHECK_UCS(ucp_worker_get_address(worker, &local_addr, &local_addr_len));

	    /* --- 5. Register the VMM buffer ---------------------------------- *
	     * THIS IS WHERE THE BUG LIVES. We ask UCX to register the full
	     * TOTAL_BYTES (multi-handle VA range). Internally UCX calls
	     * cuMemGetAddressRange() on the base pointer to "expand to the
	     * whole allocation" - for multi-handle VMM this returns only one
	     * cuMem handle's bounds, and that becomes mem_info->alloc_length.
	     * The truncated length is then handed to cuMemGetHandleForAddressRange
	     * and ibv_reg_dmabuf_mr, so the resulting memh's lkey covers only
	     * CHUNK_BYTES rather than TOTAL_BYTES.
	     *
	     * Workaround: run with UCX_CUDA_COPY_REG_WHOLE_ALLOC=off, which
	     * skips the offending expansion and preserves the requested
	     * length. */
	    ucp_mem_map_params_t mp = {
	        .field_mask  = UCP_MEM_MAP_PARAM_FIELD_ADDRESS |
	                       UCP_MEM_MAP_PARAM_FIELD_LENGTH  |
	                       UCP_MEM_MAP_PARAM_FIELD_MEMORY_TYPE,
	        .address     = (void*)(uintptr_t)buf,
	        .length      = TOTAL_BYTES,
	        .memory_type = UCS_MEMORY_TYPE_CUDA,
	    };
	    ucp_mem_h memh; CHECK_UCS(ucp_mem_map(ctx, &mp, &memh));

	    /* --- 6. Serialise memh for the peer ------------------------------ */
	    void *rkey_buf = NULL; size_t rkey_len = 0;
	    CHECK_UCS(ucp_rkey_pack(ctx, memh, &rkey_buf, &rkey_len));

	    /* --- 7. OOB exchange over TCP ------------------------------------ *
	     * Trivial pairwise swap of: worker address, VA of the registered
	     * buffer, packed rkey. Length-prefixed blobs in each direction. */
	    int sock = is_client ? oob_connect(server) : oob_listen_accept();
	    oob_send(sock, local_addr, local_addr_len);
	    size_t remote_addr_len; void *remote_addr = oob_recv(sock, &remote_addr_len);
	    uint64_t my_va = (uint64_t)(uintptr_t)buf;
	    oob_send(sock, &my_va, sizeof(my_va));
	    uint64_t remote_va;
	    {
	        size_t n; void *p = oob_recv(sock, &n);
	        /* Never copy more bytes than the peer actually sent. */
	        if (n != sizeof(remote_va)) {
	            fprintf(stderr, "oob: unexpected VA blob size %zu\n", n);
	            exit(1);
	        }
	        memcpy(&remote_va, p, sizeof(remote_va));
	        free(p);
	    }
	    oob_send(sock, rkey_buf, rkey_len);
	    size_t remote_rkey_len; void *remote_rkey = oob_recv(sock, &remote_rkey_len);

	    /* --- 8. Endpoint + remote rkey unpack ---------------------------- */
	    ucp_ep_params_t ep_params = { .field_mask = UCP_EP_PARAM_FIELD_REMOTE_ADDRESS,
	                                  .address    = (ucp_address_t*)remote_addr };
	    ucp_ep_h ep; CHECK_UCS(ucp_ep_create(worker, &ep_params, &ep));
	    ucp_rkey_h rrkey; CHECK_UCS(ucp_ep_rkey_unpack(ep, remote_rkey, &rrkey));

	    /* --- 9. The actual transfer -------------------------------------- *
	     * Client posts a single ucp_put_nbx of the FULL TOTAL_BYTES. With
	     * vanilla UCX this issues an RDMA_WRITE WQE that walks off the end
	     * of the truncated MR at the first chunk boundary and the HCA
	     * returns IBV_WC_LOC_PROT_ERR (CQE syndrome 0x4). Server just
	     * waits for the client's completion marker on the OOB socket. */
	    if (is_client) {
	        struct req_state st = {0};
	        ucp_request_param_t prm = { .op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK |
	                                                    UCP_OP_ATTR_FIELD_USER_DATA,
	                                    .cb.send      = on_send,
	                                    .user_data    = &st };
	        printf("issuing ucp_put_nbx %zu bytes...\n", TOTAL_BYTES);
	        ucs_status_ptr_t req = ucp_put_nbx(ep, (const void*)(uintptr_t)buf,
	                                            TOTAL_BYTES, remote_va, rrkey, &prm);
	        wait_req(worker, req, &st, "ucp_put_nbx");

	        /* Put completion is local only; flush the endpoint so the data
	         * has reached the peer before the server is told to exit. */
	        struct req_state flush_st = {0};
	        ucp_request_param_t flush_prm = prm;
	        flush_prm.user_data = &flush_st;
	        ucs_status_ptr_t flush_req = ucp_ep_flush_nbx(ep, &flush_prm);
	        wait_req(worker, flush_req, &flush_st, "ucp_ep_flush_nbx");
	        printf("put completed OK\n");

	        /* Signal the server we're done so it can exit cleanly. */
	        uint32_t marker = 0xC0FFEE;
	        oob_send(sock, &marker, sizeof(marker));
	    } else {
	        /* Server doesn't actively progress here - the put is fully
	         * one-sided. We just wait on the OOB done marker. (In a
	         * realistic app you'd be progressing the worker too.) */
	        size_t n; void *p = oob_recv(sock, &n);
	        free(p);
	        printf("server: client signaled done\n");
	    }

	    /* Minimal reproducer: skip teardown - process exit handles cleanup.
	     * Real code should ucp_ep_close_nbx, ucp_rkey_destroy,
	     * ucp_rkey_buffer_release, ucp_mem_unmap, ucp_worker_destroy,
	     * ucp_cleanup, cuMemUnmap, cuMemAddressFree etc. */
	    return 0;
	}
No results found