@youkaichao
Created November 5, 2024 00:06
cuda ipc
import os
from typing import List
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import torch
import torch.distributed as dist

# gloo is only used to exchange picklable IPC metadata between ranks;
# the tensor data itself is shared through CUDA IPC.
dist.init_process_group(backend="gloo")
rank = local_rank = dist.get_rank()
world_size = dist.get_world_size()
torch.cuda.set_device(local_rank)


def share_tensor(A: torch.Tensor, group=None) -> List[torch.Tensor]:
    """Share a CUDA tensor across ranks via CUDA IPC.

    Returns one tensor per rank; entry i aliases the device memory of
    rank i's original tensor.
    """
    from torch.multiprocessing.reductions import reduce_tensor
    # reduce_tensor returns (rebuild_func, args); the args carry the
    # CUDA IPC handle needed to map the memory in another process.
    A_meta = reduce_tensor(A)
    tensor_metas = [None] * world_size
    dist.all_gather_object(tensor_metas, A_meta, group=group)
    rank = dist.get_rank(group)
    all_tensors = []
    for i, obj in enumerate(tensor_metas):
        func = obj[0]
        args = list(obj[1])
        # index 6 of the rebuild args is the device index; point it at
        # this rank's device before rebuilding
        args[6] = A.device.index
        if i != rank:
            all_tensors.append(func(*args))
        else:
            all_tensors.append(A)
    return all_tensors


A = torch.ones((10,), device=local_rank) * rank
all_tensors = share_tensor(A)
dist.barrier()
torch.cuda.synchronize()

# rank 0 zeroes every shared tensor; since the entries alias the other
# ranks' memory, the writes are visible everywhere
if rank == 0:
    for x in all_tensors:
        x.zero_()
dist.barrier()
torch.cuda.synchronize()

for i, x in enumerate(all_tensors):
    print(f"{rank=}, {i=}, {x=}")
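For reference, a minimal way to launch the snippet is one process per GPU via torchrun (the filename cuda_ipc.py is an assumption, and the machine needs at least two GPUs):

torchrun --nproc_per_node=2 cuda_ipc.py

If the IPC rebuild succeeds, every rank should print all-zero tensors for every index after rank 0's writes, since each entry aliases the exporting rank's device memory.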
@youkaichao (Author)

Running on 2.6.0.dev20241112+cu124, I still get the same error: RuntimeError: pidfd_getfd: Operation not permitted.
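One possible cause to rule out (an assumption, not a confirmed diagnosis): pidfd_getfd requires ptrace permission (PTRACE_MODE_ATTACH_REALCREDS) over the peer process, so a non-zero kernel.yama.ptrace_scope or a container seccomp profile that blocks the syscall can surface as "Operation not permitted". A quick check:

# Hypothetical diagnostic: inspect the YAMA ptrace scope, which gates
# pidfd_getfd permission checks. 0 = classic ptrace permissions,
# higher values restrict attaching to non-child processes.
with open("/proc/sys/kernel/yama/ptrace_scope") as f:
    print("kernel.yama.ptrace_scope =", f.read().strip())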
