import torch
import torch.distributed as dist
# Configuration
ranks_per_node = 8
shape = 2**17  # number of elements in the test tensor
dtype = torch.float32
# Initialize the process group with the MPI backend
dist.init_process_group(backend='mpi')
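# Note (an addition, not in the original gist): the 'mpi' backend requires a
# PyTorch build compiled against MPI. When that is unavailable, a hedged
# alternative is the NCCL backend with environment-variable initialization,
# assuming the launcher sets RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT:
#   dist.init_process_group(backend='nccl', init_method='env://')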
rank, n_ranks = dist.get_rank(), dist.get_world_size()
local_rank = rank % ranks_per_node
# First, try allocating a small tensor on every GPU from every rank,
# which forces creation of a CUDA context on each device
for i in range(ranks_per_node):
    _ = torch.randn(1).to(torch.device('cuda', i))
# Now select our GPU
device = torch.device('cuda', local_rank)
print('MPI rank', rank, 'size', n_ranks, 'device', device)
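# Optional (not in the original gist): pinning the default CUDA device is a
# common idiom so later allocations and collectives default to this GPU
torch.cuda.set_device(device)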
# Allocate a tensor; each rank draws independent random values, so local sums differ
x = torch.randn(shape, dtype=dtype).to(device)
print('local result:', x.sum())
# Do a broadcast from rank 0
dist.broadcast(x, 0)
print('broadcast result:', x.sum())
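# After the broadcast every rank holds rank 0's tensor, so this sum should
# match across all ranks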
# Do an allreduce (the default op is SUM)
dist.all_reduce(x)
print('allreduce result:', x.sum())
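# Launch note (an addition, not part of the original gist): run one process
# per GPU with an MPI launcher; e.g. for a single 8-GPU node, something like
#   mpirun -n 8 python pytorch_mpi_test.py
# should work, assuming a CUDA-aware MPI. The script name here is illustrative.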