Last active
November 12, 2021 19:12
-
-
Save alcides/08ffe6c9dc09a708f476886d04300678 to your computer and use it in GitHub Desktop.
MWE for grid-wide synchronization in CUDA 9+. Source: https://stackoverflow.com/questions/6404992/cuda-block-synchronization
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cuda_runtime_api.h> | |
#include <cuda.h> | |
#include <cooperative_groups.h> | |
#include "workshop.h" | |
#define N 10 | |
__global__ void block_wide_kernel() { | |
int index = blockIdx.x * blockDim.x + threadIdx.x; | |
cooperative_groups::grid_group g = cooperative_groups::this_grid(); | |
for (int i=0; i<N; i++) { | |
if (threadIdx.x == 0) | |
printf("%d | %d\n", i, index); | |
g.sync(); | |
} | |
} | |
int main(int argc, char **argv) { | |
// This will not work! | |
//block_wide_kernel<<<64, 1024>>>(); | |
// You have to use the low-level CooperativeKernel launch api. | |
// Note the required imports for this. | |
void *kernelArgs[] = { /* add kernel args */ }; | |
dim3 dimGrid(64, 1, 1); // deviceProp.multiProcessorCount*numBlocksPerSm | |
dim3 dimBlock(1024, 1, 1); | |
cudaLaunchCooperativeKernel((void*)block_wide_kernel, dimGrid, dimBlock, kernelArgs); | |
cudaDeviceSynchronize(); | |
return 0; | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cuda_runtime_api.h> | |
#include <cuda.h> | |
#include "workshop.h" | |
#define N 10 | |
__global__ void block_wide_kernel() { | |
int index = blockIdx.x * blockDim.x + threadIdx.x; | |
for (int i=0; i<N; i++) { | |
if (threadIdx.x == 0) | |
printf("%d | %d\n", i, index); | |
} | |
} | |
int main(int argc, char **argv) { | |
block_wide_kernel<<<64, 1024>>>(); | |
cudaDeviceSynchronize(); | |
return 0; | |
} | |
/* | |
prints something like: | |
... | |
7 | 41984 | |
7 | 46080 | |
7 | 47104 | |
9 | 26624 | |
7 | 15360 | |
7 | 17408 | |
7 | 8192 | |
7 | 14336 | |
... | |
*/ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cuda_runtime_api.h> | |
#include <cuda.h> | |
#include "workshop.h" | |
#define N 10 | |
__device__ uint64_t phaser_between_32_groups; // Can synchronize among 64 groups. | |
__global__ void block_wide_kernel() { | |
int index = blockIdx.x * blockDim.x + threadIdx.x; | |
for (int i=0; i<N; i++) { | |
// You need to reset the mask in the beginning, only in the first thread | |
if (blockIdx.x == 0 && threadIdx.x == 0) { | |
atomicAnd((int*) &phaser_between_32_groups, 0); | |
} | |
if (threadIdx.x == 0) | |
printf("%d | %d\n", i, index); | |
__syncthreads(); | |
// Then you use atomic or to do a busy wait until all threads have arrived. | |
const auto phaser_complete = (1 << gridDim.x) - 1; | |
if (threadIdx.x == 0) { | |
while ((atomicOr((int*) &phaser_between_32_groups, 1ULL << blockIdx.x)) != phaser_complete) { /*do nothing*/ } | |
} | |
} | |
} | |
int main(int argc, char **argv) { | |
block_wide_kernel<<<32, 1024>>>(); | |
cudaDeviceSynchronize(); | |
return 0; | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment