PB197 - project solution
/*
 * Copyright (C) 2016, Ondrej Mosnacek <[email protected]>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
/*
 * Informational benchmarks:
 *   1024 x 1024:   26240 ME/s
 *   2048 x 2048:   33520 ME/s
 *   4096 x 4096:   36010 ME/s
 *   8192 x 8192:   36790 ME/s
 *   8192 x 1024:   35140 ME/s
 *   1024 x 8192:   35150 ME/s
 *   16384 x 256:   32920 ME/s
 *   256 x 16384:   32950 ME/s
 *   262144 x 32:   29340 ME/s
 *   32 x 262144:   28790 ME/s
 */
#define WARP_SIZE 32 /* do not change! ;) */

/* begin tuned parameters */
#define VEC_SIZE 4 /* vector size for global memory reading; one of {1,2,4} */
#define WARP_ROWS 1 /* number of warps per block (in the Y axis) */
#define WARP_COLS 4 /* number of warps per block (in the X axis) */
#define TOAVG_BLOCK_SIZE 4 /* block size (in warps) for the toAverage kernel */
/* end tuned parameters */

/* rows/columns per block: */
#define BLOCK_ROWS (WARP_SIZE * WARP_ROWS)
#define BLOCK_COLS (WARP_SIZE * WARP_COLS)
#define VECS_PER_WARP (WARP_SIZE / VEC_SIZE)
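
/* With the parameters above (VEC_SIZE 4, WARP_ROWS 1, WARP_COLS 4), the
 * derived values are BLOCK_ROWS = 32, BLOCK_COLS = 128 and
 * VECS_PER_WARP = 8: each block covers a 32 x 128 tile of the table and
 * each of its four warps a 32 x 32 sub-tile. */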

__device__ void warp_exchange(int id, int mask, int mul, int *s0, int *s1)
{
    int message = (id & mask) == 0 ? *s1 : *s0;
    message = __shfl_xor(message, mask * mul, WARP_SIZE);
    ((id & mask) == 0 ? *s1 : *s0) = message;
}
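
/* Example: with mask == 1, threads whose 'id' differs only in bit 0 pair
 * up; the thread with the bit clear trades its *s1 for its partner's *s0
 * (the partner's values live mask * mul lanes away, hence the lane offset
 * passed to __shfl_xor). The net effect is a swap of the two values
 * between the pair. */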

/* vector-size-specific definitions: */
#if VEC_SIZE == 1
typedef int1 vec_type;
#define VEC_ZERO make_int1(0)

__device__ void vec_add(vec_type *a, const vec_type *b)
{
    a->x += b->x;
}

__device__ int vec_sum(const vec_type *v)
{
    return v->x;
}

__device__ int vec_shuffle(int tir, vec_type *v)
{
    return v->x;
}
#elif VEC_SIZE == 2
typedef int2 vec_type;
#define VEC_ZERO make_int2(0, 0)

__device__ void vec_add(vec_type *a, const vec_type *b)
{
    a->x += b->x;
    a->y += b->y;
}

__device__ int vec_sum(const vec_type *v)
{
    return v->x + v->y;
}

__device__ int vec_shuffle(int tir, vec_type *v)
{
    warp_exchange(tir, 1, VECS_PER_WARP, &v->x, &v->y);
    return v->x + v->y;
}
#elif VEC_SIZE == 4
typedef int4 vec_type;
#define VEC_ZERO make_int4(0, 0, 0, 0)

__device__ void vec_add(vec_type *a, const vec_type *b)
{
    a->x += b->x;
    a->y += b->y;
    a->z += b->z;
    a->w += b->w;
}

__device__ int vec_sum(const vec_type *v)
{
    return v->x + v->y + v->z + v->w;
}

__device__ int vec_shuffle(int tir, vec_type *v)
{
    warp_exchange(tir, 1, VECS_PER_WARP, &v->x, &v->y);
    v->x += v->y;
    warp_exchange(tir, 1, VECS_PER_WARP, &v->z, &v->w);
    v->z += v->w;
    warp_exchange(tir, 2, VECS_PER_WARP, &v->x, &v->z);
    return v->x + v->z;
}
#else
#error "Invalid vector size!"
#endif
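
/* Net effect of vec_shuffle(): the per-component partial column sums are
 * merged across the VEC_SIZE thread groups (indexed by tir), so that
 * thread (tir, tic) returns the complete sum over the warp's 32 rows for
 * column tic * VEC_SIZE + tir -- exactly the index used by sumTable's
 * atomicAdd on sum_cols below. */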

__global__ void sumTable(const vec_type *table,
                         int *sum_rows, int *sum_cols,
                         const int rows, const int cols)
{
    int br = blockIdx.y;
    int bc = blockIdx.x;
    int ti = threadIdx.x;
#if WARP_COLS > 1
    int tc = threadIdx.y;
#else
    int tc = 0;
#endif
#if WARP_ROWS > 1
    int tr = threadIdx.z;
#else
    int tr = 0;
#endif
    int tic = ti % VECS_PER_WARP;
    int tir = ti / VECS_PER_WARP;

    int base_c = bc * BLOCK_COLS + tc * WARP_SIZE;
    int base_r = br * BLOCK_ROWS + tr * WARP_SIZE;
#if WARP_COLS > 1
    if (base_c >= cols) {
        return;
    }
#endif
#if WARP_ROWS > 1
    if (base_r >= rows) {
        return;
    }
#endif

    /* a stack buffer for accumulating partial row sums on the fly
     * (it should get mapped to about log2(VECS_PER_WARP) registers
     * by the compiler): */
    int stack[VECS_PER_WARP];
    int stack_pos = 0;

    vec_type col_sums = VEC_ZERO;

    /* load two values from global memory and push them onto the stack: */
#define LD2(i) \
    do { \
        vec_type v; \
        int pos_r = base_r + (i) * 2 * VEC_SIZE + tir; \
        int pos_c = base_c / VEC_SIZE + tic; \
        v = table[(pos_r + 0 * VEC_SIZE) * (cols / VEC_SIZE) + pos_c]; \
        stack[stack_pos++] = vec_sum(&v); \
        vec_add(&col_sums, &v); \
        v = table[(pos_r + 1 * VEC_SIZE) * (cols / VEC_SIZE) + pos_c]; \
        stack[stack_pos++] = vec_sum(&v); \
        vec_add(&col_sums, &v); \
    } while (0)
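
    /* Example (VEC_SIZE == 4, i == 0): pos_r = base_r + tir, and the two
     * loads fetch rows base_r + tir and base_r + tir + 4; across
     * tir = 0..3 the warp thus reads rows base_r + 0..7, each row as
     * eight int4 loads (one per tic) covering the warp's 32 columns. */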

    /* pop two values from the stack, exchange one of them with another
     * thread, add them, and push the result onto the stack: */
#define ADD(i) \
    do { \
        int v1 = stack[--stack_pos]; \
        int v0 = stack[--stack_pos]; \
        warp_exchange(tic, 1 << (i), 1, &v0, &v1); \
        stack[stack_pos++] = v0 + v1; \
    } while (0)
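
    /* Example: ADD(0) pairs threads whose tic differs in bit 0 and merges
     * their two topmost stack entries into one; each higher bit position
     * folds together twice as many partial sums, so after the final ADD
     * each thread is left with one complete 32-column row sum on its
     * stack. */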

    /* load the values, re-distribute and accumulate partial row sums,
     * and accumulate partial column sums: */
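    /* (the interleaved __threadfence_block() calls are block-level memory
     * fences that limit how far the compiler reorders the global loads;
     * compare the FENCE_INTERVAL fences in the loop variant in the second
     * file) */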
#if VEC_SIZE == 1
    LD2( 0); ADD(0);
    LD2( 1); ADD(0); ADD(1);
    __threadfence_block();
    LD2( 2); ADD(0);
    LD2( 3); ADD(0); ADD(1); ADD(2);
    __threadfence_block();
    LD2( 4); ADD(0);
    LD2( 5); ADD(0); ADD(1);
    __threadfence_block();
    LD2( 6); ADD(0);
    LD2( 7); ADD(0); ADD(1); ADD(2); ADD(3);
    __threadfence_block();
    LD2( 8); ADD(0);
    LD2( 9); ADD(0); ADD(1);
    __threadfence_block();
    LD2(10); ADD(0);
    LD2(11); ADD(0); ADD(1); ADD(2);
    __threadfence_block();
    LD2(12); ADD(0);
    LD2(13); ADD(0); ADD(1);
    __threadfence_block();
    LD2(14); ADD(0);
    LD2(15); ADD(0); ADD(1); ADD(2); ADD(3); ADD(4);
#elif VEC_SIZE == 2
    LD2( 0); ADD(0);
    LD2( 1); ADD(0); ADD(1);
    __threadfence_block();
    LD2( 2); ADD(0);
    LD2( 3); ADD(0); ADD(1); ADD(2);
    __threadfence_block();
    LD2( 4); ADD(0);
    LD2( 5); ADD(0); ADD(1);
    __threadfence_block();
    LD2( 6); ADD(0);
    LD2( 7); ADD(0); ADD(1); ADD(2); ADD(3);
#elif VEC_SIZE == 4
    LD2( 0); ADD(0);
    LD2( 1); ADD(0); ADD(1);
    __threadfence_block();
    LD2( 2); ADD(0);
    LD2( 3); ADD(0); ADD(1); ADD(2);
#else
#error "Invalid vector size!"
#endif

    /* The above mess is equivalent to the following code, but
     * unfortunately NVCC fails to unroll the inner loop...
     * NOTE: the number of values 'k' takes for a given 'i' (i.e. the
     * number of ADDs after each LD2) is the ruler function [1] of (i + 1).
     * [1] https://oeis.org/A001511 */
    /*
    #pragma unroll
    for (int i = 0; i < VECS_PER_WARP / 2; i++) {
        LD2(i);
    #pragma unroll
        for (int k = 0; (1 << k) <= ((i + 1) & -(i + 1)); k++) {
            ADD(k);
        }
        if (i % 2 == 1) {
            __threadfence_block();
        }
    }
    */
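
    /* Worked example of the ruler function: the lowest set bit of (i + 1)
     * for i = 0, 1, 2, 3, ... is 1, 2, 1, 4, ..., giving 1, 2, 1, 3, ...
     * ADDs per iteration -- exactly the LD2/ADD pattern spelled out above
     * (for VEC_SIZE == 1 the last iteration, i = 15, has lowest set bit 16
     * and hence five ADDs, ADD(0) through ADD(4)). */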
#undef LD2
#undef ADD

    /* re-distribute and accumulate the partial column sums: */
    int col_sum = vec_shuffle(tir, &col_sums);

    /* we already have our row sum on the stack: */
    int row_sum = stack[0];

    /* update the global row/column sums corresponding to our chunk: */
    atomicAdd(&sum_rows[base_r + tic * VEC_SIZE + tir], row_sum);
    atomicAdd(&sum_cols[base_c + tic * VEC_SIZE + tir], col_sum);
}

#define TOAVG_BLOCK_ITEMS (TOAVG_BLOCK_SIZE * WARP_SIZE)
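
/* toAverage2() converts both sum arrays in a single launch: a flat thread
 * index is mapped first onto sums1/avgs1 and, past limit1, onto
 * sums2/avgs2. The conversion works in place -- each thread reads its int
 * sum before overwriting the same location with the float average. */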
__global__ void toAverage2(float *avgs1, const int *sums1,
                           const float mul1, const int limit1,
                           float *avgs2, const int *sums2,
                           const float mul2, const int limit2)
{
    int pos = blockIdx.x * TOAVG_BLOCK_ITEMS + threadIdx.x;
    if (pos < limit1) {
        avgs1[pos] = (float)sums1[pos] * mul1;
    } else {
        pos -= limit1;
        if (pos < limit2) {
            avgs2[pos] = (float)sums2[pos] * mul2;
        }
    }
}

#define BLOCK_COUNT(size, block) (((size) + (block) - 1) / (block))

void solveGPU(
    const int *results,
    float *avg_stud, float *avg_que,
    const int students, const int questions)
{
    /* compute the total number of blocks in the X/Y axes: */
    int blocksY = BLOCK_COUNT(students, BLOCK_ROWS);
    int blocksX = BLOCK_COUNT(questions, BLOCK_COLS);

    /* clear the result arrays, so threads can just add to them: */
    cudaMemset(avg_stud, 0, students * sizeof(avg_stud[0]));
    cudaMemset(avg_que, 0, questions * sizeof(avg_que[0]));

    /* compute column & row sums: */
    sumTable<<<dim3(blocksX, blocksY), dim3(WARP_SIZE, WARP_COLS, WARP_ROWS)>>>(
        (const vec_type *)results, (int *)avg_stud, (int *)avg_que,
        students, questions);
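
    /* NOTE: the float output arrays double as integer accumulators --
     * sumTable accumulates int sums into them via atomicAdd() and
     * toAverage2() then rewrites each element in place as a float average
     * (sizeof(int) == sizeof(float), and each element is read before it
     * is overwritten). */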

    /* convert sums to averages: */
    int blocks = BLOCK_COUNT(students + questions, TOAVG_BLOCK_ITEMS);
    toAverage2<<<blocks, TOAVG_BLOCK_ITEMS>>>(
        avg_stud, (int *)avg_stud, 1.0F / questions, students,
        avg_que, (int *)avg_que, 1.0F / students, questions);
}
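Second file: an alternative variant of the same solution, replacing the unrolled stack machine with a plain loop and a warp-level butterfly reduction.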
/*
 * Copyright (C) 2016, Ondrej Mosnacek <[email protected]>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
/*
 * Informational benchmarks:
 *   1024 x 1024:   26120 ME/s
 *   2048 x 2048:   33480 ME/s
 *   4096 x 4096:   35940 ME/s
 *   8192 x 8192:   36780 ME/s
 *   8192 x 1024:   35110 ME/s
 *   1024 x 8192:   35120 ME/s
 *   16384 x 256:   32820 ME/s
 *   256 x 16384:   32910 ME/s
 *   262144 x 32:   29430 ME/s
 *   32 x 262144:   29090 ME/s
 */
#define WARP_SIZE 32 /* do not change! ;) */

/* begin tuned parameters */
#define VEC_SIZE 4 /* vector size for global memory reading; one of {1,2,4} */
#define WARP_ROWS 2 /* number of warps per block (in the Y axis) */
#define WARP_COLS 2 /* number of warps per block (in the X axis) */
#define FENCE_INTERVAL 2 /* how often to do a memory fence */
#define TOAVG_BLOCK_SIZE 4 /* block size (in warps) for the toAverage kernel */
/* end tuned parameters */

/* rows/columns per block: */
#define BLOCK_ROWS (WARP_SIZE * WARP_ROWS)
#define BLOCK_COLS (WARP_SIZE * WARP_COLS)
#define VECS_PER_WARP (WARP_SIZE / VEC_SIZE)

__device__ void warp_exchange(int id, int mask, int mul, int *s0, int *s1)
{
    int message = (id & mask) == 0 ? *s1 : *s0;
    message = __shfl_xor(message, mask * mul, WARP_SIZE);
    ((id & mask) == 0 ? *s1 : *s0) = message;
}

/* vector-size-specific definitions: */
#if VEC_SIZE == 1
typedef int1 vec_type;
#define VEC_ZERO make_int1(0)

__device__ void vec_add(vec_type *a, const vec_type *b)
{
    a->x += b->x;
}

__device__ int vec_sum(const vec_type *v)
{
    return v->x;
}

__device__ int vec_shuffle(int tir, vec_type *v)
{
    return v->x;
}
#elif VEC_SIZE == 2
typedef int2 vec_type;
#define VEC_ZERO make_int2(0, 0)

__device__ void vec_add(vec_type *a, const vec_type *b)
{
    a->x += b->x;
    a->y += b->y;
}

__device__ int vec_sum(const vec_type *v)
{
    return v->x + v->y;
}

__device__ int vec_shuffle(int tir, vec_type *v)
{
    warp_exchange(tir, 1, VECS_PER_WARP, &v->x, &v->y);
    return v->x + v->y;
}
#elif VEC_SIZE == 4
typedef int4 vec_type;
#define VEC_ZERO make_int4(0, 0, 0, 0)

__device__ void vec_add(vec_type *a, const vec_type *b)
{
    a->x += b->x;
    a->y += b->y;
    a->z += b->z;
    a->w += b->w;
}

__device__ int vec_sum(const vec_type *v)
{
    return v->x + v->y + v->z + v->w;
}

__device__ int vec_shuffle(int tir, vec_type *v)
{
    warp_exchange(tir, 1, VECS_PER_WARP, &v->x, &v->y);
    v->x += v->y;
    warp_exchange(tir, 1, VECS_PER_WARP, &v->z, &v->w);
    v->z += v->w;
    warp_exchange(tir, 2, VECS_PER_WARP, &v->x, &v->z);
    return v->x + v->z;
}
#else
#error "Invalid vector size!"
#endif

__global__ void sumTable(const vec_type *table,
                         int *sum_rows, int *sum_cols,
                         const int rows, const int cols)
{
    int br = blockIdx.y;
    int bc = blockIdx.x;
    int ti = threadIdx.x;
    int tc = threadIdx.y;
    int tr = threadIdx.z;
    int tic = ti % VECS_PER_WARP;
    int tir = ti / VECS_PER_WARP;

    int base_c = bc * BLOCK_COLS + tc * WARP_SIZE;
    int base_r = br * BLOCK_ROWS + tr * WARP_SIZE;
#if WARP_COLS > 1
    if (base_c >= cols) {
        return;
    }
#endif
#if WARP_ROWS > 1
    if (base_r >= rows) {
        return;
    }
#endif

    vec_type col_sums = VEC_ZERO;
    int row_sum = 0;

#pragma unroll
    for (int i = 0; i < VECS_PER_WARP; i++) {
        /* load the next row vector: */
        int pos_r = base_r + i * VEC_SIZE + tir;
        int pos_c = base_c / VEC_SIZE + tic;
        vec_type item = table[pos_r * (cols / VEC_SIZE) + pos_c];

        /* accumulate the row sum using the butterfly reduction
         * (stolen from the CUDA Programming Guide): */
        int value = vec_sum(&item);
#pragma unroll
        for (int k = 1; k < VECS_PER_WARP; k *= 2) {
            value += __shfl_xor(value, k, WARP_SIZE);
        }
        if (i == tic) {
            row_sum = value;
        }
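
        /* after the butterfly, every thread in this tir group holds the
         * full 32-column sum of row base_r + i * VEC_SIZE + tir; the
         * thread with tic == i keeps it, so thread (tir, tic) ends up
         * holding the sum of row base_r + tic * VEC_SIZE + tir -- the
         * index used by the atomicAdd on sum_rows below. */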
        /* increment the partial column sums: */
        vec_add(&col_sums, &item);

        /* perform a block-level memory fence to limit reordering
         * of global loads: */
        if (i % FENCE_INTERVAL == (FENCE_INTERVAL - 1)) {
            __threadfence_block();
        }
    }

    /* re-distribute and accumulate the partial column sums: */
    int col_sum = vec_shuffle(tir, &col_sums);

    /* update the global row/column sums corresponding to our chunk: */
    atomicAdd(&sum_rows[base_r + tic * VEC_SIZE + tir], row_sum);
    atomicAdd(&sum_cols[base_c + tic * VEC_SIZE + tir], col_sum);
}

#define TOAVG_BLOCK_ITEMS (TOAVG_BLOCK_SIZE * WARP_SIZE)

__global__ void toAverage2(float *avgs1, const int *sums1,
                           const float mul1, const int limit1,
                           float *avgs2, const int *sums2,
                           const float mul2, const int limit2)
{
    int pos = blockIdx.x * TOAVG_BLOCK_ITEMS + threadIdx.x;
    if (pos < limit1) {
        avgs1[pos] = (float)sums1[pos] * mul1;
    } else {
        pos -= limit1;
        if (pos < limit2) {
            avgs2[pos] = (float)sums2[pos] * mul2;
        }
    }
}

#define BLOCK_COUNT(size, block) (((size) + (block) - 1) / (block))

void solveGPU(
    const int *results,
    float *avg_stud, float *avg_que,
    const int students, const int questions)
{
    /* compute the total number of blocks in the X/Y axes: */
    int blocksY = BLOCK_COUNT(students, BLOCK_ROWS);
    int blocksX = BLOCK_COUNT(questions, BLOCK_COLS);

    /* clear the result arrays, so threads can just add to them: */
    cudaMemset(avg_stud, 0, students * sizeof(avg_stud[0]));
    cudaMemset(avg_que, 0, questions * sizeof(avg_que[0]));

    /* compute column & row sums: */
    sumTable<<<dim3(blocksX, blocksY), dim3(WARP_SIZE, WARP_COLS, WARP_ROWS)>>>(
        (const vec_type *)results, (int *)avg_stud, (int *)avg_que,
        students, questions);

    /* convert sums to averages: */
    int blocks = BLOCK_COUNT(students + questions, TOAVG_BLOCK_ITEMS);
    toAverage2<<<blocks, TOAVG_BLOCK_ITEMS>>>(
        avg_stud, (int *)avg_stud, 1.0F / questions, students,
        avg_que, (int *)avg_que, 1.0F / students, questions);
}
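
For completeness, a minimal host-side harness (hypothetical; the PB197 framework supplies the real driver, and the names below are illustrative) could look like the sketch that follows. It can be appended to either file and compiled with nvcc, assumes table dimensions that are multiples of the tile sizes (as in the benchmarks above), and uses an all-ones table so that every computed average should come out as 1.0:

#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

int main(void)
{
    const int students = 1024, questions = 1024;

    /* an all-ones table, so every row/column average must be 1.0: */
    std::vector<int> h_results(students * questions, 1);

    int *d_results;
    float *d_avg_stud, *d_avg_que;
    cudaMalloc(&d_results, students * questions * sizeof(int));
    cudaMalloc(&d_avg_stud, students * sizeof(float));
    cudaMalloc(&d_avg_que, questions * sizeof(float));
    cudaMemcpy(d_results, h_results.data(),
               students * questions * sizeof(int), cudaMemcpyHostToDevice);

    solveGPU(d_results, d_avg_stud, d_avg_que, students, questions);

    std::vector<float> h_avg_stud(students), h_avg_que(questions);
    cudaMemcpy(h_avg_stud.data(), d_avg_stud,
               students * sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(h_avg_que.data(), d_avg_que,
               questions * sizeof(float), cudaMemcpyDeviceToHost);

    printf("avg_stud[0] = %f, avg_que[0] = %f\n",
           h_avg_stud[0], h_avg_que[0]); /* expect 1.0 and 1.0 */

    cudaFree(d_results);
    cudaFree(d_avg_stud);
    cudaFree(d_avg_que);
    return 0;
}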