PB197 - project solution
/*
 * Copyright (C) 2016, Ondrej Mosnacek <[email protected]>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
/*
 * Informational benchmarks:
 *   1024 x 1024:   26240 ME/s
 *   2048 x 2048:   33520 ME/s
 *   4096 x 4096:   36010 ME/s
 *   8192 x 8192:   36790 ME/s
 *   8192 x 1024:   35140 ME/s
 *   1024 x 8192:   35150 ME/s
 *   16384 x 256:   32920 ME/s
 *   256 x 16384:   32950 ME/s
 *   262144 x 32:   29340 ME/s
 *   32 x 262144:   28790 ME/s
 */
#define WARP_SIZE 32 /* do not change! ;) */

/* begin tuned parameters */
#define VEC_SIZE 4 /* vector size for global memory reading; one of {1,2,4} */
#define WARP_ROWS 1 /* number of warps per block (in the Y axis) */
#define WARP_COLS 4 /* number of warps per block (in the X axis) */
#define TOAVG_BLOCK_SIZE 4 /* block size (in warps) for the toAverage kernel */
/* end tuned parameters */

/* rows/columns per block: */
#define BLOCK_ROWS (WARP_SIZE * WARP_ROWS)
#define BLOCK_COLS (WARP_SIZE * WARP_COLS)
#define VECS_PER_WARP (WARP_SIZE / VEC_SIZE)
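
/* With the parameters above (VEC_SIZE 4, WARP_ROWS 1, WARP_COLS 4), the
 * derived values are BLOCK_ROWS = 32, BLOCK_COLS = 128 and
 * VECS_PER_WARP = 8: each block covers a 32 x 128 tile of the table and
 * each of its four warps a 32 x 32 sub-tile. */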

__device__ void warp_exchange(int id, int mask, int mul, int *s0, int *s1)
{
    int message = (id & mask) == 0 ? *s1 : *s0;
    message = __shfl_xor(message, mask * mul, WARP_SIZE);
    ((id & mask) == 0 ? *s1 : *s0) = message;
}
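
/* Example: with mask == 1, threads whose 'id' differs only in bit 0 pair
 * up; the thread with the bit clear trades its *s1 for its partner's *s0
 * (the partner's values live mask * mul lanes away, hence the lane offset
 * passed to __shfl_xor). The net effect is a swap of the two values
 * between the pair. */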

/* vector-size-specific definitions: */
#if VEC_SIZE == 1
typedef int1 vec_type;
#define VEC_ZERO make_int1(0)

__device__ void vec_add(vec_type *a, const vec_type *b)
{
    a->x += b->x;
}

__device__ int vec_sum(const vec_type *v)
{
    return v->x;
}

__device__ int vec_shuffle(int tir, vec_type *v)
{
    return v->x;
}
#elif VEC_SIZE == 2
typedef int2 vec_type;
#define VEC_ZERO make_int2(0, 0)

__device__ void vec_add(vec_type *a, const vec_type *b)
{
    a->x += b->x;
    a->y += b->y;
}

__device__ int vec_sum(const vec_type *v)
{
    return v->x + v->y;
}

__device__ int vec_shuffle(int tir, vec_type *v)
{
    warp_exchange(tir, 1, VECS_PER_WARP, &v->x, &v->y);
    return v->x + v->y;
}
#elif VEC_SIZE == 4
typedef int4 vec_type;
#define VEC_ZERO make_int4(0, 0, 0, 0)

__device__ void vec_add(vec_type *a, const vec_type *b)
{
    a->x += b->x;
    a->y += b->y;
    a->z += b->z;
    a->w += b->w;
}

__device__ int vec_sum(const vec_type *v)
{
    return v->x + v->y + v->z + v->w;
}

__device__ int vec_shuffle(int tir, vec_type *v)
{
    warp_exchange(tir, 1, VECS_PER_WARP, &v->x, &v->y);
    v->x += v->y;
    warp_exchange(tir, 1, VECS_PER_WARP, &v->z, &v->w);
    v->z += v->w;
    warp_exchange(tir, 2, VECS_PER_WARP, &v->x, &v->z);
    return v->x + v->z;
}
#else
#error "Invalid vector size!"
#endif
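
/* Net effect of vec_shuffle(): the per-component partial column sums are
 * merged across the VEC_SIZE thread groups (indexed by tir), so that
 * thread (tir, tic) returns the complete sum over the warp's 32 rows for
 * column tic * VEC_SIZE + tir -- exactly the index used by sumTable's
 * atomicAdd on sum_cols below. */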

__global__ void sumTable(const vec_type *table,
                         int *sum_rows, int *sum_cols,
                         const int rows, const int cols)
{
    int br = blockIdx.y;
    int bc = blockIdx.x;
    int ti = threadIdx.x;
#if WARP_COLS > 1
    int tc = threadIdx.y;
#else
    int tc = 0;
#endif
#if WARP_ROWS > 1
    int tr = threadIdx.z;
#else
    int tr = 0;
#endif
    int tic = ti % VECS_PER_WARP;
    int tir = ti / VECS_PER_WARP;

    int base_c = bc * BLOCK_COLS + tc * WARP_SIZE;
    int base_r = br * BLOCK_ROWS + tr * WARP_SIZE;
#if WARP_COLS > 1
    if (base_c >= cols) {
        return;
    }
#endif
#if WARP_ROWS > 1
    if (base_r >= rows) {
        return;
    }
#endif

    /* a stack buffer for accumulating partial row sums on the fly
     * (it should get mapped to about log2(VECS_PER_WARP) registers
     * by the compiler): */
    int stack[VECS_PER_WARP];
    int stack_pos = 0;

    vec_type col_sums = VEC_ZERO;

    /* load two values from global memory and push them onto the stack: */
#define LD2(i) \
    do { \
        vec_type v; \
        int pos_r = base_r + (i) * 2 * VEC_SIZE + tir; \
        int pos_c = base_c / VEC_SIZE + tic; \
        v = table[(pos_r + 0 * VEC_SIZE) * (cols / VEC_SIZE) + pos_c]; \
        stack[stack_pos++] = vec_sum(&v); \
        vec_add(&col_sums, &v); \
        v = table[(pos_r + 1 * VEC_SIZE) * (cols / VEC_SIZE) + pos_c]; \
        stack[stack_pos++] = vec_sum(&v); \
        vec_add(&col_sums, &v); \
    } while (0)
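
    /* Example (VEC_SIZE == 4, i == 0): pos_r = base_r + tir, and the two
     * loads fetch rows base_r + tir and base_r + tir + 4; across
     * tir = 0..3 the warp thus reads rows base_r + 0..7, each row as
     * eight int4 loads (one per tic) covering the warp's 32 columns. */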

    /* pop two values from the stack, exchange one of them with another
     * thread, add them, and push the result onto the stack: */
#define ADD(i) \
    do { \
        int v1 = stack[--stack_pos]; \
        int v0 = stack[--stack_pos]; \
        warp_exchange(tic, 1 << (i), 1, &v0, &v1); \
        stack[stack_pos++] = v0 + v1; \
    } while (0)
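
    /* Example: ADD(0) pairs threads whose tic differs in bit 0 and merges
     * their two topmost stack entries into one; each higher bit position
     * folds together twice as many partial sums, so after the final ADD
     * each thread is left with one complete 32-column row sum on its
     * stack. */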

    /* load the values, re-distribute and accumulate partial row sums,
     * and accumulate partial column sums: */
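    /* (the interleaved __threadfence_block() calls are block-level memory
     * fences that limit how far the compiler reorders the global loads;
     * compare the FENCE_INTERVAL fences in the loop variant in the second
     * file) */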
#if VEC_SIZE == 1
    LD2( 0); ADD(0);
    LD2( 1); ADD(0); ADD(1);
    __threadfence_block();
    LD2( 2); ADD(0);
    LD2( 3); ADD(0); ADD(1); ADD(2);
    __threadfence_block();
    LD2( 4); ADD(0);
    LD2( 5); ADD(0); ADD(1);
    __threadfence_block();
    LD2( 6); ADD(0);
    LD2( 7); ADD(0); ADD(1); ADD(2); ADD(3);
    __threadfence_block();
    LD2( 8); ADD(0);
    LD2( 9); ADD(0); ADD(1);
    __threadfence_block();
    LD2(10); ADD(0);
    LD2(11); ADD(0); ADD(1); ADD(2);
    __threadfence_block();
    LD2(12); ADD(0);
    LD2(13); ADD(0); ADD(1);
    __threadfence_block();
    LD2(14); ADD(0);
    LD2(15); ADD(0); ADD(1); ADD(2); ADD(3); ADD(4);
#elif VEC_SIZE == 2
    LD2( 0); ADD(0);
    LD2( 1); ADD(0); ADD(1);
    __threadfence_block();
    LD2( 2); ADD(0);
    LD2( 3); ADD(0); ADD(1); ADD(2);
    __threadfence_block();
    LD2( 4); ADD(0);
    LD2( 5); ADD(0); ADD(1);
    __threadfence_block();
    LD2( 6); ADD(0);
    LD2( 7); ADD(0); ADD(1); ADD(2); ADD(3);
#elif VEC_SIZE == 4
    LD2( 0); ADD(0);
    LD2( 1); ADD(0); ADD(1);
    __threadfence_block();
    LD2( 2); ADD(0);
    LD2( 3); ADD(0); ADD(1); ADD(2);
#else
#error "Invalid vector size!"
#endif

    /* The above mess is equivalent to the following code, but
     * unfortunately NVCC fails to unroll the inner loop...
     * NOTE: the number of values 'k' takes for a given 'i' (i.e. the
     * number of ADDs after each LD2) is the ruler function [1] of (i + 1).
     * [1] https://oeis.org/A001511 */
    /*
    #pragma unroll
    for (int i = 0; i < VECS_PER_WARP / 2; i++) {
        LD2(i);
    #pragma unroll
        for (int k = 0; (1 << k) <= ((i + 1) & -(i + 1)); k++) {
            ADD(k);
        }
        if (i % 2 == 1) {
            __threadfence_block();
        }
    }
    */
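
    /* Worked example of the ruler function: the lowest set bit of (i + 1)
     * for i = 0, 1, 2, 3, ... is 1, 2, 1, 4, ..., giving 1, 2, 1, 3, ...
     * ADDs per iteration -- exactly the LD2/ADD pattern spelled out above
     * (for VEC_SIZE == 1 the last iteration, i = 15, has lowest set bit 16
     * and hence five ADDs, ADD(0) through ADD(4)). */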
#undef LD2
#undef ADD

    /* re-distribute and accumulate the partial column sums: */
    int col_sum = vec_shuffle(tir, &col_sums);

    /* we already have our row sum on the stack: */
    int row_sum = stack[0];

    /* update the global row/column sums corresponding to our chunk: */
    atomicAdd(&sum_rows[base_r + tic * VEC_SIZE + tir], row_sum);
    atomicAdd(&sum_cols[base_c + tic * VEC_SIZE + tir], col_sum);
}

#define TOAVG_BLOCK_ITEMS (TOAVG_BLOCK_SIZE * WARP_SIZE)
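
/* toAverage2() converts both sum arrays in a single launch: a flat thread
 * index is mapped first onto sums1/avgs1 and, past limit1, onto
 * sums2/avgs2. The conversion works in place -- each thread reads its int
 * sum before overwriting the same location with the float average. */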
__global__ void toAverage2(float *avgs1, const int *sums1,
                           const float mul1, const int limit1,
                           float *avgs2, const int *sums2,
                           const float mul2, const int limit2)
{
    int pos = blockIdx.x * TOAVG_BLOCK_ITEMS + threadIdx.x;
    if (pos < limit1) {
        avgs1[pos] = (float)sums1[pos] * mul1;
    } else {
        pos -= limit1;
        if (pos < limit2) {
            avgs2[pos] = (float)sums2[pos] * mul2;
        }
    }
}

#define BLOCK_COUNT(size, block) (((size) + (block) - 1) / (block))

void solveGPU(
    const int *results,
    float *avg_stud, float *avg_que,
    const int students, const int questions)
{
    /* compute the total number of blocks in the X/Y axes: */
    int blocksY = BLOCK_COUNT(students, BLOCK_ROWS);
    int blocksX = BLOCK_COUNT(questions, BLOCK_COLS);

    /* clear the result arrays, so threads can just add to them: */
    cudaMemset(avg_stud, 0, students * sizeof(avg_stud[0]));
    cudaMemset(avg_que, 0, questions * sizeof(avg_que[0]));

    /* compute column & row sums: */
    sumTable<<<dim3(blocksX, blocksY), dim3(WARP_SIZE, WARP_COLS, WARP_ROWS)>>>(
        (const vec_type *)results, (int *)avg_stud, (int *)avg_que,
        students, questions);
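
    /* NOTE: the float output arrays double as integer accumulators --
     * sumTable accumulates int sums into them via atomicAdd() and
     * toAverage2() then rewrites each element in place as a float average
     * (sizeof(int) == sizeof(float), and each element is read before it
     * is overwritten). */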

    /* convert sums to averages: */
    int blocks = BLOCK_COUNT(students + questions, TOAVG_BLOCK_ITEMS);
    toAverage2<<<blocks, TOAVG_BLOCK_ITEMS>>>(
        avg_stud, (int *)avg_stud, 1.0F / questions, students,
        avg_que, (int *)avg_que, 1.0F / students, questions);
}
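Second file: an alternative variant of the same solution, replacing the unrolled stack machine with a plain loop and a warp-level butterfly reduction.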
/*
 * Copyright (C) 2016, Ondrej Mosnacek <[email protected]>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
/*
 * Informational benchmarks:
 *   1024 x 1024:   26120 ME/s
 *   2048 x 2048:   33480 ME/s
 *   4096 x 4096:   35940 ME/s
 *   8192 x 8192:   36780 ME/s
 *   8192 x 1024:   35110 ME/s
 *   1024 x 8192:   35120 ME/s
 *   16384 x 256:   32820 ME/s
 *   256 x 16384:   32910 ME/s
 *   262144 x 32:   29430 ME/s
 *   32 x 262144:   29090 ME/s
 */
#define WARP_SIZE 32 /* do not change! ;) */

/* begin tuned parameters */
#define VEC_SIZE 4 /* vector size for global memory reading; one of {1,2,4} */
#define WARP_ROWS 2 /* number of warps per block (in the Y axis) */
#define WARP_COLS 2 /* number of warps per block (in the X axis) */
#define FENCE_INTERVAL 2 /* how often to do a memory fence */
#define TOAVG_BLOCK_SIZE 4 /* block size (in warps) for the toAverage kernel */
/* end tuned parameters */

/* rows/columns per block: */
#define BLOCK_ROWS (WARP_SIZE * WARP_ROWS)
#define BLOCK_COLS (WARP_SIZE * WARP_COLS)
#define VECS_PER_WARP (WARP_SIZE / VEC_SIZE)

__device__ void warp_exchange(int id, int mask, int mul, int *s0, int *s1)
{
    int message = (id & mask) == 0 ? *s1 : *s0;
    message = __shfl_xor(message, mask * mul, WARP_SIZE);
    ((id & mask) == 0 ? *s1 : *s0) = message;
}

/* vector-size-specific definitions: */
#if VEC_SIZE == 1
typedef int1 vec_type;
#define VEC_ZERO make_int1(0)

__device__ void vec_add(vec_type *a, const vec_type *b)
{
    a->x += b->x;
}

__device__ int vec_sum(const vec_type *v)
{
    return v->x;
}

__device__ int vec_shuffle(int tir, vec_type *v)
{
    return v->x;
}
#elif VEC_SIZE == 2
typedef int2 vec_type;
#define VEC_ZERO make_int2(0, 0)

__device__ void vec_add(vec_type *a, const vec_type *b)
{
    a->x += b->x;
    a->y += b->y;
}

__device__ int vec_sum(const vec_type *v)
{
    return v->x + v->y;
}

__device__ int vec_shuffle(int tir, vec_type *v)
{
    warp_exchange(tir, 1, VECS_PER_WARP, &v->x, &v->y);
    return v->x + v->y;
}
#elif VEC_SIZE == 4
typedef int4 vec_type;
#define VEC_ZERO make_int4(0, 0, 0, 0)

__device__ void vec_add(vec_type *a, const vec_type *b)
{
    a->x += b->x;
    a->y += b->y;
    a->z += b->z;
    a->w += b->w;
}

__device__ int vec_sum(const vec_type *v)
{
    return v->x + v->y + v->z + v->w;
}

__device__ int vec_shuffle(int tir, vec_type *v)
{
    warp_exchange(tir, 1, VECS_PER_WARP, &v->x, &v->y);
    v->x += v->y;
    warp_exchange(tir, 1, VECS_PER_WARP, &v->z, &v->w);
    v->z += v->w;
    warp_exchange(tir, 2, VECS_PER_WARP, &v->x, &v->z);
    return v->x + v->z;
}
#else
#error "Invalid vector size!"
#endif

__global__ void sumTable(const vec_type *table,
                         int *sum_rows, int *sum_cols,
                         const int rows, const int cols)
{
    int br = blockIdx.y;
    int bc = blockIdx.x;
    int ti = threadIdx.x;
    int tc = threadIdx.y;
    int tr = threadIdx.z;
    int tic = ti % VECS_PER_WARP;
    int tir = ti / VECS_PER_WARP;

    int base_c = bc * BLOCK_COLS + tc * WARP_SIZE;
    int base_r = br * BLOCK_ROWS + tr * WARP_SIZE;
#if WARP_COLS > 1
    if (base_c >= cols) {
        return;
    }
#endif
#if WARP_ROWS > 1
    if (base_r >= rows) {
        return;
    }
#endif

    vec_type col_sums = VEC_ZERO;
    int row_sum = 0;

#pragma unroll
    for (int i = 0; i < VECS_PER_WARP; i++) {
        /* load the next row vector: */
        int pos_r = base_r + i * VEC_SIZE + tir;
        int pos_c = base_c / VEC_SIZE + tic;
        vec_type item = table[pos_r * (cols / VEC_SIZE) + pos_c];

        /* accumulate the row sum using the butterfly reduction
         * (stolen from the CUDA Programming Guide): */
        int value = vec_sum(&item);
#pragma unroll
        for (int k = 1; k < VECS_PER_WARP; k *= 2) {
            value += __shfl_xor(value, k, WARP_SIZE);
        }
        if (i == tic) {
            row_sum = value;
        }
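
        /* after the butterfly, every thread in this tir group holds the
         * full 32-column sum of row base_r + i * VEC_SIZE + tir; the
         * thread with tic == i keeps it, so thread (tir, tic) ends up
         * holding the sum of row base_r + tic * VEC_SIZE + tir -- the
         * index used by the atomicAdd on sum_rows below. */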
        /* increment the partial column sums: */
        vec_add(&col_sums, &item);

        /* perform a block-level memory fence to limit reordering
         * of global loads: */
        if (i % FENCE_INTERVAL == (FENCE_INTERVAL - 1)) {
            __threadfence_block();
        }
    }

    /* re-distribute and accumulate the partial column sums: */
    int col_sum = vec_shuffle(tir, &col_sums);

    /* update the global row/column sums corresponding to our chunk: */
    atomicAdd(&sum_rows[base_r + tic * VEC_SIZE + tir], row_sum);
    atomicAdd(&sum_cols[base_c + tic * VEC_SIZE + tir], col_sum);
}

#define TOAVG_BLOCK_ITEMS (TOAVG_BLOCK_SIZE * WARP_SIZE)

__global__ void toAverage2(float *avgs1, const int *sums1,
                           const float mul1, const int limit1,
                           float *avgs2, const int *sums2,
                           const float mul2, const int limit2)
{
    int pos = blockIdx.x * TOAVG_BLOCK_ITEMS + threadIdx.x;
    if (pos < limit1) {
        avgs1[pos] = (float)sums1[pos] * mul1;
    } else {
        pos -= limit1;
        if (pos < limit2) {
            avgs2[pos] = (float)sums2[pos] * mul2;
        }
    }
}

#define BLOCK_COUNT(size, block) (((size) + (block) - 1) / (block))

void solveGPU(
    const int *results,
    float *avg_stud, float *avg_que,
    const int students, const int questions)
{
    /* compute the total number of blocks in the X/Y axes: */
    int blocksY = BLOCK_COUNT(students, BLOCK_ROWS);
    int blocksX = BLOCK_COUNT(questions, BLOCK_COLS);

    /* clear the result arrays, so threads can just add to them: */
    cudaMemset(avg_stud, 0, students * sizeof(avg_stud[0]));
    cudaMemset(avg_que, 0, questions * sizeof(avg_que[0]));

    /* compute column & row sums: */
    sumTable<<<dim3(blocksX, blocksY), dim3(WARP_SIZE, WARP_COLS, WARP_ROWS)>>>(
        (const vec_type *)results, (int *)avg_stud, (int *)avg_que,
        students, questions);

    /* convert sums to averages: */
    int blocks = BLOCK_COUNT(students + questions, TOAVG_BLOCK_ITEMS);
    toAverage2<<<blocks, TOAVG_BLOCK_ITEMS>>>(
        avg_stud, (int *)avg_stud, 1.0F / questions, students,
        avg_que, (int *)avg_que, 1.0F / students, questions);
}
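
For completeness, a minimal host-side harness (hypothetical; the PB197 framework supplies the real driver, and the names below are illustrative) could look like the sketch that follows. It can be appended to either file and compiled with nvcc, assumes table dimensions that are multiples of the tile sizes (as in the benchmarks above), and uses an all-ones table so that every computed average should come out as 1.0:

#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

int main(void)
{
    const int students = 1024, questions = 1024;

    /* an all-ones table, so every row/column average must be 1.0: */
    std::vector<int> h_results(students * questions, 1);

    int *d_results;
    float *d_avg_stud, *d_avg_que;
    cudaMalloc(&d_results, students * questions * sizeof(int));
    cudaMalloc(&d_avg_stud, students * sizeof(float));
    cudaMalloc(&d_avg_que, questions * sizeof(float));
    cudaMemcpy(d_results, h_results.data(),
               students * questions * sizeof(int), cudaMemcpyHostToDevice);

    solveGPU(d_results, d_avg_stud, d_avg_que, students, questions);

    std::vector<float> h_avg_stud(students), h_avg_que(questions);
    cudaMemcpy(h_avg_stud.data(), d_avg_stud,
               students * sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(h_avg_que.data(), d_avg_que,
               questions * sizeof(float), cudaMemcpyDeviceToHost);

    printf("avg_stud[0] = %f, avg_que[0] = %f\n",
           h_avg_stud[0], h_avg_que[0]); /* expect 1.0 and 1.0 */

    cudaFree(d_results);
    cudaFree(d_avg_stud);
    cudaFree(d_avg_que);
    return 0;
}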