Last active
May 15, 2024 15:22
-
-
Save chetandhembre/6e93a652026f0bb669c981513e2cc5b8 to your computer and use it in GitHub Desktop.
puzzle10_dotproduct floating point overflow error
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include<common.h> | |
#include <cassert> | |
/* | |
Implement a kernel that computes the dot-product of a and b and stores it in out. | |
You have 1 thread per position. | |
You only need 2 global reads and 1 global write per thread. | |
Note: For this problem you don't need to worry about number of shared reads. | |
We will handle that challenge later. | |
*/ | |
//fasted | |
#define BLOCK_SIZE 1024 | |
#define COARSE_FACTORE 16 | |
// for n = 20480000 this kernel fails because of floating point precision issue | |
__global__ void dotproduct(float *a, float *b, int n, float *out) { | |
assert(blockDim.x == BLOCK_SIZE); | |
int x = blockIdx.x * blockDim.x + threadIdx.x; | |
float val; | |
if (x < n) { | |
val = (a[x] * b[x]); | |
atomicAdd(&out[0], val); | |
} | |
} | |
__global__ void dotproduct1(float *a, float *b, int n, float *out) { | |
assert(blockDim.x == BLOCK_SIZE); | |
int x = blockIdx.x * blockDim.x + threadIdx.x; | |
if (x < n) { | |
out[x] = a[x] * b[x]; | |
} | |
} | |
__global__ void reduction(float *input, int n) { | |
assert(blockDim.x == BLOCK_SIZE); | |
int x = blockIdx.x * blockDim.x + threadIdx.x; | |
__shared__ float s[BLOCK_SIZE]; | |
float val = 0; | |
int k; | |
for (int i = 0; i < COARSE_FACTORE && x + i * blockDim.x < n; i++) { | |
k = i * blockDim.x + x; | |
val += input[k]; | |
} | |
s[threadIdx.x] = val; | |
__syncthreads(); | |
for (int i = 1; i < blockDim.x; i *= 2) { | |
if (threadIdx.x % (2 * i) == 0) { | |
s[threadIdx.x] += s[threadIdx.x + i]; | |
} | |
__syncthreads(); | |
} | |
if (threadIdx.x == 0) { | |
input[blockIdx.x] = s[0]; | |
} | |
} | |
void __puzzle_part_2(float *input, int n) { | |
int block_size = BLOCK_SIZE; | |
int grid_size = ceil_div(n/COARSE_FACTORE, block_size); | |
while (grid_size > 0) { | |
reduction<<<grid_size, block_size>>>(input, n); | |
if (grid_size == 1) { | |
break; | |
} | |
n = grid_size; | |
grid_size = ceil_div(n/COARSE_FACTORE, block_size); | |
} | |
} | |
float puzzle10_cpu(float *a, float *b, int n) { | |
double out = 0; | |
int i = 0; | |
for (; i < n; i++) { | |
out += (a[i] * b[i]); | |
} | |
return out; | |
} | |
void _puzzle10(float *a, float *b, int n, float *output) { | |
int block_size = 1024; | |
int grid_size = ceil_div(n, block_size); | |
// dotproduct<<<grid_size, block_size>>>(a, b, n, output); | |
dotproduct1<<<grid_size, block_size>>>(a, b, n, output); | |
__puzzle_part_2(output, n); | |
} | |
void puzzle10() { | |
float *a, *b, *output; | |
int n = 20480000; | |
a = (float *)malloc(n * sizeof(float)); | |
b = (float *)malloc(n * sizeof(float)); | |
output = (float *)malloc(n * sizeof(float)); | |
for (int i = 0; i < n; i++) { | |
a[i] = 1; | |
b[i] = 1; | |
output[i] = 0; | |
} | |
float *d_a, *d_b, *d_output; | |
cudaCheck(cudaMalloc(&d_a, n * sizeof(float))); | |
cudaCheck(cudaMalloc(&d_b, n * sizeof(float))); | |
cudaCheck(cudaMalloc(&d_output, n * sizeof(float))); | |
cudaCheck(cudaMemcpy(d_a, a, n * sizeof(float), cudaMemcpyHostToDevice)); | |
cudaCheck(cudaMemcpy(d_b, b, n * sizeof(float), cudaMemcpyHostToDevice)); | |
cudaCheck(cudaMemcpy(d_output, output, n * sizeof(float), cudaMemcpyHostToDevice)); | |
int repeat_times = 1; | |
float elapsed_time = benchmark_kernel(repeat_times, _puzzle10, d_a, d_b, n, d_output); | |
printf("time gpu %.8f ms\n", elapsed_time); | |
cudaCheck(cudaMemcpy(output, d_output, sizeof(float), cudaMemcpyDeviceToHost)); | |
float output_cpu = puzzle10_cpu(a, b, n); | |
if (output_cpu != *output) { | |
printf("Error: %f != %f\n", output_cpu, *output); | |
} else { | |
printf("Output is correct %f = %f \n", output_cpu, *output); | |
} | |
cudaCheck(cudaFree(d_a)); | |
cudaCheck(cudaFree(d_b)); | |
cudaCheck(cudaFree(d_output)); | |
free(a); | |
free(b); | |
free(output); | |
} | |
//with dotproduct kernel Error: 20480000.000000 != 16777216.000000 | |
//with dotproduct1 and reduction kernel Error: 20480000.000000 == 20480000.000000 | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment