Created
January 8, 2023 15:39
-
-
Save ddjerqq/c5072f80cc56bbd42975f41cea148a4f to your computer and use it in GitHub Desktop.
Hello CUDA: my first CUDA application (a simple SAXPY example)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <iostream>

#include <cuda_runtime.h>

using namespace std;
// create a 'kernel'. a kernel is a function that runs on the 'device' - GPU | |
// SAXPY kernel: computes y[i] = a * x[i] + y[i] element-wise.
//
// Expects a 1D grid of 1D blocks with at least n total threads;
// threads whose index falls past n are guarded out, so the grid
// does not need to divide n evenly.
//
// x is read-only: marking it const __restrict__ lets the compiler
// route loads through the read-only data cache.
__global__
void saxpy(int n, float a, const float* __restrict__ x, float* __restrict__ y)
{
    // flat global thread index for this thread
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    // bounds check: mandatory, the last block usually overhangs n
    if (i < n)
    {
        y[i] = a * x[i] + y[i];
    }
}
// Host driver: allocates host/device arrays, runs saxpy on the GPU,
// and verifies the result (every element should be 2*1 + 2 = 4).
int main()
{
    // number of elements per array (2^30 floats = 4 GiB per array)
    const int N = 1 << 30;
    // byte count as size_t: `2 * N` overflows a 32-bit signed int,
    // which the original code hit in its logging expressions
    const size_t bytes = (size_t)N * sizeof(float);

    // small checker so every CUDA call's return code is inspected;
    // an ignored early error makes every later call fail mysteriously
    auto check = [](cudaError_t err, const char* what) {
        if (err != cudaSuccess) {
            cerr << what << " failed: " << cudaGetErrorString(err) << endl;
            exit(EXIT_FAILURE);
        }
    };

    cout << "multiplying 2 arrays with the size of: " << N << endl;

    // x and y are host array pointers; d_x and d_y are device array pointers
    float *x, *y, *d_x, *d_y;
    x = (float*) malloc(bytes);
    y = (float*) malloc(bytes);
    if (x == nullptr || y == nullptr) {
        cerr << "host allocation failed" << endl;
        return EXIT_FAILURE;
    }
    cout << "allocated " << 2 * bytes << " bytes on the host" << endl;

    // allocate device memory; cudaMalloc returns an error code on failure
    check(cudaMalloc(&d_x, bytes), "cudaMalloc d_x");
    check(cudaMalloc(&d_y, bytes), "cudaMalloc d_y");
    cout << "allocated " << 2 * bytes << " bytes on the device" << endl;

    // fill the input arrays on the host before copying them over
    for (int i = 0; i < N; i++)
    {
        x[i] = 1.0f;
        y[i] = 2.0f;
    }
    cout << "arrays filled on the host" << endl;

    // copy BOTH inputs host -> device; cudaMemcpy is (dst, src, count, kind).
    // The original copied only y — leaving d_x uninitialized — and had the
    // dst/src arguments swapped.
    cout << "copying from host to device started" << endl;
    check(cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice), "memcpy H2D x");
    check(cudaMemcpy(d_y, y, bytes, cudaMemcpyHostToDevice), "memcpy H2D y");
    cout << "copying from host to device finished" << endl;

    // launch the kernel: ceil(N / 256) blocks of 256 threads each
    cout << "launching kernel" << endl;
    saxpy<<<(N + 255) / 256, 256>>>(N, 2.0f, d_x, d_y);
    // launches are asynchronous and return no status: fetch launch-config
    // errors explicitly, then synchronize to surface execution errors
    check(cudaGetLastError(), "kernel launch");
    check(cudaDeviceSynchronize(), "kernel execution");
    cout << "kernel finished successfully" << endl;

    // copy the result back device -> host (blocking, so y is valid after)
    cout << "copying back from device to host started" << endl;
    check(cudaMemcpy(y, d_y, bytes, cudaMemcpyDeviceToHost), "memcpy D2H y");
    cout << "copying back from device to host finished" << endl;

    cout << "checking for maxError in the resulting array" << endl;
    // expected value per element: 2.0f * 1.0f + 2.0f = 4.0f;
    // fabsf avoids accidentally binding to the C int abs()
    float maxError = 0.0f;
    for (int i = 0; i < N; i++) {
        maxError = max(maxError, fabsf(y[i] - 4.0f));
    }
    cout << "checking for maxError in the resulting done" << endl;
    cout << "max error: " << maxError << endl;

    cout << "freeing memory" << endl;
    cudaFree(d_x);
    cudaFree(d_y);
    free(x);
    free(y);
    cout << "freeing memory done" << endl;

    cout << "exiting program" << endl;
    // NOTE(review): Windows-only convenience pause; fails silently elsewhere
    system("pause");
    return EXIT_SUCCESS;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment