@ddjerqq, created January 8, 2023
hello CUDA: my first CUDA app (saxpy)
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <iostream>
using namespace std;
// create a 'kernel': a kernel is a function that runs on the 'device' (the GPU)
__global__
void saxpy(int n, float a, float* x, float* y)
{
    // compute this thread's global index into the arrays
    int i = blockIdx.x * blockDim.x + threadIdx.x;
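    // worked example with illustrative numbers (not from the original gist):
    // if blockDim.x = 256, the thread with blockIdx.x = 2 and threadIdx.x = 5
    // computes i = 2 * 256 + 5 = 517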
    if (i < n)
    {
        y[i] = a * x[i] + y[i];
    }
}
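
// the gist never checks CUDA return codes; below is a minimal sketch of the
// usual error-checking helper (an addition, not part of the original code).
// cudaError_t, cudaSuccess and cudaGetErrorString are real CUDA runtime APIs.
#define CUDA_CHECK(call)                                                \
    do {                                                                \
        cudaError_t err_ = (call);                                      \
        if (err_ != cudaSuccess)                                        \
        {                                                               \
            cerr << "CUDA error: " << cudaGetErrorString(err_) << endl; \
            exit(EXIT_FAILURE);                                         \
        }                                                               \
    } while (0)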
int main()
{
    // the number of elements in each array (1 << 20 floats; 1 << 30 would
    // need 4 GiB per array and overflow the int byte counts printed below)
    const int N = 1 << 20;
    cout << "running saxpy (y = a * x + y) on 2 arrays of size: " << N << endl;
    // x and y point to the arrays on the host
    // d_x and d_y point to the arrays on the device
    float *x, *y, *d_x, *d_y;
    x = (float*) malloc(N * sizeof(float));
    y = (float*) malloc(N * sizeof(float));
    cout << "allocated " << 2 * N * sizeof(float) << " bytes on the host" << endl;
    // allocate memory on the device:
    // cudaMalloc takes a pointer that will receive the device address, and
    // the requested allocation size in bytes. it allocates `size` bytes of
    // linear memory on the device and returns a pointer to the allocated
    // memory in *devPtr, suitably aligned for any kind of variable; the
    // memory is not cleared. cudaMalloc() returns cudaErrorMemoryAllocation
    // in case of failure. the device version of cudaFree cannot be used with
    // a *devPtr allocated using the host API, and vice versa.
    cudaMalloc(&d_x, N * sizeof(float));
    cudaMalloc(&d_y, N * sizeof(float));
    cout << "allocated " << 2 * N * sizeof(float) << " bytes on the device" << endl;
    // fill the arrays on the host before copying them to the device
    for (int i = 0; i < N; i++)
    {
        x[i] = 1.0f;
        y[i] = 2.0f;
    }
    cout << "arrays filled on the host" << endl;
    // cudaMemcpy(dst, src, count, kind): the destination pointer comes first,
    // count is the size in bytes to copy, and kind is the type of transfer.
    // both x and y must be copied, otherwise the kernel reads uninitialized
    // device memory.
    cout << "copying from host to device started" << endl;
    cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, y, N * sizeof(float), cudaMemcpyHostToDevice);
    cout << "copying from host to device finished" << endl;
cout << "launching kernel" << endl;
// launch the kernel
saxpy<<<(N + 256) / 256, 256>>>(N, 2.0f, d_x, d_y);
    // the execution configuration (of a __global__ function call) is
    // specified by inserting an expression of the form
    //   kernel<<<Dg, Db, Ns, S>>>()
    // where:
    //   * Dg (dim3) specifies the dimension and size of the grid
    //   * Db (dim3) specifies the dimension and size of each block
    //   * Ns (size_t) specifies the number of bytes in shared memory that is
    //     dynamically allocated per block for this call, in addition to the
    //     statically allocated memory
    //   * S (cudaStream_t) specifies the associated stream; an optional
    //     parameter which defaults to 0
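    // for example, the 1-D launch above could also be written with explicit
    // dim3 values (an equivalent sketch, not in the original gist):
    //   dim3 grid((N + 255) / 256);
    //   dim3 block(256);
    //   saxpy<<<grid, block>>>(N, 2.0f, d_x, d_y);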
cout << "kernel finished successfully" << endl;
    // copy the result back: cudaMemcpy copies `count` bytes from the memory
    // area pointed to by src to the memory area pointed to by dst, where kind
    // specifies the direction and must be one of host-to-host, host-to-device,
    // device-to-host, or device-to-device
    cout << "copying back from device to host started" << endl;
    cudaMemcpy(y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost);
    cout << "copying back from device to host finished" << endl;
cout << "checking for maxError in the resulting array" << endl;
// check for max error
// we have the result array in y
float maxError = 0.0f;
for (int i = 0; i < N; i++) {
maxError = max(maxError, abs(y[i] - 4.0f));
}
cout << "checking for maxError in the resulting done" << endl;
cout << "max error: " << maxError << endl;
cout << "freeing memory" << endl;
// free the memory, no need to explain this a lot.
cudaFree(d_x);
cudaFree(d_y);
free(x);
free(y);
cout << "freeing memory done" << endl;
cout << "exiting program" << endl;
system("pause");
return EXIT_SUCCESS;
}
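
// to build and run (assuming the CUDA toolkit's nvcc compiler is available
// and the file is saved as saxpy.cu; the filename is an assumption):
//   nvcc saxpy.cu -o saxpy
//   ./saxpy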