@ddjerqq, created January 8, 2023
hello CUDA: my first CUDA app (saxpy)
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <iostream>
using namespace std;
// create a 'kernel': a kernel is a function that runs on the 'device' (the GPU)
__global__
void saxpy(int n, float a, float* x, float* y)
{
    // compute this thread's global index into the arrays
    int i = blockIdx.x * blockDim.x + threadIdx.x;
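    // worked example with illustrative numbers (not from the original gist):
    // if blockDim.x = 256, the thread with blockIdx.x = 2 and threadIdx.x = 5
    // computes i = 2 * 256 + 5 = 517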
    if (i < n)
    {
        y[i] = a * x[i] + y[i];
    }
}
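
// the gist never checks CUDA return codes; below is a minimal sketch of the
// usual error-checking helper (an addition, not part of the original code).
// cudaError_t, cudaSuccess and cudaGetErrorString are real CUDA runtime APIs.
#define CUDA_CHECK(call)                                                \
    do {                                                                \
        cudaError_t err_ = (call);                                      \
        if (err_ != cudaSuccess)                                        \
        {                                                               \
            cerr << "CUDA error: " << cudaGetErrorString(err_) << endl; \
            exit(EXIT_FAILURE);                                         \
        }                                                               \
    } while (0)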
int main()
{
    // the number of elements in each array (1 << 20 floats; 1 << 30 would
    // need 4 GiB per array and overflow the int byte counts printed below)
    const int N = 1 << 20;
    cout << "running saxpy (y = a * x + y) on 2 arrays of size: " << N << endl;
    // x and y point to the arrays on the host
    // d_x and d_y point to the arrays on the device
    float *x, *y, *d_x, *d_y;
    x = (float*) malloc(N * sizeof(float));
    y = (float*) malloc(N * sizeof(float));
    cout << "allocated " << 2 * N * sizeof(float) << " bytes on the host" << endl;
    // allocate memory on the device:
    // cudaMalloc takes a pointer that will receive the device address, and
    // the requested allocation size in bytes. it allocates `size` bytes of
    // linear memory on the device and returns a pointer to the allocated
    // memory in *devPtr, suitably aligned for any kind of variable; the
    // memory is not cleared. cudaMalloc() returns cudaErrorMemoryAllocation
    // in case of failure. the device version of cudaFree cannot be used with
    // a *devPtr allocated using the host API, and vice versa.
    cudaMalloc(&d_x, N * sizeof(float));
    cudaMalloc(&d_y, N * sizeof(float));
    cout << "allocated " << 2 * N * sizeof(float) << " bytes on the device" << endl;
    // fill the arrays on the host before copying them to the device
    for (int i = 0; i < N; i++)
    {
        x[i] = 1.0f;
        y[i] = 2.0f;
    }
    cout << "arrays filled on the host" << endl;
    // cudaMemcpy(dst, src, count, kind): the destination pointer comes first,
    // count is the size in bytes to copy, and kind is the type of transfer.
    // both x and y must be copied, otherwise the kernel reads uninitialized
    // device memory.
    cout << "copying from host to device started" << endl;
    cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, y, N * sizeof(float), cudaMemcpyHostToDevice);
    cout << "copying from host to device finished" << endl;
cout << "launching kernel" << endl;
// launch the kernel
saxpy<<<(N + 256) / 256, 256>>>(N, 2.0f, d_x, d_y);
    // the execution configuration (of a __global__ function call) is
    // specified by inserting an expression of the form
    //   kernel<<<Dg, Db, Ns, S>>>()
    // where:
    //   * Dg (dim3) specifies the dimension and size of the grid
    //   * Db (dim3) specifies the dimension and size of each block
    //   * Ns (size_t) specifies the number of bytes in shared memory that is
    //     dynamically allocated per block for this call, in addition to the
    //     statically allocated memory
    //   * S (cudaStream_t) specifies the associated stream; an optional
    //     parameter which defaults to 0
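    // for example, the 1-D launch above could also be written with explicit
    // dim3 values (an equivalent sketch, not in the original gist):
    //   dim3 grid((N + 255) / 256);
    //   dim3 block(256);
    //   saxpy<<<grid, block>>>(N, 2.0f, d_x, d_y);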
cout << "kernel finished successfully" << endl;
    // copy the result back: cudaMemcpy copies `count` bytes from the memory
    // area pointed to by src to the memory area pointed to by dst, where kind
    // specifies the direction and must be one of host-to-host, host-to-device,
    // device-to-host, or device-to-device
    cout << "copying back from device to host started" << endl;
    cudaMemcpy(y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost);
    cout << "copying back from device to host finished" << endl;
cout << "checking for maxError in the resulting array" << endl;
// check for max error
// we have the result array in y
float maxError = 0.0f;
for (int i = 0; i < N; i++) {
maxError = max(maxError, abs(y[i] - 4.0f));
}
cout << "checking for maxError in the resulting done" << endl;
cout << "max error: " << maxError << endl;
cout << "freeing memory" << endl;
// free the memory, no need to explain this a lot.
cudaFree(d_x);
cudaFree(d_y);
free(x);
free(y);
cout << "freeing memory done" << endl;
cout << "exiting program" << endl;
system("pause");
return EXIT_SUCCESS;
}
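
// to build and run (assuming the CUDA toolkit's nvcc compiler is available
// and the file is saved as saxpy.cu; the filename is an assumption):
//   nvcc saxpy.cu -o saxpy
//   ./saxpy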