qfgaohao · October 17, 2017 04:13 · qfgaohao · Oct 11, 2017
diff --git a/bank_conflicts_test.cu b/bank_conflicts_test.cu
 #include <stdio.h>

 #define N (32)


 /*
  * Times one shared-memory read-modify-write for each stride 1..32 and
  * reports the clock() delta observed by thread 0.
  *
  * In iteration i every thread updates s[threadIdx.x * (i + 1)], so the
  * threads of the block touch shared memory with a stride of i + 1 floats.
  *
  * time: device array with room for at least 32 ints; time[i] receives the
  *       clock() ticks thread 0 measured for stride i + 1.
  *
  * Precondition: blockDim.x <= 32. With 32 threads the largest index used
  * is 31 * 32 = 992; more threads would index past the 1024-element array.
  */
 __global__ void increment(int* time) {
    // volatile: every reference to s must compile to a real shared-memory
    // load/store. s is never copied out of the kernel, so without it the
    // compiler is free to drop the timed access or move it out of the
    // clock() window.
    __shared__ volatile float s[1024];

    // Each thread fills its own slice instead of all threads rewriting all
    // 1024 elements.
    for (int j = threadIdx.x; j < 1024; j += blockDim.x) {
        s[j] = 1.0f;
    }
    __syncthreads();

    for (int i = 0; i < 32; i++) {
        const int idx = threadIdx.x * (i + 1);  // stride: i + 1

        clock_t start = clock();
        // enable broadcast by accessing the same element in a bank:
        // s[threadIdx.x * (i + 1) % 32] += 1.0f;
        s[idx] = s[idx] + 1.0f;
        clock_t end = clock();

        if (threadIdx.x == 0) {
            time[i] = (int)(end - start);
        }

        // Outside the timed region: keeps this iteration's read-modify-write
        // from overlapping the next iteration, where a different thread may
        // touch the same element.
        __syncthreads();
    }
 }


 /*
  * Launches increment with one block of N threads, copies the N timing
  * samples back to the host and prints them on one line.
  *
  * Returns 0 when the samples were produced and printed, 1 when an
  * allocation, the launch, the execution or the copy-back failed.
  */
 int main() {
    const size_t bytes = N * sizeof(int);
    int status = 1;  // stays 1 unless the whole pipeline succeeds
    int* h_time = NULL;
    int* d_time = NULL;
    cudaError_t err;

    h_time = (int*)malloc(bytes);
    if (h_time == NULL) {
        printf("Host allocation of %zu bytes failed\n", bytes);
        return 1;
    }

    err = cudaMalloc(&d_time, bytes);
    if (err != cudaSuccess) {
        printf("cudaMalloc error: %s\n", cudaGetErrorString(err));
        free(h_time);
        return 1;
    }

    // launch the kernel; a bad launch configuration is reported here
    increment<<<1, N>>>(d_time);
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("Sync error: %s\n", cudaGetErrorString(err));
    } else if ((err = cudaDeviceSynchronize()) != cudaSuccess) {
        // waiting for the kernel surfaces errors raised while it was running
        printf("Async error: %s\n", cudaGetErrorString(err));
    } else if ((err = cudaMemcpy(h_time, d_time, bytes,
                                 cudaMemcpyDeviceToHost)) != cudaSuccess) {
        printf("cudaMemcpy error: %s\n", cudaGetErrorString(err));
    } else {
        // only print when h_time really holds the kernel's results
        for (int i = 0; i < N; i++) {
            printf("%d ", h_time[i]);
        }
        printf("%s", "\n");
        status = 0;
    }

    cudaFree(d_time);
    free(h_time);

    return status;
 }
	#include <stdio.h>

	#define N (32)


	/*
	 * Times one shared-memory read-modify-write for each stride 1..32 and
	 * reports the clock() delta observed by thread 0.
	 *
	 * In iteration i every thread updates s[threadIdx.x * (i + 1)], so the
	 * threads of the block touch shared memory with a stride of i + 1 floats.
	 *
	 * time: device array with room for at least 32 ints; time[i] receives the
	 *       clock() ticks thread 0 measured for stride i + 1.
	 *
	 * Precondition: blockDim.x <= 32. With 32 threads the largest index used
	 * is 31 * 32 = 992; more threads would index past the 1024-element array.
	 */
	__global__ void increment(int* time) {
	    // volatile: every reference to s must compile to a real shared-memory
	    // load/store. s is never copied out of the kernel, so without it the
	    // compiler is free to drop the timed access or move it out of the
	    // clock() window.
	    __shared__ volatile float s[1024];

	    // Each thread fills its own slice instead of all threads rewriting all
	    // 1024 elements.
	    for (int j = threadIdx.x; j < 1024; j += blockDim.x) {
	        s[j] = 1.0f;
	    }
	    __syncthreads();

	    for (int i = 0; i < 32; i++) {
	        const int idx = threadIdx.x * (i + 1);  // stride: i + 1

	        clock_t start = clock();
	        // enable broadcast by accessing the same element in a bank:
	        // s[threadIdx.x * (i + 1) % 32] += 1.0f;
	        s[idx] = s[idx] + 1.0f;
	        clock_t end = clock();

	        if (threadIdx.x == 0) {
	            time[i] = (int)(end - start);
	        }

	        // Outside the timed region: keeps this iteration's read-modify-write
	        // from overlapping the next iteration, where a different thread may
	        // touch the same element.
	        __syncthreads();
	    }
	}


	/*
	 * Launches increment with one block of N threads, copies the N timing
	 * samples back to the host and prints them on one line.
	 *
	 * Returns 0 when the samples were produced and printed, 1 when an
	 * allocation, the launch, the execution or the copy-back failed.
	 */
	int main() {
	    const size_t bytes = N * sizeof(int);
	    int status = 1;  // stays 1 unless the whole pipeline succeeds
	    int* h_time = NULL;
	    int* d_time = NULL;
	    cudaError_t err;

	    // host buffer for the N samples
	    h_time = (int*)malloc(bytes);
	    if (h_time == NULL) {
	        printf("Host allocation of %zu bytes failed\n", bytes);
	        return 1;
	    }

	    err = cudaMalloc(&d_time, bytes);
	    if (err != cudaSuccess) {
	        printf("cudaMalloc error: %s\n", cudaGetErrorString(err));
	        free(h_time);
	        return 1;
	    }

	    // launch the kernel; a bad launch configuration is reported here
	    increment<<<1, N>>>(d_time);
	    err = cudaGetLastError();
	    if (err != cudaSuccess) {
	        printf("Sync error: %s\n", cudaGetErrorString(err));
	    } else if ((err = cudaDeviceSynchronize()) != cudaSuccess) {
	        // waiting for the kernel surfaces errors raised while it was running
	        printf("Async error: %s\n", cudaGetErrorString(err));
	    } else if ((err = cudaMemcpy(h_time, d_time, bytes,
	                                 cudaMemcpyDeviceToHost)) != cudaSuccess) {
	        printf("cudaMemcpy error: %s\n", cudaGetErrorString(err));
	    } else {
	        // only print when h_time really holds the kernel's results
	        for (int i = 0; i < N; i++) {
	            printf("%d ", h_time[i]);
	        }
	        printf("%s", "\n");
	        status = 0;
	    }

	    cudaFree(d_time);
	    free(h_time);

	    return status;
	}