@1nF0rmed
Created July 1, 2025 16:52
100k x 100k matrix multiplication (streamed cuBLAS SGEMM)
// build with: nvcc -O3 -std=c++17 -lcublas -arch=sm_75 100k_main.cu
// note: uses ~3 GB of VRAM; took ~20 min on a 1080.
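//
// sizing: a full 100,000 x 100,000 float matrix is 100,000^2 * 4 B = 40 GB,
// so A, B, and C together (~120 GB) cannot live on the GPU at once. one
// 16384 x 16384 float tile is 2^28 * 4 B = 1 GiB, and the three resident
// tiles (one each for A, B, C) account for the ~3 GB figure above.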
#include <algorithm>  // std::min
#include <cstdio>
#include <cstdlib>
#include <cublas_v2.h>
#include <cuda_runtime.h>
using ll = long long;
constexpr ll N = 100'000LL;  // full matrix dimension
constexpr int BS = 16384;    // tile (block) size
const size_t TILE_BYTES = size_t(BS) * BS * sizeof(float);  // 1 GiB per tile
// abort on any CUDA / cuBLAS error
inline void gpuCheck(cudaError_t e) {
    if (e != cudaSuccess) { fprintf(stderr, "CUDA:%s\n", cudaGetErrorString(e)); std::exit(1); }
}
inline void blasCheck(cublasStatus_t st) {
    if (st != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "cuBLAS err %d\n", st); std::exit(1); }
}
// allocate a host buffer and fill it with uniform randoms (unused by main below)
float *random_alloc_host(size_t bytes) {
    float *p = (float*)std::malloc(bytes);
    for (size_t i = 0; i < bytes / sizeof(float); ++i) p[i] = rand() / float(RAND_MAX);
    return p;
}
int main() {
    std::printf("Streaming GEMM %lld x %lld (float, row-major on host, col-major on GPU tiles)\n", N, N);
    // one host and one device buffer per tile of A, B, and C (~3 GB on the GPU)
    float *hA = (float*)std::malloc(TILE_BYTES);
    float *hB = (float*)std::malloc(TILE_BYTES);
    float *hC = (float*)std::malloc(TILE_BYTES);
    float *dA, *dB, *dC;
    gpuCheck(cudaMalloc(&dA, TILE_BYTES));
    gpuCheck(cudaMalloc(&dB, TILE_BYTES));
    gpuCheck(cudaMalloc(&dC, TILE_BYTES));
    cublasHandle_t handle; blasCheck(cublasCreate(&handle));
    // TF32 tensor ops only take effect on Ampere (sm_80) and newer; on older
    // GPUs the call is accepted but cuBLAS falls back to the regular FP32 path.
    blasCheck(cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH));
    const float alpha = 1.f, beta0 = 0.f, beta1 = 1.f;  // beta0 initializes C, beta1 accumulates
    cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop);
    cudaEventRecord(start);
    // loop over output tiles C(i:i+rows, j:j+cols); for each, accumulate over k
    for (ll i = 0; i < N; i += BS) {
        int rows = int(std::min<ll>(BS, N - i));
        for (ll j = 0; j < N; j += BS) {
            int cols = int(std::min<ll>(BS, N - j));
            for (ll k = 0; k < N; k += BS) {
                int innr = int(std::min<ll>(BS, N - k));
                // pack a rows x innr tile of A in column-major order (lda = rows)
                for (int r = 0; r < rows; ++r)
                    for (int c = 0; c < innr; ++c)
                        hA[c*rows + r] = rand() / float(RAND_MAX);
                gpuCheck(cudaMemcpy(dA, hA, size_t(rows)*innr*sizeof(float), cudaMemcpyHostToDevice));
                // pack an innr x cols tile of B in column-major order (ldb = innr)
                for (int r = 0; r < innr; ++r)
                    for (int c = 0; c < cols; ++c)
                        hB[c*innr + r] = rand() / float(RAND_MAX);
                gpuCheck(cudaMemcpy(dB, hB, size_t(innr)*cols*sizeof(float), cudaMemcpyHostToDevice));
                // C_tile = A_tile * B_tile + beta * C_tile; beta=0 on the first k
                // pass overwrites dC, beta=1 afterwards accumulates into it
                blasCheck(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                                      rows, cols, innr,
                                      &alpha,
                                      dA, rows,
                                      dB, innr,
                                      (k == 0 ? &beta0 : &beta1),
                                      dC, rows));
            }
            gpuCheck(cudaMemcpy(hC, dC, size_t(rows)*cols*sizeof(float), cudaMemcpyDeviceToHost));
            double s = 0; for (int t = 0; t < rows*cols; ++t) s += hC[t];
            std::printf("Block C(%lld:%lld,%lld:%lld) checksum %.2e\n", i, i+rows-1, j, j+cols-1, s);
        }
    }
    cudaEventRecord(stop); cudaEventSynchronize(stop);
    float ms = 0; cudaEventElapsedTime(&ms, start, stop);
    std::printf("Total streamed GEMM time: %.1f s\n", ms / 1000.0);
    cudaFree(dA); cudaFree(dB); cudaFree(dC);
    cublasDestroy(handle);
    std::free(hA); std::free(hB); std::free(hC);
    return 0;
}
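One easy sanity check for the column-major SGEMM call pattern above is to replay it at a toy size against a naive CPU reference. The standalone sketch below is not part of the gist: the file name verify_sgemm.cu and the 1e-3 tolerance are arbitrary choices, and it assumes the same nvcc/cuBLAS toolchain as the build line at the top.

// verify_sgemm.cu -- standalone sanity check; not part of the gist above.
// build with: nvcc -O3 -std=c++17 -lcublas verify_sgemm.cu
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cublas_v2.h>
#include <cuda_runtime.h>
int main() {
    const int n = 64;  // toy size; error checks omitted for brevity
    const size_t bytes = size_t(n) * n * sizeof(float);
    float *hA = (float*)std::malloc(bytes), *hB = (float*)std::malloc(bytes), *hC = (float*)std::malloc(bytes);
    for (int i = 0; i < n * n; ++i) { hA[i] = rand() / float(RAND_MAX); hB[i] = rand() / float(RAND_MAX); }
    float *dA, *dB, *dC;
    cudaMalloc(&dA, bytes); cudaMalloc(&dB, bytes); cudaMalloc(&dC, bytes);
    cudaMemcpy(dA, hA, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(dB, hB, bytes, cudaMemcpyHostToDevice);
    cublasHandle_t h; cublasCreate(&h);
    const float alpha = 1.f, beta = 0.f;
    // same call shape as the streamed kernel: column-major, no transposes
    cublasSgemm(h, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n, &alpha, dA, n, dB, n, &beta, dC, n);
    cudaMemcpy(hC, dC, bytes, cudaMemcpyDeviceToHost);
    // naive column-major CPU reference: C[r + c*n] = sum_k A[r + k*n] * B[k + c*n]
    double maxerr = 0;
    for (int c = 0; c < n; ++c)
        for (int r = 0; r < n; ++r) {
            double acc = 0;
            for (int k = 0; k < n; ++k) acc += double(hA[r + k*n]) * hB[k + c*n];
            maxerr = std::fmax(maxerr, std::fabs(acc - hC[r + c*n]));
        }
    std::printf("max abs error vs CPU reference: %.3e\n", maxerr);
    cublasDestroy(h); cudaFree(dA); cudaFree(dB); cudaFree(dC);
    std::free(hA); std::free(hB); std::free(hC);
    return maxerr < 1e-3 ? 0 : 1;  // 1e-3 is an arbitrary tolerance for this sketch
}

Note also that the main loop's cudaMemcpy calls are synchronous, so host-to-device transfers never overlap with the GEMMs; pinned host buffers (cudaMallocHost) plus cudaMemcpyAsync on separate streams would be the usual next step, at the cost of double-buffering the tiles.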