100k x 100k matrix mat mul
// build with: nvcc -O3 -std=c++17 -lcublas -arch=sm_75 100k_main.cu
// note: uses ~3GB VRAM. Took 20 min on 1080
#include <algorithm>   // std::min, std::fill
#include <cstdio>
#include <cstdlib>
#include <cublas_v2.h>
#include <cuda_runtime.h>

using ll = long long;

constexpr ll N    = 100'000LL;  // full matrix dimension
constexpr int BS  = 16384;      // tile edge length (16384^2 floats = 1 GiB per tile)
constexpr int LDD = BS;         // leading dimension of a full tile

const size_t TILE_BYTES = size_t(BS) * BS * sizeof(float);

// Abort on any CUDA or cuBLAS error.
inline void gpuCheck(cudaError_t e)      { if (e  != cudaSuccess)           { fprintf(stderr, "CUDA: %s\n", cudaGetErrorString(e)); std::exit(1); } }
inline void blasCheck(cublasStatus_t st) { if (st != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "cuBLAS err %d\n", st);               std::exit(1); } }
// Allocate a host buffer of `bytes` and fill it with uniform random floats in [0, 1].
float *random_alloc_host(size_t bytes) {
    float *p = (float*)std::malloc(bytes);
    for (size_t i = 0; i < bytes / sizeof(float); ++i) p[i] = rand() / float(RAND_MAX);
    return p;
}
int main() {
    std::printf("Streaming GEMM %lld x %lld (float, row-major on host, col-major on GPU tiles)\n", N, N);

    // One host staging tile and one device tile for each of A, B, C.
    float *hA = (float*)std::malloc(TILE_BYTES);
    float *hB = (float*)std::malloc(TILE_BYTES);
    float *hC = (float*)std::malloc(TILE_BYTES);
    float *dA, *dB, *dC;
    gpuCheck(cudaMalloc(&dA, TILE_BYTES));
    gpuCheck(cudaMalloc(&dB, TILE_BYTES));
    gpuCheck(cudaMalloc(&dC, TILE_BYTES));

    cublasHandle_t handle;
    blasCheck(cublasCreate(&handle));
    blasCheck(cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH)); // only takes effect on Ampere or newer GPUs

    const float alpha = 1.f, beta0 = 0.f, beta1 = 1.f;

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    // Tile loop: C(i,j) = sum over k of A(i,k) * B(k,j), one BS x BS block at a time.
    for (ll i = 0; i < N; i += BS) {
        int rows = int(std::min<ll>(BS, N - i));
        for (ll j = 0; j < N; j += BS) {
            int cols = int(std::min<ll>(BS, N - j));
            std::fill(hC, hC + size_t(rows) * cols, 0.f);
            for (ll k = 0; k < N; k += BS) {
                int innr = int(std::min<ll>(BS, N - k));
                // Fill the A tile (rows x innr) in column-major order and upload it.
                for (int r = 0; r < rows; ++r)
                    for (int c = 0; c < innr; ++c)
                        hA[c * rows + r] = rand() / float(RAND_MAX);
                gpuCheck(cudaMemcpy(dA, hA, size_t(rows) * innr * sizeof(float), cudaMemcpyHostToDevice));
                // Fill the B tile (innr x cols) in column-major order and upload it.
                for (int r = 0; r < innr; ++r)
                    for (int c = 0; c < cols; ++c)
                        hB[c * innr + r] = rand() / float(RAND_MAX);
                gpuCheck(cudaMemcpy(dB, hB, size_t(innr) * cols * sizeof(float), cudaMemcpyHostToDevice));
                // Accumulate into dC: beta = 0 on the first k-step, 1 afterwards.
                blasCheck(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                                      rows, cols, innr,
                                      &alpha,
                                      dA, rows,
                                      dB, innr,
                                      (k == 0 ? &beta0 : &beta1),
                                      dC, rows));
            }
            // Copy the finished C block back and print a checksum.
            gpuCheck(cudaMemcpy(hC, dC, size_t(rows) * cols * sizeof(float), cudaMemcpyDeviceToHost));
            double s = 0;
            for (int t = 0; t < rows * cols; ++t) s += hC[t];
            std::printf("Block C(%lld:%lld,%lld:%lld) checksum %.2e\n", i, i + rows - 1, j, j + cols - 1, s);
        }
    }
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float ms = 0;
    cudaEventElapsedTime(&ms, start, stop);
    std::printf("Total streamed GEMM time: %.1f s\n", ms / 1000.0);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(dA); cudaFree(dB); cudaFree(dC);
    cublasDestroy(handle);
    std::free(hA); std::free(hB); std::free(hC);
    return 0;
}
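As a back-of-the-envelope check on the figures in the header comment (a quick editorial sketch, not part of the original file): each 16384 x 16384 float tile is exactly 1 GiB, so the three device buffers account for the ~3GB VRAM note, and the full product is about 2*N^3, roughly 2e15 FLOPs. The small standalone C++ program below just prints this bookkeeping; the constants mirror the ones above, and the printed numbers are estimates, not measurements.

// Bookkeeping sketch (plain C++, no CUDA required).
#include <cstdio>

int main() {
    const long long N  = 100000LL;   // full matrix dimension, as above
    const long long BS = 16384LL;    // tile edge length, as above
    const long long tiles = (N + BS - 1) / BS;                        // 7 tiles per dimension
    const double tile_gib = double(BS) * BS * sizeof(float)
                            / (1024.0 * 1024.0 * 1024.0);             // 1.00 GiB per tile
    const double total_flops = 2.0 * double(N) * N * N;               // ~2e15 FLOPs for the full GEMM
    std::printf("tiles per dim: %lld, tile: %.2f GiB, 3 device buffers: %.2f GiB\n",
                tiles, tile_gib, 3 * tile_gib);
    std::printf("total work: %.2e FLOPs\n", total_flops);
    return 0;
}

Taken at face value, the quoted 20 minutes works out to roughly 2e15 FLOPs / 1200 s, about 1.7 TFLOP/s of effective throughput, which suggests most of the wall time goes to the host-side rand() fills and PCIe copies rather than the cuBLAS calls themselves.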