@1nF0rmed
Created July 1, 2025 16:52
100k x 100k matrix multiplication (streamed cuBLAS SGEMM)
// build with: nvcc -O3 -std=c++17 -lcublas -arch=sm_75 100k_main.cu
// note: uses ~3 GB of VRAM; took ~20 min on a 1080.
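//
// sizing: a full 100,000 x 100,000 float matrix is 100,000^2 * 4 B = 40 GB,
// so A, B, and C together (~120 GB) cannot live on the GPU at once. one
// 16384 x 16384 float tile is 2^28 * 4 B = 1 GiB, and the three resident
// tiles (one each for A, B, C) account for the ~3 GB figure above.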
#include <algorithm>  // std::min
#include <cstdio>
#include <cstdlib>
#include <cublas_v2.h>
#include <cuda_runtime.h>
using ll = long long;
constexpr ll N = 100'000LL;  // full matrix dimension
constexpr int BS = 16384;    // tile (block) size
const size_t TILE_BYTES = size_t(BS) * BS * sizeof(float);  // 1 GiB per tile
// abort on any CUDA / cuBLAS error
inline void gpuCheck(cudaError_t e) {
    if (e != cudaSuccess) { fprintf(stderr, "CUDA:%s\n", cudaGetErrorString(e)); std::exit(1); }
}
inline void blasCheck(cublasStatus_t st) {
    if (st != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "cuBLAS err %d\n", st); std::exit(1); }
}
// allocate a host buffer and fill it with uniform randoms (unused by main below)
float *random_alloc_host(size_t bytes) {
    float *p = (float*)std::malloc(bytes);
    for (size_t i = 0; i < bytes / sizeof(float); ++i) p[i] = rand() / float(RAND_MAX);
    return p;
}
int main() {
    std::printf("Streaming GEMM %lld x %lld (float, row-major on host, col-major on GPU tiles)\n", N, N);
    // one host and one device buffer per tile of A, B, and C (~3 GB on the GPU)
    float *hA = (float*)std::malloc(TILE_BYTES);
    float *hB = (float*)std::malloc(TILE_BYTES);
    float *hC = (float*)std::malloc(TILE_BYTES);
    float *dA, *dB, *dC;
    gpuCheck(cudaMalloc(&dA, TILE_BYTES));
    gpuCheck(cudaMalloc(&dB, TILE_BYTES));
    gpuCheck(cudaMalloc(&dC, TILE_BYTES));
    cublasHandle_t handle; blasCheck(cublasCreate(&handle));
    // TF32 tensor ops only take effect on Ampere (sm_80) and newer; on older
    // GPUs the call is accepted but cuBLAS falls back to the regular FP32 path.
    blasCheck(cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH));
    const float alpha = 1.f, beta0 = 0.f, beta1 = 1.f;  // beta0 initializes C, beta1 accumulates
    cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop);
    cudaEventRecord(start);
    // loop over output tiles C(i:i+rows, j:j+cols); for each, accumulate over k
    for (ll i = 0; i < N; i += BS) {
        int rows = int(std::min<ll>(BS, N - i));
        for (ll j = 0; j < N; j += BS) {
            int cols = int(std::min<ll>(BS, N - j));
            for (ll k = 0; k < N; k += BS) {
                int innr = int(std::min<ll>(BS, N - k));
                // pack a rows x innr tile of A in column-major order (lda = rows)
                for (int r = 0; r < rows; ++r)
                    for (int c = 0; c < innr; ++c)
                        hA[c*rows + r] = rand() / float(RAND_MAX);
                gpuCheck(cudaMemcpy(dA, hA, size_t(rows)*innr*sizeof(float), cudaMemcpyHostToDevice));
                // pack an innr x cols tile of B in column-major order (ldb = innr)
                for (int r = 0; r < innr; ++r)
                    for (int c = 0; c < cols; ++c)
                        hB[c*innr + r] = rand() / float(RAND_MAX);
                gpuCheck(cudaMemcpy(dB, hB, size_t(innr)*cols*sizeof(float), cudaMemcpyHostToDevice));
                // C_tile = A_tile * B_tile + beta * C_tile; beta=0 on the first k
                // pass overwrites dC, beta=1 afterwards accumulates into it
                blasCheck(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                                      rows, cols, innr,
                                      &alpha,
                                      dA, rows,
                                      dB, innr,
                                      (k == 0 ? &beta0 : &beta1),
                                      dC, rows));
            }
            gpuCheck(cudaMemcpy(hC, dC, size_t(rows)*cols*sizeof(float), cudaMemcpyDeviceToHost));
            double s = 0; for (int t = 0; t < rows*cols; ++t) s += hC[t];
            std::printf("Block C(%lld:%lld,%lld:%lld) checksum %.2e\n", i, i+rows-1, j, j+cols-1, s);
        }
    }
    cudaEventRecord(stop); cudaEventSynchronize(stop);
    float ms = 0; cudaEventElapsedTime(&ms, start, stop);
    std::printf("Total streamed GEMM time: %.1f s\n", ms / 1000.0);
    cudaFree(dA); cudaFree(dB); cudaFree(dC);
    cublasDestroy(handle);
    std::free(hA); std::free(hB); std::free(hC);
    return 0;
}
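One easy sanity check for the column-major SGEMM call pattern above is to replay it at a toy size against a naive CPU reference. The standalone sketch below is not part of the gist: the file name verify_sgemm.cu and the 1e-3 tolerance are arbitrary choices, and it assumes the same nvcc/cuBLAS toolchain as the build line at the top.

// verify_sgemm.cu -- standalone sanity check; not part of the gist above.
// build with: nvcc -O3 -std=c++17 -lcublas verify_sgemm.cu
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cublas_v2.h>
#include <cuda_runtime.h>
int main() {
    const int n = 64;  // toy size; error checks omitted for brevity
    const size_t bytes = size_t(n) * n * sizeof(float);
    float *hA = (float*)std::malloc(bytes), *hB = (float*)std::malloc(bytes), *hC = (float*)std::malloc(bytes);
    for (int i = 0; i < n * n; ++i) { hA[i] = rand() / float(RAND_MAX); hB[i] = rand() / float(RAND_MAX); }
    float *dA, *dB, *dC;
    cudaMalloc(&dA, bytes); cudaMalloc(&dB, bytes); cudaMalloc(&dC, bytes);
    cudaMemcpy(dA, hA, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(dB, hB, bytes, cudaMemcpyHostToDevice);
    cublasHandle_t h; cublasCreate(&h);
    const float alpha = 1.f, beta = 0.f;
    // same call shape as the streamed kernel: column-major, no transposes
    cublasSgemm(h, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n, &alpha, dA, n, dB, n, &beta, dC, n);
    cudaMemcpy(hC, dC, bytes, cudaMemcpyDeviceToHost);
    // naive column-major CPU reference: C[r + c*n] = sum_k A[r + k*n] * B[k + c*n]
    double maxerr = 0;
    for (int c = 0; c < n; ++c)
        for (int r = 0; r < n; ++r) {
            double acc = 0;
            for (int k = 0; k < n; ++k) acc += double(hA[r + k*n]) * hB[k + c*n];
            maxerr = std::fmax(maxerr, std::fabs(acc - hC[r + c*n]));
        }
    std::printf("max abs error vs CPU reference: %.3e\n", maxerr);
    cublasDestroy(h); cudaFree(dA); cudaFree(dB); cudaFree(dC);
    std::free(hA); std::free(hB); std::free(hC);
    return maxerr < 1e-3 ? 0 : 1;  // 1e-3 is an arbitrary tolerance for this sketch
}

Note also that the main loop's cudaMemcpy calls are synchronous, so host-to-device transfers never overlap with the GEMMs; pinned host buffers (cudaMallocHost) plus cudaMemcpyAsync on separate streams would be the usual next step, at the cost of double-buffering the tiles.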