Created
February 16, 2023 17:28
-
-
Save luraess/5e697f857a7aa4d1d00e99ca02cbbb3d to your computer and use it in GitHub Desktop.
2D Laplacian (diffusion step) benchmark to test the performance impact of bounds checking on AMDGPU
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using BenchmarkTools, AMDGPU | |
"""
    diff2D_step_inbounds!(T2, T, Ci, lam, dt, _dx, _dy)

GPU kernel that writes one explicit 2D diffusion update of `T` into `T2`, with every
array access of the update wrapped in `@inbounds`.

Each work-item derives a 1-based global index `(ix, iy)` from its workgroup index,
workgroup size and work-item index, and updates only strictly interior cells
(`1 < ix < size(T2, 1)` and `1 < iy < size(T2, 2)`). All other cells of `T2` are left
untouched.

# Arguments
- `T2`: output array, written at interior cells only.
- `T`: input field, read at the cell and at its four axis-aligned neighbours.
- `Ci`: per-cell coefficient multiplying the flux divergence (presumably an inverse
  heat capacity; confirm with the caller).
- `lam`: scalar coefficient applied to every difference of `T`.
- `dt`: scalar multiplying the whole increment.
- `_dx`, `_dy`: scalars multiplied (not divided) into the differences; presumably
  inverse grid spacings, to be confirmed against the caller.

# Returns
`nothing`.

The guard only consults `size(T2)`, so `T` and `Ci` are assumed to be at least as large
as `T2`; with `@inbounds` a violation of that assumption goes unchecked.
"""
function diff2D_step_inbounds!(T2, T, Ci, lam, dt, _dx, _dy)
    group, groupdim, item = workgroupIdx(), workgroupDim(), workitemIdx()
    ix = (group.x - 1) * groupdim.x + item.x
    iy = (group.y - 1) * groupdim.y + item.y

    # Work-items on the outermost ring or outside the array have nothing to do.
    interior = 1 < ix < size(T2, 1) && 1 < iy < size(T2, 2)
    interior || return

    @inbounds begin
        # Flux-like terms on the four faces of cell (ix, iy).
        qx_hi = -lam * (T[ix + 1, iy] - T[ix, iy]) * _dx
        qx_lo = -lam * (T[ix, iy] - T[ix - 1, iy]) * _dx
        qy_hi = -lam * (T[ix, iy + 1] - T[ix, iy]) * _dy
        qy_lo = -lam * (T[ix, iy] - T[ix, iy - 1]) * _dy

        # Negative divergence of the fluxes, scaled by Ci and dt, advances T.
        T2[ix, iy] =
            T[ix, iy] +
            dt * (Ci[ix, iy] * (-(qx_hi - qx_lo) * _dx - (qy_hi - qy_lo) * _dy))
    end
    return
end
"""
    diff2D_step!(T2, T, Ci, lam, dt, _dx, _dy)

GPU kernel that writes one explicit 2D diffusion update of `T` into `T2`. No array
access carries an `@inbounds` annotation, so whether bounds checks are emitted is left
to the compiler and launch settings.

Each work-item derives a 1-based global index `(ix, iy)` from its workgroup index,
workgroup size and work-item index, and updates only strictly interior cells
(`1 < ix < size(T2, 1)` and `1 < iy < size(T2, 2)`). All other cells of `T2` are left
untouched.

# Arguments
- `T2`: output array, written at interior cells only.
- `T`: input field, read at the cell and at its four axis-aligned neighbours.
- `Ci`: per-cell coefficient multiplying the flux divergence (presumably an inverse
  heat capacity; confirm with the caller).
- `lam`: scalar coefficient applied to every difference of `T`.
- `dt`: scalar multiplying the whole increment.
- `_dx`, `_dy`: scalars multiplied (not divided) into the differences; presumably
  inverse grid spacings, to be confirmed against the caller.

# Returns
`nothing`.

The guard only consults `size(T2)`; `T` and `Ci` are assumed to be at least as large.
"""
function diff2D_step!(T2, T, Ci, lam, dt, _dx, _dy)
    group, groupdim, item = workgroupIdx(), workgroupDim(), workitemIdx()
    ix = (group.x - 1) * groupdim.x + item.x
    iy = (group.y - 1) * groupdim.y + item.y

    # Work-items on the outermost ring or outside the array have nothing to do.
    interior = 1 < ix < size(T2, 1) && 1 < iy < size(T2, 2)
    interior || return

    # Flux-like terms on the four faces of cell (ix, iy).
    qx_hi = -lam * (T[ix + 1, iy] - T[ix, iy]) * _dx
    qx_lo = -lam * (T[ix, iy] - T[ix - 1, iy]) * _dx
    qy_hi = -lam * (T[ix, iy + 1] - T[ix, iy]) * _dy
    qy_lo = -lam * (T[ix, iy] - T[ix, iy - 1]) * _dy

    # Negative divergence of the fluxes, scaled by Ci and dt, advances T.
    T2[ix, iy] =
        T[ix, iy] +
        dt * (Ci[ix, iy] * (-(qx_hi - qx_lo) * _dx - (qy_hi - qy_lo) * _dy))
    return
end
"""
    run_bench(; DAT=Float64)

Benchmark the 2D diffusion kernels on the default AMD GPU and print the throughput
figure of each variant in GB/s.

Three launches are timed with `@belapsed`, each on a dedicated high-priority queue and
synchronised through one reusable signal:

1. `diff2D_step_inbounds!`: array accesses annotated with `@inbounds`;
2. `diff2D_step!`: no annotation;
3. `diff2D_step!` launched with `boundscheck=false` passed to `@roc`.

# Keywords
- `DAT`: element type of the arrays and scalars. For any type other than `Float64` the
  x-extent of the problem is doubled.

# Returns
`nothing`; all results are printed to standard output.
"""
function run_bench(; DAT=Float64)
    # Scale factor 2 in x for every element type except Float64.
    xscale = DAT == Float64 ? 1 : 2
    fact = 24
    nx = xscale * fact * 1024 - 1
    ny = fact * 1024 - 1
    nz = 1
    threads = (128, 2, 1)
    grid = (nx + 1, ny + 1)

    # Device arrays: output (zeros), input field (random), per-cell coefficient (ones).
    T2 = ROCArray(zeros(DAT, nx, ny))
    T = ROCArray(rand(DAT, nx, ny))
    Ci = ROCArray(ones(DAT, nx, ny))
    lam = rand(DAT)
    _dx, _dy = DAT(1.0), DAT(1.0)
    dt = DAT(1.0 / 10.0 / 4.1)

    println("Process selecting device $(AMDGPU.default_device_id())")
    println("Problem size: nx=$nx, ny=$ny, nz=$nz, $(DAT)")
    println("ROCm grid=$(grid), threads=$(threads)")

    # One signal and one high-priority queue are shared by all timed launches.
    sig = ROCSignal()
    hiprio_queue = ROCQueue(AMDGPU.default_device(); priority=:high)

    # Data volume in GB used for the throughput figure; the factor 3 presumably counts
    # one pass over each of the three arrays. Dividing by the elapsed time gives GB/s.
    gbytes = 3 / 1e9 * nx * ny * sizeof(DAT)

    # Variant 1: kernel with @inbounds. The signal is re-armed to 1 before each launch.
    t_inbounds = @belapsed begin
        AMDGPU.HSA.signal_store_screlease($(sig.signal), 1)
        @roc wait=false mark=false signal=$sig queue=$hiprio_queue groupsize=$threads gridsize=$grid diff2D_step_inbounds!($T2, $T, $Ci, $lam, $dt, $_dx, $_dy)
        wait($sig)
    end
    println("T_tot Lap2D inbounds = $(round(gbytes / t_inbounds, sigdigits=7)) GB/s")

    # Variant 2: kernel without @inbounds, default launch settings.
    t_checked = @belapsed begin
        AMDGPU.HSA.signal_store_screlease($(sig.signal), 1)
        @roc wait=false mark=false signal=$sig queue=$hiprio_queue groupsize=$threads gridsize=$grid diff2D_step!($T2, $T, $Ci, $lam, $dt, $_dx, $_dy)
        wait($sig)
    end
    println("T_tot Lap2D no inbounds = $(round(gbytes / t_checked, sigdigits=7)) GB/s")

    # Variant 3: same kernel, with boundscheck=false passed to the launch.
    t_unchecked = @belapsed begin
        AMDGPU.HSA.signal_store_screlease($(sig.signal), 1)
        @roc wait=false mark=false boundscheck=false signal=$sig queue=$hiprio_queue groupsize=$threads gridsize=$grid diff2D_step!($T2, $T, $Ci, $lam, $dt, $_dx, $_dy)
        wait($sig)
    end
    println("T_tot Lap2D boundscheck=false = $(round(gbytes / t_unchecked, sigdigits=7)) GB/s")
    return
end
# Script entry point: run the benchmark with the default Float64 element type.
run_bench(; DAT=Float64)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment