import Pkg
Pkg.activate("@colab")
Pkg.add([
    "CUDA",
    "KernelAbstractions",
    "Atomix",
    "AcceleratedKernels",
    "BenchmarkTools",
    "Adapt",
    "NVTX",
    "ImageShow",
])

versioninfo()

using CUDA, KernelAbstractions, Adapt

# Reference CPU implementation: a plain loop over linear indices.
function copy_cpu!(A, B)
    for I in 1:length(A)
        @inbounds A[I] = B[I]
    end
end

# The same copy as a KernelAbstractions kernel: one work-item per element.
@kernel function copy_kernel!(A, B)
    I = @index(Global)
    @inbounds A[I] = B[I]
end

# Launch wrapper: derive the backend from the arguments and check they match.
function copy_ka!(A, B)
    backend = get_backend(A)
    @assert size(A) == size(B)
    @assert get_backend(B) == backend

    kernel = copy_kernel!(backend)
    kernel(A, B, ndrange = length(A))
    return
end

using CUDA: i32

# Hand-written CUDA.jl kernel for comparison.
function copy_kernel_cuda!(A, B)
    I = (blockIdx().x - 1i32) * blockDim().x + threadIdx().x
    if I <= length(A)
        @inbounds A[I] = B[I]
    end
    return nothing
end

# Use the occupancy API to pick a launch configuration.
function copy_cuda!(A, B)
    kernel = @cuda launch=false copy_kernel_cuda!(A, B)
    config = launch_configuration(kernel.fun)
    threads = min(length(A), config.threads)
    blocks = cld(length(A), threads)
    kernel(A, B; threads, blocks)
end

B = rand(64_000);

let A = similar(B)
    copy_cpu!(A, B)
    @assert A == B
end

d_B = adapt(CuArray, B);
typeof(d_B)

let d_A = similar(d_B)
    copy_cuda!(d_A, d_B)
    @assert d_A == d_B
end

using BenchmarkTools

@benchmark copy_cuda!(d_A, $d_B) setup=(d_A = similar(d_B))

CUDA.@profile let d_A = similar(d_B)
    for _ in 1:10
        copy_cuda!(d_A, d_B)
    end
end

@benchmark CUDA.@sync(copy_cuda!(d_A, $d_B)) setup=(d_A = similar(d_B))

let A = similar(B)
    copy_ka!(A, B)
    @assert A == B
end

let d_A = similar(d_B)
    copy_ka!(d_A, d_B)
    @assert d_A == d_B
end

@benchmark copy_ka!(A, $B) setup=(A = similar(B))

@benchmark CUDA.@sync(copy_ka!(d_A, $d_B)) setup=(d_A = similar(d_B))

CUDA.@profile let d_A = similar(d_B)
    for _ in 1:10
        copy_ka!(d_A, d_B)
    end
end

# Faster variant: compute the global index by hand and guard the bounds check
# ourselves, skipping KernelAbstractions' implicit index validation.
@kernel unsafe_indices=true function copy_kernel_faster!(A, B)
    N = @uniform prod(@groupsize())
    I = (@index(Group, Linear) - 1i32) * N + @index(Local, Linear)
    if I <= length(A)
        @inbounds A[I] = B[I]
    end
end

CUDA.@profile let d_A = similar(d_B)
    for _ in 1:10
        copy_kernel_faster!(CUDABackend())(d_A, d_B, ndrange=length(d_A))
    end
end

const nreps = 3
const N = 2048
const T = Float32
const TILE_DIM = 32
const BLOCK_ROWS = 8

@kernel function simple_copy_kernel!(output, @Const(input))
    I, J = @index(Global, NTuple)
    @inbounds output[I, J] = input[I, J]
end

@kernel function simple_transpose_kernel!(output, @Const(input))
    I, J = @index(Global, NTuple)
    @inbounds output[J, I] = input[I, J]
end

# Copy through shared (local) memory.
@kernel unsafe_indices = true function lmem_copy_kernel!(
        output, @Const(input),
        ::Val{BANK} = Val(1),
    ) where {BANK}
    I, J = @index(Global, NTuple)
    i, j = @index(Local, NTuple)

    N = @uniform @groupsize()[1]
    M = @uniform @groupsize()[2]

    # +1 to avoid bank conflicts on shared memory
    tile = @localmem eltype(output) (N + BANK, M)

    @inbounds tile[i, j] = input[I, J]
    @synchronize
    @inbounds output[I, J] = tile[i, j]
end

# Transpose through shared (local) memory.
@kernel unsafe_indices = true function lmem_transpose_kernel!(
        output, @Const(input),
        ::Val{BANK} = Val(1),
    ) where {BANK}
    gi, gj = @index(Group, NTuple)
    i, j = @index(Local, NTuple)

    N = @uniform @groupsize()[1]
    M = @uniform @groupsize()[2]

    # +1 to avoid bank conflicts on shared memory
    tile = @localmem eltype(output) (N + BANK, M)

    # Manually calculate global indices,
    # since later on we need to pivot the group index.
    I = (gi - 1) * N + i
    J = (gj - 1) * M + j

    @inbounds tile[i, j] = input[I, J]

    @synchronize

    # Pivot the group index
    I = (gj - 1) * M + i
    J = (gi - 1) * N + j

    @inbounds output[I, J] = tile[j, i]
end
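# Optional correctness check for the transpose kernel above (not part of the original
# benchmark loop): a minimal sketch assuming a CUDA device is available and that the
# matrix side is a multiple of TILE_DIM, since `unsafe_indices = true` removes the
# out-of-bounds guard. `Random.rand!` works on device arrays via CUDA.jl.
using Random: rand!
let backend = CUDABackend()
    input = rand!(allocate(backend, T, N, N))
    output = similar(input)
    lmem_transpose_kernel!(backend, (TILE_DIM, TILE_DIM))(
        output, input, Val(1), ndrange = size(output))
    KernelAbstractions.synchronize(backend)
    @assert Array(output) == permutedims(Array(input))
end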
using KernelAbstractions.Extras: @unroll

# Coalesced copy: each work-item handles TILE_DIM ÷ BLOCK_ROWS elements.
@kernel unsafe_indices=true function coalesced_copy_kernel!(
        output, @Const(input),
        ::Val{BANK} = Val(1),
    ) where {BANK}
    gi, gj = @index(Group, NTuple)
    i, j = @index(Local, NTuple)

    TILE_DIM = @uniform @groupsize()[1]
    BLOCK_ROWS = @uniform @groupsize()[2]

    # +1 to avoid bank conflicts on shared memory
    tile = @localmem eltype(output) (TILE_DIM + BANK, TILE_DIM)

    # Can't use @index(Global), because we use a smaller ndrange
    I = (gi - 1) * TILE_DIM + i
    J = (gj - 1) * TILE_DIM + j

    @unroll for k in 0:BLOCK_ROWS:(TILE_DIM - 1)
        @inbounds tile[i, j + k] = input[I, J + k]
    end

    @synchronize

    @unroll for k in 0:BLOCK_ROWS:(TILE_DIM - 1)
        @inbounds output[I, J + k] = tile[i, j + k]
    end
end

# Coalesced transpose: same tiling, but the group offsets are swapped on the write.
@kernel unsafe_indices = true function coalesced_transpose_kernel!(
        output, @Const(input),
        ::Val{BANK} = Val(1),
    ) where {BANK}
    gi, gj = @index(Group, NTuple)
    i, j = @index(Local, NTuple)

    TILE_DIM = @uniform @groupsize()[1]
    BLOCK_ROWS = @uniform @groupsize()[2]

    # +1 to avoid bank conflicts on shared memory
    tile = @localmem eltype(output) (TILE_DIM + BANK, TILE_DIM)

    # Can't use @index(Global), because we use a smaller ndrange
    I = (gi - 1) * TILE_DIM + i
    J = (gj - 1) * TILE_DIM + j

    @unroll for k in 0:BLOCK_ROWS:(TILE_DIM - 1)
        @inbounds tile[i, j + k] = input[I, J + k]
    end

    @synchronize

    # Transpose block offsets
    I = (gj - 1) * TILE_DIM + i
    J = (gi - 1) * TILE_DIM + j

    @unroll for k in 0:BLOCK_ROWS:(TILE_DIM - 1)
        @inbounds output[I, J + k] = tile[j + k, i]
    end
end

using NVTX, Random

backend = CUDABackend()

# Benchmark the simple kernels with different workgroup shapes
CUDA.@profile for block_dims in ((TILE_DIM, TILE_DIM), (TILE_DIM * TILE_DIM, 1), (1, TILE_DIM * TILE_DIM))
    for (name, kernel) in (
            ("copy", simple_copy_kernel!(backend, block_dims)),
            ("transpose", simple_transpose_kernel!(backend, block_dims)),
        )
        NVTX.@range "Simple $name $block_dims" let input = rand!(allocate(backend, T, N, N))
            output = similar(input)
            # compile kernel
            kernel(output, input, ndrange = size(output))
            for rep in 1:nreps
                kernel(output, input, ndrange = size(output))
            end
            KernelAbstractions.synchronize(backend)
        end
    end
end

# Benchmark localmem
CUDA.@profile for (name, kernel) in (
        ("copy", lmem_copy_kernel!(backend, (TILE_DIM, TILE_DIM))),
        ("transpose", lmem_transpose_kernel!(backend, (TILE_DIM, TILE_DIM))),
    )
    for bank in (true, false)
        NVTX.@range "Localmem $name ($TILE_DIM, $TILE_DIM) bank=$bank" let input = rand!(allocate(backend, T, N, N))
            output = similar(input)
            # compile kernel
            kernel(output, input, Val(Int(bank)), ndrange = size(output))
            for rep in 1:nreps
                kernel(output, input, Val(Int(bank)), ndrange = size(output))
            end
            KernelAbstractions.synchronize(backend)
        end
    end
end

# Benchmark localmem + multiple elements per lane
CUDA.@profile for (name, kernel) in (
        ("copy", coalesced_copy_kernel!(backend, (TILE_DIM, BLOCK_ROWS))),
        ("transpose", coalesced_transpose_kernel!(backend, (TILE_DIM, BLOCK_ROWS))),
    )
    for bank in (true, false)
        NVTX.@range "Localmem + multiple elements $name ($TILE_DIM, $BLOCK_ROWS) bank=$bank" let input = rand!(allocate(backend, T, N, N))
            output = similar(input)
            # We want a number of blocks equivalent to (TILE_DIM, TILE_DIM),
            # but our workgroups are (TILE_DIM, BLOCK_ROWS), so we shrink the
            # ndrange accordingly, otherwise we would launch too many blocks.
            block_factor = div(TILE_DIM, BLOCK_ROWS)
            ndrange = (N, div(N, block_factor))
            # compile kernel
            kernel(output, input, Val(Int(bank)), ndrange = ndrange)
            for rep in 1:nreps
                kernel(output, input, Val(Int(bank)), ndrange = ndrange)
            end
            KernelAbstractions.synchronize(backend)
        end
    end
end
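# Rough effective-bandwidth estimate for the coalesced transpose; a sketch that is not
# part of the profiling runs above. It reuses `backend`, `T`, `N`, `TILE_DIM` and
# `BLOCK_ROWS` from earlier and counts one read plus one write per element.
let kernel = coalesced_transpose_kernel!(backend, (TILE_DIM, BLOCK_ROWS))
    input = rand!(allocate(backend, T, N, N))
    output = similar(input)
    ndrange = (N, div(N, div(TILE_DIM, BLOCK_ROWS)))
    # warm up / compile
    kernel(output, input, Val(1), ndrange = ndrange)
    KernelAbstractions.synchronize(backend)
    t = @elapsed begin
        kernel(output, input, Val(1), ndrange = ndrange)
        KernelAbstractions.synchronize(backend)
    end
    gigabytes = 2 * sizeof(T) * N * N / 1e9
    println("coalesced transpose ≈ $(round(gigabytes / t, digits = 1)) GB/s")
end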
# A kernel with a data race: work-items from different columns j all
# accumulate into the same entries out[k, i].
@kernel function racy_kernel!(out, arr)
    i, j = @index(Global, NTuple)
    for k in 1:size(out, 1)
        out[k, i] += arr[i, j]
    end
end

using ImageShow

img = zeros(Float32, (50, 50));
img[10:20, 10:20] .= 1;
img[35:45, 35:45] .= 2;

simshow(img)

out = KernelAbstractions.zeros(backend, Float32, size(img));
racy_kernel!(backend)(out, adapt(backend, img), ndrange=size(img))
simshow(Array(out))

using Atomix: @atomic, @atomicswap, @atomicreplace

# Fix the race by making each update atomic.
@kernel function nonracy_kernel!(out, arr)
    i, j = @index(Global, NTuple)
    for k in 1:size(out, 1)
        @atomic out[k, i] += arr[i, j]
    end
end

out = KernelAbstractions.zeros(backend, Float32, size(img));
nonracy_kernel!(backend)(out, adapt(backend, img), ndrange=size(img))
simshow(Array(out))
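# Optional check (not in the original): with atomics the result is deterministic, and
# each entry out[k, i] should equal the sum of row i of `img`, independent of k.
let expected = repeat(sum(img, dims = 2)', size(img, 1), 1)
    @assert Array(out) ≈ expected
end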
# Naive matrix multiplication: one work-item per output element.
@kernel function naive_matmul_kernel!(output, a, b)
    i, j = @index(Global, NTuple)

    # creating a temporary sum variable for matrix multiplication
    tmp_sum = zero(eltype(output))
    for k in 1:size(a)[2]
        tmp_sum += a[i, k] * b[k, j]
    end

    output[i, j] = tmp_sum
end

# Creating a wrapper kernel for launching with error checks
function naive_matmul!(output, a, b)
    if size(a)[2] != size(b)[1]
        println("Matrix size mismatch!")
        return nothing
    end
    backend = KernelAbstractions.get_backend(a)
    kernel! = naive_matmul_kernel!(backend)
    kernel!(output, a, b, ndrange = size(output))
    return
end

let a = rand!(allocate(backend, Float32, 256, 123))
    b = rand!(allocate(backend, Float32, 123, 45))
    output = KernelAbstractions.zeros(backend, Float32, 256, 45)
    naive_matmul!(output, a, b)
    @assert isapprox(output, a * b)
end

# Tiled matrix multiplication using shared (local) memory.
@kernel unsafe_indices = true function coalesced_matmul_kernel!(
        output, @Const(A), @Const(B),
        ::Val{BANK} = Val(1),
    ) where {BANK}
    gi, gj = @index(Group, NTuple)
    i, j = @index(Local, NTuple)

    TILE_DIM = @uniform @groupsize()[1]

    # +1 to avoid bank conflicts on shared memory
    tile1 = @localmem eltype(output) (TILE_DIM + BANK, TILE_DIM)
    tile2 = @localmem eltype(output) (TILE_DIM + BANK, TILE_DIM)

    # private variable for tile output
    outval = @private eltype(output) 1
    @inbounds outval[1] = -zero(eltype(output))

    @uniform N = size(output, 1)
    @uniform M = size(output, 2)
    @uniform R = size(A, 2)

    # number of tiles depends on inner dimension
    @uniform NUM_TILES = div(R + TILE_DIM - 1, TILE_DIM)

    # loop over all tiles needed for this calculation
    for t in 0:(NUM_TILES - 1)
        # Can't use @index(Global), because we use a smaller ndrange
        I = (gi - 1) * TILE_DIM + i
        J = (gj - 1) * TILE_DIM + j

        # load inputs into tiles, with bounds checking for non-square matrices
        if I <= N && t * TILE_DIM + j <= R
            @inbounds tile1[i, j] = A[I, t * TILE_DIM + j]
        else
            @inbounds tile1[i, j] = 0.0
        end
        if t * TILE_DIM + i <= R && J <= M
            @inbounds tile2[i, j] = B[t * TILE_DIM + i, J]
        else
            @inbounds tile2[i, j] = 0.0
        end

        # wait for all tiles to be loaded
        @synchronize

        # get global values again
        I = (gi - 1) * TILE_DIM + i
        J = (gj - 1) * TILE_DIM + j

        # calculate value of spot in output, use temporary value to allow for vectorization
        out = zero(eltype(output))
        @simd for k in 1:TILE_DIM
            @inbounds out += tile1[i, k] * tile2[k, j]
        end
        outval[1] += out

        @synchronize
    end

    # get global indices again
    I = (gi - 1) * TILE_DIM + i
    J = (gj - 1) * TILE_DIM + j

    # save if inbounds
    if I <= N && J <= M
        @inbounds output[I, J] = outval[1]
    end
end

# Creating a wrapper kernel for launching with error checks
function coalesced_matmul!(output, a, b)
    if size(a)[2] != size(b)[1]
        println("Matrix size mismatch!")
        return nothing
    end
    backend = KernelAbstractions.get_backend(a)
    kernel! = coalesced_matmul_kernel!(backend, (TILE_DIM, TILE_DIM))
    kernel!(output, a, b, ndrange = size(output))
    return
end

let a = rand!(allocate(backend, Float32, 256, 123))
    b = rand!(allocate(backend, Float32, 123, 45))
    output = KernelAbstractions.zeros(backend, Float32, 256, 45)
    coalesced_matmul!(output, a, b)
    @assert isapprox(output, a * b)
end

import LinearAlgebra

# Profile the two hand-written kernels against LinearAlgebra.mul!.
let N = 1024
    R = 512
    M = 2048
    T = Float64

    A = rand!(allocate(backend, T, N, R))
    B = rand!(allocate(backend, T, R, M))

    output_naive = KernelAbstractions.zeros(backend, T, N, M)
    output_coalesced = KernelAbstractions.zeros(backend, T, N, M)
    output_mul = KernelAbstractions.zeros(backend, T, N, M)

    CUDA.@profile for _ in 1:nreps
        NVTX.@range "Naive Matmul" begin
            naive_matmul!(output_naive, A, B)
            KernelAbstractions.synchronize(backend)
        end
        NVTX.@range "Coalesced Matmul" begin
            coalesced_matmul!(output_coalesced, A, B)
            KernelAbstractions.synchronize(backend)
        end
        NVTX.@range "LinearAlgebra.mul!" begin
            LinearAlgebra.mul!(output_mul, A, B)
            KernelAbstractions.synchronize(backend)
        end
    end
end
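# Rough throughput estimate for the library matmul; a sketch outside the profiled loop
# above, using the same problem sizes as the comparison. A GEMM of this shape performs
# 2 * N * R * M floating-point operations.
let N = 1024, R = 512, M = 2048, T = Float64
    A = rand!(allocate(backend, T, N, R))
    B = rand!(allocate(backend, T, R, M))
    C = KernelAbstractions.zeros(backend, T, N, M)
    # warm up / compile
    LinearAlgebra.mul!(C, A, B)
    KernelAbstractions.synchronize(backend)
    t = @elapsed begin
        LinearAlgebra.mul!(C, A, B)
        KernelAbstractions.synchronize(backend)
    end
    println("mul! ≈ $(round(2 * N * R * M / t / 1e9, digits = 1)) GFLOP/s")
end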