import Pkg
Pkg.activate("@colab")
Pkg.add([
"CUDA",
"KernelAbstractions",
"Atomix",
"AcceleratedKernels",
"BenchmarkTools",
"Adapt",
"NVTX",
"ImageShow",
])
Activating project at `/content/@colab` Resolving package versions... Installed ImageShow ─ v0.3.8 Updating `/content/@colab/Project.toml` [4e3cecfd] + ImageShow v0.3.8 Updating `/content/@colab/Manifest.toml` [35d6a980] + ColorSchemes v3.29.0 [c3611d14] + ColorVectorSpace v0.11.0 [5789e2e9] + FileIO v1.17.0 [c817782e] + ImageBase v0.1.7 [a09fc81d] + ImageCore v0.10.5 [4e3cecfd] + ImageShow v0.3.8 [dbb5928d] + MappedArrays v0.4.2 [e94cdb99] + MosaicViews v0.3.4 [6fe1bfb0] + OffsetArrays v1.17.0 [5432bcbf] + PaddedViews v0.5.12 [cae243ae] + StackViews v0.1.2 [62fd8b95] + TensorCore v0.1.1 Precompiling packages... 1395.2 ms ✓ ImageShow 1 dependency successfully precompiled in 2 seconds. 92 already precompiled.
versioninfo()
Julia Version 1.10.9 Commit 5595d20a287 (2025-03-10 12:51 UTC) Build Info: Official https://julialang.org/ release Platform Info: OS: Linux (x86_64-linux-gnu) CPU: 2 × Intel(R) Xeon(R) CPU @ 2.00GHz WORD_SIZE: 64 LIBM: libopenlibm LLVM: libLLVM-15.0.7 (ORCJIT, skylake-avx512) Threads: 2 default, 0 interactive, 1 GC (on 2 virtual cores) Environment: LD_LIBRARY_PATH = /usr/lib64-nvidia JULIA_NUM_THREADS = auto
using CUDA, KernelAbstractions, Adapt
function copy_cpu!(A, B)
for I in 1:length(A)
@inbounds A[I] = B[I]
end
end
copy_cpu! (generic function with 1 method)
@kernel function copy_kernel!(A, B)
I = @index(Global)
@inbounds A[I] = B[I]
end
copy_kernel! (generic function with 4 methods)
function copy_ka!(A, B)
backend = get_backend(A)
@assert size(A) == size(B)
@assert get_backend(B) == backend
kernel = copy_kernel!(backend)
kernel(A, B, ndrange = length(A))
return
end
copy_ka! (generic function with 1 method)
using CUDA: i32
function copy_kernel_cuda!(A, B)
I = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
if I <= length(A)
@inbounds A[I] = B[I]
end
return nothing
end
copy_kernel_cuda! (generic function with 1 method)
function copy_cuda!(A, B)
    # Compile the kernel without launching it, so we can query an
    # occupancy-based launch configuration before picking threads/blocks.
    kernel = @cuda launch=false copy_kernel_cuda!(A, B)
    config = launch_configuration(kernel.fun)
    threads = min(length(A), config.threads)
    blocks = cld(length(A), threads)
    kernel(A, B; threads, blocks)
end
copy_cuda! (generic function with 1 method)
B = rand(64_000);
let
A = similar(B)
copy_cpu!(A, B)
@assert A == B
end
The Julia GPU ecosystem follows the motto: Compute follows Data.
So let's move our data to the GPU!
d_B = adapt(CuArray, B);
typeof(d_B)
CuArray{Float64, 1, CUDA.DeviceMemory}
let
d_A = similar(d_B)
copy_cuda!(d_A, d_B)
@assert d_A == d_B
end
Note that the Julia GPU ecosystem synchronizes the GPU on access. So we are launching two GPU kernels here: first the copy, then the comparison. Both execute asynchronously, but they are ordered with respect to each other.
We then "wait" for the result of the comparison.
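Since kernel launches are asynchronous, the host has to synchronize explicitly whenever it wants to observe (or time) the result. A minimal sketch, not part of the original notebook, using either the vendor-specific macro or the portable KernelAbstractions call:
let
    d_A = similar(d_B)
    CUDA.@sync copy_cuda!(d_A, d_B)   # block until the kernel has finished
    # portable alternative:
    # KernelAbstractions.synchronize(get_backend(d_A))
end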
using BenchmarkTools
@benchmark copy_cuda!(d_A, $d_B) setup=(d_A = similar(d_B))
BenchmarkTools.Trial: 10000 samples with 6 evaluations per sample. Range (min … max): 5.686 μs … 16.612 μs ┊ GC (min … max): 0.00% … 0.00% Time (median): 6.044 μs ┊ GC (median): 0.00% Time (mean ± σ): 6.248 μs ± 780.293 ns ┊ GC (mean ± σ): 0.00% ± 0.00% ▄▇████▇▅▄▃▁▁▁ ▁ ▁▂▂▂▁▁▁▁ ▂ ████████████████████████████▇▆▆▆▅▆▆▅▆▆▅▄▅▅▅▆▅▄▃▅▅▅▄▅▅▅▇▅▆▆▆ █ 5.69 μs Histogram: log(frequency) by time 10.2 μs < Memory estimate: 720 bytes, allocs estimate: 28.
CUDA.@profile let
d_A = similar(d_B)
for _ in 1:10
copy_cuda!(d_A, d_B)
end
end
Profiler ran for 270.84 µs, capturing 261 events. Host-side activity: calling CUDA APIs took 119.92 µs (44.28% of the trace) ┌──────────┬────────────┬───────┬─────────────────────────────────────┬─────────────────────────┐ │ Time (%) │ Total time │ Calls │ Time distribution │ Name │ ├──────────┼────────────┼───────┼─────────────────────────────────────┼─────────────────────────┤ │ 27.64% │ 74.86 µs │ 10 │ 7.49 µs ± 9.96 ( 3.58 ‥ 35.76) │ cuLaunchKernel │ │ 4.40% │ 11.92 µs │ 1 │ │ cuMemAllocFromPoolAsync │ └──────────┴────────────┴───────┴─────────────────────────────────────┴─────────────────────────┘ Device-side activity: GPU was busy for 56.03 µs (20.69% of the trace) ┌──────────┬────────────┬───────┬────────────────────────────────────┬───────────────────────────────────────────────────────────────────────────────┐ │ Time (%) │ Total time │ Calls │ Time distribution │ Name │ ├──────────┼────────────┼───────┼────────────────────────────────────┼───────────────────────────────────────────────────────────────────────────────┤ │ 20.69% │ 56.03 µs │ 10 │ 5.6 µs ± 0.59 ( 5.25 ‥ 7.15) │ copy_kernel_cuda_(CuDeviceArray<Float64, 1, 1>, CuDeviceArray<Float64, 1, 1>) │ └──────────┴────────────┴───────┴────────────────────────────────────┴───────────────────────────────────────────────────────────────────────────────┘
So there seems to be a discrepancy between the measurements of @benchmark and CUDA.@profile: @benchmark vastly over-estimates the performance of the GPU code, since it only times the asynchronous kernel launch. To remedy this we need to include a synchronization operation in the benchmark.
@benchmark CUDA.@sync(copy_cuda!(d_A, $d_B)) setup=(d_A = similar(d_B))
BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample. Range (min … max): 19.916 μs … 104.479 μs ┊ GC (min … max): 0.00% … 0.00% Time (median): 21.002 μs ┊ GC (median): 0.00% Time (mean ± σ): 21.472 μs ± 2.380 μs ┊ GC (mean ± σ): 0.00% ± 0.00% ▁▃▅▇███▆▅▄▃▂▁ ▂ ███████████████▇▇▆▆▆▇▆▇▇▆▆▆▆▆▅▅▆▆▆▆▆▆▄▆▇▆▇▆▅▇▅▅▅▆▅▆▆▆▅▆▆▆▆▆▆ █ 19.9 μs Histogram: log(frequency) by time 31.4 μs < Memory estimate: 720 bytes, allocs estimate: 28.
With KernelAbstractions we can now write portable code that works for data residing on the CPU as well as on the GPU, thereby implementing the "Compute follows Data" paradigm.
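The same pattern extends to any element-wise operation. As a minimal sketch (my addition, not from the original notebook), an axpy-style kernel selects its backend from the data in exactly the same way:
@kernel function axpy_kernel!(y, a, @Const(x))
    I = @index(Global)
    @inbounds y[I] = a * x[I] + y[I]
end

function axpy_ka!(y, a, x)
    backend = get_backend(y)
    @assert get_backend(x) == backend
    axpy_kernel!(backend)(y, a, x, ndrange = length(y))
    return
end
# works unchanged for CPU and GPU data, e.g.
# axpy_ka!(copy(B), 2.0, B) and axpy_ka!(copy(d_B), 2.0, d_B)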
let
A = similar(B)
copy_ka!(A, B)
@assert A == B
end
let
d_A = similar(d_B)
copy_ka!(d_A, d_B)
@assert d_A == d_B
end
@benchmark copy_ka!(A, $B) setup=(A = similar(B))
BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample. Range (min … max): 108.141 μs … 7.116 ms ┊ GC (min … max): 0.00% … 0.00% Time (median): 126.832 μs ┊ GC (median): 0.00% Time (mean ± σ): 186.821 μs ± 128.714 μs ┊ GC (mean ± σ): 0.00% ± 0.00% ▆█▇▆▄▂▁ ▁▁▂▂▄▅▆▄▃▂▁ ▁▁ ▁▃▄▄▃▁ ▂ ████████▇▇████████████▆▅▆▆▆▇▇█▆▆▆▅▅▅▇█████▇▆▅▄▄▂▃▂▃▄▄▇▇██████ █ 108 μs Histogram: log(frequency) by time 435 μs < Memory estimate: 1.83 KiB, allocs estimate: 33.
@benchmark CUDA.@sync(copy_ka!(d_A, $d_B)) setup=(d_A = similar(d_B))
BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample. Range (min … max): 24.232 μs … 8.109 ms ┊ GC (min … max): 0.00% … 0.00% Time (median): 25.365 μs ┊ GC (median): 0.00% Time (mean ± σ): 27.597 μs ± 84.574 μs ┊ GC (mean ± σ): 0.00% ± 0.00% ▄▇█▇▆▅▃▂▁ ▂ ███████████▆█▇▇▇▇▇▇▇▇▇▆▇▇▇▆▇▆▆▅▅▆▅▅▅▆▆▆▇▇█▇▇▇▇▇▇▆▃▅▄▅▃▆▅▅▅▅ █ 24.2 μs Histogram: log(frequency) by time 46.6 μs < Memory estimate: 1.62 KiB, allocs estimate: 60.
CUDA.@profile let
d_A = similar(d_B)
for _ in 1:10
copy_ka!(d_A, d_B)
end
end
Profiler ran for 333.31 µs, capturing 261 events. Host-side activity: calling CUDA APIs took 105.62 µs (31.69% of the trace) ┌──────────┬────────────┬───────┬─────────────────────────────────────┬─────────────────────────┐ │ Time (%) │ Total time │ Calls │ Time distribution │ Name │ ├──────────┼────────────┼───────┼─────────────────────────────────────┼─────────────────────────┤ │ 21.03% │ 70.1 µs │ 10 │ 7.01 µs ± 7.78 ( 3.81 ‥ 29.09) │ cuLaunchKernel │ │ 1.50% │ 5.01 µs │ 1 │ │ cuMemAllocFromPoolAsync │ └──────────┴────────────┴───────┴─────────────────────────────────────┴─────────────────────────┘ Device-side activity: GPU was busy for 59.13 µs (17.74% of the trace) ┌──────────┬────────────┬───────┬───────────────────────────────────┬─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ │ Time (%) │ Total time │ Calls │ Time distribution │ Name │ ├──────────┼────────────┼───────┼───────────────────────────────────┼─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤ │ 17.74% │ 59.13 µs │ 10 │ 5.91 µs ± 0.22 ( 5.48 ‥ 6.2) │ gpu_copy_kernel_(CompilerMetadata<DynamicSize, DynamicCheck, void, CartesianIndices<1, Tuple<OneTo<Int64>>>, NDRange<1, DynamicSize, DynamicSize, CartesianIndices<1, Tuple<OneTo<Int64>>>, CartesianIndices<1, Tuple<OneTo<Int64>>>>>, CuDeviceArray<Float64, 1, 1>, CuDeviceArray<Float64, 1, 1>) │ └──────────┴────────────┴───────┴───────────────────────────────────┴─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
We can see that KernelAbstractions is a bit slower than pure CUDA. This is partially expected: the convenience functionality, in particular the implicit bounds check derived from the ndrange, adds a little overhead. The unsafe_indices variant below computes and checks the index manually, recovering behaviour closer to the hand-written CUDA kernel.
@kernel unsafe_indices=true function copy_kernel_faster!(A, B)
N = @uniform prod(@groupsize())
I = (@index(Group, Linear)-1i32) * N + @index(Local, Linear)
if I <= length(A)
@inbounds A[I] = B[I]
end
end
copy_kernel_faster! (generic function with 4 methods)
CUDA.@profile let
d_A = similar(d_B)
for _ in 1:10
copy_kernel_faster!(CUDABackend())(d_A, d_B, ndrange=length(d_A))
end
end
Profiler ran for 204.54 ms, capturing 281 events. Host-side activity: calling CUDA APIs took 316.38 µs (0.15% of the trace) ┌──────────┬────────────┬───────┬─────────────────────────────────────┬─────────────────────────┐ │ Time (%) │ Total time │ Calls │ Time distribution │ Name │ ├──────────┼────────────┼───────┼─────────────────────────────────────┼─────────────────────────┤ │ 0.05% │ 105.38 µs │ 1 │ │ cuModuleLoadDataEx │ │ 0.04% │ 88.21 µs │ 10 │ 8.82 µs ± 12.46 ( 3.81 ‥ 44.11) │ cuLaunchKernel │ │ 0.02% │ 46.49 µs │ 1 │ │ cuModuleGetFunction │ │ 0.01% │ 11.21 µs │ 1 │ │ cuCtxSynchronize │ │ 0.00% │ 5.48 µs │ 1 │ │ cuMemAllocFromPoolAsync │ └──────────┴────────────┴───────┴─────────────────────────────────────┴─────────────────────────┘ Device-side activity: GPU was busy for 57.94 µs (0.03% of the trace) ┌──────────┬────────────┬───────┬────────────────────────────────────┬────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ │ Time (%) │ Total time │ Calls │ Time distribution │ Name │ ├──────────┼────────────┼───────┼────────────────────────────────────┼────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤ │ 0.03% │ 57.94 µs │ 10 │ 5.79 µs ± 0.45 ( 5.25 ‥ 6.91) │ gpu_copy_kernel_faster_(CompilerMetadata<DynamicSize, DynamicCheck, void, CartesianIndices<1, Tuple<OneTo<Int64>>>, NDRange<1, DynamicSize, DynamicSize, CartesianIndices<1, Tuple<OneTo<Int64>>>, CartesianIndices<1, Tuple<OneTo<Int64>>>>>, CuDeviceArray<Float64, 1, 1>, CuDeviceArray<Float64, 1, 1>) │ └──────────┴────────────┴───────┴────────────────────────────────────┴────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
const nreps = 3
const N = 2048
const T = Float32
const TILE_DIM = 32
const BLOCK_ROWS = 8
8
@kernel function simple_copy_kernel!(output, @Const(input))
I, J = @index(Global, NTuple)
@inbounds output[I, J] = input[I, J]
end
simple_copy_kernel! (generic function with 4 methods)
@kernel function simple_transpose_kernel!(output, @Const(input))
I, J = @index(Global, NTuple)
@inbounds output[J, I] = input[I, J]
end
simple_transpose_kernel! (generic function with 4 methods)
@kernel unsafe_indices = true function lmem_copy_kernel!(
output, @Const(input),
::Val{BANK} = Val(1),
) where {BANK}
I, J = @index(Global, NTuple)
i, j = @index(Local, NTuple)
N = @uniform @groupsize()[1]
M = @uniform @groupsize()[2]
# +1 to avoid bank conflicts on shared memory
tile = @localmem eltype(output) (N + BANK, M)
@inbounds tile[i, j] = input[I, J]
@synchronize
@inbounds output[I, J] = tile[i, j]
end
lmem_copy_kernel! (generic function with 4 methods)
@kernel unsafe_indices = true function lmem_transpose_kernel!(
output, @Const(input),
::Val{BANK} = Val(1),
) where {BANK}
gi, gj = @index(Group, NTuple)
i, j = @index(Local, NTuple)
N = @uniform @groupsize()[1]
M = @uniform @groupsize()[2]
# +1 to avoid bank conflicts on shared memory
tile = @localmem eltype(output) (N + BANK, M)
# Manually calculate global indexes
# Later on we need to pivot the group index
I = (gi - 1) * N + i
J = (gj - 1) * M + j
@inbounds tile[i, j] = input[I, J]
@synchronize
# Pivot the group index
I = (gj - 1) * M + i
J = (gi - 1) * N + j
@inbounds output[I, J] = tile[j, i]
end
lmem_transpose_kernel! (generic function with 4 methods)
using KernelAbstractions.Extras: @unroll
@kernel unsafe_indices=true function coalesced_copy_kernel!(
output, @Const(input),
::Val{BANK} = Val(1),
) where {BANK}
gi, gj = @index(Group, NTuple)
i, j = @index(Local, NTuple)
TILE_DIM = @uniform @groupsize()[1]
BLOCK_ROWS = @uniform @groupsize()[2]
# +1 to avoid bank conflicts on shared memory
tile = @localmem eltype(output) (TILE_DIM + BANK, TILE_DIM)
# Can't use @index(Global), because we use a smaller ndrange
I = (gi - 1) * TILE_DIM + i
J = (gj - 1) * TILE_DIM + j
@unroll for k in 0:BLOCK_ROWS:(TILE_DIM - 1)
@inbounds tile[i, j + k] = input[I, J + k]
end
@synchronize
@unroll for k in 0:BLOCK_ROWS:(TILE_DIM - 1)
@inbounds output[I, J + k] = tile[i, j + k]
end
end
coalesced_copy_kernel! (generic function with 4 methods)
@kernel unsafe_indices = true function coalesced_transpose_kernel!(
output, @Const(input),
::Val{BANK} = Val(1),
) where {BANK}
gi, gj = @index(Group, NTuple)
i, j = @index(Local, NTuple)
TILE_DIM = @uniform @groupsize()[1]
BLOCK_ROWS = @uniform @groupsize()[2]
# +1 to avoid bank conflicts on shared memory
tile = @localmem eltype(output) (TILE_DIM + BANK, TILE_DIM)
# Can't use @index(Global), because we use a smaller ndrange
I = (gi - 1) * TILE_DIM + i
J = (gj - 1) * TILE_DIM + j
@unroll for k in 0:BLOCK_ROWS:(TILE_DIM - 1)
@inbounds tile[i, j + k] = input[I, J + k]
end
@synchronize
# Transpose block offsets
I = (gj - 1) * TILE_DIM + i
J = (gi - 1) * TILE_DIM + j
@unroll for k in 0:BLOCK_ROWS:(TILE_DIM - 1)
@inbounds output[I, J + k] = tile[j + k, i]
end
end
coalesced_transpose_kernel! (generic function with 4 methods)
using NVTX, Random
backend = CUDABackend()
CUDABackend(false, false)
CUDA.@profile for block_dims in ((TILE_DIM, TILE_DIM), (TILE_DIM * TILE_DIM, 1), (1, TILE_DIM * TILE_DIM))
for (name, kernel) in (
("copy", simple_copy_kernel!(backend, block_dims)),
("transpose", simple_transpose_kernel!(backend, block_dims)),
)
NVTX.@range "Simple $name $block_dims" let
input = rand!(allocate(backend, T, N, N))
output = similar(input)
# compile kernel
kernel(output, input, ndrange = size(output))
for rep in 1:nreps
kernel(output, input, ndrange = size(output))
end
KernelAbstractions.synchronize(backend)
end
end
end
Profiler ran for 17.26 ms, capturing 5372 events. Host-side activity: calling CUDA APIs took 4.69 ms (27.17% of the trace) ┌──────────┬────────────┬───────┬───────────────────────────────────────┬─────────────────────────┐ │ Time (%) │ Total time │ Calls │ Time distribution │ Name │ ├──────────┼────────────┼───────┼───────────────────────────────────────┼─────────────────────────┤ │ 71.43% │ 12.33 ms │ 6 │ 2.05 ms ± 1.29 ( 0.64 ‥ 3.98) │ cuStreamSynchronize │ │ 1.05% │ 180.72 µs │ 24 │ 7.53 µs ± 3.92 ( 4.29 ‥ 19.31) │ cuLaunchKernel │ │ 0.80% │ 137.81 µs │ 6 │ 22.97 µs ± 5.96 ( 19.07 ‥ 34.81) │ cudaLaunchKernel │ │ 0.54% │ 93.94 µs │ 12 │ 7.83 µs ± 3.72 ( 3.58 ‥ 13.59) │ cuMemAllocFromPoolAsync │ │ 0.04% │ 7.39 µs │ 12 │ 615.91 ns ± 448.45 ( 0.0 ‥ 1192.09) │ cudaGetLastError │ └──────────┴────────────┴───────┴───────────────────────────────────────┴─────────────────────────┘ Device-side activity: GPU was busy for 16.16 ms (93.62% of the trace) ┌──────────┬────────────┬───────┬──────────────────────────────────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ │ Time (%) │ Total time │ Calls │ Time distribution │ Name │ ├──────────┼────────────┼───────┼──────────────────────────────────────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤ │ 26.33% │ 4.54 ms │ 4 │ 1.14 ms ± 0.0 ( 1.13 ‥ 1.14) │ gpu_simple_copy_kernel_(CompilerMetadata<DynamicSize, DynamicCheck, void, CartesianIndices<2, Tuple<OneTo<Int64>, OneTo<Int64>>>, NDRange<2, DynamicSize, StaticSize<_1__1024_>, CartesianIndices<2, Tuple<OneTo<Int64>, OneTo<Int64>>>, void>>, CuDeviceArray<Float32, 2, 1>, Float32) │ │ 18.60% │ 3.21 ms │ 4 │ 802.46 µs ± 3.33 (798.46 ‥ 805.85) │ gpu_simple_transpose_kernel_(CompilerMetadata<DynamicSize, DynamicCheck, void, CartesianIndices<2, Tuple<OneTo<Int64>, OneTo<Int64>>>, NDRange<2, DynamicSize, StaticSize<_32__32_>, CartesianIndices<2, Tuple<OneTo<Int64>, OneTo<Int64>>>, void>>, CuDeviceArray<Float32, 2, 1>, Float32) │ │ 17.80% │ 3.07 ms │ 4 │ 768.01 µs ± 3.44 (765.56 ‥ 772.95) │ gpu_simple_transpose_kernel_(CompilerMetadata<DynamicSize, DynamicCheck, void, CartesianIndices<2, Tuple<OneTo<Int64>, OneTo<Int64>>>, NDRange<2, DynamicSize, StaticSize<_1024__1_>, CartesianIndices<2, Tuple<OneTo<Int64>, OneTo<Int64>>>, void>>, CuDeviceArray<Float32, 2, 1>, Float32) │ │ 13.06% │ 2.25 ms │ 4 │ 563.26 µs ± 8.18 (555.04 ‥ 573.87) │ gpu_simple_transpose_kernel_(CompilerMetadata<DynamicSize, DynamicCheck, void, CartesianIndices<2, Tuple<OneTo<Int64>, OneTo<Int64>>>, NDRange<2, DynamicSize, StaticSize<_1__1024_>, CartesianIndices<2, Tuple<OneTo<Int64>, OneTo<Int64>>>, void>>, CuDeviceArray<Float32, 2, 1>, Float32) │ │ 7.33% │ 1.27 ms │ 4 │ 316.26 µs ± 2.03 (313.52 ‥ 318.29) │ gpu_simple_copy_kernel_(CompilerMetadata<DynamicSize, DynamicCheck, void, CartesianIndices<2, Tuple<OneTo<Int64>, OneTo<Int64>>>, NDRange<2, DynamicSize, StaticSize<_32__32_>, CartesianIndices<2, Tuple<OneTo<Int64>, OneTo<Int64>>>, void>>, CuDeviceArray<Float32, 2, 1>, Float32) │ │ 6.78% │ 1.17 ms │ 4 │ 292.48 µs ± 0.49 (291.82 ‥ 293.02) │ 
gpu_simple_copy_kernel_(CompilerMetadata<DynamicSize, DynamicCheck, void, CartesianIndices<2, Tuple<OneTo<Int64>, OneTo<Int64>>>, NDRange<2, DynamicSize, StaticSize<_1024__1_>, CartesianIndices<2, Tuple<OneTo<Int64>, OneTo<Int64>>>, void>>, CuDeviceArray<Float32, 2, 1>, Float32) │ │ 3.72% │ 641.58 µs │ 6 │ 106.93 µs ± 0.85 ( 106.1 ‥ 108.24) │ void gen_sequenced<curandStateXORWOW, float, int, &float curand_uniform_noargs<curandStateXORWOW>(curandStateXORWOW*, int), rng_config<curandStateXORWOW, (curandOrdering)101>>(curandStateXORWOW*, float*, unsigned long, unsigned long, int) │ └──────────┴────────────┴───────┴──────────────────────────────────────┴──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ NVTX ranges: ┌──────────┬────────────┬───────┬─────────────────────────────────┐ │ Time (%) │ Total time │ Calls │ Name │ ├──────────┼────────────┼───────┼─────────────────────────────────┤ │ 27.75% │ 4.79 ms │ 1 │ Main.Simple copy (1, 1024) │ │ 20.21% │ 3.49 ms │ 1 │ Main.Simple transpose (32, 32) │ │ 19.07% │ 3.29 ms │ 1 │ Main.Simple transpose (1024, 1) │ │ 14.36% │ 2.48 ms │ 1 │ Main.Simple transpose (1, 1024) │ │ 8.90% │ 1.54 ms │ 1 │ Main.Simple copy (32, 32) │ │ 7.99% │ 1.38 ms │ 1 │ Main.Simple copy (1024, 1) │ └──────────┴────────────┴───────┴─────────────────────────────────┘
# Benchmark localmem
CUDA.@profile for (name, kernel) in (
("copy", lmem_copy_kernel!(backend, (TILE_DIM, TILE_DIM))),
("transpose", lmem_transpose_kernel!(backend, (TILE_DIM, TILE_DIM))),
)
for bank in (true, false)
NVTX.@range "Localmem $name ($TILE_DIM, $TILE_DIM) bank=$bank" let
input = rand!(allocate(backend, T, N, N))
output = similar(input)
# compile kernel
kernel(output, input, Val(Int(bank)), ndrange = size(output))
for rep in 1:nreps
kernel(output, input, Val(Int(bank)), ndrange = size(output))
end
KernelAbstractions.synchronize(backend)
end
end
end
Profiler ran for 7.2 ms, capturing 3582 events. Host-side activity: calling CUDA APIs took 1.67 ms (23.24% of the trace) ┌──────────┬────────────┬───────┬───────────────────────────────────────┬─────────────────────────┐ │ Time (%) │ Total time │ Calls │ Time distribution │ Name │ ├──────────┼────────────┼───────┼───────────────────────────────────────┼─────────────────────────┤ │ 53.14% │ 3.83 ms │ 4 │ 956.89 µs ± 477.44 (709.06 ‥ 1672.98) │ cuStreamSynchronize │ │ 1.74% │ 125.65 µs │ 16 │ 7.85 µs ± 3.47 ( 4.77 ‥ 15.5) │ cuLaunchKernel │ │ 1.36% │ 97.75 µs │ 4 │ 24.44 µs ± 4.58 ( 20.5 ‥ 30.76) │ cudaLaunchKernel │ │ 0.92% │ 66.04 µs │ 8 │ 8.26 µs ± 4.12 ( 3.58 ‥ 15.02) │ cuMemAllocFromPoolAsync │ │ 0.07% │ 5.25 µs │ 8 │ 655.65 ns ± 594.34 ( 0.0 ‥ 1430.51) │ cudaGetLastError │ └──────────┴────────────┴───────┴───────────────────────────────────────┴─────────────────────────┘ Device-side activity: GPU was busy for 6.45 ms (89.56% of the trace) ┌──────────┬────────────┬───────┬──────────────────────────────────────┬───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ │ Time (%) │ Total time │ Calls │ Time distribution │ Name │ ├──────────┼────────────┼───────┼──────────────────────────────────────┼───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤ │ 30.03% │ 2.16 ms │ 4 │ 540.73 µs ± 1.99 (538.11 ‥ 542.4) │ gpu_lmem_transpose_kernel_(CompilerMetadata<DynamicSize, DynamicCheck, void, CartesianIndices<2, Tuple<OneTo<Int64>, OneTo<Int64>>>, NDRange<2, DynamicSize, StaticSize<_32__32_>, CartesianIndices<2, Tuple<OneTo<Int64>, OneTo<Int64>>>, void>>, CuDeviceArray<Float32, 2, 1>, Float32, Val<0>) │ │ 17.94% │ 1.29 ms │ 4 │ 323.0 µs ± 2.04 (321.15 ‥ 325.2) │ gpu_lmem_transpose_kernel_(CompilerMetadata<DynamicSize, DynamicCheck, void, CartesianIndices<2, Tuple<OneTo<Int64>, OneTo<Int64>>>, NDRange<2, DynamicSize, StaticSize<_32__32_>, CartesianIndices<2, Tuple<OneTo<Int64>, OneTo<Int64>>>, void>>, CuDeviceArray<Float32, 2, 1>, Float32, Val<1>) │ │ 17.86% │ 1.29 ms │ 4 │ 321.51 µs ± 0.81 (320.67 ‥ 322.58) │ gpu_lmem_copy_kernel_(CompilerMetadata<DynamicSize, DynamicCheck, void, CartesianIndices<2, Tuple<OneTo<Int64>, OneTo<Int64>>>, NDRange<2, DynamicSize, StaticSize<_32__32_>, CartesianIndices<2, Tuple<OneTo<Int64>, OneTo<Int64>>>, void>>, CuDeviceArray<Float32, 2, 1>, Float32, Val<0>) │ │ 17.85% │ 1.29 ms │ 4 │ 321.39 µs ± 1.66 (319.48 ‥ 323.3) │ gpu_lmem_copy_kernel_(CompilerMetadata<DynamicSize, DynamicCheck, void, CartesianIndices<2, Tuple<OneTo<Int64>, OneTo<Int64>>>, NDRange<2, DynamicSize, StaticSize<_32__32_>, CartesianIndices<2, Tuple<OneTo<Int64>, OneTo<Int64>>>, void>>, CuDeviceArray<Float32, 2, 1>, Float32, Val<1>) │ │ 5.89% │ 424.15 µs │ 4 │ 106.04 µs ± 3.63 (100.61 ‥ 108.24) │ void gen_sequenced<curandStateXORWOW, float, int, &float curand_uniform_noargs<curandStateXORWOW>(curandStateXORWOW*, int), rng_config<curandStateXORWOW, (curandOrdering)101>>(curandStateXORWOW*, float*, unsigned long, unsigned long, int) │ 
└──────────┴────────────┴───────┴──────────────────────────────────────┴───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ NVTX ranges: ┌──────────┬────────────┬───────┬─────────────────────────────────────────────┐ │ Time (%) │ Total time │ Calls │ Name │ ├──────────┼────────────┼───────┼─────────────────────────────────────────────┤ │ 33.08% │ 2.38 ms │ 1 │ Main.Localmem transpose (32, 32) bank=false │ │ 21.79% │ 1.57 ms │ 1 │ Main.Localmem copy (32, 32) bank=true │ │ 21.58% │ 1.55 ms │ 1 │ Main.Localmem copy (32, 32) bank=false │ │ 21.42% │ 1.54 ms │ 1 │ Main.Localmem transpose (32, 32) bank=true │ └──────────┴────────────┴───────┴─────────────────────────────────────────────┘
# Benchmark localmem + multiple elements per lane
CUDA.@profile for (name, kernel) in (
("copy", coalesced_copy_kernel!(backend, (TILE_DIM, BLOCK_ROWS))),
("transpose", coalesced_transpose_kernel!(backend, (TILE_DIM, BLOCK_ROWS))),
)
for bank in (true, false)
NVTX.@range "Localmem + multiple elements $name ($TILE_DIM, $BLOCK_ROWS) bank=$bank" let
input = rand!(allocate(backend, T, N, N))
output = similar(input)
# We want a number of blocks equivalent to (TILE_DIM, TILE_DIM)
# but our blocks are (TILE_DIM, BLOCK_ROWS) so we need to remove
# a factor from the size of the array, otherwise we get too many blocks
block_factor = div(TILE_DIM, BLOCK_ROWS)
ndrange = (N, div(N, block_factor))
# compile kernel
kernel(output, input, Val(Int(bank)), ndrange = ndrange)
for rep in 1:nreps
kernel(output, input, Val(Int(bank)), ndrange = ndrange)
end
KernelAbstractions.synchronize(backend)
end
end
end
Profiler ran for 4.65 ms, capturing 2934 events. Host-side activity: calling CUDA APIs took 1.1 ms (23.57% of the trace) ┌──────────┬────────────┬───────┬──────────────────────────────────────┬─────────────────────────┐ │ Time (%) │ Total time │ Calls │ Time distribution │ Name │ ├──────────┼────────────┼───────┼──────────────────────────────────────┼─────────────────────────┤ │ 23.73% │ 1.1 ms │ 4 │ 275.67 µs ± 545.62 ( 2.62 ‥ 1094.1) │ cuStreamSynchronize │ │ 4.08% │ 189.78 µs │ 16 │ 11.86 µs ± 5.74 ( 5.01 ‥ 28.37) │ cuLaunchKernel │ │ 1.81% │ 83.92 µs │ 4 │ 20.98 µs ± 8.79 ( 15.74 ‥ 34.09) │ cudaLaunchKernel │ │ 1.26% │ 58.65 µs │ 8 │ 7.33 µs ± 4.05 ( 3.34 ‥ 13.35) │ cuMemAllocFromPoolAsync │ │ 0.09% │ 4.29 µs │ 8 │ 536.44 ns ± 397.93 ( 0.0 ‥ 953.67) │ cudaGetLastError │ └──────────┴────────────┴───────┴──────────────────────────────────────┴─────────────────────────┘ Device-side activity: GPU was busy for 3.85 ms (82.79% of the trace) ┌──────────┬────────────┬───────┬──────────────────────────────────────┬───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ │ Time (%) │ Total time │ Calls │ Time distribution │ Name │ ├──────────┼────────────┼───────┼──────────────────────────────────────┼───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤ │ 35.41% │ 1.65 ms │ 4 │ 411.33 µs ± 0.3 (411.03 ‥ 411.75) │ gpu_coalesced_transpose_kernel_(CompilerMetadata<DynamicSize, DynamicCheck, void, CartesianIndices<2, Tuple<OneTo<Int64>, OneTo<Int64>>>, NDRange<2, DynamicSize, StaticSize<_32__8_>, CartesianIndices<2, Tuple<OneTo<Int64>, OneTo<Int64>>>, void>>, CuDeviceArray<Float32, 2, 1>, Float32, Val<0>) │ │ 14.25% │ 662.33 µs │ 4 │ 165.58 µs ± 0.5 (164.99 ‥ 166.18) │ gpu_coalesced_transpose_kernel_(CompilerMetadata<DynamicSize, DynamicCheck, void, CartesianIndices<2, Tuple<OneTo<Int64>, OneTo<Int64>>>, NDRange<2, DynamicSize, StaticSize<_32__8_>, CartesianIndices<2, Tuple<OneTo<Int64>, OneTo<Int64>>>, void>>, CuDeviceArray<Float32, 2, 1>, Float32, Val<1>) │ │ 11.99% │ 557.18 µs │ 4 │ 139.3 µs ± 2.53 (137.57 ‥ 143.05) │ gpu_coalesced_copy_kernel_(CompilerMetadata<DynamicSize, DynamicCheck, void, CartesianIndices<2, Tuple<OneTo<Int64>, OneTo<Int64>>>, NDRange<2, DynamicSize, StaticSize<_32__8_>, CartesianIndices<2, Tuple<OneTo<Int64>, OneTo<Int64>>>, void>>, CuDeviceArray<Float32, 2, 1>, Float32, Val<1>) │ │ 11.95% │ 555.28 µs │ 4 │ 138.82 µs ± 2.05 (137.57 ‥ 141.86) │ gpu_coalesced_copy_kernel_(CompilerMetadata<DynamicSize, DynamicCheck, void, CartesianIndices<2, Tuple<OneTo<Int64>, OneTo<Int64>>>, NDRange<2, DynamicSize, StaticSize<_32__8_>, CartesianIndices<2, Tuple<OneTo<Int64>, OneTo<Int64>>>, void>>, CuDeviceArray<Float32, 2, 1>, Float32, Val<0>) │ │ 9.19% │ 426.77 µs │ 4 │ 106.69 µs ± 2.59 ( 103.0 ‥ 108.48) │ void gen_sequenced<curandStateXORWOW, float, int, &float curand_uniform_noargs<curandStateXORWOW>(curandStateXORWOW*, int), rng_config<curandStateXORWOW, (curandOrdering)101>>(curandStateXORWOW*, float*, unsigned long, unsigned long, int) │ 
└──────────┴────────────┴───────┴──────────────────────────────────────┴───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ NVTX ranges: ┌──────────┬────────────┬───────┬────────────────────────────────────────────────────────────────┐ │ Time (%) │ Total time │ Calls │ Name │ ├──────────┼────────────┼───────┼────────────────────────────────────────────────────────────────┤ │ 40.50% │ 1.88 ms │ 1 │ Main.Localmem + multiple elements transpose (32, 8) bank=false │ │ 20.22% │ 939.37 µs │ 1 │ Main.Localmem + multiple elements copy (32, 8) bank=true │ │ 18.81% │ 874.04 µs │ 1 │ Main.Localmem + multiple elements transpose (32, 8) bank=true │ │ 16.71% │ 776.29 µs │ 1 │ Main.Localmem + multiple elements copy (32, 8) bank=false │ └──────────┴────────────┴───────┴────────────────────────────────────────────────────────────────┘
@kernel function racy_kernel!(out, arr)
i, j = @index(Global, NTuple)
for k in 1:size(out, 1)
out[k, i] += arr[i, j]
end
end
racy_kernel! (generic function with 4 methods)
using ImageShow
img = zeros(Float32, (50, 50));
img[10:20, 10:20] .= 1;
img[35:45, 35:45] .= 2;
simshow(img)
out = KernelAbstractions.zeros(backend, Float32, size(img));
racy_kernel!(backend)(out, adapt(backend, img), ndrange=size(img))
simshow(Array(out))
using Atomix: @atomic, @atomicswap, @atomicreplace
@kernel function nonracy_kernel!(out, arr)
i, j = @index(Global, NTuple)
for k in 1:size(out, 1)
@atomic out[k, i] += arr[i, j]
end
end
nonracy_kernel! (generic function with 4 methods)
out = KernelAbstractions.zeros(backend, Float32, size(img));
nonracy_kernel!(backend)(out, adapt(backend, img), ndrange=size(img))
simshow(Array(out))
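In the racy version, work-items with the same i but different j increment the same out[k, i] entries concurrently, so updates can be lost; @atomic serializes those increments. As a quick sanity check (my addition, assuming img and out as defined above), every row of out should now equal the row-sums of img:
# every entry out[k, i] should equal sum(img[i, :])
expected = repeat(permutedims(sum(img, dims = 2)), size(img, 1))
@assert Array(out) ≈ expected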
@kernel function naive_matmul_kernel!(output, a, b)
i, j = @index(Global, NTuple)
# creating a temporary sum variable for matrix multiplication
tmp_sum = zero(eltype(output))
for k in 1:size(a)[2]
tmp_sum += a[i, k] * b[k, j]
end
output[i, j] = tmp_sum
end
# Creating a wrapper kernel for launching with error checks
function naive_matmul!(output, a, b)
    if size(a)[2] != size(b)[1]
        println("Matrix size mismatch!")
        return nothing
    end
    backend = KernelAbstractions.get_backend(a)
    kernel! = naive_matmul_kernel!(backend)
    kernel!(output, a, b, ndrange = size(output))
    return
end
naive_matmul! (generic function with 1 method)
let
a = rand!(allocate(backend, Float32, 256, 123))
b = rand!(allocate(backend, Float32, 123, 45))
output = KernelAbstractions.zeros(backend, Float32, 256, 45)
naive_matmul!(output, a, b)
@assert isapprox(output, a * b)
end
@kernel unsafe_indices = true function coalesced_matmul_kernel!(
output, @Const(A), @Const(B),
::Val{BANK} = Val(1),
) where {BANK}
gi, gj = @index(Group, NTuple)
i, j = @index(Local, NTuple)
TILE_DIM = @uniform @groupsize()[1]
# +1 to avoid bank conflicts on shared memory
tile1 = @localmem eltype(output) (TILE_DIM + BANK, TILE_DIM)
tile2 = @localmem eltype(output) (TILE_DIM + BANK, TILE_DIM)
# private variable for tile output
outval = @private eltype(output) 1
@inbounds outval[1] = -zero(eltype(output))
@uniform N = size(output, 1)
@uniform M = size(output, 2)
@uniform R = size(A, 2)
# number of tiles depends on inner dimension
@uniform NUM_TILES = div(R + TILE_DIM - 1, TILE_DIM)
# loop over all tiles needed for this calculation
for t in 0:(NUM_TILES - 1)
# Can't use @index(Global), because we use a smaller ndrange
I = (gi - 1) * TILE_DIM + i
J = (gj - 1) * TILE_DIM + j
# load inputs into tiles, with bounds checking for non-square matrices
if I <= N && t * TILE_DIM + j <= R
@inbounds tile1[i, j] = A[I, t * TILE_DIM + j]
else
@inbounds tile1[i, j] = 0.0
end
if t * TILE_DIM + i <= R && J <= M
@inbounds tile2[i, j] = B[t * TILE_DIM + i, J]
else
@inbounds tile2[i, j] = 0.0
end
# wait for all tiles to be loaded
@synchronize
# get global values again
I = (gi - 1) * TILE_DIM + i
J = (gj - 1) * TILE_DIM + j
# calculate value of spot in output, use temporary value to allow for vectorization
out = zero(eltype(output))
@simd for k in 1:TILE_DIM
@inbounds out += tile1[i, k] * tile2[k, j]
end
outval[1] += out
@synchronize
end
# get global indices again
I = (gi - 1) * TILE_DIM + i
J = (gj - 1) * TILE_DIM + j
# save if inbounds
if I <= N && J <= M
@inbounds output[I, J] = outval[1]
end
end
# Creating a wrapper kernel for launching with error checks
function coalesced_matmul!(output, a, b)
if size(a)[2] != size(b)[1]
println("Matrix size mismatch!")
return nothing
end
backend = KernelAbstractions.get_backend(a)
kernel! = coalesced_matmul_kernel!(backend, (TILE_DIM, TILE_DIM))
kernel!(output, a, b, ndrange = size(output))
return
end
coalesced_matmul! (generic function with 1 method)
let
a = rand!(allocate(backend, Float32, 256, 123))
b = rand!(allocate(backend, Float32, 123, 45))
output = KernelAbstractions.zeros(backend, Float32, 256, 45)
coalesced_matmul!(output, a, b)
@assert isapprox(output, a * b)
end
import LinearAlgebra
let
N = 1024
R = 512
M = 2048
T = Float64
A = rand!(allocate(backend, T, N, R))
B = rand!(allocate(backend, T, R, M))
output_naive = KernelAbstractions.zeros(backend, T, N, M)
output_coalesced = KernelAbstractions.zeros(backend, T, N, M)
output_mul = KernelAbstractions.zeros(backend, T, N, M)
CUDA.@profile for _ in 1:nreps
NVTX.@range "Naive Matmul" begin
naive_matmul!(output_naive, A, B)
KernelAbstractions.synchronize(backend)
end
NVTX.@range "Coalesced Matmul" begin
coalesced_matmul!(output_coalesced, A, B)
KernelAbstractions.synchronize(backend)
end
NVTX.@range "LinearAlgebra.mul!" begin
LinearAlgebra.mul!(output_mul, A, B)
KernelAbstractions.synchronize(backend)
end
end
end
Profiler ran for 211.92 ms, capturing 7394 events. Host-side activity: calling CUDA APIs took 67.39 ms (31.80% of the trace) ┌──────────┬────────────┬───────┬──────────────────────────────────────┬─────────────────────────┐ │ Time (%) │ Total time │ Calls │ Time distribution │ Name │ ├──────────┼────────────┼───────┼──────────────────────────────────────┼─────────────────────────┤ │ 97.21% │ 206.02 ms │ 9 │ 22.89 ms ± 2.26 ( 20.4 ‥ 26.56) │ cuStreamSynchronize │ │ 0.08% │ 162.36 µs │ 6 │ 27.06 µs ± 4.18 ( 20.98 ‥ 31.23) │ cuLaunchKernel │ │ 0.03% │ 70.33 µs │ 6 │ 11.72 µs ± 6.26 ( 4.29 ‥ 18.12) │ cuMemAllocFromPoolAsync │ │ 0.03% │ 62.23 µs │ 6 │ 10.37 µs ± 7.08 ( 3.58 ‥ 18.36) │ cuMemcpyHtoDAsync │ │ 0.02% │ 52.21 µs │ 3 │ 17.4 µs ± 0.63 ( 16.93 ‥ 18.12) │ cudaLaunchKernel │ │ 0.00% │ 1.91 µs │ 3 │ 635.78 ns ± 275.3 (476.84 ‥ 953.67) │ cudaGetLastError │ └──────────┴────────────┴───────┴──────────────────────────────────────┴─────────────────────────┘ Device-side activity: GPU was busy for 210.23 ms (99.20% of the trace) ┌──────────┬────────────┬───────┬──────────────────────────────────────┬─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ │ Time (%) │ Total time │ Calls │ Time distribution │ Name │ ├──────────┼────────────┼───────┼──────────────────────────────────────┼─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤ │ 34.03% │ 72.11 ms │ 3 │ 24.04 ms ± 2.59 ( 22.53 ‥ 27.03) │ gpu_coalesced_matmul_kernel_(CompilerMetadata<DynamicSize, DynamicCheck, void, CartesianIndices<2, Tuple<OneTo<Int64>, OneTo<Int64>>>, NDRange<2, DynamicSize, StaticSize<_32__32_>, CartesianIndices<2, Tuple<OneTo<Int64>, OneTo<Int64>>>, void>>, CuDeviceArray<Float64, 2, 1>, Float64, Float64) │ │ 33.54% │ 71.07 ms │ 3 │ 23.69 ms ± 2.26 ( 21.12 ‥ 25.37) │ gpu_naive_matmul_kernel_(CompilerMetadata<DynamicSize, DynamicCheck, void, CartesianIndices<2, Tuple<OneTo<Int64>, OneTo<Int64>>>, NDRange<2, DynamicSize, DynamicSize, CartesianIndices<2, Tuple<OneTo<Int64>, OneTo<Int64>>>, CartesianIndices<2, Tuple<OneTo<Int64>, OneTo<Int64>>>>>, CuDeviceArray<Float64, 2, 1>, CuDeviceArray<Float64, 2, 1>, CuDeviceArray<Float64, 2, 1>) │ │ 31.63% │ 67.04 ms │ 3 │ 22.35 ms ± 2.49 ( 20.91 ‥ 25.22) │ volta_dgemm_128x64_nn │ │ 0.00% │ 4.53 µs │ 6 │ 754.99 ns ± 179.47 (476.84 ‥ 953.67) │ [copy pageable to device memory] │ └──────────┴────────────┴───────┴──────────────────────────────────────┴─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ NVTX ranges: ┌──────────┬────────────┬───────┬─────────────────────────────────────┬─────────────────────────┐ │ Time (%) │ Total time │ 
Calls │ Time distribution │ Name │ ├──────────┼────────────┼───────┼─────────────────────────────────────┼─────────────────────────┤ │ 34.23% │ 72.54 ms │ 3 │ 24.18 ms ± 2.6 ( 22.66 ‥ 27.18) │ Main.Coalesced Matmul │ │ 33.77% │ 71.56 ms │ 3 │ 23.85 ms ± 2.29 ( 21.26 ‥ 25.57) │ Main.Naive Matmul │ │ 31.98% │ 67.78 ms │ 3 │ 22.59 ms ± 2.51 ( 21.14 ‥ 25.49) │ Main.LinearAlgebra.mul! │ └──────────┴────────────┴───────┴─────────────────────────────────────┴─────────────────────────┘