import Pkg
Pkg.activate("@colab")
Pkg.add([
    "CUDA",
    "KernelAbstractions",
    "Atomix",
    "AcceleratedKernels",
    "BenchmarkTools",
    "Adapt",
    "NVTX",
    "ImageShow",
])

versioninfo()

using CUDA, KernelAbstractions, Adapt

# Reference CPU implementation: a plain loop over linear indices.
function copy_cpu!(A, B)
    for I in 1:length(A)
        @inbounds A[I] = B[I]
    end
end

# The same copy as a KernelAbstractions kernel: one work-item per element.
@kernel function copy_kernel!(A, B)
    I = @index(Global)
    @inbounds A[I] = B[I]
end

# Launch wrapper: derive the backend from the arguments and check they match.
function copy_ka!(A, B)
    backend = get_backend(A)
    @assert size(A) == size(B)
    @assert get_backend(B) == backend

    kernel = copy_kernel!(backend)
    kernel(A, B, ndrange = length(A))
    return
end

using CUDA: i32

# Hand-written CUDA.jl kernel for comparison.
function copy_kernel_cuda!(A, B)
    I = (blockIdx().x - 1i32) * blockDim().x + threadIdx().x
    if I <= length(A)
        @inbounds A[I] = B[I]
    end
    return nothing
end

# Use the occupancy API to pick a launch configuration.
function copy_cuda!(A, B)
    kernel = @cuda launch=false copy_kernel_cuda!(A, B)
    config = launch_configuration(kernel.fun)
    threads = min(length(A), config.threads)
    blocks = cld(length(A), threads)
    kernel(A, B; threads, blocks)
end

B = rand(64_000);

let A = similar(B)
    copy_cpu!(A, B)
    @assert A == B
end

d_B = adapt(CuArray, B);
typeof(d_B)

let d_A = similar(d_B)
    copy_cuda!(d_A, d_B)
    @assert d_A == d_B
end

using BenchmarkTools

@benchmark copy_cuda!(d_A, $d_B) setup=(d_A = similar(d_B))

CUDA.@profile let d_A = similar(d_B)
    for _ in 1:10
        copy_cuda!(d_A, d_B)
    end
end

@benchmark CUDA.@sync(copy_cuda!(d_A, $d_B)) setup=(d_A = similar(d_B))

let A = similar(B)
    copy_ka!(A, B)
    @assert A == B
end

let d_A = similar(d_B)
    copy_ka!(d_A, d_B)
    @assert d_A == d_B
end

@benchmark copy_ka!(A, $B) setup=(A = similar(B))

@benchmark CUDA.@sync(copy_ka!(d_A, $d_B)) setup=(d_A = similar(d_B))

CUDA.@profile let d_A = similar(d_B)
    for _ in 1:10
        copy_ka!(d_A, d_B)
    end
end

# Faster variant: compute the global index by hand and guard the bounds check
# ourselves, skipping KernelAbstractions' implicit index validation.
@kernel unsafe_indices=true function copy_kernel_faster!(A, B)
    N = @uniform prod(@groupsize())
    I = (@index(Group, Linear) - 1i32) * N + @index(Local, Linear)
    if I <= length(A)
        @inbounds A[I] = B[I]
    end
end

CUDA.@profile let d_A = similar(d_B)
    for _ in 1:10
        copy_kernel_faster!(CUDABackend())(d_A, d_B, ndrange=length(d_A))
    end
end

const nreps = 3
const N = 2048
const T = Float32
const TILE_DIM = 32
const BLOCK_ROWS = 8

@kernel function simple_copy_kernel!(output, @Const(input))
    I, J = @index(Global, NTuple)
    @inbounds output[I, J] = input[I, J]
end

@kernel function simple_transpose_kernel!(output, @Const(input))
    I, J = @index(Global, NTuple)
    @inbounds output[J, I] = input[I, J]
end

# Copy through shared (local) memory.
@kernel unsafe_indices = true function lmem_copy_kernel!(
        output, @Const(input),
        ::Val{BANK} = Val(1),
    ) where {BANK}
    I, J = @index(Global, NTuple)
    i, j = @index(Local, NTuple)

    N = @uniform @groupsize()[1]
    M = @uniform @groupsize()[2]

    # +1 to avoid bank conflicts on shared memory
    tile = @localmem eltype(output) (N + BANK, M)

    @inbounds tile[i, j] = input[I, J]
    @synchronize
    @inbounds output[I, J] = tile[i, j]
end

# Transpose through shared (local) memory.
@kernel unsafe_indices = true function lmem_transpose_kernel!(
        output, @Const(input),
        ::Val{BANK} = Val(1),
    ) where {BANK}
    gi, gj = @index(Group, NTuple)
    i, j = @index(Local, NTuple)

    N = @uniform @groupsize()[1]
    M = @uniform @groupsize()[2]

    # +1 to avoid bank conflicts on shared memory
    tile = @localmem eltype(output) (N + BANK, M)

    # Manually calculate global indices,
    # since later on we need to pivot the group index.
    I = (gi - 1) * N + i
    J = (gj - 1) * M + j

    @inbounds tile[i, j] = input[I, J]

    @synchronize

    # Pivot the group index
    I = (gj - 1) * M + i
    J = (gi - 1) * N + j

    @inbounds output[I, J] = tile[j, i]
end
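# Optional correctness check for the transpose kernel above (not part of the original
# benchmark loop): a minimal sketch assuming a CUDA device is available and that the
# matrix side is a multiple of TILE_DIM, since `unsafe_indices = true` removes the
# out-of-bounds guard. `Random.rand!` works on device arrays via CUDA.jl.
using Random: rand!
let backend = CUDABackend()
    input = rand!(allocate(backend, T, N, N))
    output = similar(input)
    lmem_transpose_kernel!(backend, (TILE_DIM, TILE_DIM))(
        output, input, Val(1), ndrange = size(output))
    KernelAbstractions.synchronize(backend)
    @assert Array(output) == permutedims(Array(input))
end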
using KernelAbstractions.Extras: @unroll

# Coalesced copy: each work-item handles TILE_DIM ÷ BLOCK_ROWS elements.
@kernel unsafe_indices=true function coalesced_copy_kernel!(
        output, @Const(input),
        ::Val{BANK} = Val(1),
    ) where {BANK}
    gi, gj = @index(Group, NTuple)
    i, j = @index(Local, NTuple)

    TILE_DIM = @uniform @groupsize()[1]
    BLOCK_ROWS = @uniform @groupsize()[2]

    # +1 to avoid bank conflicts on shared memory
    tile = @localmem eltype(output) (TILE_DIM + BANK, TILE_DIM)

    # Can't use @index(Global), because we use a smaller ndrange
    I = (gi - 1) * TILE_DIM + i
    J = (gj - 1) * TILE_DIM + j

    @unroll for k in 0:BLOCK_ROWS:(TILE_DIM - 1)
        @inbounds tile[i, j + k] = input[I, J + k]
    end

    @synchronize

    @unroll for k in 0:BLOCK_ROWS:(TILE_DIM - 1)
        @inbounds output[I, J + k] = tile[i, j + k]
    end
end

# Coalesced transpose: same tiling, but the group offsets are swapped on the write.
@kernel unsafe_indices = true function coalesced_transpose_kernel!(
        output, @Const(input),
        ::Val{BANK} = Val(1),
    ) where {BANK}
    gi, gj = @index(Group, NTuple)
    i, j = @index(Local, NTuple)

    TILE_DIM = @uniform @groupsize()[1]
    BLOCK_ROWS = @uniform @groupsize()[2]

    # +1 to avoid bank conflicts on shared memory
    tile = @localmem eltype(output) (TILE_DIM + BANK, TILE_DIM)

    # Can't use @index(Global), because we use a smaller ndrange
    I = (gi - 1) * TILE_DIM + i
    J = (gj - 1) * TILE_DIM + j

    @unroll for k in 0:BLOCK_ROWS:(TILE_DIM - 1)
        @inbounds tile[i, j + k] = input[I, J + k]
    end

    @synchronize

    # Transpose block offsets
    I = (gj - 1) * TILE_DIM + i
    J = (gi - 1) * TILE_DIM + j

    @unroll for k in 0:BLOCK_ROWS:(TILE_DIM - 1)
        @inbounds output[I, J + k] = tile[j + k, i]
    end
end

using NVTX, Random

backend = CUDABackend()

# Benchmark the simple kernels with different workgroup shapes
CUDA.@profile for block_dims in ((TILE_DIM, TILE_DIM), (TILE_DIM * TILE_DIM, 1), (1, TILE_DIM * TILE_DIM))
    for (name, kernel) in (
            ("copy", simple_copy_kernel!(backend, block_dims)),
            ("transpose", simple_transpose_kernel!(backend, block_dims)),
        )
        NVTX.@range "Simple $name $block_dims" let input = rand!(allocate(backend, T, N, N))
            output = similar(input)
            # compile kernel
            kernel(output, input, ndrange = size(output))
            for rep in 1:nreps
                kernel(output, input, ndrange = size(output))
            end
            KernelAbstractions.synchronize(backend)
        end
    end
end

# Benchmark localmem
CUDA.@profile for (name, kernel) in (
        ("copy", lmem_copy_kernel!(backend, (TILE_DIM, TILE_DIM))),
        ("transpose", lmem_transpose_kernel!(backend, (TILE_DIM, TILE_DIM))),
    )
    for bank in (true, false)
        NVTX.@range "Localmem $name ($TILE_DIM, $TILE_DIM) bank=$bank" let input = rand!(allocate(backend, T, N, N))
            output = similar(input)
            # compile kernel
            kernel(output, input, Val(Int(bank)), ndrange = size(output))
            for rep in 1:nreps
                kernel(output, input, Val(Int(bank)), ndrange = size(output))
            end
            KernelAbstractions.synchronize(backend)
        end
    end
end

# Benchmark localmem + multiple elements per lane
CUDA.@profile for (name, kernel) in (
        ("copy", coalesced_copy_kernel!(backend, (TILE_DIM, BLOCK_ROWS))),
        ("transpose", coalesced_transpose_kernel!(backend, (TILE_DIM, BLOCK_ROWS))),
    )
    for bank in (true, false)
        NVTX.@range "Localmem + multiple elements $name ($TILE_DIM, $BLOCK_ROWS) bank=$bank" let input = rand!(allocate(backend, T, N, N))
            output = similar(input)
            # We want a number of blocks equivalent to (TILE_DIM, TILE_DIM),
            # but our workgroups are (TILE_DIM, BLOCK_ROWS), so we shrink the
            # ndrange accordingly, otherwise we would launch too many blocks.
            block_factor = div(TILE_DIM, BLOCK_ROWS)
            ndrange = (N, div(N, block_factor))
            # compile kernel
            kernel(output, input, Val(Int(bank)), ndrange = ndrange)
            for rep in 1:nreps
                kernel(output, input, Val(Int(bank)), ndrange = ndrange)
            end
            KernelAbstractions.synchronize(backend)
        end
    end
end
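# Rough effective-bandwidth estimate for the coalesced transpose; a sketch that is not
# part of the profiling runs above. It reuses `backend`, `T`, `N`, `TILE_DIM` and
# `BLOCK_ROWS` from earlier and counts one read plus one write per element.
let kernel = coalesced_transpose_kernel!(backend, (TILE_DIM, BLOCK_ROWS))
    input = rand!(allocate(backend, T, N, N))
    output = similar(input)
    ndrange = (N, div(N, div(TILE_DIM, BLOCK_ROWS)))
    # warm up / compile
    kernel(output, input, Val(1), ndrange = ndrange)
    KernelAbstractions.synchronize(backend)
    t = @elapsed begin
        kernel(output, input, Val(1), ndrange = ndrange)
        KernelAbstractions.synchronize(backend)
    end
    gigabytes = 2 * sizeof(T) * N * N / 1e9
    println("coalesced transpose ≈ $(round(gigabytes / t, digits = 1)) GB/s")
end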
# A kernel with a data race: work-items from different columns j all
# accumulate into the same entries out[k, i].
@kernel function racy_kernel!(out, arr)
    i, j = @index(Global, NTuple)
    for k in 1:size(out, 1)
        out[k, i] += arr[i, j]
    end
end

using ImageShow

img = zeros(Float32, (50, 50));
img[10:20, 10:20] .= 1;
img[35:45, 35:45] .= 2;

simshow(img)

out = KernelAbstractions.zeros(backend, Float32, size(img));
racy_kernel!(backend)(out, adapt(backend, img), ndrange=size(img))
simshow(Array(out))

using Atomix: @atomic, @atomicswap, @atomicreplace

# Fix the race by making each update atomic.
@kernel function nonracy_kernel!(out, arr)
    i, j = @index(Global, NTuple)
    for k in 1:size(out, 1)
        @atomic out[k, i] += arr[i, j]
    end
end

out = KernelAbstractions.zeros(backend, Float32, size(img));
nonracy_kernel!(backend)(out, adapt(backend, img), ndrange=size(img))
simshow(Array(out))
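# Optional check (not in the original): with atomics the result is deterministic, and
# each entry out[k, i] should equal the sum of row i of `img`, independent of k.
let expected = repeat(sum(img, dims = 2)', size(img, 1), 1)
    @assert Array(out) ≈ expected
end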
# Naive matrix multiplication: one work-item per output element.
@kernel function naive_matmul_kernel!(output, a, b)
    i, j = @index(Global, NTuple)

    # creating a temporary sum variable for matrix multiplication
    tmp_sum = zero(eltype(output))
    for k in 1:size(a)[2]
        tmp_sum += a[i, k] * b[k, j]
    end

    output[i, j] = tmp_sum
end

# Creating a wrapper kernel for launching with error checks
function naive_matmul!(output, a, b)
    if size(a)[2] != size(b)[1]
        println("Matrix size mismatch!")
        return nothing
    end
    backend = KernelAbstractions.get_backend(a)
    kernel! = naive_matmul_kernel!(backend)
    kernel!(output, a, b, ndrange = size(output))
    return
end

let a = rand!(allocate(backend, Float32, 256, 123))
    b = rand!(allocate(backend, Float32, 123, 45))
    output = KernelAbstractions.zeros(backend, Float32, 256, 45)
    naive_matmul!(output, a, b)
    @assert isapprox(output, a * b)
end

# Tiled matrix multiplication using shared (local) memory.
@kernel unsafe_indices = true function coalesced_matmul_kernel!(
        output, @Const(A), @Const(B),
        ::Val{BANK} = Val(1),
    ) where {BANK}
    gi, gj = @index(Group, NTuple)
    i, j = @index(Local, NTuple)

    TILE_DIM = @uniform @groupsize()[1]

    # +1 to avoid bank conflicts on shared memory
    tile1 = @localmem eltype(output) (TILE_DIM + BANK, TILE_DIM)
    tile2 = @localmem eltype(output) (TILE_DIM + BANK, TILE_DIM)

    # private variable for tile output
    outval = @private eltype(output) 1
    @inbounds outval[1] = -zero(eltype(output))

    @uniform N = size(output, 1)
    @uniform M = size(output, 2)
    @uniform R = size(A, 2)

    # number of tiles depends on inner dimension
    @uniform NUM_TILES = div(R + TILE_DIM - 1, TILE_DIM)

    # loop over all tiles needed for this calculation
    for t in 0:(NUM_TILES - 1)
        # Can't use @index(Global), because we use a smaller ndrange
        I = (gi - 1) * TILE_DIM + i
        J = (gj - 1) * TILE_DIM + j

        # load inputs into tiles, with bounds checking for non-square matrices
        if I <= N && t * TILE_DIM + j <= R
            @inbounds tile1[i, j] = A[I, t * TILE_DIM + j]
        else
            @inbounds tile1[i, j] = 0.0
        end
        if t * TILE_DIM + i <= R && J <= M
            @inbounds tile2[i, j] = B[t * TILE_DIM + i, J]
        else
            @inbounds tile2[i, j] = 0.0
        end

        # wait for all tiles to be loaded
        @synchronize

        # get global values again
        I = (gi - 1) * TILE_DIM + i
        J = (gj - 1) * TILE_DIM + j

        # calculate value of spot in output, use temporary value to allow for vectorization
        out = zero(eltype(output))
        @simd for k in 1:TILE_DIM
            @inbounds out += tile1[i, k] * tile2[k, j]
        end
        outval[1] += out

        @synchronize
    end

    # get global indices again
    I = (gi - 1) * TILE_DIM + i
    J = (gj - 1) * TILE_DIM + j

    # save if inbounds
    if I <= N && J <= M
        @inbounds output[I, J] = outval[1]
    end
end

# Creating a wrapper kernel for launching with error checks
function coalesced_matmul!(output, a, b)
    if size(a)[2] != size(b)[1]
        println("Matrix size mismatch!")
        return nothing
    end
    backend = KernelAbstractions.get_backend(a)
    kernel! = coalesced_matmul_kernel!(backend, (TILE_DIM, TILE_DIM))
    kernel!(output, a, b, ndrange = size(output))
    return
end

let a = rand!(allocate(backend, Float32, 256, 123))
    b = rand!(allocate(backend, Float32, 123, 45))
    output = KernelAbstractions.zeros(backend, Float32, 256, 45)
    coalesced_matmul!(output, a, b)
    @assert isapprox(output, a * b)
end

import LinearAlgebra

# Profile the two hand-written kernels against LinearAlgebra.mul!.
let N = 1024
    R = 512
    M = 2048
    T = Float64

    A = rand!(allocate(backend, T, N, R))
    B = rand!(allocate(backend, T, R, M))

    output_naive = KernelAbstractions.zeros(backend, T, N, M)
    output_coalesced = KernelAbstractions.zeros(backend, T, N, M)
    output_mul = KernelAbstractions.zeros(backend, T, N, M)

    CUDA.@profile for _ in 1:nreps
        NVTX.@range "Naive Matmul" begin
            naive_matmul!(output_naive, A, B)
            KernelAbstractions.synchronize(backend)
        end
        NVTX.@range "Coalesced Matmul" begin
            coalesced_matmul!(output_coalesced, A, B)
            KernelAbstractions.synchronize(backend)
        end
        NVTX.@range "LinearAlgebra.mul!" begin
            LinearAlgebra.mul!(output_mul, A, B)
            KernelAbstractions.synchronize(backend)
        end
    end
end
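# Rough throughput estimate for the library matmul; a sketch outside the profiled loop
# above, using the same problem sizes as the comparison. A GEMM of this shape performs
# 2 * N * R * M floating-point operations.
let N = 1024, R = 512, M = 2048, T = Float64
    A = rand!(allocate(backend, T, N, R))
    B = rand!(allocate(backend, T, R, M))
    C = KernelAbstractions.zeros(backend, T, N, M)
    # warm up / compile
    LinearAlgebra.mul!(C, A, B)
    KernelAbstractions.synchronize(backend)
    t = @elapsed begin
        LinearAlgebra.mul!(C, A, B)
        KernelAbstractions.synchronize(backend)
    end
    println("mul! ≈ $(round(2 * N * R * M / t / 1e9, digits = 1)) GFLOP/s")
end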