using PyPlot
import OpenCL
const cl = OpenCL;
const BLOCK_SIZE = 16;
naive_transpose = "
__kernel void transpose(__global float *a_t,
                        __global float *a,
                        unsigned a_width,
                        unsigned a_height)
{
    // one element per work-item: the reads are coalesced,
    // but the writes are strided by a_height
    int read_idx  = get_global_id(0) + get_global_id(1) * a_width;
    int write_idx = get_global_id(1) + get_global_id(0) * a_height;
    a_t[write_idx] = a[read_idx];
}";
block_transpose = "
#define BLOCK_SIZE $(BLOCK_SIZE)
__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, BLOCK_SIZE, 1)))
void transpose(__global float *a_t,
               __global float *a,
               unsigned a_width,
               unsigned a_height,
               __local float *a_local)
{
    int base_idx_a   = get_group_id(0) * BLOCK_SIZE + get_group_id(1) * (BLOCK_SIZE * a_width);
    int base_idx_a_t = get_group_id(1) * BLOCK_SIZE + get_group_id(0) * (BLOCK_SIZE * a_height);
    int glob_idx_a   = base_idx_a   + get_local_id(0) + a_width  * get_local_id(1);
    int glob_idx_a_t = base_idx_a_t + get_local_id(0) + a_height * get_local_id(1);

    // stage a BLOCK_SIZE x BLOCK_SIZE tile in local memory with a coalesced read...
    a_local[get_local_id(1) * BLOCK_SIZE + get_local_id(0)] = a[glob_idx_a];

    barrier(CLK_LOCAL_MEM_FENCE);

    // ...then write it back transposed, so the global write is coalesced too
    a_t[glob_idx_a_t] = a_local[get_local_id(0) * BLOCK_SIZE + get_local_id(1)];
}";
function enqueue_naive_kernel{T}(queue::cl.CmdQueue,
                                 k::cl.Kernel,
                                 dst::cl.Buffer{T},
                                 src::cl.Buffer{T},
                                 dims::Dims)
    h, w = dims
    @assert w % BLOCK_SIZE == 0
    @assert h % BLOCK_SIZE == 0
    cl.set_args!(k, dst, src, uint32(h), uint32(w))
    cl.enqueue_kernel(queue, k, (h, w), (BLOCK_SIZE, BLOCK_SIZE))
end
function enqueue_block_kernel{T}(queue::cl.CmdQueue,
                                 k::cl.Kernel,
                                 dst::cl.Buffer{T},
                                 src::cl.Buffer{T},
                                 dims::Dims)
    h, w = dims
    @assert w % BLOCK_SIZE == 0
    @assert h % BLOCK_SIZE == 0
    # local tile padded to BLOCK_SIZE x (BLOCK_SIZE + 1) floats (see the note above)
    lmem = cl.LocalMem(Float32, BLOCK_SIZE * (BLOCK_SIZE + 1))
    cl.set_args!(k, dst, src, uint32(h), uint32(w), lmem)
    cl.enqueue_kernel(queue, k, (h, w), (BLOCK_SIZE, BLOCK_SIZE))
end
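Before benchmarking, it is worth sanity-checking a kernel against Julia's built-in transpose. A hypothetical helper, assuming a context, profiling queue, and kernels built as in benchmark_transpose below:

function check_transpose(ctx, queue, kern, enqueue, s=64)
    src = rand(Float32, (s, s))
    a_buf   = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=src)
    a_t_buf = cl.Buffer(Float32, ctx, :w, sizeof(src))
    cl.wait(enqueue(queue, kern, a_t_buf, a_buf, size(src)))
    # read the device result back and compare with the host transpose
    @assert reshape(cl.read(queue, a_t_buf), (s, s)) == src'
end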
function benchmark_transpose()
    gpu_device = first(cl.devices(:gpu))
    ctx = cl.Context(gpu_device)
    for dev in cl.devices(ctx)
        @assert dev[:local_mem_size] > 0
    end

    prg = cl.Program(ctx, source=naive_transpose) |> cl.build!
    naive_kernel = cl.Kernel(prg, "transpose")
    prg = cl.Program(ctx, source=block_transpose) |> cl.build!
    block_kernel = cl.Kernel(prg, "transpose")

    # a profiling queue is required for the :profile_duration event info below
    queue = cl.CmdQueue(ctx, :profile)

    array_sizes = [int((2^i) / 32) * 32 for i in 6:12]
    mem_bandwidths = Float32[]
    for (name, method, kern) in (("naive", enqueue_naive_kernel, naive_kernel),
                                 ("block", enqueue_block_kernel, block_kernel))
        for s in array_sizes
            src = rand(Float32, (s, s))
            a_buf   = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=src)
            a_t_buf = cl.Buffer(Float32, ctx, :w, sizeof(src))
            # warm up...
            for i in 1:4
                method(queue, kern, a_t_buf, a_buf, size(src))
            end
            # profile...
            count = 20
            events = cl.Event[]
            for i in 1:count
                push!(events, method(queue, kern, a_t_buf, a_buf, size(src)))
            end
            cl.wait(events[end])
            # profile durations are in nanoseconds; each element is read
            # once and written once, hence the factor of 2
            time = sum([evt[:profile_duration] for evt in events])
            mem_bw = 2 * sizeof(src) * count / (time * 1e-9)
            push!(mem_bandwidths, mem_bw)
            @printf("benchmarking %s, array size: %s^2, %.3f GB/s\n", name, s, mem_bw / 1e9)
            # causes problems with IJulia's show...
            #cl.release!(a_buf)
            #cl.release!(a_t_buf)
        end
    end

    # plot results, sleep a little to prevent overlap with show...
    sleep(1)
    clf()
    n_sizes = length(array_sizes)
    naive_bandwidths = mem_bandwidths[1:n_sizes]
    block_bandwidths = mem_bandwidths[n_sizes+1:end]
    plot(array_sizes, naive_bandwidths / 1e9, "o-", label="naive")
    plot(array_sizes, block_bandwidths / 1e9, "o-", label="block")
    title("Transpose Tests for $(gpu_device[:name])")
    xlabel("Matrix width/height N")
    ylabel("Memory Bandwidth [GB/s]")
    legend(loc="best")
    grid(true)
    show()
end
benchmark_transpose()
benchmarking naive, array size: 64^2, 1.601 GB/s
benchmarking naive, array size: 128^2, 14.433 GB/s
benchmarking naive, array size: 256^2, 21.358 GB/s
benchmarking naive, array size: 512^2, 13.429 GB/s
benchmarking naive, array size: 1024^2, 10.693 GB/s
benchmarking naive, array size: 2048^2, 17.667 GB/s
benchmarking naive, array size: 4096^2, 7.202 GB/s
benchmarking block, array size: 64^2, 10.869 GB/s
benchmarking block, array size: 128^2, 42.130 GB/s
benchmarking block, array size: 256^2, 113.791 GB/s
benchmarking block, array size: 512^2, 100.789 GB/s
benchmarking block, array size: 1024^2, 52.034 GB/s
benchmarking block, array size: 2048^2, 20.124 GB/s
benchmarking block, array size: 4096^2, 8.435 GB/s