#!/usr/bin/env python
# coding: utf-8

# # Axis transposition benchmarking
# This notebook compares the performance of different implementations of transposing axes.
#
# **Note:** Benchmarking results vary heavily depending on image size, kernel size, used operations, parameters and used hardware. Adapt this notebook to your use-case scenario and benchmark on your target hardware. If you have different scenarios or use-cases, you are very welcome to submit your notebook as a pull request!

# In[1]:


import pyclesperanto_prototype as cle
import numpy as np
import time
import cupy as cp

# To measure kernel execution duration properly, we need to set this flag.
# It will slow down execution of workflows a bit though.
cle.set_wait_for_kernel_finish(True)

# Select a GPU with the following string in its name. This will fall back to
# any other GPU if none with this name is found.
cle.select_device('RTX')


# In[2]:


# test data
test_image = np.random.random([100, 512, 1024])


# ## clEsperanto

# In[3]:


# transpose with pyclesperanto
result_image = None

test_image_gpu = cle.push_zyx(test_image)

for i in range(0, 10):
    start_time = time.time()
    result_image = cle.transpose_xz(test_image_gpu, result_image)
    print("pyclesperanto transpose duration: " + str(time.time() - start_time))

print(result_image.shape)


# ## cupy

# In[4]:


# transpose with cupy
result_image = None

cu_test_image = cp.asarray(test_image)

for i in range(0, 10):
    start_time = time.time()
    result_image = cp.transpose(cu_test_image, (2, 1, 0))
    # we need to wait here to measure time properly
    cp.cuda.stream.get_current_stream().synchronize()
    print("cupy transpose duration: " + str(time.time() - start_time))

print(result_image.shape)


# ## numpy

# In[5]:


# transpose with numpy
# note: np.transpose returns a view without copying the data, so its timings
# are not directly comparable to implementations that materialize the result
result_image = None

for i in range(0, 10):
    start_time = time.time()
    result_image = np.transpose(test_image, (2, 1, 0))
    print("numpy transpose duration: " + str(time.time() - start_time))

print(result_image.shape)
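
# ## Sanity check
# The cell below is a minimal sketch (not part of the original benchmark) that
# verifies the three implementations produce the same transposed array. It
# assumes the cells above were run so that `test_image` is in scope, and that
# `cle.transpose_xz` on a zyx-ordered stack corresponds to
# `np.transpose(..., (2, 1, 0))`.

# In[ ]:


# pull the pyclesperanto result back from the GPU as a numpy array
cle_result = cle.pull_zyx(cle.transpose_xz(cle.push_zyx(test_image)))

# bring the cupy result back to the host
cupy_result = cp.asnumpy(cp.transpose(cp.asarray(test_image), (2, 1, 0)))

numpy_result = np.transpose(test_image, (2, 1, 0))

# all three should agree element-wise (up to float32 precision on the GPU)
print(np.allclose(cle_result, numpy_result))
print(np.allclose(cupy_result, numpy_result))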