#!/usr/bin/env python
# coding: utf-8

# # Numba 0.52 CUDA release demo
#
# Key changes to the CUDA target for Release 0.52 include:
#
# * Support for Unified Memory on Linux (experimental support only on Windows) (@maxpkatz)
# * Reduced kernel launch overhead for eagerly-compiled kernels (@gmarkall)
# * Access to all libdevice functions (@gmarkall)
# * Support for atomic subtraction (@testhound)
# * Additional `math` library functions (@zhihaoy)
# * Support for complex power (@gmarkall)
# * New convenience functions for creating mapped and pinned arrays like existing arrays (@c200chromebook)

# In[1]:


# The usual imports
from numba import cuda, float32, njit, void
import math
import numpy as np
import time


# ## Unified memory
#
# Unified Memory provides a single address space accessible by any CPU or GPU in a system, backed by a paging implementation that automatically ensures data is moved to a device or host only when it is needed. Some use cases this enables:
#
# * Allocating arrays accessible on the device that are larger than the device memory, paged out to system RAM.
# * Operations on data from both the CPU and GPU with no copying on Tegra systems.
#
# To allocate a managed array, use the new `cuda.managed_array()` function:

# In[2]:


# A small array
arr = cuda.managed_array(100, dtype=np.float64)


# An example using an array that is larger than GPU memory (note that if you don't have enough system memory, this may cause the kernel to be killed):

# In[3]:


# If you have more than, or a lot less than, 16GB of GPU RAM then edit this:
GB = 16
n_elements = (GB + 1) * (1024 * 1024 * 1024)

# Create a very large array
big_arr = cuda.managed_array(n_elements, dtype=np.uint8)


# Now we can set the memory on the device:

# In[4]:


@cuda.jit
def initialize_array(x):
    start, stride = cuda.grid(1), cuda.gridsize(1)
    for i in range(start, len(x), stride):
        x[i] = 0xAB

initialize_array[1024, 1024](big_arr)


# Now we can verify that all elements are set as expected:

# In[5]:


@njit
def check(x):
    difference = False
    for i in range(len(x)):
        if x[i] != 0xAB:
            difference = True

    if difference:
        print("Difference detected!")
    else:
        print("All values as expected!")

check(big_arr)


# ## Kernel launch overhead
#
# Launch overhead for eagerly-compiled kernels (those where `@cuda.jit` is given a signature) has been reduced in Numba 0.52. The following code benchmarks launch overhead as a function of the number of kernel arguments:

# In[6]:


@cuda.jit('void()')
def some_kernel_1():
    return

@cuda.jit('void(float32[:])')
def some_kernel_2(arr1):
    return

@cuda.jit('void(float32[:],float32[:])')
def some_kernel_3(arr1, arr2):
    return

@cuda.jit('void(float32[:],float32[:],float32[:])')
def some_kernel_4(arr1, arr2, arr3):
    return

@cuda.jit('void(float32[:],float32[:],float32[:],float32[:])')
def some_kernel_5(arr1, arr2, arr3, arr4):
    return

arr = cuda.device_array(10000, dtype=np.float32)

get_ipython().run_line_magic('timeit', 'some_kernel_1[1, 1]()')
get_ipython().run_line_magic('timeit', 'some_kernel_2[1, 1](arr)')
get_ipython().run_line_magic('timeit', 'some_kernel_3[1, 1](arr,arr)')
get_ipython().run_line_magic('timeit', 'some_kernel_4[1, 1](arr,arr,arr)')
get_ipython().run_line_magic('timeit', 'some_kernel_5[1, 1](arr,arr,arr,arr)')


# The results of this benchmark on an HP Z8 G4 with a Xeon Gold 6128 and a Quadro RTX 8000 are:
#
# Numba 0.51.2:
#
# ```
# 32.3 µs ± 461 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
# 64 µs ± 501 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
# 86.6 µs ± 925 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
# 106 µs ± 24.6 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
# 125 µs ± 165 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
# ```
#
# Numba 0.52:
#
# ```
# 20 µs ± 72.2 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
# 32.4 µs ± 30.1 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
# 41 µs ± 176 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
# 48.6 µs ± 12.6 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
# 56.7 µs ± 262 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
# ```
#
# Your results may vary depending on your configuration - try out the benchmark with 0.51.2 and 0.52!
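# As a rough sketch (not part of the release notes), the reduced overhead for eagerly-compiled kernels can also be illustrated without `%timeit` by timing a batch of launches directly and comparing against a lazily-compiled kernel (one declared without a signature). The `lazy_kernel` and `launch_time` names below are invented for this example, which reuses `arr` and `some_kernel_2` from the benchmark above; absolute numbers depend on your configuration.

# In[ ]:


# A minimal sketch: time many launches of a lazily-compiled kernel and an
# eagerly-compiled kernel, and report the mean time per launch.
@cuda.jit
def lazy_kernel(arr1):
    return

# Warm up both kernels so compilation cost is excluded from the measurement
lazy_kernel[1, 1](arr)
some_kernel_2[1, 1](arr)

def launch_time(kernel, *args, runs=10000):
    start = time.perf_counter()
    for _ in range(runs):
        kernel[1, 1](*args)
    cuda.synchronize()
    return (time.perf_counter() - start) / runs * 1e6  # mean µs per launch

print(f"Lazily-compiled kernel:  {launch_time(lazy_kernel, arr):.1f} µs per launch")
print(f"Eagerly-compiled kernel: {launch_time(some_kernel_2, arr):.1f} µs per launch")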
# ## Libdevice functions
#
# All [CUDA libdevice functions](https://docs.nvidia.com/cuda/libdevice-users-guide/index.html) (with the exception of `__nv_nan` and `__nv_nanf`) are now available in the `cuda.libdevice` module. The leading `__nv_` is stripped from the names - for example, [`__nv_fast_cosf`](https://docs.nvidia.com/cuda/libdevice-users-guide/__nv_fast_cosf.html#__nv_fast_cosf) is available as `libdevice.fast_cosf`. Here's an example showing some of the fast trigonometric functions in use:

# In[7]:


from numba.cuda import libdevice

# Implementation using the standard trigonometric functions
@cuda.jit(void(float32[::1], float32[::1], float32[::1]))
def trig_functions(r, x, y):
    i = cuda.grid(1)
    if i < len(r):
        r[i] = math.sin(x[i]) * math.cos(y[i]) + math.tan(x[i] + y[i])

# Implementation using the fast trigonometric functions
@cuda.jit(void(float32[::1], float32[::1], float32[::1]))
def fast_trig_functions(r, x, y):
    i = cuda.grid(1)
    if i < len(r):
        r[i] = libdevice.fast_sinf(x[i]) * libdevice.fast_cosf(y[i]) + libdevice.fast_tanf(x[i] + y[i])

# Create some random input
N = 100000
np.random.seed(1)
x = np.random.random(N).astype(np.float32)
y = np.random.random(N).astype(np.float32)

# Copy input to the device and allocate space for output
d_x = cuda.to_device(x)
d_y = cuda.to_device(y)
r_math = cuda.device_array_like(x)
r_libdevice = cuda.device_array_like(x)

n_runs = 100
n_threads = 256
n_blocks = math.ceil(N / n_threads)

# Run and time the normal version
start_math = time.perf_counter()
for i in range(n_runs):
    trig_functions[n_blocks, n_threads](r_math, d_x, d_y)
cuda.synchronize()
end_math = time.perf_counter()

# Run and time the version using fast trig functions
start_libdevice = time.perf_counter()
for i in range(n_runs):
    fast_trig_functions[n_blocks, n_threads](r_libdevice, d_x, d_y)
cuda.synchronize()
end_libdevice = time.perf_counter()

# Note that the fast versions of the functions sacrifice accuracy for speed,
# so a looser-than-default relative tolerance is required for this sanity check.
np.testing.assert_allclose(r_math.copy_to_host(), r_libdevice.copy_to_host(), rtol=1.0e-2)

# Note that timings will be fairly similar for this example, as the execution time will be
# dominated by the kernel launch time.
print(f"Standard version time:  {(end_math - start_math) * 1000 / n_runs:.3f} ms per launch")
print(f"Libdevice version time: {(end_libdevice - start_libdevice) * 1000 / n_runs:.3f} ms per launch")
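# Because the whole of libdevice is exposed, functions with no direct `math`-module equivalent are also available. The short sketch below is not from the release notes - the kernel and array names are invented - and it uses `libdevice.rsqrtf`, the stripped-prefix name that the convention above implies for `__nv_rsqrtf`, to compute reciprocal square roots. It reuses `N`, `n_threads` and `n_blocks` from the previous cell.

# In[ ]:


# A minimal illustrative sketch: reciprocal square root via libdevice
@cuda.jit(void(float32[::1], float32[::1]))
def reciprocal_sqrt(r, x):
    i = cuda.grid(1)
    if i < len(r):
        r[i] = libdevice.rsqrtf(x[i])

np.random.seed(4)
x_rsqrt = (np.random.random(N) + 0.5).astype(np.float32)  # keep inputs away from zero
d_x_rsqrt = cuda.to_device(x_rsqrt)
r_rsqrt = cuda.device_array_like(x_rsqrt)

reciprocal_sqrt[n_blocks, n_threads](r_rsqrt, d_x_rsqrt)

# Sanity check against the same computation on the host
np.testing.assert_allclose(r_rsqrt.copy_to_host(), 1.0 / np.sqrt(x_rsqrt), rtol=1.0e-5)
print(r_rsqrt.copy_to_host()[:5])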
# ## Atomic subtract
#
# Atomic subtraction is now supported. The following example subtracts several values from an element of an array with every thread contending on the same location:

# In[8]:


@cuda.jit
def subtract_example(x, values):
    i = cuda.grid(1)
    cuda.atomic.sub(x, 0, values[i])

initial = 12345.0
n_blocks = 4
n_threads = 32
n_values = n_blocks * n_threads

values = np.arange(n_values, dtype=np.float32)
x = np.zeros(1, dtype=np.float32)
x[0] = initial

subtract_example[n_blocks, n_threads](x, values)

# Floating point subtraction is not associative - the order in which subtractions
# occur can cause a slight variation, so we use assert_allclose instead of checking
# for exact equality.
np.testing.assert_allclose(x, [initial - np.sum(values)])
print(f"Result: {x[0]}")


# ## Math library functions
#
# The functions `math.frexp` and `math.ldexp` are now supported in CUDA kernels. Example:

# In[9]:


@cuda.jit
def cuda_frexp_ldexp(x, y):
    i = cuda.grid(1)
    if i < len(x):
        fractional, exponent = math.frexp(x[i])
        y[i] = math.ldexp(fractional, exponent)

np.random.seed(2)
n_values = 16384
n_threads = 256
n_blocks = n_values // n_threads

values = np.random.random(16384).astype(np.float32)
results = np.zeros_like(values)

cuda_frexp_ldexp[n_blocks, n_threads](values, results)

# Sanity check
np.testing.assert_equal(values, results)

# Print the first few values and results
print(values[:10])
print(results[:10])


# ## Powers of complex numbers
#
# It is now possible to write a kernel using the power operator on complex numbers. For example:

# In[10]:


@cuda.jit
def complex_power(r, x, y):
    i = cuda.grid(1)
    if i < len(r):
        r[i] = x[i] ** y[i]

np.random.seed(3)
n_values = 16384
n_threads = 256
n_blocks = n_values // n_threads

def random_complex():
    "Generate an array of random complex values"
    real = np.random.random(n_values)
    imag = np.random.random(n_values)
    return real + imag * 1j

x = random_complex()
y = random_complex()
r = np.zeros_like(x)

complex_power[n_blocks, n_threads](r, x, y)

# Sanity check
np.testing.assert_allclose(r, x ** y)

# Print the first few results and the same computed on the CPU for comparison
print(r[:10])
print(x[:10] ** y[:10])


# ## `mapped_array_like` and `pinned_array_like`
#
# In addition to `device_array_like`, `mapped_array_like` and `pinned_array_like` can be used for creating arrays like existing arrays:

# In[11]:


x = np.zeros(16384, dtype=np.int16)

d_x = cuda.device_array_like(x)
m_x = cuda.mapped_array_like(x)
p_x = cuda.pinned_array_like(x)
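# As an illustrative sketch of how these arrays might be used (the `fill_with_index` kernel below is invented for this example, and it assumes that mapped arrays can be passed to kernels and read directly on the host, while pinned arrays can be the target of `copy_to_host`):

# In[ ]:


@cuda.jit
def fill_with_index(out):
    i = cuda.grid(1)
    if i < len(out):
        out[i] = i

n_threads = 256
n_blocks = math.ceil(len(x) / n_threads)

# Write into the mapped array directly from the device, then read it on the
# host without an explicit copy
fill_with_index[n_blocks, n_threads](m_x)
cuda.synchronize()
print(m_x[:5])

# Fill the device array and copy the result into the pinned array; pinned host
# memory can make device-to-host transfers faster than pageable memory
fill_with_index[n_blocks, n_threads](d_x)
d_x.copy_to_host(p_x)
print(p_x[:5])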