#!/usr/bin/env python
# coding: utf-8

# # Numba 0.51 CUDA Release demo
#
# Key changes to the CUDA target include:
#
# * Support for CUDA Toolkit 11, Ampere, and Compute Capability 8.0
# * Stream callbacks ([demo](#Stream-callbacks))
# * Async awaitable streams ([demo](#Async-awaitable-streams))
# * Printing of SASS code for kernels ([demo](#Inspecting-SASS-code))
# * Atomic `nanmin` and `nanmax` ([demo](#Atomic-nanmin-and-nanmax))

# In[1]:


# A couple of useful imports
from numba import cuda
import numpy as np


# ## Stream callbacks
#
# Adding a callback to a stream allows a function on the host to be called when all the items presently enqueued on the stream have completed. This can be useful for notifying the host code that a certain sequence of operations has completed.
#
# The CUDA Toolkit Samples include an example of using stream callbacks in a multithreaded application in the `0_Simple/simpleCallback` directory - this notebook demonstrates the CUDA Python API only.
#
# First, let's define a kernel and a callback function:

# In[2]:


# A simple kernel to add two arrays
@cuda.jit
def add_kernel(r, x, y):
    i = cuda.grid(1)
    if i < len(r):
        r[i] = x[i] + y[i]


# Python function for the callback.
# `data` is the data value set when the callback was added (see below).
def print_callback(stream, status, data):
    print(f'Callback function reached! Data is {data}')


# Now we'll create a stream, use it to transfer data and launch a kernel, and then enqueue a callback:

# In[3]:


# Create a stream
s1 = cuda.stream()

# Transfer all data to the device on the stream
n_elements = 256
x = cuda.to_device(np.random.random(n_elements), stream=s1)
y = cuda.to_device(np.random.random(n_elements), stream=s1)
r = cuda.device_array_like(x, stream=s1)

# Launch the kernel on the stream
add_kernel[1, n_elements, s1](r, x, y)

# Add a callback that will be called on the host when the kernel launch is complete.
# The first parameter is the function to call.
# The second parameter is passed into the `data` (3rd) argument of the callback.
s1.add_callback(print_callback, 'HELLO WORLD')
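
# Not part of the original demo - a minimal sanity check of the result computed
# above. `Stream.synchronize()` blocks the host until all work enqueued on the
# stream (including the callback) has completed, after which the result can be
# copied back and verified:

# In[ ]:


# Wait for everything queued on s1 to finish
s1.synchronize()

# Copy the operands and the result back to the host and check the sum
np.testing.assert_allclose(r.copy_to_host(),
                           x.copy_to_host() + y.copy_to_host())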

# Callbacks can also be used to notify `Event` objects. For example:

# In[4]:


import threading

# Create a stream and an event that will be notified
s2 = cuda.stream()
callback_event = threading.Event()


# Define a callback function to notify the event
def notify_callback(stream, status, event):
    event.set()


# Add the callback to the stream:
s2.add_callback(notify_callback, callback_event)

# Wait for the event
print("Waiting for event")
ret = callback_event.wait(1.0)
print(f"Waiting returned: {ret}")


# ## Async awaitable streams
#
# `Stream.async_done()` returns an awaitable that resolves once all preceding stream operations are complete:

# In[5]:


import asyncio

# Define a new stream
s3 = cuda.stream()

# Directly awaiting on async_done works in the notebook as it already
# has a running event loop
await s3.async_done()


# An example that creates multiple tasks on multiple streams and gathers the results:

# In[6]:


async def async_cuda_fn(value_in):
    stream = cuda.stream()
    h_src, h_dst = cuda.pinned_array(8), cuda.pinned_array(8)
    h_src[:] = value_in
    d_ary = cuda.to_device(h_src, stream=stream)
    d_ary.copy_to_host(h_dst, stream=stream)
    await stream.async_done()
    return h_dst.mean()

values_in = [1, 2, 3, 4]
tasks = [asyncio.create_task(async_cuda_fn(v)) for v in values_in]
values_out = await asyncio.gather(*tasks)
values_out


# ## Inspecting SASS code
#
# Let's define a simple kernel and compile it eagerly:

# In[7]:


@cuda.jit('void(float32[::1], float32[::1], float32[::1])')
def add(r, x, y):
    i = cuda.grid(1)
    if i < len(r):
        r[i] = x[i] + y[i]


# Now we can print the SASS code:

# In[8]:


print(add.inspect_sass())


# Note that SASS inspection uses `nvdisasm` from the [CUDA Binary Utilities](https://docs.nvidia.com/cuda/cuda-binary-utilities/index.html), so the full CUDA toolkit must be installed - the conda `cudatoolkit` package does not include `nvdisasm`.

# ## Atomic `nanmin` and `nanmax`
#
# Numba 0.50 changed the semantics of `cuda.atomic.min` and `cuda.atomic.max` to more closely match those of Python's built-in `max` and `numpy.max`, which return the first argument if one of the operands is a NaN. The old behaviour, which treats NaN values as missing data, is now implemented as `cuda.atomic.nanmin` and `cuda.atomic.nanmax` (cf. [`nanmin`](https://numpy.org/doc/stable/reference/generated/numpy.nanmin.html) and [`nanmax`](https://numpy.org/doc/stable/reference/generated/numpy.nanmax.html) in NumPy).
#
# To demonstrate the difference, we'll define two kernels, one using `max` and the other using `nanmax`:

# In[9]:


@cuda.jit
def atomic_max_array(val, arr):
    i = cuda.grid(1)
    cuda.atomic.max(arr, i, val[0])


@cuda.jit
def atomic_nanmax_array(val, arr):
    i = cuda.grid(1)
    cuda.atomic.nanmax(arr, i, val[0])


# Now we define the input arrays - we'll set the value of one element to NaN:

# In[10]:


n_elements = 8
x_max = np.arange(n_elements, dtype=np.float32)
x_max[5] = np.nan

# A copy of the input for the other kernel
x_nanmax = x_max.copy()

# Print an array to show the NaN in position:
x_max


# Now we'll launch the kernels on our arrays:

# In[11]:


# A value to compare the array values with
val = np.ones(1, dtype=np.float32) + 2

# Launch kernels
atomic_max_array[1, n_elements](val, x_max)
atomic_nanmax_array[1, n_elements](val, x_nanmax)


# Looking at the output from `cuda.atomic.max`, we see that it didn't treat the NaN value as missing data, but instead left it in place:

# In[12]:


x_max


# Whereas `cuda.atomic.nanmax` treated the NaN as missing data and replaced it with the comparison value:

# In[13]:


x_nanmax
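
# As a host-side cross-check (not part of the original demo): for this particular
# input, where the NaN is in the array rather than in the comparison value, NumPy's
# `np.maximum` (which propagates NaN) and `np.fmax` (which treats NaN as missing
# data) give the same results as `cuda.atomic.max` and `cuda.atomic.nanmax`
# respectively. A minimal sketch:

# In[ ]:


# Rebuild the original input (the kernels modified x_max and x_nanmax in place)
ref = np.arange(n_elements, dtype=np.float32)
ref[5] = np.nan

# assert_array_equal treats NaNs in matching positions as equal
np.testing.assert_array_equal(x_max, np.maximum(ref, val[0]))
np.testing.assert_array_equal(x_nanmax, np.fmax(ref, val[0]))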