#!/usr/bin/env python
# coding: utf-8

# # Numba 0.51 CUDA Release demo
#
# Key changes to the CUDA target include:
#
# * Support for CUDA Toolkit 11, Ampere, and Compute Capability 8.0
# * Stream callbacks ([demo](#Stream-callbacks))
# * Async awaitable streams ([demo](#Async-awaitable-streams))
# * Printing of SASS code for kernels ([demo](#Inspecting-SASS-code))
# * Atomic `nanmin` and `nanmax` ([demo](#Atomic-nanmin-and-nanmax))

# In[1]:


# A couple of useful imports
from numba import cuda
import numpy as np


# ## Stream callbacks
#
# Adding a callback to a stream allows a function on the host to be called when all the items presently enqueued on the stream have completed. This can be useful for notifying the host code that a certain sequence of operations has completed.
#
# The CUDA Toolkit Samples include an example of using stream callbacks in a multithreaded application in the `0_Simple/simpleCallback` directory - this notebook demonstrates the CUDA Python API only.
#
# First, let's define a kernel and a callback function:

# In[2]:


# A simple kernel to add two arrays
@cuda.jit
def add_kernel(r, x, y):
    i = cuda.grid(1)
    if i < len(r):
        r[i] = x[i] + y[i]


# Python function for the callback.
# `data` is the data value set when the callback was added (see below).
def print_callback(stream, status, data):
    print(f'Callback function reached! Data is {data}')


# Now we'll create a stream, use it to transfer data and launch a kernel, and then enqueue a callback:

# In[3]:


# Create a stream
s1 = cuda.stream()

# Transfer all data to the device on the stream
n_elements = 256
x = cuda.to_device(np.random.random(n_elements), stream=s1)
y = cuda.to_device(np.random.random(n_elements), stream=s1)
r = cuda.device_array_like(x, stream=s1)

# Launch the kernel on the stream
add_kernel[1, n_elements, s1](r, x, y)

# Add a callback that will be called on the host when the kernel launch is complete.
# The first parameter is the function to call.
# The second parameter is passed into the `data` (3rd) argument of the callback.
s1.add_callback(print_callback, 'HELLO WORLD')
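
# Not part of the original demo - a minimal sanity check of the result computed
# above. `Stream.synchronize()` blocks the host until all work enqueued on the
# stream (including the callback) has completed, after which the result can be
# copied back and verified:

# In[ ]:


# Wait for everything queued on s1 to finish
s1.synchronize()

# Copy the operands and the result back to the host and check the sum
np.testing.assert_allclose(r.copy_to_host(),
                           x.copy_to_host() + y.copy_to_host())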

# Callbacks can also be used to notify `Event` objects. For example:

# In[4]:


import threading

# Create a stream and an event that will be notified
s2 = cuda.stream()
callback_event = threading.Event()


# Define a callback function to notify the event
def notify_callback(stream, status, event):
    event.set()


# Add the callback to the stream:
s2.add_callback(notify_callback, callback_event)

# Wait for the event
print("Waiting for event")
ret = callback_event.wait(1.0)
print(f"Waiting returned: {ret}")


# ## Async awaitable streams
#
# `Stream.async_done()` returns an awaitable that resolves once all preceding stream operations are complete:

# In[5]:


import asyncio

# Define a new stream
s3 = cuda.stream()

# Directly awaiting on async_done works in the notebook as it already
# has a running event loop
await s3.async_done()


# An example that creates multiple tasks on multiple streams and gathers the results:

# In[6]:


async def async_cuda_fn(value_in):
    stream = cuda.stream()
    h_src, h_dst = cuda.pinned_array(8), cuda.pinned_array(8)
    h_src[:] = value_in
    d_ary = cuda.to_device(h_src, stream=stream)
    d_ary.copy_to_host(h_dst, stream=stream)
    await stream.async_done()
    return h_dst.mean()

values_in = [1, 2, 3, 4]
tasks = [asyncio.create_task(async_cuda_fn(v)) for v in values_in]
values_out = await asyncio.gather(*tasks)
values_out


# ## Inspecting SASS code
#
# Let's define a simple kernel and compile it eagerly:

# In[7]:


@cuda.jit('void(float32[::1], float32[::1], float32[::1])')
def add(r, x, y):
    i = cuda.grid(1)
    if i < len(r):
        r[i] = x[i] + y[i]


# Now we can print the SASS code:

# In[8]:


print(add.inspect_sass())


# Note that SASS inspection uses `nvdisasm` from the [CUDA Binary Utilities](https://docs.nvidia.com/cuda/cuda-binary-utilities/index.html), so the full CUDA toolkit must be installed - the conda `cudatoolkit` package does not include `nvdisasm`.

# ## Atomic `nanmin` and `nanmax`
#
# Numba 0.50 changed the semantics of `cuda.atomic.min` and `cuda.atomic.max` to more closely match those of Python's built-in `max` and `numpy.max`, which return the first argument if one of the operands is a NaN. The old behaviour, which treats NaN values as missing data, is now implemented as `cuda.atomic.nanmin` and `cuda.atomic.nanmax` (cf. [`nanmin`](https://numpy.org/doc/stable/reference/generated/numpy.nanmin.html) and [`nanmax`](https://numpy.org/doc/stable/reference/generated/numpy.nanmax.html) in NumPy).
#
# To demonstrate the difference, we'll define two kernels, one using `max` and the other using `nanmax`:

# In[9]:


@cuda.jit
def atomic_max_array(val, arr):
    i = cuda.grid(1)
    cuda.atomic.max(arr, i, val[0])


@cuda.jit
def atomic_nanmax_array(val, arr):
    i = cuda.grid(1)
    cuda.atomic.nanmax(arr, i, val[0])


# Now we define the input arrays - we'll set the value of one element to NaN:

# In[10]:


n_elements = 8
x_max = np.arange(n_elements, dtype=np.float32)
x_max[5] = np.nan

# A copy of the input for the other kernel
x_nanmax = x_max.copy()

# Print an array to show the NaN in position:
x_max


# Now we'll launch the kernels on our arrays:

# In[11]:


# A value to compare the array values with
val = np.ones(1, dtype=np.float32) + 2

# Launch kernels
atomic_max_array[1, n_elements](val, x_max)
atomic_nanmax_array[1, n_elements](val, x_nanmax)


# Looking at the output from `cuda.atomic.max`, we see that it didn't treat the NaN value as missing data, but instead left it in place:

# In[12]:


x_max


# Whereas `cuda.atomic.nanmax` treated the NaN as missing data and replaced it with the comparison value:

# In[13]:


x_nanmax
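
# As a host-side cross-check (not part of the original demo): for this particular
# input, where the NaN is in the array rather than in the comparison value, NumPy's
# `np.maximum` (which propagates NaN) and `np.fmax` (which treats NaN as missing
# data) give the same results as `cuda.atomic.max` and `cuda.atomic.nanmax`
# respectively. A minimal sketch:

# In[ ]:


# Rebuild the original input (the kernels modified x_max and x_nanmax in place)
ref = np.arange(n_elements, dtype=np.float32)
ref[5] = np.nan

# assert_array_equal treats NaNs in matching positions as equal
np.testing.assert_array_equal(x_max, np.maximum(ref, val[0]))
np.testing.assert_array_equal(x_nanmax, np.fmax(ref, val[0]))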