#!/usr/bin/env python
# coding: utf-8

# # Numba 0.52 CUDA release demo
#
# Key changes to the CUDA target for Release 0.52 include:
#
# * Support for Unified Memory on Linux (experimental support only on Windows) (@maxpkatz)
# * Reduced kernel launch overhead for eagerly-compiled kernels (@gmarkall)
# * Access to all libdevice functions (@gmarkall)
# * Support for atomic subtraction (@testhound)
# * Additional `math` library functions (@zhihaoy)
# * Support for complex power (@gmarkall)
# * New convenience functions for creating mapped and pinned arrays like existing arrays (@c200chromebook)

# In[1]:


# The usual imports
from numba import cuda, float32, njit, void
import math
import numpy as np
import time


# ## Unified memory
#
# Unified Memory provides a single address space accessible by any CPU or GPU in a system, backed by a paging implementation that automatically ensures data is moved to a device or host only when it is needed. Some use cases this enables:
#
# * Allocating arrays accessible on the device that are larger than the device memory, paged out to system RAM.
# * Operations on data from both the CPU and GPU with no copying on Tegra systems.
#
# To allocate a managed array, use the new `cuda.managed_array()` function:

# In[2]:


# A small array
arr = cuda.managed_array(100, dtype=np.float64)


# An example using an array that is larger than GPU memory (note that if you don't have enough system memory, this may cause the kernel to be killed):

# In[3]:


# If you have more than, or a lot less than, 16GB of GPU RAM then edit this:
GB = 16
n_elements = (GB + 1) * (1024 * 1024 * 1024)

# Create a very large array
big_arr = cuda.managed_array(n_elements, dtype=np.uint8)


# Now we can set the memory on the device:

# In[4]:


@cuda.jit
def initialize_array(x):
    start, stride = cuda.grid(1), cuda.gridsize(1)
    for i in range(start, len(x), stride):
        x[i] = 0xAB

initialize_array[1024, 1024](big_arr)


# Now we can verify that all elements are set as expected:

# In[5]:


@njit
def check(x):
    difference = False
    for i in range(len(x)):
        if x[i] != 0xAB:
            difference = True

    if difference:
        print("Difference detected!")
    else:
        print("All values as expected!")

check(big_arr)


# ## Kernel launch overhead
#
# Launch overhead for eagerly-compiled kernels (those where `@cuda.jit` is given a signature) has been reduced in Numba 0.52. The following code benchmarks launch overhead as a function of the number of kernel arguments:

# In[6]:


@cuda.jit('void()')
def some_kernel_1():
    return

@cuda.jit('void(float32[:])')
def some_kernel_2(arr1):
    return

@cuda.jit('void(float32[:],float32[:])')
def some_kernel_3(arr1, arr2):
    return

@cuda.jit('void(float32[:],float32[:],float32[:])')
def some_kernel_4(arr1, arr2, arr3):
    return

@cuda.jit('void(float32[:],float32[:],float32[:],float32[:])')
def some_kernel_5(arr1, arr2, arr3, arr4):
    return

arr = cuda.device_array(10000, dtype=np.float32)

get_ipython().run_line_magic('timeit', 'some_kernel_1[1, 1]()')
get_ipython().run_line_magic('timeit', 'some_kernel_2[1, 1](arr)')
get_ipython().run_line_magic('timeit', 'some_kernel_3[1, 1](arr,arr)')
get_ipython().run_line_magic('timeit', 'some_kernel_4[1, 1](arr,arr,arr)')
get_ipython().run_line_magic('timeit', 'some_kernel_5[1, 1](arr,arr,arr,arr)')


# The results of this benchmark on an HP Z8 G4 with a Xeon Gold 6128 and a Quadro RTX 8000 are:
#
# Numba 0.51.2:
#
# ```
# 32.3 µs ± 461 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
# 64 µs ± 501 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
# 86.6 µs ± 925 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
# 106 µs ± 24.6 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
# 125 µs ± 165 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
# ```
#
# Numba 0.52:
#
# ```
# 20 µs ± 72.2 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
# 32.4 µs ± 30.1 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
# 41 µs ± 176 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
# 48.6 µs ± 12.6 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
# 56.7 µs ± 262 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
# ```
#
# Your results may vary depending on your configuration - try out the benchmark with 0.51.2 and 0.52!
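# As a rough sketch (not part of the release notes), the reduced overhead for eagerly-compiled kernels can also be illustrated without `%timeit` by timing a batch of launches directly and comparing against a lazily-compiled kernel (one declared without a signature). The `lazy_kernel` and `launch_time` names below are invented for this example, which reuses `arr` and `some_kernel_2` from the benchmark above; absolute numbers depend on your configuration.

# In[ ]:


# A minimal sketch: time many launches of a lazily-compiled kernel and an
# eagerly-compiled kernel, and report the mean time per launch.
@cuda.jit
def lazy_kernel(arr1):
    return

# Warm up both kernels so compilation cost is excluded from the measurement
lazy_kernel[1, 1](arr)
some_kernel_2[1, 1](arr)

def launch_time(kernel, *args, runs=10000):
    start = time.perf_counter()
    for _ in range(runs):
        kernel[1, 1](*args)
    cuda.synchronize()
    return (time.perf_counter() - start) / runs * 1e6  # mean µs per launch

print(f"Lazily-compiled kernel:  {launch_time(lazy_kernel, arr):.1f} µs per launch")
print(f"Eagerly-compiled kernel: {launch_time(some_kernel_2, arr):.1f} µs per launch")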
# ## Libdevice functions
#
# All [CUDA libdevice functions](https://docs.nvidia.com/cuda/libdevice-users-guide/index.html) (with the exception of `__nv_nan` and `__nv_nanf`) are now available in the `cuda.libdevice` module. The leading `__nv_` is stripped from the names - for example, [`__nv_fast_cosf`](https://docs.nvidia.com/cuda/libdevice-users-guide/__nv_fast_cosf.html#__nv_fast_cosf) is available as `libdevice.fast_cosf`. Here's an example showing some of the fast trigonometric functions in use:

# In[7]:


from numba.cuda import libdevice

# Implementation using the standard trigonometric functions
@cuda.jit(void(float32[::1], float32[::1], float32[::1]))
def trig_functions(r, x, y):
    i = cuda.grid(1)
    if i < len(r):
        r[i] = math.sin(x[i]) * math.cos(y[i]) + math.tan(x[i] + y[i])

# Implementation using the fast trigonometric functions
@cuda.jit(void(float32[::1], float32[::1], float32[::1]))
def fast_trig_functions(r, x, y):
    i = cuda.grid(1)
    if i < len(r):
        r[i] = libdevice.fast_sinf(x[i]) * libdevice.fast_cosf(y[i]) + libdevice.fast_tanf(x[i] + y[i])

# Create some random input
N = 100000
np.random.seed(1)
x = np.random.random(N).astype(np.float32)
y = np.random.random(N).astype(np.float32)

# Copy input to the device and allocate space for output
d_x = cuda.to_device(x)
d_y = cuda.to_device(y)
r_math = cuda.device_array_like(x)
r_libdevice = cuda.device_array_like(x)

n_runs = 100
n_threads = 256
n_blocks = math.ceil(N / n_threads)

# Run and time the normal version
start_math = time.perf_counter()
for i in range(n_runs):
    trig_functions[n_blocks, n_threads](r_math, d_x, d_y)
cuda.synchronize()
end_math = time.perf_counter()

# Run and time the version using fast trig functions
start_libdevice = time.perf_counter()
for i in range(n_runs):
    fast_trig_functions[n_blocks, n_threads](r_libdevice, d_x, d_y)
cuda.synchronize()
end_libdevice = time.perf_counter()

# Note that the fast versions of the functions sacrifice accuracy for speed,
# so a looser-than-default relative tolerance is required for this sanity check.
np.testing.assert_allclose(r_math.copy_to_host(), r_libdevice.copy_to_host(), rtol=1.0e-2)

# Note that timings will be fairly similar for this example, as the execution time will be
# dominated by the kernel launch time.
print(f"Standard version time:  {(end_math - start_math) * 1000 / n_runs:.3f} ms per launch")
print(f"Libdevice version time: {(end_libdevice - start_libdevice) * 1000 / n_runs:.3f} ms per launch")
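# Because the whole of libdevice is exposed, functions with no direct `math`-module equivalent are also available. The short sketch below is not from the release notes - the kernel and array names are invented - and it uses `libdevice.rsqrtf`, the stripped-prefix name that the convention above implies for `__nv_rsqrtf`, to compute reciprocal square roots. It reuses `N`, `n_threads` and `n_blocks` from the previous cell.

# In[ ]:


# A minimal illustrative sketch: reciprocal square root via libdevice
@cuda.jit(void(float32[::1], float32[::1]))
def reciprocal_sqrt(r, x):
    i = cuda.grid(1)
    if i < len(r):
        r[i] = libdevice.rsqrtf(x[i])

np.random.seed(4)
x_rsqrt = (np.random.random(N) + 0.5).astype(np.float32)  # keep inputs away from zero
d_x_rsqrt = cuda.to_device(x_rsqrt)
r_rsqrt = cuda.device_array_like(x_rsqrt)

reciprocal_sqrt[n_blocks, n_threads](r_rsqrt, d_x_rsqrt)

# Sanity check against the same computation on the host
np.testing.assert_allclose(r_rsqrt.copy_to_host(), 1.0 / np.sqrt(x_rsqrt), rtol=1.0e-5)
print(r_rsqrt.copy_to_host()[:5])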
# ## Atomic subtract
#
# Atomic subtraction is now supported. The following example subtracts several values from an element of an array with every thread contending on the same location:

# In[8]:


@cuda.jit
def subtract_example(x, values):
    i = cuda.grid(1)
    cuda.atomic.sub(x, 0, values[i])

initial = 12345.0
n_blocks = 4
n_threads = 32
n_values = n_blocks * n_threads

values = np.arange(n_values, dtype=np.float32)
x = np.zeros(1, dtype=np.float32)
x[0] = initial

subtract_example[n_blocks, n_threads](x, values)

# Floating point subtraction is not associative - the order in which subtractions
# occur can cause a slight variation, so we use assert_allclose instead of checking
# for exact equality.
np.testing.assert_allclose(x, [initial - np.sum(values)])
print(f"Result: {x[0]}")


# ## Math library functions
#
# The functions `math.frexp` and `math.ldexp` are now supported in CUDA kernels. Example:

# In[9]:


@cuda.jit
def cuda_frexp_ldexp(x, y):
    i = cuda.grid(1)
    if i < len(x):
        fractional, exponent = math.frexp(x[i])
        y[i] = math.ldexp(fractional, exponent)

np.random.seed(2)
n_values = 16384
n_threads = 256
n_blocks = n_values // n_threads

values = np.random.random(16384).astype(np.float32)
results = np.zeros_like(values)

cuda_frexp_ldexp[n_blocks, n_threads](values, results)

# Sanity check
np.testing.assert_equal(values, results)

# Print the first few values and results
print(values[:10])
print(results[:10])


# ## Powers of complex numbers
#
# It is now possible to write a kernel using the power operator on complex numbers. For example:

# In[10]:


@cuda.jit
def complex_power(r, x, y):
    i = cuda.grid(1)
    if i < len(r):
        r[i] = x[i] ** y[i]

np.random.seed(3)
n_values = 16384
n_threads = 256
n_blocks = n_values // n_threads

def random_complex():
    "Generate an array of random complex values"
    real = np.random.random(n_values)
    imag = np.random.random(n_values)
    return real + imag * 1j

x = random_complex()
y = random_complex()
r = np.zeros_like(x)

complex_power[n_blocks, n_threads](r, x, y)

# Sanity check
np.testing.assert_allclose(r, x ** y)

# Print the first few results and the same computed on the CPU for comparison
print(r[:10])
print(x[:10] ** y[:10])


# ## `mapped_array_like` and `pinned_array_like`
#
# In addition to `device_array_like`, `mapped_array_like` and `pinned_array_like` can be used for creating arrays like existing arrays:

# In[11]:


x = np.zeros(16384, dtype=np.int16)

d_x = cuda.device_array_like(x)
m_x = cuda.mapped_array_like(x)
p_x = cuda.pinned_array_like(x)
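# As an illustrative sketch of how these arrays might be used (the `fill_with_index` kernel below is invented for this example, and it assumes that mapped arrays can be passed to kernels and read directly on the host, while pinned arrays can be the target of `copy_to_host`):

# In[ ]:


@cuda.jit
def fill_with_index(out):
    i = cuda.grid(1)
    if i < len(out):
        out[i] = i

n_threads = 256
n_blocks = math.ceil(len(x) / n_threads)

# Write into the mapped array directly from the device, then read it on the
# host without an explicit copy
fill_with_index[n_blocks, n_threads](m_x)
cuda.synchronize()
print(m_x[:5])

# Fill the device array and copy the result into the pinned array; pinned host
# memory can make device-to-host transfers faster than pageable memory
fill_with_index[n_blocks, n_threads](d_x)
d_x.copy_to_host(p_x)
print(p_x[:5])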