!pip install numba # https://medium.com/@iphoenix179/running-cuda-c-c-in-jupyter-or-how-to-run-nvcc-in-google-colab-663d33f53772 # Needed for using cuda with Numba (https://numba.pydata.org/numba-doc/dev/user/installing.html#installing-using-pip-on-x86-x86-64-platforms) # Numba only supports cuda till 9.1 !apt update -qq; !wget https://developer.nvidia.com/compute/cuda/8.0/Prod2/local_installers/cuda-repo-ubuntu1604-8-0-local-ga2_8.0.61-1_amd64-deb !mv cuda-repo-ubuntu1604-8-0-local-ga2_8.0.61-1_amd64-deb cuda-repo-ubuntu1604-8-0-local-ga2_8.0.61-1_amd64.deb !dpkg -i cuda-repo-ubuntu1604-8-0-local-ga2_8.0.61-1_amd64.deb !apt-get update -qq; !apt-get install cuda-8-0 gcc-5 g++-5 -y -qq; !ln -s /usr/bin/gcc-5 /usr/local/cuda/bin/gcc; !ln -s /usr/bin/g++-5 /usr/local/cuda/bin/g++; import os os.environ["NUMBAPRO_CUDA_DRIVER"] = "/usr/lib64-nvidia/libcuda.so" os.environ["NUMBAPRO_NVVM"] = "/usr/local/cuda-8.0/nvvm/lib64/libnvvm.so" os.environ["NUMBAPRO_LIBDEVICE"] = "/usr/local/cuda-8.0/nvvm/libdevice/" #os.environ["LD_LIBRARY_PATH"] = "/usr/local/cuda/lib64" !numba -s import os import numpy as np # Numba supports many functions from numpy (https://numba.pydata.org/numba-doc/dev/reference/numpysupported.html) from numba import jit, njit, vectorize, cuda import math # Numba supports many functions from math (http://numba.pydata.org/numba-doc/0.17.0/reference/pysupported.html) import matplotlib.pyplot as plt a = np.ones((1, 100), dtype=np.float64) b = np.ones((100, 1), dtype=np.float64) # # Simple Python function # def func(a, b): for i in range(100000): constant = math.pow((a@b)[0][0], 1./2)/math.exp((a@b)[0][0]/1000) a = np.array([[constant]*100], dtype=np.float64) return a %timeit res = func(a, b) # # Numba with nopython = True # @njit # or @jit(nopython=True) def njit_func(a, b): for i in range(100000): constant = math.pow((a@b)[0][0], 1./2)/math.exp((a@b)[0][0]/1000) a = np.array([[constant]*100], dtype=np.float64) return a %timeit res = njit_func(a, b) # # Basic Numba compiler with type information provided # @jit('float64(float64, float64)') def jit_func(a, b): for i in range(100000): constant = math.pow((a@b)[0][0], 1./2)/math.exp((a@b)[0][0]/1000) a = np.array([[constant]*100], dtype=np.float64) return a %timeit res = jit_func(a, b) fig, ax = plt.subplots(1, 1) ax.plot(["a", "b", "c"], [821, 447, 440], "-o") # Results without caching ax.set_xticklabels(["pyFunc", "Jit", "Njit"]) fig.suptitle("Basic Numba Functionalities", fontsize=17) ax.set_ylabel("Time (ms)") ax.set_xlabel("Method") # # Making Ufuncs with Numba. # @vectorize def vec_func(a, b): # Now we can pass arrays too, and operate # inside like they are scalars: for i in range(100000): a = math.pow(a*b, 1./2)/math.exp(a*b/1000) return a # This is similar to functions before, for comparison. But... %timeit res = vec_func(a, b) # # This is slow because previously we were doing some # operations on 1,00,000 scalars obtained by multiplying # (a@b), but now we are multiplying individual elements # of a and b for 1,00,000 times. Also numba is taking care # of broadcasting too. So, in this case we are applying this # loop for 100 times. # res.shape # Previously it was (1, 100) @vectorize(['float64(float64, float64)'], target='parallel') def vecParallel_func(a, b): for i in range(100000): a = math.pow(a*b, 1./2)/math.exp(a*b/1000) return a %timeit res = vecParallel_func(a, b) @vectorize(['float64(float64, float64)'], target='cuda') def vecCuda_func(a, b): for i in range(100000): a = math.pow(a*b, 1./2)/math.exp(a*b/1000) return a %timeit res = vecCuda_func(a, b) # Woah!! fig, ax = plt.subplots(1, 1) ax.plot(["a", "b", "c"], [35.8, 19.2, 0.568], "-o") ax.set_xticklabels(["Vectorize", "Vectorize Parallel", "Vectorize Cuda"]) fig.suptitle("@vectorize wrapper", fontsize=17) ax.set_ylabel("Time (sec)") ax.set_xlabel("Method") @cuda.jit def cudaKernal_func(a, b, result): # cuda.jit does not return result yet pos = cuda.grid(1) if (pos < a.shape[1]) and (pos < b.shape[0]): for i in range(100000): result[pos] = math.exp(a[0][pos]*b[pos][0]) result = np.zeros((100,), dtype=np.float64) threadsperblock = 32 blockspergrid = (100 + 31) // 32 # blockspergrid = (array.size + (threadsperblock - 1)) // threadsperblock %timeit cudaKernal_func[threadsperblock, blockspergrid](a, b, result) result.shape # # Here, we have only used it for 1D arrays. You can use it for any Tensor. For eg: # For 2D array operations you would have used: x, y = cuda.grid(2) # @cuda.jit(device=True) def cudaDevice_func(a, b): for i in range(100000): a = math.exp(a*b) return a @cuda.jit def cudaKernal_func2(a, b, result): # cuda.jit does not return result yet pos = cuda.grid(1) if (pos < a.shape[1]) and (pos < b.shape[0]): result[pos] = cudaDevice_func(a[0][pos], b[pos][0]) %timeit cudaKernal_func2[threadsperblock, blockspergrid](a, b, result)