!pip install numba

# https://medium.com/@iphoenix179/running-cuda-c-c-in-jupyter-or-how-to-run-nvcc-in-google-colab-663d33f53772
# Needed for using cuda with Numba (https://numba.pydata.org/numba-doc/dev/user/installing.html#installing-using-pip-on-x86-x86-64-platforms)
# Numba only supports cuda till 9.1
!apt update -qq;
!wget https://developer.nvidia.com/compute/cuda/8.0/Prod2/local_installers/cuda-repo-ubuntu1604-8-0-local-ga2_8.0.61-1_amd64-deb
!mv cuda-repo-ubuntu1604-8-0-local-ga2_8.0.61-1_amd64-deb cuda-repo-ubuntu1604-8-0-local-ga2_8.0.61-1_amd64.deb
!dpkg -i cuda-repo-ubuntu1604-8-0-local-ga2_8.0.61-1_amd64.deb
!apt-get update -qq;
!apt-get install cuda-8-0 gcc-5 g++-5 -y -qq;
!ln -s /usr/bin/gcc-5 /usr/local/cuda/bin/gcc;
!ln -s /usr/bin/g++-5 /usr/local/cuda/bin/g++;

import os
os.environ["NUMBAPRO_CUDA_DRIVER"] = "/usr/lib64-nvidia/libcuda.so"
os.environ["NUMBAPRO_NVVM"] = "/usr/local/cuda-8.0/nvvm/lib64/libnvvm.so"
os.environ["NUMBAPRO_LIBDEVICE"] = "/usr/local/cuda-8.0/nvvm/libdevice/"
#os.environ["LD_LIBRARY_PATH"] = "/usr/local/cuda/lib64"

!numba -s

import os
import numpy as np # Numba supports many functions from numpy (https://numba.pydata.org/numba-doc/dev/reference/numpysupported.html)
from numba import jit, njit, vectorize, cuda
import math # Numba supports many functions from math (http://numba.pydata.org/numba-doc/0.17.0/reference/pysupported.html)

import matplotlib.pyplot as plt

a = np.ones((1, 100), dtype=np.float64)
b = np.ones((100, 1), dtype=np.float64)

#
# Simple Python function
#

def func(a, b):
  for i in range(100000):
    constant = math.pow((a@b)[0][0], 1./2)/math.exp((a@b)[0][0]/1000)
    a = np.array([[constant]*100], dtype=np.float64)
  return a

%timeit res = func(a, b)

#
# Numba with nopython = True
#

@njit      # or @jit(nopython=True)
def njit_func(a, b):
  for i in range(100000):
    constant = math.pow((a@b)[0][0], 1./2)/math.exp((a@b)[0][0]/1000)
    a = np.array([[constant]*100], dtype=np.float64)
  return a

%timeit res = njit_func(a, b)

#
# Basic Numba compiler with type information provided
#

@jit('float64(float64, float64)')
def jit_func(a, b):
  for i in range(100000):
    constant = math.pow((a@b)[0][0], 1./2)/math.exp((a@b)[0][0]/1000)
    a = np.array([[constant]*100], dtype=np.float64)
  return a

%timeit res = jit_func(a, b)

fig, ax = plt.subplots(1, 1)
ax.plot(["a", "b", "c"], [821, 447, 440], "-o") # Results without caching
ax.set_xticklabels(["pyFunc", "Jit", "Njit"])
fig.suptitle("Basic Numba Functionalities", fontsize=17)
ax.set_ylabel("Time (ms)")
ax.set_xlabel("Method")

#
# Making Ufuncs with Numba.
#

@vectorize
def vec_func(a, b):
  # Now we can pass arrays too, and operate
  # inside like they are scalars:
  for i in range(100000):
    a = math.pow(a*b, 1./2)/math.exp(a*b/1000)
  return a
# This is similar to functions before, for comparison. But...

%timeit res = vec_func(a, b)

#
# This is slow because previously we were doing some
# operations on 1,00,000 scalars obtained by multiplying
# (a@b), but now we are multiplying individual elements 
# of a and b for 1,00,000 times. Also numba is taking care
# of broadcasting too. So, in this case we are applying this
# loop for 100 times.
#
res.shape # Previously it was (1, 100)

@vectorize(['float64(float64, float64)'], target='parallel')
def vecParallel_func(a, b):
  for i in range(100000):
    a = math.pow(a*b, 1./2)/math.exp(a*b/1000)
  return a

%timeit res = vecParallel_func(a, b)

@vectorize(['float64(float64, float64)'], target='cuda')
def vecCuda_func(a, b):
  for i in range(100000):
    a = math.pow(a*b, 1./2)/math.exp(a*b/1000)
  return a

%timeit res = vecCuda_func(a, b)

# Woah!!

fig, ax = plt.subplots(1, 1)
ax.plot(["a", "b", "c"], [35.8, 19.2, 0.568], "-o")
ax.set_xticklabels(["Vectorize", "Vectorize Parallel", "Vectorize Cuda"])
fig.suptitle("@vectorize wrapper", fontsize=17)
ax.set_ylabel("Time (sec)")
ax.set_xlabel("Method")

@cuda.jit
def cudaKernal_func(a, b, result): # cuda.jit does not return result yet
  pos = cuda.grid(1)
  if (pos < a.shape[1]) and (pos < b.shape[0]):
    for i in range(100000):
      result[pos] = math.exp(a[0][pos]*b[pos][0])

result = np.zeros((100,), dtype=np.float64)

threadsperblock = 32
blockspergrid = (100 + 31) // 32 # blockspergrid = (array.size + (threadsperblock - 1)) // threadsperblock

%timeit cudaKernal_func[threadsperblock, blockspergrid](a, b, result)

result.shape

#
# Here, we have only used it for 1D arrays. You can use it for any Tensor. For eg:
# For 2D array operations you would have used: x, y = cuda.grid(2)
#

@cuda.jit(device=True)
def cudaDevice_func(a, b):
  for i in range(100000):
    a = math.exp(a*b)
  return a

@cuda.jit
def cudaKernal_func2(a, b, result): # cuda.jit does not return result yet
  pos = cuda.grid(1)
  if (pos < a.shape[1]) and (pos < b.shape[0]):
    result[pos] = cudaDevice_func(a[0][pos], b[pos][0])

%timeit cudaKernal_func2[threadsperblock, blockspergrid](a, b, result)