#!/usr/bin/env python
# coding: utf-8

# # Profiling and Optimising
# 
# IPython provides some tools for making it a bit easier to profile and optimise your code.

# In[1]:


import numpy as np
import matplotlib.pyplot as plt


# ## `%timeit`
# 
# The main IPython tool we are going to use here is `%timeit`,
# a magic that automates measuring how long it takes to run a snippet of code.

# In[2]:


for N in (100, 500, 1000, 2000):
    print("Size: {0} x {0}".format(N))
    A = np.random.random((N, N))
    get_ipython().run_line_magic('timeit', 'A.dot(A)')


# Let's look at what options `%timeit` can take.

# In[ ]:


get_ipython().run_line_magic('pinfo', '%timeit')


# We can save the result in an object with `%timeit -o`,
# and specify to only run one group of 100 iterations.

# In[3]:


A = np.random.random((100, 100))
tr = get_ipython().run_line_magic('timeit', '-o -n 1 -r 100 A.dot(A)')


# In[4]:


tr


# In[5]:


tr.best


# In[6]:


tr.best, tr.worst


# In[7]:


tr.all_runs


# In[8]:


plt.hist(np.array(tr.all_runs) * 1e6)
plt.xlabel("t (µs)")


# ## Diffusing a wave

# Our task is to optimise a 1-D diffusion algorithm,
# using numpy and Cython.
# 
# Our input signal is a sawtooth wave:
# 
# $$
# x_\mathrm{sawtooth}(t) = \frac{A}{2}-\frac {A}{\pi}\sum_{k=1}^{\infty}\frac {\sin (2\pi kft)}{k}
# $$

# In[9]:


from scipy.signal import sawtooth

T = 8 * np.pi
t = np.linspace(0, T, 512)
x = sawtooth(t)
plt.plot(t, x)

steps = 2048


# We are going to diffuse the wave by evolving the heat equation:
# 
# $$
# \frac{\delta x}{\delta t} = \alpha \frac{\delta^2 x}{\delta^2}{t}
# $$
# 
# Which we can discretize for our arrays:
# 
# \begin{align}
# x_{k} =& \frac{1}{4} \left( 
#     x_{k-1}[i-1] +
#     2 x_{k-1}[i] +
#     x_{k-1}[i+1]
#     \right) \\
# x_{k}[0] =& x_{0}[0] \\
# x_{k}[N] =& x_{0}[N] \\
# \end{align}

# ## Pure Python
# 
# We'll start with a pure Python implementation,
# to use as a reference.

# In[10]:


def blur_py(x, steps=1024):
    x = 1 * x # copy
    y = np.empty_like(x)
    y[0] = x[0]
    y[-1] = x[-1]
    for _ in range(steps):
        for i in range(1, len(x)-1):
            y[i] = .25 * ( y[i-1] + 2 * y[i] + y[i+1] )
        x, y = y, x # swap for next step
    return x


# In[11]:


y = blur_py(x, steps)
plt.plot(t, x, '--')
plt.plot(t, y);


# Now we can measure how long it takes to run evolve this system:

# In[12]:


ref_run = get_ipython().run_line_magic('timeit', '-o y = blur_py(x, steps)')
t_ref = ref_run.best

times = [t_ref]
labels = ['python']


# So it takes about one second.
# We can also see how it changes with different times and resolutions.

# ## Vectorizing with numpy
# 
# We can vectorize the inner loop with a single numpy operation:

# In[13]:


import numpy as np

def blur_np(x, steps=1024):
    x = 1 * x
    y = np.empty_like(x)
    y[0] = x[0]
    y[-1] = x[-1]
    for _ in range(steps):
        y[1:-1] = .25 * (x[:-2] + 2 * x[1:-1] + x[2:])
        x, y = y, x
    return x


# In[14]:


y = blur_np(x, steps)
plt.plot(t, x, '--')
plt.plot(t, y)


# In[15]:


np_r = get_ipython().run_line_magic('timeit', '-o blur_np(x, steps)')
t_np = np_r.best


# In[16]:


times.append(t_np)
labels.append('numpy')


# In[17]:


def plot_times():
    ind = np.arange(len(times))
    plt.bar(ind, times, log=True)
    plt.xticks(ind + 0.3, labels, rotation=30)
    plt.ylim(.1 * min(times), times[0])
plot_times()


# So vectorizing the inner loop brings us from 1 second to 25 milliseconds,
# an improvement of 40x:

# In[18]:


t_ref / t_np


# # Cython
# 
# [Cython](http://cython.org/) provides an IPython extension,
# which defines a magic we can use to inline bits of Cython code in the notebook:

# In[19]:


get_ipython().run_line_magic('load_ext', 'Cython')


# In[20]:


get_ipython().run_cell_magic('cython', '', '\ndef csum(n):\n    cs = 0\n    for i in range(n):\n        cs += i\n    return cs\n')


# In[21]:


get_ipython().run_line_magic('timeit', 'csum(5)')


# `%%cython -a` shows you annotations about the generated sourcecode.
# The key to writing Cython is to minimize the amount of Python calls in the generated code. In general: yellow = slow.

# In[22]:


def psum(n):
    cs = 0
    for i in range(n):
        cs += i
    return cs


# In[23]:


get_ipython().run_cell_magic('cython', '-a', '\ndef csum(n):\n    cs = 0\n    for i in range(n):\n        cs += i\n    return cs\n')


# Uh oh, that looks like a lot of yellow.
# We can reduce it by adding some type annotations:

# In[24]:


get_ipython().run_cell_magic('cython', '-a', '\ndef csum2(int n):\n    cdef int i\n    cs = 0\n    for i in range(n):\n        cs += i\n    return cs\n')


# Almost there, but I still see yellow on the lines with `cs`:

# In[25]:


get_ipython().run_cell_magic('cython', '-a', '\ncpdef int csum3(int n):\n    cdef int i\n    cdef int cs = 0\n    for i in range(n):\n        cs += i\n    return cs\n')


# Much better!
# Now there's only Python when entering the function,
# which is about as good as we can do.

# In[26]:


N = 1000000
print('psum ', end=' ')
get_ipython().run_line_magic('timeit', 'psum (N)')
print('csum ', end=' ')
get_ipython().run_line_magic('timeit', 'csum (N)')
print('csum2', end=' ')
get_ipython().run_line_magic('timeit', 'csum2(N)')
print('csum3', end=' ')
get_ipython().run_line_magic('timeit', 'csum3(N)')


# ## Blurring with Cython
# 
# Now we can apply the same principles to writing a blur
# in Cython.

# In[27]:


get_ipython().run_cell_magic('cython', '-a', '\nimport numpy as np\n\ndef blur_cython(x, steps=1024):\n    x = 1 * x # copy\n    y = np.empty_like(x)\n    y[0] = x[0]\n    y[-1] = x[-1]\n    for _ in range(steps):\n        for i in range(1, len(x)-1):\n            y[i] = .25 * ( x[i-1] + 2 * x[i] + x[i+1] )\n        x, y = y, x # swap for next step\n    return x\n')


# In[28]:


c1 = get_ipython().run_line_magic('timeit', '-o y = blur_cython(x, steps)')
t_c1 = c1.best
times.append(t_c1)
labels.append("cython (no hints)")


# In[29]:


plot_times()


# Without annotations, we don't get much improvement over the pure Python version.
# We can note the types of the input arguments, to get some improvements:

# In[30]:


get_ipython().run_cell_magic('cython', '-a', '\nimport numpy as np\ncimport numpy as np\n\ndef blur_cython2(x, int steps=1024):\n    x = 1 * x # copy\n    y = np.empty_like(x)\n    y[0] = x[0]\n    y[-1] = x[-1]\n    cdef int i, N = len(x)\n    for _ in range(steps):\n        for i in range(1, N-1):\n            y[i] = .25 * ( x[i-1] + 2 * x[i] + x[i+1] )\n        x, y = y, x # swap for next step\n    return x\n')


# In[31]:


c2 = get_ipython().run_line_magic('timeit', '-o blur_cython2(x, steps)')
t_c2 = c2.best
times.append(t_c2)
labels.append("cython (loops)")
plot_times()


# Just by making sure the iteration variables are defined as integers, we can save about 25% of the time.
# 
# The biggest key to optimizing with Cython is getting that yellow out of your loops.
# The more deeply nested a bit of code is within a loop,
# the more often it is called, and the more value you can get out of making it fast.
# In Cython, fast means avoiding Python (getting rid of yellow).
# To get rid of Python calls, we need to tell Python about the numpy arrays `x` and `y`:
# 

# In[32]:


get_ipython().run_cell_magic('cython', '-a', '\nimport numpy as np\ncimport numpy as np\n\ndef blur_cython_typed(np.ndarray[double, ndim=1] x_, int steps=1024):\n#     x = 1 * x # copy\n    cdef size_t i, N = x_.shape[0]\n    cdef np.ndarray[double, ndim=1] x\n    cdef np.ndarray[double, ndim=1] y\n    x = 1 * x_\n    y = np.empty_like(x_)\n    y[0] = x[0]\n    y[-1] = x[-1]\n    for _ in range(steps):\n        for i in range(1, N-1):\n            y[i] = .25 * ( y[i-1] + 2 * y[i] + y[i+1] )\n        x, y = y, x # swap for next step\n    return x\n')


# In[33]:


ct = get_ipython().run_line_magic('timeit', '-o y = blur_cython_typed(x, steps)')
t_ct = ct.best

times.append(t_ct)
labels.append("cython (types)")
plot_times()


# We can furter optimize with Cython macros,
# which disable bounds checking and negative indexing,
# and avoiding the Python variable swaping by using indices into a single array:

# In[34]:


get_ipython().run_cell_magic('cython', '-a', '#cython: boundscheck=False\n#cython: wraparound=False\n\nimport numpy as np\ncimport numpy as np\n\ndef blur_cython_optimized(np.ndarray[double, ndim=1] x, int steps=1024):\n    cdef size_t N = x.shape[0]\n    cdef np.ndarray[double, ndim=2] y\n    y = np.empty((2, N), dtype=np.float64)\n    y[0,:] = x\n    y[1,0] = x[0]\n    y[1,N-1] = x[N-1]\n    \n    cdef size_t _, i, j=0, k=1\n    for _ in range(steps):\n        j = _ % 2\n        k = 1 - j\n        for i in range(1, N-1):\n            y[k,i] = .25 * ( y[j,i-1] + 2 * y[j,i] + y[j,i+1] )\n    return y[k]\n')


# Note how there is now zero yellow called in any of the loops,
# only in the initial copy of the input array.

# In[35]:


copt = get_ipython().run_line_magic('timeit', '-o y = blur_cython_optimized(x, steps)')
t_copt = copt.best
times.append(t_copt)
labels.append("cython (optimized)")
plot_times()


# In[36]:


y = blur_cython_optimized(x, steps)
plt.plot(t, x, '--')
plt.plot(t, y)


# ## numba
# 
# [numba](http://numba.pydata.org/) is a library that attempts to automatically do type-based optimizations like we did with Cython.
# To use numba, you decorate functions with `@autojit`.

# In[39]:


import numba

@numba.autojit
def blur_numba(x, steps=1024):
    """identical to blur_py, other than the decorator"""
    x = 1 * x # copy
    y = np.empty_like(x)
    y[0] = x[0]
    y[-1] = x[-1]
    for _ in range(steps):
        for i in range(1, len(x)-1):
            y[i] = .25 * ( y[i-1] + 2 * y[i] + y[i+1] )
        x, y = y, x # swap for next step
    return x

# y = blur_numba(x, steps)


# In[40]:


get_ipython().run_line_magic('time', 'blur_numba(x, steps)')


# In[38]:


nb = get_ipython().run_line_magic('timeit', '-o blur_numba(x, steps)')
t_nb = nb.best
times.append(t_nb)
labels.append("numba")
plot_times()


# In[41]:


blur_numba.inspect_types()


# In[42]:


print(list(blur_numba.inspect_llvm().values())[0])


# What's impressive about numba in this case
# is that it is able to beat all but the most optimized of our implementations without any help.
# Like Cython, numba can do an even better job when you provide it with more information about how a function will be called.

# ## Profiling

# In[ ]:


get_ipython().run_cell_magic('writefile', 'profileme.py', "import os\nimport glob\nlist(os.walk('/tmp'))\n")


# In[ ]:


get_ipython().system('python -m cProfile profileme.py')


# In[ ]:


import os
import cProfile
cProfile.run("list(os.walk('/tmp'))")


# In[ ]:


get_ipython().run_line_magic('prun', "list(os.walk('/tmp'))")


# In[ ]:


get_ipython().run_line_magic('load_ext', 'line_profiler')


# In[ ]:


get_ipython().run_line_magic('lprun', '-f blur_py blur_py(x, steps)')


# In[ ]:


get_ipython().run_line_magic('lprun', '-f blur_numba blur_numba(x, steps)')


# In[ ]:


get_ipython().run_line_magic('lprun', '-f blur_np blur_np(x, steps)')


# # Links
# 
# https://nbviewer.jupyter.org
# 
# https://beta.mybinder.org