#!/usr/bin/env python # coding: utf-8 # # Profiling and Optimising # # IPython provides some tools for making it a bit easier to profile and optimise your code. # In[1]: import numpy as np import matplotlib.pyplot as plt # ## `%timeit` # # The main IPython tool we are going to use here is `%timeit`, # a magic that automates measuring how long it takes to run a snippet of code. # In[2]: for N in (100, 500, 1000, 2000): print("Size: {0} x {0}".format(N)) A = np.random.random((N, N)) get_ipython().run_line_magic('timeit', 'A.dot(A)') # Let's look at what options `%timeit` can take. # In[ ]: get_ipython().run_line_magic('pinfo', '%timeit') # We can save the result in an object with `%timeit -o`, # and specify to only run one group of 100 iterations. # In[3]: A = np.random.random((100, 100)) tr = get_ipython().run_line_magic('timeit', '-o -n 1 -r 100 A.dot(A)') # In[4]: tr # In[5]: tr.best # In[6]: tr.best, tr.worst # In[7]: tr.all_runs # In[8]: plt.hist(np.array(tr.all_runs) * 1e6) plt.xlabel("t (µs)") # ## Diffusing a wave # Our task is to optimise a 1-D diffusion algorithm, # using numpy and Cython. # # Our input signal is a sawtooth wave: # # $$ # x_\mathrm{sawtooth}(t) = \frac{A}{2}-\frac {A}{\pi}\sum_{k=1}^{\infty}\frac {\sin (2\pi kft)}{k} # $$ # In[9]: from scipy.signal import sawtooth T = 8 * np.pi t = np.linspace(0, T, 512) x = sawtooth(t) plt.plot(t, x) steps = 2048 # We are going to diffuse the wave by evolving the heat equation: # # $$ # \frac{\delta x}{\delta t} = \alpha \frac{\delta^2 x}{\delta^2}{t} # $$ # # Which we can discretize for our arrays: # # \begin{align} # x_{k} =& \frac{1}{4} \left( # x_{k-1}[i-1] + # 2 x_{k-1}[i] + # x_{k-1}[i+1] # \right) \\ # x_{k}[0] =& x_{0}[0] \\ # x_{k}[N] =& x_{0}[N] \\ # \end{align} # ## Pure Python # # We'll start with a pure Python implementation, # to use as a reference. # In[10]: def blur_py(x, steps=1024): x = 1 * x # copy y = np.empty_like(x) y[0] = x[0] y[-1] = x[-1] for _ in range(steps): for i in range(1, len(x)-1): y[i] = .25 * ( y[i-1] + 2 * y[i] + y[i+1] ) x, y = y, x # swap for next step return x # In[11]: y = blur_py(x, steps) plt.plot(t, x, '--') plt.plot(t, y); # Now we can measure how long it takes to run evolve this system: # In[12]: ref_run = get_ipython().run_line_magic('timeit', '-o y = blur_py(x, steps)') t_ref = ref_run.best times = [t_ref] labels = ['python'] # So it takes about one second. # We can also see how it changes with different times and resolutions. # ## Vectorizing with numpy # # We can vectorize the inner loop with a single numpy operation: # In[13]: import numpy as np def blur_np(x, steps=1024): x = 1 * x y = np.empty_like(x) y[0] = x[0] y[-1] = x[-1] for _ in range(steps): y[1:-1] = .25 * (x[:-2] + 2 * x[1:-1] + x[2:]) x, y = y, x return x # In[14]: y = blur_np(x, steps) plt.plot(t, x, '--') plt.plot(t, y) # In[15]: np_r = get_ipython().run_line_magic('timeit', '-o blur_np(x, steps)') t_np = np_r.best # In[16]: times.append(t_np) labels.append('numpy') # In[17]: def plot_times(): ind = np.arange(len(times)) plt.bar(ind, times, log=True) plt.xticks(ind + 0.3, labels, rotation=30) plt.ylim(.1 * min(times), times[0]) plot_times() # So vectorizing the inner loop brings us from 1 second to 25 milliseconds, # an improvement of 40x: # In[18]: t_ref / t_np # # Cython # # [Cython](http://cython.org/) provides an IPython extension, # which defines a magic we can use to inline bits of Cython code in the notebook: # In[19]: get_ipython().run_line_magic('load_ext', 'Cython') # In[20]: get_ipython().run_cell_magic('cython', '', '\ndef csum(n):\n cs = 0\n for i in range(n):\n cs += i\n return cs\n') # In[21]: get_ipython().run_line_magic('timeit', 'csum(5)') # `%%cython -a` shows you annotations about the generated sourcecode. # The key to writing Cython is to minimize the amount of Python calls in the generated code. In general: yellow = slow. # In[22]: def psum(n): cs = 0 for i in range(n): cs += i return cs # In[23]: get_ipython().run_cell_magic('cython', '-a', '\ndef csum(n):\n cs = 0\n for i in range(n):\n cs += i\n return cs\n') # Uh oh, that looks like a lot of yellow. # We can reduce it by adding some type annotations: # In[24]: get_ipython().run_cell_magic('cython', '-a', '\ndef csum2(int n):\n cdef int i\n cs = 0\n for i in range(n):\n cs += i\n return cs\n') # Almost there, but I still see yellow on the lines with `cs`: # In[25]: get_ipython().run_cell_magic('cython', '-a', '\ncpdef int csum3(int n):\n cdef int i\n cdef int cs = 0\n for i in range(n):\n cs += i\n return cs\n') # Much better! # Now there's only Python when entering the function, # which is about as good as we can do. # In[26]: N = 1000000 print('psum ', end=' ') get_ipython().run_line_magic('timeit', 'psum (N)') print('csum ', end=' ') get_ipython().run_line_magic('timeit', 'csum (N)') print('csum2', end=' ') get_ipython().run_line_magic('timeit', 'csum2(N)') print('csum3', end=' ') get_ipython().run_line_magic('timeit', 'csum3(N)') # ## Blurring with Cython # # Now we can apply the same principles to writing a blur # in Cython. # In[27]: get_ipython().run_cell_magic('cython', '-a', '\nimport numpy as np\n\ndef blur_cython(x, steps=1024):\n x = 1 * x # copy\n y = np.empty_like(x)\n y[0] = x[0]\n y[-1] = x[-1]\n for _ in range(steps):\n for i in range(1, len(x)-1):\n y[i] = .25 * ( x[i-1] + 2 * x[i] + x[i+1] )\n x, y = y, x # swap for next step\n return x\n') # In[28]: c1 = get_ipython().run_line_magic('timeit', '-o y = blur_cython(x, steps)') t_c1 = c1.best times.append(t_c1) labels.append("cython (no hints)") # In[29]: plot_times() # Without annotations, we don't get much improvement over the pure Python version. # We can note the types of the input arguments, to get some improvements: # In[30]: get_ipython().run_cell_magic('cython', '-a', '\nimport numpy as np\ncimport numpy as np\n\ndef blur_cython2(x, int steps=1024):\n x = 1 * x # copy\n y = np.empty_like(x)\n y[0] = x[0]\n y[-1] = x[-1]\n cdef int i, N = len(x)\n for _ in range(steps):\n for i in range(1, N-1):\n y[i] = .25 * ( x[i-1] + 2 * x[i] + x[i+1] )\n x, y = y, x # swap for next step\n return x\n') # In[31]: c2 = get_ipython().run_line_magic('timeit', '-o blur_cython2(x, steps)') t_c2 = c2.best times.append(t_c2) labels.append("cython (loops)") plot_times() # Just by making sure the iteration variables are defined as integers, we can save about 25% of the time. # # The biggest key to optimizing with Cython is getting that yellow out of your loops. # The more deeply nested a bit of code is within a loop, # the more often it is called, and the more value you can get out of making it fast. # In Cython, fast means avoiding Python (getting rid of yellow). # To get rid of Python calls, we need to tell Python about the numpy arrays `x` and `y`: # # In[32]: get_ipython().run_cell_magic('cython', '-a', '\nimport numpy as np\ncimport numpy as np\n\ndef blur_cython_typed(np.ndarray[double, ndim=1] x_, int steps=1024):\n# x = 1 * x # copy\n cdef size_t i, N = x_.shape[0]\n cdef np.ndarray[double, ndim=1] x\n cdef np.ndarray[double, ndim=1] y\n x = 1 * x_\n y = np.empty_like(x_)\n y[0] = x[0]\n y[-1] = x[-1]\n for _ in range(steps):\n for i in range(1, N-1):\n y[i] = .25 * ( y[i-1] + 2 * y[i] + y[i+1] )\n x, y = y, x # swap for next step\n return x\n') # In[33]: ct = get_ipython().run_line_magic('timeit', '-o y = blur_cython_typed(x, steps)') t_ct = ct.best times.append(t_ct) labels.append("cython (types)") plot_times() # We can furter optimize with Cython macros, # which disable bounds checking and negative indexing, # and avoiding the Python variable swaping by using indices into a single array: # In[34]: get_ipython().run_cell_magic('cython', '-a', '#cython: boundscheck=False\n#cython: wraparound=False\n\nimport numpy as np\ncimport numpy as np\n\ndef blur_cython_optimized(np.ndarray[double, ndim=1] x, int steps=1024):\n cdef size_t N = x.shape[0]\n cdef np.ndarray[double, ndim=2] y\n y = np.empty((2, N), dtype=np.float64)\n y[0,:] = x\n y[1,0] = x[0]\n y[1,N-1] = x[N-1]\n \n cdef size_t _, i, j=0, k=1\n for _ in range(steps):\n j = _ % 2\n k = 1 - j\n for i in range(1, N-1):\n y[k,i] = .25 * ( y[j,i-1] + 2 * y[j,i] + y[j,i+1] )\n return y[k]\n') # Note how there is now zero yellow called in any of the loops, # only in the initial copy of the input array. # In[35]: copt = get_ipython().run_line_magic('timeit', '-o y = blur_cython_optimized(x, steps)') t_copt = copt.best times.append(t_copt) labels.append("cython (optimized)") plot_times() # In[36]: y = blur_cython_optimized(x, steps) plt.plot(t, x, '--') plt.plot(t, y) # ## numba # # [numba](http://numba.pydata.org/) is a library that attempts to automatically do type-based optimizations like we did with Cython. # To use numba, you decorate functions with `@autojit`. # In[39]: import numba @numba.autojit def blur_numba(x, steps=1024): """identical to blur_py, other than the decorator""" x = 1 * x # copy y = np.empty_like(x) y[0] = x[0] y[-1] = x[-1] for _ in range(steps): for i in range(1, len(x)-1): y[i] = .25 * ( y[i-1] + 2 * y[i] + y[i+1] ) x, y = y, x # swap for next step return x # y = blur_numba(x, steps) # In[40]: get_ipython().run_line_magic('time', 'blur_numba(x, steps)') # In[38]: nb = get_ipython().run_line_magic('timeit', '-o blur_numba(x, steps)') t_nb = nb.best times.append(t_nb) labels.append("numba") plot_times() # In[41]: blur_numba.inspect_types() # In[42]: print(list(blur_numba.inspect_llvm().values())[0]) # What's impressive about numba in this case # is that it is able to beat all but the most optimized of our implementations without any help. # Like Cython, numba can do an even better job when you provide it with more information about how a function will be called. # ## Profiling # In[ ]: get_ipython().run_cell_magic('writefile', 'profileme.py', "import os\nimport glob\nlist(os.walk('/tmp'))\n") # In[ ]: get_ipython().system('python -m cProfile profileme.py') # In[ ]: import os import cProfile cProfile.run("list(os.walk('/tmp'))") # In[ ]: get_ipython().run_line_magic('prun', "list(os.walk('/tmp'))") # In[ ]: get_ipython().run_line_magic('load_ext', 'line_profiler') # In[ ]: get_ipython().run_line_magic('lprun', '-f blur_py blur_py(x, steps)') # In[ ]: get_ipython().run_line_magic('lprun', '-f blur_numba blur_numba(x, steps)') # In[ ]: get_ipython().run_line_magic('lprun', '-f blur_np blur_np(x, steps)') # # Links # # https://nbviewer.jupyter.org # # https://beta.mybinder.org