IPython provides some tools for making it a bit easier to profile and optimise your code.
import numpy as np
import matplotlib.pyplot as plt
%timeit
The main IPython tool we are going to use here is %timeit
,
a magic that automates measuring how long it takes to run a snippet of code.
for N in (100, 500, 1000, 2000):
print("Size: {0} x {0}".format(N))
A = np.random.random((N, N))
%timeit A.dot(A)
Size: 100 x 100 58.7 µs ± 1.26 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each) Size: 500 x 500 3.52 ms ± 29.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) Size: 1000 x 1000 28.6 ms ± 1.7 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) Size: 2000 x 2000 229 ms ± 21.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Let's look at what options %timeit
can take.
%timeit?
We can save the result in an object with %timeit -o
,
and specify to only run one group of 100 iterations.
A = np.random.random((100, 100))
tr = %timeit -o -n 1 -r 100 A.dot(A)
The slowest run took 4.39 times longer than the fastest. This could mean that an intermediate result is being cached. 100 µs ± 29.6 µs per loop (mean ± std. dev. of 100 runs, 1 loop each)
tr
<TimeitResult : 100 µs ± 29.6 µs per loop (mean ± std. dev. of 100 runs, 1 loop each)>
tr.best
5.874503403902054e-05
tr.best, tr.worst
(5.874503403902054e-05, 0.0002581239677965641)
tr.all_runs
[0.0002581239677965641, 0.0002479089889675379, 0.00010618893429636955, 8.952501229941845e-05, 8.979497943073511e-05, 9.034993126988411e-05, 9.569607209414244e-05, 9.590701665729284e-05, 9.957596193999052e-05, 0.00010354106780141592, 9.874103125184774e-05, 9.229499846696854e-05, 8.74069519340992e-05, 9.263097308576107e-05, 0.00010646891314536333, 8.739903569221497e-05, 7.834495045244694e-05, 0.00011559098493307829, 0.00012580992188304663, 8.461810648441315e-05, 8.053903002291918e-05, 0.00015645497478544712, 9.29820816963911e-05, 9.66809457167983e-05, 0.00010572304017841816, 9.05659981071949e-05, 9.549304377287626e-05, 8.518900722265244e-05, 9.558198507875204e-05, 9.323901031166315e-05, 9.264692198485136e-05, 9.200000204145908e-05, 9.610806591808796e-05, 9.622506331652403e-05, 0.00010252802167087793, 9.584799408912659e-05, 9.439408313483e-05, 9.482295718044043e-05, 9.574799332767725e-05, 9.632005821913481e-05, 9.610701818019152e-05, 9.628001134842634e-05, 9.797897655516863e-05, 9.532307740300894e-05, 9.053503163158894e-05, 9.074900299310684e-05, 8.971302304416895e-05, 8.61859880387783e-05, 9.29500674828887e-05, 0.00010948698036372662, 9.006005711853504e-05, 0.00012439896818250418, 0.0001060620415955782, 0.00010029703844338655, 9.830202907323837e-05, 9.765394497662783e-05, 9.900995064526796e-05, 0.00011357699986547232, 9.214004967361689e-05, 8.571695070713758e-05, 8.538097608834505e-05, 9.078998118638992e-05, 9.166507516056299e-05, 9.080301970243454e-05, 8.942594286054373e-05, 8.24810704216361e-05, 6.25409884378314e-05, 5.9447018429636955e-05, 5.874503403902054e-05, 7.43260607123375e-05, 0.00011498795356601477, 7.05299898982048e-05, 7.85220181569457e-05, 7.82129354774952e-05, 0.0001304119359701872, 9.570200927555561e-05, 9.407999459654093e-05, 9.330001194030046e-05, 9.450200013816357e-05, 9.450397919863462e-05, 9.244505781680346e-05, 9.347591549158096e-05, 9.245902765542269e-05, 9.335798677057028e-05, 9.447196498513222e-05, 9.486195631325245e-05, 
9.298999793827534e-05, 9.398208931088448e-05, 9.451503865420818e-05, 0.00012433098163455725, 0.00014190201181918383, 0.0001132030738517642, 0.00018185796216130257, 0.00021138100419193506, 9.827804751694202e-05, 8.827797137200832e-05, 9.161094203591347e-05, 9.056192357093096e-05, 9.003700688481331e-05, 8.991500362753868e-05]
plt.hist(np.array(tr.all_runs) * 1e6)
plt.xlabel("t (µs)")
<matplotlib.text.Text at 0x10ab8be10>
Our task is to optimise a 1-D diffusion algorithm, using numpy and Cython.
Our input signal is a sawtooth wave:
$$ x_\mathrm{sawtooth}(t) = \frac{A}{2}-\frac{A}{\pi}\sum_{k=1}^{\infty}\frac{\sin (2\pi kft)}{k} $$

from scipy.signal import sawtooth
T = 8 * np.pi
t = np.linspace(0, T, 512)
x = sawtooth(t)
plt.plot(t, x)
steps = 2048
We are going to diffuse the wave by evolving the heat equation:
$$ \frac{\partial x}{\partial t} = \alpha \frac{\partial^2 x}{\partial t^2} $$

Which we can discretize for our arrays:
\begin{align} x_{k}[i] =& \frac{1}{4} \left( x_{k-1}[i-1] + 2 x_{k-1}[i] + x_{k-1}[i+1] \right) \\ x_{k}[0] =& x_{0}[0] \\ x_{k}[N] =& x_{0}[N] \\ \end{align}We'll start with a pure Python implementation, to use as a reference.
def blur_py(x, steps=1024):
    """Diffuse a 1-D signal by `steps` iterations of the discrete heat equation.

    Each step replaces every interior point with
    ``0.25 * (left + 2*center + right)``; the two endpoint values are held
    fixed at their initial values throughout.

    Parameters
    ----------
    x : 1-D numpy array
        Input signal. Not modified: a copy is taken.
    steps : int
        Number of diffusion steps to apply.

    Returns
    -------
    numpy array of the same shape/dtype as ``x`` with the diffused signal.
    """
    x = 1 * x  # copy, so the caller's array is left untouched
    y = np.empty_like(x)
    # Pin the boundary values to the initial signal.
    y[0] = x[0]
    y[-1] = x[-1]
    for _ in range(steps):
        for i in range(1, len(x) - 1):
            # BUG FIX: the stencil must read the previous iterate `x`,
            # not the output buffer `y` — `y`'s interior comes from
            # np.empty_like and is uninitialized on the first pass, which
            # produced the "invalid value" RuntimeWarning. This also matches
            # the discretization formula and the numpy/Cython versions.
            y[i] = .25 * (x[i - 1] + 2 * x[i] + x[i + 1])
        x, y = y, x  # swap buffers for the next step
    return x
y = blur_py(x, steps)
plt.plot(t, x, '--')
plt.plot(t, y);
Now we can measure how long it takes to evolve this system:
ref_run = %timeit -o y = blur_py(x, steps)
t_ref = ref_run.best
times = [t_ref]
labels = ['python']
/Users/minrk/dev/ip/kernel/ipykernel_launcher.py:8: RuntimeWarning: invalid value encountered in double_scalars
747 ms ± 30.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
So it takes about one second. We can also see how it changes with different times and resolutions.
We can vectorize the inner loop with a single numpy operation:
import numpy as np
def blur_np(x, steps=1024):
    """Vectorized heat-equation blur.

    Behaves exactly like the loop version: `steps` applications of the
    three-point stencil 0.25 * (left + 2*center + right) to the interior,
    with both endpoints held at their initial values. The inner Python loop
    is replaced by one whole-array numpy expression per step.
    """
    cur = 1 * x            # work on a copy; caller's array is untouched
    nxt = np.empty_like(cur)
    nxt[0] = cur[0]        # endpoints never change after this
    nxt[-1] = cur[-1]
    for _ in range(steps):
        nxt[1:-1] = .25 * (cur[:-2] + 2 * cur[1:-1] + cur[2:])
        cur, nxt = nxt, cur  # ping-pong the two buffers
    return cur
y = blur_np(x, steps)
plt.plot(t, x, '--')
plt.plot(t, y)
[<matplotlib.lines.Line2D at 0x119d8b518>]
np_r = %timeit -o blur_np(x, steps)
t_np = np_r.best
10.3 ms ± 436 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
times.append(t_np)
labels.append('numpy')
def plot_times():
    """Log-scale bar chart of the timings collected so far.

    Reads the module-level `times` and `labels` lists; one bar per entry,
    labelled and rotated for readability, with the y-axis clipped so the
    fastest entry stays visible against the slowest.
    """
    positions = np.arange(len(times))
    plt.bar(positions, times, log=True)
    plt.xticks(positions + 0.3, labels, rotation=30)
    plt.ylim(.1 * min(times), times[0])
plot_times()
So vectorizing the inner loop brings us from roughly 750 milliseconds to about 10 milliseconds, an improvement of roughly 70x:
t_ref / t_np
72.72423881702095
%load_ext Cython
%%cython
def csum(n):
    """Sum the integers 0..n-1 with a plain Python loop.

    Identical source to the pure-Python version; only compiled by the
    %%cython magic, with no type declarations.
    """
    cs = 0
    for i in range(n):
        cs += i
    return cs
%timeit csum(5)
329 ns ± 15.8 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
%%cython -a
shows you annotations about the generated sourcecode.
The key to writing Cython is to minimize the amount of Python calls in the generated code. In general: yellow = slow.
def psum(n):
    """Sum the integers 0..n-1 with a plain Python loop (pure-Python reference)."""
    cs = 0
    for i in range(n):
        cs += i
    return cs
%%cython -a
def csum(n):
cs = 0
for i in range(n):
cs += i
return cs
Generated by Cython 0.28.5
Yellow lines hint at Python interaction.
Click on a line that starts with a "+
" to see the C code that Cython generated for it.
1:
+2: def csum(n):
/* Python wrapper */ static PyObject *__pyx_pw_46_cython_magic_1ab2764d7e16b03ae03991c6ff07958b_1csum(PyObject *__pyx_self, PyObject *__pyx_v_n); /*proto*/ static PyMethodDef __pyx_mdef_46_cython_magic_1ab2764d7e16b03ae03991c6ff07958b_1csum = {"csum", (PyCFunction)__pyx_pw_46_cython_magic_1ab2764d7e16b03ae03991c6ff07958b_1csum, METH_O, 0}; static PyObject *__pyx_pw_46_cython_magic_1ab2764d7e16b03ae03991c6ff07958b_1csum(PyObject *__pyx_self, PyObject *__pyx_v_n) { PyObject *__pyx_r = 0; __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("csum (wrapper)", 0); __pyx_r = __pyx_pf_46_cython_magic_1ab2764d7e16b03ae03991c6ff07958b_csum(__pyx_self, ((PyObject *)__pyx_v_n)); /* function exit code */ __Pyx_RefNannyFinishContext(); return __pyx_r; } static PyObject *__pyx_pf_46_cython_magic_1ab2764d7e16b03ae03991c6ff07958b_csum(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_n) { PyObject *__pyx_v_cs = NULL; PyObject *__pyx_v_i = NULL; PyObject *__pyx_r = NULL; __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("csum", 0); /* … */ /* function exit code */ __pyx_L1_error:; __Pyx_XDECREF(__pyx_t_1); __Pyx_XDECREF(__pyx_t_2); __Pyx_AddTraceback("_cython_magic_1ab2764d7e16b03ae03991c6ff07958b.csum", __pyx_clineno, __pyx_lineno, __pyx_filename); __pyx_r = NULL; __pyx_L0:; __Pyx_XDECREF(__pyx_v_cs); __Pyx_XDECREF(__pyx_v_i); __Pyx_XGIVEREF(__pyx_r); __Pyx_RefNannyFinishContext(); return __pyx_r; } /* … */ __pyx_tuple_ = PyTuple_Pack(3, __pyx_n_s_n, __pyx_n_s_cs, __pyx_n_s_i); if (unlikely(!__pyx_tuple_)) __PYX_ERR(0, 2, __pyx_L1_error) __Pyx_GOTREF(__pyx_tuple_); __Pyx_GIVEREF(__pyx_tuple_); /* … */ __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_46_cython_magic_1ab2764d7e16b03ae03991c6ff07958b_1csum, NULL, __pyx_n_s_cython_magic_1ab2764d7e16b03ae0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 2, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); if (PyDict_SetItem(__pyx_d, __pyx_n_s_csum, __pyx_t_1) < 0) __PYX_ERR(0, 2, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+3: cs = 0
__Pyx_INCREF(__pyx_int_0);
__pyx_v_cs = __pyx_int_0;
+4: for i in range(n):
__pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_builtin_range, __pyx_v_n); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 4, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); if (likely(PyList_CheckExact(__pyx_t_1)) || PyTuple_CheckExact(__pyx_t_1)) { __pyx_t_2 = __pyx_t_1; __Pyx_INCREF(__pyx_t_2); __pyx_t_3 = 0; __pyx_t_4 = NULL; } else { __pyx_t_3 = -1; __pyx_t_2 = PyObject_GetIter(__pyx_t_1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 4, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); __pyx_t_4 = Py_TYPE(__pyx_t_2)->tp_iternext; if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 4, __pyx_L1_error) } __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; for (;;) { if (likely(!__pyx_t_4)) { if (likely(PyList_CheckExact(__pyx_t_2))) { if (__pyx_t_3 >= PyList_GET_SIZE(__pyx_t_2)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS __pyx_t_1 = PyList_GET_ITEM(__pyx_t_2, __pyx_t_3); __Pyx_INCREF(__pyx_t_1); __pyx_t_3++; if (unlikely(0 < 0)) __PYX_ERR(0, 4, __pyx_L1_error) #else __pyx_t_1 = PySequence_ITEM(__pyx_t_2, __pyx_t_3); __pyx_t_3++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 4, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); #endif } else { if (__pyx_t_3 >= PyTuple_GET_SIZE(__pyx_t_2)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS __pyx_t_1 = PyTuple_GET_ITEM(__pyx_t_2, __pyx_t_3); __Pyx_INCREF(__pyx_t_1); __pyx_t_3++; if (unlikely(0 < 0)) __PYX_ERR(0, 4, __pyx_L1_error) #else __pyx_t_1 = PySequence_ITEM(__pyx_t_2, __pyx_t_3); __pyx_t_3++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 4, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); #endif } } else { __pyx_t_1 = __pyx_t_4(__pyx_t_2); if (unlikely(!__pyx_t_1)) { PyObject* exc_type = PyErr_Occurred(); if (exc_type) { if (likely(__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); else __PYX_ERR(0, 4, __pyx_L1_error) } break; } __Pyx_GOTREF(__pyx_t_1); } __Pyx_XDECREF_SET(__pyx_v_i, __pyx_t_1); __pyx_t_1 = 0; /* … */ } __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+5: cs += i
__pyx_t_1 = PyNumber_InPlaceAdd(__pyx_v_cs, __pyx_v_i); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 5, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF_SET(__pyx_v_cs, __pyx_t_1); __pyx_t_1 = 0;
+6: return cs
__Pyx_XDECREF(__pyx_r); __Pyx_INCREF(__pyx_v_cs); __pyx_r = __pyx_v_cs; goto __pyx_L0;
Uh oh, that looks like a lot of yellow. We can reduce it by adding some type annotations:
%%cython -a
def csum2(int n):
    """Sum 0..n-1 with a typed argument and loop index.

    `n` and `i` are C ints so the loop itself compiles to C, but `cs` is
    still a Python object — each `cs += i` goes through the Python C-API.
    """
    cdef int i
    cs = 0
    for i in range(n):
        cs += i
    return cs
Generated by Cython 0.28.5
Yellow lines hint at Python interaction.
Click on a line that starts with a "+
" to see the C code that Cython generated for it.
1:
+2: def csum2(int n):
/* Python wrapper */ static PyObject *__pyx_pw_46_cython_magic_6796a2fd03fa2622101f9ccd25f59c92_1csum2(PyObject *__pyx_self, PyObject *__pyx_arg_n); /*proto*/ static PyMethodDef __pyx_mdef_46_cython_magic_6796a2fd03fa2622101f9ccd25f59c92_1csum2 = {"csum2", (PyCFunction)__pyx_pw_46_cython_magic_6796a2fd03fa2622101f9ccd25f59c92_1csum2, METH_O, 0}; static PyObject *__pyx_pw_46_cython_magic_6796a2fd03fa2622101f9ccd25f59c92_1csum2(PyObject *__pyx_self, PyObject *__pyx_arg_n) { int __pyx_v_n; PyObject *__pyx_r = 0; __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("csum2 (wrapper)", 0); assert(__pyx_arg_n); { __pyx_v_n = __Pyx_PyInt_As_int(__pyx_arg_n); if (unlikely((__pyx_v_n == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 2, __pyx_L3_error) } goto __pyx_L4_argument_unpacking_done; __pyx_L3_error:; __Pyx_AddTraceback("_cython_magic_6796a2fd03fa2622101f9ccd25f59c92.csum2", __pyx_clineno, __pyx_lineno, __pyx_filename); __Pyx_RefNannyFinishContext(); return NULL; __pyx_L4_argument_unpacking_done:; __pyx_r = __pyx_pf_46_cython_magic_6796a2fd03fa2622101f9ccd25f59c92_csum2(__pyx_self, ((int)__pyx_v_n)); /* function exit code */ __Pyx_RefNannyFinishContext(); return __pyx_r; } static PyObject *__pyx_pf_46_cython_magic_6796a2fd03fa2622101f9ccd25f59c92_csum2(CYTHON_UNUSED PyObject *__pyx_self, int __pyx_v_n) { int __pyx_v_i; PyObject *__pyx_v_cs = NULL; PyObject *__pyx_r = NULL; __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("csum2", 0); /* … */ /* function exit code */ __pyx_L1_error:; __Pyx_XDECREF(__pyx_t_4); __Pyx_XDECREF(__pyx_t_5); __Pyx_AddTraceback("_cython_magic_6796a2fd03fa2622101f9ccd25f59c92.csum2", __pyx_clineno, __pyx_lineno, __pyx_filename); __pyx_r = NULL; __pyx_L0:; __Pyx_XDECREF(__pyx_v_cs); __Pyx_XGIVEREF(__pyx_r); __Pyx_RefNannyFinishContext(); return __pyx_r; } /* … */ __pyx_tuple_ = PyTuple_Pack(4, __pyx_n_s_n, __pyx_n_s_n, __pyx_n_s_i, __pyx_n_s_cs); if (unlikely(!__pyx_tuple_)) __PYX_ERR(0, 2, __pyx_L1_error) __Pyx_GOTREF(__pyx_tuple_); 
__Pyx_GIVEREF(__pyx_tuple_); /* … */ __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_46_cython_magic_6796a2fd03fa2622101f9ccd25f59c92_1csum2, NULL, __pyx_n_s_cython_magic_6796a2fd03fa262210); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 2, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); if (PyDict_SetItem(__pyx_d, __pyx_n_s_csum2, __pyx_t_1) < 0) __PYX_ERR(0, 2, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
3: cdef int i
+4: cs = 0
__Pyx_INCREF(__pyx_int_0);
__pyx_v_cs = __pyx_int_0;
+5: for i in range(n):
__pyx_t_1 = __pyx_v_n; __pyx_t_2 = __pyx_t_1; for (__pyx_t_3 = 0; __pyx_t_3 < __pyx_t_2; __pyx_t_3+=1) { __pyx_v_i = __pyx_t_3;
+6: cs += i
__pyx_t_4 = __Pyx_PyInt_From_int(__pyx_v_i); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 6, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_4); __pyx_t_5 = PyNumber_InPlaceAdd(__pyx_v_cs, __pyx_t_4); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 6, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; __Pyx_DECREF_SET(__pyx_v_cs, __pyx_t_5); __pyx_t_5 = 0; }
+7: return cs
__Pyx_XDECREF(__pyx_r); __Pyx_INCREF(__pyx_v_cs); __pyx_r = __pyx_v_cs; goto __pyx_L0;
Almost there, but I still see yellow on the lines with cs
:
%%cython -a
cpdef int csum3(int n):
    """Sum 0..n-1 entirely in C: argument, accumulator, and return are all `int`.

    `cpdef` also generates a C-callable entry point alongside the Python
    wrapper. NOTE(review): a C `int` accumulator overflows once the sum
    exceeds 2**31-1 (around n > 65536), so the result is wrong for large n
    even though it runs in nanoseconds.
    """
    cdef int i
    cdef int cs = 0
    for i in range(n):
        cs += i
    return cs
Generated by Cython 0.28.5
Yellow lines hint at Python interaction.
Click on a line that starts with a "+
" to see the C code that Cython generated for it.
1:
+2: cpdef int csum3(int n):
static PyObject *__pyx_pw_46_cython_magic_ebc2c28c3e7904b9b25db8629c4fb171_1csum3(PyObject *__pyx_self, PyObject *__pyx_arg_n); /*proto*/ static int __pyx_f_46_cython_magic_ebc2c28c3e7904b9b25db8629c4fb171_csum3(int __pyx_v_n, CYTHON_UNUSED int __pyx_skip_dispatch) { int __pyx_v_i; int __pyx_v_cs; int __pyx_r; __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("csum3", 0); /* … */ /* function exit code */ __pyx_L0:; __Pyx_RefNannyFinishContext(); return __pyx_r; } /* Python wrapper */ static PyObject *__pyx_pw_46_cython_magic_ebc2c28c3e7904b9b25db8629c4fb171_1csum3(PyObject *__pyx_self, PyObject *__pyx_arg_n); /*proto*/ static PyObject *__pyx_pw_46_cython_magic_ebc2c28c3e7904b9b25db8629c4fb171_1csum3(PyObject *__pyx_self, PyObject *__pyx_arg_n) { int __pyx_v_n; PyObject *__pyx_r = 0; __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("csum3 (wrapper)", 0); assert(__pyx_arg_n); { __pyx_v_n = __Pyx_PyInt_As_int(__pyx_arg_n); if (unlikely((__pyx_v_n == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 2, __pyx_L3_error) } goto __pyx_L4_argument_unpacking_done; __pyx_L3_error:; __Pyx_AddTraceback("_cython_magic_ebc2c28c3e7904b9b25db8629c4fb171.csum3", __pyx_clineno, __pyx_lineno, __pyx_filename); __Pyx_RefNannyFinishContext(); return NULL; __pyx_L4_argument_unpacking_done:; __pyx_r = __pyx_pf_46_cython_magic_ebc2c28c3e7904b9b25db8629c4fb171_csum3(__pyx_self, ((int)__pyx_v_n)); /* function exit code */ __Pyx_RefNannyFinishContext(); return __pyx_r; } static PyObject *__pyx_pf_46_cython_magic_ebc2c28c3e7904b9b25db8629c4fb171_csum3(CYTHON_UNUSED PyObject *__pyx_self, int __pyx_v_n) { PyObject *__pyx_r = NULL; __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("csum3", 0); __Pyx_XDECREF(__pyx_r); __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_f_46_cython_magic_ebc2c28c3e7904b9b25db8629c4fb171_csum3(__pyx_v_n, 0)); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 2, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __pyx_r = __pyx_t_1; __pyx_t_1 = 0; goto __pyx_L0; /* function exit code */ 
__pyx_L1_error:; __Pyx_XDECREF(__pyx_t_1); __Pyx_AddTraceback("_cython_magic_ebc2c28c3e7904b9b25db8629c4fb171.csum3", __pyx_clineno, __pyx_lineno, __pyx_filename); __pyx_r = NULL; __pyx_L0:; __Pyx_XGIVEREF(__pyx_r); __Pyx_RefNannyFinishContext(); return __pyx_r; }
3: cdef int i
+4: cdef int cs = 0
__pyx_v_cs = 0;
+5: for i in range(n):
__pyx_t_1 = __pyx_v_n; __pyx_t_2 = __pyx_t_1; for (__pyx_t_3 = 0; __pyx_t_3 < __pyx_t_2; __pyx_t_3+=1) { __pyx_v_i = __pyx_t_3;
+6: cs += i
__pyx_v_cs = (__pyx_v_cs + __pyx_v_i); }
+7: return cs
__pyx_r = __pyx_v_cs; goto __pyx_L0;
Much better! Now there's only Python when entering the function, which is about as good as we can do.
N = 1000000
print('psum ', end=' ')
%timeit psum (N)
print('csum ', end=' ')
%timeit csum (N)
print('csum2', end=' ')
%timeit csum2(N)
print('csum3', end=' ')
%timeit csum3(N)
psum 57.1 ms ± 2.23 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) csum 42 ms ± 928 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) csum2 40.9 ms ± 647 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) csum3 66.5 ns ± 1.59 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)
Now we can apply the same principles to writing a blur in Cython.
%%cython -a
import numpy as np
def blur_cython(x, steps=1024):
    """Heat-equation blur, compiled by Cython with no type declarations.

    Same algorithm as the pure-Python reference: each interior point becomes
    0.25 * (x[i-1] + 2*x[i] + x[i+1]) and the endpoints stay fixed at their
    initial values. Without type hints every indexing operation still goes
    through the Python C-API, so little speedup is expected.
    """
    x = 1 * x # copy
    y = np.empty_like(x)
    y[0] = x[0]
    y[-1] = x[-1]
    for _ in range(steps):
        for i in range(1, len(x)-1):
            y[i] = .25 * ( x[i-1] + 2 * x[i] + x[i+1] )
        x, y = y, x # swap for next step
    return x
Generated by Cython 0.28.5
Yellow lines hint at Python interaction.
Click on a line that starts with a "+
" to see the C code that Cython generated for it.
01:
+02: import numpy as np
__pyx_t_1 = __Pyx_Import(__pyx_n_s_numpy, 0, 0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 2, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); if (PyDict_SetItem(__pyx_d, __pyx_n_s_np, __pyx_t_1) < 0) __PYX_ERR(0, 2, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
03:
+04: def blur_cython(x, steps=1024):
/* Python wrapper */ static PyObject *__pyx_pw_46_cython_magic_d25d7f68c90bd19271e0f6ed96501fcc_1blur_cython(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ static PyMethodDef __pyx_mdef_46_cython_magic_d25d7f68c90bd19271e0f6ed96501fcc_1blur_cython = {"blur_cython", (PyCFunction)__pyx_pw_46_cython_magic_d25d7f68c90bd19271e0f6ed96501fcc_1blur_cython, METH_VARARGS|METH_KEYWORDS, 0}; static PyObject *__pyx_pw_46_cython_magic_d25d7f68c90bd19271e0f6ed96501fcc_1blur_cython(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { PyObject *__pyx_v_x = 0; PyObject *__pyx_v_steps = 0; PyObject *__pyx_r = 0; __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("blur_cython (wrapper)", 0); { static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_x,&__pyx_n_s_steps,0}; PyObject* values[2] = {0,0}; values[1] = ((PyObject *)__pyx_int_1024); if (unlikely(__pyx_kwds)) { Py_ssize_t kw_args; const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); switch (pos_args) { case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); CYTHON_FALLTHROUGH; case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); CYTHON_FALLTHROUGH; case 0: break; default: goto __pyx_L5_argtuple_error; } kw_args = PyDict_Size(__pyx_kwds); switch (pos_args) { case 0: if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_x)) != 0)) kw_args--; else goto __pyx_L5_argtuple_error; CYTHON_FALLTHROUGH; case 1: if (kw_args > 0) { PyObject* value = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_steps); if (value) { values[1] = value; kw_args--; } } } if (unlikely(kw_args > 0)) { if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "blur_cython") < 0)) __PYX_ERR(0, 4, __pyx_L3_error) } } else { switch (PyTuple_GET_SIZE(__pyx_args)) { case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); CYTHON_FALLTHROUGH; case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); break; default: goto __pyx_L5_argtuple_error; } } __pyx_v_x = values[0]; __pyx_v_steps 
= values[1]; } goto __pyx_L4_argument_unpacking_done; __pyx_L5_argtuple_error:; __Pyx_RaiseArgtupleInvalid("blur_cython", 0, 1, 2, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 4, __pyx_L3_error) __pyx_L3_error:; __Pyx_AddTraceback("_cython_magic_d25d7f68c90bd19271e0f6ed96501fcc.blur_cython", __pyx_clineno, __pyx_lineno, __pyx_filename); __Pyx_RefNannyFinishContext(); return NULL; __pyx_L4_argument_unpacking_done:; __pyx_r = __pyx_pf_46_cython_magic_d25d7f68c90bd19271e0f6ed96501fcc_blur_cython(__pyx_self, __pyx_v_x, __pyx_v_steps); /* function exit code */ __Pyx_RefNannyFinishContext(); return __pyx_r; } static PyObject *__pyx_pf_46_cython_magic_d25d7f68c90bd19271e0f6ed96501fcc_blur_cython(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_x, PyObject *__pyx_v_steps) { PyObject *__pyx_v_y = NULL; CYTHON_UNUSED PyObject *__pyx_v__ = NULL; PyObject *__pyx_v_i = NULL; PyObject *__pyx_r = NULL; __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("blur_cython", 0); __Pyx_INCREF(__pyx_v_x); /* … */ /* function exit code */ __pyx_L1_error:; __Pyx_XDECREF(__pyx_t_1); __Pyx_XDECREF(__pyx_t_2); __Pyx_XDECREF(__pyx_t_3); __Pyx_XDECREF(__pyx_t_4); __Pyx_XDECREF(__pyx_t_9); __Pyx_AddTraceback("_cython_magic_d25d7f68c90bd19271e0f6ed96501fcc.blur_cython", __pyx_clineno, __pyx_lineno, __pyx_filename); __pyx_r = NULL; __pyx_L0:; __Pyx_XDECREF(__pyx_v_y); __Pyx_XDECREF(__pyx_v__); __Pyx_XDECREF(__pyx_v_i); __Pyx_XDECREF(__pyx_v_x); __Pyx_XGIVEREF(__pyx_r); __Pyx_RefNannyFinishContext(); return __pyx_r; } /* … */ __pyx_tuple__2 = PyTuple_Pack(5, __pyx_n_s_x, __pyx_n_s_steps, __pyx_n_s_y, __pyx_n_s_, __pyx_n_s_i); if (unlikely(!__pyx_tuple__2)) __PYX_ERR(0, 4, __pyx_L1_error) __Pyx_GOTREF(__pyx_tuple__2); __Pyx_GIVEREF(__pyx_tuple__2); /* … */ __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_46_cython_magic_d25d7f68c90bd19271e0f6ed96501fcc_1blur_cython, NULL, __pyx_n_s_cython_magic_d25d7f68c90bd19271); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 4, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); if 
(PyDict_SetItem(__pyx_d, __pyx_n_s_blur_cython, __pyx_t_1) < 0) __PYX_ERR(0, 4, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+05: x = 1 * x # copy
__pyx_t_1 = PyNumber_Multiply(__pyx_int_1, __pyx_v_x); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 5, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF_SET(__pyx_v_x, __pyx_t_1); __pyx_t_1 = 0;
+06: y = np.empty_like(x)
__pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 6, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, __pyx_n_s_empty_like); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 6, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; __pyx_t_2 = NULL; if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_3))) { __pyx_t_2 = PyMethod_GET_SELF(__pyx_t_3); if (likely(__pyx_t_2)) { PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_3); __Pyx_INCREF(__pyx_t_2); __Pyx_INCREF(function); __Pyx_DECREF_SET(__pyx_t_3, function); } } if (!__pyx_t_2) { __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_3, __pyx_v_x); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 6, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); } else { #if CYTHON_FAST_PYCALL if (PyFunction_Check(__pyx_t_3)) { PyObject *__pyx_temp[2] = {__pyx_t_2, __pyx_v_x}; __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_3, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 6, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; __Pyx_GOTREF(__pyx_t_1); } else #endif #if CYTHON_FAST_PYCCALL if (__Pyx_PyFastCFunction_Check(__pyx_t_3)) { PyObject *__pyx_temp[2] = {__pyx_t_2, __pyx_v_x}; __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_3, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 6, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; __Pyx_GOTREF(__pyx_t_1); } else #endif { __pyx_t_4 = PyTuple_New(1+1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 6, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_4); __Pyx_GIVEREF(__pyx_t_2); PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_2); __pyx_t_2 = NULL; __Pyx_INCREF(__pyx_v_x); __Pyx_GIVEREF(__pyx_v_x); PyTuple_SET_ITEM(__pyx_t_4, 0+1, __pyx_v_x); __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_t_4, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 6, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; } } __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __pyx_v_y = __pyx_t_1; 
__pyx_t_1 = 0;
+07: y[0] = x[0]
__pyx_t_1 = __Pyx_GetItemInt(__pyx_v_x, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 7, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); if (unlikely(__Pyx_SetItemInt(__pyx_v_y, 0, __pyx_t_1, long, 1, __Pyx_PyInt_From_long, 0, 0, 1) < 0)) __PYX_ERR(0, 7, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+08: y[-1] = x[-1]
__pyx_t_1 = __Pyx_GetItemInt(__pyx_v_x, -1L, long, 1, __Pyx_PyInt_From_long, 0, 1, 1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 8, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); if (unlikely(__Pyx_SetItemInt(__pyx_v_y, -1L, __pyx_t_1, long, 1, __Pyx_PyInt_From_long, 0, 1, 1) < 0)) __PYX_ERR(0, 8, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+09: for _ in range(steps):
__pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_builtin_range, __pyx_v_steps); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 9, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); if (likely(PyList_CheckExact(__pyx_t_1)) || PyTuple_CheckExact(__pyx_t_1)) { __pyx_t_3 = __pyx_t_1; __Pyx_INCREF(__pyx_t_3); __pyx_t_5 = 0; __pyx_t_6 = NULL; } else { __pyx_t_5 = -1; __pyx_t_3 = PyObject_GetIter(__pyx_t_1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 9, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __pyx_t_6 = Py_TYPE(__pyx_t_3)->tp_iternext; if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 9, __pyx_L1_error) } __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; for (;;) { if (likely(!__pyx_t_6)) { if (likely(PyList_CheckExact(__pyx_t_3))) { if (__pyx_t_5 >= PyList_GET_SIZE(__pyx_t_3)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS __pyx_t_1 = PyList_GET_ITEM(__pyx_t_3, __pyx_t_5); __Pyx_INCREF(__pyx_t_1); __pyx_t_5++; if (unlikely(0 < 0)) __PYX_ERR(0, 9, __pyx_L1_error) #else __pyx_t_1 = PySequence_ITEM(__pyx_t_3, __pyx_t_5); __pyx_t_5++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 9, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); #endif } else { if (__pyx_t_5 >= PyTuple_GET_SIZE(__pyx_t_3)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS __pyx_t_1 = PyTuple_GET_ITEM(__pyx_t_3, __pyx_t_5); __Pyx_INCREF(__pyx_t_1); __pyx_t_5++; if (unlikely(0 < 0)) __PYX_ERR(0, 9, __pyx_L1_error) #else __pyx_t_1 = PySequence_ITEM(__pyx_t_3, __pyx_t_5); __pyx_t_5++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 9, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); #endif } } else { __pyx_t_1 = __pyx_t_6(__pyx_t_3); if (unlikely(!__pyx_t_1)) { PyObject* exc_type = PyErr_Occurred(); if (exc_type) { if (likely(__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); else __PYX_ERR(0, 9, __pyx_L1_error) } break; } __Pyx_GOTREF(__pyx_t_1); } __Pyx_XDECREF_SET(__pyx_v__, __pyx_t_1); __pyx_t_1 = 0; /* … */ } __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+10: for i in range(1, len(x)-1):
__pyx_t_7 = PyObject_Length(__pyx_v_x); if (unlikely(__pyx_t_7 == ((Py_ssize_t)-1))) __PYX_ERR(0, 10, __pyx_L1_error) __pyx_t_1 = PyInt_FromSsize_t((__pyx_t_7 - 1)); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 10, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __pyx_t_4 = PyTuple_New(2); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 10, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_4); __Pyx_INCREF(__pyx_int_1); __Pyx_GIVEREF(__pyx_int_1); PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_int_1); __Pyx_GIVEREF(__pyx_t_1); PyTuple_SET_ITEM(__pyx_t_4, 1, __pyx_t_1); __pyx_t_1 = 0; __pyx_t_1 = __Pyx_PyObject_Call(__pyx_builtin_range, __pyx_t_4, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 10, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; if (likely(PyList_CheckExact(__pyx_t_1)) || PyTuple_CheckExact(__pyx_t_1)) { __pyx_t_4 = __pyx_t_1; __Pyx_INCREF(__pyx_t_4); __pyx_t_7 = 0; __pyx_t_8 = NULL; } else { __pyx_t_7 = -1; __pyx_t_4 = PyObject_GetIter(__pyx_t_1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 10, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_4); __pyx_t_8 = Py_TYPE(__pyx_t_4)->tp_iternext; if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 10, __pyx_L1_error) } __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; for (;;) { if (likely(!__pyx_t_8)) { if (likely(PyList_CheckExact(__pyx_t_4))) { if (__pyx_t_7 >= PyList_GET_SIZE(__pyx_t_4)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS __pyx_t_1 = PyList_GET_ITEM(__pyx_t_4, __pyx_t_7); __Pyx_INCREF(__pyx_t_1); __pyx_t_7++; if (unlikely(0 < 0)) __PYX_ERR(0, 10, __pyx_L1_error) #else __pyx_t_1 = PySequence_ITEM(__pyx_t_4, __pyx_t_7); __pyx_t_7++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 10, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); #endif } else { if (__pyx_t_7 >= PyTuple_GET_SIZE(__pyx_t_4)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS __pyx_t_1 = PyTuple_GET_ITEM(__pyx_t_4, __pyx_t_7); __Pyx_INCREF(__pyx_t_1); __pyx_t_7++; if (unlikely(0 < 0)) __PYX_ERR(0, 10, __pyx_L1_error) #else __pyx_t_1 = 
PySequence_ITEM(__pyx_t_4, __pyx_t_7); __pyx_t_7++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 10, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); #endif } } else { __pyx_t_1 = __pyx_t_8(__pyx_t_4); if (unlikely(!__pyx_t_1)) { PyObject* exc_type = PyErr_Occurred(); if (exc_type) { if (likely(__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); else __PYX_ERR(0, 10, __pyx_L1_error) } break; } __Pyx_GOTREF(__pyx_t_1); } __Pyx_XDECREF_SET(__pyx_v_i, __pyx_t_1); __pyx_t_1 = 0; /* … */ } __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+11: y[i] = .25 * ( x[i-1] + 2 * x[i] + x[i+1] )
__pyx_t_1 = __Pyx_PyInt_SubtractObjC(__pyx_v_i, __pyx_int_1, 1, 0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 11, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __pyx_t_2 = __Pyx_PyObject_GetItem(__pyx_v_x, __pyx_t_1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 11, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __pyx_t_1 = __Pyx_PyObject_GetItem(__pyx_v_x, __pyx_v_i); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 11, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __pyx_t_9 = PyNumber_Multiply(__pyx_int_2, __pyx_t_1); if (unlikely(!__pyx_t_9)) __PYX_ERR(0, 11, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_9); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __pyx_t_1 = PyNumber_Add(__pyx_t_2, __pyx_t_9); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 11, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; __Pyx_DECREF(__pyx_t_9); __pyx_t_9 = 0; __pyx_t_9 = __Pyx_PyInt_AddObjC(__pyx_v_i, __pyx_int_1, 1, 0); if (unlikely(!__pyx_t_9)) __PYX_ERR(0, 11, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_9); __pyx_t_2 = __Pyx_PyObject_GetItem(__pyx_v_x, __pyx_t_9); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 11, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_9); __pyx_t_9 = 0; __pyx_t_9 = PyNumber_Add(__pyx_t_1, __pyx_t_2); if (unlikely(!__pyx_t_9)) __PYX_ERR(0, 11, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_9); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; __pyx_t_2 = PyNumber_Multiply(__pyx_float__25, __pyx_t_9); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 11, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_9); __pyx_t_9 = 0; if (unlikely(PyObject_SetItem(__pyx_v_y, __pyx_v_i, __pyx_t_2) < 0)) __PYX_ERR(0, 11, __pyx_L1_error) __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+12: x, y = y, x # swap for next step
__pyx_t_10 = __pyx_v_y; __pyx_t_11 = __pyx_v_x; __pyx_v_x = __pyx_t_10; __pyx_t_10 = 0; __pyx_v_y = __pyx_t_11; __pyx_t_11 = 0;
+13: return x
__Pyx_XDECREF(__pyx_r); __Pyx_INCREF(__pyx_v_x); __pyx_r = __pyx_v_x; goto __pyx_L0;
c1 = %timeit -o y = blur_cython(x, steps)
t_c1 = c1.best
times.append(t_c1)
labels.append("cython (no hints)")
599 ms ± 15.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
plot_times()
Without annotations, we don't get much improvement over the pure Python version. We can note the types of the input arguments, to get some improvements:
%%cython -a
import numpy as np
cimport numpy as np
def blur_cython2(x, int steps=1024):
    """Repeatedly blur a 1-D array: each interior point becomes a
    weighted average (1/4, 1/2, 1/4) of itself and its two neighbors.

    Only ``steps`` and the loop counters are declared as C ints, so the
    loop bookkeeping runs at C speed, but every array access still goes
    through the Python object protocol (the yellow lines in the
    annotation below).
    """
    x = 1 * x  # copy, so the caller's array is not modified in place
    y = np.empty_like(x)
    # Boundary values are held fixed for every step.
    y[0] = x[0]
    y[-1] = x[-1]
    cdef int i, N = len(x)  # typed loop counters: the only C-level win here
    for _ in range(steps):
        for i in range(1, N-1):
            y[i] = .25 * ( x[i-1] + 2 * x[i] + x[i+1] )
        x, y = y, x  # swap buffers for the next step
    return x
Generated by Cython 0.28.5
Yellow lines hint at Python interaction.
Click on a line that starts with a "+
" to see the C code that Cython generated for it.
01:
+02: import numpy as np
__pyx_t_1 = __Pyx_Import(__pyx_n_s_numpy, 0, 0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 2, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); if (PyDict_SetItem(__pyx_d, __pyx_n_s_np, __pyx_t_1) < 0) __PYX_ERR(0, 2, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; /* … */ __pyx_t_1 = __Pyx_PyDict_NewPresized(0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 2, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); if (PyDict_SetItem(__pyx_d, __pyx_n_s_test, __pyx_t_1) < 0) __PYX_ERR(0, 2, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
03: cimport numpy as np
04:
+05: def blur_cython2(x, int steps=1024):
/* Python wrapper */ static PyObject *__pyx_pw_46_cython_magic_5b78eca6873d3224dbaab9f8b6075ac4_1blur_cython2(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ static PyMethodDef __pyx_mdef_46_cython_magic_5b78eca6873d3224dbaab9f8b6075ac4_1blur_cython2 = {"blur_cython2", (PyCFunction)__pyx_pw_46_cython_magic_5b78eca6873d3224dbaab9f8b6075ac4_1blur_cython2, METH_VARARGS|METH_KEYWORDS, 0}; static PyObject *__pyx_pw_46_cython_magic_5b78eca6873d3224dbaab9f8b6075ac4_1blur_cython2(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { PyObject *__pyx_v_x = 0; int __pyx_v_steps; PyObject *__pyx_r = 0; __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("blur_cython2 (wrapper)", 0); { static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_x,&__pyx_n_s_steps,0}; PyObject* values[2] = {0,0}; if (unlikely(__pyx_kwds)) { Py_ssize_t kw_args; const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); switch (pos_args) { case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); CYTHON_FALLTHROUGH; case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); CYTHON_FALLTHROUGH; case 0: break; default: goto __pyx_L5_argtuple_error; } kw_args = PyDict_Size(__pyx_kwds); switch (pos_args) { case 0: if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_x)) != 0)) kw_args--; else goto __pyx_L5_argtuple_error; CYTHON_FALLTHROUGH; case 1: if (kw_args > 0) { PyObject* value = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_steps); if (value) { values[1] = value; kw_args--; } } } if (unlikely(kw_args > 0)) { if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "blur_cython2") < 0)) __PYX_ERR(0, 5, __pyx_L3_error) } } else { switch (PyTuple_GET_SIZE(__pyx_args)) { case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); CYTHON_FALLTHROUGH; case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); break; default: goto __pyx_L5_argtuple_error; } } __pyx_v_x = values[0]; if (values[1]) { __pyx_v_steps = 
__Pyx_PyInt_As_int(values[1]); if (unlikely((__pyx_v_steps == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 5, __pyx_L3_error) } else { __pyx_v_steps = ((int)0x400); } } goto __pyx_L4_argument_unpacking_done; __pyx_L5_argtuple_error:; __Pyx_RaiseArgtupleInvalid("blur_cython2", 0, 1, 2, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 5, __pyx_L3_error) __pyx_L3_error:; __Pyx_AddTraceback("_cython_magic_5b78eca6873d3224dbaab9f8b6075ac4.blur_cython2", __pyx_clineno, __pyx_lineno, __pyx_filename); __Pyx_RefNannyFinishContext(); return NULL; __pyx_L4_argument_unpacking_done:; __pyx_r = __pyx_pf_46_cython_magic_5b78eca6873d3224dbaab9f8b6075ac4_blur_cython2(__pyx_self, __pyx_v_x, __pyx_v_steps); /* function exit code */ __Pyx_RefNannyFinishContext(); return __pyx_r; } static PyObject *__pyx_pf_46_cython_magic_5b78eca6873d3224dbaab9f8b6075ac4_blur_cython2(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_x, int __pyx_v_steps) { PyObject *__pyx_v_y = NULL; int __pyx_v_i; int __pyx_v_N; CYTHON_UNUSED int __pyx_v__; PyObject *__pyx_r = NULL; __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("blur_cython2", 0); __Pyx_INCREF(__pyx_v_x); /* … */ /* function exit code */ __pyx_L1_error:; __Pyx_XDECREF(__pyx_t_1); __Pyx_XDECREF(__pyx_t_2); __Pyx_XDECREF(__pyx_t_3); __Pyx_XDECREF(__pyx_t_4); __Pyx_AddTraceback("_cython_magic_5b78eca6873d3224dbaab9f8b6075ac4.blur_cython2", __pyx_clineno, __pyx_lineno, __pyx_filename); __pyx_r = NULL; __pyx_L0:; __Pyx_XDECREF(__pyx_v_y); __Pyx_XDECREF(__pyx_v_x); __Pyx_XGIVEREF(__pyx_r); __Pyx_RefNannyFinishContext(); return __pyx_r; } /* … */ __pyx_tuple__11 = PyTuple_Pack(6, __pyx_n_s_x, __pyx_n_s_steps, __pyx_n_s_y, __pyx_n_s_i, __pyx_n_s_N, __pyx_n_s__10); if (unlikely(!__pyx_tuple__11)) __PYX_ERR(0, 5, __pyx_L1_error) __Pyx_GOTREF(__pyx_tuple__11); __Pyx_GIVEREF(__pyx_tuple__11); /* … */ __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_46_cython_magic_5b78eca6873d3224dbaab9f8b6075ac4_1blur_cython2, NULL, __pyx_n_s_cython_magic_5b78eca6873d3224db); if 
(unlikely(!__pyx_t_1)) __PYX_ERR(0, 5, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); if (PyDict_SetItem(__pyx_d, __pyx_n_s_blur_cython2, __pyx_t_1) < 0) __PYX_ERR(0, 5, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+06: x = 1 * x # copy
__pyx_t_1 = PyNumber_Multiply(__pyx_int_1, __pyx_v_x); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 6, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF_SET(__pyx_v_x, __pyx_t_1); __pyx_t_1 = 0;
+07: y = np.empty_like(x)
__pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 7, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, __pyx_n_s_empty_like); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 7, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; __pyx_t_2 = NULL; if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_3))) { __pyx_t_2 = PyMethod_GET_SELF(__pyx_t_3); if (likely(__pyx_t_2)) { PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_3); __Pyx_INCREF(__pyx_t_2); __Pyx_INCREF(function); __Pyx_DECREF_SET(__pyx_t_3, function); } } if (!__pyx_t_2) { __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_3, __pyx_v_x); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 7, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); } else { #if CYTHON_FAST_PYCALL if (PyFunction_Check(__pyx_t_3)) { PyObject *__pyx_temp[2] = {__pyx_t_2, __pyx_v_x}; __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_3, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 7, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; __Pyx_GOTREF(__pyx_t_1); } else #endif #if CYTHON_FAST_PYCCALL if (__Pyx_PyFastCFunction_Check(__pyx_t_3)) { PyObject *__pyx_temp[2] = {__pyx_t_2, __pyx_v_x}; __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_3, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 7, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; __Pyx_GOTREF(__pyx_t_1); } else #endif { __pyx_t_4 = PyTuple_New(1+1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 7, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_4); __Pyx_GIVEREF(__pyx_t_2); PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_2); __pyx_t_2 = NULL; __Pyx_INCREF(__pyx_v_x); __Pyx_GIVEREF(__pyx_v_x); PyTuple_SET_ITEM(__pyx_t_4, 0+1, __pyx_v_x); __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_t_4, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 7, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; } } __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __pyx_v_y = __pyx_t_1; 
__pyx_t_1 = 0;
+08: y[0] = x[0]
__pyx_t_1 = __Pyx_GetItemInt(__pyx_v_x, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 8, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); if (unlikely(__Pyx_SetItemInt(__pyx_v_y, 0, __pyx_t_1, long, 1, __Pyx_PyInt_From_long, 0, 0, 1) < 0)) __PYX_ERR(0, 8, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+09: y[-1] = x[-1]
__pyx_t_1 = __Pyx_GetItemInt(__pyx_v_x, -1L, long, 1, __Pyx_PyInt_From_long, 0, 1, 1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 9, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); if (unlikely(__Pyx_SetItemInt(__pyx_v_y, -1L, __pyx_t_1, long, 1, __Pyx_PyInt_From_long, 0, 1, 1) < 0)) __PYX_ERR(0, 9, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+10: cdef int i, N = len(x)
__pyx_t_5 = PyObject_Length(__pyx_v_x); if (unlikely(__pyx_t_5 == ((Py_ssize_t)-1))) __PYX_ERR(0, 10, __pyx_L1_error) __pyx_v_N = __pyx_t_5;
+11: for _ in range(steps):
__pyx_t_6 = __pyx_v_steps; __pyx_t_7 = __pyx_t_6; for (__pyx_t_8 = 0; __pyx_t_8 < __pyx_t_7; __pyx_t_8+=1) { __pyx_v__ = __pyx_t_8;
+12: for i in range(1, N-1):
__pyx_t_9 = (__pyx_v_N - 1); __pyx_t_10 = __pyx_t_9; for (__pyx_t_11 = 1; __pyx_t_11 < __pyx_t_10; __pyx_t_11+=1) { __pyx_v_i = __pyx_t_11;
+13: y[i] = .25 * ( x[i-1] + 2 * x[i] + x[i+1] )
__pyx_t_12 = (__pyx_v_i - 1); __pyx_t_1 = __Pyx_GetItemInt(__pyx_v_x, __pyx_t_12, long, 1, __Pyx_PyInt_From_long, 0, 1, 1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 13, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __pyx_t_3 = __Pyx_GetItemInt(__pyx_v_x, __pyx_v_i, int, 1, __Pyx_PyInt_From_int, 0, 1, 1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 13, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __pyx_t_4 = PyNumber_Multiply(__pyx_int_2, __pyx_t_3); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 13, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_4); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __pyx_t_3 = PyNumber_Add(__pyx_t_1, __pyx_t_4); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 13, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; __pyx_t_12 = (__pyx_v_i + 1); __pyx_t_4 = __Pyx_GetItemInt(__pyx_v_x, __pyx_t_12, long, 1, __Pyx_PyInt_From_long, 0, 1, 1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 13, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_4); __pyx_t_1 = PyNumber_Add(__pyx_t_3, __pyx_t_4); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 13, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; __pyx_t_4 = PyNumber_Multiply(__pyx_float__25, __pyx_t_1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 13, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_4); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; if (unlikely(__Pyx_SetItemInt(__pyx_v_y, __pyx_v_i, __pyx_t_4, int, 1, __Pyx_PyInt_From_int, 0, 1, 1) < 0)) __PYX_ERR(0, 13, __pyx_L1_error) __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; }
+14: x, y = y, x # swap for next step
__pyx_t_13 = __pyx_v_y; __pyx_t_14 = __pyx_v_x; __pyx_v_x = __pyx_t_13; __pyx_t_13 = 0; __pyx_v_y = __pyx_t_14; __pyx_t_14 = 0; }
+15: return x
__Pyx_XDECREF(__pyx_r); __Pyx_INCREF(__pyx_v_x); __pyx_r = __pyx_v_x; goto __pyx_L0;
c2 = %timeit -o blur_cython2(x, steps)
t_c2 = c2.best
times.append(t_c2)
labels.append("cython (loops)")
plot_times()
468 ms ± 16.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Just by making sure the iteration variables are defined as integers, we can save about 25% of the time.
The biggest key to optimizing with Cython is getting that yellow out of your loops.
The more deeply nested a bit of code is within a loop,
the more often it is called, and the more value you can get out of making it fast.
In Cython, fast means avoiding Python (getting rid of yellow).
To get rid of Python calls, we need to tell Python about the numpy arrays x
and y
:
%%cython -a
import numpy as np
cimport numpy as np
def blur_cython_typed(np.ndarray[double, ndim=1] x_, int steps=1024):
    """Repeatedly blur a 1-D array of doubles: each interior point
    becomes a weighted average (1/4, 1/2, 1/4) of itself and its two
    neighbors, ``steps`` times.

    Declaring ``x`` and ``y`` as typed ndarray buffers lets Cython turn
    the inner-loop indexing into direct C memory access.
    """
    cdef size_t i, N = x_.shape[0]
    cdef np.ndarray[double, ndim=1] x
    cdef np.ndarray[double, ndim=1] y
    x = 1 * x_  # copy, so the caller's array is not modified in place
    y = np.empty_like(x_)
    # Boundary values are held fixed for every step.
    y[0] = x[0]
    y[-1] = x[-1]
    for _ in range(steps):
        for i in range(1, N-1):
            # Fix: read the stencil from x (the previous step's values),
            # not from y.  The original read y[i-1], y[i], y[i+1], which
            # (a) consumed uninitialized interior values from
            # np.empty_like on the very first pass, and (b) computed a
            # different in-place recurrence than blur_cython2 and
            # blur_cython_optimized, which both read the previous buffer.
            y[i] = .25 * ( x[i-1] + 2 * x[i] + x[i+1] )
        x, y = y, x  # swap buffers for the next step
    return x
Generated by Cython 0.28.5
Yellow lines hint at Python interaction.
Click on a line that starts with a "+
" to see the C code that Cython generated for it.
01:
+02: import numpy as np
__pyx_t_1 = __Pyx_Import(__pyx_n_s_numpy, 0, 0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 2, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); if (PyDict_SetItem(__pyx_d, __pyx_n_s_np, __pyx_t_1) < 0) __PYX_ERR(0, 2, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; /* … */ __pyx_t_1 = __Pyx_PyDict_NewPresized(0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 2, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); if (PyDict_SetItem(__pyx_d, __pyx_n_s_test, __pyx_t_1) < 0) __PYX_ERR(0, 2, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
03: cimport numpy as np
04:
+05: def blur_cython_typed(np.ndarray[double, ndim=1] x_, int steps=1024):
/* Python wrapper */ static PyObject *__pyx_pw_46_cython_magic_75fcd861579744caad1bf0811063d4a0_1blur_cython_typed(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ static PyMethodDef __pyx_mdef_46_cython_magic_75fcd861579744caad1bf0811063d4a0_1blur_cython_typed = {"blur_cython_typed", (PyCFunction)__pyx_pw_46_cython_magic_75fcd861579744caad1bf0811063d4a0_1blur_cython_typed, METH_VARARGS|METH_KEYWORDS, 0}; static PyObject *__pyx_pw_46_cython_magic_75fcd861579744caad1bf0811063d4a0_1blur_cython_typed(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { PyArrayObject *__pyx_v_x_ = 0; int __pyx_v_steps; PyObject *__pyx_r = 0; __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("blur_cython_typed (wrapper)", 0); { static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_x,&__pyx_n_s_steps,0}; PyObject* values[2] = {0,0}; if (unlikely(__pyx_kwds)) { Py_ssize_t kw_args; const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); switch (pos_args) { case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); CYTHON_FALLTHROUGH; case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); CYTHON_FALLTHROUGH; case 0: break; default: goto __pyx_L5_argtuple_error; } kw_args = PyDict_Size(__pyx_kwds); switch (pos_args) { case 0: if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_x)) != 0)) kw_args--; else goto __pyx_L5_argtuple_error; CYTHON_FALLTHROUGH; case 1: if (kw_args > 0) { PyObject* value = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_steps); if (value) { values[1] = value; kw_args--; } } } if (unlikely(kw_args > 0)) { if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "blur_cython_typed") < 0)) __PYX_ERR(0, 5, __pyx_L3_error) } } else { switch (PyTuple_GET_SIZE(__pyx_args)) { case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); CYTHON_FALLTHROUGH; case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); break; default: goto __pyx_L5_argtuple_error; } } __pyx_v_x_ = ((PyArrayObject 
*)values[0]); if (values[1]) { __pyx_v_steps = __Pyx_PyInt_As_int(values[1]); if (unlikely((__pyx_v_steps == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 5, __pyx_L3_error) } else { __pyx_v_steps = ((int)0x400); } } goto __pyx_L4_argument_unpacking_done; __pyx_L5_argtuple_error:; __Pyx_RaiseArgtupleInvalid("blur_cython_typed", 0, 1, 2, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 5, __pyx_L3_error) __pyx_L3_error:; __Pyx_AddTraceback("_cython_magic_75fcd861579744caad1bf0811063d4a0.blur_cython_typed", __pyx_clineno, __pyx_lineno, __pyx_filename); __Pyx_RefNannyFinishContext(); return NULL; __pyx_L4_argument_unpacking_done:; if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_x_), __pyx_ptype_5numpy_ndarray, 1, "x_", 0))) __PYX_ERR(0, 5, __pyx_L1_error) __pyx_r = __pyx_pf_46_cython_magic_75fcd861579744caad1bf0811063d4a0_blur_cython_typed(__pyx_self, __pyx_v_x_, __pyx_v_steps); /* function exit code */ goto __pyx_L0; __pyx_L1_error:; __pyx_r = NULL; __pyx_L0:; __Pyx_RefNannyFinishContext(); return __pyx_r; } static PyObject *__pyx_pf_46_cython_magic_75fcd861579744caad1bf0811063d4a0_blur_cython_typed(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_x_, int __pyx_v_steps) { size_t __pyx_v_i; size_t __pyx_v_N; PyArrayObject *__pyx_v_x = 0; PyArrayObject *__pyx_v_y = 0; CYTHON_UNUSED int __pyx_v__; __Pyx_LocalBuf_ND __pyx_pybuffernd_x; __Pyx_Buffer __pyx_pybuffer_x; __Pyx_LocalBuf_ND __pyx_pybuffernd_x_; __Pyx_Buffer __pyx_pybuffer_x_; __Pyx_LocalBuf_ND __pyx_pybuffernd_y; __Pyx_Buffer __pyx_pybuffer_y; PyObject *__pyx_r = NULL; __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("blur_cython_typed", 0); __pyx_pybuffer_x.pybuffer.buf = NULL; __pyx_pybuffer_x.refcount = 0; __pyx_pybuffernd_x.data = NULL; __pyx_pybuffernd_x.rcbuffer = &__pyx_pybuffer_x; __pyx_pybuffer_y.pybuffer.buf = NULL; __pyx_pybuffer_y.refcount = 0; __pyx_pybuffernd_y.data = NULL; __pyx_pybuffernd_y.rcbuffer = &__pyx_pybuffer_y; __pyx_pybuffer_x_.pybuffer.buf = NULL; 
__pyx_pybuffer_x_.refcount = 0; __pyx_pybuffernd_x_.data = NULL; __pyx_pybuffernd_x_.rcbuffer = &__pyx_pybuffer_x_; { __Pyx_BufFmt_StackElem __pyx_stack[1]; if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_x_.rcbuffer->pybuffer, (PyObject*)__pyx_v_x_, &__Pyx_TypeInfo_double, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) __PYX_ERR(0, 5, __pyx_L1_error) } __pyx_pybuffernd_x_.diminfo[0].strides = __pyx_pybuffernd_x_.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_x_.diminfo[0].shape = __pyx_pybuffernd_x_.rcbuffer->pybuffer.shape[0]; /* … */ /* function exit code */ __pyx_L1_error:; __Pyx_XDECREF(__pyx_t_1); __Pyx_XDECREF(__pyx_t_7); __Pyx_XDECREF(__pyx_t_8); __Pyx_XDECREF(__pyx_t_9); { PyObject *__pyx_type, *__pyx_value, *__pyx_tb; __Pyx_PyThreadState_declare __Pyx_PyThreadState_assign __Pyx_ErrFetch(&__pyx_type, &__pyx_value, &__pyx_tb); __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_x.rcbuffer->pybuffer); __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_x_.rcbuffer->pybuffer); __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_y.rcbuffer->pybuffer); __Pyx_ErrRestore(__pyx_type, __pyx_value, __pyx_tb);} __Pyx_AddTraceback("_cython_magic_75fcd861579744caad1bf0811063d4a0.blur_cython_typed", __pyx_clineno, __pyx_lineno, __pyx_filename); __pyx_r = NULL; goto __pyx_L2; __pyx_L0:; __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_x.rcbuffer->pybuffer); __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_x_.rcbuffer->pybuffer); __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_y.rcbuffer->pybuffer); __pyx_L2:; __Pyx_XDECREF((PyObject *)__pyx_v_x); __Pyx_XDECREF((PyObject *)__pyx_v_y); __Pyx_XGIVEREF(__pyx_r); __Pyx_RefNannyFinishContext(); return __pyx_r; } /* … */ __pyx_tuple__11 = PyTuple_Pack(7, __pyx_n_s_x, __pyx_n_s_steps, __pyx_n_s_i, __pyx_n_s_N, __pyx_n_s_x_2, __pyx_n_s_y, __pyx_n_s__10); if (unlikely(!__pyx_tuple__11)) __PYX_ERR(0, 5, __pyx_L1_error) __Pyx_GOTREF(__pyx_tuple__11); __Pyx_GIVEREF(__pyx_tuple__11); /* … */ __pyx_t_1 = 
PyCFunction_NewEx(&__pyx_mdef_46_cython_magic_75fcd861579744caad1bf0811063d4a0_1blur_cython_typed, NULL, __pyx_n_s_cython_magic_75fcd861579744caad); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 5, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); if (PyDict_SetItem(__pyx_d, __pyx_n_s_blur_cython_typed, __pyx_t_1) < 0) __PYX_ERR(0, 5, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
06: # x = 1 * x # copy
+07: cdef size_t i, N = x_.shape[0]
__pyx_v_N = (__pyx_v_x_->dimensions[0]);
08: cdef np.ndarray[double, ndim=1] x
09: cdef np.ndarray[double, ndim=1] y
+10: x = 1 * x_
__pyx_t_1 = PyNumber_Multiply(__pyx_int_1, ((PyObject *)__pyx_v_x_)); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 10, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 10, __pyx_L1_error) __pyx_t_2 = ((PyArrayObject *)__pyx_t_1); { __Pyx_BufFmt_StackElem __pyx_stack[1]; __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_x.rcbuffer->pybuffer); __pyx_t_3 = __Pyx_GetBufferAndValidate(&__pyx_pybuffernd_x.rcbuffer->pybuffer, (PyObject*)__pyx_t_2, &__Pyx_TypeInfo_double, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack); if (unlikely(__pyx_t_3 < 0)) { PyErr_Fetch(&__pyx_t_4, &__pyx_t_5, &__pyx_t_6); if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_x.rcbuffer->pybuffer, (PyObject*)__pyx_v_x, &__Pyx_TypeInfo_double, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) { Py_XDECREF(__pyx_t_4); Py_XDECREF(__pyx_t_5); Py_XDECREF(__pyx_t_6); __Pyx_RaiseBufferFallbackError(); } else { PyErr_Restore(__pyx_t_4, __pyx_t_5, __pyx_t_6); } __pyx_t_4 = __pyx_t_5 = __pyx_t_6 = 0; } __pyx_pybuffernd_x.diminfo[0].strides = __pyx_pybuffernd_x.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_x.diminfo[0].shape = __pyx_pybuffernd_x.rcbuffer->pybuffer.shape[0]; if (unlikely(__pyx_t_3 < 0)) __PYX_ERR(0, 10, __pyx_L1_error) } __pyx_t_2 = 0; __pyx_v_x = ((PyArrayObject *)__pyx_t_1); __pyx_t_1 = 0;
+11: y = np.empty_like(x_)
__pyx_t_7 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 11, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_7); __pyx_t_8 = __Pyx_PyObject_GetAttrStr(__pyx_t_7, __pyx_n_s_empty_like); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 11, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_8); __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; __pyx_t_7 = NULL; if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_8))) { __pyx_t_7 = PyMethod_GET_SELF(__pyx_t_8); if (likely(__pyx_t_7)) { PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_8); __Pyx_INCREF(__pyx_t_7); __Pyx_INCREF(function); __Pyx_DECREF_SET(__pyx_t_8, function); } } if (!__pyx_t_7) { __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_8, ((PyObject *)__pyx_v_x_)); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 11, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); } else { #if CYTHON_FAST_PYCALL if (PyFunction_Check(__pyx_t_8)) { PyObject *__pyx_temp[2] = {__pyx_t_7, ((PyObject *)__pyx_v_x_)}; __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_8, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 11, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_7); __pyx_t_7 = 0; __Pyx_GOTREF(__pyx_t_1); } else #endif #if CYTHON_FAST_PYCCALL if (__Pyx_PyFastCFunction_Check(__pyx_t_8)) { PyObject *__pyx_temp[2] = {__pyx_t_7, ((PyObject *)__pyx_v_x_)}; __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_8, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 11, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_7); __pyx_t_7 = 0; __Pyx_GOTREF(__pyx_t_1); } else #endif { __pyx_t_9 = PyTuple_New(1+1); if (unlikely(!__pyx_t_9)) __PYX_ERR(0, 11, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_9); __Pyx_GIVEREF(__pyx_t_7); PyTuple_SET_ITEM(__pyx_t_9, 0, __pyx_t_7); __pyx_t_7 = NULL; __Pyx_INCREF(((PyObject *)__pyx_v_x_)); __Pyx_GIVEREF(((PyObject *)__pyx_v_x_)); PyTuple_SET_ITEM(__pyx_t_9, 0+1, ((PyObject *)__pyx_v_x_)); __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_8, __pyx_t_9, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 11, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); 
__Pyx_DECREF(__pyx_t_9); __pyx_t_9 = 0; } } __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 11, __pyx_L1_error) __pyx_t_10 = ((PyArrayObject *)__pyx_t_1); { __Pyx_BufFmt_StackElem __pyx_stack[1]; __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_y.rcbuffer->pybuffer); __pyx_t_3 = __Pyx_GetBufferAndValidate(&__pyx_pybuffernd_y.rcbuffer->pybuffer, (PyObject*)__pyx_t_10, &__Pyx_TypeInfo_double, PyBUF_FORMAT| PyBUF_STRIDES| PyBUF_WRITABLE, 1, 0, __pyx_stack); if (unlikely(__pyx_t_3 < 0)) { PyErr_Fetch(&__pyx_t_6, &__pyx_t_5, &__pyx_t_4); if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_y.rcbuffer->pybuffer, (PyObject*)__pyx_v_y, &__Pyx_TypeInfo_double, PyBUF_FORMAT| PyBUF_STRIDES| PyBUF_WRITABLE, 1, 0, __pyx_stack) == -1)) { Py_XDECREF(__pyx_t_6); Py_XDECREF(__pyx_t_5); Py_XDECREF(__pyx_t_4); __Pyx_RaiseBufferFallbackError(); } else { PyErr_Restore(__pyx_t_6, __pyx_t_5, __pyx_t_4); } __pyx_t_6 = __pyx_t_5 = __pyx_t_4 = 0; } __pyx_pybuffernd_y.diminfo[0].strides = __pyx_pybuffernd_y.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_y.diminfo[0].shape = __pyx_pybuffernd_y.rcbuffer->pybuffer.shape[0]; if (unlikely(__pyx_t_3 < 0)) __PYX_ERR(0, 11, __pyx_L1_error) } __pyx_t_10 = 0; __pyx_v_y = ((PyArrayObject *)__pyx_t_1); __pyx_t_1 = 0;
+12: y[0] = x[0]
__pyx_t_11 = 0; __pyx_t_3 = -1; if (__pyx_t_11 < 0) { __pyx_t_11 += __pyx_pybuffernd_x.diminfo[0].shape; if (unlikely(__pyx_t_11 < 0)) __pyx_t_3 = 0; } else if (unlikely(__pyx_t_11 >= __pyx_pybuffernd_x.diminfo[0].shape)) __pyx_t_3 = 0; if (unlikely(__pyx_t_3 != -1)) { __Pyx_RaiseBufferIndexError(__pyx_t_3); __PYX_ERR(0, 12, __pyx_L1_error) } __pyx_t_12 = 0; __pyx_t_3 = -1; if (__pyx_t_12 < 0) { __pyx_t_12 += __pyx_pybuffernd_y.diminfo[0].shape; if (unlikely(__pyx_t_12 < 0)) __pyx_t_3 = 0; } else if (unlikely(__pyx_t_12 >= __pyx_pybuffernd_y.diminfo[0].shape)) __pyx_t_3 = 0; if (unlikely(__pyx_t_3 != -1)) { __Pyx_RaiseBufferIndexError(__pyx_t_3); __PYX_ERR(0, 12, __pyx_L1_error) } *__Pyx_BufPtrStrided1d(double *, __pyx_pybuffernd_y.rcbuffer->pybuffer.buf, __pyx_t_12, __pyx_pybuffernd_y.diminfo[0].strides) = (*__Pyx_BufPtrStrided1d(double *, __pyx_pybuffernd_x.rcbuffer->pybuffer.buf, __pyx_t_11, __pyx_pybuffernd_x.diminfo[0].strides));
+13: y[-1] = x[-1]
__pyx_t_13 = -1L; __pyx_t_3 = -1; if (__pyx_t_13 < 0) { __pyx_t_13 += __pyx_pybuffernd_x.diminfo[0].shape; if (unlikely(__pyx_t_13 < 0)) __pyx_t_3 = 0; } else if (unlikely(__pyx_t_13 >= __pyx_pybuffernd_x.diminfo[0].shape)) __pyx_t_3 = 0; if (unlikely(__pyx_t_3 != -1)) { __Pyx_RaiseBufferIndexError(__pyx_t_3); __PYX_ERR(0, 13, __pyx_L1_error) } __pyx_t_14 = -1L; __pyx_t_3 = -1; if (__pyx_t_14 < 0) { __pyx_t_14 += __pyx_pybuffernd_y.diminfo[0].shape; if (unlikely(__pyx_t_14 < 0)) __pyx_t_3 = 0; } else if (unlikely(__pyx_t_14 >= __pyx_pybuffernd_y.diminfo[0].shape)) __pyx_t_3 = 0; if (unlikely(__pyx_t_3 != -1)) { __Pyx_RaiseBufferIndexError(__pyx_t_3); __PYX_ERR(0, 13, __pyx_L1_error) } *__Pyx_BufPtrStrided1d(double *, __pyx_pybuffernd_y.rcbuffer->pybuffer.buf, __pyx_t_14, __pyx_pybuffernd_y.diminfo[0].strides) = (*__Pyx_BufPtrStrided1d(double *, __pyx_pybuffernd_x.rcbuffer->pybuffer.buf, __pyx_t_13, __pyx_pybuffernd_x.diminfo[0].strides));
+14: for _ in range(steps):
__pyx_t_3 = __pyx_v_steps; __pyx_t_15 = __pyx_t_3; for (__pyx_t_16 = 0; __pyx_t_16 < __pyx_t_15; __pyx_t_16+=1) { __pyx_v__ = __pyx_t_16;
+15: for i in range(1, N-1):
__pyx_t_17 = (__pyx_v_N - 1); __pyx_t_18 = __pyx_t_17; for (__pyx_t_19 = 1; __pyx_t_19 < __pyx_t_18; __pyx_t_19+=1) { __pyx_v_i = __pyx_t_19;
+16: y[i] = .25 * ( y[i-1] + 2 * y[i] + y[i+1] )
__pyx_t_20 = (__pyx_v_i - 1); __pyx_t_21 = -1; if (unlikely(__pyx_t_20 >= (size_t)__pyx_pybuffernd_y.diminfo[0].shape)) __pyx_t_21 = 0; if (unlikely(__pyx_t_21 != -1)) { __Pyx_RaiseBufferIndexError(__pyx_t_21); __PYX_ERR(0, 16, __pyx_L1_error) } __pyx_t_22 = __pyx_v_i; __pyx_t_21 = -1; if (unlikely(__pyx_t_22 >= (size_t)__pyx_pybuffernd_y.diminfo[0].shape)) __pyx_t_21 = 0; if (unlikely(__pyx_t_21 != -1)) { __Pyx_RaiseBufferIndexError(__pyx_t_21); __PYX_ERR(0, 16, __pyx_L1_error) } __pyx_t_23 = (__pyx_v_i + 1); __pyx_t_21 = -1; if (unlikely(__pyx_t_23 >= (size_t)__pyx_pybuffernd_y.diminfo[0].shape)) __pyx_t_21 = 0; if (unlikely(__pyx_t_21 != -1)) { __Pyx_RaiseBufferIndexError(__pyx_t_21); __PYX_ERR(0, 16, __pyx_L1_error) } __pyx_t_24 = __pyx_v_i; __pyx_t_21 = -1; if (unlikely(__pyx_t_24 >= (size_t)__pyx_pybuffernd_y.diminfo[0].shape)) __pyx_t_21 = 0; if (unlikely(__pyx_t_21 != -1)) { __Pyx_RaiseBufferIndexError(__pyx_t_21); __PYX_ERR(0, 16, __pyx_L1_error) } *__Pyx_BufPtrStrided1d(double *, __pyx_pybuffernd_y.rcbuffer->pybuffer.buf, __pyx_t_24, __pyx_pybuffernd_y.diminfo[0].strides) = (.25 * (((*__Pyx_BufPtrStrided1d(double *, __pyx_pybuffernd_y.rcbuffer->pybuffer.buf, __pyx_t_20, __pyx_pybuffernd_y.diminfo[0].strides)) + (2.0 * (*__Pyx_BufPtrStrided1d(double *, __pyx_pybuffernd_y.rcbuffer->pybuffer.buf, __pyx_t_22, __pyx_pybuffernd_y.diminfo[0].strides)))) + (*__Pyx_BufPtrStrided1d(double *, __pyx_pybuffernd_y.rcbuffer->pybuffer.buf, __pyx_t_23, __pyx_pybuffernd_y.diminfo[0].strides)))); }
+17: x, y = y, x # swap for next step
__pyx_t_4 = ((PyObject *)__pyx_v_y); __pyx_t_5 = ((PyObject *)__pyx_v_x); { __Pyx_BufFmt_StackElem __pyx_stack[1]; __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_x.rcbuffer->pybuffer); __pyx_t_21 = __Pyx_GetBufferAndValidate(&__pyx_pybuffernd_x.rcbuffer->pybuffer, (PyObject*)((PyArrayObject *)__pyx_t_4), &__Pyx_TypeInfo_double, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack); if (unlikely(__pyx_t_21 < 0)) { PyErr_Fetch(&__pyx_t_6, &__pyx_t_25, &__pyx_t_26); if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_x.rcbuffer->pybuffer, (PyObject*)__pyx_v_x, &__Pyx_TypeInfo_double, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) { Py_XDECREF(__pyx_t_6); Py_XDECREF(__pyx_t_25); Py_XDECREF(__pyx_t_26); __Pyx_RaiseBufferFallbackError(); } else { PyErr_Restore(__pyx_t_6, __pyx_t_25, __pyx_t_26); } __pyx_t_6 = __pyx_t_25 = __pyx_t_26 = 0; } __pyx_pybuffernd_x.diminfo[0].strides = __pyx_pybuffernd_x.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_x.diminfo[0].shape = __pyx_pybuffernd_x.rcbuffer->pybuffer.shape[0]; if (unlikely(__pyx_t_21 < 0)) __PYX_ERR(0, 17, __pyx_L1_error) } __pyx_v_x = ((PyArrayObject *)__pyx_t_4); __pyx_t_4 = 0; { __Pyx_BufFmt_StackElem __pyx_stack[1]; __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_y.rcbuffer->pybuffer); __pyx_t_21 = __Pyx_GetBufferAndValidate(&__pyx_pybuffernd_y.rcbuffer->pybuffer, (PyObject*)((PyArrayObject *)__pyx_t_5), &__Pyx_TypeInfo_double, PyBUF_FORMAT| PyBUF_STRIDES| PyBUF_WRITABLE, 1, 0, __pyx_stack); if (unlikely(__pyx_t_21 < 0)) { PyErr_Fetch(&__pyx_t_4, &__pyx_t_26, &__pyx_t_25); if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_y.rcbuffer->pybuffer, (PyObject*)__pyx_v_y, &__Pyx_TypeInfo_double, PyBUF_FORMAT| PyBUF_STRIDES| PyBUF_WRITABLE, 1, 0, __pyx_stack) == -1)) { Py_XDECREF(__pyx_t_4); Py_XDECREF(__pyx_t_26); Py_XDECREF(__pyx_t_25); __Pyx_RaiseBufferFallbackError(); } else { PyErr_Restore(__pyx_t_4, __pyx_t_26, __pyx_t_25); } __pyx_t_4 = __pyx_t_26 = __pyx_t_25 = 0; } __pyx_pybuffernd_y.diminfo[0].strides = 
__pyx_pybuffernd_y.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_y.diminfo[0].shape = __pyx_pybuffernd_y.rcbuffer->pybuffer.shape[0]; if (unlikely(__pyx_t_21 < 0)) __PYX_ERR(0, 17, __pyx_L1_error) } __pyx_v_y = ((PyArrayObject *)__pyx_t_5); __pyx_t_5 = 0; }
+18: return x
__Pyx_XDECREF(__pyx_r); __Pyx_INCREF(((PyObject *)__pyx_v_x)); __pyx_r = ((PyObject *)__pyx_v_x); goto __pyx_L0;
ct = %timeit -o y = blur_cython_typed(x, steps)
t_ct = ct.best
times.append(t_ct)
labels.append("cython (types)")
plot_times()
7.15 ms ± 216 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
We can further optimize with Cython compiler directives, which disable bounds checking and negative indexing, and avoid the Python variable swapping by using row indices into a single two-row array:
%%cython -a
#cython: boundscheck=False
#cython: wraparound=False
import numpy as np
cimport numpy as np
def blur_cython_optimized(np.ndarray[double, ndim=1] x, int steps=1024):
    """Repeatedly blur a 1-D array of doubles, using a single (2, N)
    scratch array as a pair of ping-pong buffers instead of swapping two
    Python variables each step.

    Combined with the boundscheck/wraparound directives above, the inner
    loop compiles to pure C array arithmetic.
    """
    cdef size_t N = x.shape[0]
    cdef np.ndarray[double, ndim=2] y
    y = np.empty((2, N), dtype=np.float64)
    # Row 0 seeds the first step with a full copy of x; row 1 only needs
    # its boundary values, since the loop writes every interior element
    # before it is read and boundaries are held fixed for every step.
    y[0,:] = x
    y[1,0] = x[0]
    y[1,N-1] = x[N-1]
    cdef size_t _, i, j=0, k=1
    for _ in range(steps):
        j = _ % 2   # row read this step (previous step's result)
        k = 1 - j   # row written this step
        for i in range(1, N-1):
            y[k,i] = .25 * ( y[j,i-1] + 2 * y[j,i] + y[j,i+1] )
    # k still indexes the last row written.
    return y[k]
Generated by Cython 0.28.5
Yellow lines hint at Python interaction.
Click on a line that starts with a "+
" to see the C code that Cython generated for it.
+01: #cython: boundscheck=False
__pyx_t_1 = __Pyx_PyDict_NewPresized(0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 1, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); if (PyDict_SetItem(__pyx_d, __pyx_n_s_test, __pyx_t_1) < 0) __PYX_ERR(0, 1, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
02: #cython: wraparound=False
03:
+04: import numpy as np
__pyx_t_1 = __Pyx_Import(__pyx_n_s_numpy, 0, 0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 4, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); if (PyDict_SetItem(__pyx_d, __pyx_n_s_np, __pyx_t_1) < 0) __PYX_ERR(0, 4, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
05: cimport numpy as np
06:
+07: def blur_cython_optimized(np.ndarray[double, ndim=1] x, int steps=1024):
/* Python wrapper */ static PyObject *__pyx_pw_46_cython_magic_4df3d84884a77cd6ac694979bff1c004_1blur_cython_optimized(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ static PyMethodDef __pyx_mdef_46_cython_magic_4df3d84884a77cd6ac694979bff1c004_1blur_cython_optimized = {"blur_cython_optimized", (PyCFunction)__pyx_pw_46_cython_magic_4df3d84884a77cd6ac694979bff1c004_1blur_cython_optimized, METH_VARARGS|METH_KEYWORDS, 0}; static PyObject *__pyx_pw_46_cython_magic_4df3d84884a77cd6ac694979bff1c004_1blur_cython_optimized(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { PyArrayObject *__pyx_v_x = 0; int __pyx_v_steps; PyObject *__pyx_r = 0; __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("blur_cython_optimized (wrapper)", 0); { static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_x,&__pyx_n_s_steps,0}; PyObject* values[2] = {0,0}; if (unlikely(__pyx_kwds)) { Py_ssize_t kw_args; const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); switch (pos_args) { case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); CYTHON_FALLTHROUGH; case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); CYTHON_FALLTHROUGH; case 0: break; default: goto __pyx_L5_argtuple_error; } kw_args = PyDict_Size(__pyx_kwds); switch (pos_args) { case 0: if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_x)) != 0)) kw_args--; else goto __pyx_L5_argtuple_error; CYTHON_FALLTHROUGH; case 1: if (kw_args > 0) { PyObject* value = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_steps); if (value) { values[1] = value; kw_args--; } } } if (unlikely(kw_args > 0)) { if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "blur_cython_optimized") < 0)) __PYX_ERR(0, 7, __pyx_L3_error) } } else { switch (PyTuple_GET_SIZE(__pyx_args)) { case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); CYTHON_FALLTHROUGH; case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); break; default: goto __pyx_L5_argtuple_error; } } __pyx_v_x = 
((PyArrayObject *)values[0]); if (values[1]) { __pyx_v_steps = __Pyx_PyInt_As_int(values[1]); if (unlikely((__pyx_v_steps == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 7, __pyx_L3_error) } else { __pyx_v_steps = ((int)0x400); } } goto __pyx_L4_argument_unpacking_done; __pyx_L5_argtuple_error:; __Pyx_RaiseArgtupleInvalid("blur_cython_optimized", 0, 1, 2, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 7, __pyx_L3_error) __pyx_L3_error:; __Pyx_AddTraceback("_cython_magic_4df3d84884a77cd6ac694979bff1c004.blur_cython_optimized", __pyx_clineno, __pyx_lineno, __pyx_filename); __Pyx_RefNannyFinishContext(); return NULL; __pyx_L4_argument_unpacking_done:; if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_x), __pyx_ptype_5numpy_ndarray, 1, "x", 0))) __PYX_ERR(0, 7, __pyx_L1_error) __pyx_r = __pyx_pf_46_cython_magic_4df3d84884a77cd6ac694979bff1c004_blur_cython_optimized(__pyx_self, __pyx_v_x, __pyx_v_steps); /* function exit code */ goto __pyx_L0; __pyx_L1_error:; __pyx_r = NULL; __pyx_L0:; __Pyx_RefNannyFinishContext(); return __pyx_r; } static PyObject *__pyx_pf_46_cython_magic_4df3d84884a77cd6ac694979bff1c004_blur_cython_optimized(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_x, int __pyx_v_steps) { size_t __pyx_v_N; PyArrayObject *__pyx_v_y = 0; size_t __pyx_v__; size_t __pyx_v_i; size_t __pyx_v_j; size_t __pyx_v_k; __Pyx_LocalBuf_ND __pyx_pybuffernd_x; __Pyx_Buffer __pyx_pybuffer_x; __Pyx_LocalBuf_ND __pyx_pybuffernd_y; __Pyx_Buffer __pyx_pybuffer_y; PyObject *__pyx_r = NULL; __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("blur_cython_optimized", 0); __pyx_pybuffer_y.pybuffer.buf = NULL; __pyx_pybuffer_y.refcount = 0; __pyx_pybuffernd_y.data = NULL; __pyx_pybuffernd_y.rcbuffer = &__pyx_pybuffer_y; __pyx_pybuffer_x.pybuffer.buf = NULL; __pyx_pybuffer_x.refcount = 0; __pyx_pybuffernd_x.data = NULL; __pyx_pybuffernd_x.rcbuffer = &__pyx_pybuffer_x; { __Pyx_BufFmt_StackElem __pyx_stack[1]; if 
(unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_x.rcbuffer->pybuffer, (PyObject*)__pyx_v_x, &__Pyx_TypeInfo_double, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) __PYX_ERR(0, 7, __pyx_L1_error) } __pyx_pybuffernd_x.diminfo[0].strides = __pyx_pybuffernd_x.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_x.diminfo[0].shape = __pyx_pybuffernd_x.rcbuffer->pybuffer.shape[0]; /* … */ /* function exit code */ __pyx_L1_error:; __Pyx_XDECREF(__pyx_t_1); __Pyx_XDECREF(__pyx_t_2); __Pyx_XDECREF(__pyx_t_3); __Pyx_XDECREF(__pyx_t_4); __Pyx_XDECREF(__pyx_t_5); { PyObject *__pyx_type, *__pyx_value, *__pyx_tb; __Pyx_PyThreadState_declare __Pyx_PyThreadState_assign __Pyx_ErrFetch(&__pyx_type, &__pyx_value, &__pyx_tb); __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_x.rcbuffer->pybuffer); __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_y.rcbuffer->pybuffer); __Pyx_ErrRestore(__pyx_type, __pyx_value, __pyx_tb);} __Pyx_AddTraceback("_cython_magic_4df3d84884a77cd6ac694979bff1c004.blur_cython_optimized", __pyx_clineno, __pyx_lineno, __pyx_filename); __pyx_r = NULL; goto __pyx_L2; __pyx_L0:; __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_x.rcbuffer->pybuffer); __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_y.rcbuffer->pybuffer); __pyx_L2:; __Pyx_XDECREF((PyObject *)__pyx_v_y); __Pyx_XGIVEREF(__pyx_r); __Pyx_RefNannyFinishContext(); return __pyx_r; } /* … */ __pyx_tuple__13 = PyTuple_Pack(8, __pyx_n_s_x, __pyx_n_s_steps, __pyx_n_s_N, __pyx_n_s_y, __pyx_n_s__12, __pyx_n_s_i, __pyx_n_s_j, __pyx_n_s_k); if (unlikely(!__pyx_tuple__13)) __PYX_ERR(0, 7, __pyx_L1_error) __Pyx_GOTREF(__pyx_tuple__13); __Pyx_GIVEREF(__pyx_tuple__13); /* … */ __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_46_cython_magic_4df3d84884a77cd6ac694979bff1c004_1blur_cython_optimized, NULL, __pyx_n_s_cython_magic_4df3d84884a77cd6ac); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 7, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); if (PyDict_SetItem(__pyx_d, __pyx_n_s_blur_cython_optimized, __pyx_t_1) < 0) __PYX_ERR(0, 7, __pyx_L1_error) 
__Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+08: cdef size_t N = x.shape[0]
__pyx_v_N = (__pyx_v_x->dimensions[0]);
09: cdef np.ndarray[double, ndim=2] y
+10: y = np.empty((2, N), dtype=np.float64)
__pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 10, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_empty); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 10, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __pyx_t_1 = __Pyx_PyInt_FromSize_t(__pyx_v_N); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 10, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __pyx_t_3 = PyTuple_New(2); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 10, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_INCREF(__pyx_int_2); __Pyx_GIVEREF(__pyx_int_2); PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_int_2); __Pyx_GIVEREF(__pyx_t_1); PyTuple_SET_ITEM(__pyx_t_3, 1, __pyx_t_1); __pyx_t_1 = 0; __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 10, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_GIVEREF(__pyx_t_3); PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_t_3); __pyx_t_3 = 0; __pyx_t_3 = __Pyx_PyDict_NewPresized(1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 10, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __pyx_t_4 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 10, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_4); __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_t_4, __pyx_n_s_float64); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 10, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; if (PyDict_SetItem(__pyx_t_3, __pyx_n_s_dtype, __pyx_t_5) < 0) __PYX_ERR(0, 10, __pyx_L1_error) __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; __pyx_t_5 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_1, __pyx_t_3); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 10, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; if (!(likely(((__pyx_t_5) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_5, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 10, __pyx_L1_error) __pyx_t_6 = ((PyArrayObject *)__pyx_t_5); { 
__Pyx_BufFmt_StackElem __pyx_stack[1]; __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_y.rcbuffer->pybuffer); __pyx_t_7 = __Pyx_GetBufferAndValidate(&__pyx_pybuffernd_y.rcbuffer->pybuffer, (PyObject*)__pyx_t_6, &__Pyx_TypeInfo_double, PyBUF_FORMAT| PyBUF_STRIDES| PyBUF_WRITABLE, 2, 0, __pyx_stack); if (unlikely(__pyx_t_7 < 0)) { PyErr_Fetch(&__pyx_t_8, &__pyx_t_9, &__pyx_t_10); if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_y.rcbuffer->pybuffer, (PyObject*)__pyx_v_y, &__Pyx_TypeInfo_double, PyBUF_FORMAT| PyBUF_STRIDES| PyBUF_WRITABLE, 2, 0, __pyx_stack) == -1)) { Py_XDECREF(__pyx_t_8); Py_XDECREF(__pyx_t_9); Py_XDECREF(__pyx_t_10); __Pyx_RaiseBufferFallbackError(); } else { PyErr_Restore(__pyx_t_8, __pyx_t_9, __pyx_t_10); } __pyx_t_8 = __pyx_t_9 = __pyx_t_10 = 0; } __pyx_pybuffernd_y.diminfo[0].strides = __pyx_pybuffernd_y.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_y.diminfo[0].shape = __pyx_pybuffernd_y.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_y.diminfo[1].strides = __pyx_pybuffernd_y.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_y.diminfo[1].shape = __pyx_pybuffernd_y.rcbuffer->pybuffer.shape[1]; if (unlikely(__pyx_t_7 < 0)) __PYX_ERR(0, 10, __pyx_L1_error) } __pyx_t_6 = 0; __pyx_v_y = ((PyArrayObject *)__pyx_t_5); __pyx_t_5 = 0;
+11: y[0,:] = x
__pyx_slice_ = PySlice_New(Py_None, Py_None, Py_None); if (unlikely(!__pyx_slice_)) __PYX_ERR(0, 11, __pyx_L1_error) __Pyx_GOTREF(__pyx_slice_); __Pyx_GIVEREF(__pyx_slice_); /* … */ if (unlikely(PyObject_SetItem(((PyObject *)__pyx_v_y), __pyx_tuple__2, ((PyObject *)__pyx_v_x)) < 0)) __PYX_ERR(0, 11, __pyx_L1_error) __pyx_tuple__2 = PyTuple_Pack(2, __pyx_int_0, __pyx_slice_); if (unlikely(!__pyx_tuple__2)) __PYX_ERR(0, 11, __pyx_L1_error) __Pyx_GOTREF(__pyx_tuple__2); __Pyx_GIVEREF(__pyx_tuple__2);
+12: y[1,0] = x[0]
__pyx_t_11 = 0; __pyx_t_12 = 1; __pyx_t_13 = 0; *__Pyx_BufPtrStrided2d(double *, __pyx_pybuffernd_y.rcbuffer->pybuffer.buf, __pyx_t_12, __pyx_pybuffernd_y.diminfo[0].strides, __pyx_t_13, __pyx_pybuffernd_y.diminfo[1].strides) = (*__Pyx_BufPtrStrided1d(double *, __pyx_pybuffernd_x.rcbuffer->pybuffer.buf, __pyx_t_11, __pyx_pybuffernd_x.diminfo[0].strides));
+13: y[1,N-1] = x[N-1]
__pyx_t_14 = (__pyx_v_N - 1); __pyx_t_15 = 1; __pyx_t_16 = (__pyx_v_N - 1); *__Pyx_BufPtrStrided2d(double *, __pyx_pybuffernd_y.rcbuffer->pybuffer.buf, __pyx_t_15, __pyx_pybuffernd_y.diminfo[0].strides, __pyx_t_16, __pyx_pybuffernd_y.diminfo[1].strides) = (*__Pyx_BufPtrStrided1d(double *, __pyx_pybuffernd_x.rcbuffer->pybuffer.buf, __pyx_t_14, __pyx_pybuffernd_x.diminfo[0].strides));
14:
+15: cdef size_t _, i, j=0, k=1
__pyx_v_j = 0; __pyx_v_k = 1;
+16: for _ in range(steps):
__pyx_t_7 = __pyx_v_steps; __pyx_t_17 = __pyx_t_7; for (__pyx_t_18 = 0; __pyx_t_18 < __pyx_t_17; __pyx_t_18+=1) { __pyx_v__ = __pyx_t_18;
+17: j = _ % 2
__pyx_v_j = (__pyx_v__ % 2);
+18: k = 1 - j
__pyx_v_k = (1 - __pyx_v_j);
+19: for i in range(1, N-1):
__pyx_t_19 = (__pyx_v_N - 1); __pyx_t_20 = __pyx_t_19; for (__pyx_t_21 = 1; __pyx_t_21 < __pyx_t_20; __pyx_t_21+=1) { __pyx_v_i = __pyx_t_21;
+20: y[k,i] = .25 * ( y[j,i-1] + 2 * y[j,i] + y[j,i+1] )
__pyx_t_22 = __pyx_v_j; __pyx_t_23 = (__pyx_v_i - 1); __pyx_t_24 = __pyx_v_j; __pyx_t_25 = __pyx_v_i; __pyx_t_26 = __pyx_v_j; __pyx_t_27 = (__pyx_v_i + 1); __pyx_t_28 = __pyx_v_k; __pyx_t_29 = __pyx_v_i; *__Pyx_BufPtrStrided2d(double *, __pyx_pybuffernd_y.rcbuffer->pybuffer.buf, __pyx_t_28, __pyx_pybuffernd_y.diminfo[0].strides, __pyx_t_29, __pyx_pybuffernd_y.diminfo[1].strides) = (.25 * (((*__Pyx_BufPtrStrided2d(double *, __pyx_pybuffernd_y.rcbuffer->pybuffer.buf, __pyx_t_22, __pyx_pybuffernd_y.diminfo[0].strides, __pyx_t_23, __pyx_pybuffernd_y.diminfo[1].strides)) + (2.0 * (*__Pyx_BufPtrStrided2d(double *, __pyx_pybuffernd_y.rcbuffer->pybuffer.buf, __pyx_t_24, __pyx_pybuffernd_y.diminfo[0].strides, __pyx_t_25, __pyx_pybuffernd_y.diminfo[1].strides)))) + (*__Pyx_BufPtrStrided2d(double *, __pyx_pybuffernd_y.rcbuffer->pybuffer.buf, __pyx_t_26, __pyx_pybuffernd_y.diminfo[0].strides, __pyx_t_27, __pyx_pybuffernd_y.diminfo[1].strides)))); } }
+21: return y[k]
__Pyx_XDECREF(__pyx_r); __pyx_t_5 = __Pyx_GetItemInt(((PyObject *)__pyx_v_y), __pyx_v_k, size_t, 0, __Pyx_PyInt_FromSize_t, 0, 0, 0); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 21, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); __pyx_r = __pyx_t_5; __pyx_t_5 = 0; goto __pyx_L0;
Note how there is now no yellow (Python C-API interaction) left in any of the loops — only in the initial copy of the input array.
# Benchmark the fully-typed Cython version; -o returns the TimeitResult
# so we can record its best time for the summary comparison plot.
copt = %timeit -o y = blur_cython_optimized(x, steps)
t_copt = copt.best
times.append(t_copt)
labels.append("cython (optimized)")
plot_times()
853 µs ± 55.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# Run the optimized blur once and plot the result against the original
# input (dashed) as a visual sanity check that the output is unchanged.
y = blur_cython_optimized(x, steps)
plt.plot(t, x, '--')
plt.plot(t, y)
[<matplotlib.lines.Line2D at 0x10c911b70>]
import numba
# NOTE: `numba.autojit` was deprecated and later removed from numba;
# `numba.jit` (lazy, type-specializing compilation) is its replacement.
@numba.jit
def blur_numba(x, steps=1024):
    """identical to blur_py, other than the decorator

    Apply `steps` iterations of a 3-point smoothing stencil to a copy of
    the 1-D array `x`, holding both endpoints fixed, and return the
    blurred array. The input `x` is not modified.
    """
    x = 1 * x  # copy, so the caller's array is left untouched
    y = np.empty_like(x)
    # endpoints are fixed boundary conditions; set them once
    y[0] = x[0]
    y[-1] = x[-1]
    for _ in range(steps):
        for i in range(1, len(x)-1):
            y[i] = .25 * ( y[i-1] + 2 * y[i] + y[i+1] )
        x, y = y, x # swap for next step
    return x
# y = blur_numba(x, steps)
# The first call triggers JIT compilation, so this %time measurement
# includes compile time, not just execution time.
%time blur_numba(x, steps)
CPU times: user 176 ms, sys: 4.98 ms, total: 181 ms Wall time: 184 ms
array([ -1.00000000e+00, -9.84344628e-01, -9.68689267e-01, -9.53033925e-01, -9.37378610e-01, -9.21723332e-01, -9.06068103e-01, -8.90412936e-01, -8.74757844e-01, -8.59102845e-01, -8.43447958e-01, -8.27793208e-01, -8.12138622e-01, -7.96484230e-01, -7.80830072e-01, -7.65176191e-01, -7.49522638e-01, -7.33869475e-01, -7.18216771e-01, -7.02564609e-01, -6.86913083e-01, -6.71262307e-01, -6.55612407e-01, -6.39963533e-01, -6.24315857e-01, -6.08669578e-01, -5.93024923e-01, -5.77382156e-01, -5.61741577e-01, -5.46103529e-01, -5.30468407e-01, -5.14836658e-01, -4.99208791e-01, -4.83585385e-01, -4.67967096e-01, -4.52354665e-01, -4.36748929e-01, -4.21150830e-01, -4.05561429e-01, -3.89981915e-01, -3.74413619e-01, -3.58858029e-01, -3.43316806e-01, -3.27791798e-01, -3.12285056e-01, -2.96798857e-01, -2.81335717e-01, -2.65898417e-01, -2.50490019e-01, -2.35113890e-01, -2.19773724e-01, -2.04473568e-01, -1.89217842e-01, -1.74011364e-01, -1.58859379e-01, -1.43767579e-01, -1.28742130e-01, -1.13789699e-01, -9.89174745e-02, -8.41331917e-02, -6.94451570e-02, -5.48622677e-02, -4.03940331e-02, -2.60505922e-02, -1.18427300e-02, 2.21810884e-03, 1.61198096e-02, 2.98495798e-02, 4.33939458e-02, 5.67387526e-02, 6.98691690e-02, 8.27696967e-02, 9.54241849e-02, 1.07815850e-01, 1.19927302e-01, 1.31740575e-01, 1.43237166e-01, 1.54398080e-01, 1.65203880e-01, 1.75634744e-01, 1.85670532e-01, 1.95290857e-01, 2.04475160e-01, 2.13202800e-01, 2.21453140e-01, 2.29205649e-01, 2.36440002e-01, 2.43136187e-01, 2.49274624e-01, 2.54836276e-01, 2.59802772e-01, 2.64156534e-01, 2.67880898e-01, 2.70960244e-01, 2.73380119e-01, 2.75127372e-01, 2.76190267e-01, 2.76558615e-01, 2.76223887e-01, 2.75179327e-01, 2.73420061e-01, 2.70943195e-01, 2.67747910e-01, 2.63835541e-01, 2.59209651e-01, 2.53876095e-01, 2.47843067e-01, 2.41121139e-01, 2.33723282e-01, 2.25664881e-01, 2.16963728e-01, 2.07640002e-01, 1.97716241e-01, 1.87217286e-01, 1.76170225e-01, 1.64604311e-01, 1.52550869e-01, 1.40043195e-01, 1.27116431e-01, 1.13807437e-01, 
1.00154644e-01, 8.61979023e-02, 7.19783150e-02, 5.75380663e-02, 4.29202405e-02, 2.81686355e-02, 1.33275714e-02, -1.55830374e-03, -1.64442112e-02, -3.12854381e-02, -4.60375341e-02, -6.06565048e-02, -7.50990022e-02, -8.93225091e-02, -1.03285518e-01, -1.16947703e-01, -1.30270080e-01, -1.43215160e-01, -1.55747089e-01, -1.67831784e-01, -1.79437042e-01, -1.90532650e-01, -2.01090475e-01, -2.11084542e-01, -2.20491097e-01, -2.29288657e-01, -2.37458043e-01, -2.44982406e-01, -2.51847233e-01, -2.58040340e-01, -2.63551857e-01, -2.68374196e-01, -2.72502013e-01, -2.75932151e-01, -2.78663583e-01, -2.80697338e-01, -2.82036426e-01, -2.82685747e-01, -2.82652003e-01, -2.81943602e-01, -2.80570555e-01, -2.78544372e-01, -2.75877957e-01, -2.72585503e-01, -2.68682381e-01, -2.64185038e-01, -2.59110889e-01, -2.53478219e-01, -2.47306080e-01, -2.40614199e-01, -2.33422886e-01, -2.25752948e-01, -2.17625610e-01, -2.09062443e-01, -2.00085289e-01, -1.90716209e-01, -1.80977419e-01, -1.70891247e-01, -1.60480090e-01, -1.49766376e-01, -1.38772541e-01, -1.27521003e-01, -1.16034144e-01, -1.04334304e-01, -9.24437728e-02, -8.03847905e-02, -6.81795522e-02, -5.58502159e-02, -4.34189147e-02, -3.09077712e-02, -1.83389146e-02, -5.73449951e-03, 6.88327456e-03, 1.94921430e-02, 3.20697565e-02, 4.45936619e-02, 5.70412854e-02, 6.93899175e-02, 8.16167007e-02, 9.36986214e-02, 1.05612505e-01, 1.17335018e-01, 1.28842669e-01, 1.40111826e-01, 1.51118727e-01, 1.61839510e-01, 1.72250238e-01, 1.82326939e-01, 1.92045650e-01, 2.01382470e-01, 2.10313614e-01, 2.18815485e-01, 2.26864744e-01, 2.34438392e-01, 2.41513852e-01, 2.48069067e-01, 2.54082594e-01, 2.59533709e-01, 2.64402515e-01, 2.68670048e-01, 2.72318398e-01, 2.75330820e-01, 2.77691851e-01, 2.79387428e-01, 2.80405001e-01, 2.80733649e-01, 2.80364190e-01, 2.79289285e-01, 2.77503539e-01, 2.75003595e-01, 2.71788223e-01, 2.67858391e-01, 2.63217339e-01, 2.57870636e-01, 2.51826221e-01, 2.45094443e-01, 2.37688078e-01, 2.29622335e-01, 2.20914855e-01, 2.11585684e-01, 
2.01657242e-01, 1.91154269e-01, 1.80103761e-01, 1.68534895e-01, 1.56478929e-01, 1.43969098e-01, 1.31040494e-01, 1.17729932e-01, 1.04075804e-01, 9.01179271e-02, 7.58973761e-02, 6.14563102e-02, 4.68377921e-02, 3.20856014e-02, 1.72440425e-02, 2.35774965e-03, -1.25285098e-02, -2.73700332e-02, -4.21223784e-02, -5.67415585e-02, -7.11842314e-02, -8.54078855e-02, -9.93710180e-02, -1.13033306e-01, -1.26355768e-01, -1.39300920e-01, -1.51832909e-01, -1.63917653e-01, -1.75522952e-01, -1.86618595e-01, -1.97176448e-01, -2.07170539e-01, -2.16577113e-01, -2.25374689e-01, -2.33544088e-01, -2.41068463e-01, -2.47933298e-01, -2.54126413e-01, -2.59637936e-01, -2.64460280e-01, -2.68588101e-01, -2.72018242e-01, -2.74749677e-01, -2.76783435e-01, -2.78122524e-01, -2.78771846e-01, -2.78738104e-01, -2.78029704e-01, -2.76656658e-01, -2.74630475e-01, -2.71964061e-01, -2.68671607e-01, -2.64768485e-01, -2.60271142e-01, -2.55196994e-01, -2.49564324e-01, -2.43392186e-01, -2.36700305e-01, -2.29508991e-01, -2.21839053e-01, -2.13711716e-01, -2.05148548e-01, -1.96171395e-01, -1.86802315e-01, -1.77063525e-01, -1.66977353e-01, -1.56566195e-01, -1.45852482e-01, -1.34858647e-01, -1.23607108e-01, -1.12120250e-01, -1.00420410e-01, -8.85298784e-02, -7.64708962e-02, -6.42656578e-02, -5.19363216e-02, -3.95050204e-02, -2.69938769e-02, -1.44250203e-02, -1.82060518e-03, 1.07971689e-02, 2.34060373e-02, 3.59836508e-02, 4.85075562e-02, 6.09551797e-02, 7.33038118e-02, 8.55305950e-02, 9.76125157e-02, 1.09526400e-01, 1.21248912e-01, 1.32756563e-01, 1.44025720e-01, 1.55032621e-01, 1.65753404e-01, 1.76164132e-01, 1.86240833e-01, 1.95959545e-01, 2.05296364e-01, 2.14227508e-01, 2.22729379e-01, 2.30778638e-01, 2.38352286e-01, 2.45427746e-01, 2.51982961e-01, 2.57996488e-01, 2.63447604e-01, 2.68316409e-01, 2.72583942e-01, 2.76232292e-01, 2.79244714e-01, 2.81605745e-01, 2.83301322e-01, 2.84318895e-01, 2.84647543e-01, 2.84278084e-01, 2.83203179e-01, 2.81417432e-01, 2.78917489e-01, 2.75702116e-01, 2.71772283e-01, 2.67131231e-01, 
2.61784527e-01, 2.55740112e-01, 2.49008333e-01, 2.41601966e-01, 2.33536222e-01, 2.24828739e-01, 2.15499566e-01, 2.05571120e-01, 1.95068143e-01, 1.84017630e-01, 1.72448757e-01, 1.60392783e-01, 1.47882941e-01, 1.34954324e-01, 1.21643745e-01, 1.07989596e-01, 9.40316940e-02, 7.98111114e-02, 6.53700062e-02, 5.07514397e-02, 3.59991893e-02, 2.11575569e-02, 6.27117368e-03, -8.61519668e-03, -2.34568558e-02, -3.82093670e-02, -5.28287497e-02, -6.72716695e-02, -8.14956237e-02, -9.54591206e-02, -1.09121850e-01, -1.22444847e-01, -1.35390644e-01, -1.47923412e-01, -1.60009093e-01, -1.71615517e-01, -1.82712509e-01, -1.93271980e-01, -2.03268003e-01, -2.12676883e-01, -2.21477206e-01, -2.29649872e-01, -2.37178125e-01, -2.44047557e-01, -2.50246110e-01, -2.55764057e-01, -2.60593976e-01, -2.64730712e-01, -2.68171332e-01, -2.70915059e-01, -2.72963213e-01, -2.74319135e-01, -2.74988104e-01, -2.74977257e-01, -2.74295491e-01, -2.72953377e-01, -2.70963057e-01, -2.68338153e-01, -2.65093667e-01, -2.61245882e-01, -2.56812270e-01, -2.51811399e-01, -2.46262842e-01, -2.40187095e-01, -2.33605492e-01, -2.26540137e-01, -2.19013828e-01, -2.11050001e-01, -2.02672669e-01, -1.93906377e-01, -1.84776162e-01, -1.75307516e-01, -1.65526363e-01, -1.55459037e-01, -1.45132275e-01, -1.34573207e-01, -1.23809366e-01, -1.12868687e-01, -1.01779531e-01, -9.05706963e-02, -7.92714498e-02, -6.79115503e-02, -5.65212822e-02, -4.51314884e-02, -3.37736061e-02, -2.24797018e-02, -1.12825074e-02, -2.15453478e-04, 1.06872988e-02, 2.13908298e-02, 3.18594377e-02, 4.20566203e-02, 5.19450636e-02, 6.14866379e-02, 7.06424018e-02, 7.93726166e-02, 8.76367701e-02, 9.53936119e-02, 1.02601201e-01, 1.09216964e-01, 1.15197770e-01, 1.20500013e-01, 1.25079717e-01, 1.28892642e-01, 1.31894420e-01, 1.34040690e-01, 1.35287256e-01, 1.35590255e-01, 1.34906337e-01, 1.33192860e-01, 1.30408089e-01, 1.26511416e-01, 1.21463579e-01, 1.15226889e-01, 1.07765473e-01, 9.90455065e-02, 8.90354604e-02, 7.77063417e-02, 6.50319357e-02, 5.09890433e-02, 
3.55577134e-02, 1.87214675e-02, 4.67513472e-04, -1.92130513e-02, -4.03250526e-02, -6.28690676e-02, -8.68412713e-02, -1.12233304e-01, -1.39032161e-01, -1.67220104e-01, -1.96774601e-01, -2.27668289e-01, -2.59868969e-01, -2.93339623e-01, -3.28038467e-01, -3.63919027e-01, -4.00930251e-01, -4.39016643e-01, -4.78118433e-01, -5.18171769e-01, -5.59108942e-01, -6.00858630e-01, -6.43346175e-01, -6.86493875e-01, -7.30221302e-01, -7.74445634e-01, -8.19082010e-01, -8.64043891e-01, -9.09243439e-01, -9.54591902e-01, -1.00000000e+00])
# Benchmark the numba version (already compiled by the %time call above)
# and add its best time to the comparison plot.
nb = %timeit -o blur_numba(x, steps)
t_nb = nb.best
times.append(t_nb)
labels.append("numba")
plot_times()
3.72 ms ± 64 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# Show numba's type-annotated IR so we can verify that every value was
# inferred as a native type (no slow object-mode fallbacks).
blur_numba.inspect_types()
blur_numba (array(float64, 1d, C), int64) -------------------------------------------------------------------------------- # File: <ipython-input-39-9ca80f9c79ef> # --- LINE 3 --- # label 0 # del x # del $const0.1 # del $0.3 # del $0.4 # del $0.5 # del $0.7 # del $const0.9 # del $const0.12 # del $0.10 # del $const0.14 # del $const0.17 # del $0.15 @numba.autojit # --- LINE 4 --- def blur_numba(x, steps=1024): # --- LINE 5 --- """identical to blur_py, other than the decorator""" # --- LINE 6 --- # x = arg(0, name=x) :: array(float64, 1d, C) # steps = arg(1, name=steps) :: int64 # $const0.1 = const(int, 1) :: int64 # $0.3 = arrayexpr(expr=('*', [const(int, 1), Var(x, <ipython-input-39-9ca80f9c79ef> (6))]), ty=array(float64, 1d, C)) :: array(float64, 1d, C) # x.1 = $0.3 :: array(float64, 1d, C) x = 1 * x # copy # --- LINE 7 --- # $0.4 = global(np: <module 'numpy' from '/Users/minrk/conda/lib/python3.6/site-packages/numpy/__init__.py'>) :: Module(<module 'numpy' from '/Users/minrk/conda/lib/python3.6/site-packages/numpy/__init__.py'>) # $0.5 = getattr(value=$0.4, attr=empty_like) :: Function(<built-in function empty_like>) # $0.7 = call $0.5(x.1, func=$0.5, args=[Var(x.1, <ipython-input-39-9ca80f9c79ef> (6))], kws=(), vararg=None) :: (array(float64, 1d, C),) -> array(float64, 1d, C) # y = $0.7 :: array(float64, 1d, C) y = np.empty_like(x) # --- LINE 8 --- # $const0.9 = const(int, 0) :: int64 # $0.10 = static_getitem(value=x.1, index=0, index_var=$const0.9) :: float64 # $const0.12 = const(int, 0) :: int64 # y[0] = $0.10 y[0] = x[0] # --- LINE 9 --- # $const0.14 = const(int, -1) :: int64 # $0.15 = static_getitem(value=x.1, index=-1, index_var=$const0.14) :: float64 # $const0.17 = const(int, -1) :: int64 # y[-1] = $0.15 # jump 42 # label 42 y[-1] = x[-1] # --- LINE 10 --- # jump 44 # label 44 # $44.1 = global(range: <class 'range'>) :: Function(<class 'range'>) # $44.3 = call $44.1(steps, func=$44.1, args=[Var(steps, <ipython-input-39-9ca80f9c79ef> (6))], kws=(), 
vararg=None) :: (int64,) -> range_state_int64 # del steps # del $44.1 # $44.4 = getiter(value=$44.3) :: range_iter_int64 # del $44.3 # $phi52.1 = $44.4 :: range_iter_int64 # del $44.4 # jump 52 # label 52 # $52.2 = iternext(value=$phi52.1) :: pair<int64, bool> # $52.3 = pair_first(value=$52.2) :: int64 # $52.4 = pair_second(value=$52.2) :: bool # del $52.2 # $phi54.1 = $52.3 :: int64 # $phi140.1 = $52.3 :: int64 # del $phi140.1 # del $52.3 # $phi140.2 = $phi52.1 :: range_iter_int64 # del $phi140.2 # branch $52.4, 54, 140 # label 54 # del $52.4 # _ = $phi54.1 :: int64 # del _ # del $phi54.1 # jump 56 # label 56 for _ in range(steps): # --- LINE 11 --- # jump 58 # label 58 # $58.1 = global(range: <class 'range'>) :: Function(<class 'range'>) # $const58.2 = const(int, 1) :: int64 # $58.3 = global(len: <built-in function len>) :: Function(<built-in function len>) # $58.5 = call $58.3(x.1, func=$58.3, args=[Var(x.1, <ipython-input-39-9ca80f9c79ef> (6))], kws=(), vararg=None) :: (array(float64, 1d, C),) -> int64 # del $58.3 # $const58.6 = const(int, 1) :: int64 # $58.7 = $58.5 - $const58.6 :: int64 # del $const58.6 # del $58.5 # $58.8 = call $58.1($const58.2, $58.7, func=$58.1, args=[Var($const58.2, <ipython-input-39-9ca80f9c79ef> (11)), Var($58.7, <ipython-input-39-9ca80f9c79ef> (11))], kws=(), vararg=None) :: (int64, int64) -> range_state_int64 # del $const58.2 # del $58.7 # del $58.1 # $58.9 = getiter(value=$58.8) :: range_iter_int64 # del $58.8 # $phi76.1 = $58.9 :: range_iter_int64 # del $58.9 # jump 76 # label 76 # $76.2 = iternext(value=$phi76.1) :: pair<int64, bool> # $76.3 = pair_first(value=$76.2) :: int64 # $76.4 = pair_second(value=$76.2) :: bool # del $76.2 # $phi78.1 = $76.3 :: int64 # $phi126.1 = $76.3 :: int64 # del $phi126.1 # del $76.3 # $phi126.2 = $phi76.1 :: range_iter_int64 # del $phi126.2 # branch $76.4, 78, 126 # label 78 # del $76.4 # i = $phi78.1 :: int64 # del $phi78.1 # del $const78.5 # del $78.6 # del $const78.8 # del $78.11 # del $78.7 # del 
$78.12 # del $const78.16 # del $78.17 # del $78.18 # del $78.13 # del $const78.2 # del $78.19 # del i # del $78.20 for i in range(1, len(x)-1): # --- LINE 12 --- # $const78.2 = const(float, 0.25) :: float64 # $const78.5 = const(int, 1) :: int64 # $78.6 = i - $const78.5 :: int64 # $78.7 = getitem(value=y, index=$78.6) :: float64 # $const78.8 = const(int, 2) :: int64 # $78.11 = getitem(value=y, index=i) :: float64 # $78.12 = $const78.8 * $78.11 :: float64 # $78.13 = $78.7 + $78.12 :: float64 # $const78.16 = const(int, 1) :: int64 # $78.17 = i + $const78.16 :: int64 # $78.18 = getitem(value=y, index=$78.17) :: float64 # $78.19 = $78.13 + $78.18 :: float64 # $78.20 = $const78.2 * $78.19 :: float64 # y[i] = $78.20 :: (array(float64, 1d, C), int64, float64) -> none # jump 76 # label 126 # del $phi78.1 # del $phi76.1 # del $76.4 # jump 128 # label 128 # del $x128.2 y[i] = .25 * ( y[i-1] + 2 * y[i] + y[i+1] ) # --- LINE 13 --- # $x128.2 = x.1 :: array(float64, 1d, C) # x.1 = y :: array(float64, 1d, C) # y = $x128.2 :: array(float64, 1d, C) # jump 52 # label 140 # del y # del $phi54.1 # del $phi52.1 # del $52.4 # jump 142 # label 142 # del x.1 x, y = y, x # swap for next step # --- LINE 14 --- # $142.2 = cast(value=x.1) :: array(float64, 1d, C) # return $142.2 return x ================================================================================
# Dump the LLVM IR numba generated for the (single) compiled signature.
print(list(blur_numba.inspect_llvm().values())[0])
; ModuleID = 'blur_numba' source_filename = "<string>" target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-apple-darwin17.7.0" @.const.blur_numba = internal constant [11 x i8] c"blur_numba\00" @".const.Fatal error: missing _dynfunc.Closure" = internal constant [38 x i8] c"Fatal error: missing _dynfunc.Closure\00" @PyExc_RuntimeError = external global i8 @".const.missing Environment" = internal constant [20 x i8] c"missing Environment\00" @_Py_NoneStruct = external global i8 @PyExc_StopIteration = external global i8 @PyExc_SystemError = external global i8 @".const.unknown error when calling native function" = internal constant [43 x i8] c"unknown error when calling native function\00" define i32 @"_ZN8__main__14blur_numba$244E5ArrayIdLi1E1C7mutable7alignedEx"({ i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* noalias nocapture %retptr, { i8*, i32 }** noalias nocapture %excinfo, i8* noalias nocapture readnone %env, i8* %arg.x.0, i8* nocapture readnone %arg.x.1, i64 %arg.x.2, i64 %arg.x.3, double* nocapture readonly %arg.x.4, i64 %arg.x.5.0, i64 %arg.x.6.0, i64 %arg.steps) local_unnamed_addr { B86.i: tail call void @NRT_incref(i8* %arg.x.0) %.68 = shl i64 %arg.x.5.0, 3 %.69 = tail call i8* @NRT_MemInfo_alloc_safe_aligned(i64 %.68, i32 32) %.5.i = getelementptr i8, i8* %.69, i64 24 %0 = bitcast i8* %.5.i to i8** %.6.i = load i8*, i8** %0, align 8 %.10487 = icmp sgt i64 %arg.x.5.0, 0 br i1 %.10487, label %for.body.endif.endif.lr.ph, label %for.end for.body.endif.endif.lr.ph: ; preds = %B86.i %.106 = icmp eq i64 %arg.x.5.0, 1 br i1 %.106, label %for.end.loopexit92, label %for.body.endif.endif.lr.ph.split.us for.body.endif.endif.lr.ph.split.us: ; preds = %for.body.endif.endif.lr.ph %min.iters.check = icmp ult i64 %arg.x.5.0, 16 br i1 %min.iters.check, label %for.body.endif.endif.us.preheader, label %vector.memcheck vector.memcheck: ; preds = %for.body.endif.endif.lr.ph.split.us %1 = bitcast double* %arg.x.4 to i8* %2 = shl i64 
%arg.x.5.0, 3 %scevgep = getelementptr i8, i8* %.6.i, i64 %2 %scevgep103 = getelementptr double, double* %arg.x.4, i64 %arg.x.5.0 %scevgep103104 = bitcast double* %scevgep103 to i8* %bound0 = icmp ult i8* %.6.i, %scevgep103104 %bound1 = icmp ugt i8* %scevgep, %1 %memcheck.conflict = and i1 %bound0, %bound1 br i1 %memcheck.conflict, label %for.body.endif.endif.us.preheader, label %vector.ph vector.ph: ; preds = %vector.memcheck %n.vec = and i64 %arg.x.5.0, -16 %3 = add i64 %n.vec, -16 %4 = lshr exact i64 %3, 4 %5 = add nuw nsw i64 %4, 1 %xtraiter113 = and i64 %5, 3 %6 = icmp ult i64 %3, 48 br i1 %6, label %middle.block.unr-lcssa, label %vector.ph.new vector.ph.new: ; preds = %vector.ph %7 = add i64 %xtraiter113, -1 %8 = sub i64 %7, %4 br label %vector.body vector.body: ; preds = %vector.body, %vector.ph.new %lsr.iv180 = phi i64 [ %lsr.iv.next181, %vector.body ], [ %8, %vector.ph.new ] %index = phi i64 [ 0, %vector.ph.new ], [ %index.next.3, %vector.body ] %sunkaddr = mul i64 %index, 8 %9 = bitcast double* %arg.x.4 to i8* %sunkaddr182 = getelementptr i8, i8* %9, i64 %sunkaddr %10 = bitcast i8* %sunkaddr182 to <4 x i64>* %wide.load = load <4 x i64>, <4 x i64>* %10, align 8, !alias.scope !0 %sunkaddr183 = mul i64 %index, 8 %11 = bitcast double* %arg.x.4 to i8* %sunkaddr184 = getelementptr i8, i8* %11, i64 %sunkaddr183 %sunkaddr185 = getelementptr i8, i8* %sunkaddr184, i64 32 %12 = bitcast i8* %sunkaddr185 to <4 x i64>* %wide.load108 = load <4 x i64>, <4 x i64>* %12, align 8, !alias.scope !0 %sunkaddr186 = mul i64 %index, 8 %13 = bitcast double* %arg.x.4 to i8* %sunkaddr187 = getelementptr i8, i8* %13, i64 %sunkaddr186 %sunkaddr188 = getelementptr i8, i8* %sunkaddr187, i64 64 %14 = bitcast i8* %sunkaddr188 to <4 x i64>* %wide.load109 = load <4 x i64>, <4 x i64>* %14, align 8, !alias.scope !0 %sunkaddr189 = mul i64 %index, 8 %15 = bitcast double* %arg.x.4 to i8* %sunkaddr190 = getelementptr i8, i8* %15, i64 %sunkaddr189 %sunkaddr191 = getelementptr i8, i8* %sunkaddr190, 
i64 96 %16 = bitcast i8* %sunkaddr191 to <4 x i64>* %wide.load110 = load <4 x i64>, <4 x i64>* %16, align 8, !alias.scope !0 %sunkaddr192 = mul i64 %index, 8 %sunkaddr193 = getelementptr i8, i8* %.6.i, i64 %sunkaddr192 %17 = bitcast i8* %sunkaddr193 to <4 x i64>* store <4 x i64> %wide.load, <4 x i64>* %17, align 8, !alias.scope !3, !noalias !0 %sunkaddr194 = mul i64 %index, 8 %sunkaddr195 = getelementptr i8, i8* %.6.i, i64 %sunkaddr194 %sunkaddr196 = getelementptr i8, i8* %sunkaddr195, i64 32 %18 = bitcast i8* %sunkaddr196 to <4 x i64>* store <4 x i64> %wide.load108, <4 x i64>* %18, align 8, !alias.scope !3, !noalias !0 %sunkaddr197 = mul i64 %index, 8 %sunkaddr198 = getelementptr i8, i8* %.6.i, i64 %sunkaddr197 %sunkaddr199 = getelementptr i8, i8* %sunkaddr198, i64 64 %19 = bitcast i8* %sunkaddr199 to <4 x i64>* store <4 x i64> %wide.load109, <4 x i64>* %19, align 8, !alias.scope !3, !noalias !0 %sunkaddr200 = mul i64 %index, 8 %sunkaddr201 = getelementptr i8, i8* %.6.i, i64 %sunkaddr200 %sunkaddr202 = getelementptr i8, i8* %sunkaddr201, i64 96 %20 = bitcast i8* %sunkaddr202 to <4 x i64>* store <4 x i64> %wide.load110, <4 x i64>* %20, align 8, !alias.scope !3, !noalias !0 %sunkaddr203 = mul i64 %index, 8 %21 = bitcast double* %arg.x.4 to i8* %sunkaddr204 = getelementptr i8, i8* %21, i64 %sunkaddr203 %sunkaddr205 = getelementptr i8, i8* %sunkaddr204, i64 128 %22 = bitcast i8* %sunkaddr205 to <4 x i64>* %wide.load.1 = load <4 x i64>, <4 x i64>* %22, align 8, !alias.scope !0 %sunkaddr206 = mul i64 %index, 8 %23 = bitcast double* %arg.x.4 to i8* %sunkaddr207 = getelementptr i8, i8* %23, i64 %sunkaddr206 %sunkaddr208 = getelementptr i8, i8* %sunkaddr207, i64 160 %24 = bitcast i8* %sunkaddr208 to <4 x i64>* %wide.load108.1 = load <4 x i64>, <4 x i64>* %24, align 8, !alias.scope !0 %sunkaddr209 = mul i64 %index, 8 %25 = bitcast double* %arg.x.4 to i8* %sunkaddr210 = getelementptr i8, i8* %25, i64 %sunkaddr209 %sunkaddr211 = getelementptr i8, i8* %sunkaddr210, i64 192 %26 
= bitcast i8* %sunkaddr211 to <4 x i64>* %wide.load109.1 = load <4 x i64>, <4 x i64>* %26, align 8, !alias.scope !0 %sunkaddr212 = mul i64 %index, 8 %27 = bitcast double* %arg.x.4 to i8* %sunkaddr213 = getelementptr i8, i8* %27, i64 %sunkaddr212 %sunkaddr214 = getelementptr i8, i8* %sunkaddr213, i64 224 %28 = bitcast i8* %sunkaddr214 to <4 x i64>* %wide.load110.1 = load <4 x i64>, <4 x i64>* %28, align 8, !alias.scope !0 %sunkaddr215 = mul i64 %index, 8 %sunkaddr216 = getelementptr i8, i8* %.6.i, i64 %sunkaddr215 %sunkaddr217 = getelementptr i8, i8* %sunkaddr216, i64 128 %29 = bitcast i8* %sunkaddr217 to <4 x i64>* store <4 x i64> %wide.load.1, <4 x i64>* %29, align 8, !alias.scope !3, !noalias !0 %sunkaddr218 = mul i64 %index, 8 %sunkaddr219 = getelementptr i8, i8* %.6.i, i64 %sunkaddr218 %sunkaddr220 = getelementptr i8, i8* %sunkaddr219, i64 160 %30 = bitcast i8* %sunkaddr220 to <4 x i64>* store <4 x i64> %wide.load108.1, <4 x i64>* %30, align 8, !alias.scope !3, !noalias !0 %sunkaddr221 = mul i64 %index, 8 %sunkaddr222 = getelementptr i8, i8* %.6.i, i64 %sunkaddr221 %sunkaddr223 = getelementptr i8, i8* %sunkaddr222, i64 192 %31 = bitcast i8* %sunkaddr223 to <4 x i64>* store <4 x i64> %wide.load109.1, <4 x i64>* %31, align 8, !alias.scope !3, !noalias !0 %sunkaddr224 = mul i64 %index, 8 %sunkaddr225 = getelementptr i8, i8* %.6.i, i64 %sunkaddr224 %sunkaddr226 = getelementptr i8, i8* %sunkaddr225, i64 224 %32 = bitcast i8* %sunkaddr226 to <4 x i64>* store <4 x i64> %wide.load110.1, <4 x i64>* %32, align 8, !alias.scope !3, !noalias !0 %sunkaddr227 = mul i64 %index, 8 %33 = bitcast double* %arg.x.4 to i8* %sunkaddr228 = getelementptr i8, i8* %33, i64 %sunkaddr227 %sunkaddr229 = getelementptr i8, i8* %sunkaddr228, i64 256 %34 = bitcast i8* %sunkaddr229 to <4 x i64>* %wide.load.2 = load <4 x i64>, <4 x i64>* %34, align 8, !alias.scope !0 %sunkaddr230 = mul i64 %index, 8 %35 = bitcast double* %arg.x.4 to i8* %sunkaddr231 = getelementptr i8, i8* %35, i64 %sunkaddr230 
%sunkaddr232 = getelementptr i8, i8* %sunkaddr231, i64 288 %36 = bitcast i8* %sunkaddr232 to <4 x i64>* %wide.load108.2 = load <4 x i64>, <4 x i64>* %36, align 8, !alias.scope !0 %sunkaddr233 = mul i64 %index, 8 %37 = bitcast double* %arg.x.4 to i8* %sunkaddr234 = getelementptr i8, i8* %37, i64 %sunkaddr233 %sunkaddr235 = getelementptr i8, i8* %sunkaddr234, i64 320 %38 = bitcast i8* %sunkaddr235 to <4 x i64>* %wide.load109.2 = load <4 x i64>, <4 x i64>* %38, align 8, !alias.scope !0 %sunkaddr236 = mul i64 %index, 8 %39 = bitcast double* %arg.x.4 to i8* %sunkaddr237 = getelementptr i8, i8* %39, i64 %sunkaddr236 %sunkaddr238 = getelementptr i8, i8* %sunkaddr237, i64 352 %40 = bitcast i8* %sunkaddr238 to <4 x i64>* %wide.load110.2 = load <4 x i64>, <4 x i64>* %40, align 8, !alias.scope !0 %sunkaddr239 = mul i64 %index, 8 %sunkaddr240 = getelementptr i8, i8* %.6.i, i64 %sunkaddr239 %sunkaddr241 = getelementptr i8, i8* %sunkaddr240, i64 256 %41 = bitcast i8* %sunkaddr241 to <4 x i64>* store <4 x i64> %wide.load.2, <4 x i64>* %41, align 8, !alias.scope !3, !noalias !0 %sunkaddr242 = mul i64 %index, 8 %sunkaddr243 = getelementptr i8, i8* %.6.i, i64 %sunkaddr242 %sunkaddr244 = getelementptr i8, i8* %sunkaddr243, i64 288 %42 = bitcast i8* %sunkaddr244 to <4 x i64>* store <4 x i64> %wide.load108.2, <4 x i64>* %42, align 8, !alias.scope !3, !noalias !0 %sunkaddr245 = mul i64 %index, 8 %sunkaddr246 = getelementptr i8, i8* %.6.i, i64 %sunkaddr245 %sunkaddr247 = getelementptr i8, i8* %sunkaddr246, i64 320 %43 = bitcast i8* %sunkaddr247 to <4 x i64>* store <4 x i64> %wide.load109.2, <4 x i64>* %43, align 8, !alias.scope !3, !noalias !0 %sunkaddr248 = mul i64 %index, 8 %sunkaddr249 = getelementptr i8, i8* %.6.i, i64 %sunkaddr248 %sunkaddr250 = getelementptr i8, i8* %sunkaddr249, i64 352 %44 = bitcast i8* %sunkaddr250 to <4 x i64>* store <4 x i64> %wide.load110.2, <4 x i64>* %44, align 8, !alias.scope !3, !noalias !0 %sunkaddr251 = mul i64 %index, 8 %45 = bitcast double* %arg.x.4 
to i8* %sunkaddr252 = getelementptr i8, i8* %45, i64 %sunkaddr251 %sunkaddr253 = getelementptr i8, i8* %sunkaddr252, i64 384 %46 = bitcast i8* %sunkaddr253 to <4 x i64>* %wide.load.3 = load <4 x i64>, <4 x i64>* %46, align 8, !alias.scope !0 %sunkaddr254 = mul i64 %index, 8 %47 = bitcast double* %arg.x.4 to i8* %sunkaddr255 = getelementptr i8, i8* %47, i64 %sunkaddr254 %sunkaddr256 = getelementptr i8, i8* %sunkaddr255, i64 416 %48 = bitcast i8* %sunkaddr256 to <4 x i64>* %wide.load108.3 = load <4 x i64>, <4 x i64>* %48, align 8, !alias.scope !0 %sunkaddr257 = mul i64 %index, 8 %49 = bitcast double* %arg.x.4 to i8* %sunkaddr258 = getelementptr i8, i8* %49, i64 %sunkaddr257 %sunkaddr259 = getelementptr i8, i8* %sunkaddr258, i64 448 %50 = bitcast i8* %sunkaddr259 to <4 x i64>* %wide.load109.3 = load <4 x i64>, <4 x i64>* %50, align 8, !alias.scope !0 %sunkaddr260 = mul i64 %index, 8 %51 = bitcast double* %arg.x.4 to i8* %sunkaddr261 = getelementptr i8, i8* %51, i64 %sunkaddr260 %sunkaddr262 = getelementptr i8, i8* %sunkaddr261, i64 480 %52 = bitcast i8* %sunkaddr262 to <4 x i64>* %wide.load110.3 = load <4 x i64>, <4 x i64>* %52, align 8, !alias.scope !0 %sunkaddr263 = mul i64 %index, 8 %sunkaddr264 = getelementptr i8, i8* %.6.i, i64 %sunkaddr263 %sunkaddr265 = getelementptr i8, i8* %sunkaddr264, i64 384 %53 = bitcast i8* %sunkaddr265 to <4 x i64>* store <4 x i64> %wide.load.3, <4 x i64>* %53, align 8, !alias.scope !3, !noalias !0 %sunkaddr266 = mul i64 %index, 8 %sunkaddr267 = getelementptr i8, i8* %.6.i, i64 %sunkaddr266 %sunkaddr268 = getelementptr i8, i8* %sunkaddr267, i64 416 %54 = bitcast i8* %sunkaddr268 to <4 x i64>* store <4 x i64> %wide.load108.3, <4 x i64>* %54, align 8, !alias.scope !3, !noalias !0 %sunkaddr269 = mul i64 %index, 8 %sunkaddr270 = getelementptr i8, i8* %.6.i, i64 %sunkaddr269 %sunkaddr271 = getelementptr i8, i8* %sunkaddr270, i64 448 %55 = bitcast i8* %sunkaddr271 to <4 x i64>* store <4 x i64> %wide.load109.3, <4 x i64>* %55, align 8, 
!alias.scope !3, !noalias !0 %sunkaddr272 = mul i64 %index, 8 %sunkaddr273 = getelementptr i8, i8* %.6.i, i64 %sunkaddr272 %sunkaddr274 = getelementptr i8, i8* %sunkaddr273, i64 480 %56 = bitcast i8* %sunkaddr274 to <4 x i64>* store <4 x i64> %wide.load110.3, <4 x i64>* %56, align 8, !alias.scope !3, !noalias !0 %index.next.3 = add i64 %index, 64 %lsr.iv.next181 = add i64 %lsr.iv180, 4 %niter.ncmp.3 = icmp eq i64 %lsr.iv.next181, 0 br i1 %niter.ncmp.3, label %middle.block.unr-lcssa, label %vector.body, !llvm.loop !5 middle.block.unr-lcssa: ; preds = %vector.body, %vector.ph %index.unr = phi i64 [ 0, %vector.ph ], [ %index.next.3, %vector.body ] %lcmp.mod114 = icmp eq i64 %xtraiter113, 0 br i1 %lcmp.mod114, label %middle.block, label %vector.body.epil.preheader vector.body.epil.preheader: ; preds = %middle.block.unr-lcssa %57 = shl i64 %index.unr, 3 %58 = add i64 %57, 96 %59 = sub i64 0, %xtraiter113 br label %vector.body.epil vector.body.epil: ; preds = %vector.body.epil, %vector.body.epil.preheader %lsr.iv79 = phi i64 [ %lsr.iv.next80, %vector.body.epil ], [ %59, %vector.body.epil.preheader ] %lsr.iv54 = phi i64 [ %lsr.iv.next55, %vector.body.epil ], [ %58, %vector.body.epil.preheader ] %60 = bitcast double* %arg.x.4 to i8* %sunkaddr275 = getelementptr i8, i8* %60, i64 %lsr.iv54 %sunkaddr276 = getelementptr i8, i8* %sunkaddr275, i64 -96 %61 = bitcast i8* %sunkaddr276 to <4 x i64>* %wide.load.epil = load <4 x i64>, <4 x i64>* %61, align 8, !alias.scope !0 %62 = bitcast double* %arg.x.4 to i8* %sunkaddr277 = getelementptr i8, i8* %62, i64 %lsr.iv54 %sunkaddr278 = getelementptr i8, i8* %sunkaddr277, i64 -64 %63 = bitcast i8* %sunkaddr278 to <4 x i64>* %wide.load108.epil = load <4 x i64>, <4 x i64>* %63, align 8, !alias.scope !0 %64 = bitcast double* %arg.x.4 to i8* %sunkaddr279 = getelementptr i8, i8* %64, i64 %lsr.iv54 %sunkaddr280 = getelementptr i8, i8* %sunkaddr279, i64 -32 %65 = bitcast i8* %sunkaddr280 to <4 x i64>* %wide.load109.epil = load <4 x i64>, <4 x 
i64>* %65, align 8, !alias.scope !0 %66 = bitcast double* %arg.x.4 to i8* %sunkaddr281 = getelementptr i8, i8* %66, i64 %lsr.iv54 %67 = bitcast i8* %sunkaddr281 to <4 x i64>* %wide.load110.epil = load <4 x i64>, <4 x i64>* %67, align 8, !alias.scope !0 %uglygep63 = getelementptr i8, i8* %.6.i, i64 %lsr.iv54 %uglygep6364 = bitcast i8* %uglygep63 to <4 x i64>* %scevgep65 = getelementptr <4 x i64>, <4 x i64>* %uglygep6364, i64 -3 store <4 x i64> %wide.load.epil, <4 x i64>* %scevgep65, align 8, !alias.scope !3, !noalias !0 %uglygep60 = getelementptr i8, i8* %.6.i, i64 %lsr.iv54 %uglygep6061 = bitcast i8* %uglygep60 to <4 x i64>* %scevgep62 = getelementptr <4 x i64>, <4 x i64>* %uglygep6061, i64 -2 store <4 x i64> %wide.load108.epil, <4 x i64>* %scevgep62, align 8, !alias.scope !3, !noalias !0 %uglygep57 = getelementptr i8, i8* %.6.i, i64 %lsr.iv54 %uglygep5758 = bitcast i8* %uglygep57 to <4 x i64>* %scevgep59 = getelementptr <4 x i64>, <4 x i64>* %uglygep5758, i64 -1 store <4 x i64> %wide.load109.epil, <4 x i64>* %scevgep59, align 8, !alias.scope !3, !noalias !0 %uglygep = getelementptr i8, i8* %.6.i, i64 %lsr.iv54 %uglygep56 = bitcast i8* %uglygep to <4 x i64>* store <4 x i64> %wide.load110.epil, <4 x i64>* %uglygep56, align 8, !alias.scope !3, !noalias !0 %lsr.iv.next55 = add i64 %lsr.iv54, 128 %lsr.iv.next80 = add nsw i64 %lsr.iv79, 1 %epil.iter.cmp = icmp eq i64 %lsr.iv.next80, 0 br i1 %epil.iter.cmp, label %middle.block, label %vector.body.epil, !llvm.loop !7 middle.block: ; preds = %vector.body.epil, %middle.block.unr-lcssa %cmp.n = icmp eq i64 %n.vec, %arg.x.5.0 br i1 %cmp.n, label %for.end, label %for.body.endif.endif.us.preheader for.body.endif.endif.us.preheader: ; preds = %middle.block, %vector.memcheck, %for.body.endif.endif.lr.ph.split.us %loop.index89.us.ph = phi i64 [ 0, %vector.memcheck ], [ 0, %for.body.endif.endif.lr.ph.split.us ], [ %n.vec, %middle.block ] %68 = add i64 %arg.x.5.0, -1 %69 = sub i64 %68, %loop.index89.us.ph %xtraiter = and i64 
%arg.x.5.0, 7 %lcmp.mod = icmp eq i64 %xtraiter, 0 br i1 %lcmp.mod, label %for.body.endif.endif.us.prol.loopexit, label %for.body.endif.endif.us.prol.preheader for.body.endif.endif.us.prol.preheader: ; preds = %for.body.endif.endif.us.preheader %70 = sub i64 0, %xtraiter br label %for.body.endif.endif.us.prol for.body.endif.endif.us.prol: ; preds = %for.body.endif.endif.us.prol, %for.body.endif.endif.us.prol.preheader %lsr.iv51 = phi i64 [ %lsr.iv.next52, %for.body.endif.endif.us.prol ], [ %70, %for.body.endif.endif.us.prol.preheader ] %loop.index89.us.prol = phi i64 [ %.135.us.prol, %for.body.endif.endif.us.prol ], [ %loop.index89.us.ph, %for.body.endif.endif.us.prol.preheader ] %71 = bitcast i8* %.6.i to i64* %72 = bitcast double* %arg.x.4 to i64* %scevgep50 = getelementptr i64, i64* %72, i64 %loop.index89.us.prol %.11430.us.prol = load i64, i64* %scevgep50, align 8 %scevgep49 = getelementptr i64, i64* %71, i64 %loop.index89.us.prol store i64 %.11430.us.prol, i64* %scevgep49, align 8 %.135.us.prol = add nuw nsw i64 %loop.index89.us.prol, 1 %lsr.iv.next52 = add nsw i64 %lsr.iv51, 1 %prol.iter.cmp = icmp eq i64 %lsr.iv.next52, 0 br i1 %prol.iter.cmp, label %for.body.endif.endif.us.prol.loopexit, label %for.body.endif.endif.us.prol, !llvm.loop !9 for.body.endif.endif.us.prol.loopexit: ; preds = %for.body.endif.endif.us.prol, %for.body.endif.endif.us.preheader %loop.index89.us.unr = phi i64 [ %loop.index89.us.ph, %for.body.endif.endif.us.preheader ], [ %.135.us.prol, %for.body.endif.endif.us.prol ] %73 = icmp ult i64 %69, 7 br i1 %73, label %for.end, label %for.body.endif.endif.us.preheader.new for.body.endif.endif.us.preheader.new: ; preds = %for.body.endif.endif.us.prol.loopexit br label %for.body.endif.endif.us for.body.endif.endif.us: ; preds = %for.body.endif.endif.us, %for.body.endif.endif.us.preheader.new %loop.index89.us = phi i64 [ %loop.index89.us.unr, %for.body.endif.endif.us.preheader.new ], [ %.135.us.7, %for.body.endif.endif.us ] %sunkaddr282 = mul i64 
%loop.index89.us, 8 %74 = bitcast double* %arg.x.4 to i8* %sunkaddr283 = getelementptr i8, i8* %74, i64 %sunkaddr282 %75 = bitcast i8* %sunkaddr283 to i64* %.11430.us = load i64, i64* %75, align 8 %sunkaddr284 = mul i64 %loop.index89.us, 8 %sunkaddr285 = getelementptr i8, i8* %.6.i, i64 %sunkaddr284 %76 = bitcast i8* %sunkaddr285 to i64* store i64 %.11430.us, i64* %76, align 8 %sunkaddr286 = mul i64 %loop.index89.us, 8 %77 = bitcast double* %arg.x.4 to i8* %sunkaddr287 = getelementptr i8, i8* %77, i64 %sunkaddr286 %sunkaddr288 = getelementptr i8, i8* %sunkaddr287, i64 8 %78 = bitcast i8* %sunkaddr288 to i64* %.11430.us.1 = load i64, i64* %78, align 8 %sunkaddr289 = mul i64 %loop.index89.us, 8 %sunkaddr290 = getelementptr i8, i8* %.6.i, i64 %sunkaddr289 %sunkaddr291 = getelementptr i8, i8* %sunkaddr290, i64 8 %79 = bitcast i8* %sunkaddr291 to i64* store i64 %.11430.us.1, i64* %79, align 8 %sunkaddr292 = mul i64 %loop.index89.us, 8 %80 = bitcast double* %arg.x.4 to i8* %sunkaddr293 = getelementptr i8, i8* %80, i64 %sunkaddr292 %sunkaddr294 = getelementptr i8, i8* %sunkaddr293, i64 16 %81 = bitcast i8* %sunkaddr294 to i64* %.11430.us.2 = load i64, i64* %81, align 8 %sunkaddr295 = mul i64 %loop.index89.us, 8 %sunkaddr296 = getelementptr i8, i8* %.6.i, i64 %sunkaddr295 %sunkaddr297 = getelementptr i8, i8* %sunkaddr296, i64 16 %82 = bitcast i8* %sunkaddr297 to i64* store i64 %.11430.us.2, i64* %82, align 8 %sunkaddr298 = mul i64 %loop.index89.us, 8 %83 = bitcast double* %arg.x.4 to i8* %sunkaddr299 = getelementptr i8, i8* %83, i64 %sunkaddr298 %sunkaddr300 = getelementptr i8, i8* %sunkaddr299, i64 24 %84 = bitcast i8* %sunkaddr300 to i64* %.11430.us.3 = load i64, i64* %84, align 8 %sunkaddr301 = mul i64 %loop.index89.us, 8 %sunkaddr302 = getelementptr i8, i8* %.6.i, i64 %sunkaddr301 %sunkaddr303 = getelementptr i8, i8* %sunkaddr302, i64 24 %85 = bitcast i8* %sunkaddr303 to i64* store i64 %.11430.us.3, i64* %85, align 8 %sunkaddr304 = mul i64 %loop.index89.us, 8 %86 = 
bitcast double* %arg.x.4 to i8* %sunkaddr305 = getelementptr i8, i8* %86, i64 %sunkaddr304 %sunkaddr306 = getelementptr i8, i8* %sunkaddr305, i64 32 %87 = bitcast i8* %sunkaddr306 to i64* %.11430.us.4 = load i64, i64* %87, align 8 %sunkaddr307 = mul i64 %loop.index89.us, 8 %sunkaddr308 = getelementptr i8, i8* %.6.i, i64 %sunkaddr307 %sunkaddr309 = getelementptr i8, i8* %sunkaddr308, i64 32 %88 = bitcast i8* %sunkaddr309 to i64* store i64 %.11430.us.4, i64* %88, align 8 %sunkaddr310 = mul i64 %loop.index89.us, 8 %89 = bitcast double* %arg.x.4 to i8* %sunkaddr311 = getelementptr i8, i8* %89, i64 %sunkaddr310 %sunkaddr312 = getelementptr i8, i8* %sunkaddr311, i64 40 %90 = bitcast i8* %sunkaddr312 to i64* %.11430.us.5 = load i64, i64* %90, align 8 %sunkaddr313 = mul i64 %loop.index89.us, 8 %sunkaddr314 = getelementptr i8, i8* %.6.i, i64 %sunkaddr313 %sunkaddr315 = getelementptr i8, i8* %sunkaddr314, i64 40 %91 = bitcast i8* %sunkaddr315 to i64* store i64 %.11430.us.5, i64* %91, align 8 %sunkaddr316 = mul i64 %loop.index89.us, 8 %92 = bitcast double* %arg.x.4 to i8* %sunkaddr317 = getelementptr i8, i8* %92, i64 %sunkaddr316 %sunkaddr318 = getelementptr i8, i8* %sunkaddr317, i64 48 %93 = bitcast i8* %sunkaddr318 to i64* %.11430.us.6 = load i64, i64* %93, align 8 %sunkaddr319 = mul i64 %loop.index89.us, 8 %sunkaddr320 = getelementptr i8, i8* %.6.i, i64 %sunkaddr319 %sunkaddr321 = getelementptr i8, i8* %sunkaddr320, i64 48 %94 = bitcast i8* %sunkaddr321 to i64* store i64 %.11430.us.6, i64* %94, align 8 %sunkaddr322 = mul i64 %loop.index89.us, 8 %95 = bitcast double* %arg.x.4 to i8* %sunkaddr323 = getelementptr i8, i8* %95, i64 %sunkaddr322 %sunkaddr324 = getelementptr i8, i8* %sunkaddr323, i64 56 %96 = bitcast i8* %sunkaddr324 to i64* %.11430.us.7 = load i64, i64* %96, align 8 %sunkaddr325 = mul i64 %loop.index89.us, 8 %sunkaddr326 = getelementptr i8, i8* %.6.i, i64 %sunkaddr325 %sunkaddr327 = getelementptr i8, i8* %sunkaddr326, i64 56 %97 = bitcast i8* %sunkaddr327 to 
i64* store i64 %.11430.us.7, i64* %97, align 8 %.135.us.7 = add nsw i64 %loop.index89.us, 8 %exitcond.7 = icmp eq i64 %arg.x.5.0, %.135.us.7 br i1 %exitcond.7, label %for.end, label %for.body.endif.endif.us, !llvm.loop !10 B52.if.lr.ph: ; preds = %for.end %.612 = add i64 %arg.x.5.0, -2 %.613 = icmp slt i64 %.612, 1 %spec.select29 = select i1 %.613, i64 0, i64 %.612 br i1 %.613, label %B52.if.us.preheader, label %B52.if.lr.ph.split B52.if.us.preheader: ; preds = %B52.if.lr.ph %98 = bitcast i8* %.6.i21 to double* %99 = bitcast i8* %.6.i to double* %100 = add i64 %spec.select29, 1 br label %B52.if.us B52.if.us: ; preds = %B126.us, %B52.if.us.preheader %y.sroa.0.039.us = phi i8* [ %x.1.sroa.0.037.us, %B126.us ], [ %.177, %B52.if.us.preheader ] %y.sroa.56.038.us = phi double* [ %x.1.sroa.60.036.us, %B126.us ], [ %98, %B52.if.us.preheader ] %x.1.sroa.0.037.us = phi i8* [ %y.sroa.0.039.us, %B126.us ], [ %.69, %B52.if.us.preheader ] %x.1.sroa.60.036.us = phi double* [ %y.sroa.56.038.us, %B126.us ], [ %99, %B52.if.us.preheader ] %.404.035.us = phi i64 [ %.468.us, %B126.us ], [ %arg.steps, %B52.if.us.preheader ] %101 = icmp sgt i64 %spec.select29, 0 %.468.us = add nsw i64 %.404.035.us, -1 br i1 %101, label %B78.lr.ph.us, label %B126.us B126.us: ; preds = %B78.us, %B52.if.us %.459.us = icmp sgt i64 %.404.035.us, 1 br i1 %.459.us, label %B52.if.us, label %B140 B78.us: ; preds = %B78.lr.ph.us, %B78.us %lsr.iv4 = phi double* [ %scevgep3, %B78.lr.ph.us ], [ %scevgep5, %B78.us ] %lsr.iv = phi i64 [ %100, %B78.lr.ph.us ], [ %lsr.iv.next, %B78.us ] %.809.us = phi double [ %.809.us.pre, %B78.lr.ph.us ], [ %.874.us, %B78.us ] %.771.us = phi double [ %.771.us.pre, %B78.lr.ph.us ], [ %.892.us, %B78.us ] %scevgep6 = getelementptr double, double* %lsr.iv4, i64 -1 %.816.us = fmul double %.809.us, 2.000000e+00 %.826.us = fadd double %.771.us, %.816.us %.874.us = load double, double* %lsr.iv4, align 8 %.882.us = fadd double %.874.us, %.826.us %.892.us = fmul double %.882.us, 2.500000e-01 
store double %.892.us, double* %scevgep6, align 8 %lsr.iv.next = add i64 %lsr.iv, -1 %scevgep5 = getelementptr double, double* %lsr.iv4, i64 1 %.658.us = icmp sgt i64 %lsr.iv.next, 1 br i1 %.658.us, label %B78.us, label %B126.us B78.lr.ph.us: ; preds = %B52.if.us %.771.us.pre = load double, double* %y.sroa.56.038.us, align 8 %.808.us.phi.trans.insert = getelementptr double, double* %y.sroa.56.038.us, i64 1 %.809.us.pre = load double, double* %.808.us.phi.trans.insert, align 8 %scevgep3 = getelementptr double, double* %y.sroa.56.038.us, i64 2 br label %B78.us B52.if.lr.ph.split: ; preds = %B52.if.lr.ph %102 = icmp sgt i64 %spec.select29, 0 br i1 %102, label %B52.if.us44.preheader, label %B52.if.preheader B52.if.preheader: ; preds = %B52.if.lr.ph.split %103 = bitcast i8* %.6.i21 to double* %104 = bitcast i8* %.6.i to double* %105 = add i64 %arg.steps, 1 br label %B52.if B52.if.us44.preheader: ; preds = %B52.if.lr.ph.split %106 = bitcast i8* %.6.i21 to double* %107 = bitcast i8* %.6.i to double* %108 = add i64 %spec.select29, 1 br label %B52.if.us44 B52.if.us44: ; preds = %B76.B126_crit_edge.us85, %B52.if.us44.preheader %y.sroa.0.039.us46 = phi i8* [ %x.1.sroa.0.037.us48, %B76.B126_crit_edge.us85 ], [ %.177, %B52.if.us44.preheader ] %y.sroa.56.038.us47 = phi double* [ %x.1.sroa.60.036.us49, %B76.B126_crit_edge.us85 ], [ %106, %B52.if.us44.preheader ] %x.1.sroa.0.037.us48 = phi i8* [ %y.sroa.0.039.us46, %B76.B126_crit_edge.us85 ], [ %.69, %B52.if.us44.preheader ] %x.1.sroa.60.036.us49 = phi double* [ %y.sroa.56.038.us47, %B76.B126_crit_edge.us85 ], [ %107, %B52.if.us44.preheader ] %.404.035.us50 = phi i64 [ %.468.us51, %B76.B126_crit_edge.us85 ], [ %arg.steps, %B52.if.us44.preheader ] %.771.us66.pre = load double, double* %y.sroa.56.038.us47, align 8 %.808.us70.phi.trans.insert = getelementptr double, double* %y.sroa.56.038.us47, i64 1 %.809.us71.pre = load double, double* %.808.us70.phi.trans.insert, align 8 %scevgep9 = getelementptr double, double* 
%y.sroa.56.038.us47, i64 2 br label %B78.us56 B78.us56: ; preds = %B78.us56, %B52.if.us44 %lsr.iv10 = phi double* [ %scevgep11, %B78.us56 ], [ %scevgep9, %B52.if.us44 ] %lsr.iv7 = phi i64 [ %lsr.iv.next8, %B78.us56 ], [ %108, %B52.if.us44 ] %.809.us71 = phi double [ %.809.us71.pre, %B52.if.us44 ], [ %.874.us79, %B78.us56 ] %.771.us66 = phi double [ %.771.us66.pre, %B52.if.us44 ], [ %.892.us81, %B78.us56 ] %scevgep12 = getelementptr double, double* %lsr.iv10, i64 -1 %.816.us72 = fmul double %.809.us71, 2.000000e+00 %.826.us73 = fadd double %.771.us66, %.816.us72 %.874.us79 = load double, double* %lsr.iv10, align 8 %.882.us80 = fadd double %.874.us79, %.826.us73 %.892.us81 = fmul double %.882.us80, 2.500000e-01 store double %.892.us81, double* %scevgep12, align 8 %lsr.iv.next8 = add i64 %lsr.iv7, -1 %scevgep11 = getelementptr double, double* %lsr.iv10, i64 1 %.658.us82 = icmp sgt i64 %lsr.iv.next8, 1 br i1 %.658.us82, label %B78.us56, label %B76.B126_crit_edge.us85 B76.B126_crit_edge.us85: ; preds = %B78.us56 %.468.us51 = add nsw i64 %.404.035.us50, -1 %.459.us55 = icmp sgt i64 %.404.035.us50, 1 br i1 %.459.us55, label %B52.if.us44, label %B140 B140: ; preds = %B52.if, %B76.B126_crit_edge.us85, %B126.us, %for.end %x.1.sroa.60.0.lcssa = phi double* [ %114, %for.end ], [ %y.sroa.56.038.us, %B126.us ], [ %y.sroa.56.038.us47, %B76.B126_crit_edge.us85 ], [ %y.sroa.56.038, %B52.if ] %x.1.sroa.0.0.lcssa = phi i8* [ %.69, %for.end ], [ %y.sroa.0.039.us, %B126.us ], [ %y.sroa.0.039.us46, %B76.B126_crit_edge.us85 ], [ %y.sroa.0.039, %B52.if ] %y.sroa.0.0.lcssa = phi i8* [ %.177, %for.end ], [ %x.1.sroa.0.037.us, %B126.us ], [ %x.1.sroa.0.037.us48, %B76.B126_crit_edge.us85 ], [ %x.1.sroa.0.037, %B52.if ] tail call void @NRT_decref(i8* %y.sroa.0.0.lcssa) %retptr.repack328 = bitcast { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %retptr to i8** store i8* %x.1.sroa.0.0.lcssa, i8** %retptr.repack328, align 8 %retptr.repack4 = getelementptr inbounds { i8*, i8*, i64, i64, 
double*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %retptr, i64 0, i32 1 store i8* null, i8** %retptr.repack4, align 8 %retptr.repack6 = getelementptr inbounds { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %retptr, i64 0, i32 2 store i64 %arg.x.5.0, i64* %retptr.repack6, align 8 %retptr.repack8 = getelementptr inbounds { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %retptr, i64 0, i32 3 store i64 8, i64* %retptr.repack8, align 8 %retptr.repack10 = getelementptr inbounds { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %retptr, i64 0, i32 4 store double* %x.1.sroa.60.0.lcssa, double** %retptr.repack10, align 8 %109 = getelementptr inbounds { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %retptr, i64 0, i32 5, i64 0 store i64 %arg.x.5.0, i64* %109, align 8 %110 = getelementptr inbounds { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %retptr, i64 0, i32 6, i64 0 store i64 8, i64* %110, align 8 ret i32 0 for.end.loopexit92: ; preds = %for.body.endif.endif.lr.ph %111 = bitcast double* %arg.x.4 to i64* %112 = bitcast double* undef to i64* %.11430 = load i64, i64* %111, align 8 %113 = bitcast i8* %.6.i to i64* store i64 %.11430, i64* %113, align 8 br label %for.end for.end: ; preds = %for.body.endif.endif.us, %for.end.loopexit92, %for.body.endif.endif.us.prol.loopexit, %middle.block, %B86.i %114 = bitcast i8* %.6.i to double* %115 = bitcast i8* %.6.i to i64* %.177 = tail call i8* @NRT_MemInfo_alloc_safe_aligned(i64 %.68, i32 32) %.5.i20 = getelementptr i8, i8* %.177, i64 24 %116 = bitcast i8* %.5.i20 to i8** %.6.i21 = load i8*, i8** %116, align 8 %.181 = bitcast i8* %.6.i21 to double* tail call void @NRT_decref(i8* %arg.x.0) %117 
= bitcast i8* undef to i64* %.2472 = load i64, i64* %115, align 8 %118 = bitcast i8* %.6.i21 to i64* store i64 %.2472, i64* %118, align 8 %.309 = add i64 %arg.x.5.0, -1 %.323 = getelementptr double, double* %114, i64 %.309 %119 = bitcast double* %.323 to i64* %.3243 = load i64, i64* %119, align 8 %.362 = getelementptr double, double* %.181, i64 %.309 %120 = bitcast double* %.362 to i64* store i64 %.3243, i64* %120, align 8 %.414 = icmp sgt i64 %arg.steps, 0 br i1 %.414, label %B52.if.lr.ph, label %B140 B52.if: ; preds = %B52.if, %B52.if.preheader %lsr.iv13 = phi i64 [ %lsr.iv.next14, %B52.if ], [ %105, %B52.if.preheader ] %y.sroa.0.039 = phi i8* [ %x.1.sroa.0.037, %B52.if ], [ %.177, %B52.if.preheader ] %y.sroa.56.038 = phi double* [ %x.1.sroa.60.036, %B52.if ], [ %103, %B52.if.preheader ] %x.1.sroa.0.037 = phi i8* [ %y.sroa.0.039, %B52.if ], [ %.69, %B52.if.preheader ] %x.1.sroa.60.036 = phi double* [ %y.sroa.56.038, %B52.if ], [ %104, %B52.if.preheader ] %lsr.iv.next14 = add i64 %lsr.iv13, -1 %.459 = icmp sgt i64 %lsr.iv.next14, 1 br i1 %.459, label %B52.if, label %B140 } declare noalias i8* @NRT_MemInfo_alloc_safe_aligned(i64, i32) local_unnamed_addr define i8* @"_ZN7cpython8__main__14blur_numba$244E5ArrayIdLi1E1C7mutable7alignedEx"(i8* %py_closure, i8* %py_args, i8* nocapture readnone %py_kws) local_unnamed_addr { entry: %.5 = alloca i8*, align 8 %.6 = alloca i8*, align 8 %.7 = call i32 (i8*, i8*, i64, i64, ...) 
@PyArg_UnpackTuple(i8* %py_args, i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.const.blur_numba, i64 0, i64 0), i64 2, i64 2, i8** nonnull %.5, i8** nonnull %.6) %.8 = icmp eq i32 %.7, 0 %.31 = alloca { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }, align 8 %.56 = alloca { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }, align 8 %excinfo = alloca { i8*, i32 }*, align 8 %.93 = alloca { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }, align 8 %0 = bitcast { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %.31 to i8* call void @llvm.memset.p0i8.i64(i8* nonnull %0, i8 0, i64 56, i32 8, i1 false) br i1 %.8, label %entry.if, label %entry.endif, !prof !11 entry.if: ; preds = %entry.endif.endif.endif.endif.endif.endif.endif.endif.endif.endif, %entry.endif.endif.endif.endif.endif.endif.endif.endif.if, %entry.endif.endif.endif.endif.endif.endif.endif.if.if, %entry.endif.endif.endif.endif.endif.endif.endif.endif, %entry.endif.endif.endif.endif.endif.endif.endif.if, %entry.endif.endif.endif.endif.endif.if, %entry.endif.endif.endif, %entry ret i8* null entry.endif: ; preds = %entry %.12 = icmp eq i8* %py_closure, null br i1 %.12, label %entry.endif.if, label %entry.endif.endif, !prof !11 entry.endif.if: ; preds = %entry.endif %.14 = call i32 @puts(i8* getelementptr inbounds ([38 x i8], [38 x i8]* @".const.Fatal error: missing _dynfunc.Closure", i64 0, i64 0)) unreachable entry.endif.endif: ; preds = %entry.endif %.16 = ptrtoint i8* %py_closure to i64 %.17 = add i64 %.16, 24 %.19 = inttoptr i64 %.17 to { i8* }* %.202 = bitcast { i8* }* %.19 to i8** %.21 = load i8*, i8** %.202, align 8 %.26 = icmp eq i8* %.21, null br i1 %.26, label %entry.endif.endif.if, label %entry.endif.endif.endif, !prof !11 entry.endif.endif.if: ; preds = %entry.endif.endif call void @PyErr_SetString(i8* nonnull @PyExc_RuntimeError, i8* getelementptr inbounds ([20 x i8], [20 x i8]* @".const.missing Environment", i64 0, i64 0)) ret i8* null entry.endif.endif.endif: ; preds = 
%entry.endif.endif %1 = bitcast { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %.31 to i8** %.30 = load i8*, i8** %.5, align 8 %.33 = bitcast { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %.31 to i8* %.34 = call i32 @NRT_adapt_ndarray_from_python(i8* %.30, i8* nonnull %.33) %.35 = icmp eq i32 %.34, 0 %.36.fca.0.load = load i8*, i8** %1, align 8 %2 = bitcast { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %.31 to i8* %sunkaddr = getelementptr i8, i8* %2, i64 8 %3 = bitcast i8* %sunkaddr to i8** %.36.fca.1.load = load i8*, i8** %3, align 8 %4 = bitcast { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %.31 to i8* %sunkaddr3 = getelementptr i8, i8* %4, i64 16 %5 = bitcast i8* %sunkaddr3 to i64* %.36.fca.2.load = load i64, i64* %5, align 8 %6 = bitcast { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %.31 to i8* %sunkaddr4 = getelementptr i8, i8* %6, i64 24 %7 = bitcast i8* %sunkaddr4 to i64* %.36.fca.3.load = load i64, i64* %7, align 8 %8 = bitcast { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %.31 to i8* %sunkaddr5 = getelementptr i8, i8* %8, i64 32 %9 = bitcast i8* %sunkaddr5 to double** %.36.fca.4.load = load double*, double** %9, align 8 %10 = bitcast { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %.31 to i8* %sunkaddr6 = getelementptr i8, i8* %10, i64 40 %11 = bitcast i8* %sunkaddr6 to i64* %.36.fca.5.0.load = load i64, i64* %11, align 8 %12 = bitcast { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %.31 to i8* %sunkaddr7 = getelementptr i8, i8* %12, i64 48 %13 = bitcast i8* %sunkaddr7 to i64* %.36.fca.6.0.load = load i64, i64* %13, align 8 br i1 %.35, label %entry.endif.endif.endif.endif, label %entry.if, !prof !12 entry.endif.endif.endif.endif: ; preds = %entry.endif.endif.endif %.41 = load i8*, i8** %.6, align 8 %.43 = call i8* @PyNumber_Long(i8* %.41) %.44 = icmp eq i8* %.43, null br i1 %.44, label %entry.endif.endif.endif.endif.endif, label %entry.endif.endif.endif.endif.if, !prof !11 
entry.endif.endif.endif.endif.if: ; preds = %entry.endif.endif.endif.endif %.46 = call i64 @PyLong_AsLongLong(i8* nonnull %.43) call void @Py_DecRef(i8* nonnull %.43) br label %entry.endif.endif.endif.endif.endif entry.endif.endif.endif.endif.endif: ; preds = %entry.endif.endif.endif.endif.if, %entry.endif.endif.endif.endif %.42.0 = phi i64 [ %.46, %entry.endif.endif.endif.endif.if ], [ undef, %entry.endif.endif.endif.endif ] %.51 = call i8* @PyErr_Occurred() %.52 = icmp eq i8* %.51, null br i1 %.52, label %entry.endif.endif.endif.endif.endif.endif, label %entry.endif.endif.endif.endif.endif.if, !prof !12 entry.endif.endif.endif.endif.endif.if: ; preds = %entry.endif.endif.endif.endif.endif call void @NRT_decref(i8* %.36.fca.0.load) br label %entry.if entry.endif.endif.endif.endif.endif.endif: ; preds = %entry.endif.endif.endif.endif.endif %.fca.1.gep = getelementptr inbounds { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %.56, i64 0, i32 1 %.fca.2.gep = getelementptr inbounds { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %.56, i64 0, i32 2 %.fca.3.gep = getelementptr inbounds { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %.56, i64 0, i32 3 %.fca.4.gep = getelementptr inbounds { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %.56, i64 0, i32 4 %.fca.5.0.gep = getelementptr inbounds { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %.56, i64 0, i32 5, i64 0 %.fca.6.0.gep = getelementptr inbounds { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %.56, i64 0, i32 6, i64 0 %14 = bitcast { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %.56 to i8* call void @llvm.memset.p0i8.i64(i8* nonnull %14, i8 
0, i64 56, i32 8, i1 false) %.60 = call i32 @"_ZN8__main__14blur_numba$244E5ArrayIdLi1E1C7mutable7alignedEx"({ i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* nonnull %.56, { i8*, i32 }** nonnull %excinfo, i8* undef, i8* %.36.fca.0.load, i8* %.36.fca.1.load, i64 %.36.fca.2.load, i64 %.36.fca.3.load, double* %.36.fca.4.load, i64 %.36.fca.5.0.load, i64 %.36.fca.6.0.load, i64 %.42.0) %.61 = load { i8*, i32 }*, { i8*, i32 }** %excinfo, align 8 %15 = bitcast { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %.56 to i64* %.70.fca.0.load3 = load i64, i64* %15, align 8 %16 = bitcast i8** %.fca.1.gep to i64* %.70.fca.1.load2 = load i64, i64* %16, align 8 %.70.fca.2.load = load i64, i64* %.fca.2.gep, align 8 %.70.fca.3.load = load i64, i64* %.fca.3.gep, align 8 %17 = bitcast double** %.fca.4.gep to i64* %.70.fca.4.load1 = load i64, i64* %17, align 8 %.70.fca.5.0.load = load i64, i64* %.fca.5.0.gep, align 8 %.70.fca.6.0.load = load i64, i64* %.fca.6.0.gep, align 8 call void @NRT_decref(i8* %.36.fca.0.load) switch i32 %.60, label %entry.endif.endif.endif.endif.endif.endif.endif [ i32 -2, label %entry.endif.endif.endif.endif.endif.endif.if.if i32 0, label %entry.endif.endif.endif.endif.endif.endif.if.endif ] entry.endif.endif.endif.endif.endif.endif.endif: ; preds = %entry.endif.endif.endif.endif.endif.endif %.68 = icmp sgt i32 %.60, 0 br i1 %.68, label %entry.endif.endif.endif.endif.endif.endif.endif.if, label %entry.endif.endif.endif.endif.endif.endif.endif.endif entry.endif.endif.endif.endif.endif.endif.if.if: ; preds = %entry.endif.endif.endif.endif.endif.endif call void @Py_IncRef(i8* nonnull @_Py_NoneStruct) ret i8* @_Py_NoneStruct entry.endif.endif.endif.endif.endif.endif.if.endif: ; preds = %entry.endif.endif.endif.endif.endif.endif %sunkaddr8 = getelementptr i8, i8* %.21, i64 24 %18 = bitcast i8* %sunkaddr8 to i8** %.91 = load i8*, i8** %18, align 8 %.92 = call i8* @PyList_GetItem(i8* %.91, i64 0) %19 = bitcast { i8*, i8*, i64, i64, double*, [1 x i64], [1 x 
i64] }* %.93 to i64* store i64 %.70.fca.0.load3, i64* %19, align 8 %inserted.strides.fca.1.gep = getelementptr inbounds { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %.93, i64 0, i32 1 %20 = bitcast i8** %inserted.strides.fca.1.gep to i64* store i64 %.70.fca.1.load2, i64* %20, align 8 %inserted.strides.fca.2.gep = getelementptr inbounds { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %.93, i64 0, i32 2 store i64 %.70.fca.2.load, i64* %inserted.strides.fca.2.gep, align 8 %inserted.strides.fca.3.gep = getelementptr inbounds { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %.93, i64 0, i32 3 store i64 %.70.fca.3.load, i64* %inserted.strides.fca.3.gep, align 8 %inserted.strides.fca.4.gep = getelementptr inbounds { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %.93, i64 0, i32 4 %21 = bitcast double** %inserted.strides.fca.4.gep to i64* store i64 %.70.fca.4.load1, i64* %21, align 8 %inserted.strides.fca.5.0.gep = getelementptr inbounds { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %.93, i64 0, i32 5, i64 0 store i64 %.70.fca.5.0.load, i64* %inserted.strides.fca.5.0.gep, align 8 %inserted.strides.fca.6.0.gep = getelementptr inbounds { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }, { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %.93, i64 0, i32 6, i64 0 store i64 %.70.fca.6.0.load, i64* %inserted.strides.fca.6.0.gep, align 8 %.95 = bitcast { i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %.93 to i8* %.96 = call i8* @NRT_adapt_ndarray_to_python(i8* nonnull %.95, i32 1, i32 1, i8* %.92) ret i8* %.96 entry.endif.endif.endif.endif.endif.endif.endif.if: ; preds = %entry.endif.endif.endif.endif.endif.endif.endif call void @PyErr_Clear() %.101 = load { 
i8*, i32 }, { i8*, i32 }* %.61, align 8 %.102 = extractvalue { i8*, i32 } %.101, 0 %.104 = extractvalue { i8*, i32 } %.101, 1 %.105 = call i8* @numba_unpickle(i8* %.102, i32 %.104) %.106 = icmp eq i8* %.105, null br i1 %.106, label %entry.if, label %entry.endif.endif.endif.endif.endif.endif.endif.if.if, !prof !11 entry.endif.endif.endif.endif.endif.endif.endif.endif: ; preds = %entry.endif.endif.endif.endif.endif.endif.endif switch i32 %.60, label %entry.endif.endif.endif.endif.endif.endif.endif.endif.endif.endif [ i32 -3, label %entry.endif.endif.endif.endif.endif.endif.endif.endif.if i32 -1, label %entry.if ] entry.endif.endif.endif.endif.endif.endif.endif.if.if: ; preds = %entry.endif.endif.endif.endif.endif.endif.endif.if call void @numba_do_raise(i8* nonnull %.105) br label %entry.if entry.endif.endif.endif.endif.endif.endif.endif.endif.if: ; preds = %entry.endif.endif.endif.endif.endif.endif.endif.endif call void @PyErr_SetNone(i8* nonnull @PyExc_StopIteration) br label %entry.if entry.endif.endif.endif.endif.endif.endif.endif.endif.endif.endif: ; preds = %entry.endif.endif.endif.endif.endif.endif.endif.endif call void @PyErr_SetString(i8* nonnull @PyExc_SystemError, i8* getelementptr inbounds ([43 x i8], [43 x i8]* @".const.unknown error when calling native function", i64 0, i64 0)) br label %entry.if } declare i32 @PyArg_UnpackTuple(i8*, i8*, i64, i64, ...) 
local_unnamed_addr ; Function Attrs: nounwind declare i32 @puts(i8* nocapture readonly) local_unnamed_addr #0 declare void @PyErr_SetString(i8*, i8*) local_unnamed_addr declare i32 @NRT_adapt_ndarray_from_python(i8* nocapture, i8* nocapture) local_unnamed_addr declare i8* @PyNumber_Long(i8*) local_unnamed_addr declare i64 @PyLong_AsLongLong(i8*) local_unnamed_addr declare void @Py_DecRef(i8*) local_unnamed_addr declare i8* @PyErr_Occurred() local_unnamed_addr declare void @Py_IncRef(i8*) local_unnamed_addr declare i8* @PyList_GetItem(i8*, i64) local_unnamed_addr declare i8* @NRT_adapt_ndarray_to_python(i8* nocapture, i32, i32, i8*) local_unnamed_addr declare void @PyErr_Clear() local_unnamed_addr declare i8* @numba_unpickle(i8*, i32) local_unnamed_addr declare void @numba_do_raise(i8*) local_unnamed_addr declare void @PyErr_SetNone(i8*) local_unnamed_addr ; Function Attrs: noinline norecurse nounwind define linkonce_odr void @NRT_incref(i8* %.1) local_unnamed_addr #1 { .3: %.4 = icmp eq i8* %.1, null br i1 %.4, label %.3.if, label %.3.endif, !prof !11 .3.if: ; preds = %.3 ret void .3.endif: ; preds = %.3 %.7 = bitcast i8* %.1 to i64* %.4.i = atomicrmw add i64* %.7, i64 1 monotonic ret void } ; Function Attrs: noinline define linkonce_odr void @NRT_decref(i8* %.1) local_unnamed_addr #2 { .3: %.4 = icmp eq i8* %.1, null br i1 %.4, label %.3.if, label %.3.endif, !prof !11 .3.if: ; preds = %.3.endif, %.3 ret void .3.endif: ; preds = %.3 %.7 = bitcast i8* %.1 to i64* %.4.i = atomicrmw sub i64* %.7, i64 1 monotonic %.9 = icmp eq i64 %.4.i, 1 br i1 %.9, label %.3.endif.if, label %.3.if, !prof !11 .3.endif.if: ; preds = %.3.endif tail call void @NRT_MemInfo_call_dtor(i8* nonnull %.1) ret void } declare void @NRT_MemInfo_call_dtor(i8*) local_unnamed_addr ; Function Attrs: argmemonly nounwind declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i32, i1) #3 ; Function Attrs: nounwind declare void @llvm.stackprotector(i8*, i8**) #0 attributes #0 = { nounwind } 
attributes #1 = { noinline norecurse nounwind } attributes #2 = { noinline } attributes #3 = { argmemonly nounwind } !0 = !{!1} !1 = distinct !{!1, !2} !2 = distinct !{!2, !"LVerDomain"} !3 = !{!4} !4 = distinct !{!4, !2} !5 = distinct !{!5, !6} !6 = !{!"llvm.loop.isvectorized", i32 1} !7 = distinct !{!7, !8} !8 = !{!"llvm.loop.unroll.disable"} !9 = distinct !{!9, !8} !10 = distinct !{!10, !6} !11 = !{!"branch_weights", i32 1, i32 99} !12 = !{!"branch_weights", i32 99, i32 1}
What's impressive about numba in this case is that it is able to beat all but the most optimized of our implementations without any help. Like Cython, numba can do an even better job when you provide it with more information about how a function will be called.
%%writefile profileme.py
import os
import glob
list(os.walk('/tmp'))
!python -m cProfile profileme.py
# Same profiling done programmatically: cProfile.run() exec's the given
# statement string and prints its profile report to stdout.
# NOTE: the statement is passed as a string because cProfile compiles and
# executes it; the list() call forces the os.walk generator to run.
import os
import cProfile
cProfile.run("list(os.walk('/tmp'))")
# %prun is IPython's wrapper around cProfile: profiles the expression and
# shows the report in the pager instead of stdout.
%prun list(os.walk('/tmp'))
# line_profiler is a third-party extension (pip install line_profiler);
# loading it registers the %lprun magic for line-by-line timing.
%load_ext line_profiler
# %lprun -f FUNC EXPR: run EXPR and report time spent on each source line
# of FUNC. blur_py / blur_numba / blur_np and the arguments x, steps are
# defined in earlier notebook cells (not visible in this chunk).
%lprun -f blur_py blur_py(x, steps)
%lprun -f blur_numba blur_numba(x, steps)
%lprun -f blur_np blur_np(x, steps)