#!/usr/bin/env python
# coding: utf-8

# # Advanced NumPy

# In[ ]:


import numpy as np
import pandas as pd
np.random.seed(12345)
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(10, 6))
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 20
np.set_printoptions(precision=4, suppress=True)


# ## ndarray Object Internals

# In[ ]:


np.ones((10, 5)).shape


# In[ ]:


np.ones((3, 4, 5), dtype=np.float64).strides


# ### NumPy dtype Hierarchy

# In[ ]:


ints = np.ones(10, dtype=np.uint16)
floats = np.ones(10, dtype=np.float32)
np.issubdtype(ints.dtype, np.integer)
np.issubdtype(floats.dtype, np.floating)


# In[ ]:


np.float64.mro()


# In[ ]:


np.issubdtype(ints.dtype, np.number)


# ## Advanced Array Manipulation

# ### Reshaping Arrays

# In[ ]:


arr = np.arange(8)
arr
arr.reshape((4, 2))


# In[ ]:


arr.reshape((4, 2)).reshape((2, 4))


# In[ ]:


arr = np.arange(15)
arr.reshape((5, -1))


# In[ ]:


other_arr = np.ones((3, 5))
other_arr.shape
arr.reshape(other_arr.shape)


# In[ ]:


arr = np.arange(15).reshape((5, 3))
arr
arr.ravel()


# In[ ]:


arr.flatten()


# ### C Versus Fortran Order

# In[ ]:


arr = np.arange(12).reshape((3, 4))
arr
arr.ravel()
arr.ravel('F')


# ### Concatenating and Splitting Arrays

# In[ ]:


arr1 = np.array([[1, 2, 3], [4, 5, 6]])
arr2 = np.array([[7, 8, 9], [10, 11, 12]])
np.concatenate([arr1, arr2], axis=0)
np.concatenate([arr1, arr2], axis=1)


# In[ ]:


np.vstack((arr1, arr2))
np.hstack((arr1, arr2))


# In[ ]:


arr = np.random.randn(5, 2)
arr
first, second, third = np.split(arr, [1, 3])
first
second
third


# #### Stacking helpers: r_ and c_

# In[ ]:


arr = np.arange(6)
arr1 = arr.reshape((3, 2))
arr2 = np.random.randn(3, 2)
np.r_[arr1, arr2]
np.c_[np.r_[arr1, arr2], arr]


# In[ ]:


np.c_[1:6, -10:-5]


# ### Repeating Elements: tile and repeat

# In[ ]:


arr = np.arange(3)
arr
arr.repeat(3)


# In[ ]:


arr.repeat([2, 3, 4])


# In[ ]:


arr = np.random.randn(2, 2)
arr
arr.repeat(2, axis=0)


# In[ ]:


arr.repeat([2, 3], axis=0)
arr.repeat([2, 3], axis=1)


# In[ ]:


arr
np.tile(arr, 2)


# In[ ]:


arr
np.tile(arr, (2, 1))
np.tile(arr, (3, 2))


# ### Fancy Indexing Equivalents: take and put

# In[ ]:


arr = np.arange(10) * 100
inds = [7, 1, 2, 6]
arr[inds]


# In[ ]:


arr.take(inds)
arr.put(inds, 42)
arr
arr.put(inds, [40, 41, 42, 43])
arr


# In[ ]:


inds = [2, 0, 2, 1]
arr = np.random.randn(2, 4)
arr
arr.take(inds, axis=1)


# ## Broadcasting

# In[ ]:


arr = np.arange(5)
arr
arr * 4


# In[ ]:


arr = np.random.randn(4, 3)
arr.mean(0)
demeaned = arr - arr.mean(0)
demeaned
demeaned.mean(0)


# In[ ]:


arr
row_means = arr.mean(1)
row_means.shape
row_means.reshape((4, 1))
demeaned = arr - row_means.reshape((4, 1))
demeaned.mean(1)


# ### Broadcasting Over Other Axes

# In[ ]:


arr - arr.mean(1)


# In[ ]:


arr - arr.mean(1).reshape((4, 1))


# In[ ]:


arr = np.zeros((4, 4))
arr_3d = arr[:, np.newaxis, :]
arr_3d.shape
arr_1d = np.random.normal(size=3)
arr_1d[:, np.newaxis]
arr_1d[np.newaxis, :]


# In[ ]:


arr = np.random.randn(3, 4, 5)
depth_means = arr.mean(2)
depth_means
depth_means.shape
demeaned = arr - depth_means[:, :, np.newaxis]
demeaned.mean(2)


# ```python
# def demean_axis(arr, axis=0):
#     means = arr.mean(axis)
# 
#     # This generalizes things like [:, :, np.newaxis] to N dimensions
#     indexer = [slice(None)] * arr.ndim
#     indexer[axis] = np.newaxis
#     return arr - means[indexer]
# ```

# ### Setting Array Values by Broadcasting

# In[ ]:


arr = np.zeros((4, 3))
arr[:] = 5
arr


# In[ ]:


col = np.array([1.28, -0.42, 0.44, 1.6])
arr[:] = col[:, np.newaxis]
arr
arr[:2] = [[-1.37], [0.509]]
arr


# ## Advanced ufunc Usage

# ### ufunc Instance Methods

# In[ ]:


arr = np.arange(10)
np.add.reduce(arr)
arr.sum()


# In[ ]:


np.random.seed(12346)  # for reproducibility
arr = np.random.randn(5, 5)
arr[::2].sort(1) # sort a few rows
arr[:, :-1] < arr[:, 1:]
np.logical_and.reduce(arr[:, :-1] < arr[:, 1:], axis=1)


# In[ ]:


arr = np.arange(15).reshape((3, 5))
np.add.accumulate(arr, axis=1)


# In[ ]:


arr = np.arange(3).repeat([1, 2, 2])
arr
np.multiply.outer(arr, np.arange(5))


# In[ ]:


x, y = np.random.randn(3, 4), np.random.randn(5)
result = np.subtract.outer(x, y)
result.shape


# In[ ]:


arr = np.arange(10)
np.add.reduceat(arr, [0, 5, 8])


# In[ ]:


arr = np.multiply.outer(np.arange(4), np.arange(5))
arr
np.add.reduceat(arr, [0, 2, 4], axis=1)


# ### Writing New ufuncs in Python

# In[ ]:


def add_elements(x, y):
    return x + y
add_them = np.frompyfunc(add_elements, 2, 1)
add_them(np.arange(8), np.arange(8))


# In[ ]:


add_them = np.vectorize(add_elements, otypes=[np.float64])
add_them(np.arange(8), np.arange(8))


# In[ ]:


arr = np.random.randn(10000)
get_ipython().run_line_magic('timeit', 'add_them(arr, arr)')
get_ipython().run_line_magic('timeit', 'np.add(arr, arr)')


# ## Structured and Record Arrays

# In[ ]:


dtype = [('x', np.float64), ('y', np.int32)]
sarr = np.array([(1.5, 6), (np.pi, -2)], dtype=dtype)
sarr


# In[ ]:


sarr[0]
sarr[0]['y']


# In[ ]:


sarr['x']


# ### Nested dtypes and Multidimensional Fields

# In[ ]:


dtype = [('x', np.int64, 3), ('y', np.int32)]
arr = np.zeros(4, dtype=dtype)
arr


# In[ ]:


arr[0]['x']


# In[ ]:


arr['x']


# In[ ]:


dtype = [('x', [('a', 'f8'), ('b', 'f4')]), ('y', np.int32)]
data = np.array([((1, 2), 5), ((3, 4), 6)], dtype=dtype)
data['x']
data['y']
data['x']['a']


# ### Why Use Structured Arrays?

# ## More About Sorting

# In[ ]:


arr = np.random.randn(6)
arr.sort()
arr


# In[ ]:


arr = np.random.randn(3, 5)
arr
arr[:, 0].sort()  # Sort first column values in-place
arr


# In[ ]:


arr = np.random.randn(5)
arr
np.sort(arr)
arr


# In[ ]:


arr = np.random.randn(3, 5)
arr
arr.sort(axis=1)
arr


# In[ ]:


arr[:, ::-1]


# ### Indirect Sorts: argsort and lexsort

# In[ ]:


values = np.array([5, 0, 1, 3, 2])
indexer = values.argsort()
indexer
values[indexer]


# In[ ]:


arr = np.random.randn(3, 5)
arr[0] = values
arr
arr[:, arr[0].argsort()]


# In[ ]:


first_name = np.array(['Bob', 'Jane', 'Steve', 'Bill', 'Barbara'])
last_name = np.array(['Jones', 'Arnold', 'Arnold', 'Jones', 'Walters'])
sorter = np.lexsort((first_name, last_name))
sorter
zip(last_name[sorter], first_name[sorter])


# ### Alternative Sort Algorithms

# In[ ]:


values = np.array(['2:first', '2:second', '1:first', '1:second',
                   '1:third'])
key = np.array([2, 2, 1, 1, 1])
indexer = key.argsort(kind='mergesort')
indexer
values.take(indexer)


# ### Partially Sorting Arrays

# In[ ]:


np.random.seed(12345)
arr = np.random.randn(20)
arr
np.partition(arr, 3)


# In[ ]:


indices = np.argpartition(arr, 3)
indices
arr.take(indices)


# ### numpy.searchsorted: Finding Elements in a Sorted Array

# In[ ]:


arr = np.array([0, 1, 7, 12, 15])
arr.searchsorted(9)


# In[ ]:


arr.searchsorted([0, 8, 11, 16])


# In[ ]:


arr = np.array([0, 0, 0, 1, 1, 1, 1])
arr.searchsorted([0, 1])
arr.searchsorted([0, 1], side='right')


# In[ ]:


data = np.floor(np.random.uniform(0, 10000, size=50))
bins = np.array([0, 100, 1000, 5000, 10000])
data


# In[ ]:


labels = bins.searchsorted(data)
labels


# In[ ]:


pd.Series(data).groupby(labels).mean()


# ## Writing Fast NumPy Functions with Numba

# In[ ]:


import numpy as np

def mean_distance(x, y):
    nx = len(x)
    result = 0.0
    count = 0
    for i in range(nx):
        result += x[i] - y[i]
        count += 1
    return result / count


# ```python
# In [209]: x = np.random.randn(10000000)
# 
# In [210]: y = np.random.randn(10000000)
# 
# In [211]: %timeit mean_distance(x, y)
# 1 loop, best of 3: 2 s per loop
# 
# In [212]: %timeit (x - y).mean()
# 100 loops, best of 3: 14.7 ms per loop
# ```

# ```python
# In [213]: import numba as nb
# 
# In [214]: numba_mean_distance = nb.jit(mean_distance)
# ```

# ```python
# @nb.jit
# def mean_distance(x, y):
#     nx = len(x)
#     result = 0.0
#     count = 0
#     for i in range(nx):
#         result += x[i] - y[i]
#         count += 1
#     return result / count
# ```

# ```python
# In [215]: %timeit numba_mean_distance(x, y)
# 100 loops, best of 3: 10.3 ms per loop
# ```

# ```python
# from numba import float64, njit
# 
# @njit(float64(float64[:], float64[:]))
# def mean_distance(x, y):
#     return (x - y).mean()
# ```

# ### Creating Custom numpy.ufunc Objects with Numba

# ```python
# from numba import vectorize
# 
# @vectorize
# def nb_add(x, y):
#     return x + y
# ```

# ```python
# In [13]: x = np.arange(10)
# 
# In [14]: nb_add(x, x)
# Out[14]: array([  0.,   2.,   4.,   6.,   8.,  10.,  12.,  14.,  16.,  18.])
# 
# In [15]: nb_add.accumulate(x, 0)
# Out[15]: array([  0.,   1.,   3.,   6.,  10.,  15.,  21.,  28.,  36.,  45.])
# ```

# ## Advanced Array Input and Output

# ### Memory-Mapped Files

# In[ ]:


mmap = np.memmap('mymmap', dtype='float64', mode='w+',
                 shape=(10000, 10000))
mmap


# In[ ]:


section = mmap[:5]


# In[ ]:


section[:] = np.random.randn(5, 10000)
mmap.flush()
mmap
del mmap


# In[ ]:


mmap = np.memmap('mymmap', dtype='float64', shape=(10000, 10000))
mmap


# In[ ]:


get_ipython().run_line_magic('xdel', 'mmap')
get_ipython().system('rm mymmap')


# ### HDF5 and Other Array Storage Options

# ## Performance Tips

# ### The Importance of Contiguous Memory

# In[ ]:


arr_c = np.ones((1000, 1000), order='C')
arr_f = np.ones((1000, 1000), order='F')
arr_c.flags
arr_f.flags
arr_f.flags.f_contiguous


# In[ ]:


get_ipython().run_line_magic('timeit', 'arr_c.sum(1)')
get_ipython().run_line_magic('timeit', 'arr_f.sum(1)')


# In[ ]:


arr_f.copy('C').flags


# In[ ]:


arr_c[:50].flags.contiguous
arr_c[:, :50].flags


# In[ ]:


get_ipython().run_line_magic('xdel', 'arr_c')
get_ipython().run_line_magic('xdel', 'arr_f')


# In[ ]:


pd.options.display.max_rows = PREVIOUS_MAX_ROWS