In [1]:
%run ../../common_functions/import_all.py

from scipy import optimize
from scipy.integrate import quad, odeint
from scipy.interpolate import interp1d
from scipy.signal import detrend
from scipy.spatial import distance
from matplotlib.legend_handler import HandlerLine2D

from common_functions.setup_notebook import set_css_style, setup_matplotlib, config_ipython
config_ipython()
setup_matplotlib()
set_css_style()

dataset = '../../datasets/oldfaithful.txt'


# Some notes on Numpy and Scipy¶

This is just a very high-level introduction to these two libraries, not meant to be comprehensive at all.

• Numpy
• Array & Matrix manipulation
• Operations on array/matrix
• Linear Algebra
• Polynomials
• Fitting a polynomial to points
• Dealing with NaNs
• The Numpy data format, IO
• Scipy
• Optimization: finding the minimum of a function
• Optimization: finding the roots of a function
• Fitting a functional form, finding its parameters
• Histograms and PDFs
• Computing percentiles
• Interpolate: linear and cubic
• Integrate: area under the curve
• Integrate: solve ODEs
• Remove a linear trend from a noisy signal
• Mathematical distances

Note: some stuff here comes from the Scipy lectures http://www.scipy-lectures.org/index.html

## Numpy¶

### Arrays and Matrices: attributes; slicing; reshaping/resizing/flattening¶

In [3]:
# NOTE: np.array is function to create an ndarray (n-dimensional array), ndarray not to be used to create

# Vector

print('* A vector')
vec = np.array([1, 2, 3, 4, 5])
print('    Dimension %d and shape %s: %s' %(vec.ndim, vec.shape, vec))
print('    Slice of vector: ', vec[2:4])
print('    Slice of vector with step: ', vec[1:4:2])
print('    Slice with newaxis on column: ', vec[:, np.newaxis])
print('    Slice with newaxis on row: ', vec[np.newaxis, :])

* A vector
Dimension 1 and shape (5,): [1 2 3 4 5]
Slice of vector:  [3 4]
Slice of vector with step:  [2 4]
Slice with newaxis on column:  [[1]
[2]
[3]
[4]
[5]]
Slice with newaxis on row:  [[1 2 3 4 5]]

In [4]:
# Matrix

print('* A matrix')
mat = np.array([[1,2,3, 0], [2,3,4, 1], [4, 0, 1, 0]])
print('    This is an ndarray of dimension %d and shape %s:\n %s' %(mat.ndim, mat.shape, mat))
print('    It contains %d elements (size)' %mat.size)
print('    Its element on row 2, col 3 is %d' %mat[1,2])
print('    A slice of it is (second column)', mat[:,1])

* A matrix
This is an ndarray of dimension 2 and shape (3, 4):
[[1 2 3 0]
[2 3 4 1]
[4 0 1 0]]
It contains 12 elements (size)
Its element on row 2, col 3 is 4
A slice of it is (second column) [2 3 0]

In [5]:
# Reshaping, resizing and flattening

print('* Reshaping and resizing', ['t_red'])
a = np.arange(6)
print('    Original vector: ', a)
print('    Reshaping vector to shape (2, 3) (change shape, return reshaped ndarray): ')
a.reshape((2, 3))
print('    Resizing vector to shape (2, 2) (change shape and size, do in place and return None): ')
a
print('    Original matrix: ', mat)
print('    Flattened matrix: ', mat.ravel())

* Reshaping and resizing ['t_red']
Original vector:  [0 1 2 3 4 5]
Reshaping vector to shape (2, 3) (change shape, return reshaped ndarray):

Out[5]:
array([[0, 1, 2],
[3, 4, 5]])
    Resizing vector to shape (2, 2) (change shape and size, do in place and return None):

Out[5]:
array([0, 1, 2, 3, 4, 5])
    Original matrix:  [[1 2 3 0]
[2 3 4 1]
[4 0 1 0]]
Flattened matrix:  [1 2 3 0 2 3 4 1 4 0 1 0]

In [6]:
# Matrices of zeros and ones

print('* Matrices of zeros and ones', ['t_red'])
np.zeros((3, 4))
np.ones((1, 5))

* Matrices of zeros and ones ['t_red']

Out[6]:
array([[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.]])
Out[6]:
array([[1., 1., 1., 1., 1.]])

### Operations on arrays/matrices¶

In [7]:
# Setting the vars to use

print('* Vectors and matrices to use')
v = np.array([1, 2, 3])
A = np.array([[1,2, 0], [2,3,4]])
B = np.array([[0, 1], [2, 0], [1, 1]])
C = np.array([[3, 5, 1], [0, 0, 2]])
print('    vector v is', v)
print('    Matrix A is')
print(A)
print('    Matrix B is')
print(B)
print('    Matrix C is')
print(C)

* Vectors and matrices to use
vector v is [1 2 3]
Matrix A is
[[1 2 0]
[2 3 4]]
Matrix B is
[[0 1]
[2 0]
[1 1]]
Matrix C is
[[3 5 1]
[0 0 2]]

In [8]:
# matrix transpose

print('* Matrix and transpose')
print('    Transpose of A is\n', A.T)

* Matrix and transpose
Transpose of A is
[[1 2]
[2 3]
[0 4]]

In [9]:
# Some Matrix properties of elements

print('* Properties of matrix elements')
print('    Sum, mean, std of elements: ', A.sum(), A.mean(), A.std())
print('    Sum of A on axis 0: ', A.sum(axis=0))
print('    Sum of A on axis 1: ', A.sum(axis=1))

* Properties of matrix elements
Sum, mean, std of elements:  12 2.0 1.2909944487358056
Sum of A on axis 0:  [3 5 4]
Sum of A on axis 1:  [3 9]

In [10]:
# Product of matrices

print('* Matrix product')
print('    Product A * B is')
print(np.dot(A, B))

* Matrix product
Product A * B is
[[ 4  1]
[10  6]]

In [11]:
# scalar * vector; scalar * matrix

print('* Multiplication and sum with scalar (v and A)')
print('    2 * v: ', 2 * v)
print('    2 * A: ', 2 * A)
print('    v + 1: ', v + 1)
print('    A + 1: ', A + 1)

* Multiplication and sum with scalar (v and A)
2 * v:  [2 4 6]
2 * A:  [[2 4 0]
[4 6 8]]
v + 1:  [2 3 4]
A + 1:  [[2 3 1]
[3 4 5]]

In [12]:
# Matrix arithmetics

print('* Arithmetics with matrices')
print('    Summing A and C')
print(A + C)
print('    Subtracting C from A')
print(A - C)

* Arithmetics with matrices
Summing A and C
[[4 7 1]
[2 3 6]]
Subtracting C from A
[[-2 -3 -1]
[ 2  3  2]]

In [13]:
# sorting a matrix

print('* Sorting matrix')
A_copy = A.copy()
A.sort()
print('    Sorting A: ', A)
A = A_copy.copy()
A.sort(axis=0)
print('    Sorting A on axis 0', A)
A = A_copy.copy()
A.sort(axis=1)
print('    Sorting A on axis 1', A)

* Sorting matrix
Sorting A:  [[0 1 2]
[2 3 4]]
Sorting A on axis 0 [[1 2 0]
[2 3 4]]
Sorting A on axis 1 [[0 1 2]
[2 3 4]]

In [14]:
# array shuffling at random

print('* Randomly shuffling array:')
np.random.shuffle(v)                 # shuffles and returns None
print(v)

* Randomly shuffling array:
[3 2 1]


### Linear Algebra¶

In [15]:
# set the vars to use

v = np.array([1, 2, 1, 1])
A = np.array([[1, 2, 3], [0, 1, 0], [0, 2, 1]])

In [16]:
# determinant & co

print('* Determinant, trace, inverse, norm')
print('    Determinant of matrix A: ', np.linalg.det(A))
print('    Trace of matrix A:', np.trace(A))
print('    Inverse of matrix A: ', np.linalg.inv(A))
print('    Norm of vector v and of matrix A:', np.linalg.norm(v), np.linalg.norm(A))

* Determinant, trace, inverse, norm
Determinant of matrix A:  1.0
Trace of matrix A: 3
Inverse of matrix A:  [[ 1.  4. -3.]
[ 0.  1.  0.]
[-0. -2.  1.]]
Norm of vector v and of matrix A: 2.6457513110645907 4.47213595499958

In [17]:
# eigenthings

print('* Eigenvalues/Eigenvectors')
print('    Eigenvalues and eigenvectors of matrix A:', np.linalg.eig(A))

* Eigenvalues/Eigenvectors
Eigenvalues and eigenvectors of matrix A: (array([1., 1., 1.]), array([[ 1.00000000e+00, -1.00000000e+00,  1.00000000e+00],
[ 0.00000000e+00,  0.00000000e+00,  8.21730110e-33],
[ 0.00000000e+00,  7.40148683e-17, -7.40148683e-17]]))

In [18]:
# SVD

print('* Single Value Decomposition')
print('    SVD of matrix A:', np.linalg.svd(A))

* Single Value Decomposition
SVD of matrix A: (array([[ 0.86349489,  0.47752005, -0.16233045],
[ 0.15573818, -0.55857386, -0.81470293],
[ 0.47971053, -0.67821077,  0.55669378]]), array([4.27194722, 1.3109472 , 0.17856195]), array([[ 0.20213145,  0.66530528,  0.71868753],
[ 0.36425575, -0.73226084,  0.57542317],
[-0.9090988 , -0.14547494,  0.39035422]]))

In [19]:
# System of linear equations

A = np.array([[1, 2], [3, 1]])
b = np.array([0, 1])
print('* Linear systems')
print('    Solving linear system Ax = b:', np.linalg.solve(A, b))
print('    Least Square solution for Ax = b: ', np.linalg.lstsq(A, b))

* Linear systems
Solving linear system Ax = b: [ 0.4 -0.2]
Least Square solution for Ax = b:  (array([ 0.4, -0.2]), array([], dtype=float64), 2, array([3.61803399, 1.38196601]))

/Users/martina/Desktop/Mallzee/repos/plantation/venv/lib/python3.7/site-packages/ipykernel_launcher.py:7: FutureWarning: rcond parameter will change to the default of machine precision times max(M, N) where M and N are the input matrix dimensions.
To use the future default and silence this warning we advise to pass rcond=None, to keep using the old, explicitly pass rcond=-1.
import sys


### Polynomials¶

In [20]:
print('y = x^2 -2x + 1')
y = np.poly1d([1, -2, 1])
print('* Order of polynomial is ', y.order)
print('* y(2) = ', y(2))
print('* Roots of polynomial are ', y.roots)

y = x^2 -2x + 1
* Order of polynomial is  2
* y(2) =  1
* Roots of polynomial are  [1. 1.]


### Polynomial fit to points¶

In [21]:
# Setting x's to be 50 linearly spaced points in [0, 1]
x = np.linspace(0, 1, num=20)

# Setting y to be a noised cosine
y = np.cos(x) + 0.3*np.random.rand(20)

# Fitting points (x, y) to a polynomial of deg 3
fit_coeff = np.polyfit(x, y, 3)

# Build the polynomial with the fitting coefficients
p = np.poly1d(fit_coeff)

# Considering another interval of x's and plotting original points and fitting curve
x2 = np.linspace(0, 1, 200)
plt.plot(x, y, 'o', x2, p(x2), '-')
plt.xlabel('$x$')
plt.ylabel('$y$')
plt.title('A noisy cosine')
plt.show();


In [2]:
# Load data from text file (including csv) into Numpy matrix
delimiter=' ',
skiprows=1,                                                 # skip the first 1 lines (header)
usecols=(1,2)                                             # select cols 1 (x) and 2 (y)
)

# If there are missing values, run this instead
# data = np.genfromtxt(data_folder + filename,
#                      delimiter=delimiter,
#                      skip_header=1,                                       # skip the first 1 lines (header)
#                      usecols=features,                                    # select which features to use
#                      missing='?'                                          # what missing values are mapped to
#                     )


### Playing with NaNs¶

In [9]:
print(np.log(0), np.log(-1))
print(np.isnan([1.,np.log(-1), 2]))

-inf nan
[False  True False]

/usr/local/lib/python3.6/site-packages/ipykernel/__main__.py:1: RuntimeWarning: divide by zero encountered in log
if __name__ == '__main__':
/usr/local/lib/python3.6/site-packages/ipykernel/__main__.py:1: RuntimeWarning: invalid value encountered in log
if __name__ == '__main__':
/usr/local/lib/python3.6/site-packages/ipykernel/__main__.py:2: RuntimeWarning: invalid value encountered in log
from ipykernel import kernelapp as app


### IO Numpy data format¶

In [10]:
data = np.ones((3, 3))

np.save('mat.npy', data)


## Scipy¶

### Optimize: finding min of function¶

In [23]:
# function definition and plot of it
def f(x):
return x**2 + 10*np.sin(x)

x = np.arange(-10, 10, 0.1)
plt.plot(x, f(x))
plt.show();

# Gradient Descent through BFGS (finds local min)
print('BFGS')
optimize.fmin_bfgs(f, 0)

# Global optimizer though bashinopping (local optimizer + random sampling of starting points)
print('BASINHOPPING')
optimize.basinhopping(f, 0)

# NOTE: there are several other optimizers available

Out[23]:
[<matplotlib.lines.Line2D at 0x10af69048>]
BFGS
Optimization terminated successfully.
Current function value: -7.945823
Iterations: 5
Function evaluations: 18

Out[23]:
array([-1.30644012])
BASINHOPPING

Out[23]:
                        fun: -7.9458233756152845
lowest_optimization_result:       fun: -7.9458233756152845
hess_inv: array([[ 0.08581276]])
jac: array([  1.19209290e-07])
message: 'Optimization terminated successfully.'
nfev: 15
nit: 3
njev: 5
status: 0
success: True
x: array([-1.30644001])
message: ['requested number of basinhopping iterations completed successfully']
minimization_failures: 0
nfev: 1524
nit: 100
njev: 508
x: array([-1.30644001])

### Optimize: finding roots of functions¶

In [24]:
roots = []

#f = x**2 + 10*np.sin(x) is the function defined above
root = optimize.fsolve(f, 1)      # 1 is the initial guess
print(root)
roots.append(root)

# from plot, function has another root, so let us start from -2.5
root = optimize.fsolve(f, -2.5)      # 1 is the initial guess
print(root)
roots.append(root)

[ 0.]
[-2.47948183]


### Optimize: fit curve knowing the functional form, fitting the params¶

In [28]:
x = np.linspace(-10, 10, num=20)
y = f(x) + np.random.randn(x.size)

def f_to_fit(x, a, b):
return a*x**2 + b*np.sin(x)

guess = [2, 2]
params, params_covariance = optimize.curve_fit(f_to_fit, x, y, guess)

# Plotting all stuff together
f_line, = plt.plot(x, f(x), label='original function')
fitted_points, = plt.plot(x, f_to_fit(x, params[0], params[1]), 'o', label='points of fitted curve')
original, = plt.plot(roots, f(np.array(roots)), 'x', label='roots')
legend = plt.legend(handler_map={f_line: HandlerLine2D(numpoints=2)}, loc=2)
plt.show();


### The stats module: histograms and PDFs¶

In [29]:
# Build the bins separators, width 1, from -10 to 10
bins_sep = np.arange(-9, 10)

# Extract 1000 normally distributed points
normal_points = np.random.normal(size=1000)

# Histogram the points and compute the normal PDF over bins
histogram = np.histogram(normal_points, bins=bins_sep, normed=True)[0]
bins = 0.5*(bins_sep[1:] + bins_sep[:-1])    # to get the bins centers

# Normal PDF over specified bins
normal_pdf = stats.norm.pdf(bins)

# Plot both things
hist_line, = plt.plot(bins, histogram, 'o', label='hist norm points')
pdf_line, = plt.plot(bins, normal_pdf, label='Normal PDF')
legend = plt.legend(handler_map={hist_line: HandlerLine2D(numpoints=2)}, loc=2)
plt.show();


### Stats module: quickly compute percentiles¶

In [30]:
# 50 percentile of previously defined normal points

print(stats.scoreatpercentile(normal_points, 50))

-0.0508135291078


### Interpolate module: linear and cubic interpolation between points¶

In [35]:
# Defining noisy sine wave
t = np.linspace(0, 1, 10)
middle_t = np.linspace(0, 1, 20)
noise = (np.random.random(10)*2 - 1) * 1e-1
noisy_sin = np.sin(2 * np.pi * t) + noise

# Defining linear interpolation between points
linear_int = interp1d(t, noisy_sin)(middle_t)

# Defining cubic interpolation between points
cubic_int = interp1d(t, noisy_sin, kind='cubic')(middle_t)

# Plot all
noisy_sin_line, = plt.plot(t, noisy_sin, label='noisy sine wave')
noisy_sin_points, = plt.plot(t, noisy_sin, 'o', color='blue')
linear_int_points, = plt.plot(middle_t, linear_int, 'x', label='lin int')
cubic_int_points, = plt.plot(middle_t, cubic_int, 'x', label='cub int')
legend = plt.legend(handler_map={hist_line: HandlerLine2D(numpoints=2)}, loc=3)
plt.show();


### integrate module: compute definite integral of function¶

In [38]:
res, err = quad(np.sin, 0, np.pi/2)
print(res, err)

# NOTE: there are other integration methods

0.9999999999999999 1.1102230246251564e-14


### integrate module: solve ODE¶

In [41]:
# Defining dy/dt, iteration_count is the counter for the iteration till convergence
def y_dot(y, t, iteration_count):
iteration_count += 1
return -2 * y

# Defining t, the vector of iteration count, then solve dy/dt = -2y
t = np.linspace(0, 10, 100)
iteration_count = np.zeros((1,), dtype=np.uint16)
y_sol, infodict = odeint(y_dot, 1, t, args=(iteration_count,), full_output=True)

# Plot solution y(t)
plt.plot(t, y_sol)
plt.plot()
plt.show();

# NOTE: odeint solves a system of ODEs as well
# NOTE: there is no partial diff eq solver in Scipy


### signal module: remove linear trend from noisy signal¶

In [45]:
x = t + np.random.normal(size=100)
plt.plot(t, x, label='original')
plt.plot(t, detrend(x), label='detrended')
legend = plt.legend(handler_map={hist_line: HandlerLine2D(numpoints=2)}, loc=2)
plt.show();


### Mathematical distances¶

In [47]:
# Define two 1D arrays and two matrices

u = np.array([1, 1, 2])
v = np.array([1, 3, 1])
A = np.array([[1, 2, 0], [2,3,4]])
B = np.array([[0, 1, 1], [2, 0, 1]])

In [50]:
print('* Distances between the two vectors')
print('    Euclidean:', distance.euclidean(u, v))
print('    Cosine:', distance.cosine(u, v))
print('    Manhattan:', distance.cityblock(u, v))
print('    Minkowski with p=3:', distance.minkowski(u, v, 3))
print('    Chebyshev:', distance.chebyshev(u, v))
print('    Hamming:', distance.hamming(u, v))

* Distances between the two vectors
Euclidean: 2.2360679775
Cosine: 0.261451054124
Manhattan: 3
Minkowski with p=3: 2.08008382305
Chebyshev: 2
Hamming: 0.666666666667

In [51]:
print('* Distances between each pair of rows in matrices')
print(distance.cdist(A, B, metric='euclidean'))
# cdist is built as (numbers are row indices)
# A0B0 A0B1 A0B2
# A1B0 A1B1 A1B2

* Distances between each pair of rows in matrices
[[ 1.73205081  2.44948974]
[ 4.12310563  4.24264069]]

In [53]:
print('* Pairwise distances between rows in matrix')
print(distance.pdist(np.array([[0, 0], [1, 1], [1, 2]]), metric='euclidean'))
# pdist is built as (numbers are row indices)
# A0A1 A0A2 A1A2           # len will be N*(N-1) where N is the number of rows

* Pairwise distances between rows in matrix
[ 1.41421356  2.23606798  1.        ]

In [ ]: