#!/usr/bin/env python
# coding: utf-8

# # Numpy - multidimensional data arrays
# 
# Based on lectures from http://github.com/jrjohansson/scientific-python-lectures
# 

# The `numpy` package (module) is used in almost all numerical computation using Python. It is a package that provide high-performance vector, matrix and higher-dimensional data structures for Python. It is implemented in C and Fortran so when calculations are vectorized (formulated with vectors and matrices), performance is very good.

# In[1]:


from numpy import *


# ## Creating `numpy` arrays

# There are a number of ways to initialize new numpy arrays, for example from
# 
# * a Python list or tuples
# * using functions that are dedicated to generating numpy arrays, such as `arange`, `linspace`, `zeros`, `ones`, etc.
# * reading data from files

# In[2]:


M = array([[1,2],[3,4]])  # from list
M.shape, M.size, M.dtype


# In[3]:


array(((1,2),(3,5.0+0j)))


# In[4]:


M = array([[1,2],[3,4]], dtype=complex)  # from list
print(M)
M.shape, M.size, M.dtype


# In[5]:


arange(1,10,0.1)


# In[6]:


array(range(10,100,1))/10.


# In[7]:


linspace(1,10,91)


# In[8]:


logspace(-3,2,20)


# In[9]:


x = arange(0,10,0.5)  # linear mesh start:stop:increment
print(x)
y = linspace(0,10,21) # linear mesh start,stop,number of points
print(y)
z = logspace(-3,2,10) # log mesh, 10^start, 10^stop, number of points
print(z)


# In[10]:


Z=zeros((3,3,2),dtype=complex)

Z.shape, Z.size, Z.dtype


# In[11]:


Z=ones((3,3),dtype=float)*1j

Z.shape, Z.size, Z.dtype


# In[12]:


Z=ones((3,3))
# Z *= 1j   # issues an error, but would not work
Z = Z*1j
Z.shape, Z.size, Z.dtype


# ### Random number generators

# In[13]:


from numpy import random

print( random.rand() )  # uniformly distributed random number between [0,1]

print( random.rand(5,5) ) # uniform distributed (5x5) matrix


print( random.randn(5,5) ) # standard normal distribution random matrix


# ## File I/O
# 
# Very common form is comma-separated values (CSV) or tab-separated values (TSV). To read data from such files into Numpy arrays we can use the `numpy.loadtxt` or `numpy.genfromtxt`
# 
# File `stockholm_td_adj.dat.txt` contains Stockholm temperature over the years. The columns are [$year$,$month$,$day$,$T_{average}$,$T_{min}$,$T_{max}$]

# In[14]:


## Check if file exists
get_ipython().system('tail stockholm_td_adj.dat.txt')


# In[15]:


data = loadtxt('stockholm_td_adj.dat.txt')
data.shape


# In[16]:


# inline figures from matplotlib
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt


# In[17]:


import pylab as plt


# In[18]:


# time in years when we have year/month/day
t = data[:,0]+data[:,1]/12.+data[:,2]/365


# In[19]:


t[:10]


# In[20]:


plt.plot(t, data[:,3])


# In[21]:


# a bit more extended in x-direction
plt.figure(figsize=(14,4))
plt.plot(t, data[:,3])
plt.title('temperature in Stockholm')
plt.xlabel('year')
plt.ylabel('temperature $[\degree C]$');


# Lets save the data in the form [t,$T_{average}$]

# In[22]:


a=[1,2,3]
b=[4,5,6]
vstack((a,b))


# In[23]:


vstack((t,data[:,3])).T.shape


# In[24]:


data.shape


# In[25]:


savetxt('StockholmT.dat', vstack((t,data[:,3])).T)


# In[26]:


get_ipython().system('tail StockholmT.dat')


# ### More efficient binary storage of data to the disc

# In[27]:


save('ST_data',data)
get_ipython().system('ls -ltr')


# In[28]:


data2=load('ST_data.npy')


# In[29]:


allclose(data,data2)  # how close are the two sets of data


# In[30]:


max(abs(data-data2).ravel())


# In[31]:


amax(abs(data-data2))


# ## Manipulating data
# 
# ### Indexing and slicing
# 
# data[lower:upper:step, lower:upper:step]

# In[32]:


print(data[:,0])  # display year for all datapoints
array(data[::365,0],dtype=int) # the years with 365 spacings, and then last years


# **Fancy indexing**
# Index is itself an array of integer numbers, i.e, which rows or columns?
# 

# In[33]:


x=arange(100)
print(x[:10:2])
print(x[ [0,2,4,6,8] ])  # fancy indexing. Does not exists in C++


# In[34]:


data[[0,365,2*365,3*365,4*365,5*365+1],0] # fancy indexing for multidimensional array


# In[35]:


data[range(0,365*20,365),0]


# In[36]:


data[range(3,3+365*10,365),0]


# **Using mask to pick data***
# 
# In addition to fancy indexing, Python allows yet another way to pick the data.
# Create a mask of `[True,....False....]` values, and pick from the array only columns/rows where `True`. 
# 
# How to compute average temperature in the year of 1973?

# In[37]:


data[:,0] >= 1973
data[:,0] < 1974


# In[38]:


# Create mask for the year 1973
mask = logical_and(data[:,0] >= 1973, data[:,0] < 1974)


# In[39]:


mask


# In[40]:


data[mask,0]  # All should have 1973


# In[41]:


T1973 = data[mask,3]
print('Average temperature in 1973=', sum(T1973)/len(T1973))


# In[42]:


where(mask)


# In[43]:


# where tells you the index where True
indices = where(mask)
X1973 = data[indices,3]; # This gives similar data in 1973, but not identical


# In[44]:


print(T1973.shape, X1973.shape, X1973[0].shape)
print('Average temperature in 1973=', sum(X1973[0,:])/len(X1973[0,:]))
print('Min/Max temperature in 1973=', min(X1973[0,:]),'/',max(X1973[0,:]))


# **What is the mean monthly temperatures for each month of the year?**

# Let's do Ferbrurary first

# In[45]:


Febr=data[:,1]==2
mean(data[Febr,3])


# Now loop for all months

# In[46]:


monthly_mean=[mean(data[data[:,1]==month,3]) for month in range(1,13)]


# In[47]:


print(monthly_mean)


# In[48]:


plt.bar(range(1,13),monthly_mean);
plt.xlabel('month')
plt.ylabel('average temperature')


# ## Linear Algebra
# 
# It is implemented in low level fortran/C code, and is much more efficient than code written in Python.

# In[49]:


A = random.rand(3,3)


print(A * A)  # It is not matrix-matrix product, but element-wise product
print()
print(multiply(A,A))
print()
print(A @ A)  # It is a matrix product
print()
print(matmul(A,A))


# Matrix product or matrix-vector product can be performed by `dot` command

# In[50]:


print(dot(A,A))
print(matmul(A,A))
# dot == A[i,j,:] * B[l,:,n] 


# In[51]:


v1 = random.rand(3)
print(v1)
print( dot(A,v1) ) # matrix.vector product
print( dot(v1,v1) ) # length of vector^2


# In[52]:


print(A@v1)
print(dot(A,v1))


# In[53]:


w=A*v1
sum(w,axis=1)


# Slightly less efficient, but nicer code can be obtained by `matrix` clas

# In[54]:


M = matrix(A)
v = matrix(v1).T # create a column vector


# In[55]:


M*M  # this is now matrix-matrix product


# In[56]:


M.T


# In[57]:


M*v # this is matrix*vector product


# In[58]:


v.T * M # vector*matrix product


# In[59]:


v.T * v # inner-product


# **Array/Matrix transformations**
# 
# * `.T` or  `transpose(M)` transposes matrix
# * `.H` hermitian conjugate
# * `conjugate(M)` conjugates
# * `real(M)` and `imag(M)` takes real and imaginary part of the matrix

# In[60]:


conjugate(A.T)


# ### More advanced linear algebra operations
# 
# Library `linalg`:
# 
# * `linalg.det(A)`
# * `linalg.inv(A)` or just `M.I`
# * `linalg.eig`, `linalg.eigvals`, `linalg.eigh`
# * `linalg.svd()`
# * `linalg.solve()`
# * `linalg.cholesky()`

# In[61]:


from numpy import linalg


# In[62]:


print( linalg.det(A) )
linalg.inv(A)


# In[63]:


M.I


# The eigenvalue problem for a matrix $A$:
# 
# $\displaystyle A v_n = \lambda_n v_n$
# 
# where $v_n$ is the $n$th eigenvector and $\lambda_n$ is the $n$th eigenvalue.
# 
# To calculate eigenvalues of a matrix, use the `eigvals` (symmetric/hermitian `eigvalsh`) and for calculating both eigenvalues and eigenvectors, use the function `eig` (or `eigh`):

# In[64]:


linalg.eigvals(A)


# In[65]:


linalg.eigvalsh(A)


# In[66]:


A = array([[1,2,3], [4,5,6], [7,8,9]])
b = array([1,2,3])
x = linalg.solve(A,b) # A*x==b
print(x)
dot(A,x)-b


# ### sum, cumsum, trace, diag

# In[67]:


v1


# In[68]:


print( sum(v1) )
print( cumsum(v1) )
print( trace(A) )
print( diag(A) )
print( sum(diag(A)) )


# ## Reshaping, resizing, and stacking arrays

# In[69]:


print(A.shape)
Ag = reshape(A, (9,1))  # this is not new data
print(Ag.shape)


# In[70]:


Ag[0]=10
A        # we change A when we change Ag


# In[71]:


Ax = A.flatten()  # flatten creates 1D array of all data, but creates a copy
Ax[8]=100         # changing a copy
print(A)


# In[72]:


Ay=A.ravel()
Ay[8]=100
print(A)


# ## Vectorizing functions

# Every function written in Python is very slow. However numpy type operations are fast, because they are written in fortran/C

# In[73]:


Temp = data[:,3]

Temp**2  #  this is fast, written in C


# In[74]:


array([Temp[i]**2 for i in range(len(Temp))])  # This is slow, written in Python


# What if we have a function that can not simply work on arrays?
# 
# For example, theta function?

# In[75]:


def Theta(x):
    if x>=0:
        return 1
    else:
        return 0


# In[76]:


# Does not work on array
Theta(Temp)


# We can vectorize Theta, to make it applicable to arrays. 
# 
# This is simply achieved by call to numpy function `vectorize`, which will create low-level routine from your function

# In[77]:


Theta_vec = vectorize(Theta)


# In[78]:


# This is very fast now, and creates 0 or ones
positive_temperatures=Theta_vec(Temp)
positive_temperatures


# **How to calculate number of days in a year with positive temperatures?**

# In[79]:


# Boolean array to select data with positive temperatures
positives = array(positive_temperatures, dtype=bool)
# keeps data with positive temperatures only
kept = data[positives,0]
# now we just need to check how many of these data are in each year


# In[80]:


years = list(range(1800,2013,1))
hist = histogram(kept, bins=years)


# In[81]:


plt.plot(hist[1][:-1], hist[0]);


# In[82]:


negative = array(1-Theta_vec(Temp), dtype=bool)
kept2 = data[negative,0]
hist2 = histogram(kept2, bins=years)
plt.plot(hist2[1][:-1], hist[0]);


# In[ ]: