#!/usr/bin/env python # coding: utf-8 # # Numpy - multidimensional data arrays # # Based on lectures from http://github.com/jrjohansson/scientific-python-lectures # # The `numpy` package (module) is used in almost all numerical computation using Python. It is a package that provide high-performance vector, matrix and higher-dimensional data structures for Python. It is implemented in C and Fortran so when calculations are vectorized (formulated with vectors and matrices), performance is very good. # In[1]: from numpy import * # ## Creating `numpy` arrays # There are a number of ways to initialize new numpy arrays, for example from # # * a Python list or tuples # * using functions that are dedicated to generating numpy arrays, such as `arange`, `linspace`, `zeros`, `ones`, etc. # * reading data from files # In[2]: M = array([[1,2],[3,4]]) # from list M.shape, M.size, M.dtype # In[3]: array(((1,2),(3,5.0+0j))) # In[4]: M = array([[1,2],[3,4]], dtype=complex) # from list print(M) M.shape, M.size, M.dtype # In[5]: arange(1,10,0.1) # In[6]: array(range(10,100,1))/10. # In[7]: linspace(1,10,91) # In[8]: logspace(-3,2,20) # In[9]: x = arange(0,10,0.5) # linear mesh start:stop:increment print(x) y = linspace(0,10,21) # linear mesh start,stop,number of points print(y) z = logspace(-3,2,10) # log mesh, 10^start, 10^stop, number of points print(z) # In[10]: Z=zeros((3,3,2),dtype=complex) Z.shape, Z.size, Z.dtype # In[11]: Z=ones((3,3),dtype=float)*1j Z.shape, Z.size, Z.dtype # In[12]: Z=ones((3,3)) # Z *= 1j # issues an error, but would not work Z = Z*1j Z.shape, Z.size, Z.dtype # ### Random number generators # In[13]: from numpy import random print( random.rand() ) # uniformly distributed random number between [0,1] print( random.rand(5,5) ) # uniform distributed (5x5) matrix print( random.randn(5,5) ) # standard normal distribution random matrix # ## File I/O # # Very common form is comma-separated values (CSV) or tab-separated values (TSV). To read data from such files into Numpy arrays we can use the `numpy.loadtxt` or `numpy.genfromtxt` # # File `stockholm_td_adj.dat.txt` contains Stockholm temperature over the years. The columns are [$year$,$month$,$day$,$T_{average}$,$T_{min}$,$T_{max}$] # In[14]: ## Check if file exists get_ipython().system('tail stockholm_td_adj.dat.txt') # In[15]: data = loadtxt('stockholm_td_adj.dat.txt') data.shape # In[16]: # inline figures from matplotlib get_ipython().run_line_magic('matplotlib', 'inline') import matplotlib.pyplot as plt # In[17]: import pylab as plt # In[18]: # time in years when we have year/month/day t = data[:,0]+data[:,1]/12.+data[:,2]/365 # In[19]: t[:10] # In[20]: plt.plot(t, data[:,3]) # In[21]: # a bit more extended in x-direction plt.figure(figsize=(14,4)) plt.plot(t, data[:,3]) plt.title('temperature in Stockholm') plt.xlabel('year') plt.ylabel('temperature $[\degree C]$'); # Lets save the data in the form [t,$T_{average}$] # In[22]: a=[1,2,3] b=[4,5,6] vstack((a,b)) # In[23]: vstack((t,data[:,3])).T.shape # In[24]: data.shape # In[25]: savetxt('StockholmT.dat', vstack((t,data[:,3])).T) # In[26]: get_ipython().system('tail StockholmT.dat') # ### More efficient binary storage of data to the disc # In[27]: save('ST_data',data) get_ipython().system('ls -ltr') # In[28]: data2=load('ST_data.npy') # In[29]: allclose(data,data2) # how close are the two sets of data # In[30]: max(abs(data-data2).ravel()) # In[31]: amax(abs(data-data2)) # ## Manipulating data # # ### Indexing and slicing # # data[lower:upper:step, lower:upper:step] # In[32]: print(data[:,0]) # display year for all datapoints array(data[::365,0],dtype=int) # the years with 365 spacings, and then last years # **Fancy indexing** # Index is itself an array of integer numbers, i.e, which rows or columns? # # In[33]: x=arange(100) print(x[:10:2]) print(x[ [0,2,4,6,8] ]) # fancy indexing. Does not exists in C++ # In[34]: data[[0,365,2*365,3*365,4*365,5*365+1],0] # fancy indexing for multidimensional array # In[35]: data[range(0,365*20,365),0] # In[36]: data[range(3,3+365*10,365),0] # **Using mask to pick data*** # # In addition to fancy indexing, Python allows yet another way to pick the data. # Create a mask of `[True,....False....]` values, and pick from the array only columns/rows where `True`. # # How to compute average temperature in the year of 1973? # In[37]: data[:,0] >= 1973 data[:,0] < 1974 # In[38]: # Create mask for the year 1973 mask = logical_and(data[:,0] >= 1973, data[:,0] < 1974) # In[39]: mask # In[40]: data[mask,0] # All should have 1973 # In[41]: T1973 = data[mask,3] print('Average temperature in 1973=', sum(T1973)/len(T1973)) # In[42]: where(mask) # In[43]: # where tells you the index where True indices = where(mask) X1973 = data[indices,3]; # This gives similar data in 1973, but not identical # In[44]: print(T1973.shape, X1973.shape, X1973[0].shape) print('Average temperature in 1973=', sum(X1973[0,:])/len(X1973[0,:])) print('Min/Max temperature in 1973=', min(X1973[0,:]),'/',max(X1973[0,:])) # **What is the mean monthly temperatures for each month of the year?** # Let's do Ferbrurary first # In[45]: Febr=data[:,1]==2 mean(data[Febr,3]) # Now loop for all months # In[46]: monthly_mean=[mean(data[data[:,1]==month,3]) for month in range(1,13)] # In[47]: print(monthly_mean) # In[48]: plt.bar(range(1,13),monthly_mean); plt.xlabel('month') plt.ylabel('average temperature') # ## Linear Algebra # # It is implemented in low level fortran/C code, and is much more efficient than code written in Python. # In[49]: A = random.rand(3,3) print(A * A) # It is not matrix-matrix product, but element-wise product print() print(multiply(A,A)) print() print(A @ A) # It is a matrix product print() print(matmul(A,A)) # Matrix product or matrix-vector product can be performed by `dot` command # In[50]: print(dot(A,A)) print(matmul(A,A)) # dot == A[i,j,:] * B[l,:,n] # In[51]: v1 = random.rand(3) print(v1) print( dot(A,v1) ) # matrix.vector product print( dot(v1,v1) ) # length of vector^2 # In[52]: print(A@v1) print(dot(A,v1)) # In[53]: w=A*v1 sum(w,axis=1) # Slightly less efficient, but nicer code can be obtained by `matrix` clas # In[54]: M = matrix(A) v = matrix(v1).T # create a column vector # In[55]: M*M # this is now matrix-matrix product # In[56]: M.T # In[57]: M*v # this is matrix*vector product # In[58]: v.T * M # vector*matrix product # In[59]: v.T * v # inner-product # **Array/Matrix transformations** # # * `.T` or `transpose(M)` transposes matrix # * `.H` hermitian conjugate # * `conjugate(M)` conjugates # * `real(M)` and `imag(M)` takes real and imaginary part of the matrix # In[60]: conjugate(A.T) # ### More advanced linear algebra operations # # Library `linalg`: # # * `linalg.det(A)` # * `linalg.inv(A)` or just `M.I` # * `linalg.eig`, `linalg.eigvals`, `linalg.eigh` # * `linalg.svd()` # * `linalg.solve()` # * `linalg.cholesky()` # In[61]: from numpy import linalg # In[62]: print( linalg.det(A) ) linalg.inv(A) # In[63]: M.I # The eigenvalue problem for a matrix $A$: # # $\displaystyle A v_n = \lambda_n v_n$ # # where $v_n$ is the $n$th eigenvector and $\lambda_n$ is the $n$th eigenvalue. # # To calculate eigenvalues of a matrix, use the `eigvals` (symmetric/hermitian `eigvalsh`) and for calculating both eigenvalues and eigenvectors, use the function `eig` (or `eigh`): # In[64]: linalg.eigvals(A) # In[65]: linalg.eigvalsh(A) # In[66]: A = array([[1,2,3], [4,5,6], [7,8,9]]) b = array([1,2,3]) x = linalg.solve(A,b) # A*x==b print(x) dot(A,x)-b # ### sum, cumsum, trace, diag # In[67]: v1 # In[68]: print( sum(v1) ) print( cumsum(v1) ) print( trace(A) ) print( diag(A) ) print( sum(diag(A)) ) # ## Reshaping, resizing, and stacking arrays # In[69]: print(A.shape) Ag = reshape(A, (9,1)) # this is not new data print(Ag.shape) # In[70]: Ag[0]=10 A # we change A when we change Ag # In[71]: Ax = A.flatten() # flatten creates 1D array of all data, but creates a copy Ax[8]=100 # changing a copy print(A) # In[72]: Ay=A.ravel() Ay[8]=100 print(A) # ## Vectorizing functions # Every function written in Python is very slow. However numpy type operations are fast, because they are written in fortran/C # In[73]: Temp = data[:,3] Temp**2 # this is fast, written in C # In[74]: array([Temp[i]**2 for i in range(len(Temp))]) # This is slow, written in Python # What if we have a function that can not simply work on arrays? # # For example, theta function? # In[75]: def Theta(x): if x>=0: return 1 else: return 0 # In[76]: # Does not work on array Theta(Temp) # We can vectorize Theta, to make it applicable to arrays. # # This is simply achieved by call to numpy function `vectorize`, which will create low-level routine from your function # In[77]: Theta_vec = vectorize(Theta) # In[78]: # This is very fast now, and creates 0 or ones positive_temperatures=Theta_vec(Temp) positive_temperatures # **How to calculate number of days in a year with positive temperatures?** # In[79]: # Boolean array to select data with positive temperatures positives = array(positive_temperatures, dtype=bool) # keeps data with positive temperatures only kept = data[positives,0] # now we just need to check how many of these data are in each year # In[80]: years = list(range(1800,2013,1)) hist = histogram(kept, bins=years) # In[81]: plt.plot(hist[1][:-1], hist[0]); # In[82]: negative = array(1-Theta_vec(Temp), dtype=bool) kept2 = data[negative,0] hist2 = histogram(kept2, bins=years) plt.plot(hist2[1][:-1], hist[0]); # In[ ]: