#!/usr/bin/env python
# coding: utf-8

# PCA, Nearest-Neighbors Classification and Clustering
# ====================================================
#
# *Important:* Please read the [installation page](http://gpeyre.github.io/numerical-tours/installation_python/) for details about how to install the toolboxes.

# $\newcommand{\dotp}[2]{\langle #1, #2 \rangle}$
# $\newcommand{\enscond}[2]{\lbrace #1, #2 \rbrace}$
# $\newcommand{\pd}[2]{ \frac{ \partial #1}{\partial #2} }$
# $\newcommand{\umin}[1]{\underset{#1}{\min}\;}$
# $\newcommand{\umax}[1]{\underset{#1}{\max}\;}$
# $\newcommand{\uargmin}[1]{\underset{#1}{argmin}\;}$
# $\newcommand{\norm}[1]{\|#1\|}$
# $\newcommand{\abs}[1]{\left|#1\right|}$
# $\newcommand{\choice}[1]{ \left\{ \begin{array}{l} #1 \end{array} \right. }$
# $\newcommand{\pa}[1]{\left(#1\right)}$
# $\newcommand{\diag}[1]{{diag}\left( #1 \right)}$
# $\newcommand{\qandq}{\quad\text{and}\quad}$
# $\newcommand{\qwhereq}{\quad\text{where}\quad}$
# $\newcommand{\qifq}{ \quad \text{if} \quad }$
# $\newcommand{\qarrq}{ \quad \Longrightarrow \quad }$
# $\newcommand{\ZZ}{\mathbb{Z}}$
# $\newcommand{\CC}{\mathbb{C}}$
# $\newcommand{\RR}{\mathbb{R}}$
# $\newcommand{\EE}{\mathbb{E}}$
# $\newcommand{\Zz}{\mathcal{Z}}$
# $\newcommand{\Ww}{\mathcal{W}}$
# $\newcommand{\Vv}{\mathcal{V}}$
# $\newcommand{\Nn}{\mathcal{N}}$
# $\newcommand{\NN}{\mathcal{N}}$
# $\newcommand{\Hh}{\mathcal{H}}$
# $\newcommand{\Bb}{\mathcal{B}}$
# $\newcommand{\Ee}{\mathcal{E}}$
# $\newcommand{\Cc}{\mathcal{C}}$
# $\newcommand{\Gg}{\mathcal{G}}$
# $\newcommand{\Ss}{\mathcal{S}}$
# $\newcommand{\Pp}{\mathcal{P}}$
# $\newcommand{\Ff}{\mathcal{F}}$
# $\newcommand{\Xx}{\mathcal{X}}$
# $\newcommand{\Mm}{\mathcal{M}}$
# $\newcommand{\Ii}{\mathcal{I}}$
# $\newcommand{\Dd}{\mathcal{D}}$
# $\newcommand{\Ll}{\mathcal{L}}$
# $\newcommand{\Tt}{\mathcal{T}}$
# $\newcommand{\si}{\sigma}$
# $\newcommand{\al}{\alpha}$
# $\newcommand{\la}{\lambda}$
# $\newcommand{\ga}{\gamma}$
# $\newcommand{\Ga}{\Gamma}$
# $\newcommand{\La}{\Lambda}$
# $\newcommand{\Si}{\Sigma}$
# $\newcommand{\be}{\beta}$
# $\newcommand{\de}{\delta}$
# $\newcommand{\De}{\Delta}$
# $\newcommand{\phi}{\varphi}$
# $\newcommand{\th}{\theta}$
# $\newcommand{\om}{\omega}$
# $\newcommand{\Om}{\Omega}$
# $\newcommand{\eqdef}{\equiv}$

# This tour details [Principal Component Analysis](https://en.wikipedia.org/wiki/Principal_component_analysis) (dimensionality reduction), supervised classification using nearest neighbors,
# and unsupervised [clustering](https://en.wikipedia.org/wiki/Unsupervised_learning) using [$k$-means](https://en.wikipedia.org/wiki/K-means_clustering).
#
#
# We recommend that after doing this Numerical Tour, you apply it to your
# own data, for instance using a dataset from [LibSVM](https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/).
#
#
# _Disclaimer:_ these machine learning tours are intended to be
# overly-simplistic implementations and applications of baseline machine learning methods.
# For more advanced uses and implementations, we recommend
# using a state-of-the-art library, the most well-known being
# [Scikit-Learn](http://scikit-learn.org/).

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
import matplotlib.pyplot as plt
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
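# If you later want to rerun this tour on your own data, as suggested above, the sketch
# below shows one way to load a file in LibSVM (svmlight) format via scikit-learn. It is
# only illustrative: the helper name and the file path are placeholders of your choosing,
# and it assumes scikit-learn is installed.

def load_libsvm_dataset(path):
    # Load a LibSVM/svmlight formatted file; returns a dense feature matrix X
    # (one sample per row) and a column vector of labels y, matching the
    # conventions used in this tour.
    from sklearn.datasets import load_svmlight_file
    X_raw, y_raw = load_svmlight_file(path)
    return X_raw.toarray(), y_raw.reshape(-1, 1)

# For instance: X, y = load_libsvm_dataset('path/to/your/dataset')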
# Useful functions to convert to a column or row vector, and to find the indices
# of the nonzero entries of a vector.

# In[34]:

def MakeCol(y): return y.reshape(-1,1)
def MakeRow(y): return y.reshape(1,-1)
def find(x): return np.nonzero(x)[0]

# Dataset Loading
# ---------------
# We use here the famous
# [IRIS dataset](https://en.wikipedia.org/wiki/Iris_flower_data_set) of Fisher.
# The data set consists of 50 samples from each of three species of Iris (Iris setosa, Iris virginica and Iris versicolor).
# Four features were measured from each sample: the length and the width of the sepals and petals, in centimetres.

# Load the dataset.

# In[3]:

from scipy import io
name = 'iris'
U = io.loadmat('nt_toolbox/data/ml-' + name)
A = U['A']

# Randomly permute it.

# In[4]:

A = A[np.random.permutation(A.shape[0]),:]

# Separate the features $ (x_i)_{i=1}^n $ from the class information.
# The features are stored as the rows of a matrix $X \in \RR^{n \times p}$.
# Be sure to start the class indexing at 1.

# In[5]:

X = A[:,0:-1]
y = MakeCol( A[:,-1] )
y = y-y.min()+1

# $n$ is the number of samples, $p$ is the dimensionality of the features,
# $k$ is the number of classes.

# In[6]:

[n,p] = X.shape
k = int( y.max() )
print(n,p,k)

# Display the dataset as a matrix of scatter plots: panel $(i,j)$ shows the points
# according to features $i$ and $j$ (one color per class), and the diagonal panels
# show the histogram of each feature.

# In[7]:

pmax = min(p,8)
kdisp = 0
plt.clf()
for i in np.arange(0,pmax):
    for j in np.arange(0,pmax):
        kdisp = kdisp + 1
        plt.subplot(pmax,pmax,kdisp)
        if i==j:
            plt.hist(X[:,i],6)
            plt.axis('tight')
        else:
            for m in np.arange(1,k+1):
                I = np.nonzero(y==m)[0]
                plt.plot(X[I,j],X[I,i], '.')
            plt.axis('tight')
        plt.tick_params(axis='x', labelbottom=False)
        plt.tick_params(axis='y', labelleft=False)

# Dimensionality Reduction and PCA
# --------------------------------
# In order to display the data in 2-D or 3-D, [dimensionality reduction](https://en.wikipedia.org/wiki/Dimensionality_reduction)
# is needed.
# The simplest method is [Principal Component Analysis](https://en.wikipedia.org/wiki/Principal_component_analysis) (PCA),
# which performs an
# orthogonal linear projection on the principal axes (the eigenvectors) of the
# covariance matrix.
#
#
# Compute the [empirical mean](https://en.wikipedia.org/wiki/Sample_mean_and_covariance)
# $$ m = \frac{1}{n} \sum_{i=1}^n x_i \in \RR^p $$
# and [covariance](https://en.wikipedia.org/wiki/Sample_mean_and_covariance)
# $$ C = \frac{1}{n} \sum_{i=1}^n (x_i-m) (x_i-m)^\top \in \RR^{p \times p}. $$
# Denoting $\tilde X = X - 1_n m^\top $, one has $C = \frac{1}{n} \tilde X^\top \tilde X$.
# The code below computes the unnormalized matrix $\tilde X^\top \tilde X$, which has the
# same eigenvectors (the principal directions).

# In[8]:

def Xm(X): return X-X.mean(axis=0)
def Cov(X): return Xm(X).transpose().dot( Xm(X) )

# Display the covariance matrix.

# In[9]:

plt.clf()
plt.imshow(Cov(X));

# Compute the PCA ortho-basis using the
# [SVD decomposition](https://en.wikipedia.org/wiki/Singular_value_decomposition)
# $$ \tilde X = U \, \text{diag}(d) \, V $$
# where $U \in \RR^{n \times p}$ has orthonormal columns and $V \in \RR^{p \times p}$ is
# orthogonal. This matches the convention of `np.linalg.svd`, whose third output is the
# transposed factor: the rows of $V$ are the principal directions, ordered by decreasing
# variance.

# In[10]:

U, D, V = np.linalg.svd(Xm(X), full_matrices=False)

# Compute the features in the PCA basis, $z_i = V (x_i-m)$, stored in matrix
# format as $Z = \tilde X V^\top$.

# In[11]:

Z = Xm(X).dot( V.transpose() )

# Plot the singular values $d$ of the centered data matrix $\tilde X$, which correspond,
# up to a factor $\sqrt{n}$, to the standard deviations of the data
# along the principal directions.

# In[12]:

plt.clf()
plt.plot(D, '.-')
plt.axis('tight');
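# The decay of these singular values indicates how much of the total variance each
# principal direction captures. As a quick check, the small sketch below (reusing the
# singular values `D` computed above; the variable name `explained` is for illustration
# only) computes the cumulative fraction of variance explained by the leading components.

# The variance along direction l is proportional to D[l]**2, so the ratio below
# measures how faithful a projection on the first two directions is.
explained = D**2 / np.sum(D**2)
print('Fraction of variance explained by the first two components:', np.sum(explained[0:2]))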
# The first dimensions of the $z_i$ are the optimal way to linearly
# embed the data in a low-dimensional space.
# This can be used for a 2-D display using the first two dimensions.

# In[13]:

plt.clf()
for i in np.arange(1,k+1):
    I = np.nonzero(y==i)[0]
    plt.plot(Z[I,0], Z[I,1], '.', label=str(i))
plt.axis('tight')
plt.axis('equal')
plt.legend();

# Similar display in 3-D.

# In[14]:

from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = Axes3D(fig)
for i in np.arange(1,k+1):
    I = np.nonzero(y==i)[0]
    ax.scatter(Z[I,0], Z[I,1], Z[I,2], marker='.', label=str(i))
plt.axis('tight')
plt.axis('equal')
plt.legend();

# Supervised Learning: Nearest Neighbor Classification
# ----------------------------------------------------
# Probably the simplest method for supervised classification is [$R$-nearest
# neighbors](https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm) ($R$-NN), where $R$ is the number of neighbors taken into account.
# Increasing $R$ helps to cope with noise and yields a smoother
# decision boundary, and hence better generalization performance.
#
#
# The class predicted for a point $x$ is the one which is the most
# represented among the $R$ points $(x_i)_i$ which are the closest to
# $x$.
#
#
#
# Split the data into a training set and a testing set.

# In[15]:

n0 = round(.5*n)
n1 = n-n0
X0 = X[0:n0,:]
y0 = y[0:n0]
X1 = X[n0:,:]
y1 = y[n0:]

# Helper to compute the Euclidean distances between the rows of a matrix and a single point.

# In[16]:

from sklearn.metrics.pairwise import pairwise_distances
def DistPoint(X,x): return pairwise_distances(X,MakeRow(x))

# Compute the Euclidean distances between some $x$ taken from the test set
# and all the points $x_{0,j}$ of the training set.

# In[17]:

i = 0
x = X1[i,:]  # could be any point
D = DistPoint(X0,x)

# Sort the distances and generate the list of sorted classes $ y_\si = (y_{0,\si(i)})_i $. This
# generates a permutation $\si$ of $\{1,\ldots,n_0\}$ such that
# $$ \norm{x-x_{0,\si(1)}} \leq \norm{x-x_{0,\si(2)}} \leq \ldots \leq \norm{x-x_{0,\si(n_0)}}. $$

# In[18]:

I = np.argsort(D.flatten())
ys = y0[I]

# Function to compute the histogram of the classes appearing in a list.

# In[19]:

def myhist(u):
    q = u.shape[1]
    h = np.zeros((k,q))
    for i in np.arange(0,k):
        for j in np.arange(0,q):
            h[i,j] = sum(u[:,j]==i+1)
    return h

# For a given $R$, one can compute the histogram of the classes appearing among the
# $R$ nearest neighbors,
# $$ h_\ell \eqdef \frac{1}{R} \, \sharp \enscond{ i \in \{1,\ldots,R\} }{ y_{0,\si(i)} = \ell }. $$
# The decision class for $x$ is then the most represented one in this histogram,
# $$ c(x) \eqdef \text{argmax}_{\ell} \; h_\ell . $$

# In[20]:

R = 5
h = myhist(ys[0:R,:]) / R
c = np.argmax(h)+1
print( 'c(x)=' + str(c) + ', true class=' + str(int(y1[i])) )

# Display the histogram $(h_\ell)_\ell$ of the class repartition among the $R$ nearest
# neighbors, as $R$ grows.

# In[21]:

Rlist = ( np.array([.05, .1, .5, 1])*n0 ).round()
plt.clf()
for i in np.arange(0,Rlist.size):
    R = int( Rlist[i] )
    h = myhist(ys[0:R,:]) / R
    plt.subplot(Rlist.size,1,i+1)
    plt.bar(np.arange(1,k+1), h.flatten())

# __Exercise 1__
#
# Perform the $R$-NN classification for all the points in the test set, and for varying $R$.
# Show how the classification score $S$ (number of correctly classified points)
# evolves with $R$.

# In[22]:

get_ipython().run_line_magic('run', '-i nt_solutions/ml_1_pca_nn/exo1')

# In[23]:

## Insert your code here.

# __Exercise 2__
#
# Display, as a function of the position in the 2-D PCA space, the class output by
# the $R$-NN method when applied in 2-D.

# In[24]:

## Insert your code here.
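# Independently of the exercises, a convenient way to sanity-check a nearest-neighbor
# implementation is to compare it with scikit-learn's `KNeighborsClassifier` on the same
# train/test split. The sketch below is only a comparison (it assumes scikit-learn is
# installed and reuses the variables `X0`, `y0`, `X1`, `y1` defined above), not part of
# the tour itself.

from sklearn.neighbors import KNeighborsClassifier

for R in [1, 5, 10, 20]:
    # fit stores the training set; score reports the fraction of correctly
    # classified test points for the R-NN vote
    clf = KNeighborsClassifier(n_neighbors=R).fit(X0, y0.ravel())
    print('R=%d: test accuracy %.3f' % (R, clf.score(X1, y1.ravel())))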
# Unsupervised Learning: $k$-means
# --------------------------------
# In an [unsupervised setting](https://en.wikipedia.org/wiki/Unsupervised_learning), the class information $y$ is not
# available. The basic problem is then to recover class information from
# the knowledge of $x$ only.
# This corresponds to the [clustering problem](https://en.wikipedia.org/wiki/Cluster_analysis).
# The most basic algorithm is [$k$-means](https://en.wikipedia.org/wiki/K-means_clustering), which tries to recover the
# class index $\bar y_i=\ell$ from the distances $ \norm{x_i-c_\ell} $
# between the feature point $x_i$ and the class centroids $c_\ell$,
# which are the unknowns of the problem.
#
#
# It does so by minimizing the following non-convex energy
# $$ \umin{ (c_\ell)_\ell } \sum_i \umin{\ell} \norm{x_i-c_\ell}^2 . $$
#
#
# We first initialize the class centroids $ (c_\ell)_\ell $ at random among the data points.
# They are stored as the rows of a matrix $ C \in \RR^{k \times p} $.

# In[25]:

I = np.random.permutation(n)
I = I[0:k]
C = X[I,:]

# The $k$-means algorithm iterates between two steps. The first step determines the class of
# each point using the distances to the centroids
# $$ \forall i \in \{1,\ldots,n\}, \quad \bar y_i \leftarrow
# \uargmin{\ell} \norm{x_i-c_\ell}. $$

# In[26]:

D = pairwise_distances(X,C)
yb = np.argmin(D, axis=1)

# Display the centroids and the classes using colors.
# This corresponds to a Voronoi diagram segmentation in the high-dimensional
# feature space, but here the display is done in 2-D using the PCA coordinates.

# In[27]:

plt.clf()
for i in np.arange(0,k):
    I = np.nonzero(yb==i)[0]
    plt.plot(Z[I,0], Z[I,1], '.')
CV = (C-np.mean(X,axis=0)).dot( V.transpose() )
for i in np.arange(0,k):
    plt.plot(CV[i,0], CV[i,1], 'ok')
plt.axis('tight')
plt.axis('equal')
plt.axis('off');

# The second step of the $k$-means algorithm updates each centroid position
# to be the mean of the points assigned to its class
# $$ \forall \ell \in \{1,\ldots,k\}, \quad c_\ell \leftarrow
# \frac{ \sum_{i : \bar y_i=\ell} x_i }{ \sharp \{i : \bar y_i=\ell\} }. $$

# In[28]:

for l in np.arange(0,k):
    C[l,:] = np.mean( X[yb==l,:], axis=0 )

# __Exercise 3__
#
# Perform several steps of the $k$-means algorithm.

# In[29]:

get_ipython().run_line_magic('run', '-i nt_solutions/ml_1_pca_nn/exo3')

# In[30]:

## Insert your code here.

# Display the histogram of the true classes (i.e. according to $y$) inside
# each estimated class (i.e. according to $\bar y$).

# In[31]:

plt.clf()
for l in np.arange(1,k+1):
    I = np.nonzero(yb==l-1)[0]
    h = myhist(y[I])
    h = h/h.sum()
    plt.subplot(k,1,l)
    plt.bar(np.arange(1,k+1), h.flatten())

# __Exercise 4__
#
# Implement better initialization strategies such as farthest point sampling or
# [$k$-means++](https://en.wikipedia.org/wiki/K-means%2B%2B).

# In[32]:

get_ipython().run_line_magic('run', '-i nt_solutions/ml_1_pca_nn/exo4')

# In[33]:

## Insert your code here.
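# As a point of comparison for Exercises 3 and 4 (not a replacement for them),
# scikit-learn's `KMeans` runs the same algorithm with a $k$-means++ initialization by
# default. The sketch below assumes scikit-learn is installed and reuses `X`, `k` and the
# PCA coordinates `Z` defined above; the variable names `km`, `yb_sk` are illustrative.

from sklearn.cluster import KMeans

# Fit k-means with k-means++ initialization and several random restarts.
km = KMeans(n_clusters=k, n_init=10).fit(X)
yb_sk = km.labels_           # estimated class of each point
C_sk = km.cluster_centers_   # estimated centroids, one per row

# Display the estimated classes in the 2-D PCA coordinates, one color per cluster.
plt.clf()
for i in np.arange(0, k):
    I = np.nonzero(yb_sk == i)[0]
    plt.plot(Z[I, 0], Z[I, 1], '.')
plt.axis('equal');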