#!/usr/bin/env python # coding: utf-8 # Copyright (c) 2015-2017 [Sebastian Raschka](sebastianraschka.com) # # https://github.com/rasbt/python-machine-learning-book # # [MIT License](https://github.com/rasbt/python-machine-learning-book/blob/master/LICENSE.txt) # # Python Machine Learning - Code Examples # # Chapter 5 - Compressing Data via Dimensionality Reduction # Note that the optional watermark extension is a small IPython notebook plugin that I developed to make the code reproducible. You can just skip the following line(s). # In[1]: get_ipython().run_line_magic('load_ext', 'watermark') get_ipython().run_line_magic('watermark', "-a 'Sebastian Raschka' -u -d -p numpy,scipy,matplotlib,sklearn") # *The use of `watermark` is optional. You can install this IPython extension via "`pip install watermark`". For more information, please see: https://github.com/rasbt/watermark.* #
#
# ### Overview

# - [Unsupervised dimensionality reduction via principal component analysis](#Unsupervised-dimensionality-reduction-via-principal-component-analysis)
# - [Total and explained variance](#Total-and-explained-variance)
# - [Feature transformation](#Feature-transformation)
# - [Principal component analysis in scikit-learn](#Principal-component-analysis-in-scikit-learn)
# - [Supervised data compression via linear discriminant analysis](#Supervised-data-compression-via-linear-discriminant-analysis)
# - [Computing the scatter matrices](#Computing-the-scatter-matrices)
# - [Selecting linear discriminants for the new feature subspace](#Selecting-linear-discriminants-for-the-new-feature-subspace)
# - [Projecting samples onto the new feature space](#Projecting-samples-onto-the-new-feature-space)
# - [LDA via scikit-learn](#LDA-via-scikit-learn)
# - [Using kernel principal component analysis for nonlinear mappings](#Using-kernel-principal-component-analysis-for-nonlinear-mappings)
# - [Kernel functions and the kernel trick](#Kernel-functions-and-the-kernel-trick)
# - [Implementing a kernel principal component analysis in Python](#Implementing-a-kernel-principal-component-analysis-in-Python)
# - [Example 1 – separating half-moon shapes](#Example-1:-Separating-half-moon-shapes)
# - [Example 2 – separating concentric circles](#Example-2:-Separating-concentric-circles)
# - [Projecting new data points](#Projecting-new-data-points)
# - [Kernel principal component analysis in scikit-learn](#Kernel-principal-component-analysis-in-scikit-learn)
# - [Summary](#Summary)
#
#
# In[2]:

from IPython.display import Image
get_ipython().run_line_magic('matplotlib', 'inline')


# In[3]:

# Added a version check to account for the API changes in scikit-learn 0.18
from distutils.version import LooseVersion as Version
from sklearn import __version__ as sklearn_version


# # Unsupervised dimensionality reduction via principal component analysis

# In[4]:

Image(filename='./images/05_01.png', width=400)


# In[5]:

import pandas as pd

df_wine = pd.read_csv('https://archive.ics.uci.edu/ml/'
                      'machine-learning-databases/wine/wine.data',
                      header=None)

df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash',
                   'Alcalinity of ash', 'Magnesium', 'Total phenols',
                   'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
                   'Color intensity', 'Hue',
                   'OD280/OD315 of diluted wines', 'Proline']

df_wine.head()

#
# # ### Note: # # # If the link to the Wine dataset provided above does not work for you, you can find a local copy in this repository at [./../datasets/wine/wine.data](./../datasets/wine/wine.data). # # Or you could fetch it via # # # In[6]: df_wine = pd.read_csv('https://raw.githubusercontent.com/rasbt/python-machine-learning-book/master/code/datasets/wine/wine.data', header=None) df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash', 'Magnesium', 'Total phenols', 'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins', 'Color intensity', 'Hue', 'OD280/OD315 of diluted wines', 'Proline'] df_wine.head() #
# Splitting the data into 70% training and 30% test subsets.

# In[7]:

if Version(sklearn_version) < '0.18':
    from sklearn.cross_validation import train_test_split
else:
    from sklearn.model_selection import train_test_split

X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.3, random_state=0)


# Standardizing the data.

# In[8]:

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)


# ---
#
# **Note**
#
# Accidentally, I wrote `X_test_std = sc.fit_transform(X_test)` instead of `X_test_std = sc.transform(X_test)`. In this case, it wouldn't make a big difference since the mean and standard deviation of the test set should be (quite) similar to those of the training set. However, as you may remember from Chapter 3, the correct way is to re-use the parameters from the training set if we are doing any kind of transformation -- the test set should basically stand for "new, unseen" data.
#
# My initial typo reflects a common mistake: some people do *not* re-use these parameters from the model training/building step and standardize the new data "from scratch." Here's a simple example to explain why this is a problem.
#
# Let's assume we have a simple training set consisting of 3 samples with 1 feature (let's call this feature "length"):
#
# - train_1: 10 cm -> class_2
# - train_2: 20 cm -> class_2
# - train_3: 30 cm -> class_1
#
# mean: 20, std.: 8.2
#
# After standardization, the transformed feature values are
#
# - train_std_1: -1.21 -> class_2
# - train_std_2: 0 -> class_2
# - train_std_3: 1.21 -> class_1
#
# Next, let's assume our model has learned to classify samples with a standardized length value < 0.6 as class_2 (class_1 otherwise). So far so good. Now, let's say we have 3 unlabeled data points that we want to classify:
#
# - new_4: 5 cm -> class ?
# - new_5: 6 cm -> class ?
# - new_6: 7 cm -> class ?
#
# If we look at the unstandardized "length" values in our training dataset, it is intuitive to say that all of these samples likely belong to class_2. However, if we standardize them by re-computing the standard deviation and mean from the new data, we get values similar to those in the training set, and the classifier would (probably incorrectly) assign class_1 to sample new_6:
#
# - new_std_4: -1.21 -> class_2
# - new_std_5: 0 -> class_2
# - new_std_6: 1.21 -> class_1
#
# However, if we use the parameters from our "training set standardization," we get the values:
#
# - new_std_4: -1.83 -> class_2
# - new_std_5: -1.71 -> class_2
# - new_std_6: -1.59 -> class_2
#
# The values 5 cm, 6 cm, and 7 cm are much lower than anything we have seen in the training set previously. Thus, it only makes sense that the standardized features of the "new samples" are much lower than every standardized feature in the training set.
#
# ---

# Eigendecomposition of the covariance matrix.

# In[9]:

import numpy as np
cov_mat = np.cov(X_train_std.T)
eigen_vals, eigen_vecs = np.linalg.eig(cov_mat)

print('\nEigenvalues \n%s' % eigen_vals)


# **Note**:
#
# Above, I used the [`numpy.linalg.eig`](http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.eig.html) function to decompose the symmetric covariance matrix into its eigenvalues and eigenvectors.
#
>>> eigen_vals, eigen_vecs = np.linalg.eig(cov_mat)
# This is not really a "mistake," but probably suboptimal. It would be better to use [`numpy.linalg.eigh`](http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.eigh.html) in such cases, which has been designed for [Hermitian matrices](https://en.wikipedia.org/wiki/Hermitian_matrix). `numpy.linalg.eigh` always returns real eigenvalues, whereas the numerically less stable `np.linalg.eig` is designed to decompose nonsymmetric square matrices and may return complex eigenvalues in certain cases. (S.R.)
#
#
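# *A small optional check (not part of the book's code): since the covariance matrix is symmetric, `numpy.linalg.eigh` can be applied directly; it is guaranteed to return real eigenvalues, in ascending order, and its spectrum agrees with the one computed above.*

# In[ ]:

# hedged alternative to np.linalg.eig for the symmetric covariance matrix
eigen_vals_h, eigen_vecs_h = np.linalg.eigh(cov_mat)

# the two spectra agree up to ordering and numerical noise
print(np.allclose(sorted(eigen_vals_h, reverse=True),
                  sorted(eigen_vals.real, reverse=True)))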
#
# ## Total and explained variance # In[10]: tot = sum(eigen_vals) var_exp = [(i / tot) for i in sorted(eigen_vals, reverse=True)] cum_var_exp = np.cumsum(var_exp) # In[11]: import matplotlib.pyplot as plt plt.bar(range(1, 14), var_exp, alpha=0.5, align='center', label='individual explained variance') plt.step(range(1, 14), cum_var_exp, where='mid', label='cumulative explained variance') plt.ylabel('Explained variance ratio') plt.xlabel('Principal components') plt.legend(loc='best') plt.tight_layout() # plt.savefig('./figures/pca1.png', dpi=300) plt.show() #
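# *A small follow-up sketch (not from the book): given the cumulative explained variance computed above, we can pick the smallest number of components that reaches a chosen threshold -- 90% is used here purely as an example.*

# In[ ]:

# first component index at which the cumulative ratio reaches 0.90
k = np.argmax(cum_var_exp >= 0.90) + 1
print('Components needed for >= 90%% of the variance: %d' % k)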
#
# ## Feature transformation

# In[12]:

# Make a list of (eigenvalue, eigenvector) tuples
eigen_pairs = [(np.abs(eigen_vals[i]), eigen_vecs[:, i])
               for i in range(len(eigen_vals))]

# Sort the (eigenvalue, eigenvector) tuples from high to low
eigen_pairs.sort(key=lambda k: k[0], reverse=True)

# Note: I added the `key=lambda k: k[0]` in the sort call above
# just like I used it further below in the LDA section.
# This is to avoid problems if there are ties in the eigenvalue
# array (i.e., the sorting algorithm now only considers the
# first element of each tuple).


# In[13]:

w = np.hstack((eigen_pairs[0][1][:, np.newaxis],
               eigen_pairs[1][1][:, np.newaxis]))
print('Matrix W:\n', w)


# **Note**
# Depending on which versions of NumPy and LAPACK you are using, you may obtain the matrix W with its signs flipped. E.g., the matrix shown in the book was printed as:
#
# ```
# [[ 0.14669811  0.50417079]
#  [-0.24224554  0.24216889]
#  [-0.02993442  0.28698484]
#  [-0.25519002 -0.06468718]
#  [ 0.12079772  0.22995385]
#  [ 0.38934455  0.09363991]
#  [ 0.42326486  0.01088622]
#  [-0.30634956  0.01870216]
#  [ 0.30572219  0.03040352]
#  [-0.09869191  0.54527081]
# ```
#
# Please note that this is not an issue: if $v$ is an eigenvector of a matrix $\Sigma$, we have
#
# $$\Sigma v = \lambda v,$$
#
# where $\lambda$ is our eigenvalue,
#
# then $-v$ is also an eigenvector that has the same eigenvalue, since
#
# $$\Sigma(-v) = -\Sigma v = -\lambda v = \lambda(-v).$$
#
# (A small numerical check of this property follows at the end of this section.)

# In[14]:

X_train_pca = X_train_std.dot(w)
colors = ['r', 'b', 'g']
markers = ['s', 'x', 'o']

for l, c, m in zip(np.unique(y_train), colors, markers):
    plt.scatter(X_train_pca[y_train == l, 0],
                X_train_pca[y_train == l, 1],
                c=c, label=l, marker=m)

plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.legend(loc='lower left')
plt.tight_layout()
# plt.savefig('./figures/pca2.png', dpi=300)
plt.show()


# In[15]:

X_train_std[0].dot(w)

#
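# *To make the sign-flip argument above concrete, here is a minimal numerical check (assuming `cov_mat` and `eigen_pairs` from the cells above): both $v$ and $-v$ satisfy the eigenvalue equation for the same $\lambda$.*

# In[ ]:

lam, v = eigen_pairs[0]  # largest eigenvalue and its eigenvector

print(np.allclose(cov_mat.dot(v), lam * v))      # Sigma v    = lambda v
print(np.allclose(cov_mat.dot(-v), lam * (-v)))  # Sigma (-v) = lambda (-v)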
#
# ## Principal component analysis in scikit-learn # In[16]: from sklearn.decomposition import PCA pca = PCA() X_train_pca = pca.fit_transform(X_train_std) pca.explained_variance_ratio_ # In[17]: plt.bar(range(1, 14), pca.explained_variance_ratio_, alpha=0.5, align='center') plt.step(range(1, 14), np.cumsum(pca.explained_variance_ratio_), where='mid') plt.ylabel('Explained variance ratio') plt.xlabel('Principal components') plt.show() # In[18]: pca = PCA(n_components=2) X_train_pca = pca.fit_transform(X_train_std) X_test_pca = pca.transform(X_test_std) # In[19]: plt.scatter(X_train_pca[:, 0], X_train_pca[:, 1]) plt.xlabel('PC 1') plt.ylabel('PC 2') plt.show() # In[20]: from matplotlib.colors import ListedColormap def plot_decision_regions(X, y, classifier, resolution=0.02): # setup marker generator and color map markers = ('s', 'x', 'o', '^', 'v') colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan') cmap = ListedColormap(colors[:len(np.unique(y))]) # plot the decision surface x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1 x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1 xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution), np.arange(x2_min, x2_max, resolution)) Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T) Z = Z.reshape(xx1.shape) plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap) plt.xlim(xx1.min(), xx1.max()) plt.ylim(xx2.min(), xx2.max()) # plot class samples for idx, cl in enumerate(np.unique(y)): plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1], alpha=0.6, c=cmap(idx), edgecolor='black', marker=markers[idx], label=cl) # Training logistic regression classifier using the first 2 principal components. # In[21]: from sklearn.linear_model import LogisticRegression lr = LogisticRegression() lr = lr.fit(X_train_pca, y_train) # In[22]: plot_decision_regions(X_train_pca, y_train, classifier=lr) plt.xlabel('PC 1') plt.ylabel('PC 2') plt.legend(loc='lower left') plt.tight_layout() # plt.savefig('./figures/pca3.png', dpi=300) plt.show() # In[23]: plot_decision_regions(X_test_pca, y_test, classifier=lr) plt.xlabel('PC 1') plt.ylabel('PC 2') plt.legend(loc='lower left') plt.tight_layout() # plt.savefig('./figures/pca4.png', dpi=300) plt.show() # In[24]: pca = PCA(n_components=None) X_train_pca = pca.fit_transform(X_train_std) pca.explained_variance_ratio_ #
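# *Side note (not in the book's code): in scikit-learn 0.18 and newer, `PCA` also accepts a float for `n_components`, in which case it keeps just enough components to explain at least that fraction of the variance. A minimal sketch:*

# In[ ]:

pca_90 = PCA(n_components=0.90, svd_solver='full')
pca_90.fit(X_train_std)
print('Components kept for >= 90%% of the variance: %d' % pca_90.n_components_)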
#
# # Supervised data compression via linear discriminant analysis # In[25]: Image(filename='./images/05_06.png', width=400) #
#
# ## Computing the scatter matrices # Calculate the mean vectors for each class: # In[26]: np.set_printoptions(precision=4) mean_vecs = [] for label in range(1, 4): mean_vecs.append(np.mean(X_train_std[y_train == label], axis=0)) print('MV %s: %s\n' % (label, mean_vecs[label - 1])) # Compute the within-class scatter matrix: # In[27]: d = 13 # number of features S_W = np.zeros((d, d)) for label, mv in zip(range(1, 4), mean_vecs): class_scatter = np.zeros((d, d)) # scatter matrix for each class for row in X_train_std[y_train == label]: row, mv = row.reshape(d, 1), mv.reshape(d, 1) # make column vectors class_scatter += (row - mv).dot((row - mv).T) S_W += class_scatter # sum class scatter matrices print('Within-class scatter matrix: %sx%s' % (S_W.shape[0], S_W.shape[1])) # Better: covariance matrix since classes are not equally distributed: # In[28]: print('Class label distribution: %s' % np.bincount(y_train)[1:]) # In[29]: d = 13 # number of features S_W = np.zeros((d, d)) for label, mv in zip(range(1, 4), mean_vecs): class_scatter = np.cov(X_train_std[y_train == label].T) S_W += class_scatter print('Scaled within-class scatter matrix: %sx%s' % (S_W.shape[0], S_W.shape[1])) # Compute the between-class scatter matrix: # In[30]: mean_overall = np.mean(X_train_std, axis=0) d = 13 # number of features S_B = np.zeros((d, d)) for i, mean_vec in enumerate(mean_vecs): n = X_train[y_train == i + 1, :].shape[0] mean_vec = mean_vec.reshape(d, 1) # make column vector mean_overall = mean_overall.reshape(d, 1) # make column vector S_B += n * (mean_vec - mean_overall).dot((mean_vec - mean_overall).T) print('Between-class scatter matrix: %sx%s' % (S_B.shape[0], S_B.shape[1])) #
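# *The relationship between the two within-class estimates above can be verified numerically: the covariance matrix of a class is simply its scatter matrix divided by $N_i - 1$. A small sketch for class 1, assuming the variables from the previous cells:*

# In[ ]:

X_c1 = X_train_std[y_train == 1]

scatter_c1 = np.zeros((13, 13))
for row in X_c1:
    diff = (row - mean_vecs[0]).reshape(13, 1)  # column vector
    scatter_c1 += diff.dot(diff.T)

# scatter / (N_1 - 1) equals the class covariance matrix
print(np.allclose(scatter_c1 / (X_c1.shape[0] - 1), np.cov(X_c1.T)))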
#
# ## Selecting linear discriminants for the new feature subspace

# Solve the generalized eigenvalue problem for the matrix $S_W^{-1}S_B$:

# In[31]:

eigen_vals, eigen_vecs = np.linalg.eig(np.linalg.inv(S_W).dot(S_B))


# **Note**:
#
# Above, I used the [`numpy.linalg.eig`](http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.eig.html) function to decompose the matrix $S_W^{-1}S_B$ into its eigenvalues and eigenvectors.
#
>>> eigen_vals, eigen_vecs = np.linalg.eig(np.linalg.inv(S_W).dot(S_B))
# Strictly speaking, [`numpy.linalg.eigh`](http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.eigh.html) -- which is designed for [Hermitian matrices](https://en.wikipedia.org/wiki/Hermitian_matrix) and always returns real eigenvalues -- is not applicable here, because the matrix $S_W^{-1}S_B$ is, in general, not symmetric. The numerically less stable `np.linalg.eig` can decompose such nonsymmetric square matrices, but it may return eigenvalues with small, purely numerical imaginary parts, which is why we work with the real parts (`.real`) below. (An alternative that avoids the explicit matrix inverse, via a generalized eigenvalue problem, is sketched at the end of this subsection.) (S.R.)
#
# Sort the eigenvectors in decreasing order of the eigenvalues:

# In[32]:

# Make a list of (eigenvalue, eigenvector) tuples
eigen_pairs = [(np.abs(eigen_vals[i]), eigen_vecs[:, i])
               for i in range(len(eigen_vals))]

# Sort the (eigenvalue, eigenvector) tuples from high to low
eigen_pairs = sorted(eigen_pairs, key=lambda k: k[0], reverse=True)

# Visually confirm that the list is correctly sorted by decreasing eigenvalues
print('Eigenvalues in decreasing order:\n')
for eigen_val in eigen_pairs:
    print(eigen_val[0])


# In[33]:

tot = sum(eigen_vals.real)
discr = [(i / tot) for i in sorted(eigen_vals.real, reverse=True)]
cum_discr = np.cumsum(discr)

plt.bar(range(1, 14), discr, alpha=0.5, align='center',
        label='individual "discriminability"')
plt.step(range(1, 14), cum_discr, where='mid',
         label='cumulative "discriminability"')
plt.ylabel('"discriminability" ratio')
plt.xlabel('Linear Discriminants')
plt.ylim([-0.1, 1.1])
plt.legend(loc='best')
plt.tight_layout()
# plt.savefig('./figures/lda1.png', dpi=300)
plt.show()


# In[34]:

w = np.hstack((eigen_pairs[0][1][:, np.newaxis].real,
               eigen_pairs[1][1][:, np.newaxis].real))
print('Matrix W:\n', w)

#
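# *As mentioned in the note above, here is a sketch of the alternative (not used in the book): instead of inverting $S_W$ explicitly, solve the equivalent generalized eigenvalue problem $S_B v = \lambda S_W v$ with `scipy.linalg.eigh`. Since both scatter matrices are symmetric (and $S_W$ is positive definite for this dataset), the eigenvalues come back real, in ascending order.*

# In[ ]:

from scipy.linalg import eigh as sp_eigh

gen_vals, gen_vecs = sp_eigh(S_B, S_W)  # solves S_B v = lambda S_W v

print('Eigenvalues in decreasing order:\n')
for val in gen_vals[::-1]:
    print(val)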
#
# ## Projecting samples onto the new feature space

# In[35]:

X_train_lda = X_train_std.dot(w)

colors = ['r', 'b', 'g']
markers = ['s', 'x', 'o']

for l, c, m in zip(np.unique(y_train), colors, markers):
    # multiply by (-1) to mirror the projection for plotting;
    # flipping the sign of a linear discriminant does not change
    # the class separability
    plt.scatter(X_train_lda[y_train == l, 0] * (-1),
                X_train_lda[y_train == l, 1] * (-1),
                c=c, label=l, marker=m)

plt.xlabel('LD 1')
plt.ylabel('LD 2')
plt.legend(loc='lower right')
plt.tight_layout()
# plt.savefig('./figures/lda2.png', dpi=300)
plt.show()

#
#
# ## LDA via scikit-learn # In[36]: if Version(sklearn_version) < '0.18': from sklearn.lda import LDA else: from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA lda = LDA(n_components=2) X_train_lda = lda.fit_transform(X_train_std, y_train) # In[37]: from sklearn.linear_model import LogisticRegression lr = LogisticRegression() lr = lr.fit(X_train_lda, y_train) plot_decision_regions(X_train_lda, y_train, classifier=lr) plt.xlabel('LD 1') plt.ylabel('LD 2') plt.legend(loc='lower left') plt.tight_layout() # plt.savefig('./images/lda3.png', dpi=300) plt.show() # In[38]: X_test_lda = lda.transform(X_test_std) plot_decision_regions(X_test_lda, y_test, classifier=lr) plt.xlabel('LD 1') plt.ylabel('LD 2') plt.legend(loc='lower left') plt.tight_layout() # plt.savefig('./images/lda4.png', dpi=300) plt.show() #
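# *A quick cross-check (assuming a reasonably recent scikit-learn version): the fitted LDA estimator exposes the ratio of between-class variance captured by each linear discriminant, which should roughly mirror the "discriminability" ratios computed manually above.*

# In[ ]:

print(lda.explained_variance_ratio_)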
#
# # Using kernel principal component analysis for nonlinear mappings # In[39]: Image(filename='./images/05_11.png', width=500) #
#
# ## Implementing a kernel principal component analysis in Python # In[40]: from scipy.spatial.distance import pdist, squareform from scipy import exp from numpy.linalg import eigh import numpy as np def rbf_kernel_pca(X, gamma, n_components): """ RBF kernel PCA implementation. Parameters ------------ X: {NumPy ndarray}, shape = [n_samples, n_features] gamma: float Tuning parameter of the RBF kernel n_components: int Number of principal components to return Returns ------------ X_pc: {NumPy ndarray}, shape = [n_samples, k_features] Projected dataset """ # Calculate pairwise squared Euclidean distances # in the MxN dimensional dataset. sq_dists = pdist(X, 'sqeuclidean') # Convert pairwise distances into a square matrix. mat_sq_dists = squareform(sq_dists) # Compute the symmetric kernel matrix. K = exp(-gamma * mat_sq_dists) # Center the kernel matrix. N = K.shape[0] one_n = np.ones((N, N)) / N K = K - one_n.dot(K) - K.dot(one_n) + one_n.dot(K).dot(one_n) # Obtaining eigenpairs from the centered kernel matrix # numpy.linalg.eigh returns them in sorted order eigvals, eigvecs = eigh(K) # Collect the top k eigenvectors (projected samples) X_pc = np.column_stack((eigvecs[:, -i] for i in range(1, n_components + 1))) return X_pc #
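# *A small sanity check of the centering step (a sketch, not part of the book's code): the manual kernel centering used above should agree with scikit-learn's `KernelCenterer` -- illustrated here on a tiny, randomly generated RBF kernel matrix with an arbitrary gamma of 15.*

# In[ ]:

from sklearn.preprocessing import KernelCenterer

X_tiny = np.random.RandomState(1).rand(5, 2)
K_tiny = np.exp(-15 * squareform(pdist(X_tiny, 'sqeuclidean')))

N = K_tiny.shape[0]
one_n = np.ones((N, N)) / N
K_manual = K_tiny - one_n.dot(K_tiny) - K_tiny.dot(one_n) \
    + one_n.dot(K_tiny).dot(one_n)

print(np.allclose(K_manual, KernelCenterer().fit_transform(K_tiny)))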
# ### Example 1: Separating half-moon shapes # In[41]: import matplotlib.pyplot as plt from sklearn.datasets import make_moons X, y = make_moons(n_samples=100, random_state=123) plt.scatter(X[y == 0, 0], X[y == 0, 1], color='red', marker='^', alpha=0.5) plt.scatter(X[y == 1, 0], X[y == 1, 1], color='blue', marker='o', alpha=0.5) plt.tight_layout() # plt.savefig('./figures/half_moon_1.png', dpi=300) plt.show() # In[42]: from sklearn.decomposition import PCA from sklearn.preprocessing import StandardScaler scikit_pca = PCA(n_components=2) X_spca = scikit_pca.fit_transform(X) fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(7, 3)) ax[0].scatter(X_spca[y == 0, 0], X_spca[y == 0, 1], color='red', marker='^', alpha=0.5) ax[0].scatter(X_spca[y == 1, 0], X_spca[y == 1, 1], color='blue', marker='o', alpha=0.5) ax[1].scatter(X_spca[y == 0, 0], np.zeros((50, 1)) + 0.02, color='red', marker='^', alpha=0.5) ax[1].scatter(X_spca[y == 1, 0], np.zeros((50, 1)) - 0.02, color='blue', marker='o', alpha=0.5) ax[0].set_xlabel('PC1') ax[0].set_ylabel('PC2') ax[1].set_ylim([-1, 1]) ax[1].set_yticks([]) ax[1].set_xlabel('PC1') plt.tight_layout() # plt.savefig('./figures/half_moon_2.png', dpi=300) plt.show() # In[43]: from matplotlib.ticker import FormatStrFormatter X_kpca = rbf_kernel_pca(X, gamma=15, n_components=2) fig, ax = plt.subplots(nrows=1,ncols=2, figsize=(7,3)) ax[0].scatter(X_kpca[y==0, 0], X_kpca[y==0, 1], color='red', marker='^', alpha=0.5) ax[0].scatter(X_kpca[y==1, 0], X_kpca[y==1, 1], color='blue', marker='o', alpha=0.5) ax[1].scatter(X_kpca[y==0, 0], np.zeros((50,1))+0.02, color='red', marker='^', alpha=0.5) ax[1].scatter(X_kpca[y==1, 0], np.zeros((50,1))-0.02, color='blue', marker='o', alpha=0.5) ax[0].set_xlabel('PC1') ax[0].set_ylabel('PC2') ax[1].set_ylim([-1, 1]) ax[1].set_yticks([]) ax[1].set_xlabel('PC1') ax[0].xaxis.set_major_formatter(FormatStrFormatter('%0.1f')) ax[1].xaxis.set_major_formatter(FormatStrFormatter('%0.1f')) plt.tight_layout() # plt.savefig('./figures/half_moon_3.png', dpi=300) plt.show() #
# ### Example 2: Separating concentric circles # In[44]: from sklearn.datasets import make_circles X, y = make_circles(n_samples=1000, random_state=123, noise=0.1, factor=0.2) plt.scatter(X[y == 0, 0], X[y == 0, 1], color='red', marker='^', alpha=0.5) plt.scatter(X[y == 1, 0], X[y == 1, 1], color='blue', marker='o', alpha=0.5) plt.tight_layout() # plt.savefig('./figures/circles_1.png', dpi=300) plt.show() # In[45]: scikit_pca = PCA(n_components=2) X_spca = scikit_pca.fit_transform(X) fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(7, 3)) ax[0].scatter(X_spca[y == 0, 0], X_spca[y == 0, 1], color='red', marker='^', alpha=0.5) ax[0].scatter(X_spca[y == 1, 0], X_spca[y == 1, 1], color='blue', marker='o', alpha=0.5) ax[1].scatter(X_spca[y == 0, 0], np.zeros((500, 1)) + 0.02, color='red', marker='^', alpha=0.5) ax[1].scatter(X_spca[y == 1, 0], np.zeros((500, 1)) - 0.02, color='blue', marker='o', alpha=0.5) ax[0].set_xlabel('PC1') ax[0].set_ylabel('PC2') ax[1].set_ylim([-1, 1]) ax[1].set_yticks([]) ax[1].set_xlabel('PC1') plt.tight_layout() # plt.savefig('./figures/circles_2.png', dpi=300) plt.show() # In[46]: X_kpca = rbf_kernel_pca(X, gamma=15, n_components=2) fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(7, 3)) ax[0].scatter(X_kpca[y == 0, 0], X_kpca[y == 0, 1], color='red', marker='^', alpha=0.5) ax[0].scatter(X_kpca[y == 1, 0], X_kpca[y == 1, 1], color='blue', marker='o', alpha=0.5) ax[1].scatter(X_kpca[y == 0, 0], np.zeros((500, 1)) + 0.02, color='red', marker='^', alpha=0.5) ax[1].scatter(X_kpca[y == 1, 0], np.zeros((500, 1)) - 0.02, color='blue', marker='o', alpha=0.5) ax[0].set_xlabel('PC1') ax[0].set_ylabel('PC2') ax[1].set_ylim([-1, 1]) ax[1].set_yticks([]) ax[1].set_xlabel('PC1') plt.tight_layout() # plt.savefig('./figures/circles_3.png', dpi=300) plt.show() #
#
# ## Projecting new data points # In[47]: from scipy.spatial.distance import pdist, squareform from scipy import exp from scipy.linalg import eigh import numpy as np def rbf_kernel_pca(X, gamma, n_components): """ RBF kernel PCA implementation. Parameters ------------ X: {NumPy ndarray}, shape = [n_samples, n_features] gamma: float Tuning parameter of the RBF kernel n_components: int Number of principal components to return Returns ------------ X_pc: {NumPy ndarray}, shape = [n_samples, k_features] Projected dataset lambdas: list Eigenvalues """ # Calculate pairwise squared Euclidean distances # in the MxN dimensional dataset. sq_dists = pdist(X, 'sqeuclidean') # Convert pairwise distances into a square matrix. mat_sq_dists = squareform(sq_dists) # Compute the symmetric kernel matrix. K = exp(-gamma * mat_sq_dists) # Center the kernel matrix. N = K.shape[0] one_n = np.ones((N, N)) / N K = K - one_n.dot(K) - K.dot(one_n) + one_n.dot(K).dot(one_n) # Obtaining eigenpairs from the centered kernel matrix # numpy.eigh returns them in sorted order eigvals, eigvecs = eigh(K) # Collect the top k eigenvectors (projected samples) alphas = np.column_stack((eigvecs[:, -i] for i in range(1, n_components + 1))) # Collect the corresponding eigenvalues lambdas = [eigvals[-i] for i in range(1, n_components + 1)] return alphas, lambdas # In[48]: X, y = make_moons(n_samples=100, random_state=123) alphas, lambdas = rbf_kernel_pca(X, gamma=15, n_components=1) # In[49]: x_new = X[-1] x_new # In[50]: x_proj = alphas[-1] # original projection x_proj # In[51]: def project_x(x_new, X, gamma, alphas, lambdas): pair_dist = np.array([np.sum((x_new - row)**2) for row in X]) k = np.exp(-gamma * pair_dist) return k.dot(alphas / lambdas) # projection of the "new" datapoint x_reproj = project_x(x_new, X, gamma=15, alphas=alphas, lambdas=lambdas) x_reproj # In[52]: plt.scatter(alphas[y == 0, 0], np.zeros((50)), color='red', marker='^', alpha=0.5) plt.scatter(alphas[y == 1, 0], np.zeros((50)), color='blue', marker='o', alpha=0.5) plt.scatter(x_proj, 0, color='black', label='original projection of point X[25]', marker='^', s=100) plt.scatter(x_reproj, 0, color='green', label='remapped point X[25]', marker='x', s=500) plt.legend(scatterpoints=1) plt.tight_layout() # plt.savefig('./figures/reproject.png', dpi=300) plt.show() # In[53]: X, y = make_moons(n_samples=100, random_state=123) alphas, lambdas = rbf_kernel_pca(X[:-1, :], gamma=15, n_components=1) def project_x(x_new, X, gamma, alphas, lambdas): pair_dist = np.array([np.sum((x_new - row)**2) for row in X]) k = np.exp(-gamma * pair_dist) return k.dot(alphas / lambdas) # projection of the "new" datapoint x_new = X[-1] x_reproj = project_x(x_new, X[:-1], gamma=15, alphas=alphas, lambdas=lambdas) plt.scatter(alphas[y[:-1] == 0, 0], np.zeros((50)), color='red', marker='^', alpha=0.5) plt.scatter(alphas[y[:-1] == 1, 0], np.zeros((49)), color='blue', marker='o', alpha=0.5) plt.scatter(x_reproj, 0, color='green', label='new point [ 100.0, 100.0]', marker='x', s=500) plt.legend(scatterpoints=1) # In[54]: plt.scatter(alphas[y[:-1] == 0, 0], np.zeros((50)), color='red', marker='^', alpha=0.5) plt.scatter(alphas[y[:-1] == 1, 0], np.zeros((49)), color='blue', marker='o', alpha=0.5) plt.scatter(x_proj, 0, color='black', label='some point [1.8713, 0.0093]', marker='^', s=100) plt.scatter(x_reproj, 0, color='green', label='new point [ 100.0, 100.0]', marker='x', s=500) plt.legend(scatterpoints=1) plt.tight_layout() # plt.savefig('./figures/reproject.png', dpi=300) plt.show() #
#
# ## Kernel principal component analysis in scikit-learn # In[55]: from sklearn.decomposition import KernelPCA X, y = make_moons(n_samples=100, random_state=123) scikit_kpca = KernelPCA(n_components=2, kernel='rbf', gamma=15) X_skernpca = scikit_kpca.fit_transform(X) plt.scatter(X_skernpca[y == 0, 0], X_skernpca[y == 0, 1], color='red', marker='^', alpha=0.5) plt.scatter(X_skernpca[y == 1, 0], X_skernpca[y == 1, 1], color='blue', marker='o', alpha=0.5) plt.xlabel('PC1') plt.ylabel('PC2') plt.tight_layout() # plt.savefig('./figures/scikit_kpca.png', dpi=300) plt.show() #
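# *One more note (a sketch, assuming the variables from the cell above are available): unlike our first manual implementation, scikit-learn's `KernelPCA` can project new samples directly via its `transform` method -- the same idea as in the "Projecting new data points" section.*

# In[ ]:

scikit_kpca = KernelPCA(n_components=2, kernel='rbf', gamma=15)
scikit_kpca.fit(X[:-1])               # fit on all but the last sample
print(scikit_kpca.transform(X[-1:]))  # project the held-out point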
#
# # Summary # ...