#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('load_ext', 'watermark')
get_ipython().run_line_magic('watermark', " -d -u -a 'Andreas Mueller, Kyle Kastner, Sebastian Raschka' -v -p numpy,scipy,matplotlib,scikit-learn")


# # SciPy 2016 Scikit-learn Tutorial

# # Case Study - Face Recognition with Eigenfaces

# In[ ]:


get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt


# Here we'll take a look at a simple facial recognition example.
# This uses a dataset available within scikit-learn consisting of a
# subset of the [Labeled Faces in the Wild](http://vis-www.cs.umass.edu/lfw/)
# data. Note that this is a relatively large download (~200MB) so it may
# take a while to execute.

# In[ ]:


from sklearn import datasets

# Keep only people with at least 70 images and shrink every image to 40% of
# its original size; the data is cached under the local ``datasets`` folder.
lfw_people = datasets.fetch_lfw_people(min_faces_per_person=70, resize=0.4,
                                       data_home='datasets')
lfw_people.data.shape


# If you're on a unix-based system such as linux or Mac OSX, these shell commands
# can be used to see the downloaded dataset:

# In[ ]:


get_ipython().system('ls datasets')


# In[ ]:


get_ipython().system('du -sh datasets/lfw_home')


# Once again, let's visualize these faces to see what we're working with:

# In[ ]:


fig = plt.figure(figsize=(8, 6))
# plot several images
for i in range(15):
    ax = fig.add_subplot(3, 5, i + 1, xticks=[], yticks=[])
    ax.imshow(lfw_people.images[i], cmap=plt.cm.bone)


# In[ ]:


import numpy as np

plt.figure(figsize=(10, 2))

# Count the number of images per person in a single pass over the labels.
unique_targets, counts = np.unique(lfw_people.target, return_counts=True)

# Label each bar with the person's name and tilt the labels so they don't
# overlap.
plt.xticks(unique_targets, lfw_people.target_names[unique_targets])
locs, labels = plt.xticks()
plt.setp(labels, rotation=45, size=14)
plt.bar(unique_targets, counts)


# One thing to note is that these faces have already been centered and scaled
# to a common size. This is an important preprocessing piece for facial
# recognition, and is a process that can require a large collection of training
# data.
# This can be done in scikit-learn, but the challenge is gathering a
# sufficient amount of training data for the algorithm to work.
#
# Fortunately, centering and scaling has already been applied to this dataset.
#
# (One good resource is [OpenCV](http://opencv.willowgarage.com/wiki/FaceRecognition), the
# *Open Computer Vision Library*.)

# Next, we'll perform a Support Vector classification of the images; as always,
# we start with a typical train-test split on the images:

# In[ ]:


# NOTE: ``sklearn.cross_validation`` was removed in scikit-learn 0.20;
# ``train_test_split`` now lives in ``sklearn.model_selection``.
from sklearn.model_selection import train_test_split

# Stratify on the labels so that every person appears in the same proportion
# in both the training and the test set.
X_train, X_test, y_train, y_test = train_test_split(
    lfw_people.data, lfw_people.target,
    test_size=0.25, stratify=lfw_people.target, random_state=0)

print('Train size:', X_train.shape)
print('Test size:', X_test.shape)


# ## Preprocessing: Principal Component Analysis

# 1850 dimensions are a lot for fitting an SVM. We can use PCA to reduce these
# 1850 features to a manageable size, while maintaining most of the information
# in the dataset. Here it is useful to use the randomized SVD solver of ``PCA``
# (formerly available as the separate ``RandomizedPCA`` class), which is an
# approximation of PCA that can be much faster for large datasets. We saw this
# method in the previous notebook, and will use it again here:

# In[ ]:


from sklearn import decomposition

# NOTE: ``RandomizedPCA`` was removed in scikit-learn 0.20; the equivalent is
# ``PCA`` with ``svd_solver='randomized'``.
pca = decomposition.PCA(n_components=150, svd_solver='randomized',
                        whiten=True, random_state=1999)

# Fit the projection on the training data only, then apply it to both splits
# so that no information from the test set leaks into the preprocessing.
pca.fit(X_train)
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

print('Train size after PCA:', X_train_pca.shape)
print('Test size after PCA:', X_test_pca.shape)


# These projected components correspond to factors in a linear combination of
# component images such that the combination approaches the original face.
# In general, PCA can be a powerful technique for preprocessing that *can*
# improve classification performance substantially in certain applications.
# ## Fitting a Support Vector Machine

# Now we'll perform support-vector-machine classification on this reduced dataset:

# In[ ]:


from sklearn import svm

# No ``kernel`` is given, so SVC uses its default RBF kernel; ``gamma`` is the
# width parameter of that kernel.
clf = svm.SVC(C=5., gamma=0.001, random_state=1)
clf.fit(X_train_pca, y_train)


# Finally, we can evaluate how well this classification did. First, we might plot a
# few of the test-cases with the labels learned from the training set:

# In[ ]:


n_shown = 15

# Take the image height/width from the dataset itself rather than hard-coding
# it, so the plot keeps working if the ``resize`` factor used at load time
# changes.
image_shape = lfw_people.images.shape[1:]

# Predict all displayed samples in one call instead of once per subplot.
shown_pred = clf.predict(X_test_pca[:n_shown])

fig = plt.figure(figsize=(8, 6))
for i, (pixels, predicted, actual) in enumerate(
        zip(X_test[:n_shown], shown_pred, y_test[:n_shown])):
    ax = fig.add_subplot(3, 5, i + 1, xticks=[], yticks=[])
    ax.imshow(pixels.reshape(image_shape), cmap=plt.cm.bone)
    # Correct predictions are titled in black, mistakes in red.
    color = 'black' if predicted == actual else 'red'
    ax.set_title(lfw_people.target_names[predicted],
                 fontsize='small', color=color)


# The classifier is correct on an impressive number of images given the simplicity
# of its learning model! Using an RBF-kernel support vector machine on 150
# features derived from the pixel-level data, the algorithm correctly identifies
# a large number of the people in the images.
#
# Again, we can quantify this effectiveness using ``clf.score``

# In[ ]:


print('Accuracy: %.2f%%' % (clf.score(X_test_pca, y_test) * 100))


# ## Final Note

# Here we have used PCA "eigenfaces" as a pre-processing step for facial recognition.
# The reason we chose this is because PCA is a broadly-applicable technique, which can
# be useful for a wide array of data types. For more details on the eigenfaces
# approach, see the original paper by
# [Turk and Pentland, Eigenfaces for Recognition](http://www.face-rec.org/algorithms/PCA/jcn.pdf).
# Research in the field of facial recognition has moved far beyond this paper,
# and has shown that specific feature extraction methods can be more effective.
# However, eigenfaces is a canonical example of machine learning "in the wild",
# and is a simple method with good results.