Here we use principal component analysis (PCA) to reduce the number of features in a dataset of faces. The principal components (PCs) are then fed into a Support Vector Machine (SVM) classifier, which classifies the faces based on the learned features.
The dataset used in this example is a preprocessed excerpt of "Labeled Faces in the Wild" (LFW).
Here we import the libraries that we need for later
from time import time
import logging
import pylab as pl
import numpy as np
import matplotlib.pyplot as plt
# !pip install -U scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import fetch_lfw_people
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from sklearn.svm import SVC
# Display progress: log INFO-and-above messages, each prefixed with a timestamp
logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
Here we pull in the data and store it in numpy arrays
# Download the data: keep only people with at least 70 pictures and rescale
# every face picture by a factor of 0.4
lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)

# Shape information about the images, needed later to plot them
n_samples, h, w = lfw_people.images.shape

# for machine learning we use the data directly (as relative pixel
# position info is ignored by this model)
X = lfw_people.data
n_features = X.shape[1]

# the label to predict is the ID of the person
y = lfw_people.target
target_names = lfw_people.target_names
n_classes = target_names.shape[0]

print("Total dataset size:")
print("n_samples: %d" % n_samples)
print("n_features: %d" % n_features)
print("n_classes: %d" % n_classes)
print("Classes: %s" % target_names)
Downloading LFW metadata: 2019-07-19 05:21:21,839 Downloading LFW metadata: Downloading LFW metadata: 2019-07-19 05:21:28,215 Downloading LFW metadata: Downloading LFW metadata: 2019-07-19 05:21:29,542 Downloading LFW metadata: Downloading LFW data (~200MB): 2019-07-19 05:21:31,138 Downloading LFW data (~200MB):
Total dataset size: n_samples: 1288 n_features: 1850 n_classes: 7 Classes: ['Ariel Sharon' 'Colin Powell' 'Donald Rumsfeld' 'George W Bush' 'Gerhard Schroeder' 'Hugo Chavez' 'Tony Blair']
# Let's look at the data to see what the faces look like:
# show the first three images, one figure each
for i in range(3):
    pl.figure()
    pl.imshow(lfw_people.images[i], cmap=pl.cm.gray)
pl.show()
Here we split the data into a training set and a test set.
# Hold out 25% of the samples for testing; the fixed random_state keeps the
# split the same from run to run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)
Now we compute the PCs using PCA.
# Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
# dataset): unsupervised feature extraction / dimensionality reduction
n_components = 250

print("Extracting the top %d eignefaces from %d faces" % (n_components, X_train.shape[0]))

# Start the time counter (kind of like tic/toc); it must be set before the
# fit so the elapsed time printed below is defined
t0 = time()
# Take the training data only and compute the PCs from it
pca = PCA(n_components=n_components, whiten=True).fit(X_train)
# Print the time it took to compute
print("Done in %0.3fs" % (time() - t0))

# Reshape the PCs to the image format so each one can be shown as a picture
eigenfaces = pca.components_.reshape((n_components, h, w))

# Show the first three eigenfaces, one figure each
for i in range(3):
    pl.figure()
    pl.imshow(eigenfaces[i], cmap=pl.cm.gray)
    pl.title("Eigenface PC- %d" % (i + 1))
    # cbar = pl.colorbar()
    # cbar.solids.set_edgecolor("face")
    # pl.draw()

# print(pca.explained_variance_ )
# print(pca.explained_variance_ratio_)
Extracting the top 250 eignefaces from 966 faces Done in 0.399s
Here we're going to plot the top PCs identified
# One wide figure with the top three eigenfaces side by side
plt.figure(figsize=(9, 3))
for i in range(3):
    plt.subplot(1, 3, i + 1)
    plt.imshow(eigenfaces[i], cmap=plt.cm.gray)
    plt.title("Eigenface PC- %d" % (i + 1))
plt.show()
Now that we have the PCs (the vectors that account for the max variances) we can project the data down to the PCs. In this case they are the eigenfaces from above.
print("Projecting the input data on the eignefaces orthonormal basis")
t0 = time()  # tic: restart the timer so only the projection is measured
X_train_pca = pca.transform(X_train)  # project the training data onto the eigenfaces
X_test_pca = pca.transform(X_test)  # project the test data onto the same eigenfaces
print("Done in %0.3fs" % (time() - t0))  # toc
Projecting the input data on the eignefaces orthonormal basis Done in 0.035s
Now that PCA has extracted the most important features, we can use them to train a classifier. The SVM classifier can then be used to predict whose face is being presented.
print("Fitting the classifier to the training set")
t0 = time()
# These are the parameters we want to optimize. They are passed to
# GridSearchCV, which searches over every C/gamma combination; the fitted
# search object (clf) is then used as the classifier.
param_grid = {
    'C': [1e3, 5e3, 1e4, 5e4, 1e5],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}
# for sklearn version 0.16 or prior, the class_weight parameter value is 'auto'
clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
# Train on the PCA-projected training data, not on the raw pixels
clf = clf.fit(X_train_pca, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)
Fitting the classifier to the training set
/usr/local/lib/python3.6/dist-packages/sklearn/model_selection/ FutureWarning: The default value of cv will change from 3 to 5 in version 0.22. Specify it explicitly to silence this warning. warnings.warn(CV_WARNING, FutureWarning)
done in 39.016s Best estimator found by grid search: SVC(C=1000.0, cache_size=200, class_weight='balanced', coef0=0.0, decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)
# Evaluate the classifier on the held-out, PCA-projected test set
print("Predicting the people names on the testing set")
t0 = time()
y_pred = clf.predict(X_test_pca)
print("done in %0.3fs" % (time() - t0))

# Per-class precision / recall / f1, followed by the confusion matrix
report = classification_report(y_test, y_pred, target_names=target_names)
print(report)
matrix = confusion_matrix(y_test, y_pred, labels=range(n_classes))
print(matrix)
Predicting the people names on the testing set done in 0.125s precision recall f1-score support Ariel Sharon 0.64 0.69 0.67 13 Colin Powell 0.75 0.90 0.82 60 Donald Rumsfeld 0.82 0.67 0.73 27 George W Bush 0.91 0.92 0.91 146 Gerhard Schroeder 0.87 0.80 0.83 25 Hugo Chavez 0.80 0.53 0.64 15 Tony Blair 0.82 0.78 0.80 36 accuracy 0.84 322 macro avg 0.80 0.76 0.77 322 weighted avg 0.84 0.84 0.84 322 [[ 9 1 1 2 0 0 0] [ 1 54 2 2 0 1 0] [ 2 3 18 3 0 0 1] [ 1 6 1 134 1 1 2] [ 0 2 0 1 20 0 2] [ 0 4 0 2 0 8 1] [ 1 2 0 3 2 0 28]]
Now that we've trained and tested the SVM to classify faces, let's visualize the results.
# Create a helper function to look at the pictures
def plot_gallery(images, titles, h, w, n_row=3, n_col=4):
    """Plot a gallery of portraits on an ``n_row`` x ``n_col`` grid.

    Parameters
    ----------
    images : indexable sequence of arrays
        ``images[i]`` is reshaped to ``(h, w)`` before being drawn.
    titles : indexable sequence of str
        ``titles[i]`` is drawn above ``images[i]``.
    h, w : int
        Height and width each image is reshaped to.
    n_row, n_col : int
        Grid dimensions; at most ``n_row * n_col`` pictures are shown.
    """
    pl.figure(figsize=(1.8 * n_col, 2.4 * n_row))
    pl.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
    # Stop early when fewer images/titles than grid cells are supplied,
    # instead of indexing past the end
    n_shown = min(n_row * n_col, len(images), len(titles))
    for i in range(n_shown):
        pl.subplot(n_row, n_col, i + 1)
        pl.imshow(images[i].reshape((h, w)), cmap=pl.cm.gray)
        pl.title(titles[i], size=12)
# plot the result of the prediction on a portion of the test set
def title(y_pred, y_test, target_names, i):
    """Return the caption for sample ``i``: predicted vs. true name.

    Parameters
    ----------
    y_pred, y_test : indexable sequences of int
        Predicted and true class indices; entry ``i`` of each is used.
    target_names : indexable sequence of str
        Maps a class index to the person's full name.
    i : int
        Index of the sample to caption.

    Returns
    -------
    str
        ``'predicted: <name>\\ntrue: <name>'`` where each name is shortened
        to the part after its last space (the whole name if it has none).
    """
    # rsplit(' ', 1)[-1] keeps only the text after the last space
    pred_name = target_names[y_pred[i]].rsplit(' ', 1)[-1]
    true_name = target_names[y_test[i]].rsplit(' ', 1)[-1]
    return 'predicted: %s\ntrue: %s' % (pred_name, true_name)
# Caption every test sample with its predicted and true name
prediction_titles = [
    title(y_pred, y_test, target_names, idx)
    for idx in range(y_pred.shape[0])
]

# Now show the predictions on the test images
plot_gallery(X_test, prediction_titles, h, w)

# Plot the eigenfaces
eigenface_titles = ["Eigneface %d " % k for k in range(eigenfaces.shape[0])]
plot_gallery(eigenfaces, eigenface_titles, h, w)