from sklearn.datasets import fetch_lfw_people
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn import metrics
from sklearn.decomposition import RandomizedPCA
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
%pylab inline --no-import-all
Populating the interactive namespace from numpy and matplotlib
lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)
X = lfw_people.data
y = lfw_people.target
X, y = shuffle(X, y)
train_X, test_X, train_y, test_y = train_test_split(X, y,
                                                    test_size=0.2)
print train_X.shape, test_X.shape
print np.unique(y)
(1030, 1850) (258, 1850)
[0 1 2 3 4 5 6]
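*Each row of `X` is a flattened grayscale face: at `resize=0.4` the LFW crops come out at 50×37 pixels, which matches the 1850 columns above. A quick sketch (using only variables already defined) to eyeball one face and its label:*
h, w = lfw_people.images.shape[1:]  # (50, 37) at resize=0.4
plt.imshow(train_X[0].reshape(h, w), cmap=plt.cm.gray)
plt.title(lfw_people.target_names[train_y[0]])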
## PCA WHITEN OR NOT (ON AS MUCH DATA AS POSSIBLE)
pca_whiten = RandomizedPCA(n_components=150, whiten=True)
pca = RandomizedPCA(n_components=150, whiten=False)
white_train_X = pca_whiten.fit_transform(train_X)
white_test_X = pca_whiten.transform(test_X)
pca_train_X = pca.fit_transform(train_X)
pca_test_X = pca.transform(test_X)
datasets = {
    'raw': [(train_X, train_y), (test_X, test_y)],
    'pca': [(pca_train_X, train_y), (pca_test_X, test_y)],
    'white': [(white_train_X, train_y), (white_test_X, test_y)]
}
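*A quick sanity check on what whitening changes (a sketch added here, not part of the original run): whitened components should come out with roughly unit variance, while plain PCA components keep the decaying eigenvalue scale, spanning orders of magnitude, which is a hard setting for an RBF kernel with a single `gamma`:*
print(white_train_X.var(axis=0)[:5])  # each whitened component has variance ~1.0
print(pca_train_X.var(axis=0)[:5])    # unwhitened variances are large and decay fast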
trees = ExtraTreesClassifier()
svc = SVC(kernel='rbf')
trees_params = {
    'n_estimators': [100, 200, 1000],
    'max_features': [0.05, 0.2, 0.5, 1.0]
}
svc_params = {
    'C': [5e2, 1e3, 5e3, 1e4, 5e4],
    'gamma': [1e-4, 5e-4, 1e-3, 5e-3, 1e-2, 1e-1]
}
models = {
    'trees': (trees, trees_params),
    'svc': (svc, svc_params)
}
from itertools import product

def benchmark(candidate):
    # candidate pairs one entry of `datasets` with one entry of `models`
    (data_name, data), (model_name, model_info) = candidate
    train_data, test_data = data
    model, param_grid = model_info
    # 3-fold grid search on the training split, then score the best model on the test split
    gs = GridSearchCV(model, param_grid, cv=3, n_jobs=-1)
    gs.fit(*train_data)
    cv_score = gs.best_score_
    test_score = gs.best_estimator_.score(*test_data)
    return (model_name, data_name, cv_score, test_score, gs.best_params_)
candidates = product(datasets.items(), models.items())
results = []
for candidate in candidates:
    print '============================='
    results.append(benchmark(candidate))
    print results[-1]
=============================
('svc', 'raw', 0.40194174757281553, 0.44961240310077522, {'C': 500.0, 'gamma': 0.0001})
=============================
('trees', 'raw', 0.66407766990291262, 0.68217054263565891, {'max_features': 0.2, 'n_estimators': 1000})
=============================
('svc', 'white', 0.80679611650485439, 0.85658914728682167, {'C': 500.0, 'gamma': 0.005})
=============================
('trees', 'white', 0.62038834951456312, 0.68992248062015504, {'max_features': 1.0, 'n_estimators': 1000})
=============================
('svc', 'pca', 0.40194174757281553, 0.44961240310077522, {'C': 500.0, 'gamma': 0.0001})
=============================
('trees', 'pca', 0.60485436893203881, 0.70930232558139539, {'max_features': 1.0, 'n_estimators': 1000})
*Put simply, SVM + PCA (dimensionality reduction) + whitening is very good for face recognition: the whitened SVM reaches about 86% test accuracy, while the same SVM on raw pixels or unwhitened PCA features stays around 45%.*
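*To reuse the winner outside the grid search, refit it directly with the best parameters reported above (`C=500.0`, `gamma=0.005`, read off the run's output):*
best_svc = SVC(kernel='rbf', C=500.0, gamma=0.005)
best_svc.fit(white_train_X, train_y)
print(best_svc.score(white_test_X, test_y))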
## HOW TO BOOST the performance of random trees - one way could be
## to use more advanced features such as sparse filtering.
## BUT why does an SVM on whitened PCA features have so much higher
## accuracy than random trees?
from sklearn.cross_validation import cross_val_score
super_trees = ExtraTreesClassifier(n_estimators=300, max_depth=None,
                                   max_features=0.5, n_jobs=-1,
                                   bootstrap=False)
print cross_val_score(super_trees, train_X, train_y, cv=3)
[ 0.67151163 0.66472303 0.67638484]
## Is it because of the one-vs-one (SVM) vs. one-vs-rest (trees) difference?
## IT SEEMS NOT!!
from sklearn.multiclass import OneVsOneClassifier
super_trees = OneVsOneClassifier(ExtraTreesClassifier(n_estimators=1000, max_depth=None,
                                                      max_features=0.5, n_jobs=1,
                                                      bootstrap=False),
                                 n_jobs=-1)
print cross_val_score(super_trees, train_X, train_y, cv=3)
[ 0.66569767 0.67346939 0.70553936]
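*The converse check (a sketch, not in the original run): force the RBF SVM into explicit one-vs-rest on the whitened features. If the multiclass scheme were the explanation, its score should drop well below the grid-search result above:*
from sklearn.multiclass import OneVsRestClassifier
ovr_svc = OneVsRestClassifier(SVC(kernel='rbf', C=500.0, gamma=0.005), n_jobs=-1)
print(cross_val_score(ovr_svc, white_train_X, train_y, cv=3))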
## MAKE IT BETTER BY USING TREES FOR FEATURE SELECTION, THEN AN SVM
trees_selector = ExtraTreesClassifier(n_estimators=1000, n_jobs=-1, max_features=1.0)
trees_selector.fit(white_train_X, train_y)
ExtraTreesClassifier(bootstrap=False, compute_importances=None, criterion='gini', max_depth=None, max_features=1.0, min_density=None, min_samples_leaf=1, min_samples_split=2, n_estimators=1000, n_jobs=-1, oob_score=False, random_state=None, verbose=0)
trees_selector.estimators_[0].feature_importances_  # importances from a single tree
# Average the importances over all 1000 trees (equivalent to trees_selector.feature_importances_)
feature_importances = np.mean(np.asarray([t.feature_importances_ for t in trees_selector.estimators_]), axis=0)
plt.figure(figsize=(32, 8))
plt.bar(np.arange(feature_importances.shape[0]), feature_importances)
<Container object of 150 artists>
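*The cell above stops at plotting the importances and never actually feeds the selected features to an SVM. A minimal sketch of the missing step, assuming a simple keep-above-mean-importance rule (the threshold is an illustrative choice, not from the original run):*
mask = feature_importances > feature_importances.mean()  # hypothetical threshold
svc_sel = SVC(kernel='rbf', C=500.0, gamma=0.005)  # best params from the grid search above
svc_sel.fit(white_train_X[:, mask], train_y)
print(svc_sel.score(white_test_X[:, mask], test_y))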