#IPython notebook for tutorial on Scikit-learn random forests
#Created by Ryan Feather Nov 17, 2012 (ryan.feather@gmail.com)
#Downloadable from http://featherconflagration.com/tutorials
#For more information on scikit-learn, visit http://scikit-learn.org/stable/
#set up some utility routines for this example. You can safely ignore these for now
import numpy as np
#Sample a percentage of indexes from y in a manner that preserves the ratio of classes
def stratifiedSample(y,percent):
labels = np.unique(y)
sample = []
for label in labels:
#find the current label
currentIndexes = np.nonzero(y==label)[0]
#randomize the sample
np.random.shuffle(currentIndexes)
sampleSize = int(percent*len(currentIndexes))
#extend the current sample list
sample += currentIndexes[0:sampleSize].tolist()
#convert the sample to boolean indexes for convenience
booleanSample = np.zeros((len(y),),dtype=bool)
booleanSample[sample] = True
return booleanSample
#load digits sample data set
from sklearn.datasets import load_digits
digits = load_digits()
digits.data
array([[ 0., 0., 5., ..., 0., 0., 0.], [ 0., 0., 0., ..., 10., 0., 0.], [ 0., 0., 0., ..., 16., 9., 0.], ..., [ 0., 0., 1., ..., 6., 0., 0.], [ 0., 0., 2., ..., 12., 0., 0.], [ 0., 0., 10., ..., 12., 1., 0.]])
digits.data.shape
(1797, 64)
digits.target
array([0, 1, 2, ..., 8, 9, 8])
digits.target.shape
(1797,)
#visualize a digit
from matplotlib import pyplot
pyplot.gray()
pyplot.matshow(digits.images[39])
pyplot.show()
#set up test and training sets
fullData = digits.data
fullLabels = digits.target
#sample 25% for a test set
testIndexes = stratifiedSample(fullLabels,0.25)
testData = fullData[testIndexes,:]
testLabels = fullLabels[testIndexes]
testData.shape
trainData = fullData[~testIndexes,:]
trainLabels = fullLabels[~testIndexes]
trainData.shape
#load the random forest classifier library
from sklearn.ensemble.forest import RandomForestClassifier
#create a random forest
forest = RandomForestClassifier()
forest
RandomForestClassifier(bootstrap=True, compute_importances=False, criterion='gini', max_depth=None, max_features='auto', min_density=0.1, min_samples_leaf=1, min_samples_split=1, n_estimators=10, n_jobs=1, oob_score=False, random_state=<mtrand.RandomState object at 0x7fa36d10a3f0>, verbose=0)
#train the random forest on the training set
forest.fit(trainData,trainLabels)
RandomForestClassifier(bootstrap=True, compute_importances=False, criterion='gini', max_depth=None, max_features='auto', min_density=0.1, min_samples_leaf=1, min_samples_split=1, n_estimators=10, n_jobs=1, oob_score=False, random_state=<mtrand.RandomState object at 0x7fa36d10a3f0>, verbose=0)
#test the forest on the test set
predictedLabels = forest.predict(testData)
predictedLabels
array([4, 0, 6, 7, 5, 4, 9, 5, 5, 5, 8, 4, 0, 4, 6, 6, 0, 9, 3, 8, 1, 2, 1, 7, 4, 6, 1, 4, 5, 3, 8, 4, 2, 8, 7, 8, 6, 5, 8, 8, 7, 2, 2, 3, 7, 4, 6, 9, 1, 1, 5, 6, 7, 2, 5, 4, 0, 3, 4, 5, 6, 0, 8, 4, 5, 9, 0, 9, 8, 4, 7, 2, 3, 3, 3, 6, 7, 5, 1, 7, 1, 7, 5, 3, 7, 6, 2, 9, 5, 9, 3, 3, 4, 5, 8, 0, 5, 6, 9, 0, 8, 8, 7, 5, 2, 7, 2, 0, 3, 6, 9, 2, 0, 3, 2, 4, 6, 1, 9, 1, 6, 9, 7, 4, 2, 4, 9, 0, 3, 7, 9, 2, 5, 8, 1, 3, 5, 7, 9, 6, 7, 8, 2, 7, 3, 3, 6, 4, 9, 2, 0, 1, 1, 7, 1, 7, 8, 6, 5, 1, 8, 9, 0, 8, 9, 2, 4, 0, 8, 4, 3, 2, 3, 6, 9, 0, 9, 5, 0, 0, 7, 7, 1, 1, 6, 8, 7, 4, 5, 4, 8, 8, 1, 2, 5, 8, 9, 2, 6, 0, 2, 5, 8, 4, 5, 2, 1, 7, 6, 4, 9, 7, 1, 3, 4, 0, 5, 7, 5, 4, 5, 0, 8, 4, 6, 7, 9, 6, 9, 1, 8, 0, 9, 5, 5, 9, 8, 1, 2, 2, 0, 9, 5, 0, 2, 8, 6, 0, 1, 3, 1, 5, 4, 7, 2, 9, 0, 8, 7, 8, 6, 1, 2, 9, 0, 3, 6, 6, 0, 2, 7, 2, 1, 3, 3, 1, 3, 7, 4, 7, 2, 9, 8, 4, 0, 8, 8, 1, 2, 9, 5, 2, 3, 5, 0, 7, 2, 2, 9, 0, 1, 6, 3, 6, 5, 5, 3, 3, 8, 3, 5, 6, 9, 1, 8, 0, 3, 4, 7, 1, 8, 2, 4, 7, 9, 6, 0, 8, 3, 1, 0, 2, 9, 6, 9, 5, 9, 3, 1, 5, 3, 1, 4, 8, 2, 4, 0, 0, 3, 4, 5, 1, 0, 4, 3, 6, 7, 0, 5, 5, 7, 1, 2, 6, 6, 9, 6, 1, 3, 7, 4, 4, 8, 8, 2, 3, 7, 0, 7, 3, 9, 5, 4, 1, 7, 8, 0, 1, 6, 3, 3, 4, 6, 1, 5, 7, 0, 2, 7, 9, 3, 1, 3, 9, 4, 1, 4, 7, 1, 4, 0, 0, 6, 7, 1, 2, 4, 6, 1, 3, 5, 4, 7, 1, 2, 2, 6, 9, 3, 6, 0, 0, 0, 7, 6, 6, 3, 1, 9, 1, 6, 1, 1, 0, 7])
#compute the raw accuracy (same as forest.score())
print np.sum(predictedLabels==testLabels)/float(testLabels.shape[0])
0.937078651685
forest.score(testData,testLabels)
0.93707865168539328
#try to autoselect the best number of trees
#use stratified cv iterator to prevent
from sklearn.cross_validation import StratifiedKFold
stratifiedIterator = StratifiedKFold(testLabels,k=4)
stratifiedIterator
sklearn.cross_validation.StratifiedKFold(labels=[4 0 6 7 5 7 9 5 5 9 8 4 0 4 6 6 0 9 5 8 2 2 1 7 4 6 1 4 5 3 8 4 2 8 7 8 6 5 8 8 7 2 2 3 7 4 6 9 1 1 5 6 7 2 5 4 0 3 4 5 6 0 8 4 5 9 0 9 8 4 7 1 3 3 3 6 9 5 1 9 1 7 5 3 9 6 2 9 5 8 3 3 4 5 8 0 5 6 9 0 8 8 7 5 2 7 2 0 3 6 9 2 0 3 2 4 6 1 9 1 6 9 7 4 2 4 9 0 3 7 9 2 5 8 1 3 5 7 9 6 7 8 2 7 3 3 6 4 9 2 0 1 1 7 1 7 8 6 5 8 8 9 0 8 9 2 4 0 8 4 3 2 3 6 9 0 9 5 0 0 7 7 1 1 6 8 4 9 5 4 8 8 1 2 5 8 9 2 6 0 2 5 9 4 5 2 1 7 6 4 9 7 1 3 4 0 5 7 5 4 5 0 8 4 6 7 9 6 9 1 8 0 9 5 5 9 8 1 2 2 0 9 5 0 2 8 6 3 1 3 1 5 4 7 2 9 0 8 7 8 6 1 2 9 0 3 6 6 0 2 7 2 1 3 9 1 3 7 4 7 2 9 8 4 0 8 8 1 2 9 5 2 3 5 0 7 2 2 8 0 1 6 3 6 5 5 3 3 8 3 5 6 9 1 8 0 3 4 7 1 8 2 4 7 9 6 0 8 3 1 0 2 2 6 9 5 9 3 1 5 3 1 4 8 2 4 9 0 3 4 5 1 0 4 5 6 7 0 5 5 7 1 2 6 6 9 6 1 3 7 4 4 8 8 2 3 7 0 2 3 9 5 8 1 7 7 0 1 6 3 3 4 6 4 5 8 0 2 7 4 3 1 3 9 4 1 4 7 4 4 0 0 6 7 1 2 4 6 1 3 5 4 7 1 2 2 6 3 3 6 0 0 0 7 6 6 3 1 9 1 6 3 1 0 7], k=4)
#set up a grid search to select optimum number of trees
from sklearn.grid_search import GridSearchCV
params = {'n_estimators':range(10,101,10)}
#create a parallel grid search that will search for the best parameter set
gridSearch = GridSearchCV(forest,param_grid=params,n_jobs=-1, cv=stratifiedIterator)
#perform the search for the best parameters
cvFitForest = gridSearch.fit(trainData,trainLabels)
print gridSearch.grid_scores_
[({'n_estimators': 10}, 0.92577622265122272, array([ 0.95535714, 0.93693694, 0.93693694, 0.87387387])), ({'n_estimators': 20}, 0.95051077863577871, array([ 0.97321429, 0.95495495, 0.94594595, 0.92792793])), ({'n_estimators': 30}, 0.94825852638352637, array([ 0.97321429, 0.95495495, 0.93693694, 0.92792793])), ({'n_estimators': 40}, 0.95501528314028317, array([ 0.97321429, 0.96396396, 0.94594595, 0.93693694])), ({'n_estimators': 50}, 0.95499517374517373, array([ 0.98214286, 0.95495495, 0.94594595, 0.93693694])), ({'n_estimators': 60}, 0.95726753539253551, array([ 0.97321429, 0.96396396, 0.94594595, 0.94594595])), ({'n_estimators': 70}, 0.95503539253539249, array([ 0.96428571, 0.95495495, 0.95495495, 0.94594595])), ({'n_estimators': 80}, 0.95278314028314037, array([ 0.96428571, 0.95495495, 0.94594595, 0.94594595])), ({'n_estimators': 90}, 0.9550353925353926, array([ 0.96428571, 0.96396396, 0.94594595, 0.94594595])), ({'n_estimators': 100}, 0.9550353925353926, array([ 0.96428571, 0.95495495, 0.94594595, 0.95495495]))]
#take a look at the cv scores for one parameter set
gridSearch.grid_scores_[0]
({'n_estimators': 10}, 0.92577622265122272, array([ 0.95535714, 0.93693694, 0.93693694, 0.87387387]))
#extract interesting data
estimator = [gridScore[0]['n_estimators'] for gridScore in gridSearch.grid_scores_]
score = [gridScore[1] for gridScore in gridSearch.grid_scores_]
estimator
[10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
score
[0.92577622265122272, 0.95051077863577871, 0.94825852638352637, 0.95501528314028317, 0.95499517374517373, 0.95726753539253551, 0.95503539253539249, 0.95278314028314037, 0.9550353925353926, 0.9550353925353926]
#plot the cv score with respect to parameters
pyplot.plot(estimator,score)
pyplot.show()
#compare to score on the test set
print cvFitForest.score(testData,testLabels)
0.961797752809