In [3]:

#IPython notebook for tutorial on Scikit-learn random forests
#Created by Ryan Feather Nov 17, 2012 (ryan.feather@gmail.com)
#Downloadable from http://featherconflagration.com/tutorials
#For more information on scikit-learn, visit http://scikit-learn.org/stable/

In [13]:

#set up some utility routines for this example. You can safely ignore these for now
import numpy as np

#Sample a percentage of indexes from y in a manner that preserves the ratio of classes
def stratifiedSample(y,percent):
    labels = np.unique(y)
    sample = []
    for label in labels:
        #find the current label
        currentIndexes = np.nonzero(y==label)[0]
        #randomize the sample
        np.random.shuffle(currentIndexes)
        sampleSize = int(percent*len(currentIndexes))
        #extend the current sample list
        sample += currentIndexes[0:sampleSize].tolist()
    #convert the sample to boolean indexes for convenience
    booleanSample = np.zeros((len(y),),dtype=bool)
    booleanSample[sample] = True
    return booleanSample

In [1]:

#load digits sample data set
from sklearn.datasets import load_digits
digits = load_digits()

In [5]:

digits.data

Out[5]:

array([[  0.,   0.,   5., ...,   0.,   0.,   0.],
       [  0.,   0.,   0., ...,  10.,   0.,   0.],
       [  0.,   0.,   0., ...,  16.,   9.,   0.],
       ..., 
       [  0.,   0.,   1., ...,   6.,   0.,   0.],
       [  0.,   0.,   2., ...,  12.,   0.,   0.],
       [  0.,   0.,  10., ...,  12.,   1.,   0.]])

In [6]:

digits.data.shape

Out[6]:

(1797, 64)

In [8]:

digits.target

Out[8]:

array([0, 1, 2, ..., 8, 9, 8])

In [9]:

digits.target.shape

Out[9]:

(1797,)

In [10]:

#visualize a digit
from  matplotlib import pyplot
pyplot.gray()
pyplot.matshow(digits.images[39]) 
pyplot.show()

In [11]:

#set up test and training sets
fullData = digits.data
fullLabels = digits.target

In [16]:

#sample 25% for a test set
testIndexes = stratifiedSample(fullLabels,0.25)

In [ ]:

testData = fullData[testIndexes,:]
testLabels = fullLabels[testIndexes]

In [ ]:

testData.shape

In [ ]:

trainData = fullData[~testIndexes,:]
trainLabels = fullLabels[~testIndexes]

In [ ]:

trainData.shape

In [17]:

#load the random forest classifier library
from sklearn.ensemble.forest import RandomForestClassifier
#create a random forest
forest = RandomForestClassifier()

In [18]:

forest

Out[18]:

RandomForestClassifier(bootstrap=True, compute_importances=False,
            criterion='gini', max_depth=None, max_features='auto',
            min_density=0.1, min_samples_leaf=1, min_samples_split=1,
            n_estimators=10, n_jobs=1, oob_score=False,
            random_state=<mtrand.RandomState object at 0x7fa36d10a3f0>,
            verbose=0)

In [19]:

#train the random forest on the training set
forest.fit(trainData,trainLabels)

Out[19]:

RandomForestClassifier(bootstrap=True, compute_importances=False,
            criterion='gini', max_depth=None, max_features='auto',
            min_density=0.1, min_samples_leaf=1, min_samples_split=1,
            n_estimators=10, n_jobs=1, oob_score=False,
            random_state=<mtrand.RandomState object at 0x7fa36d10a3f0>,
            verbose=0)

In [20]:

#test the forest on the test set
predictedLabels = forest.predict(testData)

In [21]:

predictedLabels

Out[21]:

array([4, 0, 6, 7, 5, 4, 9, 5, 5, 5, 8, 4, 0, 4, 6, 6, 0, 9, 3, 8, 1, 2, 1,
       7, 4, 6, 1, 4, 5, 3, 8, 4, 2, 8, 7, 8, 6, 5, 8, 8, 7, 2, 2, 3, 7, 4,
       6, 9, 1, 1, 5, 6, 7, 2, 5, 4, 0, 3, 4, 5, 6, 0, 8, 4, 5, 9, 0, 9, 8,
       4, 7, 2, 3, 3, 3, 6, 7, 5, 1, 7, 1, 7, 5, 3, 7, 6, 2, 9, 5, 9, 3, 3,
       4, 5, 8, 0, 5, 6, 9, 0, 8, 8, 7, 5, 2, 7, 2, 0, 3, 6, 9, 2, 0, 3, 2,
       4, 6, 1, 9, 1, 6, 9, 7, 4, 2, 4, 9, 0, 3, 7, 9, 2, 5, 8, 1, 3, 5, 7,
       9, 6, 7, 8, 2, 7, 3, 3, 6, 4, 9, 2, 0, 1, 1, 7, 1, 7, 8, 6, 5, 1, 8,
       9, 0, 8, 9, 2, 4, 0, 8, 4, 3, 2, 3, 6, 9, 0, 9, 5, 0, 0, 7, 7, 1, 1,
       6, 8, 7, 4, 5, 4, 8, 8, 1, 2, 5, 8, 9, 2, 6, 0, 2, 5, 8, 4, 5, 2, 1,
       7, 6, 4, 9, 7, 1, 3, 4, 0, 5, 7, 5, 4, 5, 0, 8, 4, 6, 7, 9, 6, 9, 1,
       8, 0, 9, 5, 5, 9, 8, 1, 2, 2, 0, 9, 5, 0, 2, 8, 6, 0, 1, 3, 1, 5, 4,
       7, 2, 9, 0, 8, 7, 8, 6, 1, 2, 9, 0, 3, 6, 6, 0, 2, 7, 2, 1, 3, 3, 1,
       3, 7, 4, 7, 2, 9, 8, 4, 0, 8, 8, 1, 2, 9, 5, 2, 3, 5, 0, 7, 2, 2, 9,
       0, 1, 6, 3, 6, 5, 5, 3, 3, 8, 3, 5, 6, 9, 1, 8, 0, 3, 4, 7, 1, 8, 2,
       4, 7, 9, 6, 0, 8, 3, 1, 0, 2, 9, 6, 9, 5, 9, 3, 1, 5, 3, 1, 4, 8, 2,
       4, 0, 0, 3, 4, 5, 1, 0, 4, 3, 6, 7, 0, 5, 5, 7, 1, 2, 6, 6, 9, 6, 1,
       3, 7, 4, 4, 8, 8, 2, 3, 7, 0, 7, 3, 9, 5, 4, 1, 7, 8, 0, 1, 6, 3, 3,
       4, 6, 1, 5, 7, 0, 2, 7, 9, 3, 1, 3, 9, 4, 1, 4, 7, 1, 4, 0, 0, 6, 7,
       1, 2, 4, 6, 1, 3, 5, 4, 7, 1, 2, 2, 6, 9, 3, 6, 0, 0, 0, 7, 6, 6, 3,
       1, 9, 1, 6, 1, 1, 0, 7])

In [22]:

#compute the raw accuracy (same as forest.score())
print np.sum(predictedLabels==testLabels)/float(testLabels.shape[0])

0.937078651685

In [25]:

forest.score(testData,testLabels)

Out[25]:

0.93707865168539328

In [30]:

#try to autoselect the best number of trees
#use stratified cv iterator to prevent 
from sklearn.cross_validation import StratifiedKFold
stratifiedIterator = StratifiedKFold(testLabels,k=4)

In [31]:

stratifiedIterator

Out[31]:

sklearn.cross_validation.StratifiedKFold(labels=[4 0 6 7 5 7 9 5 5 9 8 4 0 4 6 6 0 9 5 8 2 2 1 7 4 6 1 4 5 3 8 4 2 8 7 8 6
 5 8 8 7 2 2 3 7 4 6 9 1 1 5 6 7 2 5 4 0 3 4 5 6 0 8 4 5 9 0 9 8 4 7 1 3 3
 3 6 9 5 1 9 1 7 5 3 9 6 2 9 5 8 3 3 4 5 8 0 5 6 9 0 8 8 7 5 2 7 2 0 3 6 9
 2 0 3 2 4 6 1 9 1 6 9 7 4 2 4 9 0 3 7 9 2 5 8 1 3 5 7 9 6 7 8 2 7 3 3 6 4
 9 2 0 1 1 7 1 7 8 6 5 8 8 9 0 8 9 2 4 0 8 4 3 2 3 6 9 0 9 5 0 0 7 7 1 1 6
 8 4 9 5 4 8 8 1 2 5 8 9 2 6 0 2 5 9 4 5 2 1 7 6 4 9 7 1 3 4 0 5 7 5 4 5 0
 8 4 6 7 9 6 9 1 8 0 9 5 5 9 8 1 2 2 0 9 5 0 2 8 6 3 1 3 1 5 4 7 2 9 0 8 7
 8 6 1 2 9 0 3 6 6 0 2 7 2 1 3 9 1 3 7 4 7 2 9 8 4 0 8 8 1 2 9 5 2 3 5 0 7
 2 2 8 0 1 6 3 6 5 5 3 3 8 3 5 6 9 1 8 0 3 4 7 1 8 2 4 7 9 6 0 8 3 1 0 2 2
 6 9 5 9 3 1 5 3 1 4 8 2 4 9 0 3 4 5 1 0 4 5 6 7 0 5 5 7 1 2 6 6 9 6 1 3 7
 4 4 8 8 2 3 7 0 2 3 9 5 8 1 7 7 0 1 6 3 3 4 6 4 5 8 0 2 7 4 3 1 3 9 4 1 4
 7 4 4 0 0 6 7 1 2 4 6 1 3 5 4 7 1 2 2 6 3 3 6 0 0 0 7 6 6 3 1 9 1 6 3 1 0
 7], k=4)

In [32]:

#set up a grid search to select optimum number of trees
from sklearn.grid_search import GridSearchCV
params = {'n_estimators':range(10,101,10)}

In [ ]:

#create a parallel grid search that will search for the best parameter set
gridSearch = GridSearchCV(forest,param_grid=params,n_jobs=-1, cv=stratifiedIterator)

In [ ]:

#perform the search for the best parameters
cvFitForest = gridSearch.fit(trainData,trainLabels)

In [34]:

print gridSearch.grid_scores_

[({'n_estimators': 10}, 0.92577622265122272, array([ 0.95535714,  0.93693694,  0.93693694,  0.87387387])), ({'n_estimators': 20}, 0.95051077863577871, array([ 0.97321429,  0.95495495,  0.94594595,  0.92792793])), ({'n_estimators': 30}, 0.94825852638352637, array([ 0.97321429,  0.95495495,  0.93693694,  0.92792793])), ({'n_estimators': 40}, 0.95501528314028317, array([ 0.97321429,  0.96396396,  0.94594595,  0.93693694])), ({'n_estimators': 50}, 0.95499517374517373, array([ 0.98214286,  0.95495495,  0.94594595,  0.93693694])), ({'n_estimators': 60}, 0.95726753539253551, array([ 0.97321429,  0.96396396,  0.94594595,  0.94594595])), ({'n_estimators': 70}, 0.95503539253539249, array([ 0.96428571,  0.95495495,  0.95495495,  0.94594595])), ({'n_estimators': 80}, 0.95278314028314037, array([ 0.96428571,  0.95495495,  0.94594595,  0.94594595])), ({'n_estimators': 90}, 0.9550353925353926, array([ 0.96428571,  0.96396396,  0.94594595,  0.94594595])), ({'n_estimators': 100}, 0.9550353925353926, array([ 0.96428571,  0.95495495,  0.94594595,  0.95495495]))]

In [35]:

#take a look at the cv scores for one parameter set
gridSearch.grid_scores_[0]

Out[35]:

({'n_estimators': 10},
 0.92577622265122272,
 array([ 0.95535714,  0.93693694,  0.93693694,  0.87387387]))

In [36]:

#extract interesting data
estimator = [gridScore[0]['n_estimators'] for gridScore in gridSearch.grid_scores_]
score = [gridScore[1] for gridScore in gridSearch.grid_scores_]

In [37]:

estimator

Out[37]:

[10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

In [38]:

score

Out[38]:

[0.92577622265122272,
 0.95051077863577871,
 0.94825852638352637,
 0.95501528314028317,
 0.95499517374517373,
 0.95726753539253551,
 0.95503539253539249,
 0.95278314028314037,
 0.9550353925353926,
 0.9550353925353926]

In [39]:

#plot the cv score with respect to parameters
pyplot.plot(estimator,score)
pyplot.show()

In [40]:

#compare to score on the test set
print cvFitForest.score(testData,testLabels)

0.961797752809