#!/usr/bin/env python
# coding: utf-8

# ___
#
# Copyright by Pierian Data Inc.
#
# For more information, visit us at www.pieriandata.com
# ___
# # Random Forest - Classification

# ## The Data
#
# We will be using the same dataset throughout our discussions on classification with tree methods (Decision Trees, Random Forests, and Gradient Boosted Trees) in order to compare performance metrics across these related models.
#
# We will work with the "Palmer Penguins" dataset, as it is simple enough to help us fully understand how changing hyperparameters can change classification results.
#
# Data were collected and made available by Dr. Kristen Gorman and the Palmer Station, Antarctica LTER, a member of the Long Term Ecological Research Network.
#
# Gorman KB, Williams TD, Fraser WR (2014) Ecological Sexual Dimorphism and Environmental Variability within a Community of Antarctic Penguins (Genus Pygoscelis). PLoS ONE 9(3): e90081. doi:10.1371/journal.pone.0090081
#
# Summary:
# The data folder contains two CSV files. For intro courses/examples, you probably want to use the first one (penguins_size.csv).
#
# * penguins_size.csv: Simplified data from the original penguin data sets. Contains variables:
#
#     * species: penguin species (Chinstrap, Adélie, or Gentoo)
#     * culmen_length_mm: culmen length (mm)
#     * culmen_depth_mm: culmen depth (mm)
#     * flipper_length_mm: flipper length (mm)
#     * body_mass_g: body mass (g)
#     * island: island name (Dream, Torgersen, or Biscoe) in the Palmer Archipelago (Antarctica)
#     * sex: penguin sex
#
# * (Not used) penguins_lter.csv: Original combined data for 3 penguin species
#
# Note: The culmen is "the upper ridge of a bird's beak".
#
# **Our goal is to create a model that can predict the species of a penguin based on its physical attributes. Researchers could then use that model to classify penguins in the field, instead of needing an experienced biologist.**

# ## Imports

# In[77]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# In[78]:

df = pd.read_csv("../DATA/penguins_size.csv")

# In[79]:

df = df.dropna()
df.head()

# ## Train | Test Split

# In[87]:

X = pd.get_dummies(df.drop('species', axis=1), drop_first=True)
y = df['species']

# In[88]:

from sklearn.model_selection import train_test_split

# In[89]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

# # Random Forest Classification

# In[90]:

from sklearn.ensemble import RandomForestClassifier

# In[91]:

help(RandomForestClassifier)

# In[98]:

# Use 10 random trees; max_features='sqrt' is the current spelling of the old 'auto' default for classification
model = RandomForestClassifier(n_estimators=10, max_features='sqrt', random_state=101)

# In[99]:

model.fit(X_train, y_train)

# In[100]:

preds = model.predict(X_test)

# ## Evaluation

# In[101]:

from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay, accuracy_score

# In[102]:

confusion_matrix(y_test, preds)

# In[103]:

# ConfusionMatrixDisplay replaces the older plot_confusion_matrix, which was removed from scikit-learn
ConfusionMatrixDisplay.from_estimator(model, X_test, y_test)

# ## Feature Importance
#
# A very useful attribute of the trained model! (A small sketch at the end of this section pairs these values with their column names.)

# In[52]:

model.feature_importances_

# ## Choosing the Correct Number of Trees

# Let's explore whether continually adding more trees improves performance...

# In[53]:

test_error = []

for n in range(1, 40):
    # Use n random trees
    model = RandomForestClassifier(n_estimators=n, max_features='sqrt')
    model.fit(X_train, y_train)
    test_preds = model.predict(X_test)
    test_error.append(1 - accuracy_score(y_test, test_preds))

# In[54]:

plt.plot(range(1, 40), test_error, label='Test Error')
plt.legend()

# Clearly there are diminishing returns; on such a small dataset we have extracted essentially all the information we can after about 5 trees.
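# The raw feature_importances_ array shown earlier is just numbers in column order, which is hard to read. The cell below is a minimal sketch (not part of the original lesson) that assumes the X and model variables defined above; it pairs each importance with its dummy-encoded column name and plots them as a sorted bar chart.

# In[ ]:

# Sketch: label the importances with the dummy-encoded feature names and sort them
importances = pd.Series(model.feature_importances_, index=X.columns).sort_values()
importances.plot(kind='barh')
plt.xlabel('Feature Importance')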
# # Random Forest - HyperParameter Exploration

# https://archive.ics.uci.edu/ml/datasets/banknote+authentication

# In[55]:

df = pd.read_csv("../DATA/data_banknote_authentication.csv")

# In[56]:

df.head()

# In[57]:

sns.pairplot(df, hue='Class')

# In[58]:

X = df.drop("Class", axis=1)

# In[59]:

y = df["Class"]

# In[60]:

from sklearn.model_selection import train_test_split

# In[61]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=101)

# In[62]:

from sklearn.model_selection import GridSearchCV

# In[63]:

n_estimators = [64, 100, 128, 200]
max_features = [2, 3, 4]
bootstrap = [True, False]
oob_score = [True, False]

# In[64]:

# Note: oob_score=True only makes sense when bootstrap=True;
# combinations with bootstrap=False and oob_score=True will fail to fit during the search.
param_grid = {'n_estimators': n_estimators,
              'max_features': max_features,
              'bootstrap': bootstrap,
              'oob_score': oob_score}

# In[65]:

rfc = RandomForestClassifier()
grid = GridSearchCV(rfc, param_grid)

# In[66]:

grid.fit(X_train, y_train)

# In[67]:

grid.best_params_

# In[68]:

predictions = grid.predict(X_test)

# In[69]:

print(classification_report(y_test, predictions))

# In[70]:

ConfusionMatrixDisplay.from_estimator(grid, X_test, y_test)

# In[71]:

# No underscore: reports back the original oob_score parameter
grid.best_estimator_.oob_score

# In[72]:

# With underscore: reports back the fitted oob_score_ attribute
# (A sketch at the end of this notebook compares the OOB estimate against hold-out accuracy.)
grid.best_estimator_.oob_score_

# ## Understanding Number of Estimators (Trees)
#
# Let's plot out error vs. number of estimators.

# In[73]:

from sklearn.metrics import accuracy_score

# In[74]:

errors = []
misclassifications = []

for n in range(1, 64):
    rfc = RandomForestClassifier(n_estimators=n, bootstrap=True, max_features=2)
    rfc.fit(X_train, y_train)
    preds = rfc.predict(X_test)

    err = 1 - accuracy_score(y_test, preds)
    n_missed = np.sum(preds != y_test)  # watch the video to understand this line!!

    errors.append(err)
    misclassifications.append(n_missed)

# In[75]:

plt.plot(range(1, 64), errors)

# In[76]:

plt.plot(range(1, 64), misclassifications)
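# As promised above: the oob_score cells read back the out-of-bag estimate from the grid's best estimator. As a rough sanity check, the sketch below (the n_estimators=128 / max_features=2 values are illustrative choices, not the tuned result) fits a single forest with oob_score=True on the banknote training split and compares its out-of-bag estimate against the hold-out test accuracy; the two numbers should usually land close together.

# In[ ]:

# Sketch: OOB score as a built-in validation estimate vs. the hold-out test accuracy
oob_model = RandomForestClassifier(n_estimators=128, max_features=2,
                                   bootstrap=True, oob_score=True, random_state=101)
oob_model.fit(X_train, y_train)

print("OOB score (from bootstrap leftovers):", oob_model.oob_score_)
print("Hold-out test accuracy:", accuracy_score(y_test, oob_model.predict(X_test)))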