#!/usr/bin/env python
# coding: utf-8
# Copyright by Pierian Data Inc.
# For more information, visit us at www.pieriandata.com
# # Random Forest - Classification
# ## The Data
#
# We will be using the same dataset throughout our discussions of classification with tree-based methods (Decision Trees, Random Forests, and Gradient Boosted Trees) in order to compare performance metrics across these related models.
#
# We will work with the "Palmer Penguins" dataset, as it is simple enough to help us fully understand how changing hyperparameters can change classification results.
#
# Data were collected and made available by Dr. Kristen Gorman and the Palmer Station, Antarctica LTER, a member of the Long Term Ecological Research Network.
#
# Gorman KB, Williams TD, Fraser WR (2014) Ecological Sexual Dimorphism and Environmental Variability within a Community of Antarctic Penguins (Genus Pygoscelis). PLoS ONE 9(3): e90081. doi:10.1371/journal.pone.0090081
#
# Summary:
# The data folder contains two CSV files. For intro courses/examples, you probably want to use the first one (penguins_size.csv).
#
# * penguins_size.csv: Simplified data from original penguin data sets. Contains variables:
#
# * species: penguin species (Chinstrap, Adélie, or Gentoo)
# * culmen_length_mm: culmen length (mm)
# * culmen_depth_mm: culmen depth (mm)
# * flipper_length_mm: flipper length (mm)
# * body_mass_g: body mass (g)
# * island: island name (Dream, Torgersen, or Biscoe) in the Palmer Archipelago (Antarctica)
# * sex: penguin sex
#
# * (Not used) penguins_lter.csv: Original combined data for 3 penguin species
#
# Note: The culmen is "the upper ridge of a bird's beak"
#
# **Our goal is to create a model that can predict the species of a penguin based on its physical attributes. We can then use that model to help researchers classify penguins in the field, instead of needing an experienced biologist.**
# ## Imports
# In[77]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# In[78]:
df = pd.read_csv("../DATA/penguins_size.csv")
# In[79]:
df = df.dropna()
df.head()
# ## Train | Test Split
# In[87]:
X = pd.get_dummies(df.drop('species',axis=1),drop_first=True)
y = df['species']
# In[88]:
from sklearn.model_selection import train_test_split
# In[89]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
# # Random Forest Classification
# In[90]:
from sklearn.ensemble import RandomForestClassifier
# In[91]:
help(RandomForestClassifier)
# In[98]:
# Use 10 random trees; max_features='sqrt' matches the old 'auto' default for classification
model = RandomForestClassifier(n_estimators=10,max_features='sqrt',random_state=101)
# In[99]:
model.fit(X_train,y_train)
# In[100]:
preds = model.predict(X_test)
# ## Evaluation
# In[101]:
# Note: plot_confusion_matrix was removed in newer scikit-learn versions; use ConfusionMatrixDisplay instead
from sklearn.metrics import confusion_matrix,classification_report,ConfusionMatrixDisplay,accuracy_score
# In[102]:
confusion_matrix(y_test,preds)
# In[103]:
ConfusionMatrixDisplay.from_estimator(model,X_test,y_test)
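# As a quick sketch using the already-imported classification_report, we can also print per-class precision, recall, and f1 for the same test predictions:
# In[ ]:
print(classification_report(y_test,preds))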
# ## Feature Importance
#
# Very useful attribute of the trained model!
# In[52]:
model.feature_importances_
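# As a quick sketch, pairing the raw importances with the (dummy-encoded) feature names and sorting them makes this attribute much easier to read:
# In[ ]:
# Sketch: map importances to the column names of X and sort for readability
pd.Series(model.feature_importances_,index=X.columns).sort_values().plot(kind='barh')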
# ## Choosing correct number of trees
# Let's explore if continually adding more trees improves performance...
# In[53]:
test_error = []
for n in range(1,40):
    # Use n random trees
    model = RandomForestClassifier(n_estimators=n,max_features='sqrt')
    model.fit(X_train,y_train)
    test_preds = model.predict(X_test)
    test_error.append(1-accuracy_score(y_test,test_preds))
# In[54]:
plt.plot(range(1,40),test_error,label='Test Error')
plt.legend()
# Clearly there are diminishing returns; on such a small dataset, we've pretty much extracted all the information we can after about 5 trees.
# # Random Forest - HyperParameter Exploration
# https://archive.ics.uci.edu/ml/datasets/banknote+authentication
# In[55]:
df = pd.read_csv("../DATA/data_banknote_authentication.csv")
# In[56]:
df.head()
# In[57]:
sns.pairplot(df,hue='Class')
# In[58]:
X = df.drop("Class",axis=1)
# In[59]:
y = df["Class"]
# In[60]:
from sklearn.model_selection import train_test_split
# In[61]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=101)
# In[62]:
from sklearn.model_selection import GridSearchCV
# In[63]:
n_estimators=[64,100,128,200]
max_features= [2,3,4]
bootstrap = [True,False]
oob_score = [True,False]
# In[64]:
param_grid = {'n_estimators':n_estimators,
              'max_features':max_features,
              'bootstrap':bootstrap,
              'oob_score':oob_score}  # Note: oob_score only makes sense when bootstrap=True!
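# As a hedged alternative sketch (param_grid_alt is only illustrative and is not used in the search below), passing a list of dicts to GridSearchCV keeps the invalid bootstrap=False / oob_score=True combination out of the search space entirely:
# In[ ]:
# Sketch: each dict is expanded separately, so oob_score=True is only ever paired with bootstrap=True
param_grid_alt = [{'n_estimators':n_estimators,'max_features':max_features,
                   'bootstrap':[True],'oob_score':[True,False]},
                  {'n_estimators':n_estimators,'max_features':max_features,
                   'bootstrap':[False],'oob_score':[False]}]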
# In[65]:
rfc = RandomForestClassifier()
grid = GridSearchCV(rfc,param_grid)
# In[66]:
grid.fit(X_train,y_train)
# In[67]:
grid.best_params_
# In[68]:
predictions = grid.predict(X_test)
# In[69]:
print(classification_report(y_test,predictions))
# In[70]:
ConfusionMatrixDisplay.from_estimator(grid,X_test,y_test)
# In[71]:
# No underscore, reports back original oob_score parameter
grid.best_estimator_.oob_score
# In[72]:
# With underscore, reports back fitted attribute of oob_score
grid.best_estimator_.oob_score_
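# As a related sketch (assuming the same X_train/y_train as above), a single forest fit with oob_score=True exposes a built-in out-of-bag accuracy estimate through the fitted oob_score_ attribute, without touching the test set:
# In[ ]:
# Sketch: each training sample is scored only by the trees that did not see it in their bootstrap sample
rfc_oob = RandomForestClassifier(n_estimators=128,bootstrap=True,oob_score=True,random_state=101)
rfc_oob.fit(X_train,y_train)
rfc_oob.oob_score_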
# ## Understanding Number of Estimators (Trees)
#
# Let's plot the test error vs. the number of estimators.
# In[73]:
from sklearn.metrics import accuracy_score
# In[74]:
errors = []
misclassifications = []
for n in range(1,64):
    rfc = RandomForestClassifier(n_estimators=n,bootstrap=True,max_features=2)
    rfc.fit(X_train,y_train)
    preds = rfc.predict(X_test)
    err = 1 - accuracy_score(y_test,preds)
    n_missed = np.sum(preds != y_test)  # watch the video to understand this line!!
    errors.append(err)
    misclassifications.append(n_missed)
# In[75]:
plt.plot(range(1,64),errors)
# In[76]:
plt.plot(range(1,64),misclassifications)
# In[ ]: