In [2]:

%run ../../../common_functions/import_all.py

from common_functions.setup_notebook import set_css_style, setup_matplotlib, config_ipython
from common_functions.class_helpers import do_plot_conf_mat

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

config_ipython()
setup_matplotlib()
set_css_style()

Out[2]:

Classification - performance metrics¶

We will use the Iris Dataset to do a little classification with a Random Forest and look at the performance metrics.

Confusion matrix¶

This uses a routine we wrote here for the job of computing it and plotting it

In [6]:

# Load the Iris dataset from sklearn, separating the data matrix and the array of classes
iris = load_iris()
X = iris.data
y = iris.target

# Initiate the classifier (using default parameters)
rf = RandomForestClassifier()

# Splitting the dataset into train and test (70%/30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Fitting model on training set and predict on test set
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Plot the confusion matrix
do_plot_conf_mat(y_test, y_pred)

Out[6]:

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

Out[6]:

array([[18,  0,  0],
       [ 0, 14,  0],
       [ 0,  1, 12]])

Precision, recall and F1-score¶

In [7]:

# sklearn furnishes a report of these metrics for all classes in one go!
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        18
           1       0.93      1.00      0.97        14
           2       1.00      0.92      0.96        13

    accuracy                           0.98        45
   macro avg       0.98      0.97      0.98        45
weighted avg       0.98      0.98      0.98        45

Accuracy¶

In [8]:

# From sklearn
accuracy_score(y_test, y_pred)

Out[8]:

0.9777777777777777