%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import sys
sys.path.append('/Users/kaonpark/workspace/github.com/likejazz/kaon-learn')
import kaonlearn
from kaonlearn.plots import plot_decision_regions
from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# create a synthetic dataset of three Gaussian blobs (3-class problem)
X, y = make_blobs(random_state=0)
# split data and labels into a training and a test set (default 75/25 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# instantiate a model and fit it to the training set
logreg = LogisticRegression().fit(X_train, y_train)
# evaluate the model on the held-out test set (mean accuracy)
print("Test set score: {:.2f}".format(logreg.score(X_test, y_test)))
# Output: Test set score: 0.88
# Visualize the learned decision boundaries on the full data,
# then on the training split and the test split separately.
plot_decision_regions(X, y, logreg)
plot_decision_regions(X_train, y_train, logreg)
plot_decision_regions(X_test, y_test, logreg)
# Inspect the held-out samples and their labels (notebook display).
X_test
# Output: (25, 2) float array of test points
y_test
# Output: array of 25 labels drawn from {0, 1, 2}
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
# Cross-validate logreg on the blobs data with the default number of folds.
scores = cross_val_score(logreg, X, y)
print("Cross-validation scores: {}".format(scores))
# Output: Cross-validation scores: [ 0.88235294 0.84848485 0.90909091]
# Same, but with 5 folds explicitly requested.
scores = cross_val_score(logreg, X, y, cv=5)
print("Cross-validation scores: {}".format(scores))
# Output: Cross-validation scores: [ 0.9047619 0.85714286 0.9047619 0.89473684 0.88888889]
from sklearn.model_selection import KFold
# Passing an explicit splitter object gives full control over the CV strategy.
kfold = KFold(n_splits=5)
print("Cross-validation scores:\n{}".format(
    cross_val_score(logreg, X, y, cv=kfold)))
# Output: [ 0.85 0.9 0.95 0.8 0.95]
kfold = KFold(n_splits=3)
print("Cross-validation scores:\n{}".format(
    cross_val_score(logreg, X, y, cv=kfold)))
# Output: [ 0.88235294 0.93939394 0.84848485]
# Shuffle before splitting so folds don't follow any ordering in the data;
# random_state pins the shuffle for reproducibility.
kfold = KFold(n_splits=3, shuffle=True, random_state=0)
print("Cross-validation scores:\n{}".format(
    cross_val_score(logreg, X, y, cv=kfold)))
# Output: [ 0.91176471 0.87878788 0.93939394]
from sklearn.model_selection import ShuffleSplit
# 10 independent random 50/50 train/test splits.
shuffle_split = ShuffleSplit(test_size=.5, train_size=.5, n_splits=10)
scores = cross_val_score(logreg, X, y, cv=shuffle_split)
print("Cross-validation scores:\n{}".format(scores))
# Output: [ 0.84 0.88 0.9 0.86 0.9 0.86 0.86 0.92 0.94 0.9 ]
# Hyperparameter candidates for an RBF-kernel SVC: six logarithmically
# spaced values each for C (regularization) and gamma (kernel width).
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
print("Parameter grid:\n{}".format(param_grid))
# Output: Parameter grid: {'C': [0.001, 0.01, 0.1, 1, 10, 100],
#                          'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
# Exhaustively evaluate every (C, gamma) pair with 5-fold CV.
grid_search = GridSearchCV(SVC(), param_grid, cv=5)
# Re-split so the test set stays untouched by the search.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0)
grid_search.fit(X_train, y_train)
print("Test set score: {:.2f}".format(grid_search.score(X_test, y_test)))
print("Best parameters: {}".format(grid_search.best_params_))
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))
# Output: Test set score: 0.88
#         Best parameters: {'C': 1, 'gamma': 1}
#         Best cross-validation score: 0.95
print("Best estimator:\n{}".format(grid_search.best_estimator_))
# Output: SVC(C=1, gamma=1, kernel='rbf', ...default settings...)
# convert the detailed search results to a DataFrame for inspection
results = pd.DataFrame(grid_search.cv_results_)
# show the first 5 rows
results.head()
# Output: 5 rows x 22 columns — mean/std fit and score times, per-split
# train/test scores, params, and rank_test_score; every C=0.001 row has
# mean_test_score 0.36 (the model predicts only the majority class).
# Reshape the 36 mean test scores into the 6x6 (C, gamma) grid and plot.
scores = np.array(results.mean_test_score).reshape(6, 6)
sns.heatmap(scores, xticklabels=param_grid['gamma'],
            yticklabels=param_grid['C'], annot=True)
# Output: annotated heatmap Axes (rows = C, columns = gamma)
# A list of grids searches conditional parameter spaces: the RBF kernel
# varies both C and gamma, while the linear kernel has no gamma and
# varies only C.
param_grid = [{'kernel': ['rbf'],
               'C': [0.001, 0.01, 0.1, 1, 10, 100],
               'gamma': [0.001, 0.01, 0.1, 1, 10, 100]},
              {'kernel': ['linear'],
               'C': [0.001, 0.01, 0.1, 1, 10, 100]}]
print("List of grids:\n{}".format(param_grid))
# Output: List of grids: [{'kernel': ['rbf'], 'C': [...], 'gamma': [...]},
#                         {'kernel': ['linear'], 'C': [...]}]
# Search over the list of grids (36 rbf combinations + 6 linear = 42 fits
# per fold) with 5-fold CV.
grid_search = GridSearchCV(SVC(), param_grid, cv=5)
grid_search.fit(X_train, y_train)
print("Best parameters: {}".format(grid_search.best_params_))
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))
# Output: Best parameters: {'C': 1, 'gamma': 1, 'kernel': 'rbf'}
#         Best cross-validation score: 0.95
results = pd.DataFrame(grid_search.cv_results_)
# we display the transposed table so that it better fits on the page:
results.T
# Output: 23 rows x 42 columns — one column per parameter combination
# (36 rbf (C, gamma) pairs followed by 6 linear C values). Rows cover
# mean/std fit and score times, mean/per-split train and test scores,
# the param_* columns (param_gamma is NaN for the linear-kernel columns),
# the full params dicts, and rank_test_score. The best rank-1 settings
# correspond to {'C': 1, 'gamma': 1, 'kernel': 'rbf'}.
from sklearn.metrics import confusion_matrix
# Rows = true class, columns = predicted class for the tuned SVC.
confusion = confusion_matrix(y_test, grid_search.predict(X_test))
print("Confusion matrix:\n{}".format(confusion))
# Output: [[ 6  2  1]
#          [ 0  6  0]
#          [ 0  0 10]]
len(y_test)
# Output: 25
# Compare decision boundaries of the tuned SVC and the earlier logreg
# on the same test points.
plot_decision_regions(X_test, y_test, grid_search)
plot_decision_regions(X_test, y_test, logreg)
# Didactic figures from the local kaonlearn package: an annotated
# confusion-matrix illustration and a binary TP/FP/FN/TN diagram.
kaonlearn.plots.plot_confusion_matrix_illustration()
kaonlearn.plots.plot_binary_confusion_matrix()
We call correctly classified samples belonging to the positive class true positives (TP),
and correctly classified samples belonging to the negative class true negatives (TN).
Samples of the negative class that are incorrectly classified as positive are false positives (FP),
and samples of the positive class that are incorrectly classified as negative are false negatives (FN).
These terms are usually abbreviated TP, TN, FP, and FN and lead to the following interpretation of the confusion matrix:
$ Accuracy = \frac{TP+TN}{TP+TN+FP+FN} $
$ Precision = \frac{TP}{TP+FP} $
$ Recall = \frac{TP}{TP+FN} $
$ F_1 = 2\frac{precision*recall}{precision+recall} $
from sklearn.metrics import classification_report
# Per-class precision, recall, f1-score, and support for the tuned SVC.
print(classification_report(y_test, grid_search.predict(X_test)))
# Output: class 0: 1.00/0.67/0.80 (9), class 1: 0.75/1.00/0.86 (6),
#         class 2: 0.91/1.00/0.95 (10); avg/total 0.90/0.88/0.87 (25)
from sklearn.metrics import classification_report
# Same report for the plain logistic regression, for comparison.
print(classification_report(y_test, logreg.predict(X_test)))
# Output: class 0: 0.88/0.78/0.82 (9), class 1: 0.75/1.00/0.86 (6),
#         class 2: 1.00/0.90/0.95 (10); avg/total 0.90/0.88/0.88 (25)
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_digits
# Repeat the evaluation workflow on a larger 10-class problem:
# the handwritten-digits dataset (1797 samples, 64 features).
digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target, random_state=0)
lr = LogisticRegression().fit(X_train, y_train)
pred = lr.predict(X_test)
print("Accuracy: {:.3f}".format(accuracy_score(y_test, pred)))
print("Confusion matrix:\n{}".format(confusion_matrix(y_test, pred)))
# Output: Accuracy: 0.953, followed by a 10x10 confusion matrix whose
# mass sits on the diagonal; most confusions involve digits 1, 3, and 8.
# Heatmap makes the off-diagonal confusions easy to spot.
sns.heatmap(confusion_matrix(y_test, pred), annot=True)
print(classification_report(y_test, pred))
# Output: per-digit precision/recall/f1 (support 37-52 each);
#         avg/total 0.95/0.95/0.95 over 450 test samples.