For an explanation of nested cross-validation, please see:
Note that the optional watermark extension is a small IPython notebook plugin that I developed to make the code reproducible. You can just skip the following line(s).
%load_ext watermark
%watermark -a 'Sebastian Raschka' -u -d -v -p numpy,pandas,matplotlib,scikit-learn
Sebastian Raschka
Last updated: 11/30/2015

CPython 3.5.0
IPython 4.0.0

numpy 1.10.1
pandas 0.17.1
matplotlib 1.5.0
scikit-learn 0.17
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.datasets import load_iris
from sklearn.cross_validation import train_test_split
# load and split data
iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
# pipeline setup
cls = SVC(C=10.0, kernel='rbf', gamma=0.1, decision_function_shape='ovr')
kernel_svm = Pipeline([('std', StandardScaler()),
                       ('svc', cls)])
# gridsearch setup
param_grid = [
    {'svc__C': [1, 10, 100, 1000],
     'svc__gamma': [0.001, 0.0001],
     'svc__kernel': ['rbf']},
]
# setup multiple GridSearchCV objects, 1 for each algorithm
# (a sketch of a second one follows the SVM setup below)
gs_svm = GridSearchCV(estimator=kernel_svm,
                      param_grid=param_grid,
                      scoring='accuracy',
                      n_jobs=-1,
                      cv=5,
                      verbose=0,
                      refit=True,
                      pre_dispatch='2*n_jobs')
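The comment above calls for one GridSearchCV object per algorithm, but only the SVM object is shown. As a minimal sketch of how a second candidate could be set up the same way (the decision tree, its parameter grid, and the name gs_tree are illustrative additions, not part of the original notebook):
from sklearn.tree import DecisionTreeClassifier
# hypothetical second candidate: a decision tree, which does not
# require feature scaling, so no StandardScaler pipeline is needed
gs_tree = GridSearchCV(estimator=DecisionTreeClassifier(random_state=1),
                       param_grid=[{'max_depth': [1, 2, 3, 4, 5, None]}],
                       scoring='accuracy',
                       n_jobs=-1,
                       cv=5,
                       verbose=0,
                       refit=True,
                       pre_dispatch='2*n_jobs')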
Here, the cross_val_score function runs the 5 outer loops, and the GridSearchCV object (gs_svm) performs the hyperparameter optimization during the 5 inner loops.
import numpy as np
from sklearn.cross_validation import cross_val_score
scores = cross_val_score(gs_svm, X_train, y_train, scoring='accuracy', cv=5)
print('\nAverage Accuracy %.2f +/- %.2f' % (np.mean(scores), np.std(scores)))
Average Accuracy 0.95 +/- 0.06
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import accuracy_score
import numpy as np
params = []
scores = []
skfold = StratifiedKFold(y=y_train, n_folds=5, shuffle=False, random_state=1)
for train_idx, test_idx in skfold:
    # inner loop: tune hyperparameters on the outer-fold training split
    gs_svm.fit(X_train[train_idx], y_train[train_idx])
    # outer loop: score the refit best model on the held-out fold
    y_pred = gs_svm.predict(X_train[test_idx])
    acc = accuracy_score(y_true=y_train[test_idx], y_pred=y_pred)
    params.append(gs_svm.best_params_)
    scores.append(acc)
print('SVM models:')
for idx, m in enumerate(zip(params, scores)):
    print('%s. Acc: %.2f Params: %s' % (idx+1, m[1], m[0]))
print('\nAverage Accuracy %.2f +/- %.2f' % (np.mean(scores), np.std(scores)))
SVM models:
1. Acc: 0.96 Params: {'svc__C': 100, 'svc__gamma': 0.001, 'svc__kernel': 'rbf'}
2. Acc: 1.00 Params: {'svc__C': 100, 'svc__gamma': 0.001, 'svc__kernel': 'rbf'}
3. Acc: 0.83 Params: {'svc__C': 1000, 'svc__gamma': 0.001, 'svc__kernel': 'rbf'}
4. Acc: 1.00 Params: {'svc__C': 100, 'svc__gamma': 0.001, 'svc__kernel': 'rbf'}
5. Acc: 0.96 Params: {'svc__C': 100, 'svc__gamma': 0.001, 'svc__kernel': 'rbf'}

Average Accuracy 0.95 +/- 0.06
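To compare algorithms, the same outer loop can be wrapped around the second grid-search object. A minimal sketch, assuming the hypothetical gs_tree defined earlier (its output is omitted since it was never run in the original notebook):
# outer 5-fold loop around the decision-tree grid search (inner loops)
tree_scores = cross_val_score(gs_tree, X_train, y_train, scoring='accuracy', cv=5)
print('Average Accuracy %.2f +/- %.2f' % (np.mean(tree_scores), np.std(tree_scores)))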
After repeating the nested cross-validation for the other candidate algorithms, pick the "best" algorithm (not the best model!). Next, use the complete training set to tune the best algorithm via grid search:
gs_svm.fit(X_train, y_train)
print('Best parameters %s' % gs_svm.best_params_)
Best parameters {'svc__C': 100, 'svc__gamma': 0.001, 'svc__kernel': 'rbf'}
train_acc = accuracy_score(y_true=y_train, y_pred=gs_svm.predict(X_train))
test_acc = accuracy_score(y_true=y_test, y_pred=gs_svm.predict(X_test))
print('Training accuracy: %.2f' % train_acc)
print('Test accuracy: %.2f' % test_acc)
print('Parameters: %s' % gs_svm.best_params_)
Training accuracy: 0.97
Test accuracy: 0.97
Parameters: {'svc__C': 100, 'svc__gamma': 0.001, 'svc__kernel': 'rbf'}
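Since the grid search was run with refit=True, gs_svm is now fit on the complete training set and can be used directly for prediction. A small usage sketch; the measurement values below are made up for illustration:
# predict the class label of a new, unseen flower
# (sepal length, sepal width, petal length, petal width in cm)
new_flower = [[6.1, 2.8, 4.7, 1.2]]
print(iris.target_names[gs_svm.predict(new_flower)])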