import numpy as np
from sklearn.model_selection import train_test_split

# Toy data: five 2-feature samples with labels 0..4.
X = np.arange(10).reshape((5, 2))
y = range(5)

# Reserve 30% of the rows for evaluation; the fixed seed makes the
# shuffled split repeatable across runs.
split = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split
import numpy as np
from sklearn.model_selection import KFold

X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([1, 2, 3, 4])

# Two consecutive folds: each sample lands in the test partition exactly once.
kf = KFold(n_splits=2)
for train_index, test_index in kf.split(X):
    # Index into the arrays to materialise this fold's train/test split.
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]
import numpy as np
from sklearn.model_selection import LeaveOneOut

X = np.array([[1, 2], [3, 4]])
y = np.array([1, 2])

# Leave-one-out: n folds for n samples, each holding out a single sample.
loo = LeaveOneOut()
for train_index, test_index in loo.split(X):
    # Materialise this fold's single-sample test set and its complement.
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]
import numpy as np
from sklearn.model_selection import LeaveOneGroupOut

X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y = np.array([1, 2, 1, 2])
# Two groups of two samples each; whole groups are held out together.
groups = np.array([1, 1, 2, 2])

logo = LeaveOneGroupOut()
for train_index, test_index in logo.split(X, y, groups):
    # One entire group forms the test set; the remaining group trains.
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]
import numpy as np
from sklearn.model_selection import TimeSeriesSplit

X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([1, 2, 3, 4, 5, 6])

# Forward-chaining splits: each successive fold trains on a longer prefix
# and tests on the samples that immediately follow it (no look-ahead).
tscv = TimeSeriesSplit(n_splits=5)
for train_index, test_index in tscv.split(X):
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]
from sklearn.datasets import load_iris
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold

# Load the dataset
iris = load_iris()
X_iris = iris.data
y_iris = iris.target

# Hyper-parameter grid explored by the inner search
p_grid = {"C": [1, 10, 100], "gamma": [.01, .1]}

# We will use a Support Vector Classifier with "rbf" kernel
svr = SVC(kernel="rbf")

# Nested CV: the inner loop tunes hyper-parameters, the outer loop
# estimates generalisation performance of the tuned model.
inner_cv = KFold(n_splits=2, shuffle=True, random_state=42)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)

# The grid search itself is the estimator handed to cross_val_score
clf = GridSearchCV(estimator=svr, param_grid=p_grid, cv=inner_cv)
nested_score = cross_val_score(clf, X=X_iris, y=y_iris, cv=outer_cv).mean()
print(nested_score)
# Expected output: 0.9800000000000001
from scipy.stats import wilcoxon
from sklearn.datasets import load_iris
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.model_selection import cross_val_score, KFold

# Load the dataset
X = load_iris().data
y = load_iris().target

# Prepare models and select your CV method
model1 = ExtraTreesClassifier(n_estimators=100)
model2 = RandomForestClassifier(n_estimators=100)
# FIX: scikit-learn (>= 0.24) raises a ValueError when random_state is set
# while shuffle is False (the default), because the seed would have no
# effect. shuffle=True makes the seed meaningful and keeps the folds
# reproducible — and identical for both models, as the test requires.
kf = KFold(n_splits=20, shuffle=True, random_state=42)

# Extract results for each model on the same folds
results_model1 = cross_val_score(model1, X, y, cv=kf)
results_model2 = cross_val_score(model2, X, y, cv=kf)

# Calculate p value: Wilcoxon signed-rank test over the paired fold scores.
# zero_method='zsplit' splits zero-difference pairs between positive and
# negative ranks instead of discarding them.
stat, p = wilcoxon(results_model1, results_model2, zero_method='zsplit'); p
# Expected output: 0.9836649672612962
import numpy as np
from mlxtend.evaluate import mcnemar_table, mcnemar

# The correct target (class) labels: eleven 0s followed by eleven 1s.
y_target = np.array(11 * [0] + 11 * [1])

# Class labels predicted by model 1
y_model1 = np.array(
    [0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1]
)
# Class labels predicted by model 2
y_model2 = np.array(
    [0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0]
)

# 2x2 contingency table of where the two models agree/disagree vs. truth.
tb = mcnemar_table(y_target=y_target, y_model1=y_model1, y_model2=y_model2)

# exact=True uses the exact binomial form of McNemar's test, appropriate
# for the small disagreement counts here.
chi2, p = mcnemar(ary=tb, exact=True)
print('chi-squared:', chi2)
print('p-value:', p)
# Expected output: chi-squared: 3, p-value: 0.5078125
from mlxtend.evaluate import paired_ttest_5x2cv
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from mlxtend.data import iris_data

# Prepare data and clfs
X, y = iris_data()
clf1 = ExtraTreeClassifier()
clf2 = DecisionTreeClassifier()

# 5x2cv paired t-test: five repetitions of 2-fold CV, with the t statistic
# computed from the per-fold score differences (Dietterich, 1998).
t, p = paired_ttest_5x2cv(
    estimator1=clf1, estimator2=clf2, X=X, y=y, random_seed=1
)
print('t-value:', t)
print('p-value:', p)
# Expected output: t-value: -2.267786838055364, p-value: 0.0726397949448125
from mlxtend.evaluate import combined_ftest_5x2cv
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from mlxtend.data import iris_data

# Prepare data and clfs
X, y = iris_data()
clf1 = ExtraTreeClassifier()
clf2 = DecisionTreeClassifier()

# Combined 5x2cv F-test: same resampling scheme as the 5x2cv t-test, but
# aggregates all ten fold differences into a single F statistic.
f, p = combined_ftest_5x2cv(
    estimator1=clf1, estimator2=clf2, X=X, y=y, random_seed=1
)
print('f-value:', f)
print('p-value:', p)
# Expected output: f-value: 1.7547169811320726, p-value: 0.27786091669060475