"""Compare cross-validation strategies on the scikit-learn digits dataset.

The same pipeline (min-max scaling followed by logistic regression) is
scored with ``KFold`` without shuffling, ``KFold`` with shuffling, and
``GroupKFold`` driven by hand-defined writer groups. The distributions of
the resulting test scores are then plotted side by side.
"""

from itertools import count

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GroupKFold, KFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler


def _report_accuracy(scores):
    """Print the mean and standard deviation of an array of test scores."""
    print(f"The average accuracy is {scores.mean():.3f} ± {scores.std():.3f}")


def _plot_score_distribution(scores):
    """Plot a histogram of the test scores, one series per column."""
    # ``DataFrame.plot.hist`` opens its own figure, so successive calls do
    # not draw on top of each other.
    scores.plot.hist(bins=10, edgecolor="black", alpha=0.7)
    plt.xlim([0.8, 1.0])
    plt.xlabel("Accuracy score")
    # Anchor the legend outside the axes so it does not hide the bars.
    plt.legend(bbox_to_anchor=(1.05, 0.8), loc="upper left")
    plt.title("Distribution of the test scores")


digits = load_digits()
data, target = digits.data, digits.target

model = make_pipeline(MinMaxScaler(), LogisticRegression(max_iter=1_000))

# K-fold cross-validation, keeping the samples in their stored order.
cv = KFold(shuffle=False)
test_score_no_shuffling = cross_val_score(
    model, data, target, cv=cv, n_jobs=2
)
_report_accuracy(test_score_no_shuffling)

# Same evaluation, shuffling the samples before splitting them into folds.
cv = KFold(shuffle=True)
test_score_with_shuffling = cross_val_score(
    model, data, target, cv=cv, n_jobs=2
)
_report_accuracy(test_score_with_shuffling)

all_scores = pd.DataFrame(
    [test_score_no_shuffling, test_score_with_shuffling],
    index=["KFold without shuffling", "KFold with shuffling"],
).T
_plot_score_distribution(all_scores)

print(test_score_no_shuffling)
print(digits.DESCR)
# A bare ``target[:200]`` expression shows nothing when run as a script, so
# print it explicitly.
print(target[:200])

# defines the lower and upper bounds of sample indices
# for each writer
writer_boundaries = [
    0,
    130,
    256,
    386,
    516,
    646,
    776,
    915,
    1029,
    1157,
    1287,
    1415,
    1545,
    1667,
    1797,
]
groups = np.zeros_like(target)
lower_bounds = writer_boundaries[:-1]
upper_bounds = writer_boundaries[1:]

# Each consecutive pair of boundaries delimits one group of samples.
for group_id, lb, ub in zip(count(), lower_bounds, upper_bounds):
    groups[lb:ub] = group_id

# Start a fresh figure; otherwise ``plt.plot`` would draw on the axes of the
# histogram created above.
plt.figure()
plt.plot(groups)
plt.yticks(np.unique(groups))
plt.xticks(writer_boundaries, rotation=90)
plt.xlabel("Target index")
plt.ylabel("Writer index")
plt.title("Underlying writer groups existing in the target")

# Group-aware cross-validation: ``groups`` is handed to the splitter through
# ``cross_val_score``.
cv = GroupKFold()
test_score = cross_val_score(
    model, data, target, groups=groups, cv=cv, n_jobs=2
)
_report_accuracy(test_score)

all_scores = pd.DataFrame(
    [test_score_no_shuffling, test_score_with_shuffling, test_score],
    index=[
        "KFold without shuffling",
        "KFold with shuffling",
        "KFold with groups",
    ],
).T
_plot_score_distribution(all_scores)