"""Compare a random forest with and without univariate feature selection.

A synthetic classification problem is generated with 100 features of which
only 2 are informative. Two models are cross-validated on it:

* a plain ``RandomForestClassifier``;
* a pipeline that first keeps the 2 best features according to the ANOVA
  F-test (``SelectKBest(f_classif, k=2)``) and then fits the same forest.

The script draws box plots of the fit time, the score time and the test score
of both models, and finally prints which features the selector kept in each
cross-validation fold.
"""

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline

# Synthetic dataset: 2 informative features, the remaining 98 carry no signal.
data, target = make_classification(
    n_samples=5000,
    n_features=100,
    n_informative=2,
    n_redundant=0,
    n_repeated=0,
    random_state=0,
)

# Baseline: the forest is given all 100 features.
model_without_selection = RandomForestClassifier(n_jobs=2)

# Selection is a pipeline step, so it is fitted together with the forest
# every time ``cross_validate`` fits the model.
model_with_selection = make_pipeline(
    SelectKBest(score_func=f_classif, k=2),
    RandomForestClassifier(n_jobs=2),
)

cv_results_without_selection = cross_validate(
    model_without_selection, data, target
)
cv_results_without_selection = pd.DataFrame(cv_results_without_selection)

# ``return_estimator=True`` keeps the fitted pipelines so that the selector of
# each fold can be inspected at the end of the script.
cv_results_with_selection = cross_validate(
    model_with_selection, data, target, return_estimator=True
)
cv_results_with_selection = pd.DataFrame(cv_results_with_selection)

cv_results = pd.concat(
    [cv_results_without_selection, cv_results_with_selection],
    axis=1,
    keys=["Without feature selection", "With feature selection"],
)
# swap the level of the multi-index of the columns so that the metric name is
# the outer level: ``cv_results["fit_time"]`` then holds one column per model
cv_results = cv_results.swaplevel(axis="columns")

color = {"whiskers": "black", "medians": "black", "caps": "black"}

# One horizontal box plot per metric: (column, x-axis label, title).
box_plots = [
    ("fit_time", "Elapsed time (s)", "Time to fit the model"),
    ("score_time", "Elapsed time (s)", "Time to make prediction"),
    ("test_score", "Accuracy score", "Test score via cross-validation"),
]
for column, xlabel, title in box_plots:
    cv_results[column].plot.box(color=color, vert=False)
    plt.xlabel(xlabel)
    _ = plt.title(title)

for idx, pipeline in enumerate(cv_results_with_selection["estimator"]):
    selector = pipeline[0]
    # Ask the fitted selector which columns it kept instead of recomputing the
    # top-k from ``scores_`` with a hard-coded 2: the report then always
    # matches what the forest actually received, whatever ``k`` is set to.
    selected = selector.get_support(indices=True)
    # Order by ascending score to keep the original output ordering.
    selected = selected[np.argsort(selector.scores_[selected])]
    print(f"Fold #{idx} - features selected are: {selected}")