"""Nested cross-validation of a histogram gradient-boosting regressor.

An inner ``GridSearchCV`` tunes ``HistGradientBoostingRegressor``
hyper-parameters while an outer 5-fold ``KFold`` cross-validation
estimates the tuned model's generalization performance on the
California housing dataset. The inner-CV scores of every parameter
combination are then visualized as box plots, one box per combination.
"""
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, KFold, cross_validate

# Load the data as DataFrames; rescale the target in k$.
data, target = fetch_california_housing(return_X_y=True, as_frame=True)
target *= 100

# Base learner: early stopping means max_iter=1000 is an upper bound,
# not the number of trees actually fitted.
hist_gbdt = HistGradientBoostingRegressor(
    max_iter=1000, early_stopping=True, random_state=0
)

# Inner loop: exhaustive grid-search over a small hyper-parameter grid
# (2 x 2 x 2 = 8 combinations).
params = {
    "max_depth": [3, 8],
    "max_leaf_nodes": [15, 31],
    "learning_rate": [0.1, 1],
}
search = GridSearchCV(hist_gbdt, params)

# Outer loop: shuffled 5-fold CV of the whole search procedure.
# return_estimator=True keeps each fold's fitted GridSearchCV so the
# per-fold best parameters can be inspected afterwards.
cv = KFold(n_splits=5, shuffle=True, random_state=0)
results = cross_validate(
    search, data, target, cv=cv, return_estimator=True, n_jobs=2
)

print(
    "R2 score with cross-validation:\n"
    f"{results['test_score'].mean():.3f} ± "
    f"{results['test_score'].std():.3f}"
)

# Best hyper-parameters and number of boosting iterations selected by
# early stopping, for each outer fold.
for estimator in results["estimator"]:
    print(estimator.best_params_)
    print(f"# trees: {estimator.best_estimator_.n_iter_}")

# Gather the inner-CV mean test score of every parameter combination,
# one column per outer fold, indexed by the parameter values.
index_columns = [f"param_{name}" for name in params.keys()]
columns = index_columns + ["mean_test_score"]
inner_cv_results = []
for cv_idx, estimator in enumerate(results["estimator"]):
    search_cv_results = pd.DataFrame(estimator.cv_results_)
    search_cv_results = search_cv_results[columns].set_index(index_columns)
    search_cv_results = search_cv_results.rename(
        columns={"mean_test_score": f"CV {cv_idx}"}
    )
    inner_cv_results.append(search_cv_results)
# Transpose so each parameter combination becomes one box in the plot.
inner_cv_results = pd.concat(inner_cv_results, axis=1).T

color = {"whiskers": "black", "medians": "black", "caps": "black"}
inner_cv_results.plot.box(vert=False, color=color)
plt.xlabel("R2 score")
plt.ylabel("Parameters")
_ = plt.title(
    "Inner CV results with parameters\n"
    "(max_depth, max_leaf_nodes, learning_rate)"
)