# Fit a bagging regressor of decision trees on the California housing data,
# tune it with a randomized hyper-parameter search, and report the mean
# absolute error on a held-out test set before and after tuning.

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

# Single seed shared by the split, the ensemble and the search so that
# repeated runs print the same numbers.
RANDOM_STATE = 0

data, target = fetch_california_housing(as_frame=True, return_X_y=True)
target *= 100  # rescale the target in k$
data_train, data_test, target_train, target_test = train_test_split(
    data, target, random_state=RANDOM_STATE, test_size=0.5
)

# solution
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor

# Baseline: bagging with default hyper-parameters around an unconstrained
# decision tree.
tree = DecisionTreeRegressor()
bagging = BaggingRegressor(
    estimator=tree, n_jobs=2, random_state=RANDOM_STATE
)
bagging.fit(data_train, target_train)
target_predicted = bagging.predict(data_test)
print(
    "Basic mean absolute error of the bagging regressor:\n"
    f"{mean_absolute_error(target_test, target_predicted):.2f} k$"
)

# solution
# List every tunable parameter name; parameters of the wrapped tree are
# addressed with the "estimator__" prefix used in the search space below.
for param in bagging.get_params():
    print(param)

from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

# Search space: scipy distributions are sampled, lists are drawn from.
# `randint(low, high)` excludes the upper bound.
param_grid = {
    "n_estimators": randint(10, 30),
    "max_samples": [0.5, 0.8, 1.0],
    "max_features": [0.5, 0.8, 1.0],
    "estimator__max_depth": randint(3, 10),
}
search = RandomizedSearchCV(
    bagging,
    param_grid,
    n_iter=20,
    scoring="neg_mean_absolute_error",
    random_state=RANDOM_STATE,
)
_ = search.fit(data_train, target_train)

import pandas as pd

# Keep only the sampled hyper-parameters and the cross-validation error.
columns = [f"param_{name}" for name in param_grid]
columns += ["mean_test_error", "std_test_error"]
cv_results = pd.DataFrame(search.cv_results_)
# The scorer is the *negated* MAE, so flip the sign to get an error where
# lower is better; the standard deviation is unaffected by the sign.
cv_results["mean_test_error"] = -cv_results["mean_test_score"]
cv_results["std_test_error"] = cv_results["std_test_score"]
# Print explicitly: a bare expression is silently discarded when this file
# is executed as a script.
sorted_results = cv_results[columns].sort_values(by="mean_test_error")
print(sorted_results)

# `search.predict` relies on the search's default refit of the best
# candidate.
target_predicted = search.predict(data_test)
print(
    "Mean absolute error after tuning of the bagging regressor:\n"
    f"{mean_absolute_error(target_test, target_predicted):.2f} k$"
)