"""Predict California housing prices with tree ensembles.

Tutorial solution: inspect a random forest's validation curve over the
number of trees, then fit a gradient-boosting regressor with early
stopping and report its test-set mean absolute error in k$.
"""
import numpy as np

from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import ValidationCurveDisplay, train_test_split

data, target = fetch_california_housing(return_X_y=True, as_frame=True)
target *= 100  # rescale the target in k$ so errors read in thousands of dollars

data_train, data_test, target_train, target_test = train_test_split(
    data, target, random_state=0, test_size=0.5
)

# solution: a boosted ensemble with moderately deep trees and a large
# learning rate (superseded below by the early-stopping variant).
gbdt = GradientBoostingRegressor(max_depth=5, learning_rate=0.5)

# solution: fully grown trees (max_depth=None) for the random forest.
forest = RandomForestRegressor(max_depth=None)

# solution: validation curve of the forest's error vs. number of trees.
param_range = np.array([1, 2, 5, 10, 20, 50, 100])
disp = ValidationCurveDisplay.from_estimator(
    forest,
    data,
    target,
    param_name="n_estimators",
    param_range=param_range,
    scoring="neg_mean_absolute_error",
    negate_score=True,  # plot the positive error instead of the negated score
    std_display_style="errorbar",
    n_jobs=2,
)
_ = disp.ax_.set(
    xlabel="Number of trees in the forest",
    ylabel="Mean absolute error (k$)",
    title="Validation curve for random forest",
)

# solution: rely on early stopping (n_iter_no_change=5) rather than tuning
# n_estimators by hand; 1_000 is only an upper bound on the iterations.
gbdt = GradientBoostingRegressor(n_estimators=1_000, n_iter_no_change=5)
gbdt.fit(data_train, target_train)
# A bare `gbdt.n_estimators_` expression only displays in a notebook;
# print it so the script actually reports how many iterations were used.
print(f"Number of boosting iterations used: {gbdt.n_estimators_}")

# solution: evaluate the early-stopped GBDT on the held-out half of the data.
error = mean_absolute_error(target_test, gbdt.predict(data_test))
print(f"On average, our GBDT regressor makes an error of {error:.2f} k$")