# reload modules before executing user code
%load_ext autoreload
# reload all modules every time before executing Python code
%autoreload 2
# render plots in notebook
%matplotlib inline
# uncomment this if running locally or on Google Colab
# !pip install --upgrade hepml

# data wrangling
import pandas as pd
import numpy as np

# data viz
import matplotlib.pyplot as plt
import seaborn as sns
from hepml.core import plot_regression_tree

sns.set(color_codes=True)
sns.set_palette(sns.color_palette("muted"))

# ml magic
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

number_of_examples = 100
# fix the seed for reproducibility
np.random.seed(42)
# generate features
X = np.random.rand(number_of_examples, 1) - 0.5
# generate target: quadratic in X plus Gaussian noise
y = 3 * X[:, 0] ** 2 + 0.05 * np.random.randn(number_of_examples)
# create pandas.DataFrame
data = pd.DataFrame(data=np.stack([X[:, 0], y], axis=1), columns=["X", "y"])
data.head()

sns.scatterplot(x="X", y="y", data=data)
plt.show()

# fit the first shallow tree directly on the targets
tree_1 = DecisionTreeRegressor(max_depth=2)
tree_1.fit(X, y)

plot_regression_tree(tree_1, data.columns, fontsize=24)

data["Tree 1 prediction"] = tree_1.predict(X)
data.head()


def plot_predictions(regressors, X, y, axes, label=None, style="r-", data_style="b.", data_label=None):
    x1 = np.linspace(axes[0], axes[1], 500)
    # an ensemble's prediction is the sum of its members' predictions
    y_pred = sum(regressor.predict(x1.reshape(-1, 1)) for regressor in regressors)
    plt.plot(X[:, 0], y, data_style, label=data_label)
    plt.plot(x1, y_pred, style, linewidth=2, label=label)
    if label or data_label:
        plt.legend(loc="upper center", fontsize=16)
    plt.axis(axes)
    plt.ylabel("$y$", fontsize=16)
    plt.xlabel("$X$", fontsize=16)


plot_predictions(
    [tree_1], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$h(X)=h_1(X)$", style="g-", data_label="Training set"
)
plt.show()

# residuals: the part of the target the first tree failed to capture
data["Tree 1 residual"] = data["y"] - data["Tree 1 prediction"]
data.head()

# fit the second tree on the residuals of the first
tree_2 = DecisionTreeRegressor(max_depth=2)
tree_2.fit(X, data["Tree 1 residual"])

data["Tree 2 prediction"] = tree_2.predict(X)
data["Tree 1 + tree 2 prediction"] = sum(tree.predict(X) for tree in (tree_1, tree_2))
data.head()

fig, (ax0, ax1) = plt.subplots(ncols=2, figsize=(20, 5))

plt.subplot(ax0)
plot_predictions(
    [tree_2],
    X,
    data["Tree 1 residual"],
    axes=[-0.5, 0.5, -0.5, 0.5],
    label="$h_2(X)$",
    style="g-",
    data_style="k+",
    data_label="Residuals",
)
plt.ylabel("$y - h_1(X)$", fontsize=18)
plt.title("Residuals and tree predictions", fontsize=16)

plt.subplot(ax1)
plot_predictions([tree_1, tree_2], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$h(X) = h_1(X) + h_2(X)$")
plt.title("Ensemble predictions", fontsize=16)
plt.tight_layout()

data["Tree 1 + tree 2 residual"] = data["Tree 1 residual"] - data["Tree 2 prediction"]
data.head()

# fit the third tree on the residuals of the two-tree ensemble
tree_3 = DecisionTreeRegressor(max_depth=2)
tree_3.fit(X, data["Tree 1 + tree 2 residual"])

data["Tree 3 prediction"] = tree_3.predict(X)
data["Tree 1 + tree 2 + tree 3 prediction"] = sum(tree.predict(X) for tree in (tree_1, tree_2, tree_3))
data["Final residual"] = data["Tree 1 + tree 2 residual"] - data["Tree 3 prediction"]
data.head()

fig, (ax0, ax1) = plt.subplots(ncols=2, figsize=(20, 5))

plt.subplot(ax0)
plot_predictions(
    [tree_3],
    X,
    data["Tree 1 + tree 2 residual"],
    axes=[-0.5, 0.5, -0.5, 0.5],
    label="$h_3(X)$",
    style="g-",
    data_style="k+",
)
plt.ylabel("$y - h_1(X) - h_2(X)$", fontsize=16)

plt.subplot(ax1)
plot_predictions(
    [tree_1, tree_2, tree_3], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$h(X) = h_1(X) + h_2(X) + h_3(X)$"
)
plt.tight_layout()
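# The three trees above were fit by hand; the procedure generalises to a loop in
# which each new tree is fit to the residuals left by the ensemble so far, and the
# ensemble prediction is the running sum of tree predictions. Below is a minimal
# sketch of that loop (the helper name fit_residual_trees is our own illustration,
# not part of hepml or scikit-learn):
def fit_residual_trees(X, y, n_trees=3, max_depth=2):
    trees, residual = [], y.copy()
    for _ in range(n_trees):
        tree = DecisionTreeRegressor(max_depth=max_depth)
        tree.fit(X, residual)
        # the next tree will be fit to what this one failed to explain
        residual = residual - tree.predict(X)
        trees.append(tree)
    return trees


# the training error shrinks as each stage corrects its predecessors
prediction = np.zeros_like(y)
for i, tree in enumerate(fit_residual_trees(X, y), start=1):
    prediction += tree.predict(X)
    print(f"Training MSE after {i} trees: {mean_squared_error(y, prediction):.5f}")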
# the same ensemble, built by scikit-learn: three depth-2 trees, no shrinkage
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
gbrt.fit(X, y)

plot_predictions(
    [gbrt], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="Ensemble predictions", style="g-", data_label="Training set"
)
plt.show()

# compare a high learning rate with few trees against a low learning rate with many trees
gbrt_high_lr = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0, random_state=42)
gbrt_high_lr.fit(X, y)

gbrt_low_lr = GradientBoostingRegressor(max_depth=2, n_estimators=200, learning_rate=0.1, random_state=42)
gbrt_low_lr.fit(X, y)

fig, (ax0, ax1) = plt.subplots(ncols=2, figsize=(20, 5))

plt.subplot(ax0)
plot_predictions([gbrt_high_lr], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="Ensemble predictions")
plt.title(f"learning_rate={gbrt_high_lr.learning_rate}, n_estimators={gbrt_high_lr.n_estimators}", fontsize=18)

plt.subplot(ax1)
plot_predictions([gbrt_low_lr], X, y, axes=[-0.5, 0.5, -0.1, 0.8])
plt.title(f"learning_rate={gbrt_low_lr.learning_rate}, n_estimators={gbrt_low_lr.n_estimators}", fontsize=18)
plt.tight_layout()

# early stopping by hand: train a large ensemble, then find the stage that
# minimises the validation error
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=49)

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120, random_state=42)
gbrt.fit(X_train, y_train)

# staged_predict yields the ensemble's predictions after each boosting stage
errors = [mean_squared_error(y_valid, y_pred) for y_pred in gbrt.staged_predict(X_valid)]
bst_n_estimators = np.argmin(errors) + 1

gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=bst_n_estimators, random_state=42)
gbrt_best.fit(X_train, y_train)

min_error = np.min(errors)

fig, (ax0, ax1) = plt.subplots(ncols=2, figsize=(20, 5))

plt.subplot(ax0)
plt.plot(errors, "b.-")
plt.plot([bst_n_estimators, bst_n_estimators], [0, min_error], "k--")
plt.plot([0, 120], [min_error, min_error], "k--")
plt.plot(bst_n_estimators, min_error, "ko")
plt.text(bst_n_estimators, min_error * 1.2, "Minimum", ha="center", fontsize=14)
plt.axis([0, 120, 0, 0.01])
plt.xlabel("Number of trees", fontsize=16)
plt.ylabel("MSE", fontsize=16)
plt.title("Validation error", fontsize=14)

plt.subplot(ax1)
plot_predictions([gbrt_best], X, y, axes=[-0.5, 0.5, -0.1, 0.8])
plt.title(f"Best model ({bst_n_estimators} trees)", fontsize=14)
plt.tight_layout()

# built-in early stopping: hold out validation_fraction of the data and stop when
# the validation score fails to improve by tol for n_iter_no_change iterations
gbrt = GradientBoostingRegressor(
    n_estimators=120, validation_fraction=0.2, n_iter_no_change=5, tol=1e-5, max_depth=2, random_state=42, warm_start=True
)
gbrt.fit(X, y)

print(f"Optimal number of trees: {gbrt.n_estimators_}")
# train_score_ holds the in-sample loss at each boosting stage, not a validation MSE
print(f"Minimum training loss: {np.min(gbrt.train_score_)}")

plot_predictions([gbrt], X, y, axes=[-0.5, 0.5, -0.1, 0.8])

# a noisy two-class dataset for the classification part of the lesson
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=500, noise=0.30, random_state=42)

sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y)
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")
plt.show()
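# As a sketch of where this dataset is headed: boosting applies to classification
# as well, via GradientBoostingClassifier. The split and hyperparameters below are
# illustrative assumptions, not values prescribed by the lesson.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=42)
gbc = GradientBoostingClassifier(max_depth=2, n_estimators=100, learning_rate=0.1, random_state=42)
gbc.fit(X_train, y_train)
print(f"Validation accuracy: {accuracy_score(y_valid, gbc.predict(X_valid)):.3f}")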