# reload modules before executing user code
%load_ext autoreload
# reload all modules every time before executing Python code
%autoreload 2
# render plots in notebook
%matplotlib inline
# uncomment this if running locally or on Google Colab
# !pip install --upgrade hepml

# data wrangling
import pandas as pd
import numpy as np

# data viz
import matplotlib.pyplot as plt
import seaborn as sns
from hepml.core import plot_regression_tree

sns.set(color_codes=True)
sns.set_palette(sns.color_palette("muted"))

# ml magic
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

number_of_examples = 100
# fix the seed for reproducibility
np.random.seed(42)
# generate features
X = np.random.rand(number_of_examples, 1) - 0.5
# generate target: quadratic in X plus Gaussian noise
y = 3 * X[:, 0] ** 2 + 0.05 * np.random.randn(number_of_examples)
# create pandas.DataFrame
data = pd.DataFrame(data=np.stack([X[:, 0], y], axis=1), columns=["X", "y"])
data.head()

sns.scatterplot(x="X", y="y", data=data)
plt.show()

# fit the first shallow tree directly on the targets
tree_1 = DecisionTreeRegressor(max_depth=2)
tree_1.fit(X, y)

plot_regression_tree(tree_1, data.columns, fontsize=24)

data["Tree 1 prediction"] = tree_1.predict(X)
data.head()


def plot_predictions(regressors, X, y, axes, label=None, style="r-", data_style="b.", data_label=None):
    x1 = np.linspace(axes[0], axes[1], 500)
    # an ensemble's prediction is the sum of its members' predictions
    y_pred = sum(regressor.predict(x1.reshape(-1, 1)) for regressor in regressors)
    plt.plot(X[:, 0], y, data_style, label=data_label)
    plt.plot(x1, y_pred, style, linewidth=2, label=label)
    if label or data_label:
        plt.legend(loc="upper center", fontsize=16)
    plt.axis(axes)
    plt.ylabel("$y$", fontsize=16)
    plt.xlabel("$X$", fontsize=16)


plot_predictions(
    [tree_1], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$h(X)=h_1(X)$", style="g-", data_label="Training set"
)
plt.show()

# residuals: the part of the target the first tree failed to capture
data["Tree 1 residual"] = data["y"] - data["Tree 1 prediction"]
data.head()

# fit the second tree on the residuals of the first
tree_2 = DecisionTreeRegressor(max_depth=2)
tree_2.fit(X, data["Tree 1 residual"])

data["Tree 2 prediction"] = tree_2.predict(X)
data["Tree 1 + tree 2 prediction"] = sum(tree.predict(X) for tree in (tree_1, tree_2))
data.head()

fig, (ax0, ax1) = plt.subplots(ncols=2, figsize=(20, 5))

plt.subplot(ax0)
plot_predictions(
    [tree_2],
    X,
    data["Tree 1 residual"],
    axes=[-0.5, 0.5, -0.5, 0.5],
    label="$h_2(X)$",
    style="g-",
    data_style="k+",
    data_label="Residuals",
)
plt.ylabel("$y - h_1(X)$", fontsize=18)
plt.title("Residuals and tree predictions", fontsize=16)

plt.subplot(ax1)
plot_predictions([tree_1, tree_2], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$h(X) = h_1(X) + h_2(X)$")
plt.title("Ensemble predictions", fontsize=16)
plt.tight_layout()

data["Tree 1 + tree 2 residual"] = data["Tree 1 residual"] - data["Tree 2 prediction"]
data.head()

# fit the third tree on the residuals of the two-tree ensemble
tree_3 = DecisionTreeRegressor(max_depth=2)
tree_3.fit(X, data["Tree 1 + tree 2 residual"])

data["Tree 3 prediction"] = tree_3.predict(X)
data["Tree 1 + tree 2 + tree 3 prediction"] = sum(tree.predict(X) for tree in (tree_1, tree_2, tree_3))
data["Final residual"] = data["Tree 1 + tree 2 residual"] - data["Tree 3 prediction"]
data.head()

fig, (ax0, ax1) = plt.subplots(ncols=2, figsize=(20, 5))

plt.subplot(ax0)
plot_predictions(
    [tree_3],
    X,
    data["Tree 1 + tree 2 residual"],
    axes=[-0.5, 0.5, -0.5, 0.5],
    label="$h_3(X)$",
    style="g-",
    data_style="k+",
)
plt.ylabel("$y - h_1(X) - h_2(X)$", fontsize=16)

plt.subplot(ax1)
plot_predictions(
    [tree_1, tree_2, tree_3], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$h(X) = h_1(X) + h_2(X) + h_3(X)$"
)
plt.tight_layout()
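# The three trees above were fit by hand; the procedure generalises to a loop in
# which each new tree is fit to the residuals left by the ensemble so far, and the
# ensemble prediction is the running sum of tree predictions. Below is a minimal
# sketch of that loop (the helper name fit_residual_trees is our own illustration,
# not part of hepml or scikit-learn):
def fit_residual_trees(X, y, n_trees=3, max_depth=2):
    trees, residual = [], y.copy()
    for _ in range(n_trees):
        tree = DecisionTreeRegressor(max_depth=max_depth)
        tree.fit(X, residual)
        # the next tree will be fit to what this one failed to explain
        residual = residual - tree.predict(X)
        trees.append(tree)
    return trees


# the training error shrinks as each stage corrects its predecessors
prediction = np.zeros_like(y)
for i, tree in enumerate(fit_residual_trees(X, y), start=1):
    prediction += tree.predict(X)
    print(f"Training MSE after {i} trees: {mean_squared_error(y, prediction):.5f}")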
# the same ensemble, built by scikit-learn: three depth-2 trees, no shrinkage
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
gbrt.fit(X, y)

plot_predictions(
    [gbrt], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="Ensemble predictions", style="g-", data_label="Training set"
)
plt.show()

# compare a high learning rate with few trees against a low learning rate with many trees
gbrt_high_lr = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0, random_state=42)
gbrt_high_lr.fit(X, y)

gbrt_low_lr = GradientBoostingRegressor(max_depth=2, n_estimators=200, learning_rate=0.1, random_state=42)
gbrt_low_lr.fit(X, y)

fig, (ax0, ax1) = plt.subplots(ncols=2, figsize=(20, 5))

plt.subplot(ax0)
plot_predictions([gbrt_high_lr], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="Ensemble predictions")
plt.title(f"learning_rate={gbrt_high_lr.learning_rate}, n_estimators={gbrt_high_lr.n_estimators}", fontsize=18)

plt.subplot(ax1)
plot_predictions([gbrt_low_lr], X, y, axes=[-0.5, 0.5, -0.1, 0.8])
plt.title(f"learning_rate={gbrt_low_lr.learning_rate}, n_estimators={gbrt_low_lr.n_estimators}", fontsize=18)
plt.tight_layout()

# early stopping by hand: train a large ensemble, then find the stage that
# minimises the validation error
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=49)

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120, random_state=42)
gbrt.fit(X_train, y_train)

# staged_predict yields the ensemble's predictions after each boosting stage
errors = [mean_squared_error(y_valid, y_pred) for y_pred in gbrt.staged_predict(X_valid)]
bst_n_estimators = np.argmin(errors) + 1

gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=bst_n_estimators, random_state=42)
gbrt_best.fit(X_train, y_train)

min_error = np.min(errors)

fig, (ax0, ax1) = plt.subplots(ncols=2, figsize=(20, 5))

plt.subplot(ax0)
plt.plot(errors, "b.-")
plt.plot([bst_n_estimators, bst_n_estimators], [0, min_error], "k--")
plt.plot([0, 120], [min_error, min_error], "k--")
plt.plot(bst_n_estimators, min_error, "ko")
plt.text(bst_n_estimators, min_error * 1.2, "Minimum", ha="center", fontsize=14)
plt.axis([0, 120, 0, 0.01])
plt.xlabel("Number of trees", fontsize=16)
plt.ylabel("MSE", fontsize=16)
plt.title("Validation error", fontsize=14)

plt.subplot(ax1)
plot_predictions([gbrt_best], X, y, axes=[-0.5, 0.5, -0.1, 0.8])
plt.title(f"Best model ({bst_n_estimators} trees)", fontsize=14)
plt.tight_layout()

# built-in early stopping: hold out validation_fraction of the data and stop when
# the validation score fails to improve by tol for n_iter_no_change iterations
gbrt = GradientBoostingRegressor(
    n_estimators=120, validation_fraction=0.2, n_iter_no_change=5, tol=1e-5, max_depth=2, random_state=42, warm_start=True
)
gbrt.fit(X, y)

print(f"Optimal number of trees: {gbrt.n_estimators_}")
# train_score_ holds the in-sample loss at each boosting stage, not a validation MSE
print(f"Minimum training loss: {np.min(gbrt.train_score_)}")

plot_predictions([gbrt], X, y, axes=[-0.5, 0.5, -0.1, 0.8])

# a noisy two-class dataset for the classification part of the lesson
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=500, noise=0.30, random_state=42)

sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y)
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")
plt.show()
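# As a sketch of where this dataset is headed: boosting applies to classification
# as well, via GradientBoostingClassifier. The split and hyperparameters below are
# illustrative assumptions, not values prescribed by the lesson.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=42)
gbc = GradientBoostingClassifier(max_depth=2, n_estimators=100, learning_rate=0.1, random_state=42)
gbc.fit(X_train, y_train)
print(f"Validation accuracy: {accuracy_score(y_valid, gbc.predict(X_valid)):.3f}")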