import pandas as pd
import numpy as np

# Create a random number generator to make the data generation reproducible
rng = np.random.RandomState(0)


def generate_data(n_samples=50):
    """Generate synthetic dataset.

    Returns `data_train`, `data_test`, `target_train`.
    """
    x_max, x_min = 1.4, -1.4
    len_x = x_max - x_min
    x = rng.rand(n_samples) * len_x - len_x / 2
    noise = rng.randn(n_samples) * 0.3
    y = x**3 - 0.5 * x**2 + noise

    data_train = pd.DataFrame(x, columns=["Feature"])
    data_test = pd.DataFrame(
        np.linspace(x_max, x_min, num=300), columns=["Feature"]
    )
    target_train = pd.Series(y, name="Target")

    return data_train, data_test, target_train


data_train, data_test, target_train = generate_data()

import matplotlib.pyplot as plt
import seaborn as sns

sns.scatterplot(
    x=data_train["Feature"], y=target_train, color="black", alpha=0.5
)
_ = plt.title("Synthetic regression dataset")

from sklearn.tree import DecisionTreeRegressor

tree = DecisionTreeRegressor(max_depth=3, random_state=0)
tree.fit(data_train, target_train)

target_train_predicted = tree.predict(data_train)
target_test_predicted = tree.predict(data_test)

# plot the data
sns.scatterplot(
    x=data_train["Feature"], y=target_train, color="black", alpha=0.5
)
# plot the predictions
line_predictions = plt.plot(data_test["Feature"], target_test_predicted, "--")

# plot the residuals
for value, true, predicted in zip(
    data_train["Feature"], target_train, target_train_predicted
):
    lines_residuals = plt.plot([value, value], [true, predicted], color="red")

plt.legend(
    [line_predictions[0], lines_residuals[0]], ["Fitted tree", "Residuals"]
)
_ = plt.title("Prediction function together \nwith errors on the training set")

residuals = target_train - target_train_predicted

tree_residuals = DecisionTreeRegressor(max_depth=5, random_state=0)
tree_residuals.fit(data_train, residuals)

target_train_predicted_residuals = tree_residuals.predict(data_train)
target_test_predicted_residuals = tree_residuals.predict(data_test)

sns.scatterplot(x=data_train["Feature"], y=residuals, color="black", alpha=0.5)
line_predictions = plt.plot(
    data_test["Feature"], target_test_predicted_residuals, "--"
)

# plot the residuals of the predicted residuals
for value, true, predicted in zip(
    data_train["Feature"], residuals, target_train_predicted_residuals
):
    lines_residuals = plt.plot([value, value], [true, predicted], color="red")

plt.legend(
    [line_predictions[0], lines_residuals[0]],
    ["Fitted tree", "Residuals"],
    bbox_to_anchor=(1.05, 0.8),
    loc="upper left",
)
_ = plt.title("Prediction of the previous residuals")

sample = data_train.iloc[[-2]]
x_sample = sample["Feature"].iloc[0]
target_true = target_train.iloc[-2]
target_true_residual = residuals.iloc[-2]

# Plot the previous information:
# * the dataset
# * the predictions
# * the residuals
sns.scatterplot(
    x=data_train["Feature"], y=target_train, color="black", alpha=0.5
)
plt.plot(data_test["Feature"], target_test_predicted, "--")
for value, true, predicted in zip(
    data_train["Feature"], target_train, target_train_predicted
):
    lines_residuals = plt.plot([value, value], [true, predicted], color="red")

# Highlight the sample of interest
plt.scatter(
    sample, target_true, label="Sample of interest", color="tab:orange", s=200
)
plt.xlim([-1, 0])
plt.legend(bbox_to_anchor=(1.05, 0.8), loc="upper left")
_ = plt.title("Tree predictions")

# Plot the previous information:
# * the residuals committed by the first tree
# * the residual predictions
# * the residuals of the residual predictions
sns.scatterplot(x=data_train["Feature"], y=residuals, color="black", alpha=0.5)
plt.plot(data_test["Feature"], target_test_predicted_residuals, "--")
for value, true, predicted in zip(
    data_train["Feature"], residuals, target_train_predicted_residuals
):
    lines_residuals = plt.plot([value, value], [true, predicted], color="red")

# Highlight the sample of interest
plt.scatter(
    sample,
    target_true_residual,
    label="Sample of interest",
    color="tab:orange",
    s=200,
)
plt.xlim([-1, 0])
plt.legend()
_ = plt.title("Prediction of the residuals")

print(f"True value to predict for f(x={x_sample:.3f}) = {target_true:.3f}")

y_pred_first_tree = tree.predict(sample)[0]
print(
    f"Prediction of the first decision tree for x={x_sample:.3f}: "
    f"y={y_pred_first_tree:.3f}"
)
print(f"Error of the tree: {target_true - y_pred_first_tree:.3f}")

print(
    f"Prediction of the residual for x={x_sample:.3f}: "
    f"{tree_residuals.predict(sample)[0]:.3f}"
)

y_pred_first_and_second_tree = (
    y_pred_first_tree + tree_residuals.predict(sample)[0]
)
print(
    "Prediction of the first and second decision trees combined for "
    f"x={x_sample:.3f}: y={y_pred_first_and_second_tree:.3f}"
)
print(f"Error of the tree: {target_true - y_pred_first_and_second_tree:.3f}")
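# The two trees above are the first two steps of gradient boosting for the
# squared error loss: each new tree is fit on the residuals of the current
# ensemble. Below is a minimal sketch that generalizes this into a loop; it is
# illustrative rather than scikit-learn's exact algorithm, and the values of
# `learning_rate` and `n_iterations` are assumptions chosen for demonstration.
learning_rate = 0.5  # shrink each tree's contribution (assumed value)
n_iterations = 10

# Gradient boosting usually starts from a constant prediction, here the mean
baseline = target_train.mean()
current_residuals = target_train - baseline

boosted_trees = []
for _ in range(n_iterations):
    # Fit a weak learner on what the ensemble still gets wrong
    weak_tree = DecisionTreeRegressor(max_depth=3, random_state=0)
    weak_tree.fit(data_train, current_residuals)
    boosted_trees.append(weak_tree)
    # Update the residuals with the shrunk predictions of the new tree
    current_residuals -= learning_rate * weak_tree.predict(data_train)

# The ensemble prediction is the baseline plus the shrunk sum of all trees
y_pred_boosted = baseline + learning_rate * sum(
    t.predict(sample) for t in boosted_trees
)
print(
    f"Prediction of the {n_iterations}-tree ensemble for x={x_sample:.3f}: "
    f"y={y_pred_boosted[0]:.3f}"
)
print(f"Error of the ensemble: {target_true - y_pred_boosted[0]:.3f}")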
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import cross_validate

data, target = fetch_california_housing(return_X_y=True, as_frame=True)
target *= 100  # rescale the target in k$

from sklearn.ensemble import GradientBoostingRegressor

gradient_boosting = GradientBoostingRegressor(n_estimators=200)
cv_results_gbdt = cross_validate(
    gradient_boosting,
    data,
    target,
    scoring="neg_mean_absolute_error",
    n_jobs=2,
)

print("Gradient Boosting Decision Tree")
print(
    "Mean absolute error via cross-validation: "
    f"{-cv_results_gbdt['test_score'].mean():.3f} ± "
    f"{cv_results_gbdt['test_score'].std():.3f} k$"
)
print(f"Average fit time: {cv_results_gbdt['fit_time'].mean():.3f} seconds")
print(
    f"Average score time: {cv_results_gbdt['score_time'].mean():.3f} seconds"
)

from sklearn.ensemble import RandomForestRegressor

random_forest = RandomForestRegressor(n_estimators=200, n_jobs=2)
cv_results_rf = cross_validate(
    random_forest,
    data,
    target,
    scoring="neg_mean_absolute_error",
    n_jobs=2,
)

print("Random Forest")
print(
    "Mean absolute error via cross-validation: "
    f"{-cv_results_rf['test_score'].mean():.3f} ± "
    f"{cv_results_rf['test_score'].std():.3f} k$"
)
print(f"Average fit time: {cv_results_rf['fit_time'].mean():.3f} seconds")
print(f"Average score time: {cv_results_rf['score_time'].mean():.3f} seconds")
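# Since fit time matters in the comparison above, it is worth noting that
# scikit-learn also ships a histogram-based implementation,
# `HistGradientBoostingRegressor`, which bins the features and is typically
# much faster to train on datasets of this size (~20,000 samples). The sketch
# below is an optional extra, not part of the comparison above; `max_iter=200`
# is chosen only to mirror the `n_estimators=200` used for the other models.
from sklearn.ensemble import HistGradientBoostingRegressor

hist_gradient_boosting = HistGradientBoostingRegressor(max_iter=200)
cv_results_hgbdt = cross_validate(
    hist_gradient_boosting,
    data,
    target,
    scoring="neg_mean_absolute_error",
    n_jobs=2,
)

print("Histogram Gradient Boosting Decision Tree")
print(
    "Mean absolute error via cross-validation: "
    f"{-cv_results_hgbdt['test_score'].mean():.3f} ± "
    f"{cv_results_hgbdt['test_score'].std():.3f} k$"
)
print(f"Average fit time: {cv_results_hgbdt['fit_time'].mean():.3f} seconds")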