#!/usr/bin/env python
# coding: utf-8
#
#
# # Machine Learning: Predicting wind energy
#
# Authors: Joël Hamilcaro, Jie Tu
#
#
#
#
#
# Wind turbines (Source : Pixabay Royalty-free Image)
#
#
#
# Table of Contents
#
# # Introduction
#
# ## Defining the problem
#
#
# Our goal is to predict the electrical power generated by two wind turbines according to different characteristics:
# - Wind speed
# - Wind direction
# - Temperature
#
# For this, we have two data sets, each corresponding to a particular wind turbine.
# For each independent individual $i$ in each data set, we have a features vector $x_i=(x_{i_1},x_{i_2},x_{i_3})^T \in \mathbb{R}^3$ corresponding to the value of the wind speed, the wind direction, and the temperature.
# For each $i$, we know the label $y_i \in \mathbb{R}$ corresponding to the electrical power.
#
# *Problem : Given a features vector $x$ we want to predict its label $y$*.
#
# As $y \in \mathbb{R}$, this is a **regression problem**.
#
#
# # Importing the required libraries
# In[1]:
# Maths tools
import numpy as np
import scipy as sp
# Data Visualization
import pandas as pd # Dataframes
import matplotlib.pyplot as plt # plot (suitable for numerical data)
import seaborn as sns # plot (suitable for pandas dataframe)
# Files
import os # open files
# ML Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline, Pipeline
# ML Models
from sklearn.linear_model import LinearRegression, Ridge, Lasso, SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor, GradientBoostingRegressor
# ML Selection
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import validation_curve
# ML Metrics
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
# # Data import
# ## Importing data
# The two datasets contain the following features:
#
# - P : electrical power in kW.
#
# - V : wind speed in $m.s^{-1}$.
#
# - Dir : wind direction in degrees °.
#
# - T : temperature in °C.
#
# Each `.txt` file (**eol1.txt** and **eol2.txt**) stores the data for one wind turbine. We will read these files and store the data in dataframes (one dataframe per turbine).
# In[2]:
# Directory where the files are stored
repertory = './'
# The names of the files
filename1 = 'eol1.txt'
filename2 = 'eol2.txt'
files = [filename1, filename2]
datas = []
for file in files:
    path = os.path.join(repertory, file)
    try:
        datas.append(
            pd.read_csv(path, sep=' ')
        )
    except OSError as err:
        print(f"OS error: {err}")
        raise
data1_full, data2_full = datas
DataFrame = type(data1_full)
original_columns = data1_full.columns
X_original_cols = ['V','Dir','T']
y_original_col = ['P']
# ## Train-test split
#
# For each wind turbine, we are given a dataset where the labels are known.
#
# In a real situation, our model should predict labels on a dataset that has never been seen, and where the labels are unknown.
#
# We will simulate this real situation, by splitting each dataset in two. The test datasets will play the role of these "never seen" datasets.
# In[3]:
# 80-20% split
test_size = 0.2
data1_X, data1_X_test, data1_y, data1_y_test =\
train_test_split(data1_full[X_original_cols],
data1_full[y_original_col],
test_size=test_size,
stratify=None)
data2_X, data2_X_test, data2_y, data2_y_test =\
train_test_split(data2_full[X_original_cols],
data2_full[y_original_col],
test_size=test_size,
stratify=None)
data1 = data1_X.join(data1_y).sort_index().reset_index(drop=True)
data2 = data2_X.join(data2_y).sort_index().reset_index(drop=True)
data1_test = data1_X_test.join(data1_y_test).sort_index().reset_index(drop=True)
data2_test = data2_X_test.join(data2_y_test).sort_index().reset_index(drop=True)
data1_train_original = data1.copy()
data2_train_original = data2.copy()
data1_test_original = data1_test.copy()
data2_test_original = data2_test.copy()
# don't keep copy
data1_X, data1_y = None,None
data2_X, data2_y = None,None
data1_full = None
data2_full = None
#
#
#
# 📎 From now on, the test data set is assumed to be unknown, and not in our possession.
#
#
#
# We rename the columns for convenience.
# In[4]:
original_columns = data1.columns
def rename_cols(data):
    data = data.copy()
    new_columns = {'P': 'Power', 'V': 'Speed', 'Dir': 'Direction', 'T': 'Temperature'}
    data = data.rename(columns=new_columns)
    return data
data1 = rename_cols(data1)
data2 = rename_cols(data2)
X_cols = ['Speed','Direction','Temperature']
Y_col = ['Power']
# # Data exploration
#
# ## Quick overview
# Let's look at the shape of the data
# In[5]:
print("Dataset 1:", data1.shape, "\tDataset 2:",data2.shape)
# We visualize a sample of the data with the `sample()` method.
# In[6]:
def displaying_two_prototype(display_fun, fun, name, data1, data2, **args):
    print(f"Dataset 1 ~ {name}")
    display_fun(fun(data1, **args))
    print(f"\n\n\nDataset 2 ~ {name}")
    display_fun(fun(data2, **args))
    return
def display_two(fun, name, data1, data2, **args):
    displaying_two_prototype(display, fun, name, data1, data2, **args)
    return
def print_two(fun, name, data1, data2, **args):
    displaying_two_prototype(print, fun, name, data1, data2, **args)
    return
# In[7]:
display_two(DataFrame.sample,"Sample",data1,data2,n=5, random_state=15)
# The sample above shows us that the features are all real numbers.
# We use the `describe()` method to display descriptive statistics for our data.
# In[8]:
display_two(DataFrame.describe,"Descriptive statistics",data1,data2)
# We can already note something interesting: there are negative electrical power values (see the `min` row).
# ## Distribution analysis
# We look at the distribution of each of the variables and their relationships using a `pairplot`.
# In[9]:
data1["Turbine"] = 1
data2["Turbine"] = 2
full_data = pd.concat([data1,data2])
sns.pairplot(full_data, hue='Turbine',
plot_kws = {'alpha': 0.3, 's': 20, 'edgecolor': 'k'},
height = 3, palette=['blue','orange'])
plt.suptitle("Pair plot of the two wind turbines", size=15, y=1.05)
plt.show()
data1.drop('Turbine', axis=1, inplace=True)
data2.drop('Turbine', axis=1, inplace=True)
# As expected visually, there is a strong dependency between the wind speed and the electrical power produced. The other variables appear less correlated (their values are scattered).
#
# We can indeed verify these results in the correlation matrix below.
# In[10]:
corr_mat1 = data1.corr()
corr_mat2 = data2.corr()
fig, ax = plt.subplots(figsize=(20,8))
ax1 = plt.subplot(1,2,1)
ax1 = sns.heatmap(corr_mat1, annot=True, cmap="YlGnBu")
ax1.set_title("Wind turbine 1")
ax2 = plt.subplot(1,2,2)
ax2 = sns.heatmap(corr_mat2, annot=True, cmap="YlGnBu")
ax2.set_title("Wind turbine 2")
plt.show()
# # Data preprocessing
# ## Data type
#
# We check how the data are typed.
# In[11]:
f = lambda x : x.dtypes
print_two(f,"Types",data1,data2)
# The default dtype is appropriate for real numbers, so we keep it.
# ## Dealing with missing values
#
# ### Empty rows
# First, we check the rows where all values are missing.
# In[12]:
f = lambda x : x[x.isna().all(axis=1)].head(1)
display_two(f,"Example of line where all values are missing",data1,data2)
# We remove these rows.
# In[13]:
def remove_empty_lines(data):
    data = data.dropna(how='all')
    data.reset_index(inplace=True, drop=True)
    return data
data1 = remove_empty_lines(data1)
data2 = remove_empty_lines(data2)
print("Dataset 1:", data1.shape, "\tDataset 2:",data2.shape)
# ### Rows with missing values
#
# Now we check the rows where there is at least one missing value.
# In[14]:
f = lambda x : x.isna().sum(axis=0)/x.count()
print_two(f, "Proportion of missing values (by column)", data1, data2)
# For each column, the rows with at least one missing value represent less than 0.8% of the data. We will remove these rows, since doing so barely changes the distribution of our data.
# In[15]:
def remove_missing(data):
    data = data.dropna(how='any')
    data.reset_index(inplace=True, drop=True)
    return data
f = lambda x : x.shape
print_two(f ,"Shape before removing missing values:",data1,data2)
data1 = remove_missing(data1)
data2 = remove_missing(data2)
print("\n\n\n")
print_two(f,"Shape after removing missing values:",data1,data2)
# ## What about negative power?
# We saw that some rows have negative electrical power values. We can display their proportion:
# In[16]:
f = lambda x : ( x[x.Power < 0].count()/x.count() ).Power
print_two(f,"Proportion of negative electrical power", data1, data2)
# This corresponds to about 10 to 20% of the data, which is not negligible. We can ask ourselves whether these negative values make sense. After research and reflection, we concluded that they do: a wind turbine can consume more electricity than it produces (in very low wind conditions).
#
# So we will not apply any particular pre-processing to the rows where the power is negative, especially since the power is the label to predict: if it is negative, it must be taken into account when training the model.
# ## Standardization
#
# Now we standardize our features in order to put them all on the same scale. We standardize (center and scale) each value with the following formula:
#
# $z_{i_f}=\dfrac{x_{i_f}-\mu_f}{\sigma_f}$
#
# with :
# - $x_{i_f}$ the value of the feature $f$ for the individual $i$.
# - $\mu_f$ the mean of the values for the feature $f$.
# - $\sigma_f$ the standard deviation of the values for the feature $f$.
# - $z_{i_f}$ the standardized value that we want.
#
# For an instance of `StandardScaler`, the `fit` method computes the mean and standard deviation to use, and the `transform()` method applies the above formula to each value.
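#
# As a quick sanity check, here is a minimal sketch (on a small toy array, not on our wind-turbine data) suggesting that `StandardScaler` reproduces the formula above; the toy values are arbitrary and purely illustrative.
# In[ ]:
# Toy check: StandardScaler vs the manual formula
toy = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
scaler_check = StandardScaler().fit(toy)            # fit: estimate mu and sigma
z_sklearn = scaler_check.transform(toy)             # transform: apply (x - mu) / sigma
z_manual = (toy - toy.mean(axis=0)) / toy.std(axis=0)
print(np.allclose(z_sklearn, z_manual))             # expected: True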
# In[17]:
# Separating X and y
def separe_Xy(data):
    data_X = data[X_cols]
    try:
        data_y = data[Y_col]
    except KeyError:
        # The label column may be absent (e.g. data to predict, without labels)
        data_y = None
    return data_X, data_y
data1_X, data1_y = separe_Xy(data1)
data2_X, data2_y = separe_Xy(data2)
# Prevent copy problems
data1 = None
data2 = None
# In[18]:
# Standardize the features X
def standardize(data_X, standard_scaler, fit):
    if fit:
        # fit_transform = standard_scaler.fit(...) followed by standard_scaler.transform(...)
        data_X = standard_scaler.fit_transform(data_X)
    else:
        data_X = standard_scaler.transform(data_X)
    data_X = pd.DataFrame(data_X)
    data_X.columns = X_cols
    return data_X
def destandardize(data_X, standard_scaler):
    # Invert the standardization: x = z * sigma + mu
    return data_X * np.sqrt(standard_scaler.var_) + standard_scaler.mean_
# In[19]:
standard_scaler1 = StandardScaler()
data1_X = standardize(data1_X,standard_scaler1,fit=True)
standard_scaler2 = StandardScaler()
data2_X = standardize(data2_X,standard_scaler2,fit=True)
# ## Preprocessing summary
# Now that we have pre-processed our training data, it is ready to be used to train our model. We now define a function `preprocess` that performs all these preprocessing steps. The reason is as follows.
#
# After we have trained our model on the training dataset, we want to predict labels for a supposedly unknown, similar dataset.
#
# This supposedly unknown dataset is not preprocessed yet. We have to:
#
# 1) Rename our columns (only for our convenience)
# 2) Clean our data (removing missing values)
# 3) Standardize our data
#
#
#
#
# ⚠️ Note that standardization uses parameters estimated from our training dataset (mean, standard deviation). The unknown similar dataset is assumed to follow the same distribution, so we do not have to re-estimate these parameters (especially since the training dataset is representative). In other words, we do not apply the `fit` method of the `StandardScaler` again, but only the `transform` method.
#
#
#
#
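#
# The small sketch below (on toy arrays, not on our data) illustrates this rule: the scaler is fitted on the training data only, and the same fitted scaler is then reused on new data with `transform`.
# In[ ]:
# Toy illustration: fit the scaler on training data only, reuse it on unseen data
toy_train = np.array([[0.0], [10.0], [20.0]])
toy_new = np.array([[5.0], [15.0]])
toy_scaler = StandardScaler().fit(toy_train)  # parameters estimated on the training data
print(toy_scaler.transform(toy_new))          # no new fit: the training mean/std are reused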
# In[20]:
def preprocess_no_scale(data, verbose=False):
    data = rename_cols(data)
    if verbose:
        print("Rename")
        display(data)
    data = remove_empty_lines(data)
    if verbose:
        print("Remove empty")
        display(data)
    data = remove_missing(data)
    if verbose:
        print("Remove missing")
        display(data)
    data_X, data_y = separe_Xy(data)
    if verbose:
        print("Separate X and y")
        display(data_X)
        display(data_y)
    return data_X, data_y
def preprocess_scale(data_X, data_y, standard_scaler, fit, verbose=False):
    data_X = standardize(data_X, standard_scaler, fit)
    if verbose:
        print("Standardize X")
        display(data_X)
        display(data_y)
    return data_X, data_y
def preprocess(data, standard_scaler, fit, verbose=False):
    data_X, data_y = preprocess_no_scale(data, verbose)
    return preprocess_scale(data_X, data_y, standard_scaler, fit, verbose)
# # A first model
#
# We will create our models: Random Forest regressors (one model for each turbine). To do this, we instantiate the models, then train them on our training data via the `fit` method.
#
# ## Models creation and training
# In[21]:
def create_and_fit_model(data_X, data_y, model_name, **hyperparameters):
    regressor = model_name(**hyperparameters)
    regressor.fit(data_X, data_y.values.ravel())
    return regressor
regressor1 = create_and_fit_model(data1_X, data1_y , RandomForestRegressor)
regressor2 = create_and_fit_model(data2_X, data2_y , RandomForestRegressor)
# ## Testing our models
#
# Now it's time to test our models. For this we will use the test data that we have ignored so far. We do not know this data. The only thing we assume is that the test dataset for turbine 1 (respectively, turbine 2) follows the same probability distribution as the training data for turbine 1 (respectively, turbine 2). These are the distributions our models try to estimate, and this is what allows us to make predictions on the unknown test data.
#
# ### Pre-processing of test data
#
# Our test data has not been pre-processed. However, it must be in the same format as the training data that fed our model. In other words, our test data must not contain any missing values, and the feature values must be standardized using the training parameters (the means and standard deviations of the training data, assumed to be representative of any data coming from the wind turbine in question).
# In[22]:
# Preprocess our test data (no fit !)
data1_X_test, data1_y_test = preprocess(data1_test, standard_scaler1, fit=False)
data2_X_test, data2_y_test = preprocess(data2_test, standard_scaler2, fit=False)
# ### Making predictions
#
# We can now make our predictions.
# In[23]:
# Make predictions
data1_y_test_pred = regressor1.predict(data1_X_test)
data2_y_test_pred = regressor2.predict(data2_X_test)
# ## Performance evaluation.
#
#
# ### Defining the metrics
#
# In order to evaluate the success of our predictions, we need to define performance metrics. Since we are dealing with a regression problem, our metrics will measure a distance between the prediction and the true value of the label. For $n$ individuals, with $y_i$ a true label of the individual $i$, $\hat{y}_i$ the predicted label for $i$ and $\bar{y}$ the mean of values, we will choose the following metrics:
#
# - R2 Score : score at most 1 (the best possible score is 1; it can become negative for very poor models)
#
# $R^2 = 1 - \dfrac{\sum^{n}_{i=1}{(y_i - \hat{y}_i)^2}}{\sum^{n}_{i=1}{(y_i - \bar{y})^2}}$
# - Mean Absolute Error : error in the same unit as the label (the best possible score is 0)
#
# $MAE = \dfrac{\sum^{n}_{i=1}{|y_i - \hat{y}_i|}}{n} $
# - Root Mean Squared Error : error in the same unit as the label (the best possible score is 0)
#
# $RMSE = \displaystyle\sqrt{\dfrac{\sum^{n}_{i=1}{(y_i - \hat{y}_i)^2}}{n}}$
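#
# To make these formulas concrete, here is a minimal sketch (on arbitrary toy values, not on our data) computing the three metrics directly with NumPy and comparing them with the scikit-learn functions imported above.
# In[ ]:
# Toy check of the three metrics against their formulas
y_true_toy = np.array([100.0, 200.0, 300.0])
y_pred_toy = np.array([110.0, 190.0, 330.0])
r2_manual = 1 - np.sum((y_true_toy - y_pred_toy)**2) / np.sum((y_true_toy - y_true_toy.mean())**2)
mae_manual = np.mean(np.abs(y_true_toy - y_pred_toy))
rmse_manual = np.sqrt(np.mean((y_true_toy - y_pred_toy)**2))
print(r2_manual, r2_score(y_true_toy, y_pred_toy))
print(mae_manual, mean_absolute_error(y_true_toy, y_pred_toy))
print(rmse_manual, mean_squared_error(y_true_toy, y_pred_toy, squared=False))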
#
#
# ### Compute scores
#
# We will now calculate the scores of our predictions for each of the models.
# In[24]:
scores_keys = ['R2 Score ', 'Mean Absolute Error (kW) ', 'Root Mean Squared Error (kW) ']
def compute_scores(y_pred, y_true):
    return {
        scores_keys[0]: r2_score(y_true, y_pred),
        scores_keys[1]: mean_absolute_error(y_true, y_pred),
        scores_keys[2]: mean_squared_error(y_true, y_pred, squared=False)
    }
def scores_tostr(y_pred, y_true):
    s = ''
    scores = compute_scores(y_pred, y_true)
    for idx, score in scores.items():
        s = s + str(idx) + ': \t' + str(score) + '\n'
    return s
sco = lambda x: scores_tostr(x[0], x[1])
print_two(sco, 'Scores', (data1_y_test_pred, data1_y_test), (data2_y_test_pred, data2_y_test))
# ### Plot predictions
#
# Finally, we will plot the actual labels of the test data and the labels predicted by our model as a function of the "speed" feature. We choose speed because it is the feature most correlated to the label value.
# In[25]:
def plot_predictions(scores1, data1_X_test, data1_y_test, data1_y_test_pred, standard_scaler1, model_name1,
                     scores2, data2_X_test, data2_y_test, data2_y_test_pred, standard_scaler2, model_name2,
                     feature="Speed"):
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 8))
    # De-standardize the features so that the x axis is in the original unit
    if standard_scaler1 is not None:
        df1 = destandardize(data1_X_test, standard_scaler1)
    else:
        df1 = data1_X_test
    if standard_scaler2 is not None:
        df2 = destandardize(data2_X_test, standard_scaler2)
    else:
        df2 = data2_X_test
    s = 5
    ax1.set_title("Dataset 1")
    ax1.scatter(df1[feature], data1_y_test, c="blue", alpha=0.7, label="Real values", s=s)
    ax1.scatter(df1[feature], data1_y_test_pred, c="green", alpha=0.7, label="Predicted values", s=s)
    ax1.set_ylabel("Power")
    ax1.set_xlabel(feature)
    ax1.text(13, 200, f"R2 Score : {round(scores1[scores_keys[0]], 3)}")
    ax1.text(13, 100, f"MAE : {round(scores1[scores_keys[1]], 3)} kW")
    ax1.text(13, 0, f"RMSE : {round(scores1[scores_keys[2]], 3)} kW")
    ax2.set_title("Dataset 2")
    ax2.scatter(df2[feature], data2_y_test, c="red", alpha=0.7, s=s, label="Real values")
    ax2.scatter(df2[feature], data2_y_test_pred, c="green", alpha=0.7, s=s)
    ax2.set_xlabel(feature)
    ax2.set_ylabel("Power")
    ax2.text(13, 200, f"R2 Score : {round(scores2[scores_keys[0]], 3)}")
    ax2.text(13, 100, f"MAE : {round(scores2[scores_keys[1]], 3)} kW")
    ax2.text(13, 0, f"RMSE : {round(scores2[scores_keys[2]], 3)} kW")
    fig.suptitle(f"{model_name1} predictions")
    fig.legend()
    fig.show()
model_name = 'Random Forest'
plot_predictions(compute_scores(data1_y_test_pred,data1_y_test), data1_X_test,data1_y_test,data1_y_test_pred,standard_scaler1, model_name,
compute_scores(data2_y_test_pred,data2_y_test), data2_X_test,data2_y_test,data2_y_test_pred,standard_scaler2, model_name )
# ### The K-Folds technique
#
#
# Measured performance may be biased. Indeed, the training data and the test data were separated at random; perhaps this particular split was, by chance, favourable and produced such a score. To correct this, we use the K-folds technique.
#
# First, the dataset is randomly split into k folds. The model is tested on one fold while the remaining k-1 folds serve as training data. We then iterate, each time changing the fold used as test data and recording the scores obtained.
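#
# As a small illustration (on a toy index array, not on our datasets), `KFold` simply produces k train/test index splits:
# In[ ]:
# Toy illustration of KFold: each iteration yields train and test indices
toy_indices = np.arange(10)
for fold_train_idx, fold_test_idx in KFold(n_splits=5, shuffle=True, random_state=0).split(toy_indices):
    print("train:", fold_train_idx, "test:", fold_test_idx)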
#
#
#
# ⚠️ **Important remark:** we do not preprocess the data before separating them into $k$ folds. Indeed, if we preprocess our data before the separation, the standardization would estimate parameters $\mu$ and $\sigma$ from the data set. Once this is done, we would separate the data into $k$ folds. One of the folds would serve as a test fold. However, the data of the test fold (supposedly unknown) were taken into account in the calculation of $\mu$ and $\sigma$. There would then be a data leak!
#
# Thus, in our `KFolds_validation` function we use non-preprocessed data that we separate into $k$ folds. Within each iteration, we preprocess the training folds: they are standardized. There is no information leakage because the standardization is done on the training folds. After that, we preprocess the test fold by standardizing from the parameters of the training folds (in order to have the test data in the same format), then we make our predictions.
#
#
#
#
#
# In[26]:
# K-fold cross-validation (with k the number of splits)
def KFolds_validation(data, k, model_name, **hyperparameters):
    folds = KFold(n_splits=k, shuffle=True, random_state=10)
    folds_scores = []
    for train_index, valid_index in folds.split(data):
        # Split data
        data_train = data.iloc[train_index]
        data_valid = data.iloc[valid_index]
        # Preprocessing (the scaler is fitted on the training folds only)
        standard_scaler = StandardScaler()
        data_X_train, data_y_train = preprocess(data_train, standard_scaler, fit=True)
        # Build model
        regressor = create_and_fit_model(data_X_train, data_y_train, model_name, **hyperparameters)
        # Preprocess the validation fold (transform only, no fit)
        data_X_valid, data_y_valid = preprocess(data_valid, standard_scaler, fit=False)
        # Predict on the validation fold
        data_y_valid_pred = regressor.predict(data_X_valid)
        # Compute scores
        folds_scores += [compute_scores(data_y_valid_pred, data_y_valid)]
    return folds_scores
# In[27]:
data1 = data1_train_original.copy() # no preprocessing
data2 = data2_train_original.copy() # no preprocessing
folds_scores1 = KFolds_validation(data1,k=10, model_name=RandomForestRegressor)
folds_scores2 = KFolds_validation(data2,k=10, model_name=RandomForestRegressor)
display_two(lambda x : pd.DataFrame(x), "Folds scores", folds_scores1, folds_scores2 )
# In[28]:
r2_scores1 = [ fold_scores[scores_keys[0]] for fold_scores in folds_scores1 ]
mae1 = [ fold_scores[scores_keys[1]] for fold_scores in folds_scores1 ]
rmse1 = [ fold_scores[scores_keys[2]] for fold_scores in folds_scores1 ]
r2_scores2 = [ fold_scores[scores_keys[0]] for fold_scores in folds_scores2 ]
mae2 = [ fold_scores[scores_keys[1]] for fold_scores in folds_scores2 ]
rmse2 = [ fold_scores[scores_keys[2]] for fold_scores in folds_scores2 ]
folds1 = list(map( lambda x : "F"+str(x) , np.arange(1,len(folds_scores1)+1) ))
folds2 = list(map( lambda x : "F"+str(x) , np.arange(1,len(folds_scores2)+1) ))
fig, axes = plt.subplots(3,2, figsize=(14,14))
r2_min = min( min(r2_scores1), min(r2_scores2))-0.01
mae_min = min( min(mae1), min(mae2))-1
rmse_min = min( min(rmse1), min(rmse2))-1
r2_max = max( max(r2_scores1), max(r2_scores2))+0.01
mae_max = max( max(mae1), max(mae2))+1
rmse_max = max( max(rmse1), max(rmse2))+1
#
axes[0,0].bar(folds1,r2_scores1,
linewidth=1, edgecolor='black', color='blue')
axes[0,0].set_title("Dataset 1")
axes[0,0].set_ylabel("R2 Score")
axes[0,0].set_ylim(r2_min, r2_max)
#
axes[0,1].bar(folds2,r2_scores2,
linewidth=1, edgecolor='black', color='red')
axes[0,1].set_title("Dataset 2")
axes[0,1].set_ylim(r2_min, r2_max)
#
axes[1,0].bar(folds1,mae1,
linewidth=1, edgecolor='black', color='blue')
axes[1,0].set_ylabel("Mean Absolute Error")
axes[1,0].set_ylim(mae_min, mae_max)
#
axes[1,1].bar(folds2,mae2,
linewidth=1, edgecolor='black', color='red')
axes[1,1].set_ylim(mae_min, mae_max)
#
axes[2,0].bar(folds1,rmse1,
linewidth=1, edgecolor='black', color='blue')
axes[2,0].set_ylabel("Root Mean Squared Error")
axes[2,0].set_xlabel("Fold")
axes[2,0].set_ylim(rmse_min, rmse_max)
#
axes[2,1].bar(folds2,rmse2,
linewidth=1, edgecolor='black', color='red')
axes[2,1].set_xlabel("Fold")
axes[2,1].set_ylim(rmse_min, rmse_max)
# Finally, we compute the average scores over the K folds.
# We can see below that they are close to the scores obtained without the K-folds technique, so the earlier result was not just a lucky split.
# In[29]:
def mean_cross(folds_scores):
    scores_list = []
    for key in scores_keys:
        scores_list.append(list(map(lambda x: x[key], folds_scores)))
    scores_means = list(map(lambda x: np.mean(x), scores_list))
    return scores_means
def mean_cross_str(scores_means):
    s = {}
    for i in range(0, len(scores_means)):
        s[scores_keys[i]] = scores_means[i]
    return s
def dict_to_str(dic):
    s = ""
    for key, val in dic.items():
        s += key + ': ' + str(val) + '\n'
    return s
print_two(lambda x : dict_to_str(mean_cross_str(mean_cross(x))),
"Mean score (K-folds)", folds_scores1, folds_scores2)
# ## Choice of hyper-parameters
# Until now, we have used the `RandomForestRegressor` model with its default hyper-parameters.
#
# Our goal is now to study the influence of hyperparameters on the performance of our model.
#
# The `validation_curve` function will use the K-folds technique to compute test scores as a function of the value of the hyperparameter under study.
#
#
#
#
# ⚠️ Important note: in our `KFolds_validation` function we used non-preprocessed data.
# Here, the `validation_curve` function will itself use K-fold cross-validation to split the input data.
#
# Problem: we want to standardize the data inside each iteration to avoid data leakage (see the earlier section on the K-Folds technique).
#
#
# To solve this problem, we will create a pipeline composed of a `StandardScaler` and a `RandomForestRegressor` model. This pipeline is given as a parameter to the function (instead of just the model), so the function applies the whole pipeline at each iteration of the K-fold cross-validation.
#
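#
# For reference, `make_pipeline` (imported above) would build an equivalent pipeline with automatically generated step names; the sketch below only illustrates the two equivalent constructions. We keep the explicitly named version in the code because the step name is needed to address the hyperparameter as `model__<param>`.
# In[ ]:
# Two equivalent ways to build the scaler + model pipeline
ppl_named = Pipeline([('scl', StandardScaler()), ('model', RandomForestRegressor())])
ppl_auto = make_pipeline(StandardScaler(), RandomForestRegressor())
print(ppl_named.named_steps.keys())  # dict_keys(['scl', 'model'])
print(ppl_auto.named_steps.keys())   # dict_keys(['standardscaler', 'randomforestregressor'])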
# ### Tree pruning
#
# The first hyperparameter we will focus on is `min_samples_leaf`, which allows us to prune the trees of our random forest.
# In[30]:
def compute_validation_curve(data1_train, data2_train, model, param_name, param_range):
    data1_X_train, data1_y_train = preprocess_no_scale(data1_train)
    data2_X_train, data2_y_train = preprocess_no_scale(data2_train)
    ppl1 = Pipeline(
        [
            ('scl', StandardScaler()),
            ('model', model())
        ]
    )
    ppl2 = Pipeline(
        [
            ('scl', StandardScaler()),
            ('model', model())
        ]
    )
    train_scores1, valid_scores1 = validation_curve(ppl1,
                                                    data1_X_train, data1_y_train.values.ravel(),
                                                    param_name="model__" + param_name,
                                                    param_range=param_range,
                                                    cv=5, n_jobs=-1)
    train_scores2, valid_scores2 = validation_curve(ppl2,
                                                    data2_X_train, data2_y_train.values.ravel(),
                                                    param_name="model__" + param_name,
                                                    param_range=param_range,
                                                    cv=5, n_jobs=-1)
    mean_valid_scores1 = list(map(np.mean, valid_scores1))
    mean_train_scores1 = list(map(np.mean, train_scores1))
    mean_valid_scores2 = list(map(np.mean, valid_scores2))
    mean_train_scores2 = list(map(np.mean, train_scores2))
    return mean_valid_scores1, mean_train_scores1, mean_valid_scores2, mean_train_scores2
def plot_validation_curve(mean_valid_scores1, mean_train_scores1, mean_valid_scores2, mean_train_scores2, param_name, param_range):
    fig, axs = plt.subplots(1, 2, figsize=(14, 4))
    axs[0].plot(param_range, mean_train_scores1, c="blue", label='Train set')
    axs[0].plot(param_range, mean_valid_scores1, c="green", label='Test set')
    axs[0].set_title("Dataset 1")
    axs[0].set_xlabel(param_name)
    axs[0].set_ylabel("R2 Score")
    x_max = param_range[np.argmax(mean_valid_scores1)]
    axs[0].axvline(x=x_max, color='black', linestyle=(0, (1, 5)), label='Best fit')
    axs[0].legend()
    axs[1].plot(param_range, mean_train_scores2, c="red", label='Train set')
    axs[1].plot(param_range, mean_valid_scores2, c="green", label='Test set')
    axs[1].set_title("Dataset 2")
    axs[1].set_xlabel(param_name)
    axs[1].set_ylabel("R2 Score")
    x_max = param_range[np.argmax(mean_valid_scores2)]
    axs[1].axvline(x=x_max, color='black', linestyle=(0, (1, 5)), label='Best fit')
    axs[1].legend()
    fig.suptitle("Hyperparameter influence (validation curve)")
    fig.show()
    return
# In[31]:
get_ipython().run_cell_magic('time', '', "\ndata1_train = data1_train_original.copy()\ndata2_train = data2_train_original.copy()\n\nmin_samples_leaf = range(1,25,1)\n\nmean_valid_scores1,mean_train_scores1,\\\nmean_valid_scores2,mean_train_scores2 = compute_validation_curve(data1_train,data2_train, \n RandomForestRegressor, \n 'min_samples_leaf', \n min_samples_leaf) \n")
# In[32]:
plot_validation_curve(mean_valid_scores1,
mean_train_scores1,
mean_valid_scores2,
mean_train_scores2,
'Min samples leaf',
min_samples_leaf)
# We find that the `RandomForestRegressor` with fully grown trees (small `min_samples_leaf`) overfits: the training score is very high, but the test score is lower. By pruning the trees of the random forest, the test score increases, then decreases again if the trees are pruned too much (underfitting).
# ### Number of trees
#
# The second hyperparameter we will focus on is `n_estimators`, which sets the number of trees used by the `RandomForestRegressor` model.
#
# In[33]:
get_ipython().run_cell_magic('time', '', "\ndata1_train = data1_train_original.copy()\ndata2_train = data2_train_original.copy()\n\nn_estimators_range = range(1,50,1)\n\nmean_valid_scores1,mean_train_scores1,\\\n mean_valid_scores2,mean_train_scores2 = compute_validation_curve(data1_train,\n data2_train, \n RandomForestRegressor, \n 'n_estimators', \n n_estimators_range) \n")
# In[34]:
plot_validation_curve(mean_valid_scores1,
mean_train_scores1,
mean_valid_scores2,
mean_train_scores2,
'Number of trees',
n_estimators_range)
# As we can see on the graph, adding more trees to the `RandomForestRegressor` model does not increase overfitting. The score stabilizes once the number of trees is large enough.
# ### Automatic search for hyperparameters
#
# We will now implement an automatic search procedure for the best hyperparameters.
# In[35]:
rdf_params = {
'RdForest__n_estimators': sp.stats.randint(1,50),
'RdForest__max_depth' : np.arange(10,15,1),
'RdForest__min_samples_split' : np.arange(30,150),
'RdForest__min_samples_leaf' : np.arange(1,20),
#'RdForest__min_weight_fraction_leaf' : np.arange(0.0,0.1,0.01),
'RdForest__max_features' : ['auto', 'sqrt', 'log2'],
'RdForest__max_leaf_nodes' : np.arange(20,100,1),
#'RdForest__min_impurity_decrease' : np.arange(0,100,1),
'RdForest__ccp_alpha' : np.arange(0,100,1),
}
def automatic_search(data1_train, data2_train, model, model_name, model_params, randomized=True):
    data1_X_train, data1_y_train = preprocess_no_scale(data1_train)
    data2_X_train, data2_y_train = preprocess_no_scale(data2_train)
    # Pipelines (scaler + model, so standardization happens inside each CV split)
    ppl1 = Pipeline([('scl', StandardScaler()), (model_name, model())])
    ppl2 = Pipeline([('scl', StandardScaler()), (model_name, model())])
    # KFold
    kfold = KFold(n_splits=5, shuffle=True)
    # Search strategy
    searchCV = RandomizedSearchCV
    if not randomized:
        searchCV = GridSearchCV
    rds1 = searchCV(
        ppl1,          # model (pipeline)
        model_params,  # parameter dictionary
        cv=kfold,      # cross-validation folds
        refit=True,    # after finding the best params, refit the model on the whole dataset
        verbose=0,     # 3 prints details, False/0 prints nothing
        n_jobs=-1,     # -1 uses all processors
        # random_state=75
    )
    rds2 = searchCV(
        ppl2,          # model (pipeline)
        model_params,  # parameter dictionary
        cv=kfold,      # cross-validation folds
        refit=True,    # after finding the best params, refit the model on the whole dataset
        verbose=0,     # 3 prints details, False/0 prints nothing
        n_jobs=-1,     # -1 uses all processors
        # random_state=75
    )
    rds1.fit(data1_X_train, data1_y_train.values.ravel())
    rds2.fit(data2_X_train, data2_y_train.values.ravel())
    # Return the fitted search objects (best estimators refit on the full training data)
    return rds1, rds2
# In[36]:
get_ipython().run_cell_magic('time', '', "\ndata1_train = data1_train_original.copy()\ndata2_train = data2_train_original.copy()\n\nmodel_name = 'RdForest'\nrf_rds1,rf_rds2 = automatic_search(data1_train,data2_train,RandomForestRegressor,model_name,rdf_params)\n")
# In[37]:
print_two(lambda x : dict_to_str(x),"Best hyperparameters" , rf_rds1.best_params_, rf_rds2.best_params_)
# In[38]:
data1_test = data1_test_original.copy()
data2_test = data2_test_original.copy()
data1_X_test, data1_y_test = preprocess_no_scale(data1_test)
data2_X_test, data2_y_test = preprocess_no_scale(data2_test)
data1_y_test_pred = rf_rds1.predict(data1_X_test)
data2_y_test_pred = rf_rds2.predict(data2_X_test)
plot_predictions(compute_scores(data1_y_test_pred,data1_y_test), data1_X_test,data1_y_test,data1_y_test_pred,None, model_name,
compute_scores(data2_y_test_pred,data2_y_test), data2_X_test,data2_y_test,data2_y_test_pred,None, model_name)
# # Model comparison
#
# Now that the typical steps of solving a regression problem have been discussed, we are equipped to compare different models in terms of their performance.
#
# If there are only a few hyperparameter values to test, we can perform an exhaustive search for the best hyperparameters with `GridSearchCV`. Otherwise, it is preferable to run a randomized search with `RandomizedSearchCV`; in that case, we plot the influence of the hyperparameters on the model's performance with a validation curve, as before *(cf. pruning and number of trees for Random Forest)*.
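#
# The main practical difference lies in how the parameter space is described: `GridSearchCV` takes explicit lists of values and tries them exhaustively, whereas `RandomizedSearchCV` can also take distributions and samples `n_iter` candidates from them. The sketch below is only an illustration; the parameter values are arbitrary examples, not the ones used in the searches that follow.
# In[ ]:
# Illustrative parameter spaces (arbitrary example values)
grid_params_example = {'model__n_estimators': [10, 50, 100]}                 # exhaustive: 3 candidates
random_params_example = {'model__n_estimators': sp.stats.randint(1, 200)}    # sampled: n_iter candidates
ppl_example = Pipeline([('scl', StandardScaler()), ('model', RandomForestRegressor())])
grid_example = GridSearchCV(ppl_example, grid_params_example, cv=3)
random_example = RandomizedSearchCV(ppl_example, random_params_example, n_iter=10, cv=3)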
# In[39]:
def full_auto_search_print_plot(model, model_name, params, randomized):
    data1_train = data1_train_original.copy()
    data2_train = data2_train_original.copy()
    rds1, rds2 = automatic_search(data1_train, data2_train, model, model_name, params,
                                  randomized=randomized)
    print_two(lambda x: dict_to_str(x), "Best hyperparameters", rds1.best_params_, rds2.best_params_)
    # Make predictions
    data1_test = data1_test_original.copy()
    data2_test = data2_test_original.copy()
    data1_X_test, data1_y_test = preprocess_no_scale(data1_test)
    data2_X_test, data2_y_test = preprocess_no_scale(data2_test)
    data1_y_test_pred = rds1.predict(data1_X_test)
    data2_y_test_pred = rds2.predict(data2_X_test)
    plot_predictions(compute_scores(data1_y_test_pred, data1_y_test), data1_X_test, data1_y_test, data1_y_test_pred, None, model_name,
                     compute_scores(data2_y_test_pred, data2_y_test), data2_X_test, data2_y_test, data2_y_test_pred, None, model_name)
    return rds1, rds2
def full_validation_curve(model, param_name, param_range):
    data1_train = data1_train_original.copy()
    data2_train = data2_train_original.copy()
    mean_valid_scores1, mean_train_scores1,\
    mean_valid_scores2, mean_train_scores2 = compute_validation_curve(data1_train,
                                                                      data2_train,
                                                                      model,
                                                                      param_name,
                                                                      param_range)
    plot_validation_curve(mean_valid_scores1,
                          mean_train_scores1,
                          mean_valid_scores2,
                          mean_train_scores2,
                          param_name,
                          param_range)
    return
estimators1 = []
estimators2 = []
estimators_names = []
estimators1.append(rf_rds1)
estimators2.append(rf_rds2)
estimators_names.append(model_name)
# ## Linear Regressors
#
# ### Linear Regression
# In[40]:
# Find best params
lin_reg_params = {
'Linear__fit_intercept' : [True,False] ,
}
model_name = 'Linear'
lin_rds1,lin_rds2 = full_auto_search_print_plot(LinearRegression,
model_name,
lin_reg_params,
randomized=False)
estimators1.append(lin_rds1)
estimators2.append(lin_rds2)
estimators_names.append(model_name)
# ### Ridge Regression
#
# In[41]:
full_validation_curve(Ridge,"alpha",np.arange(0,1000,10))
# In[42]:
# Find best params
ridge_params = {
'Ridge__fit_intercept' : [True,False] ,
'Ridge__alpha' : np.arange(0,300) ,
}
model_name = 'Ridge'
ridge_rds1,ridge_rds2 = full_auto_search_print_plot(Ridge,
model_name,
ridge_params,
randomized=True)
estimators1.append(ridge_rds1)
estimators2.append(ridge_rds2)
estimators_names.append(model_name)
# ### Lasso Regression
# In[43]:
full_validation_curve(Lasso,"alpha",np.arange(0,100,10))
# In[44]:
# Find best params
lasso_params = {
    'Lasso__fit_intercept': [True, False],
    'Lasso__alpha': np.arange(0, 30, 0.01),
    'Lasso__max_iter': np.arange(100, 10000, 100),
    'Lasso__tol': np.arange(0.00001, 0.1, 0.001),
    'Lasso__selection': ['cyclic', 'random']
}
model_name = 'Lasso'
lasso_rds1,lasso_rds2 = full_auto_search_print_plot(Lasso,
model_name,
lasso_params,
randomized=True)
estimators1.append(lasso_rds1)
estimators2.append(lasso_rds2)
estimators_names.append(model_name)
# ### SGD Regression
# In[45]:
full_validation_curve(SGDRegressor,"alpha",np.arange(0.001,10,0.1))
full_validation_curve(SGDRegressor,"l1_ratio",np.arange(0.001,1,0.1))
full_validation_curve(SGDRegressor,"epsilon",np.arange(1,100,2))
full_validation_curve(SGDRegressor,"eta0",np.arange(0.001,1,0.1))
full_validation_curve(SGDRegressor,"power_t",np.arange(0.001,1,0.1))
# In[46]:
SGD_reg_params = {
'SGD__loss' : ['squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
'SGD__penalty' : ['l2', 'l1', 'elasticnet'] ,
'SGD__alpha' : np.arange(0.001,1,0.1),
'SGD__l1_ratio' : np.arange(0.001,1,0.1),
'SGD__epsilon' : np.arange(0.001,1,0.1),
'SGD__power_t' : np.arange(0.001,1,0.1),
'SGD__learning_rate' : ['constant','optimal','invscaling','adaptive'],
}
model_name = 'SGD'
sgd_rds1,sgd_rds2 = full_auto_search_print_plot(SGDRegressor,
model_name,
SGD_reg_params,
randomized=True)
estimators1.append(sgd_rds1)
estimators2.append(sgd_rds2)
estimators_names.append(model_name)
# ## KNN Regressor
#
# First, we define some kernel functions that are not available in the default sklearn library for `KNeighborsRegressor`.
# In[47]:
# Custom kernels (weight functions) for KNN
def gaussian_kernel(distance):
    return np.exp(-distance**2)
def cauchy_kernel(distance):
    dim = 3  # number of features in X
    return 1 / (1 + distance**(dim + 1))
# In[48]:
full_validation_curve(KNeighborsRegressor,"n_neighbors",np.arange(1,100,1))
# In[49]:
KNN_reg_params = {
'KNN__n_neighbors' : np.arange(1,50) ,
'KNN__weights' : ['uniform','distance', gaussian_kernel, cauchy_kernel] ,
'KNN__leaf_size' : np.arange(1,50) ,
'KNN__algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute'],
'KNN__p' : [1,2,3] ,
}
model_name = 'KNN'
knn_rds1,knn_rds2 = full_auto_search_print_plot(KNeighborsRegressor,
model_name,
KNN_reg_params,
randomized=True)
estimators1.append(knn_rds1)
estimators2.append(knn_rds2)
estimators_names.append(model_name)
# ## Decision Tree
# In[50]:
full_validation_curve(DecisionTreeRegressor,"max_depth",np.arange(1,15,1))
full_validation_curve(DecisionTreeRegressor,"min_samples_split",np.arange(2,400,10))
full_validation_curve(DecisionTreeRegressor,"min_samples_leaf",np.arange(1,100,1))
#full_validation_curve(DecisionTreeRegressor,"min_weight_fraction_leaf",np.arange(0.0,0.5,0.01))
full_validation_curve(DecisionTreeRegressor,"max_leaf_nodes",np.arange(2,100,1))
#full_validation_curve(DecisionTreeRegressor,"min_impurity_decrease",np.arange(0,100,10))
full_validation_curve(DecisionTreeRegressor,"ccp_alpha",np.arange(0.0,100,10))
# In[51]:
DT_reg_params = {
'DTree__splitter' : ['best','random'] ,
'DTree__max_depth' : np.arange(10,15,1),
'DTree__max_features' : ['auto', 'sqrt', 'log2'],
'DTree__min_samples_split' : np.arange(30,150),
'DTree__min_samples_leaf' : np.arange(1,20),
#'DTree__min_weight_fraction_leaf':np.arange(0.0,0.1,0.01),
'DTree__max_leaf_nodes':np.arange(20,100,1),
#'DTree__min_impurity_decrease':np.arange(0,100,1),
'DTree__ccp_alpha':np.arange(0,100,1),
}
model_name = 'DTree'
dt_rds1,dt_rds2 = full_auto_search_print_plot(DecisionTreeRegressor,
model_name,
DT_reg_params,
randomized=True)
estimators1.append(dt_rds1)
estimators2.append(dt_rds2)
estimators_names.append(model_name)
# ## ExtraTree Regressor
# In[52]:
full_validation_curve(ExtraTreeRegressor,"max_depth",np.arange(1,15,1))
full_validation_curve(ExtraTreeRegressor,"min_samples_split",np.arange(2,400,10))
full_validation_curve(ExtraTreeRegressor,"min_samples_leaf",np.arange(1,100,1))
#full_validation_curve(ExtraTreeRegressor,"min_weight_fraction_leaf",np.arange(0.0,0.5,0.01))
full_validation_curve(ExtraTreeRegressor,"max_leaf_nodes",np.arange(2,100,1))
#full_validation_curve(ExtraTreeRegressor,"min_impurity_decrease",np.arange(0,100,10))
full_validation_curve(ExtraTreeRegressor,"ccp_alpha",np.arange(0.0,100,10))
# In[53]:
ET_reg_params = {
'ExtraTree__splitter' : ['best','random'] ,
'ExtraTree__max_depth' : np.arange(10,15,1),
'ExtraTree__max_features' : ['auto', 'sqrt', 'log2'],
'ExtraTree__min_samples_split' : np.arange(30,150),
'ExtraTree__min_samples_leaf' : np.arange(1,20),
#'ExtraTree__min_weight_fraction_leaf':np.arange(0.0,0.1,0.01),
'ExtraTree__max_leaf_nodes':np.arange(20,100,1),
#'ExtraTree__min_impurity_decrease':np.arange(0,100,1),
'ExtraTree__ccp_alpha':np.arange(0,100,1),
}
model_name = 'ExtraTree'
ext_rds1,ext_rds2 = full_auto_search_print_plot(ExtraTreeRegressor,
model_name,
ET_reg_params,
randomized=True)
estimators1.append(ext_rds1)
estimators2.append(ext_rds2)
estimators_names.append(model_name)
# ## AdaBoost Regressor
# In[54]:
full_validation_curve(AdaBoostRegressor,"n_estimators",np.arange(1,100))
full_validation_curve(AdaBoostRegressor,"learning_rate",np.arange(0.01,1,0.1))
# In[55]:
AdaB_reg_params = {
'AdaBoost__n_estimators' : np.arange(1,100) ,
'AdaBoost__learning_rate' : np.arange(0.01,1,0.01) ,
}
model_name = 'AdaBoost'
ada_rds1,ada_rds2 = full_auto_search_print_plot(AdaBoostRegressor,
model_name,
AdaB_reg_params,
randomized=True)
estimators1.append(ada_rds1)
estimators2.append(ada_rds2)
estimators_names.append(model_name)
# ## Gradient Boosting
# In[56]:
full_validation_curve(GradientBoostingRegressor,"n_estimators",np.arange(1,100,10))
full_validation_curve(GradientBoostingRegressor,"max_depth",np.arange(1,20,1))
full_validation_curve(GradientBoostingRegressor,"learning_rate",np.arange(0.01,1,0.1))
full_validation_curve(GradientBoostingRegressor,"subsample",np.arange(0.01,1,0.1))
# In[57]:
GradB_reg_params = {
'GradBoost__n_estimators' : np.arange(1,100) ,
'GradBoost__learning_rate' : np.arange(0.01,1,0.01) ,
'GradBoost__max_depth' : np.arange(1,20,1) ,
'GradBoost__subsample' : np.arange(0.01,1,0.01) ,
}
model_name = 'GradBoost'
grb_rds1,grb_rds2 = full_auto_search_print_plot(GradientBoostingRegressor,
model_name,
GradB_reg_params,
randomized=True)
estimators1.append(grb_rds1)
estimators2.append(grb_rds2)
estimators_names.append(model_name)
# # Conclusion
# ## Difficulties encountered
#
# One of the first difficulties was to define good metrics to evaluate the performance of our models. After reflection we chose three metrics:
# - the R2 score, which is easy to interpret (we want to be as close as possible to 1);
# - the MAE and RMSE, which are expressed in the same unit as the label to predict, so the gap between predictions and real values is easy to grasp. The RMSE is more sensitive to large individual errors (see the small numeric illustration below).
#
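# A small numeric illustration of this sensitivity (arbitrary toy errors): with the same MAE, a single large error raises the RMSE noticeably.
# In[ ]:
# Toy illustration: one large error affects the RMSE more than the MAE
errors_uniform = np.array([10.0, 10.0, 10.0, 10.0])
errors_outlier = np.array([1.0, 1.0, 1.0, 37.0])   # same MAE (10), one large error
for err in (errors_uniform, errors_outlier):
    print("MAE:", np.mean(np.abs(err)), "\tRMSE:", round(np.sqrt(np.mean(err**2)), 2))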
# Another difficulty was choosing the ranges of hyperparameter values to test. We finally decided to first draw validation curves in order to get an idea of how the score varies with the value of a hyperparameter.
# ## Best predictive performance
# In[58]:
best_scores1 = list(map(lambda x : x.best_score_ , estimators1))
best_scores2 = list(map(lambda x : x.best_score_ , estimators2))
# In[59]:
plt.figure( figsize=(10,4))
plt.scatter(estimators_names,best_scores1, label='Turbine 1',
s=150, marker='^', linewidth=1, edgecolor='black', color='blue')
plt.scatter(estimators_names,best_scores2, label='Turbine 2',
s=150, marker='^', linewidth=1, edgecolor='black', color='red')
plt.axhline(y=0.95, color='r', linestyle='--', label='R2 Score = 95%')
plt.legend(fancybox=True, bbox_to_anchor=(1.5, 1.1), ncol=1, framealpha=1, shadow=True, borderpad=1)
plt.title("Comparison of estimators")
plt.ylabel("R2 Score")
plt.show()
# As expected, the linear models perform less well because the relationship between the features and the label is not linear. On the other hand, they reach a decent score (R2 > 80%) at a very low cost, and their predictions are easy to interpret (linear relationship between wind speed and the electrical power produced).
#
# The non-linear models perform much better: the GradientBoosting and KNN models obtain the best scores.
#
# No model dominates the others by a wide margin.
# ## Comparison of wind turbines
#
# The data of each wind turbine has its own behaviour, but the two turbines are similar in many respects:
# - the electrical power is almost entirely explained by the wind speed;
# - the shape of the power-versus-speed distribution is similar for the two datasets.
#
# However, each turbine has its own characteristics: turbine 1 shows two power regimes at very high speed (visible as two horizontal lines of points), whereas the power produced by turbine 2 follows its "natural" curve. As a result, the models perform better on the dataset of turbine 2 than on that of turbine 1 (as can be seen in the previous graph).
#
# Still, no model is better suited to one turbine than to the other: the best performing models on one are also the best performing models on the other.
#
#
# ## Influence of hyperparameters
#
# In the previous sections we examined several validation curves. What emerges is that, for the linear models, the smaller the `alpha` hyperparameter, the better the scores; however, if it is too small, the model can overfit the training data. We observe the same phenomenon for the `KNN` model with the hyperparameter `n_neighbors` (the number of neighbors), for the parameters `min_samples_leaf`, `min_samples_split` and `ccp_alpha` of `DecisionTree`, `ExtraTree` and `RandomForest`, and for `max_depth` of GradientBoosting.
#
# The other hyperparameters do not seem to cause overfitting (the score evolves in a similar way on the training and test data).
#
#
#
#
# # Appendix : Statement
#
# **Homework 1**
#
# To be done in groups of 2 students.
#
# **Predicting wind energy**
#
#
# The two datasets contain the following features.
#
# - P : electrical power in kW.
#
# - V : wind speed in $m.s^{-1}$.
#
# - Dir : wind direction in degrees °.
#
# - T : temperature in °C.
#
#
# **Your job**
#
# - Try to understand the data, visualize it.
# - The main aim is then to predict the electrical power produced by the wind turbines.
#
# **We want to see the following things**
#
# - Prediction results and error obtained for several machine learning methods for the two wind turbines.
# - A discussion about the impact of the hyperparameters of the methods and their calibration on your predictions.
# - A clear comparison between your solutions (table, and/or plots comparing the results obtained with different methods).
# - Comment about the comparison of both wind turbines.
# - A discussion about the difficulties encountered if applicable.
# - Aim : best possible predictive performance, in a clean notebook, with clear graphs and explanations.
#
# **Hand in your assignment before 18/10/21 23h55:**
#
# - via Moodle for all Université de Paris students
# - via email if Moodle not possible for Paris 1 students, subject [Machine Learning DM1], aurelie.fischer@u-paris.fr
#
#
#
#