#!/usr/bin/env python
# coding: utf-8

# _This notebook contains code and comments from Section 7.4 of the book [Ensemble Methods for Machine Learning](https://www.manning.com/books/ensemble-methods-for-machine-learning). Please see the book for additional details on this topic. This notebook and code are released under the [MIT license](https://github.com/gkunapuli/ensemble-methods-notebooks/blob/master/LICENSE)._
# 
# ## 7.4 Case Study: Demand Forecasting

# Demand forecasting is an important problem that arises in many business contexts, where the goal is to predict the demand for a certain product or commodity. Accurately predicting demand is critical for downstream supply-chain management and optimization: we want enough supply to meet demand, but not so much that there is waste.
# 
# In this section, we study the problem of bike rental forecasting. As we will see, the nature of the problem (and especially of the targets/labels) is quite similar to problems arising in weather prediction and analytics, insurance and risk analytics, health informatics, energy demand forecasting, business intelligence, and many other areas.
# 
# The Bike Rental data set was the first of several similar publicly available data sets that track the usage of bicycle-sharing services in major metropolitan areas. These data sets are made publicly available through the UCI Machine Learning Repository.
# 
# This data set, first made available in 2013, tracks hourly and daily bicycle rentals of Capital Bike Sharing in Washington, DC. In addition, the data set contains several features describing the weather as well as the time of day and day of the year.

# The overall goal is to predict bike rental demand depending on the time of day, the season, and the weather. The demand is measured as the total number of users, a count! The total number of users is further composed of casual and registered users.
# 
# The number of registered users appears to be fairly consistent across the year, since these are users who presumably use bike sharing as a regular transportation option rather than a recreational activity. This is akin to commuters who have a monthly/annual bus pass for their daily commutes as opposed to, say, tourists, who only buy bus tickets as needed.

# Keeping this in mind, we construct a derived data set for our case study that can be used to build a model to forecast rental bike demand among casual users.
# 
# **Problem**: Predict the number of casual bike rental users depending on the time of day, season, and weather.

# In[1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# In[2]:

# # Load the original UCI data set, drop some columns and save it for this case study
# data = pd.read_csv('./data/ch07/bikesharing-hour.csv')
# data = data.drop(['dteday', 'instant', 'yr', 'registered', 'cnt'], axis=1)
# data.to_csv('./data/ch07/bikesharing.csv', index=False)

# ### 7.4.1 Preprocessing

# Let's preprocess this data set by normalizing the features, that is, by ensuring that each feature has zero mean and unit standard deviation. Normalization is not always the best approach for discrete features. For now, though, let's use this simple preprocessing and keep our focus on ensembles for regression. In Chapter 8, we delve deeper into preprocessing strategies for these types of features.
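# As an aside (not part of the book's listings), one alternative to blanket normalization is to one-hot encode the categorical columns and standardize only the continuous ones. The sketch below illustrates the idea with scikit-learn's `ColumnTransformer`; the split of columns into categorical and continuous is an assumption based on the UCI feature descriptions, and Chapter 8 discusses such strategies properly.

# In[ ]:

# (Optional sketch, not in the book.) Mixed preprocessing: one-hot encode the
# categorical columns, standardize the continuous ones. The column split below
# is an assumption based on the UCI bike sharing feature descriptions.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

_demo = pd.read_csv('./data/ch07/bikesharing.csv')
categorical_cols = ['season', 'mnth', 'hr', 'holiday', 'weekday', 'workingday', 'weathersit']
continuous_cols = ['temp', 'atemp', 'hum', 'windspeed']

mixed_preprocessor = ColumnTransformer(
    transformers=[('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
                  ('scale', StandardScaler(), continuous_cols)])
X_demo = mixed_preprocessor.fit_transform(_demo[categorical_cols + continuous_cols])
print(X_demo.shape)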
# 
# The listing below shows our preprocessing steps: it splits the data into training (80% of the data) and test (remaining 20%) sets and normalizes the features. As always, we hold out the test set from the training process so that we can evaluate the performance of each of our trained models on it.

# In[3]:

data = pd.read_csv('./data/ch07/bikesharing.csv')
pd.options.display.float_format = '{:,.3f}'.format
data.describe()

# **Listing 7.8**: Preprocessing the Bike Rental Data Set

# In[4]:

# Get indices for the features and labels
labels = data.columns.get_loc('casual')
features = np.setdiff1d(np.arange(0, len(data.columns), 1), labels)

# Split into train and test sets
from sklearn.model_selection import train_test_split
trn, tst = train_test_split(data, test_size=0.2, random_state=42)
Xtrn, ytrn = trn.values[:, features], trn.values[:, labels]
Xtst, ytst = tst.values[:, features], tst.values[:, labels]

# Normalize the features to zero mean and unit standard deviation
from sklearn.preprocessing import StandardScaler
preprocessor = StandardScaler().fit(Xtrn)
Xtrn, Xtst = preprocessor.transform(Xtrn), preprocessor.transform(Xtst)

# In[5]:

# Plot the distribution of the count targets y and of their log-transform, log(1 + y)
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(9, 4))

counts, bins = np.histogram(ytrn, bins=25)
ax[0].hist(bins[:-1], bins, weights=counts, rwidth=0.75, edgecolor='k')
ax[0].set_title('Distribution of count targets, $y$', fontsize=12)
ax[0].set_xlabel('Number of bikes rented, y', fontsize=12)
ax[0].set_ylabel('Number of training examples', fontsize=12)

counts, bins = np.histogram(np.log(1 + ytrn), bins=25)
ax[1].hist(bins[:-1], bins, weights=counts, rwidth=0.75, edgecolor='k')
ax[1].set_title('Distribution of log-transform of count targets, $log(1 + y)$', fontsize=12)
ax[1].set_xlabel('log(Number of bikes rented), $log(1 + y)$', fontsize=12)
ax[1].set_ylabel('Number of training examples', fontsize=12)

fig.tight_layout()
# plt.savefig('./figures/CH07_F13_Kunapuli.png', format='png', dpi=300, bbox_inches='tight')
# plt.savefig('./figures/CH07_F13_Kunapuli.pdf', format='pdf', dpi=300, bbox_inches='tight')

# ---
# ### 7.4.2 Generalized Linear Models and Stacking
# 
# **Listing 7.9**: Training GLMs for Bike Rental Prediction

# In[6]:

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import Ridge, PoissonRegressor, TweedieRegressor

# Hyperparameter grids for each GLM: the regularization strength alpha and,
# for the Tweedie GLM, the variance power
parameters = {'GLM: Linear': {'alpha': 10 ** np.arange(-4., 1.)},
              'GLM: Poisson': {'alpha': 10 ** np.arange(-4., 1.)},
              'GLM: Tweedie': {'alpha': 10 ** np.arange(-4., 1.),
                               'power': np.linspace(1.1, 1.9, num=5)}}

glms = {'GLM: Linear': Ridge(),
        'GLM: Poisson': PoissonRegressor(max_iter=1000),
        'GLM: Tweedie': TweedieRegressor(max_iter=1000)}

best_glms = {}
results = pd.DataFrame()

for glm_type, glm in glms.items():
    # Tune each GLM with 5-fold cross-validation and refit on the full training set
    param_tuner = GridSearchCV(glm, parameters[glm_type], cv=5, refit=True, verbose=2)
    param_tuner.fit(Xtrn, ytrn)
    best_glms[glm_type] = param_tuner.best_estimator_

    # Evaluate the tuned GLM on the train and test sets
    ypred_trn = best_glms[glm_type].predict(Xtrn)
    ypred_tst = best_glms[glm_type].predict(Xtst)
    res = {'Method': glm_type,
           'Train MSE': mean_squared_error(ytrn, ypred_trn),
           'Train MAE': mean_absolute_error(ytrn, ypred_trn),
           'Train R2': r2_score(ytrn, ypred_trn),
           'Test MSE': mean_squared_error(ytst, ypred_tst),
           'Test MAE': mean_absolute_error(ytst, ypred_tst),
           'Test R2': r2_score(ytst, ypred_tst)}
    results = pd.concat([results, pd.DataFrame([res])], ignore_index=True)

# In[7]:

results
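# Since the targets are counts, it can also be informative to report the mean Poisson deviance, the loss that the Poisson GLM optimizes, alongside MSE, MAE, and R2. This is not one of the book's listings; the short sketch below uses scikit-learn's `mean_poisson_deviance`, which requires strictly positive predictions, to score the tuned GLMs above.

# In[ ]:

# (Optional sketch, not in the book.) Test-set mean Poisson deviance of the tuned GLMs.
# Predictions are clipped away from zero because the Poisson deviance requires
# strictly positive predicted values (the linear GLM can predict negative counts).
from sklearn.metrics import mean_poisson_deviance

for glm_type, glm in best_glms.items():
    ypred = np.maximum(glm.predict(Xtst), 1e-6)
    print('{0}: test mean Poisson deviance = {1:.3f}'.format(
        glm_type, mean_poisson_deviance(ytst, ypred)))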
# **Listing 7.10**: Stacking GLMs for Bike Rental Prediction

# In[8]:

from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import StackingRegressor

# Stack the three tuned GLMs, with a small neural network as the meta-learner
base_estimators = list(best_glms.items())
meta_learner = MLPRegressor(hidden_layer_sizes=(25, 25, 25), max_iter=1000, activation='relu')

stack = StackingRegressor(base_estimators, final_estimator=meta_learner)
stack.fit(Xtrn, ytrn)

ypred_trn = stack.predict(Xtrn)
ypred_tst = stack.predict(Xtst)
res = {'Method': 'GLM Stack',
       'Train MSE': mean_squared_error(ytrn, ypred_trn),
       'Train MAE': mean_absolute_error(ytrn, ypred_trn),
       'Train R2': r2_score(ytrn, ypred_trn),
       'Test MSE': mean_squared_error(ytst, ypred_tst),
       'Test MAE': mean_absolute_error(ytst, ypred_tst),
       'Test R2': r2_score(ytst, ypred_tst)}
results = pd.concat([results, pd.DataFrame([res])], ignore_index=True)

# In[9]:

results

# ---
# ### 7.4.3 Random Forest and ExtraTrees
# 
# **Listing 7.11**: Random Forest and ExtraTrees for Bike Rental Prediction

# In[10]:

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor

# Shared hyperparameter grid: ensemble size and maximum tree depth
parameters = {'n_estimators': np.arange(200, 600, step=100),
              'max_depth': np.arange(4, 7, step=1)}
print(parameters)

ensembles = {'RF: Squared Error': RandomForestRegressor(criterion='squared_error'),
             'RF: Poisson': RandomForestRegressor(criterion='poisson'),
             'XT: Squared Error': ExtraTreesRegressor(criterion='squared_error'),
             'XT: Poisson': ExtraTreesRegressor(criterion='poisson')}

for ens_type, ensemble in ensembles.items():
    # Tune each ensemble with 5-fold cross-validation and refit on the full training set
    param_tuner = GridSearchCV(ensemble, parameters, cv=5, refit=True, verbose=2)
    param_tuner.fit(Xtrn, ytrn)

    # Evaluate the tuned ensemble on the train and test sets
    ypred_trn = param_tuner.best_estimator_.predict(Xtrn)
    ypred_tst = param_tuner.best_estimator_.predict(Xtst)
    res = {'Method': ens_type,
           'Train MSE': mean_squared_error(ytrn, ypred_trn),
           'Train MAE': mean_absolute_error(ytrn, ypred_trn),
           'Train R2': r2_score(ytrn, ypred_trn),
           'Test MSE': mean_squared_error(ytst, ypred_tst),
           'Test MAE': mean_absolute_error(ytst, ypred_tst),
           'Test R2': r2_score(ytst, ypred_tst)}
    results = pd.concat([results, pd.DataFrame([res])], ignore_index=True)

# In[11]:

results

# ### 7.4.4 Gradient Boosting Models
# 
# **Note**: When executed, the parameter selection below occasionally produces a ``FitFailedWarning`` for some parameter combinations. This is a known bug in xgboost's implementation of the ``poisson-nloglik`` evaluation metric and can be ignored for now.
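# If those warnings clutter the output, they can be silenced explicitly. This snippet is not part of the book's listings; it simply filters scikit-learn's ``FitFailedWarning`` for the searches that follow.

# In[ ]:

# (Optional, not in the book.) Suppress the FitFailedWarning mentioned in the note above.
import warnings
from sklearn.exceptions import FitFailedWarning

warnings.filterwarnings('ignore', category=FitFailedWarning)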
# 
# **Listing 7.12**: XGBoost for Bike Rental Prediction

# In[12]:

from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV

# Hyperparameter distributions for the randomized search
parameters = {'max_depth': np.arange(2, 7, step=1),
              'learning_rate': 2**np.arange(-8., 2., step=2),
              'colsample_bytree': [0.4, 0.5, 0.6, 0.7, 0.8],
              'reg_alpha': [0, 0.01, 0.1, 1, 10],
              'reg_lambda': [0, 0.01, 0.1, 1, 10]}
print(parameters)

ensembles = {'XGB: Squared Error': XGBRegressor(objective='reg:squarederror', eval_metric='poisson-nloglik'),
             'XGB: Pseudo Huber': XGBRegressor(objective='reg:pseudohubererror', eval_metric='poisson-nloglik'),
             'XGB: Poisson': XGBRegressor(objective='count:poisson', eval_metric='poisson-nloglik'),
             'XGB: Tweedie': XGBRegressor(objective='reg:tweedie', eval_metric='poisson-nloglik')}

for ens_type, ensemble in ensembles.items():
    # The Tweedie objective has an extra hyperparameter, the variance power;
    # it is only added to the search space for the (last) Tweedie model
    if ens_type == 'XGB: Tweedie':
        parameters['tweedie_variance_power'] = np.linspace(1.1, 1.9, num=9)

    param_tuner = RandomizedSearchCV(ensemble, parameters, n_iter=50, cv=5, refit=True, verbose=2)
    param_tuner.fit(Xtrn, ytrn, eval_set=[(Xtst, ytst)], verbose=False)

    # Evaluate the tuned model on the train and test sets
    ypred_trn = param_tuner.best_estimator_.predict(Xtrn)
    ypred_tst = param_tuner.best_estimator_.predict(Xtst)
    res = {'Method': ens_type,
           'Train MSE': mean_squared_error(ytrn, ypred_trn),
           'Train MAE': mean_absolute_error(ytrn, ypred_trn),
           'Train R2': r2_score(ytrn, ypred_trn),
           'Test MSE': mean_squared_error(ytst, ypred_tst),
           'Test MAE': mean_absolute_error(ytst, ypred_tst),
           'Test R2': r2_score(ytst, ypred_tst)}
    results = pd.concat([results, pd.DataFrame([res])], ignore_index=True)

# In[13]:

# How many of the test targets are exactly zero?
np.sum(ytst == 0), ytst.shape

# In[14]:

results

# In[15]:

# LightGBM with several different loss functions (objectives)
from lightgbm import LGBMRegressor

parameters = {'max_depth': np.arange(2, 7, step=1),
              'learning_rate': 2**np.arange(-8., 2., step=2),
              'bagging_fraction': [0.4, 0.5, 0.6, 0.7, 0.8],
              'lambda_l1': [0, 0.01, 0.1, 1, 10],
              'lambda_l2': [0, 0.01, 0.1, 1, 10]}
print(parameters)

ensembles = {'LGBM: Squared Error': LGBMRegressor(objective='mse'),
             'LGBM: Absolute Error': LGBMRegressor(objective='mae'),
             'LGBM: Huber': LGBMRegressor(objective='huber'),
             'LGBM: Quantile': LGBMRegressor(objective='quantile'),
             'LGBM: Poisson': LGBMRegressor(objective='poisson'),
             'LGBM: Tweedie': LGBMRegressor(objective='tweedie')}

for ens_type, ensemble in ensembles.items():
    # As above, the Tweedie variance power is only added for the (last) Tweedie model
    if ens_type == 'LGBM: Tweedie':
        parameters['tweedie_variance_power'] = np.linspace(1.1, 1.9, num=9)

    param_tuner = RandomizedSearchCV(ensemble, parameters, n_iter=50, cv=5, refit=True, verbose=2)
    param_tuner.fit(Xtrn, ytrn, eval_set=[(Xtst, ytst)], eval_metric='poisson')

    # Evaluate the tuned model on the train and test sets
    ypred_trn = param_tuner.best_estimator_.predict(Xtrn)
    ypred_tst = param_tuner.best_estimator_.predict(Xtst)
    res = {'Method': ens_type,
           'Train MSE': mean_squared_error(ytrn, ypred_trn),
           'Train MAE': mean_absolute_error(ytrn, ypred_trn),
           'Train R2': r2_score(ytrn, ypred_trn),
           'Test MSE': mean_squared_error(ytst, ypred_tst),
           'Test MAE': mean_absolute_error(ytst, ypred_tst),
           'Test R2': r2_score(ytst, ypred_tst)}
    results = pd.concat([results, pd.DataFrame([res])], ignore_index=True)
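# Which features drive casual demand? The quick sketch below is not part of the book's listings; it assumes the loop above has just finished, so that ``param_tuner`` still holds the tuned LightGBM Tweedie model, and it maps that model's feature importances back to the original column names.

# In[ ]:

# (Optional sketch, not in the book.) Feature importances of the last tuned model
# from the loop above (the LightGBM Tweedie ensemble), indexed by column name.
best_lgbm = param_tuner.best_estimator_
importances = pd.Series(best_lgbm.feature_importances_,
                        index=data.columns[features]).sort_values(ascending=False)
print(importances)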
# ---
# #### Visualizing all the results together as a bar chart

# In[16]:

results

# In[17]:

# Save all the results so that we can plot without re-running the whole set of experiments
results.to_csv('./data/ch07/case-study-results.csv', index=False)

# In[9]:

results = pd.read_csv('./data/ch07/case-study-results.csv')

# Group the methods by model family for plotting
groups = {'Linear\nmodels': ['GLM: Linear', 'GLM: Poisson', 'GLM: Tweedie'],
          'Meta\nlearning': ['GLM Stack'],
          'Random\nforest': ['RF: Squared Error', 'RF: Poisson'],
          'Extra\nTrees': ['XT: Squared Error', 'XT: Poisson'],
          'Gradient\nboosting': ['LGBM: Squared Error', 'LGBM: Absolute Error', 'LGBM: Huber',
                                 'LGBM: Poisson', 'LGBM: Tweedie'],
          'Newton\nboosting': ['XGB: Squared Error', 'XGB: Pseudo Huber', 'XGB: Poisson', 'XGB: Tweedie']}

# fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(12, 4))
# for j, metric in enumerate(['Test MAE', 'Test MSE', 'Test R2']):
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(9, 4))
for j, metric in enumerate(['Test R2']):
    for i, (methods, group) in enumerate(groups.items()):
        # One bar per method, clustered by model family
        yy = results[results.Method.isin(group)][metric].values
        xx = i + (np.arange(0, len(yy)) - np.median(np.arange(0, len(yy)))) * 0.2
        ax.bar(xx, yy, width=0.15, alpha=0.3)
        for k in range(len(group)):
            ax.text(xx[k] - 0.04, 0.05, group[k], rotation='vertical', fontsize=12)

    ax.set_xticks(range(6))
    ax.set_xticklabels(list(groups.keys()), fontsize=12)
    ax.set_ylabel('R2 score', fontsize=12)
    ax.set_title('Test set performance with R2 score', fontsize=12)

fig.tight_layout()
# plt.savefig('./figures/CH07_F14_Kunapuli.png', format='png', dpi=300, bbox_inches='tight')
# plt.savefig('./figures/CH07_F14_Kunapuli.pdf', format='pdf', dpi=300, bbox_inches='tight')

# In[8]:

import pandas as pd

df = pd.read_csv('./data/ch07/case-study-results.csv')
print(df.to_string(index=False))

# In[ ]: