#!/usr/bin/env python
# coding: utf-8

# # Structure of the notebook
#
# ### I. Data cleaning:
#
# #### 1. Checking data types.
# #### 2. Treatment of missing data.
# #### 3. Checking the existence of duplicates.
# #### 4. Checking the existence of constant and quasi-constant features.
# #### 5. Verification of the data range constraint for numerical variables.
# #### 6. Investigation of the existence of rare labels in categorical variables.
#
# ### II. Exploratory data analysis.
#
# #### 1. Exploring the distribution of numerical variables.
# #### 2. Exploring the distribution of the target variable.
#
# ### III. Tuning and comparing the models.
#
# #### 1. Tuning the models.
# #### 2. Fitting the different pipelines.
# #### 3. Comparing the models.
# #### 4. Choosing the best model.
#
# ### IV. Model interpretation using SHAP.

# In[161]:


import re
import time
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import pylab
import scipy.stats as stats

from feature_engine.selection import DropConstantFeatures, DropDuplicateFeatures, SmartCorrelatedSelection
from feature_engine.encoding import (RareLabelEncoder, OneHotEncoder, OrdinalEncoder,
                                     CountFrequencyEncoder, MeanEncoder, PRatioEncoder)
from feature_engine.wrappers import SklearnTransformerWrapper
from feature_engine.outliers import OutlierTrimmer

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.metrics import (confusion_matrix, ConfusionMatrixDisplay, classification_report,
                             f1_score, precision_score, recall_score)
from xgboost import XGBClassifier

import optuna
import shap


# In[5]:


df = pd.read_csv('customer_booking.csv', encoding='ISO-8859-1')
df.head()


# ### I. Data cleaning:
#
# #### 1. Checking data types.

# In[6]:


df.info()


# In[7]:


df.describe()


# #### 2. Treatment of missing data.

# In[8]:


# A function to verify whether missing values appear in forms other than "NaN".
def miss_val(df, column, value):
    """
    Get the indices of the rows where `column` holds a given placeholder value.

    Inputs:
        - df: the dataframe object.
        - column: the name of the column.
        - value: the placeholder value to check for.

    Output:
        - A list containing the indices of the rows holding that value.
    """
    unique_vls = df[column].unique()
    null_rows = []
    for i in range(len(unique_vls)):
        if str(unique_vls[i]) == str(value):
            # Collecting the indices of the rows holding the placeholder value
            null_rows.append(df.loc[df[column] == unique_vls[i], df.columns].index)
    return null_rows


# In[9]:


# Checking the following placeholder values.
vals = ['', '.', '?', ' ', 'MISSING', 'missing', 'Missing', '!', 'NULL', 'Null', 'null']
for column in df.columns:
    for j in vals:
        if len(miss_val(df, column, j)) > 0:
            print("The used value for column '{0}' is '{1}'".format(column, j))


# #### 3. Checking the existence of duplicates.

# Duplicated rows can be a source of data leakage, since the same observation can land in both the training and the test set, so it is good practice to remove them.

# In[10]:


# Getting the duplicated rows
dup = df.duplicated()
df[dup]


# In[11]:


df = df.drop_duplicates()
df
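# For the record, a quick count of what the deduplication removed (a small sketch reusing the `dup` mask computed above):

# In[ ]:


# Number of duplicated rows dropped and number of rows remaining.
print("Removed {} duplicated rows; {} rows remain.".format(dup.sum(), len(df)))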
# #### 4. Checking the existence of constant and quasi-constant features.
#
# Constant and quasi-constant features contain almost no information that the model can use to discriminate between data points, so the best thing to do with this kind of feature is to drop it.

# In[12]:


# Checking the existence of constant and quasi-constant features.
# Variables showing the same value in a proportion of observations greater than
# "tol" will be considered constant/quasi-constant and dropped.
sel_2_clt = DropConstantFeatures(tol=0.99)
sel_2_clt.fit(df)


# In[13]:


sel_2_clt.features_to_drop_


# So there are no constant or quasi-constant features.

# #### 5. Verification of the data range constraint for numerical variables.

# In[14]:


# Getting the target variable
y = df.pop('booking_complete')
# Getting the categorical features
cat_ftr = [col for col in df.columns if df[col].dtype == 'O']
# Getting the numerical features
num_ftr = [col for col in df.columns if col not in cat_ftr]


# In[15]:


# Checking that every feature ended up in exactly one of the two lists
print((len(cat_ftr) + len(num_ftr)) == len(df.columns))


# In[16]:


# Investigating the existence of negative values.
for col in num_ftr:
    if df[col].min() < 0:
        print("Feature {} has negative values".format(col))


# So there are no negative values.

# #### 6. Investigation of the existence of rare labels in categorical variables.
#
# Rare labels can cause the following issues:
#
# - The data points with rare categories are so few that the algorithm does not get any information from them, which turns them into a source of noise that can make the model overfit the data.
#
# - Rare labels may appear only in the test set, which makes it hard for the model to handle them.

# In[17]:


# The percentage of rows with a given category
for col in cat_ftr:
    if col != 'id':
        for cat in df[col].unique():
            perct = round((len(df[df[col] == cat]) / len(df)) * 100, 2)
            print("For feature '{}' the percentage of data points with category '{}' is: \n{} %".format(col, cat, perct))
            print("-------------------------------------------------------------------")


# In[18]:


# Grouping rare categories into a new category called "other"
enc_1 = RareLabelEncoder(variables=cat_ftr, n_categories=1, tol=0.0003, replace_with='other')
enc_1.fit(df)


# In[19]:


df = enc_1.transform(df)
df.head()


# ### II. Exploratory data analysis.

# #### 1. Exploring the distribution of numerical variables.

# In[20]:


df[num_ftr].hist(figsize=(20, 20))
plt.show()


# #### 2. Exploring the distribution of the target variable.

# In[21]:


sns.countplot(data=pd.DataFrame(y), x='booking_complete')
plt.show()


# We can clearly see that the target variable is imbalanced, which implies that we should use the F1 score as our evaluation metric: it gives a much clearer picture of the model's performance on the minority class.
# Using accuracy as the metric could lead to choosing the wrong model: a dummy model that always predicts the most common class would look good under accuracy while learning nothing (see the baseline sketch after the split below).

# ### III. Tuning and comparing the models.

# In[22]:


# Splitting the data
train_X, test_X, train_y, test_y = train_test_split(df, y, test_size=0.30)
print(len(train_X), len(train_y))
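# To make the imbalance point concrete, here is a minimal baseline sketch (an addition, not part of the original analysis) comparing accuracy and F1 for a majority-class dummy model:

# In[ ]:


from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score

# A baseline classifier that always predicts the most frequent class.
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(train_X, train_y)
dummy_preds = dummy.predict(test_X)
# Accuracy looks deceptively high because most bookings are not completed...
print("Dummy accuracy:", accuracy_score(test_y, dummy_preds))
# ...while the F1 score is 0, exposing that this model never finds a completed booking.
print("Dummy F1:", f1_score(test_y, dummy_preds))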
# #### 1. Tuning the models.

# In[33]:


# A function to fit and tune the random forest pipeline
def objective(trial):
    # Defining the scaler space
    scalers = trial.suggest_categorical('scalers', ['minmax', 'standard', 'robust'])
    if scalers == 'minmax':
        scaler = MinMaxScaler()
    elif scalers == 'robust':
        scaler = RobustScaler()
    else:
        scaler = StandardScaler()

    # Defining the encoder space
    encoders = trial.suggest_categorical('encoders', ['Ordinal', 'OneHot', 'Count', 'Mean', 'PRatio'])
    if encoders == 'Ordinal':
        encoder = OrdinalEncoder()
    elif encoders == 'OneHot':
        encoder = OneHotEncoder()
    elif encoders == 'Count':
        encoder = CountFrequencyEncoder()
    elif encoders == 'Mean':
        encoder = MeanEncoder()
    else:
        encoder = PRatioEncoder()

    # The outlier trimmer.
    # Note: OutlierTrimmer removes rows, so inside a scikit-learn Pipeline it can
    # leave X and y misaligned during cross-validation; keep an eye on failing trials.
    out_trimmer = trial.suggest_categorical('outlier_trimmer', [None, 'Trimmer'])
    if out_trimmer == 'Trimmer':
        out_trimmer = OutlierTrimmer(capping_method='iqr', tail='both', fold=1.5)
    else:
        out_trimmer = None

    # Defining the number of features for the second phase of feature selection using the mutual information selector
    k = trial.suggest_int('k', low=2, high=len(train_X.columns))
    # Defining the n_estimators space
    n_estims = trial.suggest_int('n_estimators', low=20, high=600)
    # Defining the max_depth space
    max_dep = trial.suggest_int('max_depth', low=2, high=20)

    # Preprocessing steps for the numerical variables
    num_preproc = Pipeline(steps=[('scaler', scaler), ('trimmer', out_trimmer)])
    # Preprocessing steps for the categorical variables
    cat_preproc = Pipeline(steps=[('encoder', encoder)])
    preprocessor = ColumnTransformer(transformers=[('numerical_preprocessor', num_preproc, num_ftr),
                                                   ('cat_preprocessor', cat_preproc, cat_ftr)])

    model_pipe = Pipeline(steps=[('preprocessing', preprocessor),
                                 ('selector', SelectKBest(mutual_info_classif, k=k)),
                                 ('model', RandomForestClassifier(n_estimators=n_estims, max_depth=max_dep, n_jobs=-1))])

    cv_score = cross_val_score(model_pipe, train_X, train_y, cv=2, scoring='f1').mean()
    return cv_score


study_1 = optuna.create_study(direction='maximize')
study_1.optimize(objective, n_trials=300, show_progress_bar=True)


# In[34]:


pickle.dump(study_1, open('optuna_study_3_BA.sav', 'wb'))
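# The logistic regression and decision tree objectives below reuse the same scaler/encoder search space. As an optional refactor (a sketch, not part of the original runs; the helper name `suggest_preprocessor` is hypothetical), the shared part could be factored out:

# In[ ]:


# A sketch of a shared helper for the preprocessing search space.
def suggest_preprocessor(trial):
    # Scaler space, identical to the one used in each objective
    scalers = trial.suggest_categorical('scalers', ['minmax', 'standard', 'robust'])
    scaler = {'minmax': MinMaxScaler(), 'robust': RobustScaler(), 'standard': StandardScaler()}[scalers]
    # Encoder space, identical to the one used in each objective
    encoders = trial.suggest_categorical('encoders', ['Ordinal', 'OneHot', 'Count', 'Mean', 'PRatio'])
    encoder = {'Ordinal': OrdinalEncoder(), 'OneHot': OneHotEncoder(),
               'Count': CountFrequencyEncoder(), 'Mean': MeanEncoder(),
               'PRatio': PRatioEncoder()}[encoders]
    return ColumnTransformer(transformers=[('numerical_preprocessor', Pipeline(steps=[('scaler', scaler)]), num_ftr),
                                           ('cat_preprocessor', Pipeline(steps=[('encoder', encoder)]), cat_ftr)])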
# In[28]:


# Tuning the logistic regression model
def objective(trial):
    # Defining the scaler space
    scalers = trial.suggest_categorical('scalers', ['minmax', 'standard', 'robust'])
    if scalers == 'minmax':
        scaler = MinMaxScaler()
    elif scalers == 'robust':
        scaler = RobustScaler()
    else:
        scaler = StandardScaler()

    # Defining the encoder space
    encoders = trial.suggest_categorical('encoders', ['Ordinal', 'OneHot', 'Count', 'Mean', 'PRatio'])
    if encoders == 'Ordinal':
        encoder = OrdinalEncoder()
    elif encoders == 'OneHot':
        encoder = OneHotEncoder()
    elif encoders == 'Count':
        encoder = CountFrequencyEncoder()
    elif encoders == 'Mean':
        encoder = MeanEncoder()
    else:
        encoder = PRatioEncoder()

    # The outlier trimmer (suggested here but not used in this pipeline,
    # which only scales the numerical features)
    out_trimmer = trial.suggest_categorical('outlier_trimmer', [None, 'Trimmer'])
    if out_trimmer == 'Trimmer':
        out_trimmer = OutlierTrimmer(capping_method='iqr', tail='both', fold=1.5)
    else:
        out_trimmer = None

    # Defining the number of features for the second phase of feature selection using the mutual information selector
    k = trial.suggest_int('k', low=2, high=len(train_X.columns))
    # Defining the "C" space
    C = trial.suggest_loguniform('C', low=0.001, high=1)

    # Preprocessing steps for the numerical features
    num_prec = Pipeline(steps=[('scaler', scaler)])
    # Preprocessing steps for the categorical features
    cat_prec = Pipeline(steps=[('encoder', encoder)])
    # Column transformer
    preprocessor = ColumnTransformer(transformers=[('numerical_preprocessor', num_prec, num_ftr),
                                                   ('categorical_preprocessor', cat_prec, cat_ftr)])

    log_model_pipe = Pipeline(steps=[('preprocessor', preprocessor),
                                     ('selector', SelectKBest(mutual_info_classif, k=k)),
                                     ('classifier', LogisticRegression(C=C, solver='saga'))])

    cv_score = cross_val_score(log_model_pipe, train_X, train_y, cv=5, scoring='f1').mean()
    return cv_score


study_2 = optuna.create_study(direction='maximize')
study_2.optimize(objective, n_trials=300, show_progress_bar=True)


# In[29]:


pickle.dump(study_2, open('optuna_study_1_BA.sav', 'wb'))


# In[30]:


# Tuning the decision tree classifier
def objective(trial):
    # Defining the scaler space
    scalers = trial.suggest_categorical('scalers', ['minmax', 'standard', 'robust'])
    if scalers == 'minmax':
        scaler = MinMaxScaler()
    elif scalers == 'robust':
        scaler = RobustScaler()
    else:
        scaler = StandardScaler()

    # Defining the encoder space
    encoders = trial.suggest_categorical('encoders', ['Ordinal', 'OneHot', 'Count', 'Mean', 'PRatio'])
    if encoders == 'Ordinal':
        encoder = OrdinalEncoder()
    elif encoders == 'OneHot':
        encoder = OneHotEncoder()
    elif encoders == 'Count':
        encoder = CountFrequencyEncoder()
    elif encoders == 'Mean':
        encoder = MeanEncoder()
    else:
        encoder = PRatioEncoder()

    # The outlier trimmer (suggested here but not used in this pipeline)
    out_trimmer = trial.suggest_categorical('outlier_trimmer', [None, 'Trimmer'])
    if out_trimmer == 'Trimmer':
        out_trimmer = OutlierTrimmer(capping_method='iqr', tail='both', fold=1.5)
    else:
        out_trimmer = None

    # Defining the number of features for the second phase of feature selection using the mutual information selector
    k = trial.suggest_int('k', low=2, high=len(train_X.columns))
    # Criterion space
    criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
    # Max depth space
    max_depth = trial.suggest_int('max_depth', low=2, high=25)

    # Preprocessing steps for the numerical features
    preproc_num = Pipeline(steps=[('scaler', scaler)])
    # Preprocessing steps for the categorical features
    preproc_cat = Pipeline(steps=[('encoder', encoder)])
    # Column transformer
    preprocessor = ColumnTransformer(transformers=[('numerical_preprocessing', preproc_num, num_ftr),
                                                   ('categorical_preprocessing', preproc_cat, cat_ftr)])

    # Model pipeline
    tree_model_pipe = Pipeline(steps=[('preprocessing', preprocessor),
                                      ('selector', SelectKBest(mutual_info_classif, k=k)),
                                      ('classifier', DecisionTreeClassifier(max_depth=max_depth, criterion=criterion))])

    cv_score = cross_val_score(tree_model_pipe, train_X, train_y, cv=2, scoring='f1').mean()
    return cv_score


study_3 = optuna.create_study(direction='maximize')
study_3.optimize(objective, n_trials=300, show_progress_bar=True)


# In[31]:


pickle.dump(study_3, open('optuna_study_2_BA.sav', 'wb'))
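# The studies are persisted with pickle above; here is a minimal sketch (assuming the same filenames) for reloading one in a later session:

# In[ ]:


# Reloading a persisted Optuna study and inspecting its best trial.
with open('optuna_study_1_BA.sav', 'rb') as f:
    reloaded_study = pickle.load(f)
print(reloaded_study.best_params)
print(reloaded_study.best_value)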
# #### 2. Fitting the different pipelines.

# In[41]:


# A function to fit a pipeline
def fit_pipe(scaler, trimmer, encoder, selector, classifier, cat_ft, num_ft, X_train, y_train):
    # Preprocessing steps for the numerical variables
    num_preproc = Pipeline(steps=[('scaler', scaler), ('trimmer', trimmer)])
    # Preprocessing steps for the categorical variables
    cat_preproc = Pipeline(steps=[('encoder', encoder)])
    # Note: the categorical features come first here, so downstream feature
    # indices refer to the order [cat_ft + num_ft].
    preprocessor = ColumnTransformer(transformers=[('cat_preprocessor', cat_preproc, cat_ft),
                                                   ('numerical_preprocessor', num_preproc, num_ft)])
    pipe = Pipeline(steps=[('preprocessing', preprocessor),
                           ('selector', selector),
                           ('model', classifier)])
    pipe.fit(X_train, y_train)
    return pipe


# #### Fitting the logistic regression pipeline.

# In[48]:


# Getting the best parameters
study_2.best_params


# In[53]:


# Fitting the pipeline
scaler_1 = MinMaxScaler()
encoder_1 = PRatioEncoder()
trimmer_1 = None
selector_1 = SelectKBest(mutual_info_classif, k=12)
params_1 = {'C': 0.9957349726099026}
classifier_1 = LogisticRegression(**params_1)

log_pipe = fit_pipe(scaler_1, trimmer_1, encoder_1, selector_1, classifier_1, cat_ftr, num_ftr, train_X, train_y)


# #### Fitting the decision tree pipeline.

# In[49]:


# Getting the best parameters
study_3.best_params


# In[52]:


# Fitting the pipeline
scaler_2 = RobustScaler()
encoder_2 = OrdinalEncoder()
trimmer_2 = None
selector_2 = SelectKBest(mutual_info_classif, k=11)
params_2 = {'criterion': 'entropy', 'max_depth': 24}
classifier_2 = DecisionTreeClassifier(**params_2)

tree_pipe = fit_pipe(scaler_2, trimmer_2, encoder_2, selector_2, classifier_2, cat_ftr, num_ftr, train_X, train_y)


# #### Fitting the random forest pipeline.

# In[54]:


# Getting the best parameters
study_1.best_params


# In[55]:


# Fitting the pipeline
scaler_3 = RobustScaler()
encoder_3 = PRatioEncoder()
trimmer_3 = None
selector_3 = SelectKBest(mutual_info_classif, k=5)
params_3 = {'n_estimators': 24, 'max_depth': 20}
classifier_3 = RandomForestClassifier(**params_3)

RF_pipe = fit_pipe(scaler_3, trimmer_3, encoder_3, selector_3, classifier_3, cat_ftr, num_ftr, train_X, train_y)


# #### 3. Comparing the models.

# In[68]:


# Comparing the models on the test set
def compare_models(pipe, test_X, test_y):
    # Getting the F1 score on the test set
    score = f1_score(test_y, pipe.predict(test_X))
    # Timing the predictions on the test set
    pred_start = time.time()
    pipe.predict(test_X)
    pred_end = time.time()
    pred_time = pred_end - pred_start
    return score, pred_time


# In[105]:


pipes = {'Random Forest': RF_pipe, 'Logistic Regression': log_pipe, 'Decision Tree': tree_pipe}
scores_list = []
pred_time_list = []
model_names = []
for key, value in pipes.items():
    score, pred_time = compare_models(value, test_X, test_y)
    scores_list.append(score)
    pred_time_list.append(pred_time)
    model_names.append(key)


# In[106]:


# Transformation into numpy arrays
scores_list = np.array(scores_list)
pred_time_list = np.array(pred_time_list)
model_names = np.array(model_names)


# In[107]:


# Creating the results dataframe
df_dict = {'F1 score': scores_list, 'prediction time (s)': pred_time_list, 'models': model_names}
results = pd.DataFrame(df_dict).set_index('models')
results.sort_values(by='F1 score', ascending=False)


# The decision tree classifier achieves the highest F1 score.
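# Since precision_score and recall_score are already imported, the comparison can be broken down further (a sketch, not part of the original comparison): precision asks how many predicted bookings were real, recall asks how many real bookings were found.

# In[ ]:


for name, pipe in pipes.items():
    preds = pipe.predict(test_X)
    # Precision and recall for the positive class (completed bookings)
    print('{}: precision = {:.3f}, recall = {:.3f}'.format(
        name, precision_score(test_y, preds), recall_score(test_y, preds)))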
# In[128]:


# Printing the classification report for each model
for name, pipe in pipes.items():
    print("The classification report for the '{}' classifier".format(name))
    print("----------------------------------------------")
    print(classification_report(test_y, pipe.predict(test_X)))


# In[124]:


# Visualizing the confusion matrix for the random forest classifier
cfm = confusion_matrix(test_y, RF_pipe.predict(test_X))
ConfusionMatrixDisplay(cfm).plot()


# In[126]:


# Visualizing the confusion matrix for the decision tree classifier
cfm = confusion_matrix(test_y, tree_pipe.predict(test_X))
ConfusionMatrixDisplay(cfm).plot()


# In[130]:


# Visualizing the confusion matrix for the logistic regression classifier
cfm = confusion_matrix(test_y, log_pipe.predict(test_X))
ConfusionMatrixDisplay(cfm).plot()


# #### 4. Choosing the best model.

# The decision tree classifier has the highest capability to distinguish the minority class (1) in the test set: it correctly identified 28.4% of the positive examples. By comparison, the Random Forest classifier correctly identified 17.9% of the positive examples and the Logistic Regression classifier only 12.3%.
# Assuming that false negatives (classifying clients who would complete a booking as not completing one) are very costly for the company, we choose the Decision Tree classifier as the best model.

# In[136]:


# Getting the classifier
clf = tree_pipe[-1]
clf


# In[137]:


# Saving the classifier
pickle.dump(clf, open('classifier_BA.sav', 'wb'))


# In[141]:


# Getting the names of the selected features

# The selector object
selector = tree_pipe[1]
# The indices of the selected features
indx = selector.get_support(indices=True)
# The selector saw the features in the order produced by the column transformer
# (categorical features first, then numerical), so the indices are mapped
# against that order rather than against df.columns.
select_ftr = np.array(cat_ftr + num_ftr)[indx]
print(select_ftr)


# In[148]:


# The new dataframe with the selected features
final_df = df[select_ftr]
final_df


# In[146]:


# Getting the names of the features after feature selection

# Getting the numerical features
num_ftr = [col for col in final_df.columns if final_df[col].dtype in ['int64', 'float64']]
# Getting the categorical features
cat_ftr = [col for col in final_df.columns if col not in num_ftr]
print(len(num_ftr) + len(cat_ftr))


# ### IV. Model interpretation using SHAP.

# SHAP is a technique used to interpret machine learning models. It explains a prediction by computing what are known as Shapley values, a concept developed in game theory. These values represent the contribution of each explanatory variable to the prediction in question.

# In[149]:


# Scaling the numerical features
scaler = SklearnTransformerWrapper(RobustScaler())
final_df = scaler.fit_transform(final_df)
final_df


# In[151]:


# Encoding the categorical features
enc = OrdinalEncoder()
final_df = enc.fit_transform(final_df, y)
final_df


# In[152]:


# Computing the SHAP values
explainer = shap.TreeExplainer(clf)
shap_values = explainer.shap_values(final_df)
# Plotting the feature importance
shap.summary_plot(shap_values[0], final_df, max_display=26, plot_type='bar')


# The three most predictive features are:
# - Purchase lead.
# - Length of stay.
# - Flight day.

# In[154]:


# Investigating the global effect of each feature on the model output (pushing it up or down).
shap.summary_plot(shap_values[0], final_df, max_display=26)


# From this visual we can conclude that:
#
# - A small purchase lead (the number of days between the travel date and the booking date) increases the chances of the client completing the booking.
# - A longer stay negatively influences the chances of the client completing the booking.
# - When the number of passengers is high, the client is less likely to complete the booking.
# - Asking for extra baggage and an in-flight meal decreases the chances of the client completing the booking.
# - Making a booking for a weekend flight (according to the encoding of the feature flight_day) increases its chances of being completed.
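# To complement these global plots, the contribution of each feature to a single prediction can also be inspected (a sketch using the same legacy shap API as the calls above; row 0 is an arbitrary example):

# In[ ]:


# A local explanation for one booking: the features pushing this particular
# prediction above or below the expected value.
shap.initjs()
shap.force_plot(explainer.expected_value[0], shap_values[0][0, :], final_df.iloc[0, :])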
# In[ ]: