#!/usr/bin/env python
# coding: utf-8

# # Structure of the notebook
#
# ### I. Data cleaning:
#
# #### 1. Checking data types.
# #### 2. Treatment of missing data.
# #### 3. Checking the existence of duplicates.
# #### 4. Checking the existence of constant and quasi-constant features.
# #### 5. Verification of the data range constraint for numerical variables.
# #### 6. Investigation of the existence of rare labels in categorical variables.
#
# ### II. Exploratory data analysis.
#
# #### 1. Exploring the distribution of numerical variables.
# #### 2. Exploring the distribution of the target variable.
#
# ### III. Tuning and comparing the models.
#
# #### 1. Tuning the models.
# #### 2. Fitting the different pipelines.
# #### 3. Comparing the models.
# #### 4. Choosing the best model.
#
# ### IV. Model interpretation using SHAP.

# In[161]:


import re
import time
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import pylab
import scipy.stats as stats

from feature_engine.selection import DropConstantFeatures, DropDuplicateFeatures, SmartCorrelatedSelection
from feature_engine.encoding import (RareLabelEncoder, OneHotEncoder, OrdinalEncoder,
                                     CountFrequencyEncoder, MeanEncoder, PRatioEncoder)
from feature_engine.wrappers import SklearnTransformerWrapper
from feature_engine.outliers import OutlierTrimmer

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.metrics import (confusion_matrix, ConfusionMatrixDisplay, classification_report,
                             f1_score, precision_score, recall_score)
from xgboost import XGBClassifier

import optuna
import shap


# In[5]:


df = pd.read_csv('customer_booking.csv', encoding='ISO-8859-1')
df.head()


# ### I. Data cleaning:
#
# #### 1. Checking data types.

# In[6]:


df.info()


# In[7]:


df.describe()


# #### 2. Treatment of missing data.

# In[8]:


# A function to verify whether missing values appear in forms other than "NaN".
def miss_val(df, column, value):
    """
    Get the indices of the rows where `column` holds a given placeholder value.

    Inputs:
        - df: the dataframe object.
        - column: the name of the column.
        - value: the placeholder value to check for.

    Output:
        - A list containing the indices of the rows holding that value.
    """
    unique_vls = df[column].unique()
    null_rows = []
    for i in range(len(unique_vls)):
        if str(unique_vls[i]) == str(value):
            # Collecting the indices of the rows holding the placeholder value
            null_rows.append(df.loc[df[column] == unique_vls[i], df.columns].index)
    return null_rows


# In[9]:


# Checking the following placeholder values.
vals = ['', '.', '?', ' ', 'MISSING', 'missing', 'Missing', '!', 'NULL', 'Null', 'null']
for column in df.columns:
    for j in vals:
        if len(miss_val(df, column, j)) > 0:
            print("The used value for column '{0}' is '{1}'".format(column, j))


# #### 3. Checking the existence of duplicates.

# Duplicated rows can be a source of data leakage, since the same observation can land in both the training and the test set, so it is good practice to remove them.

# In[10]:


# Getting the duplicated rows
dup = df.duplicated()
df[dup]


# In[11]:


df = df.drop_duplicates()
df
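# For the record, a quick count of what the deduplication removed (a small sketch reusing the `dup` mask computed above):

# In[ ]:


# Number of duplicated rows dropped and number of rows remaining.
print("Removed {} duplicated rows; {} rows remain.".format(dup.sum(), len(df)))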
# #### 4. Checking the existence of constant and quasi-constant features.
#
# Constant and quasi-constant features contain almost no information that the model can use to discriminate between data points, so the best thing to do with this kind of feature is to drop it.

# In[12]:


# Checking the existence of constant and quasi-constant features.
# Variables showing the same value in a proportion of observations greater than
# "tol" will be considered constant/quasi-constant and dropped.
sel_2_clt = DropConstantFeatures(tol=0.99)
sel_2_clt.fit(df)


# In[13]:


sel_2_clt.features_to_drop_


# So there are no constant or quasi-constant features.

# #### 5. Verification of the data range constraint for numerical variables.

# In[14]:


# Getting the target variable
y = df.pop('booking_complete')
# Getting the categorical features
cat_ftr = [col for col in df.columns if df[col].dtype == 'O']
# Getting the numerical features
num_ftr = [col for col in df.columns if col not in cat_ftr]


# In[15]:


# Checking that every feature ended up in exactly one of the two lists
print((len(cat_ftr) + len(num_ftr)) == len(df.columns))


# In[16]:


# Investigating the existence of negative values.
for col in num_ftr:
    if df[col].min() < 0:
        print("Feature {} has negative values".format(col))


# So there are no negative values.

# #### 6. Investigation of the existence of rare labels in categorical variables.
#
# Rare labels can cause the following issues:
#
# - The data points with rare categories are so few that the algorithm does not get any information from them, which turns them into a source of noise that can make the model overfit the data.
#
# - Rare labels may appear only in the test set, which makes it hard for the model to handle them.

# In[17]:


# The percentage of rows with a given category
for col in cat_ftr:
    if col != 'id':
        for cat in df[col].unique():
            perct = round((len(df[df[col] == cat]) / len(df)) * 100, 2)
            print("For feature '{}' the percentage of data points with category '{}' is: \n{} %".format(col, cat, perct))
            print("-------------------------------------------------------------------")


# In[18]:


# Grouping rare categories into a new category called "other"
enc_1 = RareLabelEncoder(variables=cat_ftr, n_categories=1, tol=0.0003, replace_with='other')
enc_1.fit(df)


# In[19]:


df = enc_1.transform(df)
df.head()


# ### II. Exploratory data analysis.

# #### 1. Exploring the distribution of numerical variables.

# In[20]:


df[num_ftr].hist(figsize=(20, 20))
plt.show()


# #### 2. Exploring the distribution of the target variable.

# In[21]:


sns.countplot(data=pd.DataFrame(y), x='booking_complete')
plt.show()


# We can clearly see that the target variable is imbalanced, which implies that we should use the F1 score as our evaluation metric: it gives a much clearer picture of the model's performance on the minority class.
# Using accuracy as the metric could lead to choosing the wrong model: a dummy model that always predicts the most common class would look good under accuracy while learning nothing (see the baseline sketch after the split below).

# ### III. Tuning and comparing the models.

# In[22]:


# Splitting the data
train_X, test_X, train_y, test_y = train_test_split(df, y, test_size=0.30)
print(len(train_X), len(train_y))
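# To make the imbalance point concrete, here is a minimal baseline sketch (an addition, not part of the original analysis) comparing accuracy and F1 for a majority-class dummy model:

# In[ ]:


from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score

# A baseline classifier that always predicts the most frequent class.
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(train_X, train_y)
dummy_preds = dummy.predict(test_X)
# Accuracy looks deceptively high because most bookings are not completed...
print("Dummy accuracy:", accuracy_score(test_y, dummy_preds))
# ...while the F1 score is 0, exposing that this model never finds a completed booking.
print("Dummy F1:", f1_score(test_y, dummy_preds))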
# #### 1. Tuning the models.

# In[33]:


# A function to fit and tune the random forest pipeline
def objective(trial):
    # Defining the scaler space
    scalers = trial.suggest_categorical('scalers', ['minmax', 'standard', 'robust'])
    if scalers == 'minmax':
        scaler = MinMaxScaler()
    elif scalers == 'robust':
        scaler = RobustScaler()
    else:
        scaler = StandardScaler()

    # Defining the encoder space
    encoders = trial.suggest_categorical('encoders', ['Ordinal', 'OneHot', 'Count', 'Mean', 'PRatio'])
    if encoders == 'Ordinal':
        encoder = OrdinalEncoder()
    elif encoders == 'OneHot':
        encoder = OneHotEncoder()
    elif encoders == 'Count':
        encoder = CountFrequencyEncoder()
    elif encoders == 'Mean':
        encoder = MeanEncoder()
    else:
        encoder = PRatioEncoder()

    # The outlier trimmer.
    # Note: OutlierTrimmer removes rows, so inside a scikit-learn Pipeline it can
    # leave X and y misaligned during cross-validation; keep an eye on failing trials.
    out_trimmer = trial.suggest_categorical('outlier_trimmer', [None, 'Trimmer'])
    if out_trimmer == 'Trimmer':
        out_trimmer = OutlierTrimmer(capping_method='iqr', tail='both', fold=1.5)
    else:
        out_trimmer = None

    # Defining the number of features for the second phase of feature selection using the mutual information selector
    k = trial.suggest_int('k', low=2, high=len(train_X.columns))
    # Defining the n_estimators space
    n_estims = trial.suggest_int('n_estimators', low=20, high=600)
    # Defining the max_depth space
    max_dep = trial.suggest_int('max_depth', low=2, high=20)

    # Preprocessing steps for the numerical variables
    num_preproc = Pipeline(steps=[('scaler', scaler), ('trimmer', out_trimmer)])
    # Preprocessing steps for the categorical variables
    cat_preproc = Pipeline(steps=[('encoder', encoder)])
    preprocessor = ColumnTransformer(transformers=[('numerical_preprocessor', num_preproc, num_ftr),
                                                   ('cat_preprocessor', cat_preproc, cat_ftr)])

    model_pipe = Pipeline(steps=[('preprocessing', preprocessor),
                                 ('selector', SelectKBest(mutual_info_classif, k=k)),
                                 ('model', RandomForestClassifier(n_estimators=n_estims, max_depth=max_dep, n_jobs=-1))])

    cv_score = cross_val_score(model_pipe, train_X, train_y, cv=2, scoring='f1').mean()
    return cv_score


study_1 = optuna.create_study(direction='maximize')
study_1.optimize(objective, n_trials=300, show_progress_bar=True)


# In[34]:


pickle.dump(study_1, open('optuna_study_3_BA.sav', 'wb'))
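# The logistic regression and decision tree objectives below reuse the same scaler/encoder search space. As an optional refactor (a sketch, not part of the original runs; the helper name `suggest_preprocessor` is hypothetical), the shared part could be factored out:

# In[ ]:


# A sketch of a shared helper for the preprocessing search space.
def suggest_preprocessor(trial):
    # Scaler space, identical to the one used in each objective
    scalers = trial.suggest_categorical('scalers', ['minmax', 'standard', 'robust'])
    scaler = {'minmax': MinMaxScaler(), 'robust': RobustScaler(), 'standard': StandardScaler()}[scalers]
    # Encoder space, identical to the one used in each objective
    encoders = trial.suggest_categorical('encoders', ['Ordinal', 'OneHot', 'Count', 'Mean', 'PRatio'])
    encoder = {'Ordinal': OrdinalEncoder(), 'OneHot': OneHotEncoder(),
               'Count': CountFrequencyEncoder(), 'Mean': MeanEncoder(),
               'PRatio': PRatioEncoder()}[encoders]
    return ColumnTransformer(transformers=[('numerical_preprocessor', Pipeline(steps=[('scaler', scaler)]), num_ftr),
                                           ('cat_preprocessor', Pipeline(steps=[('encoder', encoder)]), cat_ftr)])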
# In[28]:


# Tuning the logistic regression model
def objective(trial):
    # Defining the scaler space
    scalers = trial.suggest_categorical('scalers', ['minmax', 'standard', 'robust'])
    if scalers == 'minmax':
        scaler = MinMaxScaler()
    elif scalers == 'robust':
        scaler = RobustScaler()
    else:
        scaler = StandardScaler()

    # Defining the encoder space
    encoders = trial.suggest_categorical('encoders', ['Ordinal', 'OneHot', 'Count', 'Mean', 'PRatio'])
    if encoders == 'Ordinal':
        encoder = OrdinalEncoder()
    elif encoders == 'OneHot':
        encoder = OneHotEncoder()
    elif encoders == 'Count':
        encoder = CountFrequencyEncoder()
    elif encoders == 'Mean':
        encoder = MeanEncoder()
    else:
        encoder = PRatioEncoder()

    # The outlier trimmer (suggested here but not used in this pipeline,
    # which only scales the numerical features)
    out_trimmer = trial.suggest_categorical('outlier_trimmer', [None, 'Trimmer'])
    if out_trimmer == 'Trimmer':
        out_trimmer = OutlierTrimmer(capping_method='iqr', tail='both', fold=1.5)
    else:
        out_trimmer = None

    # Defining the number of features for the second phase of feature selection using the mutual information selector
    k = trial.suggest_int('k', low=2, high=len(train_X.columns))
    # Defining the "C" space
    C = trial.suggest_loguniform('C', low=0.001, high=1)

    # Preprocessing steps for the numerical features
    num_prec = Pipeline(steps=[('scaler', scaler)])
    # Preprocessing steps for the categorical features
    cat_prec = Pipeline(steps=[('encoder', encoder)])
    # Column transformer
    preprocessor = ColumnTransformer(transformers=[('numerical_preprocessor', num_prec, num_ftr),
                                                   ('categorical_preprocessor', cat_prec, cat_ftr)])

    log_model_pipe = Pipeline(steps=[('preprocessor', preprocessor),
                                     ('selector', SelectKBest(mutual_info_classif, k=k)),
                                     ('classifier', LogisticRegression(C=C, solver='saga'))])

    cv_score = cross_val_score(log_model_pipe, train_X, train_y, cv=5, scoring='f1').mean()
    return cv_score


study_2 = optuna.create_study(direction='maximize')
study_2.optimize(objective, n_trials=300, show_progress_bar=True)


# In[29]:


pickle.dump(study_2, open('optuna_study_1_BA.sav', 'wb'))


# In[30]:


# Tuning the decision tree classifier
def objective(trial):
    # Defining the scaler space
    scalers = trial.suggest_categorical('scalers', ['minmax', 'standard', 'robust'])
    if scalers == 'minmax':
        scaler = MinMaxScaler()
    elif scalers == 'robust':
        scaler = RobustScaler()
    else:
        scaler = StandardScaler()

    # Defining the encoder space
    encoders = trial.suggest_categorical('encoders', ['Ordinal', 'OneHot', 'Count', 'Mean', 'PRatio'])
    if encoders == 'Ordinal':
        encoder = OrdinalEncoder()
    elif encoders == 'OneHot':
        encoder = OneHotEncoder()
    elif encoders == 'Count':
        encoder = CountFrequencyEncoder()
    elif encoders == 'Mean':
        encoder = MeanEncoder()
    else:
        encoder = PRatioEncoder()

    # The outlier trimmer (suggested here but not used in this pipeline)
    out_trimmer = trial.suggest_categorical('outlier_trimmer', [None, 'Trimmer'])
    if out_trimmer == 'Trimmer':
        out_trimmer = OutlierTrimmer(capping_method='iqr', tail='both', fold=1.5)
    else:
        out_trimmer = None

    # Defining the number of features for the second phase of feature selection using the mutual information selector
    k = trial.suggest_int('k', low=2, high=len(train_X.columns))
    # Criterion space
    criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
    # Max depth space
    max_depth = trial.suggest_int('max_depth', low=2, high=25)

    # Preprocessing steps for the numerical features
    preproc_num = Pipeline(steps=[('scaler', scaler)])
    # Preprocessing steps for the categorical features
    preproc_cat = Pipeline(steps=[('encoder', encoder)])
    # Column transformer
    preprocessor = ColumnTransformer(transformers=[('numerical_preprocessing', preproc_num, num_ftr),
                                                   ('categorical_preprocessing', preproc_cat, cat_ftr)])

    # Model pipeline
    tree_model_pipe = Pipeline(steps=[('preprocessing', preprocessor),
                                      ('selector', SelectKBest(mutual_info_classif, k=k)),
                                      ('classifier', DecisionTreeClassifier(max_depth=max_depth, criterion=criterion))])

    cv_score = cross_val_score(tree_model_pipe, train_X, train_y, cv=2, scoring='f1').mean()
    return cv_score


study_3 = optuna.create_study(direction='maximize')
study_3.optimize(objective, n_trials=300, show_progress_bar=True)


# In[31]:


pickle.dump(study_3, open('optuna_study_2_BA.sav', 'wb'))
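# The studies are persisted with pickle above; here is a minimal sketch (assuming the same filenames) for reloading one in a later session:

# In[ ]:


# Reloading a persisted Optuna study and inspecting its best trial.
with open('optuna_study_1_BA.sav', 'rb') as f:
    reloaded_study = pickle.load(f)
print(reloaded_study.best_params)
print(reloaded_study.best_value)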
# #### 2. Fitting the different pipelines.

# In[41]:


# A function to fit a pipeline
def fit_pipe(scaler, trimmer, encoder, selector, classifier, cat_ft, num_ft, X_train, y_train):
    # Preprocessing steps for the numerical variables
    num_preproc = Pipeline(steps=[('scaler', scaler), ('trimmer', trimmer)])
    # Preprocessing steps for the categorical variables
    cat_preproc = Pipeline(steps=[('encoder', encoder)])
    # Note: the categorical features come first here, so downstream feature
    # indices refer to the order [cat_ft + num_ft].
    preprocessor = ColumnTransformer(transformers=[('cat_preprocessor', cat_preproc, cat_ft),
                                                   ('numerical_preprocessor', num_preproc, num_ft)])
    pipe = Pipeline(steps=[('preprocessing', preprocessor),
                           ('selector', selector),
                           ('model', classifier)])
    pipe.fit(X_train, y_train)
    return pipe


# #### Fitting the logistic regression pipeline.

# In[48]:


# Getting the best parameters
study_2.best_params


# In[53]:


# Fitting the pipeline
scaler_1 = MinMaxScaler()
encoder_1 = PRatioEncoder()
trimmer_1 = None
selector_1 = SelectKBest(mutual_info_classif, k=12)
params_1 = {'C': 0.9957349726099026}
classifier_1 = LogisticRegression(**params_1)

log_pipe = fit_pipe(scaler_1, trimmer_1, encoder_1, selector_1, classifier_1, cat_ftr, num_ftr, train_X, train_y)


# #### Fitting the decision tree pipeline.

# In[49]:


# Getting the best parameters
study_3.best_params


# In[52]:


# Fitting the pipeline
scaler_2 = RobustScaler()
encoder_2 = OrdinalEncoder()
trimmer_2 = None
selector_2 = SelectKBest(mutual_info_classif, k=11)
params_2 = {'criterion': 'entropy', 'max_depth': 24}
classifier_2 = DecisionTreeClassifier(**params_2)

tree_pipe = fit_pipe(scaler_2, trimmer_2, encoder_2, selector_2, classifier_2, cat_ftr, num_ftr, train_X, train_y)


# #### Fitting the random forest pipeline.

# In[54]:


# Getting the best parameters
study_1.best_params


# In[55]:


# Fitting the pipeline
scaler_3 = RobustScaler()
encoder_3 = PRatioEncoder()
trimmer_3 = None
selector_3 = SelectKBest(mutual_info_classif, k=5)
params_3 = {'n_estimators': 24, 'max_depth': 20}
classifier_3 = RandomForestClassifier(**params_3)

RF_pipe = fit_pipe(scaler_3, trimmer_3, encoder_3, selector_3, classifier_3, cat_ftr, num_ftr, train_X, train_y)


# #### 3. Comparing the models.

# In[68]:


# Comparing the models on the test set
def compare_models(pipe, test_X, test_y):
    # Getting the F1 score on the test set
    score = f1_score(test_y, pipe.predict(test_X))
    # Timing the predictions on the test set
    pred_start = time.time()
    pipe.predict(test_X)
    pred_end = time.time()
    pred_time = pred_end - pred_start
    return score, pred_time


# In[105]:


pipes = {'Random Forest': RF_pipe, 'Logistic Regression': log_pipe, 'Decision Tree': tree_pipe}
scores_list = []
pred_time_list = []
model_names = []
for key, value in pipes.items():
    score, pred_time = compare_models(value, test_X, test_y)
    scores_list.append(score)
    pred_time_list.append(pred_time)
    model_names.append(key)


# In[106]:


# Transformation into numpy arrays
scores_list = np.array(scores_list)
pred_time_list = np.array(pred_time_list)
model_names = np.array(model_names)


# In[107]:


# Creating the results dataframe
df_dict = {'F1 score': scores_list, 'prediction time (s)': pred_time_list, 'models': model_names}
results = pd.DataFrame(df_dict).set_index('models')
results.sort_values(by='F1 score', ascending=False)


# The decision tree classifier achieves the highest F1 score.
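# Since precision_score and recall_score are already imported, the comparison can be broken down further (a sketch, not part of the original comparison): precision asks how many predicted bookings were real, recall asks how many real bookings were found.

# In[ ]:


for name, pipe in pipes.items():
    preds = pipe.predict(test_X)
    # Precision and recall for the positive class (completed bookings)
    print('{}: precision = {:.3f}, recall = {:.3f}'.format(
        name, precision_score(test_y, preds), recall_score(test_y, preds)))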
# In[128]:


# Printing the classification report for each model
for name, pipe in pipes.items():
    print("The classification report for the '{}' classifier".format(name))
    print("----------------------------------------------")
    print(classification_report(test_y, pipe.predict(test_X)))


# In[124]:


# Visualizing the confusion matrix for the random forest classifier
cfm = confusion_matrix(test_y, RF_pipe.predict(test_X))
ConfusionMatrixDisplay(cfm).plot()


# In[126]:


# Visualizing the confusion matrix for the decision tree classifier
cfm = confusion_matrix(test_y, tree_pipe.predict(test_X))
ConfusionMatrixDisplay(cfm).plot()


# In[130]:


# Visualizing the confusion matrix for the logistic regression classifier
cfm = confusion_matrix(test_y, log_pipe.predict(test_X))
ConfusionMatrixDisplay(cfm).plot()


# #### 4. Choosing the best model.

# The decision tree classifier has the highest capability to distinguish the minority class (1) in the test set: it correctly identified 28.4% of the positive examples. By comparison, the Random Forest classifier correctly identified 17.9% of the positive examples and the Logistic Regression classifier only 12.3%.
# Assuming that false negatives (classifying clients who would complete a booking as not completing one) are very costly for the company, we choose the Decision Tree classifier as the best model.

# In[136]:


# Getting the classifier
clf = tree_pipe[-1]
clf


# In[137]:


# Saving the classifier
pickle.dump(clf, open('classifier_BA.sav', 'wb'))


# In[141]:


# Getting the names of the selected features

# The selector object
selector = tree_pipe[1]
# The indices of the selected features
indx = selector.get_support(indices=True)
# The selector saw the features in the order produced by the column transformer
# (categorical features first, then numerical), so the indices are mapped
# against that order rather than against df.columns.
select_ftr = np.array(cat_ftr + num_ftr)[indx]
print(select_ftr)


# In[148]:


# The new dataframe with the selected features
final_df = df[select_ftr]
final_df


# In[146]:


# Getting the names of the features after feature selection

# Getting the numerical features
num_ftr = [col for col in final_df.columns if final_df[col].dtype in ['int64', 'float64']]
# Getting the categorical features
cat_ftr = [col for col in final_df.columns if col not in num_ftr]
print(len(num_ftr) + len(cat_ftr))


# ### IV. Model interpretation using SHAP.

# SHAP is a technique used to interpret machine learning models. It explains a prediction by computing what are known as Shapley values, a concept developed in game theory. These values represent the contribution of each explanatory variable to the prediction in question.

# In[149]:


# Scaling the numerical features
scaler = SklearnTransformerWrapper(RobustScaler())
final_df = scaler.fit_transform(final_df)
final_df


# In[151]:


# Encoding the categorical features
enc = OrdinalEncoder()
final_df = enc.fit_transform(final_df, y)
final_df


# In[152]:


# Computing the SHAP values
explainer = shap.TreeExplainer(clf)
shap_values = explainer.shap_values(final_df)
# Plotting the feature importance
shap.summary_plot(shap_values[0], final_df, max_display=26, plot_type='bar')


# The three most predictive features are:
# - Purchase lead.
# - Length of stay.
# - Flight day.

# In[154]:


# Investigating the global effect of each feature on the model output (pushing it up or down).
shap.summary_plot(shap_values[0], final_df, max_display=26)


# From this visual we can conclude that:
#
# - A small purchase lead (the number of days between the travel date and the booking date) increases the chances of the client completing the booking.
# - A longer stay negatively influences the chances of the client completing the booking.
# - When the number of passengers is high, the client is less likely to complete the booking.
# - Asking for extra baggage and an in-flight meal decreases the chances of the client completing the booking.
# - Making a booking for a weekend flight (according to the encoding of the feature flight_day) increases its chances of being completed.
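# To complement these global plots, the contribution of each feature to a single prediction can also be inspected (a sketch using the same legacy shap API as the calls above; row 0 is an arbitrary example):

# In[ ]:


# A local explanation for one booking: the features pushing this particular
# prediction above or below the expected value.
shap.initjs()
shap.force_plot(explainer.expected_value[0], shap_values[0][0, :], final_df.iloc[0, :])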
# In[ ]: