#!/usr/bin/env python
# coding: utf-8

# # Geodemographic Segmentation Model
#
# Dataset: https://www.superdatascience.com/training/
#
# Author: Filipa C. S. Rodrigues (filipacsrodrigues@gmail.com)

# In[1]:

from __future__ import division

get_ipython().run_line_magic('matplotlib', 'inline')

import warnings
warnings.filterwarnings("ignore")

import pandas
import numpy as np
import statsmodels.api as sm
import statsmodels.discrete.discrete_model as smdis
import statsmodels.stats.outliers_influence as outliers
import matplotlib.pyplot as plt

# In[2]:

df = pandas.read_csv('Churn-Modelling.csv')
df[:5]

# #### Variables
#
# - Exited: dependent variable (y) --> binary
# - Gender, Geography: categorical independent variables (x)
# - All the remaining variables: numeric independent variables (x)

# In[3]:

df_y = df['Exited']
df_x = df.drop(['Exited'], axis = 1)

# Create dummy variables for the categorical variables:

# In[4]:

dummy = pandas.get_dummies(df_x['Gender'])
df_x = dummy.join(df_x)
dummy = pandas.get_dummies(df_x['Geography'])
df_x = dummy.join(df_x)
df_x[:5]

# For each categorical variable, one dummy column has to be dropped to avoid the "dummy variable trap":

# In[5]:

df_x = df_x.drop(['Gender', 'Male', 'France', 'Geography'], axis = 1)
df_x[:5]

# Add a constant $$b_0$$ to the model:

# In[6]:

df_x = sm.add_constant(df_x)
df_x[:2]

# Exclude the variables that should not affect the model:

# In[7]:

df_x = df_x.drop(['RowNumber', 'CustomerId', 'Surname'], axis = 1)
df_x[:2]

# Create a model with all the remaining variables. The diagnostics printed after each fit (summary, p-values, highest p-value and confusion matrix) are wrapped in a small helper so that every model below can reuse them:

# In[8]:

def print_model_report(model):
    # Diagnostics reused for every model below: summary, p-values, the highest
    # p-value (the next candidate for backward elimination) and the confusion matrix.
    print model.summary()
    print "\n_____P-values____"
    p = model.pvalues
    print p
    print "\n_____Highest p-value____"
    print "\n %s \n" % p[p == max(p)]
    print "\n____Confusion Matrix___"
    cm = model.pred_table()
    print cm
    print "\nNumber of cases correctly predicted: %s (%s %%)" % (
        cm[0][0] + cm[1][1], (cm[0][0] + cm[1][1]) * 100 / np.sum(cm))

model1 = smdis.Logit(df_y, df_x).fit()
print_model_report(model1)

# Note:
#
# __"Pseudo R-squ."__ - pseudo R-squared.
# Logistic regression does not have an equivalent to the R-squared that is found in OLS regression; however, many people have tried to come up with one. There are a wide variety of pseudo-R-square statistics. Because this statistic does not mean what R-square means in OLS regression (the proportion of variance explained by the predictors), we suggest interpreting it with great caution.
# (source: http://www.ats.ucla.edu/stat/stata/output/stata_logistic.htm)
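# The "Pseudo R-squ." that statsmodels reports for Logit is McFadden's pseudo R-squared,
# 1 - llf/llnull. A minimal cross-check against the summary above (a sketch; it only uses
# the fitted `model1`):

print "McFadden pseudo R-squared: %.4f" % (1 - model1.llf / model1.llnull)
print "Reported by statsmodels:   %.4f" % model1.prsquared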
# __Backward Elimination__
#
# New model without the variable with the highest p-value: "Spain"

# In[9]:

df_x1 = df_x.drop(['Spain'], axis = 1)
df_x1[:2]

# In[10]:

model2 = smdis.Logit(df_y, df_x1).fit()
print_model_report(model2)

# New model without the variable with the highest p-value: "HasCrCard"

# In[11]:

df_x2 = df_x1.drop(['HasCrCard'], axis = 1)
df_x2[:2]

# In[12]:

model3 = smdis.Logit(df_y, df_x2).fit()
print_model_report(model3)

# New model without the variable with the highest p-value: "EstimatedSalary"

# In[13]:

df_x3 = df_x2.drop(['EstimatedSalary'], axis = 1)
df_x3[:2]

# In[14]:

model4 = smdis.Logit(df_y, df_x3).fit()
print_model_report(model4)

# New model without the variable with the highest p-value: "Tenure"

# In[15]:

df_x4 = df_x3.drop(['Tenure'], axis = 1)
df_x4[:2]

# In[16]:

model5 = smdis.Logit(df_y, df_x4).fit()
print_model_report(model5)

# Since the accuracy decreases when "Tenure" is removed, that variable should be kept in the model.
# Going back to the model that keeps "Tenure" (model4), the variable with the highest p-value is now "NumOfProducts" at 0.032, which is below the chosen threshold of 0.05, so backward elimination stops here.

# In[17]:

df_x3[:5]
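# The manual steps above can be wrapped in a loop. A rough sketch (the function name
# `backward_eliminate` and the 0.05 threshold are illustrative, not part of the original
# notebook); note that it drops variables purely on p-values, whereas above the change in
# accuracy was also checked before discarding "Tenure":

def backward_eliminate(X, y, threshold=0.05):
    remaining = X.copy()
    while True:
        fitted = smdis.Logit(y, remaining).fit(disp=0)
        pvals = fitted.pvalues.drop('const')        # never drop the intercept
        worst = pvals.idxmax()
        if pvals[worst] < threshold:
            return fitted, remaining
        remaining = remaining.drop(worst, axis=1)

# Example usage: model_auto, X_auto = backward_eliminate(df_x, df_y)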
# #### Transforming Independent Variables

# Most common transformations:
#
# 1. $$\sqrt{x}$$
# 2. $$x^2$$
# 3. $$ln(x)$$

# Transform the "Balance" variable with a base-10 logarithm, log10(x + 1) (the +1 keeps zero balances defined):
#
# Original value: Balance (in units of 1000$):
#
# Bal2 = Bal1 + 1 unit --> Bal2 = Bal1 + 1000$
#
# Scenario 1: Bal1 = 1000$  --> Bal2 = 1000$ + 1000$ = 2000$
# Scenario 2: Bal1 = 10000$ --> Bal2 = 10000$ + 1000$ = 11000$
#
# log10(Balance + 1):
#
# log10(Bal2) = log10(Bal1) + 1 unit --> Bal2 = Bal1*10
#
# Scenario 1: Bal1 = 1000$  --> Bal2 = 1000*10 = 10000
# Scenario 2: Bal1 = 10000$ --> Bal2 = 10000*10 = 100000
#
# On the original scale, a customer who starts with a balance of 1000$ and gains 1000$ and a customer who starts with 10000$ and gains the same 1000$ are in completely different situations (a 100% versus a 10% increase).
#
# With the log transformation, a one-unit increase has the same effect regardless of the starting point: it always corresponds to multiplying the balance by 10, so a unit increase in the transformed variable has a consistent meaning.
#
# So regardless of who we are segmenting, the effect of a one-unit increase in the transformed variable is consistent throughout the population, which suits logistic regression much better.

# In[18]:

df_x4 = df_x3
df_x4['Balance_log'] = df_x4['Balance'].map(lambda x: np.log10(x + 1))
df_x4[:2]

# In[19]:

model6 = smdis.Logit(df_y, df_x4).fit()
print_model_report(model6)

# The accuracy is improved!

# #### Derived Variables
#
# Sometimes it is useful to account for combined effects; in our case, balance and age might be correlated, because the older a person is, the more wealth he/she can accumulate.
#
# So a new variable can be created:
#
# $$WealthAccumulation = \frac{Balance}{Age}$$

# In[20]:

df_x5 = df_x4
df_x5['WealthAccumulation'] = df_x5['Balance'] / df_x5['Age']
df_x5[:2]

# In[21]:

model7 = smdis.Logit(df_y, df_x5).fit()
print_model_report(model7)

# This new variable had a negative effect on the model accuracy.
#
# One should also keep in mind that there may be collinearity effects, because wealth accumulation is built from balance and age, which are already in the model. The new variable is therefore likely to be correlated with those two, and including all of them can harm the model, so one of them should be excluded.

# #### Multicollinearity
#
# Multicollinearity is a phenomenon in which two or more predictor variables in a multiple regression model are highly correlated, meaning that one can be linearly predicted from the others with a substantial degree of accuracy.
#
# The "Variance Inflation Factor" (VIF) can be used to measure the degree of multicollinearity.
#
# Minimum possible value = 1.0
# Values > 5 may indicate a collinearity problem.
#
# $$VIF(j) = \frac{1}{1 - R(j)^2}$$
#
# where R(j) is the multiple correlation coefficient between variable j and the other independent variables.
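# To connect the formula above to the `variance_inflation_factor` helper used in the next
# cell: the VIF of column j can be computed by regressing that column on all the other
# columns with OLS and applying 1 / (1 - R^2). A minimal sketch (the function name
# `vif_by_hand` is illustrative, not part of the original notebook):

def vif_by_hand(X, col):
    others = X.drop(col, axis=1)                       # all remaining columns, incl. the constant
    r_squared = sm.OLS(X[col], others).fit().rsquared  # auxiliary regression of col on the others
    return 1.0 / (1.0 - r_squared)

# Example: vif_by_hand(df_x5, 'Age') should match the 'Age' entry in the loop output below.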
# In[22]:

for i, column in enumerate(df_x5.columns):
    print column, outliers.variance_inflation_factor(df_x5.values, i)

# Rerun without the "Balance_log" variable:

# In[23]:

df_x6 = df_x5.drop('Balance_log', axis = 1)

# In[24]:

model8 = smdis.Logit(df_y, df_x6).fit()
print_model_report(model8)

# In[25]:

for i, column in enumerate(df_x6.columns):
    print column, outliers.variance_inflation_factor(df_x6.values, i)

# Create a new log-transformed variable for wealth accumulation:

# In[26]:

df_x7 = df_x5  # same DataFrame, not a copy: the new column is also visible through df_x5, which In[32] and In[33] rely on
df_x7['WealthAccumulation_log'] = df_x7['WealthAccumulation'].map(lambda x: np.log10(x + 1))
df_x7[:2]

# In[27]:

df_x7 = df_x7.drop(['Balance', 'WealthAccumulation'], axis = 1)

# In[28]:

model9 = smdis.Logit(df_y, df_x7).fit()
print_model_report(model9)

# In[29]:

for i, column in enumerate(df_x7.columns):
    print column, outliers.variance_inflation_factor(df_x7.values, i)

# "Balance_log" and "WealthAccumulation_log" have very large VIF values, which means these two variables carry essentially the same information. So one of them must be excluded!

# In[30]:

df_x8 = df_x7.drop(['Balance_log'], axis = 1)

# In[31]:

model10 = smdis.Logit(df_y, df_x8).fit()
print_model_report(model10)

# Correlation between variables:

# In[32]:

df_x5[['Age', 'Balance_log', 'WealthAccumulation_log', 'WealthAccumulation']].corr()

# As one can observe, the "WealthAccumulation_log" and "Balance_log" variables are highly correlated - they carry essentially the same information, which is very bad for the model.
#
# Rule of thumb: anything above 0.9 is a very high correlation; correlations above 0.5 should already be addressed.

# In[33]:

df_final = df_x5.drop(['Balance', 'WealthAccumulation', 'WealthAccumulation_log'], axis = 1)

# In[34]:

model11 = smdis.Logit(df_y, df_final).fit()
print_model_report(model11)

# In[35]:

for i, column in enumerate(df_final.columns):
    print column, outliers.variance_inflation_factor(df_final.values, i)

# Now, none of the variables shows multicollinearity.
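# As a complementary check to the VIFs (a sketch; it only uses `df_final` from the cell above),
# list any absolute pairwise correlations above the 0.5 rule of thumb mentioned earlier,
# excluding the constant and the trivial diagonal:

corr = df_final.drop('const', axis=1).corr().abs()
print corr[(corr > 0.5) & (corr < 1.0)].stack()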
# #### CAP - Cumulative Accuracy Profile Curve

# In[36]:

df_cap = df[['RowNumber', 'Exited']].copy()
df_cap['phat'] = model11.predict()

# In[37]:

df_cap.sort('phat', ascending=False)[:5]

# In[153]:

Total_Exited = sum(df_cap['Exited'])
print "Total Exited: %s" % Total_Exited
Total_Records = len(df)
print "Total number of clients: %s" % Total_Records
Exit_Ratio = Total_Exited / Total_Records
print "Ratio of exits: %s" % Exit_Ratio

# First, rank the clients from the highest to the lowest predicted probability of exiting; all the cumulative columns below are computed in that order.

# In[111]:

df_cap = df_cap.sort('phat', ascending=False)

# In[112]:

df_cap['total_selected'] = range(1, len(df_cap) + 1)
df_cap['total_selected_per'] = df_cap['total_selected'].map(lambda x: round(x*100/Total_Records, 1))
df_cap['random_selection'] = df_cap['total_selected'].map(lambda x: x*Exit_Ratio)
df_cap['random_selection_per'] = df_cap['total_selected'].map(lambda x: round(x*100/Total_Records, 1))

# In[113]:

df_cap['model_select'] = df_cap.Exited.cumsum()
df_cap['model_select_per'] = df_cap['model_select'].map(lambda x: round(x*100/Total_Exited, 1))

# In[114]:

df_cap[:5]

# In[115]:

plt.plot(df_cap['total_selected_per'], df_cap['random_selection_per'], label = 'Random model')
plt.plot(df_cap['total_selected_per'], df_cap['model_select_per'], label = 'Good model')
plt.plot([50, 50], [80, 0], 'r--', lw=1, label = 'x = 50%, y = X')
plt.plot([0, 50], [80, 80], 'r--', lw=1)
plt.plot([0, Exit_Ratio*100], [0, 100], 'y-', lw=3, label = 'Perfect model')
plt.plot([Exit_Ratio*100, 100], [100, 100], 'y-', lw=3)
plt.legend(bbox_to_anchor=(1, 1), loc='upper left', ncol=1)
plt.xlabel('Total Contacted (%)')
plt.ylabel('Total Exited (%)')
plt.show()

# The closer the green line (our model) is to the yellow line (perfect model), the better the model; the closer it is to the blue line (random model), the worse.
#
# __Accuracy Ratio__:
#
# - Take the area between the perfect model (yellow line) and the random model (blue line):
#
# $$a_p$$
#
# - Take the area between the good model (green line) and the random model (blue line):
#
# $$a_r$$
#
# - So the accuracy ratio is:
#
# $$AR = \frac{a_r}{a_p}$$
#
# which lies between 0 and 1; the closer the ratio is to 1, the better the model (a numeric approximation of this ratio is sketched right after the rule of thumb below).
#
# __Rule of thumb:__
#
# Another way of evaluating the model is to look at the red line, x = 50%, and read off y = X:
#
# If:
#
# 90% < X < 100% -> Too good (possible overfitting?)
# 80% < X < 90%  -> Very good
# 70% < X < 80%  -> Good
# 60% < X < 70%  -> Poor
# X < 60%        -> Rubbish
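# The accuracy ratio described above can be approximated numerically from the same columns
# (a rough sketch using the trapezoidal rule; it assumes `df_cap`, still sorted by `phat`,
# and `Exit_Ratio` from the cells above, and measures both areas above the random-model diagonal):

x = df_cap['total_selected_per'].values / 100.0
y = df_cap['model_select_per'].values / 100.0
a_r = np.trapz(y, x) - 0.5                  # area between the good model and the random model
a_p = (1.0 - Exit_Ratio / 2.0) - 0.5        # area between the perfect model and the random model
print "Accuracy Ratio (AR): %.3f" % (a_r / a_p)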
# #### Test model with test data

# In[67]:

df_test = pandas.read_csv('Churn-Modelling-Test-Data.csv')
df_test[:5]

# In[68]:

df_test = sm.add_constant(df_test)

# In[69]:

df_test_y = df_test['Exited']
df_test = df_test.drop(['Exited'], axis = 1)

# In[70]:

dummy = pandas.get_dummies(df_test['Gender'])
df_test = dummy.join(df_test)
dummy = pandas.get_dummies(df_test['Geography'])
df_test = dummy.join(df_test)
df_test[:5]

# In[71]:

df_test['Balance_log'] = df_test['Balance'].map(lambda x: np.log10(x + 1))

# Check the parameters included in the final model (model11), keep the same columns in the test set, and refit the model on the training rows so that the test rows can be scored:

# In[125]:

model11.params

# In[126]:

df_test = df_test[['Germany', 'Female', 'CreditScore', 'Age', 'Tenure', 'NumOfProducts', 'IsActiveMember', 'Balance_log', 'const']]

# In[127]:

df_total = df_final.append(df_test)

# In[128]:

model_final = smdis.Logit(df_y, df_total[:10000]).fit()

# In[139]:

y_pred = model_final.predict(df_total[10000:])
y_pred[:5]

# Let's construct the CAP curve:

# In[142]:

df_pred = pandas.DataFrame()
df_pred['phat'] = y_pred
df_pred['Exited'] = df_test_y
df_pred = df_pred.sort('phat', ascending=False)
df_pred[:5]

# In[152]:

Total_Exited = sum(df_pred['Exited'])
print "Total Exited: %s" % Total_Exited
Total_Records = len(df_pred)
print "Total number of clients: %s" % Total_Records
Exit_Ratio = Total_Exited / Total_Records
print "Ratio of exits: %s" % Exit_Ratio

# In[143]:

df_pred['total_selected'] = range(1, len(df_pred) + 1)
df_pred['total_selected_per'] = df_pred['total_selected'].map(lambda x: round(x*100/Total_Records, 1))
df_pred['random_selection'] = df_pred['total_selected'].map(lambda x: x*Exit_Ratio)
df_pred['random_selection_per'] = df_pred['total_selected'].map(lambda x: round(x*100/Total_Records, 1))

# In[144]:

df_pred = df_pred.sort('phat', ascending=False)

# In[145]:

df_pred['model_select'] = df_pred.Exited.cumsum()
df_pred['model_select_per'] = df_pred['model_select'].map(lambda x: round(x*100/Total_Exited, 1))

# In[146]:

df_pred[:5]

# In[151]:

plt.plot(df_pred['total_selected_per'], df_pred['random_selection_per'], label = 'Random model')
plt.plot(df_pred['total_selected_per'], df_pred['model_select_per'], label = 'Good model')
plt.plot([50, 50], [76, 0], 'r--', lw=1, label = 'x = 50%, y = X')
plt.plot([0, 50], [76, 76], 'r--', lw=1)
plt.plot([0, Exit_Ratio*100], [0, 100], 'y-', lw=3, label = 'Perfect model')
plt.plot([Exit_Ratio*100, 100], [100, 100], 'y-', lw=3)
plt.legend(bbox_to_anchor=(1, 1), loc='upper left', ncol=1)
plt.xlabel('Total Contacted (%)')
plt.ylabel('Total Exited (%)')
plt.show()

# __Conclusions__:
#
# - By the x = 50% rule of thumb, the model scores close to 80%, a bit worse than on the training data (a drop of about 3%).
# - The test set holds about 10 times fewer records than the training set, which explains the more rugged curve.

# ### Parameters Interpretation

# Odds ratio:
#
# $$odds = \frac{p}{1-p}$$
#
# Multiple logistic regression:
#
# $$ln(\frac{p}{1-p}) = b_0 + b_1x_1 + ... + b_nx_n$$
#
# So we can write:
#
# $$ln(odds) = b_0 + b_1x_1 + ... + b_nx_n$$
#
# $$\Leftrightarrow odds = e^{b_0 + b_1x_1 + ... + b_nx_n}$$
#
# $$\Leftrightarrow odds = e^{b_0} \cdot e^{b_1x_1} \cdot ... \cdot e^{b_nx_n}$$
#
# For example, if we increase the variable $$x_1$$ by one unit:
#
# $$e^{b_1x_1} \rightarrow e^{b_1(x_1 + 1)} = e^{b_1x_1}e^{b_1}$$
#
# which means that increasing an independent variable, e.g. $$x_1$$, by 1 unit multiplies the odds by a factor of $$e^{b_1}$$.
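# A quick numeric check of the derivation above (a sketch; it uses the fitted `model_final`
# and `df_total` from the previous cells, and picks 'Age' purely as an example variable):
# adding one unit to 'Age' for a single customer should multiply that customer's odds of
# exiting by exp(b_Age).

row = df_total[10000:].iloc[[0]].copy()     # one test customer, kept as a one-row DataFrame
p1 = float(model_final.predict(row))
row['Age'] = row['Age'] + 1                 # the same customer, one year older
p2 = float(model_final.predict(row))
print "Odds ratio from the two predictions: %.4f" % ((p2 / (1 - p2)) / (p1 / (1 - p1)))
print "exp(coefficient of 'Age'):           %.4f" % np.exp(model_final.params['Age'])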
# So let's transform the coefficients of our model into odds ratios:

# In[166]:

odds_ratio = np.exp(model_final.params)
odds_ratio

# - "Germany" has the highest odds ratio, which means that being in Germany has the largest impact on people leaving the bank - it is the most influential independent variable in the model. Changing from not-Germany to Germany, with everything else held constant, multiplies the odds of leaving by about 2.1.
# - The next one is gender. When we go from a male customer to a female customer, with all else held constant, the odds of a female customer leaving are about 1.7 times greater than those of a male customer - the odds increase by roughly 70%.
# - Being an active member ("IsActiveMember") has the opposite effect, because the value is less than 1: going from an inactive member to an active member multiplies the odds by 0.34, i.e. they drop by 66%, which is also great in terms of retention.
# - Anything with an odds ratio above 0.8 and below 1.2 has little impact and can largely be ignored.
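# A compact way to report the same interpretation (a sketch; it only relies on the fitted
# `model_final`): exponentiating the coefficients together with their 95% confidence bounds
# gives each odds ratio with an uncertainty range around it.

conf = model_final.conf_int()
conf['odds_ratio'] = model_final.params
conf.columns = ['2.5%', '97.5%', 'odds_ratio']
print np.exp(conf)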