#!/usr/bin/env python
# coding: utf-8

# # BANK MARKETING (With social and economic context)

# ## Summary

# In[1]:

from wand.image import Image

imageFromPdf = Image(filename='project21.pdf', resolution=100)
pages = len(imageFromPdf.sequence)
image = Image(width=imageFromPdf.width, height=imageFromPdf.height * pages)
for i in range(pages):
    image.composite(imageFromPdf.sequence[i], top=imageFromPdf.height * i, left=0)
image.format = "png"
image

# #### Importing necessary modules

# In[2]:

import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
import pandas as pd
import researchpy
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# #### Data overview

# In[3]:

data = pd.read_csv("/home/user/Downloads/bank-additional-full.csv", sep=';')
data.head()

# # 1. Preprocessing

# ## 1.1 Data Cleaning

# In[4]:

data.shape

# In[5]:

data[data.duplicated(keep=False)]

# In[6]:

data = data.drop_duplicates()

# In[7]:

data.shape

# In[8]:

data.isna().sum()

# In[9]:

data.dtypes

# ### 1.12 Handling Outliers

# - checking for outliers

# In[10]:

for column in data.dtypes[data.dtypes != 'object'].index:
    plt.figure()
    data.boxplot([column])

# - pdays and previous have outliers, but neither can be handled with the IQR rule, so each is treated separately

# #### 1. Column:- pdays

# - the IQR rule can't be applied to pdays: the placeholder 999 ("not previously contacted") dominates the column, so IQR clipping would collapse every value to 999
# - instead, pdays is binned into a categorical column

# In[11]:

data['y'] = data['y'].map({'yes': 1, 'no': 0})

# In[12]:

data.corr()['pdays']['y']

# In[13]:

data['pdays'] = pd.cut(data['pdays'], bins=[-0.9, 5, 31, 999],
                       labels=['0 to 5 days', '6 to 31 days', 'no previous cnt'])

# In[14]:

crosstab, res = researchpy.crosstab(data['y'], data['pdays'], test='chi-square')
res.iloc[[2]]

# #### 2. Column:- previous

# In[15]:

data.corr()['previous']['y']

# In[16]:

data['previous'].value_counts()

# In[17]:

data['previous'] = pd.cut(data['previous'], bins=[-0.9, 0, 1, 3, 7],
                          labels=['not contacted', '1 day', '2 or 3 days', '4 to 7 days'])

# In[18]:

data['previous'].value_counts()

# In[19]:

crosstab, res = researchpy.crosstab(data['y'], data['previous'], test='chi-square')
res.iloc[[2]]

# #### 3. Columns:- age, campaign, cons.conf.idx, duration

# In[20]:

from scipy.stats import iqr

list_box = ['age', 'campaign', 'cons.conf.idx', 'duration']
for column in list_box:
    UT = data[column].quantile(.75) + iqr(data[column]) * 1.5
    LT = data[column].quantile(.25) - iqr(data[column]) * 1.5
    data[column] = data[column].clip(lower=LT, upper=UT)

# ### 1.13 Handling Unknown

# - finding the percentage of 'unknown' present in each column

# In[21]:

unknown_pcnt = data.isin(['unknown']).sum() * (100 / len(data))
unknown_pcnt

# #### 1. Column:- default

# In[22]:

data['default'].value_counts()

# - default has >20% unknown

# In[23]:

cross, res = researchpy.crosstab(data['y'], data['default'], test='chi-square')
res

# - trying to replace unknown with the mode

# In[24]:

data_cp = data.copy()
data_cp.loc[data_cp['default'] == 'unknown', 'default'] = data_cp['default'].mode()[0]

# In[25]:

cross, res = researchpy.crosstab(data_cp['y'], data_cp['default'], test='chi-square')
res.iloc[[2]]

# - the effect size drops from 0.09 to 0.003, so the mode fill destroys the association
# - for the default column, 'unknown' is therefore kept as its own category

# #### 2. Other columns with unknown

# - all other affected columns (those with less than 20% unknown) are filled with the mode

# In[26]:

list_n = unknown_pcnt[(unknown_pcnt > 0) & (unknown_pcnt < 20)].index
for column in list_n:
    data.loc[data[column] == 'unknown', column] = data[column].mode()[0]
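# - as a quick sanity check (an added illustrative cell, not part of the original run): after the mode fill, the literal 'unknown' token should survive only in the default column

# In[ ]:

# recount 'unknown' per column; only 'default' should remain non-zero
remaining = data.isin(['unknown']).sum()
remaining[remaining > 0]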
# ## 1.2 EDA

# ### 1.21 Object & category variables

# ### 1. Columns:- job, y

# ##### jobs grouped according to yes / no

# In[27]:

cross_tab = pd.crosstab(index=data['y'], columns=data['job'], margins=True)
cross_tab

# ##### percentage-wise distribution of each job among all persons campaigned

# In[28]:

cross_tab = pd.crosstab(index=data['y'], columns=data['job'], margins=True, normalize='index') * 100
cross_tab

# - ##### job has a strong association with subscription

# In[29]:

cross_tab1 = pd.crosstab(index=data['job'], columns=data['y'], normalize='index') * 100
x = cross_tab1.index
fig = go.Figure(go.Bar(x=x, y=cross_tab1[0].values, name='no'))
fig.add_trace(go.Bar(x=x, y=cross_tab1[1].values, name='yes'))
fig.update_layout(barmode='stack')
fig.show()

# In[30]:

## y
fig = go.Figure(data=[go.Pie(labels=['no', 'yes'], values=data['y'].value_counts().values,
                             hole=.5, marker=dict(colors=['lightskyblue', 'limegreen']))])
fig.update_layout(annotations=[dict(text='y', x=0.5, y=0.5, font_size=20, showarrow=False)])
fig.show()

# - #### the campaign mostly covered admin. staff, yet only about 12% of them subscribed
# - #### students and retired persons subscribed at a higher rate than any other job
# - #### students subscribed at around 31%
# - #### retired persons subscribed at around 25%
# - #### y is imbalanced between yes and no, so SMOTE will be needed before modelling

# ### 2. Columns:- marital, poutcome, month, day_of_week

# In[31]:

cs_marital = pd.crosstab(data['y'], data['marital'], margins=True, normalize=True) * 100
cs_marital

# In[32]:

fig = make_subplots(rows=2, cols=2)
cs_marital = pd.crosstab(index=data['marital'], columns=data['y'], normalize='index') * 100
fig.add_trace(go.Bar(x=cs_marital.index, y=cs_marital[1].values, name='yes', marker_color='gold'), row=1, col=1)
fig.add_trace(go.Bar(x=cs_marital.index, y=cs_marital[0].values, name='no', marker_color='coral'), row=1, col=1)
fig.update_xaxes(title_text="marital status", row=1, col=1)
#
cs_poutcome = pd.crosstab(index=data['poutcome'], columns=data['y'], normalize='index') * 100
fig.add_trace(go.Bar(x=cs_poutcome.index, y=cs_poutcome[1].values, showlegend=False, marker_color='gold'), row=1, col=2)
fig.add_trace(go.Bar(x=cs_poutcome.index, y=cs_poutcome[0].values, showlegend=False, marker_color='coral'), row=1, col=2)
fig.update_xaxes(title_text="previous campaign outcome", row=1, col=2)
#
cs_month = pd.crosstab(index=data['month'], columns=data['y'], normalize='index') * 100
fig.add_trace(go.Bar(x=cs_month.index, y=cs_month[1].values, name='yes', showlegend=False, marker_color='gold'), row=2, col=1)
fig.add_trace(go.Bar(x=cs_month.index, y=cs_month[0].values, name='no', showlegend=False, marker_color='coral'), row=2, col=1)
fig.update_xaxes(title_text="last contact month", row=2, col=1)
#
cs_week_day = pd.crosstab(index=data['day_of_week'], columns=data['y'], normalize='index') * 100
fig.add_trace(go.Bar(x=cs_week_day.index, y=cs_week_day[1].values, showlegend=False, marker_color='gold'), row=2, col=2)
fig.add_trace(go.Bar(x=cs_week_day.index, y=cs_week_day[0].values, showlegend=False, marker_color='coral'), row=2, col=2)
fig.update_xaxes(title_text="last contact day", row=2, col=2)
fig.update_layout(barmode='stack', height=900, width=700)
fig.show()

# - marital status shows a weak association with subscription; the percentages barely differ across categories
# - among the single category, about 14% subscribed
# - last contact month has a very strong association (march, dec, oct and sep show higher subscription rates)
# - day of the week has a very weak association
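# - to put the strong/weak wording above on one scale, a small helper (an added sketch; it assumes researchpy keeps the Cramér's V / phi statistic in the third row of its results table, which is what the res.iloc[[2]] cells above rely on) ranking every remaining object column by its effect size against y

# In[ ]:

def cramers_v(col):
    # researchpy reports chi-square, p-value and Cramér's V in rows 0-2 of res
    _, res = researchpy.crosstab(data['y'], data[col], test='chi-square')
    return res.iloc[2, 1]

obj_cols = data.dtypes[data.dtypes == 'object'].index
pd.Series({col: cramers_v(col) for col in obj_cols}).sort_values(ascending=False)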
# ### 3. Columns:- housing, education, loan, contact

# In[33]:

fig = make_subplots(rows=2, cols=2)
cs_housing = pd.crosstab(index=data['housing'], columns=data['y'], normalize='index') * 100
fig.add_trace(go.Bar(x=cs_housing.index, y=cs_housing[1].values, name='yes', marker_color='midnightblue'), row=1, col=1)
fig.add_trace(go.Bar(x=cs_housing.index, y=cs_housing[0].values, name='no', marker_color='orchid'), row=1, col=1)
fig.update_xaxes(title_text="housing loan", row=1, col=1)
#
cs_education = pd.crosstab(index=data['education'], columns=data['y'], normalize='index') * 100
fig.add_trace(go.Bar(x=cs_education.index, y=cs_education[1].values, showlegend=False, marker_color='midnightblue'), row=1, col=2)
fig.add_trace(go.Bar(x=cs_education.index, y=cs_education[0].values, showlegend=False, marker_color='orchid'), row=1, col=2)
fig.update_xaxes(title_text="education", row=1, col=2)
#
cs_loan = pd.crosstab(index=data['loan'], columns=data['y'], normalize='index') * 100
fig.add_trace(go.Bar(x=cs_loan.index, y=cs_loan[1].values, name='yes', marker_color='midnightblue', showlegend=False), row=2, col=1)
fig.add_trace(go.Bar(x=cs_loan.index, y=cs_loan[0].values, name='no', marker_color='orchid', showlegend=False), row=2, col=1)
fig.update_xaxes(title_text="personal loan", row=2, col=1)
#
cs_contact = pd.crosstab(index=data['contact'], columns=data['y'], normalize='index') * 100
# fixed: this panel previously plotted cs_education's values against cs_contact's index
fig.add_trace(go.Bar(x=cs_contact.index, y=cs_contact[1].values, showlegend=False, marker_color='midnightblue'), row=2, col=2)
fig.add_trace(go.Bar(x=cs_contact.index, y=cs_contact[0].values, showlegend=False, marker_color='orchid'), row=2, col=2)
fig.update_xaxes(title_text="communication type", row=2, col=2)
fig.update_layout(barmode='stack', height=900, width=700)
fig.show()

# - housing loan has a very weak association with subscription
# - education has a weak association with subscription
# - about 22% of illiterate customers subscribed
# - communication type has a moderate association (cellular: about 10% subscribed)
# - personal loan has a very weak association

# ### 4. Columns:- pdays, previous

# In[34]:

fig = make_subplots(rows=1, cols=2)
cs_pdaynew = pd.crosstab(index=data['pdays'], columns=data['y'], normalize='index') * 100
fig.add_trace(go.Bar(x=cs_pdaynew.index, y=cs_pdaynew[1].values, name='yes', marker_color='midnightblue'), row=1, col=1)
fig.add_trace(go.Bar(x=cs_pdaynew.index, y=cs_pdaynew[0].values, name='no', marker_color='orchid'), row=1, col=1)
fig.update_xaxes(title_text="no. of days passed since previous campaign", row=1, col=1)
#
cs_previous = pd.crosstab(index=data['previous'], columns=data['y'], normalize='index') * 100
fig.add_trace(go.Bar(x=cs_previous.index, y=cs_previous[1].values, showlegend=False, marker_color='midnightblue'), row=1, col=2)
fig.add_trace(go.Bar(x=cs_previous.index, y=cs_previous[0].values, showlegend=False, marker_color='orchid'), row=1, col=2)
fig.update_xaxes(title_text="no. of contacts before this campaign", row=1, col=2)

# - the number of days that passed since the client was last contacted in a previous campaign has a very strong association
# - the number of contacts performed before this campaign for this client has a strong association
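# - before binning the continuous variables one by one below, an added convenience cell: the Pearson correlation of every remaining numeric column with y, to see at a glance which ones deserve a closer look

# In[ ]:

data.corr()['y'].drop('y').sort_values()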
# ### 1.22 Continuous and discrete variables

# ### 1. Column:- cons.conf.idx

# In[35]:

data_cp1 = data.copy()

# In[36]:

data_cp1['cons.conf.idx'] = pd.cut(data_cp1['cons.conf.idx'], bins=7)

# In[37]:

lis_con = data_cp1['cons.conf.idx'].value_counts().sort_index().index

# In[38]:

lis_con1 = [str(x) for x in lis_con]
cs_cons = pd.crosstab(index=data_cp1['cons.conf.idx'], columns=data_cp1['y'], normalize='index') * 100
fig = go.Figure(go.Bar(x=lis_con1, y=cs_cons[1].values, name='yes', marker_color='palegreen'))
fig.add_trace(go.Bar(x=lis_con1, y=cs_cons[0].values, name='no', marker_color='violet'))
fig.update_xaxes(title_text="cons.conf.idx")

# - it has a strong association with subscription
# - as the correlation indicated, the relation is inverse

# ### 2. Column:- duration

# In[39]:

data_cp1['duration'] = data_cp1['duration'] / 60

# In[40]:

data_cp1['duration'] = pd.cut(data_cp1['duration'], bins=[-0.5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])

# In[41]:

lis_dur = data_cp1['duration'].value_counts().sort_index().index
lis_dur1 = [str(x) for x in lis_dur]

# In[42]:

# relabel the (-0.5, 0] bin simply as '0' (zero call duration); the previous
# hardcoded label list dropped the (0, 1] bin and no longer matched the 12 bins
lis_dur1[0] = '0'

# In[43]:

cs_dur = pd.crosstab(index=data_cp1['duration'], columns=data_cp1['y'], normalize='index') * 100
fig = go.Figure(go.Bar(x=lis_dur1, y=cs_dur[1].values, name='yes', marker_color='gray'))
fig.add_trace(go.Bar(x=lis_dur1, y=cs_dur[0].values, name='no', marker_color='limegreen'))
fig.update_xaxes(title_text="duration (minutes)")

# - duration shows a high correlation with subscription
# - a duration of 0 implies not subscribed

# ### 3. Column:- euribor3m

# In[44]:

data_cp1['euribor3m'] = pd.qcut(data_cp1['euribor3m'], q=10)

# In[45]:

lis_eur = data_cp1['euribor3m'].value_counts().sort_index().index
lis_eur1 = [str(x) for x in lis_eur]

# In[46]:

cs_eur = pd.crosstab(index=data_cp1['euribor3m'], columns=data_cp1['y'], normalize='index') * 100
fig = go.Figure(go.Bar(x=lis_eur1, y=cs_eur[1].values, name='yes', marker_color='black'))
fig.add_trace(go.Bar(x=lis_eur1, y=cs_eur[0].values, name='no', marker_color='tomato'))
fig.update_xaxes(title_text="euribor 3 month rate")

# - shows a strong correlation
# - as the interbank interest rate increases, the percentage of subscription decreases

# ### 4. Column:- age

# In[47]:

data_cp1['age'] = pd.qcut(data_cp1['age'], q=10)

# In[48]:

lis_age = data_cp1['age'].value_counts().sort_index().index
lis_age1 = [str(x) for x in lis_age]

# In[49]:

cs_age = pd.crosstab(index=data_cp1['age'], columns=data_cp1['y'], normalize='index') * 100
fig = go.Figure(go.Bar(x=lis_age1, y=cs_age[1].values, name='yes', marker_color='gold'))
fig.add_trace(go.Bar(x=lis_age1, y=cs_age[0].values, name='no', marker_color='coral'))
fig.update_xaxes(title_text="age")

# - a good number of subscriptions comes from customers aged 17 to 31 and 55 to 70
# - consistent with the job analysis, most subscriptions come from students and retired customers

# ### 5. Column:- campaign

# In[50]:

data.corr()['campaign']['y']

# In[51]:

cs_camp = pd.crosstab(index=data['campaign'], columns=data['y'], normalize='index') * 100
fig = go.Figure(go.Bar(x=cs_camp.index, y=cs_camp[1].values, name='yes', marker_color='midnightblue'))
fig.add_trace(go.Bar(x=cs_camp.index, y=cs_camp[0].values, name='no', marker_color='orchid'))
fig.update_xaxes(title_text="campaign")

# - doesn't show any strong relation
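# - to back that up numerically (an added check), the subscription rate for each number of contacts made during this campaign

# In[ ]:

# mean of the 0/1 target per campaign count = percentage subscribed
data.groupby('campaign')['y'].mean().mul(100).round(1)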
# ### 6. Column:- emp.var.rate

# In[52]:

data_cp1['emp.var.rate'] = pd.cut(data_cp1['emp.var.rate'], bins=4)

# In[53]:

lis_emp = data_cp1['emp.var.rate'].value_counts().sort_index().index
lis_emp1 = [str(x) for x in lis_emp]

# In[54]:

cs_emp = pd.crosstab(index=data_cp1['emp.var.rate'], columns=data_cp1['y'], normalize='index') * 100
fig = go.Figure(go.Bar(x=lis_emp1, y=cs_emp[1].values, name='yes', marker_color='black'))
fig.add_trace(go.Bar(x=lis_emp1, y=cs_emp[0].values, name='no', marker_color='tomato'))
fig.update_xaxes(title_text="emp.var.rate")

# ### 7. Column:- nr.employed

# In[55]:

data_cp1['nr.employed'] = pd.cut(data_cp1['nr.employed'], bins=4)

# In[56]:

lis_nr = data_cp1['nr.employed'].value_counts().sort_index().index
lis_nr1 = [str(x) for x in lis_nr]

# In[57]:

cs_nr = pd.crosstab(index=data_cp1['nr.employed'], columns=data_cp1['y'], normalize='index') * 100
# fixed: this plot previously reused cs_emp's values instead of cs_nr's
fig = go.Figure(go.Bar(x=lis_nr1, y=cs_nr[1].values, name='yes', marker_color='midnightblue'))
fig.add_trace(go.Bar(x=lis_nr1, y=cs_nr[0].values, name='no', marker_color='orchid'))
fig.update_xaxes(title_text="nr.employed")

# - the number of subscriptions is higher when the number of employees is lower

# ### 8. Column:- cons.price.idx

# In[58]:

data_cp1['cons.price.idx'] = pd.qcut(data_cp1['cons.price.idx'], q=4)

# In[59]:

lis_price = data_cp1['cons.price.idx'].value_counts().sort_index().index
lis_price1 = [str(x) for x in lis_price]

# In[60]:

cs_price = pd.crosstab(index=data_cp1['cons.price.idx'], columns=data_cp1['y'], normalize='index') * 100
# fixed: this plot previously reused cs_emp's values instead of cs_price's
fig = go.Figure(go.Bar(x=lis_price1, y=cs_price[1].values, name='yes', marker_color='black'))
fig.add_trace(go.Bar(x=lis_price1, y=cs_price[0].values, name='no', marker_color='tomato'))
fig.update_xaxes(title_text="cons.price.idx")

# - a lower consumer price index shows a higher percentage of subscription

# # 2. Model Building

# ## 2.1 Splitting and SMOTE

# - dropping y (the target) and day_of_week (very weak association, as seen in the EDA)

# In[61]:

X = data.drop(['y', 'day_of_week'], axis=1)

# In[62]:

y = data['y']

# In[63]:

X.shape

# In[64]:

X = pd.get_dummies(X)

# - splitting

# In[65]:

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# In[66]:

from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)

# ## 2.2 Logistic regression

# In[67]:

from sklearn.linear_model import LogisticRegression

l_reg = LogisticRegression()

# In[33]:

from sklearn.model_selection import GridSearchCV

# note: these solvers only support the 'l2' penalty; with a recent scikit-learn
# the incompatible combinations fail to fit and are ignored by the search
solver_values = ['newton-cg', 'sag', 'lbfgs']
penalty_values = ['l1', 'l2', 'elasticnet']
parameters = {'solver': solver_values, 'penalty': penalty_values}

# In[42]:

logistic = GridSearchCV(l_reg, parameters)
logistic.fit(X_res, y_res)

# In[43]:

# obtaining the best parameters
logistic.best_params_

# In[66]:

logistic_reg = LogisticRegression(penalty='l2', solver='newton-cg')
logistic_reg.fit(X_res, y_res)

# In[67]:

y_pred = logistic_reg.predict(X_test)

# In[68]:

from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

# In[69]:

# accuracy score
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

# In[70]:

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))
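# - accuracy can flatter a model on this imbalanced test set, so as an added check (not part of the original evaluation): the ROC-AUC of the logistic model, computed from its predicted probabilities

# In[ ]:

from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, logistic_reg.predict_proba(X_test)[:, 1])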
# ## 2.3 Ridge classifier

# In[71]:

from sklearn.linear_model import RidgeClassifier

ridge = RidgeClassifier()
parameters = {'alpha': [1, 4, 4.7, 4.75, 4.8, 4.9, 5, 5.5, 10, 20]}
ridge_clr = GridSearchCV(ridge, parameters)
ridge_clr.fit(X_res, y_res)

# In[51]:

ridge_clr.best_params_

# In[72]:

ridge_classifier = RidgeClassifier(alpha=4.7)
ridge_classifier.fit(X_res, y_res)

# In[73]:

y_pred = ridge_classifier.predict(X_test)

# In[74]:

from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

# In[75]:

from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

# In[76]:

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

# ## 2.4 Random forest classifier

# In[78]:

from sklearn.ensemble import RandomForestClassifier

cls = RandomForestClassifier()

# In[57]:

n_estimators = [98, 100]
criterion = ['gini', 'entropy']
max_depth = [3, 5, 10]
max_features = ["log2", "sqrt"]
parameters = {'n_estimators': n_estimators, 'criterion': criterion,
              'max_features': max_features, 'max_depth': max_depth}
# the grid has 2*2*3*2 = 24 candidate combinations
RFC_cls = GridSearchCV(cls, parameters)
RFC_cls.fit(X_res, y_res)

# In[58]:

RFC_cls.best_params_

# In[84]:

rfc = RandomForestClassifier(n_estimators=100, criterion='entropy', max_depth=10, max_features='log2')

# In[85]:

rfc.fit(X_res, y_res)

# In[86]:

y_pred = rfc.predict(X_test)

# In[87]:

# accuracy score
accuracy_score(y_test, y_pred)

# In[88]:

print(classification_report(y_test, y_pred))

# ## 2.5 SVC

# In[55]:

from sklearn.svm import SVC

sv = SVC()

# In[92]:

parameters = {'kernel': ['linear', 'poly']}

# In[94]:

sv_c = GridSearchCV(sv, parameters)
sv_c.fit(X_res, y_res)

# In[56]:

sv_c.best_params_

# In[56]:

svclassifier = SVC(kernel='linear')
svclassifier.fit(X_res, y_res)

# In[57]:

ypred = svclassifier.predict(X_test)

# In[58]:

accuracy_score(y_test, ypred)

# In[59]:

from sklearn.metrics import classification_report
print(classification_report(y_test, ypred))

# ## 2.6 XGBoost

# In[40]:

import xgboost as xgb

xg_cl = xgb.XGBClassifier()

# In[46]:

max_depth = [3, 5, 10]
gamma = [0, 1e-3, 1e-2, 0.1]  # duplicate values removed from the grid
parameters = {'gamma': gamma, 'max_depth': max_depth}
xg = GridSearchCV(xg_cl, parameters)
xg.fit(X_res, y_res)

# In[47]:

xg.best_params_

# In[49]:

xgboost = xgb.XGBClassifier(gamma=0, max_depth=10)
xgboost.fit(X_res, y_res)

# In[50]:

ypred = xgboost.predict(X_test)

# In[52]:

from sklearn.metrics import accuracy_score
accuracy_score(y_test, ypred)

# In[54]:

from sklearn.metrics import classification_report
print(classification_report(y_test, ypred))
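# - as a closing side-by-side (an added summary cell; it assumes the five fitted estimators above are still in memory), test-set accuracy and F1 for each model

# In[ ]:

from sklearn.metrics import accuracy_score, f1_score

models = {'logistic regression': logistic_reg, 'ridge classifier': ridge_classifier,
          'random forest': rfc, 'svc': svclassifier, 'xgboost': xgboost}
for name, model in models.items():
    pred = model.predict(X_test)
    print(f"{name:>20}: accuracy={accuracy_score(y_test, pred):.3f}  f1={f1_score(y_test, pred):.3f}")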