#!/usr/bin/env python
# coding: utf-8

# # BANK MARKETING (With social and economic context)

# ## Summary

# In[1]:

from wand.image import Image

imageFromPdf = Image(filename='project21.pdf', resolution=100)
pages = len(imageFromPdf.sequence)
image = Image(width=imageFromPdf.width, height=imageFromPdf.height * pages)
for i in range(pages):
    image.composite(imageFromPdf.sequence[i], top=imageFromPdf.height * i, left=0)
image.format = "png"
image

# #### Importing necessary modules

# In[2]:

import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
import pandas as pd
import researchpy
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# #### Data overview

# In[3]:

data = pd.read_csv("/home/user/Downloads/bank-additional-full.csv", sep=';')
data.head()

# # 1. Preprocessing

# ## 1.1 Data Cleaning

# In[4]:

data.shape

# In[5]:

data[data.duplicated(keep=False)]

# In[6]:

data = data.drop_duplicates()

# In[7]:

data.shape

# In[8]:

data.isna().sum()

# In[9]:

data.dtypes

# ### 1.12 Handling Outliers

# - checking for outliers

# In[10]:

for column in data.dtypes[data.dtypes != 'object'].index:
    plt.figure()
    data.boxplot([column])

# - pdays and previous have outliers, but neither can be handled with the IQR rule, so each is treated separately

# #### 1. Column:- pdays

# - the IQR rule can't be applied to pdays: the placeholder 999 ("not previously contacted") dominates the column, so IQR clipping would collapse every value to 999
# - instead, pdays is binned into a categorical column

# In[11]:

data['y'] = data['y'].map({'yes': 1, 'no': 0})

# In[12]:

data.corr()['pdays']['y']

# In[13]:

data['pdays'] = pd.cut(data['pdays'], bins=[-0.9, 5, 31, 999],
                       labels=['0 to 5 days', '6 to 31 days', 'no previous cnt'])

# In[14]:

crosstab, res = researchpy.crosstab(data['y'], data['pdays'], test='chi-square')
res.iloc[[2]]

# #### 2. Column:- previous

# In[15]:

data.corr()['previous']['y']

# In[16]:

data['previous'].value_counts()

# In[17]:

data['previous'] = pd.cut(data['previous'], bins=[-0.9, 0, 1, 3, 7],
                          labels=['not contacted', '1 day', '2 or 3 days', '4 to 7 days'])

# In[18]:

data['previous'].value_counts()

# In[19]:

crosstab, res = researchpy.crosstab(data['y'], data['previous'], test='chi-square')
res.iloc[[2]]

# #### 3. Columns:- age, campaign, cons.conf.idx, duration

# In[20]:

from scipy.stats import iqr

list_box = ['age', 'campaign', 'cons.conf.idx', 'duration']
for column in list_box:
    UT = data[column].quantile(.75) + iqr(data[column]) * 1.5
    LT = data[column].quantile(.25) - iqr(data[column]) * 1.5
    data[column] = data[column].clip(lower=LT, upper=UT)

# ### 1.13 Handling Unknown

# - finding the percentage of 'unknown' present in each column

# In[21]:

unknown_pcnt = data.isin(['unknown']).sum() * (100 / len(data))
unknown_pcnt

# #### 1. Column:- default

# In[22]:

data['default'].value_counts()

# - default has >20% unknown

# In[23]:

cross, res = researchpy.crosstab(data['y'], data['default'], test='chi-square')
res

# - trying to replace unknown with the mode

# In[24]:

data_cp = data.copy()
data_cp.loc[data_cp['default'] == 'unknown', 'default'] = data_cp['default'].mode()[0]

# In[25]:

cross, res = researchpy.crosstab(data_cp['y'], data_cp['default'], test='chi-square')
res.iloc[[2]]

# - the effect size drops from 0.09 to 0.003, so the mode fill destroys the association
# - for the default column, 'unknown' is therefore kept as its own category

# #### 2. Other columns with unknown

# - all other affected columns (those with less than 20% unknown) are filled with the mode

# In[26]:

list_n = unknown_pcnt[(unknown_pcnt > 0) & (unknown_pcnt < 20)].index
for column in list_n:
    data.loc[data[column] == 'unknown', column] = data[column].mode()[0]
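# - as a quick sanity check (an added illustrative cell, not part of the original run): after the mode fill, the literal 'unknown' token should survive only in the default column

# In[ ]:

# recount 'unknown' per column; only 'default' should remain non-zero
remaining = data.isin(['unknown']).sum()
remaining[remaining > 0]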
# ## 1.2 EDA

# ### 1.21 Object & category variables

# ### 1. Columns:- job, y

# ##### jobs grouped according to yes / no

# In[27]:

cross_tab = pd.crosstab(index=data['y'], columns=data['job'], margins=True)
cross_tab

# ##### percentage-wise distribution of each job among all persons campaigned

# In[28]:

cross_tab = pd.crosstab(index=data['y'], columns=data['job'], margins=True, normalize='index') * 100
cross_tab

# - ##### job has a strong association with subscription

# In[29]:

cross_tab1 = pd.crosstab(index=data['job'], columns=data['y'], normalize='index') * 100
x = cross_tab1.index
fig = go.Figure(go.Bar(x=x, y=cross_tab1[0].values, name='no'))
fig.add_trace(go.Bar(x=x, y=cross_tab1[1].values, name='yes'))
fig.update_layout(barmode='stack')
fig.show()

# In[30]:

## y
fig = go.Figure(data=[go.Pie(labels=['no', 'yes'], values=data['y'].value_counts().values,
                             hole=.5, marker=dict(colors=['lightskyblue', 'limegreen']))])
fig.update_layout(annotations=[dict(text='y', x=0.5, y=0.5, font_size=20, showarrow=False)])
fig.show()

# - #### the campaign mostly covered admin. staff, yet only about 12% of them subscribed
# - #### students and retired persons subscribed at a higher rate than any other job
# - #### students subscribed at around 31%
# - #### retired persons subscribed at around 25%
# - #### y is imbalanced between yes and no, so SMOTE will be needed before modelling

# ### 2. Columns:- marital, poutcome, month, day_of_week

# In[31]:

cs_marital = pd.crosstab(data['y'], data['marital'], margins=True, normalize=True) * 100
cs_marital

# In[32]:

fig = make_subplots(rows=2, cols=2)
cs_marital = pd.crosstab(index=data['marital'], columns=data['y'], normalize='index') * 100
fig.add_trace(go.Bar(x=cs_marital.index, y=cs_marital[1].values, name='yes', marker_color='gold'), row=1, col=1)
fig.add_trace(go.Bar(x=cs_marital.index, y=cs_marital[0].values, name='no', marker_color='coral'), row=1, col=1)
fig.update_xaxes(title_text="marital status", row=1, col=1)
#
cs_poutcome = pd.crosstab(index=data['poutcome'], columns=data['y'], normalize='index') * 100
fig.add_trace(go.Bar(x=cs_poutcome.index, y=cs_poutcome[1].values, showlegend=False, marker_color='gold'), row=1, col=2)
fig.add_trace(go.Bar(x=cs_poutcome.index, y=cs_poutcome[0].values, showlegend=False, marker_color='coral'), row=1, col=2)
fig.update_xaxes(title_text="previous campaign outcome", row=1, col=2)
#
cs_month = pd.crosstab(index=data['month'], columns=data['y'], normalize='index') * 100
fig.add_trace(go.Bar(x=cs_month.index, y=cs_month[1].values, name='yes', showlegend=False, marker_color='gold'), row=2, col=1)
fig.add_trace(go.Bar(x=cs_month.index, y=cs_month[0].values, name='no', showlegend=False, marker_color='coral'), row=2, col=1)
fig.update_xaxes(title_text="last contact month", row=2, col=1)
#
cs_week_day = pd.crosstab(index=data['day_of_week'], columns=data['y'], normalize='index') * 100
fig.add_trace(go.Bar(x=cs_week_day.index, y=cs_week_day[1].values, showlegend=False, marker_color='gold'), row=2, col=2)
fig.add_trace(go.Bar(x=cs_week_day.index, y=cs_week_day[0].values, showlegend=False, marker_color='coral'), row=2, col=2)
fig.update_xaxes(title_text="last contact day", row=2, col=2)
fig.update_layout(barmode='stack', height=900, width=700)
fig.show()

# - marital status shows a weak association with subscription; the percentages barely differ across categories
# - among the single category, about 14% subscribed
# - last contact month has a very strong association (march, dec, oct and sep show higher subscription rates)
# - day of the week has a very weak association
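# - to put the strong/weak wording above on one scale, a small helper (an added sketch; it assumes researchpy keeps the Cramér's V / phi statistic in the third row of its results table, which is what the res.iloc[[2]] cells above rely on) ranking every remaining object column by its effect size against y

# In[ ]:

def cramers_v(col):
    # researchpy reports chi-square, p-value and Cramér's V in rows 0-2 of res
    _, res = researchpy.crosstab(data['y'], data[col], test='chi-square')
    return res.iloc[2, 1]

obj_cols = data.dtypes[data.dtypes == 'object'].index
pd.Series({col: cramers_v(col) for col in obj_cols}).sort_values(ascending=False)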
# ### 3. Columns:- housing, education, loan, contact

# In[33]:

fig = make_subplots(rows=2, cols=2)
cs_housing = pd.crosstab(index=data['housing'], columns=data['y'], normalize='index') * 100
fig.add_trace(go.Bar(x=cs_housing.index, y=cs_housing[1].values, name='yes', marker_color='midnightblue'), row=1, col=1)
fig.add_trace(go.Bar(x=cs_housing.index, y=cs_housing[0].values, name='no', marker_color='orchid'), row=1, col=1)
fig.update_xaxes(title_text="housing loan", row=1, col=1)
#
cs_education = pd.crosstab(index=data['education'], columns=data['y'], normalize='index') * 100
fig.add_trace(go.Bar(x=cs_education.index, y=cs_education[1].values, showlegend=False, marker_color='midnightblue'), row=1, col=2)
fig.add_trace(go.Bar(x=cs_education.index, y=cs_education[0].values, showlegend=False, marker_color='orchid'), row=1, col=2)
fig.update_xaxes(title_text="education", row=1, col=2)
#
cs_loan = pd.crosstab(index=data['loan'], columns=data['y'], normalize='index') * 100
fig.add_trace(go.Bar(x=cs_loan.index, y=cs_loan[1].values, name='yes', marker_color='midnightblue', showlegend=False), row=2, col=1)
fig.add_trace(go.Bar(x=cs_loan.index, y=cs_loan[0].values, name='no', marker_color='orchid', showlegend=False), row=2, col=1)
fig.update_xaxes(title_text="personal loan", row=2, col=1)
#
cs_contact = pd.crosstab(index=data['contact'], columns=data['y'], normalize='index') * 100
# fixed: this panel previously plotted cs_education's values against cs_contact's index
fig.add_trace(go.Bar(x=cs_contact.index, y=cs_contact[1].values, showlegend=False, marker_color='midnightblue'), row=2, col=2)
fig.add_trace(go.Bar(x=cs_contact.index, y=cs_contact[0].values, showlegend=False, marker_color='orchid'), row=2, col=2)
fig.update_xaxes(title_text="communication type", row=2, col=2)
fig.update_layout(barmode='stack', height=900, width=700)
fig.show()

# - housing loan has a very weak association with subscription
# - education has a weak association with subscription
# - about 22% of illiterate customers subscribed
# - communication type has a moderate association (cellular: about 10% subscribed)
# - personal loan has a very weak association

# ### 4. Columns:- pdays, previous

# In[34]:

fig = make_subplots(rows=1, cols=2)
cs_pdaynew = pd.crosstab(index=data['pdays'], columns=data['y'], normalize='index') * 100
fig.add_trace(go.Bar(x=cs_pdaynew.index, y=cs_pdaynew[1].values, name='yes', marker_color='midnightblue'), row=1, col=1)
fig.add_trace(go.Bar(x=cs_pdaynew.index, y=cs_pdaynew[0].values, name='no', marker_color='orchid'), row=1, col=1)
fig.update_xaxes(title_text="no. of days passed since previous campaign", row=1, col=1)
#
cs_previous = pd.crosstab(index=data['previous'], columns=data['y'], normalize='index') * 100
fig.add_trace(go.Bar(x=cs_previous.index, y=cs_previous[1].values, showlegend=False, marker_color='midnightblue'), row=1, col=2)
fig.add_trace(go.Bar(x=cs_previous.index, y=cs_previous[0].values, showlegend=False, marker_color='orchid'), row=1, col=2)
fig.update_xaxes(title_text="no. of contacts before this campaign", row=1, col=2)

# - the number of days that passed since the client was last contacted in a previous campaign has a very strong association
# - the number of contacts performed before this campaign for this client has a strong association
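# - before binning the continuous variables one by one below, an added convenience cell: the Pearson correlation of every remaining numeric column with y, to see at a glance which ones deserve a closer look

# In[ ]:

data.corr()['y'].drop('y').sort_values()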
# ### 1.22 Continuous and discrete variables

# ### 1. Column:- cons.conf.idx

# In[35]:

data_cp1 = data.copy()

# In[36]:

data_cp1['cons.conf.idx'] = pd.cut(data_cp1['cons.conf.idx'], bins=7)

# In[37]:

lis_con = data_cp1['cons.conf.idx'].value_counts().sort_index().index

# In[38]:

lis_con1 = [str(x) for x in lis_con]
cs_cons = pd.crosstab(index=data_cp1['cons.conf.idx'], columns=data_cp1['y'], normalize='index') * 100
fig = go.Figure(go.Bar(x=lis_con1, y=cs_cons[1].values, name='yes', marker_color='palegreen'))
fig.add_trace(go.Bar(x=lis_con1, y=cs_cons[0].values, name='no', marker_color='violet'))
fig.update_xaxes(title_text="cons.conf.idx")

# - it has a strong association with subscription
# - as the correlation indicated, the relation is inverse

# ### 2. Column:- duration

# In[39]:

data_cp1['duration'] = data_cp1['duration'] / 60

# In[40]:

data_cp1['duration'] = pd.cut(data_cp1['duration'], bins=[-0.5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])

# In[41]:

lis_dur = data_cp1['duration'].value_counts().sort_index().index
lis_dur1 = [str(x) for x in lis_dur]

# In[42]:

# relabel the (-0.5, 0] bin simply as '0' (zero call duration); the previous
# hardcoded label list dropped the (0, 1] bin and no longer matched the 12 bins
lis_dur1[0] = '0'

# In[43]:

cs_dur = pd.crosstab(index=data_cp1['duration'], columns=data_cp1['y'], normalize='index') * 100
fig = go.Figure(go.Bar(x=lis_dur1, y=cs_dur[1].values, name='yes', marker_color='gray'))
fig.add_trace(go.Bar(x=lis_dur1, y=cs_dur[0].values, name='no', marker_color='limegreen'))
fig.update_xaxes(title_text="duration (minutes)")

# - duration shows a high correlation with subscription
# - a duration of 0 implies not subscribed

# ### 3. Column:- euribor3m

# In[44]:

data_cp1['euribor3m'] = pd.qcut(data_cp1['euribor3m'], q=10)

# In[45]:

lis_eur = data_cp1['euribor3m'].value_counts().sort_index().index
lis_eur1 = [str(x) for x in lis_eur]

# In[46]:

cs_eur = pd.crosstab(index=data_cp1['euribor3m'], columns=data_cp1['y'], normalize='index') * 100
fig = go.Figure(go.Bar(x=lis_eur1, y=cs_eur[1].values, name='yes', marker_color='black'))
fig.add_trace(go.Bar(x=lis_eur1, y=cs_eur[0].values, name='no', marker_color='tomato'))
fig.update_xaxes(title_text="euribor 3 month rate")

# - shows a strong correlation
# - as the interbank interest rate increases, the percentage of subscription decreases

# ### 4. Column:- age

# In[47]:

data_cp1['age'] = pd.qcut(data_cp1['age'], q=10)

# In[48]:

lis_age = data_cp1['age'].value_counts().sort_index().index
lis_age1 = [str(x) for x in lis_age]

# In[49]:

cs_age = pd.crosstab(index=data_cp1['age'], columns=data_cp1['y'], normalize='index') * 100
fig = go.Figure(go.Bar(x=lis_age1, y=cs_age[1].values, name='yes', marker_color='gold'))
fig.add_trace(go.Bar(x=lis_age1, y=cs_age[0].values, name='no', marker_color='coral'))
fig.update_xaxes(title_text="age")

# - a good number of subscriptions comes from customers aged 17 to 31 and 55 to 70
# - consistent with the job analysis, most subscriptions come from students and retired customers

# ### 5. Column:- campaign

# In[50]:

data.corr()['campaign']['y']

# In[51]:

cs_camp = pd.crosstab(index=data['campaign'], columns=data['y'], normalize='index') * 100
fig = go.Figure(go.Bar(x=cs_camp.index, y=cs_camp[1].values, name='yes', marker_color='midnightblue'))
fig.add_trace(go.Bar(x=cs_camp.index, y=cs_camp[0].values, name='no', marker_color='orchid'))
fig.update_xaxes(title_text="campaign")

# - doesn't show any strong relation
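# - to back that up numerically (an added check), the subscription rate for each number of contacts made during this campaign

# In[ ]:

# mean of the 0/1 target per campaign count = percentage subscribed
data.groupby('campaign')['y'].mean().mul(100).round(1)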
# ### 6. Column:- emp.var.rate

# In[52]:

data_cp1['emp.var.rate'] = pd.cut(data_cp1['emp.var.rate'], bins=4)

# In[53]:

lis_emp = data_cp1['emp.var.rate'].value_counts().sort_index().index
lis_emp1 = [str(x) for x in lis_emp]

# In[54]:

cs_emp = pd.crosstab(index=data_cp1['emp.var.rate'], columns=data_cp1['y'], normalize='index') * 100
fig = go.Figure(go.Bar(x=lis_emp1, y=cs_emp[1].values, name='yes', marker_color='black'))
fig.add_trace(go.Bar(x=lis_emp1, y=cs_emp[0].values, name='no', marker_color='tomato'))
fig.update_xaxes(title_text="emp.var.rate")

# ### 7. Column:- nr.employed

# In[55]:

data_cp1['nr.employed'] = pd.cut(data_cp1['nr.employed'], bins=4)

# In[56]:

lis_nr = data_cp1['nr.employed'].value_counts().sort_index().index
lis_nr1 = [str(x) for x in lis_nr]

# In[57]:

cs_nr = pd.crosstab(index=data_cp1['nr.employed'], columns=data_cp1['y'], normalize='index') * 100
# fixed: this plot previously reused cs_emp's values instead of cs_nr's
fig = go.Figure(go.Bar(x=lis_nr1, y=cs_nr[1].values, name='yes', marker_color='midnightblue'))
fig.add_trace(go.Bar(x=lis_nr1, y=cs_nr[0].values, name='no', marker_color='orchid'))
fig.update_xaxes(title_text="nr.employed")

# - the number of subscriptions is higher when the number of employees is lower

# ### 8. Column:- cons.price.idx

# In[58]:

data_cp1['cons.price.idx'] = pd.qcut(data_cp1['cons.price.idx'], q=4)

# In[59]:

lis_price = data_cp1['cons.price.idx'].value_counts().sort_index().index
lis_price1 = [str(x) for x in lis_price]

# In[60]:

cs_price = pd.crosstab(index=data_cp1['cons.price.idx'], columns=data_cp1['y'], normalize='index') * 100
# fixed: this plot previously reused cs_emp's values instead of cs_price's
fig = go.Figure(go.Bar(x=lis_price1, y=cs_price[1].values, name='yes', marker_color='black'))
fig.add_trace(go.Bar(x=lis_price1, y=cs_price[0].values, name='no', marker_color='tomato'))
fig.update_xaxes(title_text="cons.price.idx")

# - a lower consumer price index shows a higher percentage of subscription

# # 2. Model Building

# ## 2.1 Splitting and SMOTE

# - dropping y (the target) and day_of_week (very weak association, as seen in the EDA)

# In[61]:

X = data.drop(['y', 'day_of_week'], axis=1)

# In[62]:

y = data['y']

# In[63]:

X.shape

# In[64]:

X = pd.get_dummies(X)

# - splitting

# In[65]:

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# In[66]:

from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)

# ## 2.2 Logistic regression

# In[67]:

from sklearn.linear_model import LogisticRegression

l_reg = LogisticRegression()

# In[33]:

from sklearn.model_selection import GridSearchCV

# note: these solvers only support the 'l2' penalty; with a recent scikit-learn
# the incompatible combinations fail to fit and are ignored by the search
solver_values = ['newton-cg', 'sag', 'lbfgs']
penalty_values = ['l1', 'l2', 'elasticnet']
parameters = {'solver': solver_values, 'penalty': penalty_values}

# In[42]:

logistic = GridSearchCV(l_reg, parameters)
logistic.fit(X_res, y_res)

# In[43]:

# obtaining the best parameters
logistic.best_params_

# In[66]:

logistic_reg = LogisticRegression(penalty='l2', solver='newton-cg')
logistic_reg.fit(X_res, y_res)

# In[67]:

y_pred = logistic_reg.predict(X_test)

# In[68]:

from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

# In[69]:

# accuracy score
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

# In[70]:

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))
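# - accuracy can flatter a model on this imbalanced test set, so as an added check (not part of the original evaluation): the ROC-AUC of the logistic model, computed from its predicted probabilities

# In[ ]:

from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, logistic_reg.predict_proba(X_test)[:, 1])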
# ## 2.3 Ridge classifier

# In[71]:

from sklearn.linear_model import RidgeClassifier

ridge = RidgeClassifier()
parameters = {'alpha': [1, 4, 4.7, 4.75, 4.8, 4.9, 5, 5.5, 10, 20]}
ridge_clr = GridSearchCV(ridge, parameters)
ridge_clr.fit(X_res, y_res)

# In[51]:

ridge_clr.best_params_

# In[72]:

ridge_classifier = RidgeClassifier(alpha=4.7)
ridge_classifier.fit(X_res, y_res)

# In[73]:

y_pred = ridge_classifier.predict(X_test)

# In[74]:

from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

# In[75]:

from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

# In[76]:

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

# ## 2.4 Random forest classifier

# In[78]:

from sklearn.ensemble import RandomForestClassifier

cls = RandomForestClassifier()

# In[57]:

n_estimators = [98, 100]
criterion = ['gini', 'entropy']
max_depth = [3, 5, 10]
max_features = ["log2", "sqrt"]
parameters = {'n_estimators': n_estimators, 'criterion': criterion,
              'max_features': max_features, 'max_depth': max_depth}
# the grid has 2*2*3*2 = 24 candidate combinations
RFC_cls = GridSearchCV(cls, parameters)
RFC_cls.fit(X_res, y_res)

# In[58]:

RFC_cls.best_params_

# In[84]:

rfc = RandomForestClassifier(n_estimators=100, criterion='entropy', max_depth=10, max_features='log2')

# In[85]:

rfc.fit(X_res, y_res)

# In[86]:

y_pred = rfc.predict(X_test)

# In[87]:

# accuracy score
accuracy_score(y_test, y_pred)

# In[88]:

print(classification_report(y_test, y_pred))

# ## 2.5 SVC

# In[55]:

from sklearn.svm import SVC

sv = SVC()

# In[92]:

parameters = {'kernel': ['linear', 'poly']}

# In[94]:

sv_c = GridSearchCV(sv, parameters)
sv_c.fit(X_res, y_res)

# In[56]:

sv_c.best_params_

# In[56]:

svclassifier = SVC(kernel='linear')
svclassifier.fit(X_res, y_res)

# In[57]:

ypred = svclassifier.predict(X_test)

# In[58]:

accuracy_score(y_test, ypred)

# In[59]:

from sklearn.metrics import classification_report
print(classification_report(y_test, ypred))

# ## 2.6 XGBoost

# In[40]:

import xgboost as xgb

xg_cl = xgb.XGBClassifier()

# In[46]:

max_depth = [3, 5, 10]
gamma = [0, 1e-3, 1e-2, 0.1]  # duplicate values removed from the grid
parameters = {'gamma': gamma, 'max_depth': max_depth}
xg = GridSearchCV(xg_cl, parameters)
xg.fit(X_res, y_res)

# In[47]:

xg.best_params_

# In[49]:

xgboost = xgb.XGBClassifier(gamma=0, max_depth=10)
xgboost.fit(X_res, y_res)

# In[50]:

ypred = xgboost.predict(X_test)

# In[52]:

from sklearn.metrics import accuracy_score
accuracy_score(y_test, ypred)

# In[54]:

from sklearn.metrics import classification_report
print(classification_report(y_test, ypred))
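# - as a closing side-by-side (an added summary cell; it assumes the five fitted estimators above are still in memory), test-set accuracy and F1 for each model

# In[ ]:

from sklearn.metrics import accuracy_score, f1_score

models = {'logistic regression': logistic_reg, 'ridge classifier': ridge_classifier,
          'random forest': rfc, 'svc': svclassifier, 'xgboost': xgboost}
for name, model in models.items():
    pred = model.predict(X_test)
    print(f"{name:>20}: accuracy={accuracy_score(y_test, pred):.3f}  f1={f1_score(y_test, pred):.3f}")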