#!/usr/bin/env python
# coding: utf-8

# In[1]:


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from scipy.stats import randint as sp_randint
from mlxtend.classifier import StackingClassifier
from imblearn.over_sampling import SMOTE

plt.style.use('fivethirtyeight')
sns.set(color_codes=True)


# In[2]:


train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")


# In[3]:


train.head()


# In[4]:


train.info()


# In[5]:


print("Number of records in train data {}".format(train.shape[0]))
print("Number of records in test data {}".format(test.shape[0]))


# In[6]:


train.columns

# # 1. Exploratory Data Analysis


# ## 1.1. Missing Values

# * In the train dataset, **`Credit_Product`** has **11.93%** missing values.
# * In the test dataset, **`Credit_Product`** has **11.89%** missing values.

# In[7]:


train.isnull().mean()*100


# In[8]:


test.isnull().mean()*100

# ## 1.2. Target Variable Distribution

# The target variable **`Is_Lead`** is imbalanced between the two classes: **0** (**76.27%**) and **1** (**23.72%**). We will use an over-sampling technique such as **`SMOTE`** to balance the dataset.

# In[9]:


plt.figure(figsize=(10,6))
sns.countplot(x="Is_Lead", data=train)
plt.title("Is_Lead Distribution")
plt.show()


# In[10]:


train['Is_Lead'].value_counts(normalize=True)*100

# ## 1.3. Numerical Features Distribution

# The following charts on the numerical features provide the following insights:
# * The Age distribution in the train dataset and the test dataset is almost identical.
# * The Avg_Account_Balance feature is skewed.
# * Customers aged between 40-60 have greater interest in credit cards.
# * Customers in their 20s and 30s are less interested.
# * The Avg_Account_Balance and Vintage distributions in the train dataset and the test dataset are almost identical.

# In[11]:


numerical_feat = [feat for feat in train.columns if (train[feat].dtypes == 'int64') and (feat != 'Is_Lead')]
interested = train['Is_Lead'] == 1

fig, axs = plt.subplots(ncols=2, nrows=len(numerical_feat), figsize=(20,20))
plt.subplots_adjust(right=1.5)

for i, col in enumerate(numerical_feat):
    # Distribution of Is Lead in feature
    sns.distplot(train[~interested][col], label='Not Interested', hist=True, color='#e74c3c', ax=axs[i][0])
    sns.distplot(train[interested][col], label='Interested', hist=True, color='#2ecc71', ax=axs[i][0])
    axs[i][0].set_xlabel('')
    axs[i][0].tick_params(axis='x', labelsize=20)
    axs[i][0].tick_params(axis='y', labelsize=20)
    axs[i][0].legend(loc='upper right', prop={'size': 20})
    axs[i][0].set_title('Distribution of Is Lead in {}'.format(col), size=20, y=1.05)

    # Distribution of feature in dataset
    sns.distplot(train[col], label='Training Set', hist=False, color='#e74c3c', ax=axs[i][1])
    sns.distplot(test[col], label='Test Set', hist=False, color='#2ecc71', ax=axs[i][1])
    axs[i][1].set_xlabel('')
    axs[i][1].tick_params(axis='x', labelsize=20)
    axs[i][1].tick_params(axis='y', labelsize=20)
    axs[i][1].legend(loc='upper right', prop={'size': 20})
    axs[i][1].set_title('Distribution of {} in dataset'.format(col), size=20, y=1.05)

plt.show()


# In[12]:


for feat in numerical_feat:
    print('{} feature has a skewness of {} and kurtosis of {} in train set'.format(feat, train[feat].skew().round(3), train[feat].kurtosis().round(3)))
    print('{} feature has a skewness of {} and kurtosis of {} in test set'.format(feat, test[feat].skew().round(3), test[feat].kurtosis().round(3)))
    print()
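# A hedged sketch (not applied later in this notebook): since Avg_Account_Balance is
# right-skewed, one quick check is how much a log transform would reduce that skew.

# In[ ]:


# Hedged sketch, assuming the raw train frame from the cells above.
balance_log = np.log1p(train['Avg_Account_Balance'])
print('Raw Avg_Account_Balance skewness  :', round(train['Avg_Account_Balance'].skew(), 3))
print('log1p Avg_Account_Balance skewness:', round(balance_log.skew(), 3))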

# ## 1.4. Categorical Features Distribution

# The following charts on the categorical features provide the following insights:
# * Salaried customers are less likely to take up credit cards. Only among Entrepreneurs is the share of customers interested in credit cards higher.
# * Male customers are more numerous in the dataset than female customers.
# * Customers who were active in the last 3 months have slightly more interest in credit cards than other customers.
# * The number of customers with a credit product who are interested in a credit card is higher than for those who do not have a credit product.

# In[13]:


categorical_feat = [col for col in train.columns if (train[col].dtypes == 'object') and (col != 'ID')]

fig, axs = plt.subplots(ncols=3, nrows=2, figsize=(20,20))
plt.subplots_adjust(right=1.5)

for i, col in enumerate(categorical_feat):
    plt.subplot(2, 3, i+1)
    # Distribution of Is Lead in feature
    sns.countplot(x=col, data=train, hue='Is_Lead')
    plt.xlabel('')
    plt.ylabel('')
    plt.tick_params(axis='x', labelsize=20)
    plt.tick_params(axis='y', labelsize=20)
    plt.legend(loc='upper right', prop={'size': 20})
    plt.title('Count of Is Lead in {}'.format(col), size=20, y=1.05)

plt.show()

# ## 1.5. Correlations

# * **`Age`** and **`Vintage`** have the highest correlation: **0.63** in the train dataset and **0.62** in the test dataset.

# In[14]:


fig, axs = plt.subplots(nrows=2, figsize=(20, 20))

sns.heatmap(train.drop(['ID'], axis=1).corr(), ax=axs[0], annot=True, square=True, cmap='coolwarm', annot_kws={'size': 12})
sns.heatmap(test.drop(['ID'], axis=1).corr(), ax=axs[1], annot=True, square=True, cmap='coolwarm', annot_kws={'size': 12})

for i in range(2):
    axs[i].tick_params(axis='x', labelsize=10)
    axs[i].tick_params(axis='y', labelsize=10)

axs[0].set_title('Training Set Correlations', size=15)
axs[1].set_title('Test Set Correlations', size=15)

plt.show()

# ## 1.6. Outliers

# In the train dataset, **`Avg_Account_Balance`** has outliers.

# In[15]:


fig, axs = plt.subplots(ncols=3, nrows=1, figsize=(10,6))
plt.subplots_adjust(right=1.5)

for i, col in enumerate(numerical_feat):
    plt.subplot(1, 3, i+1)
    sns.boxplot(x=col, data=train)
    plt.xlabel('')
    plt.ylabel('')
    plt.tick_params(axis='x', labelsize=12)
    plt.tick_params(axis='y', labelsize=12)
    #plt.legend(loc='upper right', prop={'size': 20})
    plt.title('Boxplot of {}'.format(col), size=12, y=1.05)

plt.show()


# Data Insights
#
# * Salaried customers aged 40-65 are interested in buying a credit card. Most Entrepreneurs also seem interested, but not all.
# * Customers who already have a credit product are likely to buy a credit card.
# * Salaried people with Channel Code X1 haven't shown much interest in the past.
# * There are only 2 Entrepreneurs who don't have any credit product.
# * 66% of the customers in the Entrepreneur Occupation category have shown interest in the past, followed by 27.6% of the Self_Employed, 24.5% of the Other category and 16% of the Salaried.

# In[21]:


total_customers_by_occu = train.groupby(['Occupation'])['ID'].count().reset_index()
total_customers_by_occu.rename(columns={'ID': 'Total Customers'}, inplace=True)

num_customers_by_occupation = train.groupby(['Is_Lead', 'Occupation'])['ID'].count().reset_index()
num_customers_by_occupation = pd.merge(num_customers_by_occupation, total_customers_by_occu, how='inner', on='Occupation')
num_customers_by_occupation['% of Total'] = round((num_customers_by_occupation['ID'] / num_customers_by_occupation['Total Customers']) * 100, 1)
num_customers_by_occupation = num_customers_by_occupation.sort_values(by=['Is_Lead', '% of Total'], ascending=[False, False])
num_customers_by_occupation.drop(columns=['ID', 'Total Customers'], inplace=True, axis=1)

df = pd.crosstab(num_customers_by_occupation['Occupation'], num_customers_by_occupation['Is_Lead'],
                 values=num_customers_by_occupation['% of Total'], aggfunc=sum)


# In[27]:


# Function to plot stacked bars with annotations
def plot_stack_bars(df, title_, y_label, size_=(20,10), rot_=0, legend_='upper right'):
    ax = df.plot(kind='bar', stacked=True, figsize=size_, rot=rot_, title=title_)
    annotate_plot(ax, textsize=15)
    plt.legend([0, 1], loc=legend_)
    plt.ylabel(y_label)
    plt.show()


def annotate_plot(ax, pad=1, colour='white', textsize=14):
    for i in ax.patches:
        val = str(round(i.get_height(), 1))
        if val == '0.0':
            continue
        ax.annotate(val, ((i.get_x() + i.get_width()/2)*pad - 0.05, (i.get_y() + i.get_height()/2)*pad),
                    color=colour, size=textsize)


# In[28]:


plot_stack_bars(df, 'Occupation', '% of Total', rot_=0)

# # 2. Data Cleaning


# ## 2.1. Missing Values in Credit_Product

# The missing values in the feature **`Credit_Product`** will be imputed with **`No_Info`**.

# In[16]:


# Filling null values with No_Info
train['Credit_Product'] = train['Credit_Product'].fillna("No_Info")
test['Credit_Product'] = test['Credit_Product'].fillna("No_Info")


# In[17]:


train['Credit_Product'].value_counts()


# In[18]:


plt.figure(figsize=(10,6))
sns.countplot(x="Is_Lead", data=train, hue="Credit_Product")
plt.title("Count of Credit_Product for each class label")
plt.show()


# Among all customers, those who already have a credit product are more likely to be interested in a credit card.
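# A hedged sketch (not part of the original pipeline): the claim above can be
# quantified by computing the share of leads within each Credit_Product group.

# In[ ]:


# Hedged sketch: percentage of interested customers (Is_Lead == 1) per
# Credit_Product category, assuming the imputed train frame from above.
(train.groupby('Credit_Product')['Is_Lead'].mean() * 100).round(1)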

# # 3. Feature Engineering


# ## 3.1. One Hot Encoding of Categorical Features

# In[19]:


categorical_columns = ["Gender", "Region_Code", "Occupation", "Channel_Code", "Credit_Product", "Is_Active"]

# train = pd.concat([train, pd.get_dummies(train, columns=categorical_columns)], axis=1)
# test = pd.concat([test, pd.get_dummies(test, columns=categorical_columns)], axis=1)

train = pd.concat([train, pd.get_dummies(train[categorical_columns])], axis=1)
test = pd.concat([test, pd.get_dummies(test[categorical_columns])], axis=1)


# In[20]:


train.columns


# __Dropping Irrelevant Columns__

# In[21]:


train.drop(columns=categorical_columns, inplace=True)
test.drop(columns=categorical_columns, inplace=True)


# In[22]:


train.head()


# In[23]:


# Dropping 'ID' column from train
train.drop(columns=['ID'], inplace=True)


# In[24]:


df_test = test.copy()


# In[25]:


# Dropping 'ID' column from test dataset
df_test.drop(columns=['ID'], inplace=True)


# In[26]:


train.info()


# In[27]:


# Splitting into X (features) and y (target variable)
X = train.drop(columns=['Is_Lead'])
y = train[['Is_Lead']]


# In[28]:


y.value_counts(normalize=True)


# There is a class imbalance in the target variable: about **76.27%** of customers are not interested in a credit card and about **23.72%** are. To address this, an oversampling technique such as **SMOTE** is used to balance the classes.
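# A hedged side note on the one-hot encoding above (not part of the original pipeline):
# because get_dummies is run separately on train and test, a category present in only
# one of them would produce mismatched columns. A minimal sketch of a safeguard is to
# align the test frame to the training feature columns.

# In[ ]:


# Hedged sketch: align df_test to the feature columns of X, filling any dummy
# column missing from the test set with 0 (only needed if the category sets differ).
X_aligned, df_test_aligned = X.align(df_test, join='left', axis=1, fill_value=0)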

# # 4. Oversampling (Handling Class Imbalance)

# In[29]:


smote = SMOTE(random_state=42, n_jobs=-1)
X_train_smote, y_train_smote = smote.fit_resample(X, y)

X_train_smote.shape, y_train_smote.shape, y_train_smote.value_counts()
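# A hedged alternative (not used in this notebook): SMOTE can also be applied inside
# each cross-validation fold with imblearn's Pipeline, so the synthetic samples never
# leak into the validation folds. The estimator parameters below are illustrative only.

# In[ ]:


# Hedged sketch, assuming X, y and the imports above are in scope.
from imblearn.pipeline import Pipeline as ImbPipeline

smote_lgbm = ImbPipeline(steps=[
    ("smote", SMOTE(random_state=42)),
    ("clf", LGBMClassifier(random_state=10)),
])
fold_auc = cross_val_score(smote_lgbm, X, y.values.ravel(), scoring="roc_auc", cv=5)
print("Fold-wise ROC AUC with in-fold SMOTE:", fold_auc.round(4))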

# # 5. Modelling

# * For modelling, both LightGBM and XGBoost models are used.
# * The dimensionality of the data is low.
# * To combine the predictions made by XGBoost and LightGBM, stacking is used.

# ## 5.1. Light GBM Model

# In[30]:


param_dist = {"n_estimators": sp_randint(40, 100),
              "colsample_bytree": np.array([0.5, 0.6, 0.7, 0.8, 0.9, 1]),
              "subsample": np.array([0.5, 0.6, 0.7, 0.8, 0.9, 1]),
              "reg_lambda": np.array([1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1, 10, 100]),
              "reg_alpha": np.array([1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1, 10, 100]),
              "min_child_samples": sp_randint(25, 65),
              "max_depth": sp_randint(1, 20)}

clf_lgbm = LGBMClassifier(boosting_type="gbdt", n_jobs=-1, random_state=10)

# Randomized Search CV for finding the best parameters under roc_auc scoring
lgbm_cv = RandomizedSearchCV(clf_lgbm, param_distributions=param_dist,
                             n_iter=20, cv=10, scoring='roc_auc', random_state=42, verbose=1)
lgbm_cv.fit(X_train_smote, y_train_smote)

print('mean test scores', lgbm_cv.cv_results_['mean_test_score'])


# In[31]:


print('The best parameters {}'.format(lgbm_cv.best_params_))


# In[32]:


lgb_clf_model = LGBMClassifier(colsample_bytree=0.8, max_depth=16, min_child_samples=39,
                               n_estimators=86, random_state=10, reg_alpha=1.0,
                               reg_lambda=0.001, subsample=0.8)
lgb_clf_model.fit(X_train_smote, y_train_smote)

proba = lgb_clf_model.predict_proba(X)[:, 1]
train_score = roc_auc_score(y, proba)
cv_score = cross_val_score(lgb_clf_model, X, y, scoring="roc_auc", verbose=2, cv=5).mean()

print('The Mean CV Score : ', cv_score)
print('Train Score : ', train_score)


# In[33]:


# lgbm_pred_y = lgb_clf_model.predict_proba(df_test)[:,1]
# lgbm_predictions = pd.DataFrame()
# lgbm_predictions["ID"] = test["ID"]
# lgbm_predictions["Is_Lead"] = lgbm_pred_y

# # Saving to a csv file....
# lgbm_predictions.to_csv("Light_GBM_predictions.csv", index=False)


# __Feature Importance for Light Gradient Boosting Model__

# In[34]:


feature_imp = pd.DataFrame(sorted(zip(lgb_clf_model.feature_importances_, X.columns)), columns=['Value', 'Feature'])

plt.figure(figsize=(10, 10))
sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False))
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
plt.show()
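# A hedged sketch (not in the original notebook): `roc_curve` is imported above but
# never used, so one way to visualise the tuned LightGBM model is to plot its ROC
# curve on the training data.

# In[ ]:


# Hedged sketch, assuming lgb_clf_model, X and y from the cells above.
lgb_proba = lgb_clf_model.predict_proba(X)[:, 1]
fpr, tpr, _ = roc_curve(y.values.ravel(), lgb_proba)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label='LightGBM (train AUC = {:.3f})'.format(roc_auc_score(y, lgb_proba)))
plt.plot([0, 1], [0, 1], linestyle='--', label='Chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - LightGBM')
plt.legend()
plt.show()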

# ## 5.2. XGBoost Model

# In[35]:


# param_dist = {"n_estimators": sp_randint(40,100),
#               "colsample_bytree": np.array([0.5,0.6,0.7,0.8,0.9,1]),
#               "subsample": np.array([0.5,0.6,0.7,0.8,0.9,1]),
#               "reg_lambda": np.array([1e-5,1e-4,1e-3,1e-2,0.1,1,10,100]),
#               "reg_alpha": np.array([1e-5,1e-4,1e-3,1e-2,0.1,1,10,100]),
#               "min_child_samples": sp_randint(25,65),
#               "max_depth": sp_randint(1,20)}

# clf_xgb_model = XGBClassifier(boosting_type="gbdt", n_jobs=-1, random_state=0, verbosity=0, scale_pos_weight=3.2158)

# # Randomized Search CV for finding the best parameters under roc_auc scoring
# xgb_random_cv_model = RandomizedSearchCV(clf_xgb_model, param_distributions=param_dist,
#                                          n_iter=20, cv=5, scoring='roc_auc', random_state=42, verbose=1)
# xgb_random_cv_model.fit(X_train_smote, y_train_smote)

# # Mean CV Test Scores
# print('mean CV test scores', xgb_random_cv_model.cv_results_['mean_test_score'])


# In[36]:


#xgb_random_cv_model.best_estimator_


# In[37]:


xgb_model = XGBClassifier(base_score=0.5, booster='gbtree', boosting_type='gbdt',
                          colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.6,
                          gamma=0, gpu_id=-1, importance_type='gain', interaction_constraints='',
                          learning_rate=0.300000012, max_delta_step=0, max_depth=7,
                          min_child_samples=52, min_child_weight=1, monotone_constraints='()',
                          n_estimators=46, n_jobs=-1, num_parallel_tree=1, random_state=0,
                          reg_alpha=1e-05, reg_lambda=100.0, scale_pos_weight=3.2158,
                          subsample=0.8, tree_method='exact', validate_parameters=1, verbosity=0)
xgb_model.fit(X_train_smote, y_train_smote)

proba = xgb_model.predict_proba(X)[:, 1]
train_score = roc_auc_score(y, proba)
cv_score = cross_val_score(xgb_model, X, y, scoring="roc_auc", verbose=2, cv=5).mean()

print('The Mean CV Score : ', cv_score)
print('Train Score : ', train_score)


# In[38]:


# xgbm_pred_y = xgb_model.predict_proba(df_test)[:,1]
# xgbm_predictions = pd.DataFrame()
# xgbm_predictions["ID"] = test["ID"]
# xgbm_predictions["Is_Lead"] = xgbm_pred_y

# # Saving to a csv file....
# xgbm_predictions.to_csv("Xgboost_predictions.csv", index=False)


# __Feature Importance for XGBoost__

# In[39]:


from xgboost import plot_importance


# In[40]:


fig, ax = plt.subplots(figsize=(15,20))
plot_importance(xgb_model, ax=ax)
plt.show()

# ## 5.3. Stacking of XGBoost and LightGBM Classifiers

# In[53]:


# XGBoost Classifier
xgb_model = XGBClassifier(base_score=0.5, booster='gbtree', boosting_type='gbdt',
                          colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.6,
                          gamma=0, gpu_id=-1, importance_type='gain', interaction_constraints='',
                          learning_rate=0.300000012, max_delta_step=0, max_depth=7,
                          min_child_samples=52, min_child_weight=1, monotone_constraints='()',
                          n_estimators=46, n_jobs=-1, num_parallel_tree=1, random_state=0,
                          reg_alpha=1e-05, reg_lambda=100.0, scale_pos_weight=3.2158,
                          subsample=0.8, tree_method='exact', validate_parameters=1, verbosity=0)
xgb_model.fit(X, y)


# In[54]:


# Light Gradient Boosting Classifier
lgbm_model = LGBMClassifier(colsample_bytree=0.8, max_depth=16, min_child_samples=39,
                            n_estimators=86, random_state=10, reg_alpha=1.0,
                            reg_lambda=0.001, subsample=0.8)
lgbm_model.fit(X, y)


# In[43]:


# Stacking Classifier: tune the regularisation strength C of the logistic regression meta-classifier
c_values = [1e-5, 0.01, 0.001, 1]
for c in c_values:
    lr = LogisticRegression(C=c)
    stack_clf = StackingClassifier([lgbm_model, xgb_model], meta_classifier=lr, use_probas=True)
    stack_clf.fit(X, y)

    train_proba = stack_clf.predict_proba(X)[:, 1]
    train_score = roc_auc_score(y, train_proba)
    cv_score = cross_val_score(stack_clf, X, y, scoring="roc_auc", verbose=0, cv=5).mean()
    print("Stacking classifier for C = %f, train score is %f and cv_score is %f" % (c, train_score, cv_score))


# In[55]:


# Using the Stacking Classifier with the chosen value of C
best_c = 0.0001
lr = LogisticRegression(C=best_c)
stack_clf = StackingClassifier([lgbm_model, xgb_model], meta_classifier=lr, use_probas=True)
stack_clf.fit(X, y)

train_proba = stack_clf.predict_proba(X)[:, 1]
train_score = roc_auc_score(y, train_proba)
cv_score = cross_val_score(stack_clf, X, y, scoring="roc_auc", verbose=0, cv=5).mean()
print("Stacking classifier for C = %f, train score is %f and cv_score is %f" % (best_c, train_score, cv_score))


# In[56]:


# Probabilistic lead predictions on the test set using the Stacking Classifier
stack_clf_pred = stack_clf.predict_proba(df_test)[:, 1]

stack_clf_predictions = pd.DataFrame()
stack_clf_predictions["ID"] = test["ID"]
stack_clf_predictions["Is_Lead"] = stack_clf_pred


# In[57]:


stack_clf_predictions.to_csv("Stacking_XGB_LightGBM_predictions.csv", index=False)
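# A hedged final check (not in the original notebook): before submitting, the
# prediction file can be sanity-checked for row count and probability range.

# In[ ]:


# Hedged sketch, assuming stack_clf_predictions and test from the cells above.
assert len(stack_clf_predictions) == len(test), "row count mismatch with test set"
assert stack_clf_predictions["Is_Lead"].between(0, 1).all(), "probabilities out of range"
stack_clf_predictions["Is_Lead"].describe()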