#!/usr/bin/env python
# coding: utf-8
# In[1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from scipy.stats import randint as sp_randint
from sklearn.model_selection import RandomizedSearchCV
from mlxtend.classifier import StackingClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
plt.style.use('fivethirtyeight')
sns.set(color_codes=True)
# In[2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
# In[3]:
train.head()
# In[4]:
train.info()
# In[4]:
print("Number of records in train data {}".format(train.shape[0]))
print("Number of records in test data {}".format(test.shape[0]))
# In[6]:
train.columns
# 1. Exploratory Data Analysis
# 1.1. Missing Values
# * In the train dataset, **`Credit_Product`** has **11.93%** missing values.
# * In the test dataset, **`Credit_Product`** has **11.89%** missing values.
# In[7]:
train.isnull().mean()*100
# In[8]:
test.isnull().mean()*100
# 1.2. Target Variable Distribution
# The target variable **`Is_Lead`** is imbalanced between the two classes **0** (**76.27%**) and **1** (**23.72%**). We need to use an over-sampling technique like **`SMOTE`** to balance the dataset.
# In[9]:
plt.figure(figsize=(10,6))
sns.countplot(x = "Is_Lead", data=train)
plt.title("Is_Lead Distribution")
plt.show()
# In[10]:
train['Is_Lead'].value_counts(normalize=True)*100
# 1.3. Numerical Features Distribution
# The following charts on the numerical features provide the following insights:
# * The Age distribution in the train and test datasets is almost identical.
# * The Avg_Account_Balance feature is skewed (a log-transform sketch follows the skewness printout below).
# * Customers aged 40-60 show greater interest in credit cards.
# * Customers in their 20s and 30s are less interested.
# * The Avg_Account_Balance and Vintage distributions in the train and test datasets are also almost identical (quantified with a KS test after the plots below).
# In[11]:
numerical_feat = [feat for feat in train.columns if (train[feat].dtype == 'int64') and (feat != 'Is_Lead')]
interested = train['Is_Lead'] == 1
fig, axs = plt.subplots(ncols=2, nrows=len(numerical_feat), figsize=(20,20))
plt.subplots_adjust(right=1.5)
for i, col in enumerate(numerical_feat):
    # Left panel: distribution of the feature split by Is_Lead
    sns.distplot(train[~interested][col], label='Not Interested', hist=True, color='#e74c3c', ax=axs[i][0])
    sns.distplot(train[interested][col], label='Interested', hist=True, color='#2ecc71', ax=axs[i][0])
    axs[i][0].set_xlabel('')
    axs[i][0].tick_params(axis='x', labelsize=20)
    axs[i][0].tick_params(axis='y', labelsize=20)
    axs[i][0].legend(loc='upper right', prop={'size': 20})
    axs[i][0].set_title('Distribution of Is Lead in {}'.format(col), size=20, y=1.05)

    # Right panel: train vs. test distribution of the feature
    sns.distplot(train[col], label='Training Set', hist=False, color='#e74c3c', ax=axs[i][1])
    sns.distplot(test[col], label='Test Set', hist=False, color='#2ecc71', ax=axs[i][1])
    axs[i][1].set_xlabel('')
    axs[i][1].tick_params(axis='x', labelsize=20)
    axs[i][1].tick_params(axis='y', labelsize=20)
    axs[i][1].legend(loc='upper right', prop={'size': 20})
    axs[i][1].set_title('Distribution of {} in dataset'.format(col), size=20, y=1.05)
plt.show()
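# To put a number on the "almost identical" claim above, a two-sample Kolmogorov-Smirnov test can compare each numerical feature between train and test. This is a quick sketch (it only assumes `scipy`, which is already a dependency via `sp_randint`), not part of the modelling pipeline.
# In[ ]:
from scipy.stats import ks_2samp

for col in numerical_feat:
    # A small KS statistic means the train and test samples are hard to tell apart
    stat, p_value = ks_2samp(train[col], test[col])
    print('{}: KS statistic = {:.4f}, p-value = {:.4f}'.format(col, stat, p_value))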
# In[12]:
for feat in numerical_feat:
    print('{} feature has a skewness of {} and kurtosis of {} in train set'.format(
        feat, train[feat].skew().round(3), train[feat].kurtosis().round(3)))
    print('{} feature has a skewness of {} and kurtosis of {} in test set'.format(
        feat, test[feat].skew().round(3), test[feat].kurtosis().round(3)))
    print()
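# Since Avg_Account_Balance is heavily right-skewed, a log transform is a common remedy. The sketch below only illustrates the effect on skewness; the transform is not applied to the features used for modelling.
# In[ ]:
# np.log1p computes log(1 + x), which handles zero balances safely
log_balance = np.log1p(train['Avg_Account_Balance'])
print('Skewness before log1p:', train['Avg_Account_Balance'].skew().round(3))
print('Skewness after log1p :', log_balance.skew().round(3))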
# 1.4. Categorical Features Distribution
# The following charts on the categorical features provide the following insights (per-category lead rates are printed after the plots below):
# * Salaried customers are less likely to take up a credit card. Entrepreneurs are the only occupation in which interested customers outnumber uninterested ones.
# * There are more male than female customers in the dataset.
# * Customers who were active in the last 3 months show slightly more interest in credit cards than other customers.
# * More customers with a credit product are interested in a credit card than customers without one.
# In[13]:
categorical_feat = [col for col in train.columns if (train[col].dtype == 'object') and (col != 'ID')]
fig, axs = plt.subplots(ncols=3, nrows=2, figsize=(20,20))
plt.subplots_adjust(right=1.5)
for i, col in enumerate(categorical_feat):
    plt.subplot(2, 3, i + 1)
    # Count of Is_Lead classes within each category of the feature
    sns.countplot(x=col, data=train, hue='Is_Lead')
    plt.xlabel('')
    plt.ylabel('')
    plt.tick_params(axis='x', labelsize=20)
    plt.tick_params(axis='y', labelsize=20)
    plt.legend(loc='upper right', prop={'size': 20})
    plt.title('Count of Is Lead in {}'.format(col), size=20, y=1.05)
plt.show()
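# The countplots above show raw counts; the mean of the 0/1 target per category gives the lead rate directly and makes the comparisons explicit. A small sketch:
# In[ ]:
for col in categorical_feat:
    # Mean of a 0/1 target per group = share of interested customers in that group
    print(train.groupby(col)['Is_Lead'].mean().round(3).sort_values(ascending=False))
    print()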
# 1.5. Correlations
# * **`Age`** and **`Vintage`** have the highest correlation: __0.63__ in the train dataset and __0.62__ in the test dataset.
# In[14]:
fig, axs = plt.subplots(nrows=2, figsize=(20, 20))
sns.heatmap(train.drop(['ID'], axis=1).corr(), ax=axs[0], annot=True, square=True, cmap='coolwarm', annot_kws={'size': 12})
sns.heatmap(test.drop(['ID'], axis=1).corr(), ax=axs[1], annot=True, square=True, cmap='coolwarm', annot_kws={'size': 12})
for i in range(2):
    axs[i].tick_params(axis='x', labelsize=10)
    axs[i].tick_params(axis='y', labelsize=10)
axs[0].set_title('Training Set Correlations', size=15)
axs[1].set_title('Test Set Correlations', size=15)
plt.show()
# 1.6. Outliers
# In the train dataset, **`Avg_Account_Balance`** has outliers (counted with the 1.5 x IQR rule after the boxplots below).
# In[15]:
fig, axs = plt.subplots(ncols=3, nrows=1, figsize=(10,6))
plt.subplots_adjust(right=1.5)
for i, col in enumerate(numerical_feat):
    plt.subplot(1, 3, i + 1)
    sns.boxplot(x=col, data=train)
    plt.xlabel('')
    plt.ylabel('')
    plt.tick_params(axis='x', labelsize=12)
    plt.tick_params(axis='y', labelsize=12)
    plt.title('Boxplot of {}'.format(col), size=12, y=1.05)
plt.show()
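# To quantify the outliers seen in the boxplots, the standard 1.5 x IQR rule can be applied. This sketch only counts the flagged rows; it does not remove them.
# In[ ]:
for col in numerical_feat:
    q1, q3 = train[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    # Rows outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are flagged as outliers
    n_outliers = ((train[col] < lower) | (train[col] > upper)).sum()
    print('{}: {} outliers ({:.2f}% of rows)'.format(col, n_outliers, 100 * n_outliers / len(train)))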
# Data Insights
#
# * Salaried customers aged 40-65 are interested in buying a credit card. Most of the Entrepreneurs also seem interested, but not all.
# * Customers who already have a credit product are likely to buy a credit card.
# * Salaried people with Channel Code X1 haven't shown much interest in the past.
# * There are only 2 Entrepreneurs who don't have any credit product.
# * 66% of customers in the Entrepreneur occupation category have shown interest in the past, followed by 27.6% of the Self Employed, 24.5% in the Others category and 16% of the Salaried.
# In[21]:
total_customers_by_occu = train.groupby(['Occupation'])['ID'].count().reset_index()
total_customers_by_occu.rename(columns={'ID' : 'Total Customers'}, inplace=True)
num_customers_by_occupation = train.groupby(['Is_Lead', 'Occupation'])['ID'].count().reset_index()
num_customers_by_occupation = pd.merge(num_customers_by_occupation, total_customers_by_occu , how='inner', on='Occupation')
num_customers_by_occupation['% of Total'] = round((num_customers_by_occupation['ID'] / num_customers_by_occupation['Total Customers']) * 100, 1)
num_customers_by_occupation = num_customers_by_occupation.sort_values(by=['Is_Lead', '% of Total'], ascending = [False, False])
num_customers_by_occupation.drop(columns=['ID', 'Total Customers'], inplace=True, axis=1)
df = pd.crosstab(num_customers_by_occupation['Occupation'], num_customers_by_occupation['Is_Lead'], values=num_customers_by_occupation['% of Total'], aggfunc=sum)
# In[27]:
# Function to plot stacked bars with annotations
def plot_stack_bars(df, title_, y_label, size_=(20, 10), rot_=0, legend_='upper right'):
    ax = df.plot(kind='bar', stacked=True, figsize=size_, rot=rot_, title=title_)
    annotate_plot(ax, textsize=15)
    plt.legend([0, 1], loc=legend_)
    plt.ylabel(y_label)
    plt.show()

# Helper to annotate each bar segment with its value
def annotate_plot(ax, pad=1, colour='white', textsize=14):
    for i in ax.patches:
        val = str(round(i.get_height(), 1))
        if val == '0.0':  # skip empty segments
            continue
        ax.annotate(val, ((i.get_x() + i.get_width() / 2) * pad - 0.05,
                          (i.get_y() + i.get_height() / 2) * pad),
                    color=colour, size=textsize)
# In[28]:
plot_stack_bars(df, 'Occupation', '% of Total', rot_=0)
# 2. Data Cleaning
# 2.1. Missing Value of Credit_Product
# The missing values in the feature **`Credit_Product`** will be imputed with **`No_Info`**
# In[16]:
# filling null values with No_Info
train['Credit_Product'] = train['Credit_Product'].fillna("No_Info")
test['Credit_Product'] = test['Credit_Product'].fillna("No_Info")
# In[17]:
train['Credit_Product'].value_counts()
# In[18]:
plt.figure(figsize=(10,6))
sns.countplot(x = "Is_Lead", data = train, hue = "Credit_Product")
plt.title("Count of Credit_Product for each class label")
plt.show()
# Among all customers, those who already have a credit product have a greater chance of taking up a credit card.
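# A quick check of the lead rate per `Credit_Product` value, including the imputed `No_Info` group, makes the claim above concrete:
# In[ ]:
# Share of interested customers within each Credit_Product value
print(train.groupby('Credit_Product')['Is_Lead'].mean().round(3))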
# 3. Feature Engineering
# 3.1. One Hot Encoding of Categorical Features
# In[19]:
categorical_columns = ["Gender","Region_Code","Occupation","Channel_Code","Credit_Product","Is_Active"]
# train = pd.concat([train, pd.get_dummies(train, columns=categorical_columns)], axis=1)
# test = pd.concat([test, pd.get_dummies(test, columns=categorical_columns)], axis=1)
train = pd.concat([train, pd.get_dummies(train[categorical_columns])], axis=1)
test = pd.concat([test, pd.get_dummies(test[categorical_columns])], axis=1)
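# One-hot encoding train and test separately can produce mismatched columns if a category (e.g. a rare Region_Code) appears in only one of the two files. A defensive check-and-align sketch, assuming the dummy columns should match exactly:
# In[ ]:
# Dummy columns that appear in one encoded frame but not the other
missing_in_test = set(train.columns) - set(test.columns) - {'Is_Lead'}
missing_in_train = set(test.columns) - set(train.columns)
print('Dummy columns missing in test :', missing_in_test)
print('Dummy columns missing in train:', missing_in_train)
# Add any missing dummy columns to test as all-zero columns
for col in missing_in_test:
    test[col] = 0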
# In[20]:
train.columns
# __Dropping Irrelevant Columns__
# In[21]:
train.drop(columns=categorical_columns, inplace=True)
test.drop(columns = categorical_columns, inplace = True)
# In[22]:
train.head()
# In[23]:
# Dropping 'ID' column from train
train.drop(columns=['ID'], inplace=True)
# In[24]:
df_test = test.copy()
# In[25]:
# Dropping 'ID' column from test dataset
df_test.drop(columns=['ID'], inplace=True)
# In[26]:
train.info()
# In[27]:
# Splitting into X (features) and y (target variable)
X = train.drop(columns=['Is_Lead'])
y = train[['Is_Lead']]
# In[28]:
y.value_counts(normalize=True)
# There is a class imbalance in the target variable: about **76.27%** of customers are not interested in a credit card and about **23.72%** are. To address this, an over-sampling technique like **SMOTE** is used to balance the classes.
# 4. Oversampling (Handling Class Imbalance)
# In[29]:
smote = SMOTE(random_state=42, n_jobs=-1)
X_train_smote, y_train_smote = smote.fit_resample(X,y)
X_train_smote.shape, y_train_smote.shape, y_train_smote.value_counts()
# 5. Modelling
# * For modelling, both LightGBM and XGBoost models are used.
# * The dimensionality of the data is low.
# * Stacking is used to combine the predictions made by XGBoost and LightGBM.
# 5.1. Light GBM Model
# In[30]:
param_dist = {"n_estimators": sp_randint(40, 100),
              "colsample_bytree": np.array([0.5, 0.6, 0.7, 0.8, 0.9, 1]),
              "subsample": np.array([0.5, 0.6, 0.7, 0.8, 0.9, 1]),
              "reg_lambda": np.array([1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1, 10, 100]),
              "reg_alpha": np.array([1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1, 10, 100]),
              "min_child_samples": sp_randint(25, 65),
              "max_depth": sp_randint(1, 20)}
clf_lgbm = LGBMClassifier(boosting_type="gbdt", n_jobs=-1, random_state=10)
# Randomized search CV for finding the best parameters under roc_auc scoring
lgbm_cv = RandomizedSearchCV(clf_lgbm, param_distributions=param_dist,
                             n_iter=20, cv=10, scoring='roc_auc', random_state=42, verbose=1)
lgbm_cv.fit(X_train_smote, y_train_smote)
print('mean test scores', lgbm_cv.cv_results_['mean_test_score'])
# In[31]:
print('The best parameters {}'.format(lgbm_cv.best_params_))
# In[32]:
lgb_clf_model = LGBMClassifier(colsample_bytree=0.8, max_depth=16, min_child_samples=39,
n_estimators=86, random_state=10, reg_alpha=1.0,
reg_lambda=0.001, subsample=0.8)
lgb_clf_model.fit(X_train_smote,y_train_smote)
proba = lgb_clf_model.predict_proba(X)[:,1]
train_score = roc_auc_score(y,proba)
cv_score = cross_val_score(lgb_clf_model,X,y,scoring="roc_auc",verbose=2,cv =5).mean()
print('The Mean CV Score : ',cv_score)
print('Train Score : ',train_score)
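# The `roc_curve` import at the top can be put to use here: a quick plot of the ROC curve for the tuned LightGBM model on the training set (train-set only, so it will look optimistic relative to the CV score).
# In[ ]:
fpr, tpr, _ = roc_curve(y.values.ravel(), proba)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label='LightGBM (train AUC = {:.3f})'.format(train_score))
plt.plot([0, 1], [0, 1], linestyle='--', label='Chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - LightGBM (train set)')
plt.legend(loc='lower right')
plt.show()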
# In[33]:
# lgbm_pred_y = lgb_clf_model.predict_proba(df_test)[:,1]
# lgbm_predictions = pd.DataFrame()
# lgbm_predictions["ID"] = test["ID"]
# lgbm_predictions["Is_Lead"] = lgbm_pred_y
# # Saving to a csv file....
# lgbm_predictions.to_csv("Light_GBM_predictions.csv", index = False)
# __Feature Importance for Light Gradient Boosting Model__
# In[34]:
feature_imp = pd.DataFrame(sorted(zip(lgb_clf_model.feature_importances_,X.columns)), columns=['Value','Feature'])
plt.figure(figsize=(10, 10))
sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False))
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
plt.show()
# 5.2. XGBoost Model
# In[35]:
# param_dist = {"n_estimators":sp_randint(40,100),
# "colsample_bytree":np.array([0.5,0.6,0.7,0.8,0.9,1]),
# "subsample":np.array([0.5,0.6,0.7,0.8,0.9,1]),
# "reg_lambda":np.array([1e-5,1e-4,1e-3,1e-2,0.1,1,10,100]),
# "reg_alpha":np.array([1e-5,1e-4,1e-3,1e-2,0.1,1,10,100]),
# "min_child_samples": sp_randint(25,65),
# "max_depth": sp_randint(1,20)}
# clf_xgb_model = XGBClassifier(boosting_type = "gbdt",n_jobs =-1,random_state = 0,verbosity =0,scale_pos_weight = 3.2158)
# # Randomized Search CV for finding the best parameters under roc_auc scoring
# xgb_random_cv_model = RandomizedSearchCV(clf_xgb_model, param_distributions=param_dist,
# n_iter=20,cv=5,scoring='roc_auc',random_state=42,verbose=1)
# xgb_random_cv_model.fit(X_train_smote,y_train_smote)
# # Mean CV Test Scores
# print('mean CV test scores',xgb_random_cv_model.cv_results_['mean_test_score'])
# In[36]:
#xgb_random_cv_model.best_estimator_
# In[37]:
# Note: `boosting_type` and `min_child_samples` in the commented search above are
# LightGBM parameters; XGBoost ignores them, so they are dropped here.
xgb_model = XGBClassifier(base_score=0.5, booster='gbtree',
                          colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.6,
                          gamma=0, gpu_id=-1, importance_type='gain',
                          interaction_constraints='', learning_rate=0.300000012,
                          max_delta_step=0, max_depth=7,
                          min_child_weight=1, monotone_constraints='()',
                          n_estimators=46, n_jobs=-1, num_parallel_tree=1, random_state=0,
                          reg_alpha=1e-05, reg_lambda=100.0, scale_pos_weight=3.2158,
                          subsample=0.8, tree_method='exact', validate_parameters=1,
                          verbosity=0)
xgb_model.fit(X_train_smote, y_train_smote)
proba = xgb_model.predict_proba(X)[:, 1]
train_score = roc_auc_score(y, proba)
cv_score = cross_val_score(xgb_model, X, y, scoring="roc_auc", verbose=2, cv=5).mean()
print('The Mean CV Score : ', cv_score)
print('Train Score : ', train_score)
# In[38]:
# xgbm_pred_y = xgb_model.predict_proba(df_test)[:,1]
# xgbm_predictions = pd.DataFrame()
# xgbm_predictions["ID"] = test["ID"]
# xgbm_predictions["Is_Lead"] = xgbm_pred_y
# # Saving to a csv file....
# xgbm_predictions.to_csv("Xgboost_predictions.csv", index = False)
# __Feature Importance for Xgboost__
# In[39]:
from xgboost import plot_importance
# In[40]:
fig, ax = plt.subplots(figsize=(15, 20))
plot_importance(xgb_model, ax=ax)
plt.show()
# 5.3. Stacking of Xgboost and Light GBM Classifier
# In[53]:
# Xgboost Classifier
# Same parameters as in section 5.2, refit on the full training data
xgb_model = XGBClassifier(base_score=0.5, booster='gbtree',
                          colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.6,
                          gamma=0, gpu_id=-1, importance_type='gain',
                          interaction_constraints='', learning_rate=0.300000012,
                          max_delta_step=0, max_depth=7,
                          min_child_weight=1, monotone_constraints='()',
                          n_estimators=46, n_jobs=-1, num_parallel_tree=1, random_state=0,
                          reg_alpha=1e-05, reg_lambda=100.0, scale_pos_weight=3.2158,
                          subsample=0.8, tree_method='exact', validate_parameters=1,
                          verbosity=0)
xgb_model.fit(X, y)
# In[54]:
# Light Gradient Boosting Classifier
lgbm_model = LGBMClassifier(colsample_bytree=0.8, max_depth=16, min_child_samples=39,
n_estimators=86, random_state=10, reg_alpha=1.0,
reg_lambda=0.001, subsample=0.8)
lgbm_model.fit(X,y)
# In[43]:
# Stacking Classifier
# Tuning the meta-classifier's regularisation strength C (called alpha here)
alpha = [1e-5, 0.01, 0.001, 1]
for i in alpha:
    lr = LogisticRegression(C=i)
    stack_clf = StackingClassifier([lgbm_model, xgb_model], meta_classifier=lr, use_probas=True)
    stack_clf.fit(X, y)
    train_proba = stack_clf.predict_proba(X)[:, 1]
    train_score = roc_auc_score(y, train_proba)
    cv_score = cross_val_score(stack_clf, X, y, scoring="roc_auc", verbose=0, cv=5).mean()
    print("Stacking classifier for alpha = %f, train score is %f and cv_score is %f" % (i, train_score, cv_score))
# In[55]:
# Using the stacking classifier with the chosen value of alpha
best_C = 0.0001
lr = LogisticRegression(C=best_C)
stack_clf = StackingClassifier([lgbm_model, xgb_model], meta_classifier=lr, use_probas=True)
stack_clf.fit(X, y)
train_proba = stack_clf.predict_proba(X)[:, 1]
train_score = roc_auc_score(y, train_proba)
cv_score = cross_val_score(stack_clf, X, y, scoring="roc_auc", verbose=0, cv=5).mean()
print("Stacking classifier for alpha = %f, train score is %f and cv_score is %f" % (best_C, train_score, cv_score))
# In[56]:
# Finding the probabilistic lead prediction using the Stacking Classifier
stack_clf_pred = stack_clf.predict_proba(df_test)[:,1]
stack_clf_predictions = pd.DataFrame()
stack_clf_predictions["ID"] = test["ID"]
stack_clf_predictions["Is_Lead"] = stack_clf_pred
# In[57]:
stack_clf_predictions.to_csv("Stacking_XGB_LightGBM_predictions.csv", index = False)
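# A small sanity check on the submission file before uploading (assumes the expected format is one row per test ID with a probability in [0, 1]):
# In[ ]:
assert stack_clf_predictions.shape[0] == test.shape[0], 'Row count mismatch with test set'
assert stack_clf_predictions['Is_Lead'].between(0, 1).all(), 'Probabilities outside [0, 1]'
assert stack_clf_predictions['Is_Lead'].notnull().all(), 'Missing predictions'
print(stack_clf_predictions.head())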