# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
import scipy.stats
# Load the competition train/test CSVs from the local Data/ folder.
datapath = 'Data/'
train_file = 'train_u6lujuX_CVtuZ9i.csv'
test_file = 'test_Y3wMUE5_7gLdaTN.csv'
train_df = pd.read_csv(datapath + train_file)
test_df = pd.read_csv(datapath + test_file)
# Peek at the first few rows (notebook-style display).
train_df.head()
Loan_ID | Gender | Married | Dependents | Education | Self_Employed | ApplicantIncome | CoapplicantIncome | LoanAmount | Loan_Amount_Term | Credit_History | Property_Area | Loan_Status | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | LP001002 | Male | No | 0 | Graduate | No | 5849 | 0.0 | NaN | 360.0 | 1.0 | Urban | Y |
1 | LP001003 | Male | Yes | 1 | Graduate | No | 4583 | 1508.0 | 128.0 | 360.0 | 1.0 | Rural | N |
2 | LP001005 | Male | Yes | 0 | Graduate | Yes | 3000 | 0.0 | 66.0 | 360.0 | 1.0 | Urban | Y |
3 | LP001006 | Male | Yes | 0 | Not Graduate | No | 2583 | 2358.0 | 120.0 | 360.0 | 1.0 | Urban | Y |
4 | LP001008 | Male | No | 0 | Graduate | No | 6000 | 0.0 | 141.0 | 360.0 | 1.0 | Urban | Y |
# Class balance of the target: fraction of approved (Y) vs rejected (N) loans.
train_df['Loan_Status'].value_counts() / train_df.shape[0]
Y 0.687296 N 0.312704 Name: Loan_Status, dtype: float64
# Percentage of missing values per column (isna is the modern alias of isnull).
train_df.isna().sum().mul(100).div(len(train_df))
Loan_ID 0.000000 Gender 2.117264 Married 0.488599 Dependents 2.442997 Education 0.000000 Self_Employed 5.211726 ApplicantIncome 0.000000 CoapplicantIncome 0.000000 LoanAmount 3.583062 Loan_Amount_Term 2.280130 Credit_History 8.143322 Property_Area 0.000000 Loan_Status 0.000000 dtype: float64
train_df['Credit_History'].nunique()
2
# Feature engineering: EMI (equated monthly instalment).
# Formula E = P*r*(1+r)^n / ((1+r)^n - 1), see
# https://javatutoring.com/wp-content/uploads/2016/12/emi-calculation-formula.jpg
# On account of time paucity, the annual interest rate is approximated per
# gender (8.65% for Male, 8.60% otherwise). Definitely a better way worth
# investigating later.
dataset = [train_df, test_df]

def _add_emi(df):
    """Append an 'EMI' column computed in place from LoanAmount and Loan_Amount_Term.

    Vectorized replacement for the original per-row loop; NaN in LoanAmount or
    Loan_Amount_Term propagates to a NaN EMI exactly as before.
    """
    # Rows whose Gender is not exactly "Male" (including NaN) fall through to
    # the 8.60% rate, matching the original if/else behaviour.
    r = np.where(df['Gender'] == 'Male', 8.65 / (12 * 100), 8.6 / (12 * 100))
    P = df['LoanAmount'] * 1000        # LoanAmount is expressed in thousands
    n = df['Loan_Amount_Term']         # term in months
    growth = (1 + r) ** n
    df['EMI'] = P * r * growth / (growth - 1)

for frame in dataset:
    _add_emi(frame)
# Feature engineering: combined household income and an income-to-loan ratio,
# capacitating the loan amount against the combined income per record.
for frame in dataset:
    # Total income of applicant plus co-applicant.
    frame['income'] = frame['ApplicantIncome'] + frame['CoapplicantIncome']
    # Vectorized ratio replaces the original per-row zip comprehension.
    # NOTE(review): pandas division yields inf for LoanAmount == 0 where the
    # old Python-level division would have raised ZeroDivisionError; no zero
    # loan amounts are present in this data.
    frame['income_loan_ratio'] = frame['income'] / frame['LoanAmount']
    # CoapplicantIncome is now folded into 'income'; drop the raw column.
    frame.drop(columns=['CoapplicantIncome'], inplace=True)
train_df.shape,test_df.shape
((614, 15), (367, 14))
train_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 614 entries, 0 to 613 Data columns (total 15 columns): Loan_ID 614 non-null object Gender 601 non-null object Married 611 non-null object Dependents 599 non-null object Education 614 non-null object Self_Employed 582 non-null object ApplicantIncome 614 non-null int64 LoanAmount 592 non-null float64 Loan_Amount_Term 600 non-null float64 Credit_History 564 non-null float64 Property_Area 614 non-null object Loan_Status 614 non-null object EMI 578 non-null float64 income 614 non-null float64 income_loan_ratio 592 non-null float64 dtypes: float64(6), int64(1), object(8) memory usage: 72.0+ KB
# Nothing fancy: impute missing values based on dtype.
# Object (categorical) columns get the TRAIN mode; numeric columns get the
# TRAIN mean. Test imputation deliberately reuses train statistics so no
# information leaks from the test set — the original filled test numerics with
# test means (and iterated train_df.columns while filling test_df), which was
# inconsistent with its own use of train modes for test categoricals.
for col in train_df.columns[train_df.dtypes == "object"]:
    fill = train_df[col].mode()[0]
    train_df[col] = train_df[col].fillna(fill)
    if col in test_df.columns:  # Loan_Status exists only in train
        test_df[col] = test_df[col].fillna(fill)
for col in train_df.columns[train_df.dtypes != "object"]:
    fill = train_df[col].mean()
    train_df[col] = train_df[col].fillna(fill)
    if col in test_df.columns:
        test_df[col] = test_df[col].fillna(fill)
# Label encode categoricals (everything object-typed except the ID and target).
# Fit each encoder on the union of train and test values so that a category
# appearing only in the test set cannot crash `transform` with an
# unseen-label ValueError, as the original train-only fit could.
cat_cols = set(train_df.columns[train_df.dtypes == "object"]) - {"Loan_ID", "Loan_Status"}
for col in sorted(cat_cols):
    le = LabelEncoder()
    le.fit(pd.concat([train_df[col], test_df[col]]).astype("str"))
    train_df[col] = le.transform(train_df[col].astype("str"))
    test_df[col] = le.transform(test_df[col].astype("str"))
# Create and apply the model.
# Build the modelling column lists: all features except the identifier and
# (for train) the target; sorting keeps train and test columns aligned.
drop_for_train = {"Loan_ID", "Loan_Status"}
tra_col = sorted(c for c in train_df.columns if c not in drop_for_train)
test_df_col = sorted(c for c in test_df.columns if c != "Loan_ID")
print(tra_col, test_df_col)
import xgboost as xgb
from xgboost import XGBClassifier
# Default-parameter gradient-boosted classifier fitted on engineered features.
model = XGBClassifier()
model.fit(train_df[tra_col], train_df["Loan_Status"])
['ApplicantIncome', 'Credit_History', 'Dependents', 'EMI', 'Education', 'Gender', 'LoanAmount', 'Loan_Amount_Term', 'Married', 'Property_Area', 'Self_Employed', 'income', 'income_loan_ratio'] ['ApplicantIncome', 'Credit_History', 'Dependents', 'EMI', 'Education', 'Gender', 'LoanAmount', 'Loan_Amount_Term', 'Married', 'Property_Area', 'Self_Employed', 'income', 'income_loan_ratio']
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3, min_child_weight=1, missing=None, n_estimators=100, n_jobs=1, nthread=None, objective='binary:logistic', random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None, silent=True, subsample=1)
# Predict on the test features and write the submission file.
predictions = model.predict(test_df[test_df_col])
result = pd.DataFrame(
    {"Loan_ID": test_df.Loan_ID, "Loan_Status": predictions}
).reset_index(drop=True)
result[["Loan_ID", "Loan_Status"]].to_csv(
    "loan_prediction_analyticsvidhya.csv", index=False
)