In [29]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

loan_data = pd.read_csv('./02-loan_data.csv')

df = loan_data.copy()
del df['customer_id']

columns_to_round = ['loan_amt_outstanding', 'total_debt_outstanding', 'income']
df[columns_to_round] = df[columns_to_round].apply(lambda x: x.round(2))

In [30]:

df.dtypes

Out[30]:

credit_lines_outstanding      int64
loan_amt_outstanding        float64
total_debt_outstanding      float64
income                      float64
years_employed                int64
fico_score                    int64
default                       int64
dtype: object

In [31]:

df.isnull().sum()

Out[31]:

credit_lines_outstanding    0
loan_amt_outstanding        0
total_debt_outstanding      0
income                      0
years_employed              0
fico_score                  0
default                     0
dtype: int64

Our data appears to have already been cleaned.

In [32]:

df_train.head()

Out[32]:

	loan_amt_outstanding	total_debt_outstanding	income	years_employed	fico_score	default
4495	3322.04	1783.15	53092.33	4	588	0
6470	2153.90	6022.02	25992.88	4	687	1
2221	3666.26	5253.54	71090.00	5	622	0
7686	4485.62	6886.23	80505.30	5	681	0
9419	7624.47	14607.81	111898.20	3	756	0

In [33]:

# pred
y_train = df_train.default.values
y_val = df_val.default.values
y_test = df_test.default.values

del df_train['default']
del df_val['default']
del df_test['default']

Model (to predict PD)¶

In [34]:

df.corr()

Out[34]:

	credit_lines_outstanding	loan_amt_outstanding	total_debt_outstanding	income	years_employed	fico_score	default
credit_lines_outstanding	1.000000	0.080249	0.852210	0.022272	-0.087900	-0.258177	0.862815
loan_amt_outstanding	0.080249	1.000000	0.397403	0.835815	-0.158416	-0.031373	0.098978
total_debt_outstanding	0.852210	0.397403	1.000000	0.394397	-0.174353	-0.232246	0.758868
income	0.022272	0.835815	0.394397	1.000000	0.001814	-0.010528	0.016309
years_employed	-0.087900	-0.158416	-0.174353	0.001814	1.000000	0.255873	-0.284506
fico_score	-0.258177	-0.031373	-0.232246	-0.010528	0.255873	1.000000	-0.324515
default	0.862815	0.098978	0.758868	0.016309	-0.284506	-0.324515	1.000000

In [35]:

df.hist(figsize=(10,10))

Out[35]:

array([[<Axes: title={'center': 'credit_lines_outstanding'}>,
        <Axes: title={'center': 'loan_amt_outstanding'}>,
        <Axes: title={'center': 'total_debt_outstanding'}>],
       [<Axes: title={'center': 'income'}>,
        <Axes: title={'center': 'years_employed'}>,
        <Axes: title={'center': 'fico_score'}>],
       [<Axes: title={'center': 'default'}>, <Axes: >, <Axes: >]],
      dtype=object)

It appears that the features follow a Gaussian (normal) distribution. Let's see if we our dataset is really suitable for a Gaussian Naive Bayes. So, we check if each feature, given a specific class of the target variable (default), follows a Gaussian distribution.

In [36]:

# Histogram of features when default = 1
df[df.default==1].hist(figsize=(10,10))

Out[36]:

array([[<Axes: title={'center': 'credit_lines_outstanding'}>,
        <Axes: title={'center': 'loan_amt_outstanding'}>,
        <Axes: title={'center': 'total_debt_outstanding'}>],
       [<Axes: title={'center': 'income'}>,
        <Axes: title={'center': 'years_employed'}>,
        <Axes: title={'center': 'fico_score'}>],
       [<Axes: title={'center': 'default'}>, <Axes: >, <Axes: >]],
      dtype=object)

In [37]:

# Histogram of features when default = 0
df[df.default==0].hist(figsize=(10,10))

Out[37]:

array([[<Axes: title={'center': 'credit_lines_outstanding'}>,
        <Axes: title={'center': 'loan_amt_outstanding'}>,
        <Axes: title={'center': 'total_debt_outstanding'}>],
       [<Axes: title={'center': 'income'}>,
        <Axes: title={'center': 'years_employed'}>,
        <Axes: title={'center': 'fico_score'}>],
       [<Axes: title={'center': 'default'}>, <Axes: >, <Axes: >]],
      dtype=object)

In [38]:

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)

df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

In [39]:

# Naive Bayes
from sklearn.naive_bayes import GaussianNB
model_g = GaussianNB()
model_g.fit(df_train, y_train)

Out[39]:

GaussianNB()

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

In [40]:

# Accuracy on the training set
(y_train== model_g.predict(df_train)).mean()

Out[40]:

0.9813333333333333

In [41]:

# Accuracy on the validation set, to see if it needs tuning
(y_val== model_g.predict(df_val)).mean()

Out[41]:

0.9775

We get $97.8\%$ accuracy on the validation set. It looks like we don't need to tune the model.

In [42]:

# Accuracy on the test set:
(y_test== model_g.predict(df_test)).mean()

Out[42]:

0.977

In [43]:

df_full = loan_data.copy()
del df_full['customer_id']
y_fulltrain = df_full.default.values
del df_full['default']

In [44]:

df_full.columns

Out[44]:

Index(['credit_lines_outstanding', 'loan_amt_outstanding',
       'total_debt_outstanding', 'income', 'years_employed', 'fico_score'],
      dtype='object')

In [45]:

def prompt_user():
    data = {}
    data['credit_lines_outstanding'] = float(input("Enter the number of credit lines outstanding: "))
    data['loan_amt_outstanding'] = float(input("Enter the amount of loan outstanding: "))
    data['total_debt_outstanding'] = float(input("Enter the total debt outstanding: "))
    data['income'] = float(input("Enter the income: "))
    data['years_employed'] = float(input("Enter the number of years employed: "))
    data['fico_score'] = float(input("Enter the FICO score: "))

    return data

In [48]:

# takes user input about the loan data
user_data = prompt_user()

This is the function that takes in the properties of a loan and outputs the expected loss

In [49]:

model = GaussianNB().fit(df_full, y_fulltrain)
# loan here is the loan profile dict
def expectedLoss():
    loan = pd.DataFrame(prompt_user(), index=[0])
    #return model.predict_proba(loan)[:,1][0]*100 + '\%'
    prob_default = model.predict_proba(loan)[:,1][0] 
    loan_amt = float(loan.loan_amt_outstanding)
    return prob_default * loan_amt * (1 - 0.1) # recovery rate = 10% (given)

In [50]:

expectedLoss()

/var/folders/h0/722z94dd3fb0pfv4wg3qmdkh0000gn/T/ipykernel_74903/2507886040.py:7: FutureWarning: Calling float on a single element Series is deprecated and will raise a TypeError in the future. Use float(ser.iloc[0]) instead
  loan_amt = float(loan.loan_amt_outstanding)

Out[50]:

564.8283882093126

In [ ]: