import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
loan_data = pd.read_csv('./02-loan_data.csv')
df = loan_data.copy()
del df['customer_id']
columns_to_round = ['loan_amt_outstanding', 'total_debt_outstanding', 'income']
df[columns_to_round] = df[columns_to_round].apply(lambda x: x.round(2))
df.dtypes
credit_lines_outstanding int64 loan_amt_outstanding float64 total_debt_outstanding float64 income float64 years_employed int64 fico_score int64 default int64 dtype: object
df.isnull().sum()
credit_lines_outstanding 0 loan_amt_outstanding 0 total_debt_outstanding 0 income 0 years_employed 0 fico_score 0 default 0 dtype: int64
Our data appears to have already been cleaned.
df_train.head()
loan_amt_outstanding | total_debt_outstanding | income | years_employed | fico_score | default | |
---|---|---|---|---|---|---|
4495 | 3322.04 | 1783.15 | 53092.33 | 4 | 588 | 0 |
6470 | 2153.90 | 6022.02 | 25992.88 | 4 | 687 | 1 |
2221 | 3666.26 | 5253.54 | 71090.00 | 5 | 622 | 0 |
7686 | 4485.62 | 6886.23 | 80505.30 | 5 | 681 | 0 |
9419 | 7624.47 | 14607.81 | 111898.20 | 3 | 756 | 0 |
# pred
y_train = df_train.default.values
y_val = df_val.default.values
y_test = df_test.default.values
del df_train['default']
del df_val['default']
del df_test['default']
df.corr()
credit_lines_outstanding | loan_amt_outstanding | total_debt_outstanding | income | years_employed | fico_score | default | |
---|---|---|---|---|---|---|---|
credit_lines_outstanding | 1.000000 | 0.080249 | 0.852210 | 0.022272 | -0.087900 | -0.258177 | 0.862815 |
loan_amt_outstanding | 0.080249 | 1.000000 | 0.397403 | 0.835815 | -0.158416 | -0.031373 | 0.098978 |
total_debt_outstanding | 0.852210 | 0.397403 | 1.000000 | 0.394397 | -0.174353 | -0.232246 | 0.758868 |
income | 0.022272 | 0.835815 | 0.394397 | 1.000000 | 0.001814 | -0.010528 | 0.016309 |
years_employed | -0.087900 | -0.158416 | -0.174353 | 0.001814 | 1.000000 | 0.255873 | -0.284506 |
fico_score | -0.258177 | -0.031373 | -0.232246 | -0.010528 | 0.255873 | 1.000000 | -0.324515 |
default | 0.862815 | 0.098978 | 0.758868 | 0.016309 | -0.284506 | -0.324515 | 1.000000 |
df.hist(figsize=(10,10))
array([[<Axes: title={'center': 'credit_lines_outstanding'}>, <Axes: title={'center': 'loan_amt_outstanding'}>, <Axes: title={'center': 'total_debt_outstanding'}>], [<Axes: title={'center': 'income'}>, <Axes: title={'center': 'years_employed'}>, <Axes: title={'center': 'fico_score'}>], [<Axes: title={'center': 'default'}>, <Axes: >, <Axes: >]], dtype=object)
It appears that the features follow a Gaussian (normal) distribution. Let's see if we our dataset is really suitable for a Gaussian Naive Bayes. So, we check if each feature, given a specific class of the target variable (default), follows a Gaussian distribution.
# Histogram of features when default = 1
df[df.default==1].hist(figsize=(10,10))
array([[<Axes: title={'center': 'credit_lines_outstanding'}>, <Axes: title={'center': 'loan_amt_outstanding'}>, <Axes: title={'center': 'total_debt_outstanding'}>], [<Axes: title={'center': 'income'}>, <Axes: title={'center': 'years_employed'}>, <Axes: title={'center': 'fico_score'}>], [<Axes: title={'center': 'default'}>, <Axes: >, <Axes: >]], dtype=object)
# Histogram of features when default = 0
df[df.default==0].hist(figsize=(10,10))
array([[<Axes: title={'center': 'credit_lines_outstanding'}>, <Axes: title={'center': 'loan_amt_outstanding'}>, <Axes: title={'center': 'total_debt_outstanding'}>], [<Axes: title={'center': 'income'}>, <Axes: title={'center': 'years_employed'}>, <Axes: title={'center': 'fico_score'}>], [<Axes: title={'center': 'default'}>, <Axes: >, <Axes: >]], dtype=object)
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)
# Naive Bayes
from sklearn.naive_bayes import GaussianNB
model_g = GaussianNB()
model_g.fit(df_train, y_train)
GaussianNB()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
GaussianNB()
# Accuracy on the training set
(y_train== model_g.predict(df_train)).mean()
0.9813333333333333
# Accuracy on the validation set, to see if it needs tuning
(y_val== model_g.predict(df_val)).mean()
0.9775
We get $97.8\%$ accuracy on the validation set. It looks like we don't need to tune the model.
# Accuracy on the test set:
(y_test== model_g.predict(df_test)).mean()
0.977
df_full = loan_data.copy()
del df_full['customer_id']
y_fulltrain = df_full.default.values
del df_full['default']
df_full.columns
Index(['credit_lines_outstanding', 'loan_amt_outstanding', 'total_debt_outstanding', 'income', 'years_employed', 'fico_score'], dtype='object')
def prompt_user():
data = {}
data['credit_lines_outstanding'] = float(input("Enter the number of credit lines outstanding: "))
data['loan_amt_outstanding'] = float(input("Enter the amount of loan outstanding: "))
data['total_debt_outstanding'] = float(input("Enter the total debt outstanding: "))
data['income'] = float(input("Enter the income: "))
data['years_employed'] = float(input("Enter the number of years employed: "))
data['fico_score'] = float(input("Enter the FICO score: "))
return data
# takes user input about the loan data
user_data = prompt_user()
This is the function that takes in the properties of a loan and outputs the expected loss
model = GaussianNB().fit(df_full, y_fulltrain)
# loan here is the loan profile dict
def expectedLoss():
loan = pd.DataFrame(prompt_user(), index=[0])
#return model.predict_proba(loan)[:,1][0]*100 + '\%'
prob_default = model.predict_proba(loan)[:,1][0]
loan_amt = float(loan.loan_amt_outstanding)
return prob_default * loan_amt * (1 - 0.1) # recovery rate = 10% (given)
expectedLoss()
/var/folders/h0/722z94dd3fb0pfv4wg3qmdkh0000gn/T/ipykernel_74903/2507886040.py:7: FutureWarning: Calling float on a single element Series is deprecated and will raise a TypeError in the future. Use float(ser.iloc[0]) instead loan_amt = float(loan.loan_amt_outstanding)
564.8283882093126