#!/usr/bin/env python
# coding: utf-8

# In[29]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

loan_data = pd.read_csv('./02-loan_data.csv')

# Work on a copy so the raw frame stays untouched for later cells.
df = loan_data.copy()
del df['customer_id']

# Round the selected columns to two decimal places.
columns_to_round = ['loan_amt_outstanding', 'total_debt_outstanding', 'income']
df[columns_to_round] = df[columns_to_round].round(2)


# In[30]:

df.dtypes


# In[31]:

df.isnull().sum()


# Our data appears to have already been cleaned.

# ## Model (to predict PD)

# In[34]:

df.corr()


# In[35]:

df.hist(figsize=(10, 10))


# It appears that the features follow a Gaussian (normal) distribution. Let's
# see if our dataset is really suitable for Gaussian Naive Bayes: we check
# whether each feature, given a specific class of the target variable
# (default), follows a Gaussian distribution.

# In[36]:

# Histogram of features when default = 1
df[df.default == 1].hist(figsize=(10, 10))


# In[37]:

# Histogram of features when default = 0
df[df.default == 0].hist(figsize=(10, 10))


# In[38]:

# First hold out 20% of the rows for testing, then take 25% of the remaining
# rows for validation; random_state pins both shuffles.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)


# In[32]:

df_train.head()


# In[33]:

# The targets must be extracted AFTER the split above. Previously this cell
# ran before the split (NameError when executed top to bottom) and the
# re-created frames still carried the `default` column, so the label was fed
# to the classifier as a feature.
y_train = df_train['default'].values
y_val = df_val['default'].values
y_test = df_test['default'].values

# Drop the label from every feature frame; `drop` returns new frames, so the
# objects produced by train_test_split are not mutated in place.
df_train = df_train.drop(columns='default')
df_val = df_val.drop(columns='default')
df_test = df_test.drop(columns='default')


# In[39]:

# Naive Bayes
model_g = GaussianNB()
model_g.fit(df_train, y_train)


# In[40]:

# Accuracy on the training set
(y_train == model_g.predict(df_train)).mean()


# In[41]:

# Accuracy on the validation set, to see if it needs tuning
(y_val == model_g.predict(df_val)).mean()


# The original notebook recorded $97.8\%$ accuracy on the validation set and
# concluded that no tuning was needed.
# NOTE(review): that figure was measured while `default` was still among the
# features; re-run this cell and confirm the conclusion still holds.
# In[42]:

# Accuracy on the test set:
(y_test == model_g.predict(df_test)).mean()


# In[43]:

# Rebuild the feature matrix and target from ALL rows for the final model.
df_full = loan_data.copy()
del df_full['customer_id']

y_fulltrain = df_full.default.values
del df_full['default']


# In[44]:

df_full.columns


# In[45]:

def prompt_user():
    """Interactively collect one loan profile from standard input.

    Returns:
        dict: feature name -> the float parsed from the user's answer, in the
        order the questions are asked.

    Raises:
        ValueError: if an answer cannot be parsed by ``float``.
    """
    questions = (
        ('credit_lines_outstanding', "Enter the number of credit lines outstanding: "),
        ('loan_amt_outstanding', "Enter the amount of loan outstanding: "),
        ('total_debt_outstanding', "Enter the total debt outstanding: "),
        ('income', "Enter the income: "),
        ('years_employed', "Enter the number of years employed: "),
        ('fico_score', "Enter the FICO score: "),
    )
    return {field: float(input(question)) for field, question in questions}


# In[48]:

# takes user input about the loan data
user_data = prompt_user()


# This is the function that takes in the properties of a loan and outputs the
# expected loss

# In[49]:

model = GaussianNB().fit(df_full, y_fulltrain)


def expectedLoss(loan=None, recovery_rate=0.1):
    """Return the expected loss for a single loan profile.

    Computed as ``P(default) * loan_amt_outstanding * (1 - recovery_rate)``,
    with ``P(default)`` taken from the module-level ``model``.

    Args:
        loan: dict mapping feature names to numeric values (the shape returned
            by ``prompt_user``). When ``None`` the user is prompted, which
            keeps the original zero-argument call working.
        recovery_rate: fraction of the outstanding amount assumed to be
            recovered after a default. Defaults to 0.1 (10%, as given).

    Returns:
        The expected loss, in the same unit as ``loan_amt_outstanding``.

    Raises:
        ValueError: if ``recovery_rate`` is outside ``[0, 1]``.
        KeyError: if ``loan`` lacks one of the columns the model was fit on.
    """
    if not 0 <= recovery_rate <= 1:
        raise ValueError("recovery_rate must be between 0 and 1")
    if loan is None:
        loan = prompt_user()

    # Select exactly the training columns, in training order, so the input
    # layout always matches what the model was fit on.
    loan_frame = pd.DataFrame(loan, index=[0])[list(df_full.columns)]

    # Column 1 of predict_proba belongs to the second (larger) class label,
    # presumably default == 1 -- verify against model.classes_.
    prob_default = model.predict_proba(loan_frame)[:, 1][0]

    # .iloc[0] instead of float(Series): calling float() on a one-element
    # Series is deprecated in recent pandas releases.
    loan_amt = float(loan_frame['loan_amt_outstanding'].iloc[0])

    return prob_default * loan_amt * (1 - recovery_rate)


# In[50]:

# Reuse the profile collected above instead of prompting a second time.
expectedLoss(user_data)


# In[ ]: