#!/usr/bin/env python
# coding: utf-8

# In[29]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Raw loan book as read from disk; kept untouched so later cells can start
# again from the original values.
loan_data = pd.read_csv('./02-loan_data.csv')

# Working frame: the raw data minus the identifier column.
df = loan_data.drop(columns='customer_id')

# Round the monetary columns to two decimal places.
columns_to_round = ['loan_amt_outstanding', 'total_debt_outstanding', 'income']
df[columns_to_round] = df[columns_to_round].round(2)


# In[30]:


# Column dtypes of the working frame.
df.dtypes


# In[31]:


# Number of missing values per column.
df.isnull().sum()


# Our data appears to have already been cleaned.

# In[32]:


# Split before anything reads the partitions: 20% is held out as the test
# set, then 25% of the remaining 80% (i.e. 20% of the whole) becomes the
# validation set, leaving 60% for training. Doing this first also guarantees
# that the label column is removed from the very frames the model is fitted
# and scored on.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

df_train.head()


# In[33]:


# Target vectors, extracted before the label column is dropped from the
# feature frames so that `default` is never used as a model input.
y_train = df_train['default'].values
y_val = df_val['default'].values
y_test = df_test['default'].values

df_train = df_train.drop(columns='default')
df_val = df_val.drop(columns='default')
df_test = df_test.drop(columns='default')


# ## Model (to predict PD)

# In[34]:


# Pairwise correlations on the unsplit frame (label column included).
df.corr()


# In[35]:


df.hist(figsize=(10,10))


# It appears that the features follow a Gaussian (normal) distribution. Let's see if our dataset is really suitable for a Gaussian Naive Bayes. So, we check if each feature, given a specific class of the target variable (default), follows a Gaussian distribution.

# In[36]:


# Histogram of features when default = 1
df[df.default==1].hist(figsize=(10,10))


# In[37]:


# Histogram of features when default = 0
df[df.default==0].hist(figsize=(10,10))


# In[39]:


# Gaussian Naive Bayes classifier fitted on the training partition.
from sklearn.naive_bayes import GaussianNB
model_g = GaussianNB().fit(df_train, y_train)


# In[40]:


# Training accuracy: share of training rows whose predicted label matches.
np.mean(model_g.predict(df_train) == y_train)


# In[41]:


# Validation accuracy, used to decide whether the model needs tuning.
np.mean(model_g.predict(df_val) == y_val)


# We get $97.8\%$ accuracy on the validation set. It looks like we don't need to tune the model.

# In[42]:


# Test accuracy.
np.mean(model_g.predict(df_test) == y_test)


# In[43]:


# Feature frame and label vector built from the complete raw dataset
# (identifier removed, label split off).
df_full = loan_data.drop(columns='customer_id')
y_fulltrain = df_full.pop('default').values


# In[44]:


# Feature names of the full-data frame.
df_full.columns


# In[45]:


def prompt_user():
    """Interactively collect one loan profile from standard input.

    Every answer is parsed with ``float``; an answer that is not a valid
    number raises ``ValueError``.

    Returns:
        A dict mapping each feature name to the value entered, in the order
        the questions are asked.
    """
    # (feature name, prompt text) pairs, listed in the order they are asked.
    questions = (
        ('credit_lines_outstanding', "Enter the number of credit lines outstanding: "),
        ('loan_amt_outstanding', "Enter the amount of loan outstanding: "),
        ('total_debt_outstanding', "Enter the total debt outstanding: "),
        ('income', "Enter the income: "),
        ('years_employed', "Enter the number of years employed: "),
        ('fico_score', "Enter the FICO score: "),
    )
    return {feature: float(input(text)) for feature, text in questions}


# In[48]:


# Ask the user for the loan profile once; it is passed to expectedLoss below
# so the questions are not asked a second time.
user_data = prompt_user()


# This is the function that takes in the properties of a loan and outputs the expected loss

# In[49]:


# Final model, fitted on the full-data feature frame and labels.
model = GaussianNB().fit(df_full, y_fulltrain)


def expectedLoss(loan=None, recovery_rate=0.1):
    """Return the expected loss on a loan.

    Computed as ``PD * loan_amt_outstanding * (1 - recovery_rate)``, where
    PD is the probability the fitted model assigns to the second class.

    Args:
        loan: Mapping of feature name to value, with the same keys that
            ``prompt_user`` returns. When omitted, the user is prompted.
        recovery_rate: Fraction of the outstanding amount assumed to be
            recovered after a default. Defaults to 0.1 (the given 10%).

    Returns:
        The expected loss as a float, in the same units as
        ``loan_amt_outstanding``.

    Raises:
        ValueError: If ``recovery_rate`` is outside ``[0, 1]``.
        KeyError: If ``loan`` lacks one of the training feature columns.
    """
    if not 0 <= recovery_rate <= 1:
        raise ValueError("recovery_rate must be between 0 and 1")
    if loan is None:
        loan = prompt_user()

    # One-row frame with columns in the same order as the training data, so
    # the result does not depend on the key order of the supplied mapping.
    loan_row = pd.DataFrame([loan])[list(df_full.columns)]

    # Column 1 of predict_proba is the second entry of model.classes_;
    # presumably default == 1 -- verify against the label encoding.
    prob_default = model.predict_proba(loan_row)[0, 1]

    # Take the scalar explicitly: calling float() on a Series is deprecated.
    loan_amt = float(loan_row['loan_amt_outstanding'].iloc[0])
    return prob_default * loan_amt * (1 - recovery_rate)


# In[50]:


expectedLoss(user_data)


# In[ ]: