#!/usr/bin/env python # coding: utf-8 # # Making a Credit Modeler # # In this project we will focus on credit modelling wich is a well known data science problem that focuses on modeling a borrower's credit risk. We will work with financial lending data from [Lending Club](https://www.lendingclub.com/), this website is a marketplace for personal loans that matches borrowers who are seeking a loan with investors looking to lend money and make a return. # # In the site each borrowers fills out a comprehensive application, providing their past financial history, the reason for the loan and other info. Lending Club evaluates each borrower's credit score using past historical data and assign an interest rate to the borrower in the rangre from 5.32% to 30.99%, at the same time each borrower is given a [grade](https://www.lendingclub.com/investing/investor-education/interest-rates-and-fees) according to the interest rate they were assigned. # # Lending club releases data for all of the approved and declined loan applications periodically [on their website](https://www.lendingclub.com/info/download-data.action), we can select a few diffrent year ranges to download the datasets in CSV format for both approved and declined loans. Also there is a data dictionary in XLS format wich contain information on the different column names towards the bottom of the page and can be also found [here](https://www.lendingclub.com/developers/listed-loans) # # The file used have approved loans data from 2007 to 2011 # # ## Data Cleaning # In[1]: import pandas as pd import numpy as np import matplotlib.pyplot as plt # In[2]: loans_2007 = pd.read_csv('loans_2007.csv') loans_2007.head() # In[3]: loans_2007.shape # This is a huge dataset with 42538 rows and 52 columns. After analyzing the data dictionary we can conclude that the following features (Columns) need to be removed in order to avoid overfitting: # * id: It's a randomly generated field by Lending Club for identification purposes # * member_id: It's also a randomly generated field for identification purposes # * funded_amnt: leaks data from the future (after the loan is already started to be funded) # * funded_amnt_inv: Also leaks data from the future # * grade: contains redundant information as the interest rate column # * sub_grade: contains redundant information as the interest rate # * emp_title: requires other data and a lot of processing to potentially be useful # * issue_d: leaks data from the future # * zip_code: redundant with the addr_state column since only the first 3 digits of the 5 digit zip code are visible # * out_prncp: leaks data from the future # * out_prncp_inv: leaks data from the future # * total_pymnt: leaks data from the future # * total_pymnt_inv: leaks data from the future # * total_rec_prncp: leaks data from the future # * total_rec_int: leaks data from the future # * total_rec_late_fee: leaks data from the future # * recoveries: leaks data from the future # * collection_recovery_fee: leaks data from the future # * last_pymnt_d: leaks data from the future # * last_pymnt_amnt: leaks data from the future # # In[4]: loans_2007.drop(['id', 'member_id', 'funded_amnt', 'funded_amnt_inv', 'grade', 'sub_grade', 'emp_title', 'issue_d', 'zip_code', 'out_prncp', 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee', 'recoveries', 'collection_recovery_fee', 'last_pymnt_d', 'last_pymnt_amnt'], axis=1, inplace=True) loans_2007.head(1) # We reduced the number of columns for 52 to 32, a profound review of the columns helped to realized that the column loan_status should be used as the target column. This because it's the only column that directly describes if a loan was paid off on time, had delayed payments, or was defaulted on the borrower. However, this column contains text values that we need to transform into numerical for training a model # In[5]: loans_2007['loan_status'].value_counts(dropna=False) # Acording to the website the loan statuses means # # |Loan status|Meaning| # |:-----------|:-------| # |Fully Paid|Loan has been fully paid off| # |Charged Off|Loan for wich there is no longer a reasonable expectation of further payments| # |Does not meet the credit policy. Status: Fully Paid|While the loan was paid off, the loan application today would no longer meet the credit policy and wouldn't be approved on the marketplace| # |Does not meet the credit policy. Status:Charged Off|While the loan was charged off, the loan application today would no longer meet the credit policy and wouldn't be approved on to the marketplace| # |In Grace Period|The loan is past due but still in the grace period of 15 days| # |Late (16-30 days)|Loan hasn't been paid in 16 to 30 days (late on the current payment)| # |Late (31-120 days)|Loan hasn't been paid in 31 to 120 days (late on the current payment)| # |Current|Loan is up to date on current payments| # |Default|Loan is defaulted on and no payment has been made for more than 121 days| # # As our interest is to predict wich loans will be paid off on time and wich ones won't be, only the Fully Paid and Charged Off values describe the final outcome of the loan. The other values describe loans that are still ongoing and where the jury is still out on if the borrower will pay back the loan on time or not. # # One could say that the Default status is similar to the Charged Off, but in the eyes of Lending Club, loans that are charged off have essentially no chance of being repaid while default one have a small chance, according to this [article](https://help.lendingclub.com/hc/en-us/articles/216127747). # # As we are going to predict which of these 2 values a loan will fall under, we can treat the problem as a binary classification. So we need to remove all the rows that don't contain Fully Paid and Charged Off. Then we need to transform both strings into numbers, 1 and 0 respectively. # In[6]: loans_2007 = loans_2007[(loans_2007['loan_status'] == 'Fully Paid') | (loans_2007['loan_status'] == 'Charged Off')] loans_2007 = loans_2007.replace({'loan_status':{'Fully Paid':1, 'Charged Off': 0}}) loans_2007['loan_status'].value_counts() # Here we found something, there are about 6 times as many loans that were paid off on time that those that werent. This imbalance can cause problems when we run the machine learning algorithms. # # We also need to remove those columns that only have on unique value, because these columns won't be useful for the model since they don't add any information to each loan application. # In[7]: columns = loans_2007.columns cols_to_drop = [] for col in columns: col_series = loans_2007[col].dropna().unique() if len(col_series) == 1: cols_to_drop.append(col) loans_2007.drop(cols_to_drop, axis=1, inplace=True) print(cols_to_drop) # In[8]: loans_2007.shape # ## Preparing the data # # After cleaning the data and selecting the features that we will use in the model, we need to prepare the data to fit and test the model. # # First we need to check the columns that have missing values and then try to find a strategy for handling them # In[9]: null_counts = loans_2007.isnull().sum() print(null_counts[null_counts > 0]) # The missing values in title and last_credit_pull_d can be dropped because they don't have a high amount of missing values, however the rest of the columns need to be filled. # # ### Analysis of each column # # The emp_length column contains data about the employment length in months of each user, NaN values indicates not employed, so we need to transform those values into a category, also this data is frequently used in assessing how risky a potential borrower is. # # On the other hand, revol_util indicates the revolving line utilization rate or the amount of credit the borrower is using relative to all credit. # # title it's the loan title provided by the borrower. # # Let's inspect the values in the column pub_rec_bankruptcies # In[10]: loans_2007['pub_rec_bankruptcies'].value_counts(normalize=True, dropna=False) # While the NaN values only represent 1.8% of the full data, the rest of the data don't have a huge variability, nearly 94% of the values are in the same category. # # This column shows the number of public record bankruptcies, and while it's an interesting data to assess the risk of not getting paid, the little variability in the data generates a bias that we don't want. # # We decided to drop the pub_rec_bankruptcies. As the columns contains categorical data we also decided to drop the rows that have missing values as they represent 2.76% of the data. # In[11]: loans_2007.drop('pub_rec_bankruptcies', axis=1, inplace=True) loans_2007.dropna(axis=0, inplace=True) loans_2007.head(1) # Now let's check how many types of data there are in the dataframe # In[12]: loans_2007.dtypes.value_counts() # We see that we have object (same as strings), float and integers. The model can work with float and integers, but no with object dtypes. So let's isolate the object dtype in a dataframe to explore it # In[13]: object_columns_df = loans_2007.select_dtypes(include=['object']) object_columns_df.head(1) # Some of the columns here represent categorical values, but we need to confirm this by checking the number of unique values in those columns: # * term: number of payments on the loan, either 36 or 60 # * emp_length: number of years the borrower was employed upon time of application # * home_ownership: Is the home ownership status provided by the borrower during registration or obtained from the credit report. The possible values are: RENT, OWN, MORTAGE, OTHER # * verification_status: indicates if the income was verified by Lending Club # * purpose: It's a category provided by the borrower of the loan request. It can have one in 12 possible values # * title: loan title provided by the borrower # * addr_state: borrower's state of residence # # There are also columns that represent numeric values, that need to be converted: # * int_rate interest rate of the loan in % # * revol_util revolving line utilization rate or the amount of credit the borrower is using relative to all credit # # Both columns earliest_cr_line and last_credit_pull_d have date values and would require a good amount of feature engineering for them to be potentially useful. This columns will be dropped. # # Let's check the unique values in the other columns to decide what we will do with them # In[14]: cols = ['term', 'emp_length', 'home_ownership', 'verification_status', 'purpose', 'title', 'addr_state'] for col in cols: print(col) print(loans_2007[col].value_counts()) print("-"*20) # The emp_length can be cleaned and treated as a numerical one since the values have ordering. # # The data in purpose and title is quite similar and the information overlaps. The purpose contains few discrete values, meanwhile the title has quality issueas since many of the values are repeated with slight modifications, so we keep the purpose column and drop title # # The rest of the columns contains multiple discrete values, that will be transformed. # # ### Data transformation # # First let's drop the title, last_credit_pull_d, addr_state, earliest_cr_line columns. # # Then we will replace the values in the emp_length assuming that people who may have been working more than 10 years have only really worked for 10 years, also we assume that people who have worked less than a year or if the information is nor available have worked for 0. # # We will drop the addr_state column because contains 49 different discrete values and we would need 49 dummy variable columns to use this information for classification. This would make the Dataframe much larger and could slow down how quickly the code runs. # # We will also transform the int_rate and revol_util columns to float columns. # In[15]: loans_2007.drop(['title', 'last_credit_pull_d', 'addr_state', 'earliest_cr_line'], axis=1, inplace=True) # In[16]: mapping_dict = { 'emp_length':{ '10+ years': 10, '9 years': 9, '8 years': 8, '7 years': 7, '6 years': 6, '5 years': 5, '4 years': 4, '3 years': 3, '2 years': 2, '1 year': 1, '< 1 year': 0, 'n/a': 0 } } loans_2007.replace(mapping_dict, inplace=True) loans_2007['emp_length'].value_counts() # In[17]: loans_2007['int_rate'] = loans_2007['int_rate'].str.rstrip('%').astype('float') loans_2007['revol_util'] = loans_2007['revol_util'].str.rstrip('%').astype('float') loans_2007[['int_rate', 'revol_util']].dtypes # Now let's encode the home_ownership, verification_status, purpose and term columns as dummy variables so we can use them in the model. # In[18]: dummy_df = pd.get_dummies(loans_2007[['home_ownership', 'verification_status', 'purpose', 'term']]) loans_2007 = pd.concat([loans_2007, dummy_df], axis=1) loans_2007.drop(['home_ownership', 'verification_status', 'purpose', 'term'], axis=1, inplace=True) loans_2007.head(1) # ## Predicting status # # As we said back when we converted the loan_status column to a binary column this is a classification problem. We need an error metric to figure out when our model is performing weel, and when it's performing poorly. In this case we are concerned with false positives and false negatives. Both of these are different types of misclassifications. # # A false positive means that a loan will be paid off on time, but it actually isn't. Meanwhile a false negative, we predict that a loan won't be paid off on time, but it actually would be. Since we're looking at this problem with the eyes of a conservative investor, we need to treat false positives differently than false negatives. A conservative investor would want to minimize risk, and avoid false positives as much as possible. # # We also mentioned earlier that there is a significant class imbalance in the loan_status column. There are 6 times as many loans that were paid off on time (1) than loans that weren't (0). This causes a major issue when we use accuracy as a metric. This because a classifier can predict 1 for every row and still have high accuracy. # # Considering this we decided to use the false positive rate and the false negative rate as our error metric. The false positive rate is the number of false positives divided by the number of false positives plus the number of true negatives. This divides all the cases where we thought a loan would be paid off but it wasn't by all the loans that weren't paid off # # \begin{equation} # \text{False Positive Rate} = \frac{\text{False Positives}} {\text{False Positives} + \text{True Negatives}} # \end{equation} # # True positive rate is the number of true positives divided by the number of true positives plus the number of false negatives. This divides all the cases where we thought a loan would be paid off and it was by all the loans that were paid off # # \begin{equation} # \text{True Positive Rate} = \frac{\text{True Positives}} {\text{True Positives} + \text{False Negatives}} # \end{equation} # ### Logistic regression # # This is the first algorithm we're going to apply for the following reasons # # * It's quick to train and we can iterate more quickly # * It's less prone to overfitting than more complex models like decision trees # * It's easy to interpret # In[19]: columns = loans_2007.columns cols = columns.drop('loan_status') features = loans_2007[cols] target = loans_2007['loan_status'] from sklearn.linear_model import LogisticRegression from sklearn.model_selection import cross_val_predict lr = LogisticRegression() predictions = cross_val_predict(lr, features, target, cv=3) predictions = pd.Series(predictions) fp_filter = (predictions == 1) & (loans_2007['loan_status'] == 0) fp = len(predictions[fp_filter]) tp_filter = (predictions == 1) & (loans_2007['loan_status'] == 1) tp = len(predictions[tp_filter]) fn_filter = (predictions == 0) & (loans_2007['loan_status'] == 1) fn = len(predictions[fn_filter]) tn_filter = (predictions == 0) & (loans_2007['loan_status'] == 0) tn = len(predictions[tn_filter]) fpr = fp / (fp + tn) tpr = tp / (tp + fn) print('True Positive Rate:', tpr) print('False Positive Rate:', fpr) # The results shows us that the model predicted mostly ones. If we check the values in predictions we can confirm that # In[20]: predictions.value_counts() # Only 64 rows were predicted as 0, that's 0.17% of the data. This happens because of the imbalance we talked about before. There are two ways to get a classifier to correct for imbalanced classes: # # 1. Use oversampling and undersampling to ensure that the classifier gets input that has a balanced number of each class # 2. Tell the classifier to penalize misclassifications of the less prevalent class more thatn the other class # # Let's look into oversampling and undersampling. They involve taking a sample that contains equal numbers of rows where loan_status is 0, and where loan_status is 1. This way, the classifier is forced to make actual predictions, since predicting all 1s and 0s will only result in 50% accuracy at most. The downside of this technique is that since it has to preserve an equal ratio we need to: # * Throw out many rows of data. If we wanted equal numbers of rows where loan_status is 0 and where loan_status is 1. One way we could do that is to delete rows where loan_status is 1 # * Copy rows multiple times. One way to equalize the 0s and 1s is to copy rows where loan_status is 0 # * Generate fake data. One way to equalize the 0s and 1s is to generate new rows where loan_status is 0 # # The second method, telling the classifier to penalize certain rows more, is much easier to implement using scikit-learn. We can do this by setting the class_weight parameter to balanced when creating the LogisticRegression instance. This will tell scikit-learn to penalize the misclassification of the minority class during the training process. The penalty means that the logistic regression classifier pays more attention to correctly classifying rows where loan_status is 0. This lowers accuracy when loan_status is 1, but raises accuracy when loan_status is 0. # In[21]: lr = LogisticRegression(class_weight="balanced") predictions = cross_val_predict(lr, features, target, cv=3) predictions = pd.Series(predictions) fp_filter = (predictions == 1) & (loans_2007['loan_status'] == 0) fp = len(predictions[fp_filter]) tp_filter = (predictions == 1) & (loans_2007['loan_status'] == 1) tp = len(predictions[tp_filter]) fn_filter = (predictions == 0) & (loans_2007['loan_status'] == 1) fn = len(predictions[fn_filter]) tn_filter = (predictions == 0) & (loans_2007['loan_status'] == 0) tn = len(predictions[tn_filter]) fpr = fp / (fp + tn) tpr = tp / (tp + fn) print('False Positive Rate:', fpr) print('True Positive Rate:', tpr) # In[22]: penalty = { 0: 10, 1: 1 } lr = LogisticRegression(max_iter=10000, class_weight=penalty) predictions = cross_val_predict(lr, features, target, cv=3) predictions = pd.Series(predictions) fp_filter = (predictions == 1) & (loans_2007['loan_status'] == 0) fp = len(predictions[fp_filter]) tp_filter = (predictions == 1) & (loans_2007['loan_status'] == 1) tp = len(predictions[tp_filter]) fn_filter = (predictions == 0) & (loans_2007['loan_status'] == 1) fn = len(predictions[fn_filter]) tn_filter = (predictions == 0) & (loans_2007['loan_status'] == 0) tn = len(predictions[tn_filter]) fpr = fp / (fp + tn) tpr = tp / (tp + fn) print(fpr) print(tpr) # ### Random Forest # # Random forest are able to work with nonlinear data, and learn complex conditionals. Logistic regressions are only able to work with linear data. Training a random forest algorithm may enable us to get more accuracy due to columns that correlate nonlinearly with loan_status # In[23]: from sklearn.ensemble import RandomForestClassifier rf = RandomForestClassifier(class_weight="balanced", random_state=1) predictions = cross_val_predict(rf, features, target, cv=3) predictions = pd.Series(predictions) # False positives. fp_filter = (predictions == 1) & (loans_2007["loan_status"] == 0) fp = len(predictions[fp_filter]) # True positives.` tp_filter = (predictions == 1) & (loans_2007["loan_status"] == 1) tp = len(predictions[tp_filter]) # False negatives. fn_filter = (predictions == 0) & (loans_2007["loan_status"] == 1) fn = len(predictions[fn_filter]) # True negatives tn_filter = (predictions == 0) & (loans_2007["loan_status"] == 0) tn = len(predictions[tn_filter]) # Rates tpr = tp / (tp + fn) fpr = fp / (fp + tn) print(tpr) print(fpr) # In[ ]: