# Import necessary packages
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
# Initialize instance of H2O
h2o.init()
Checking whether there is an H2O instance running at http://localhost:54321..... not found. Attempting to start a local H2O server... Java Version: java version "1.7.0_80"; Java(TM) SE Runtime Environment (build 1.7.0_80-b15); Java HotSpot(TM) 64-Bit Server VM (build 24.80-b11, mixed mode) Starting server from /Users/megankurka/anaconda2/lib/python2.7/site-packages/h2o/backend/bin/h2o.jar Ice root: /var/folders/gv/w2f3zs_d33l3dt5j67k9nhhr0000gn/T/tmplmDKFI JVM stdout: /var/folders/gv/w2f3zs_d33l3dt5j67k9nhhr0000gn/T/tmplmDKFI/h2o_megankurka_started_from_python.out JVM stderr: /var/folders/gv/w2f3zs_d33l3dt5j67k9nhhr0000gn/T/tmplmDKFI/h2o_megankurka_started_from_python.err Server is running at http://127.0.0.1:54321 Connecting to H2O server at http://127.0.0.1:54321... successful.
H2O cluster uptime: | 02 secs |
H2O cluster version: | 3.10.4.4 |
H2O cluster version age: | 9 days |
H2O cluster name: | H2O_from_python_megankurka_a8slf8 |
H2O cluster total nodes: | 1 |
H2O cluster free memory: | 3.556 Gb |
H2O cluster total cores: | 8 |
H2O cluster allowed cores: | 8 |
H2O cluster status: | accepting new members, healthy |
H2O connection url: | http://127.0.0.1:54321 |
H2O connection proxy: | None |
H2O internal security: | False |
Python version: | 2.7.13 final |
# If possible download from the s3 link and change the path to the dataset
path = "http://h2o-public-test-data.s3.amazonaws.com/bigdata/laptop/lending-club/LoanStats3a.csv"
# Specify some column types to "String" that we want to munge later
types = {"int_rate":"string", "revol_util":"string", "emp_length":"string", "verification_status":"string"}
# Task 1: Import the file and look at the frame
loan_stats = h2o.import_file(path=path, col_types= types)
loan_stats.describe()
Parse progress: |█████████████████████████████████████████████████████████| 100% Rows:42538 Cols:52
id | member_id | loan_amnt | funded_amnt | funded_amnt_inv | term | int_rate | installment | grade | sub_grade | emp_title | emp_length | home_ownership | annual_inc | verification_status | issue_d | loan_status | pymnt_plan | url | desc | purpose | title | zip_code | addr_state | dti | delinq_2yrs | earliest_cr_line | inq_last_6mths | mths_since_last_delinq | mths_since_last_record | open_acc | pub_rec | revol_bal | revol_util | total_acc | initial_list_status | out_prncp | out_prncp_inv | total_pymnt | total_pymnt_inv | total_rec_prncp | total_rec_int | total_rec_late_fee | recoveries | collection_recovery_fee | last_pymnt_d | last_pymnt_amnt | next_pymnt_d | last_credit_pull_d | collections_12_mths_ex_med | mths_since_last_major_derog | policy_code | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
type | int | int | int | int | real | enum | string | real | enum | enum | enum | string | enum | real | string | time | enum | enum | string | enum | enum | enum | enum | enum | real | int | time | int | enum | enum | int | int | int | string | int | enum | real | real | real | real | real | real | real | real | real | time | real | time | time | int | enum | int |
mins | 54734.0 | 70473.0 | 500.0 | 500.0 | 0.0 | NaN | 15.67 | NaN | 1896.0 | NaN | 1.180656e+12 | NaN | 0.0 | 0.0 | -7.573824e+11 | 0.0 | 1.0 | 0.0 | 0.0 | NaN | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.1964672e+12 | 0.0 | 1.1964672e+12 | 1.1779776e+12 | 0.0 | 1.0 | ||||||||||||||||
mean | 664579.85231 | 825702.55117 | 11089.7225814 | 10821.5857529 | 10139.8306012 | NaN | 322.625618667 | NaN | 69136.5564203 | NaN | 1.28564160993e+12 | NaN | 13.3730431409 | 0.152449066014 | 8.54295612817e+11 | 1.08142379899 | 9.34395144215 | 0.0581564955536 | 14297.8609145 | NaN | 22.1244059662 | 329.411123545 | 327.08996544 | 11651.753137 | 10948.5801657 | 9368.08058187 | 2209.04911203 | 1.45873268638 | 73.164719007 | 9.07681827201 | 1.36032037132e+12 | 2552.60725685 | 1.38132344523e+12 | 1.38597191711e+12 | 0.0 | 1.0 | ||||||||||||||||
maxs | 1077501.0 | 1314167.0 | 35000.0 | 35000.0 | 35000.0 | NaN | 1305.19 | NaN | 6000000.0 | NaN | 1.3226976e+12 | NaN | 29.99 | 13.0 | 1.2254976e+12 | 33.0 | 47.0 | 5.0 | 1207359.0 | NaN | 90.0 | 17749.51 | 17749.51 | 56809.0516288 | 56475.05 | 35000.03 | 21809.05 | 208.819529958 | 29623.35 | 7002.19 | 1.4331168e+12 | 36115.2 | 1.4383872e+12 | 1.4357088e+12 | 0.0 | 1.0 | ||||||||||||||||
sigma | 219302.219319 | 279540.905635 | 7410.93839055 | 7146.91467501 | 7131.68644868 | NaN | 208.928069 | NaN | 64096.3497189 | NaN | 31759283733.1 | NaN | 6.72631490173 | 0.512406485017 | 2.16068688522e+11 | 1.5274548353 | 4.49627387569 | 0.245713140651 | 22018.4410097 | NaN | 11.5928113373 | 1417.54568613 | 1409.61373349 | 8555.46989933 | 8486.98039403 | 6777.11981861 | 2502.52824092 | 7.57275617593 | 612.26018535 | 149.086823026 | 47801906399.7 | 4380.38476617 | 69074077358.6 | 49253773579.3 | 0.0 | 0.0 | ||||||||||||||||
zeros | 0 | 0 | 0 | 0 | 233 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 206 | 37771 | 4 | 19657 | 0 | 40130 | 1119 | 0 | 0 | 38972 | 38972 | 26 | 304 | 86 | 83 | 40214 | 38357 | 39033 | 0 | 95 | 0 | 0 | 42390 | 0 | ||||||||||||||||
missing | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 7 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 32 | 32 | 32 | 3 | 3 | 32 | 32 | 3 | 3 | 32 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 86 | 3 | 36281 | 7 | 148 | 3 | 3 |
0 | 1077501.0 | 1296599.0 | 5000.0 | 5000.0 | 4975.0 | 36 months | 10.65% | 162.87 | B | B2 | 10+ years | RENT | 24000.0 | VERIFIED - income | 2011-12-01 00:00:00 | Fully Paid | n | https://www.lendingclub.com/browse/loanDetail.action?loan_id=1077501 | Borrower added on 12/22/11 > I need to upgrade my business technologies. | credit_card | Computer | 860xx | AZ | 27.65 | 0.0 | 1985-01-01 00:00:00 | 1.0 | 3.0 | 0.0 | 13648.0 | 83.7% | 9.0 | f | 0.0 | 0.0 | 5861.07141425 | 5831.78 | 5000.0 | 861.07 | 0.0 | 0.0 | 0.0 | 2015-01-01 00:00:00 | 171.62 | 2015-01-01 00:00:00 | 0.0 | 1.0 | |||||
1 | 1077430.0 | 1314167.0 | 2500.0 | 2500.0 | 2500.0 | 60 months | 15.27% | 59.83 | C | C4 | Ryder | < 1 year | RENT | 30000.0 | VERIFIED - income source | 2011-12-01 00:00:00 | Charged Off | n | https://www.lendingclub.com/browse/loanDetail.action?loan_id=1077430 | Borrower added on 12/22/11 > I plan to use this money to finance the motorcycle i am looking at. I plan to have it paid off as soon as possible/when i sell my old bike. I only need this money because the deal im looking at is to good to pass up. Borrower added on 12/22/11 > I plan to use this money to finance the motorcycle i am looking at. I plan to have it paid off as soon as possible/when i sell my old bike.I only need this money because the deal im looking at is to good to pass up. I have finished college with an associates degree in business and its takingmeplaces | car | bike | 309xx | GA | 1.0 | 0.0 | 1999-04-01 00:00:00 | 5.0 | 3.0 | 0.0 | 1687.0 | 9.4% | 4.0 | f | 0.0 | 0.0 | 1008.71 | 1008.71 | 456.46 | 435.17 | 0.0 | 117.08 | 1.11 | 2013-04-01 00:00:00 | 119.66 | 2013-09-01 00:00:00 | 0.0 | 1.0 | ||||
2 | 1077175.0 | 1313524.0 | 2400.0 | 2400.0 | 2400.0 | 36 months | 15.96% | 84.33 | C | C5 | 10+ years | RENT | 12252.0 | not verified | 2011-12-01 00:00:00 | Fully Paid | n | https://www.lendingclub.com/browse/loanDetail.action?loan_id=1077175 | small_business | real estate business | 606xx | IL | 8.72 | 0.0 | 2001-11-01 00:00:00 | 2.0 | 2.0 | 0.0 | 2956.0 | 98.5% | 10.0 | f | 0.0 | 0.0 | 3003.65364445 | 3003.65 | 2400.0 | 603.65 | 0.0 | 0.0 | 0.0 | 2014-06-01 00:00:00 | 649.91 | 2015-06-01 00:00:00 | 0.0 | 1.0 | ||||||
3 | 1076863.0 | 1277178.0 | 10000.0 | 10000.0 | 10000.0 | 36 months | 13.49% | 339.31 | C | C1 | AIR RESOURCES BOARD | 10+ years | RENT | 49200.0 | VERIFIED - income source | 2011-12-01 00:00:00 | Fully Paid | n | https://www.lendingclub.com/browse/loanDetail.action?loan_id=1076863 | Borrower added on 12/21/11 > to pay for property tax (borrow from friend, need to pay back) & central A/C need to be replace. I'm very sorry to let my loan expired last time. | other | personel | 917xx | CA | 20.0 | 0.0 | 1996-02-01 00:00:00 | 1.0 | 35 | 10.0 | 0.0 | 5598.0 | 21% | 37.0 | f | 0.0 | 0.0 | 12226.3022123 | 12226.3 | 10000.0 | 2209.33 | 16.97 | 0.0 | 0.0 | 2015-01-01 00:00:00 | 357.48 | 2015-01-01 00:00:00 | 0.0 | 1.0 | |||
4 | 1075358.0 | 1311748.0 | 3000.0 | 3000.0 | 3000.0 | 60 months | 12.69% | 67.79 | B | B5 | University Medical Group | 1 year | RENT | 80000.0 | VERIFIED - income source | 2011-12-01 00:00:00 | Current | n | https://www.lendingclub.com/browse/loanDetail.action?loan_id=1075358 | Borrower added on 12/21/11 > I plan on combining three large interest bills together and freeing up some extra each month to pay toward other bills. I've always been a good payor but have found myself needing to make adjustments to my budget due to a medical scare. My job is very stable, I love it. | other | Personal | 972xx | OR | 17.94 | 0.0 | 1996-01-01 00:00:00 | 0.0 | 38 | 15.0 | 0.0 | 27783.0 | 53.9% | 38.0 | f | 1168.04 | 1168.04 | 2767.64 | 2767.64 | 1831.96 | 935.68 | 0.0 | 0.0 | 0.0 | 2015-06-01 00:00:00 | 67.79 | 2015-07-01 00:00:00 | 2015-06-01 00:00:00 | 0.0 | 1.0 | ||
5 | 1075269.0 | 1311441.0 | 5000.0 | 5000.0 | 5000.0 | 36 months | 7.90% | 156.46 | A | A4 | Veolia Transportaton | 3 years | RENT | 36000.0 | VERIFIED - income source | 2011-12-01 00:00:00 | Fully Paid | n | https://www.lendingclub.com/browse/loanDetail.action?loan_id=1075269 | wedding | My wedding loan I promise to pay back | 852xx | AZ | 11.2 | 0.0 | 2004-11-01 00:00:00 | 3.0 | 9.0 | 0.0 | 7963.0 | 28.3% | 12.0 | f | 0.0 | 0.0 | 5631.37775318 | 5631.38 | 5000.0 | 631.38 | 0.0 | 0.0 | 0.0 | 2015-01-01 00:00:00 | 161.03 | 2015-06-01 00:00:00 | 0.0 | 1.0 | |||||
6 | 1069639.0 | 1304742.0 | 7000.0 | 7000.0 | 7000.0 | 60 months | 15.96% | 170.08 | C | C5 | Southern Star Photography | 8 years | RENT | 47004.0 | not verified | 2011-12-01 00:00:00 | Current | n | https://www.lendingclub.com/browse/loanDetail.action?loan_id=1069639 | Borrower added on 12/18/11 > I am planning on using the funds to pay off two retail credit cards with 24.99% interest rates, as well as a major bank credit card with a 18.99% rate. I pay all my bills on time, looking for a lower combined payment and lower monthly payment. | debt_consolidation | Loan | 280xx | NC | 23.51 | 0.0 | 2005-07-01 00:00:00 | 1.0 | 7.0 | 0.0 | 17726.0 | 85.6% | 11.0 | f | 2853.21 | 2853.21 | 6946.28 | 6946.28 | 4146.8 | 2799.48 | 0.0 | 0.0 | 0.0 | 2015-06-01 00:00:00 | 170.08 | 2015-08-01 00:00:00 | 2015-06-01 00:00:00 | 0.0 | 1.0 | |||
7 | 1072053.0 | 1288686.0 | 3000.0 | 3000.0 | 3000.0 | 36 months | 18.64% | 109.43 | E | E1 | MKC Accounting | 9 years | RENT | 48000.0 | VERIFIED - income source | 2011-12-01 00:00:00 | Fully Paid | n | https://www.lendingclub.com/browse/loanDetail.action?loan_id=1072053 | Borrower added on 12/16/11 > Downpayment for a car. | car | Car Downpayment | 900xx | CA | 5.35 | 0.0 | 2007-01-01 00:00:00 | 2.0 | 4.0 | 0.0 | 8221.0 | 87.5% | 4.0 | f | 0.0 | 0.0 | 3938.14433376 | 3938.14 | 3000.0 | 938.14 | 0.0 | 0.0 | 0.0 | 2015-01-01 00:00:00 | 111.34 | 2014-12-01 00:00:00 | 0.0 | 1.0 | ||||
8 | 1071795.0 | 1306957.0 | 5600.0 | 5600.0 | 5600.0 | 60 months | 21.28% | 152.39 | F | F2 | 4 years | OWN | 40000.0 | VERIFIED - income source | 2011-12-01 00:00:00 | Charged Off | n | https://www.lendingclub.com/browse/loanDetail.action?loan_id=1071795 | Borrower added on 12/21/11 > I own a small home-based judgment collection business. I have 5 years experience collecting debts. I am now going from a home office to a small office. I also plan to buy a small debt portfolio (eg. $10K for $1M of debt) My score is not A+ because I own my home and have no mortgage. | small_business | Expand Business & Buy Debt Portfolio | 958xx | CA | 5.55 | 0.0 | 2004-04-01 00:00:00 | 2.0 | 11.0 | 0.0 | 5210.0 | 32.6% | 13.0 | f | 0.0 | 0.0 | 646.02 | 646.02 | 162.02 | 294.94 | 0.0 | 189.06 | 2.09 | 2012-04-01 00:00:00 | 152.39 | 2012-08-01 00:00:00 | 0.0 | 1.0 | |||||
9 | 1071570.0 | 1306721.0 | 5375.0 | 5375.0 | 5350.0 | 60 months | 12.69% | 121.45 | B | B5 | Starbucks | < 1 year | RENT | 15000.0 | VERIFIED - income | 2011-12-01 00:00:00 | Charged Off | n | https://www.lendingclub.com/browse/loanDetail.action?loan_id=1071570 | Borrower added on 12/16/11 > I'm trying to build up my credit history. I live with my brother and have no car payment or credit cards. I am in community college and work full time. Im going to use the money to make some repairs around the house and get some maintenance done on my car. Borrower added on 12/20/11 > $1000 down only $4375 to go. Thanks to everyone that invested so far, looking forward to surprising my brother with the fixes around the house. | other | Building my credit history. | 774xx | TX | 18.08 | 0.0 | 2004-09-01 00:00:00 | 0.0 | 2.0 | 0.0 | 9279.0 | 36.5% | 3.0 | f | 0.0 | 0.0 | 1476.19 | 1469.34 | 673.48 | 533.42 | 0.0 | 269.29 | 2.52 | 2012-11-01 00:00:00 | 121.45 | 2013-03-01 00:00:00 | 0.0 | 1.0 |
# Task 2: Look at the levels int he response column, "loan_status"
# Hint: Use .table() function on the response column
loan_status | Count |
---|---|
Charged Off | 5435 |
Current | 3351 |
Default | 7 |
Does not meet the credit policy. Status:Charged Off | 761 |
Does not meet the credit policy. Status:Current | 53 |
Does not meet the credit policy. Status:Fully Paid | 1933 |
Does not meet the credit policy. Status:In Grace Period | 2 |
Fully Paid | 30843 |
In Grace Period | 60 |
Late (16-30 days) | 16 |
# Task 3: Drop all loans that are still in progess and therefore cannot be deemed good/bad loans
# Hint: "Current", "In Grace Period", "Late (16-30 days)", "Late (31-120 days)" are ongoing loans
loan_stats.show()
id | member_id | loan_amnt | funded_amnt | funded_amnt_inv | term | int_rate | installment | grade | sub_grade | emp_title | emp_length | home_ownership | annual_inc | verification_status | issue_d | loan_status | pymnt_plan | url | desc | purpose | title | zip_code | addr_state | dti | delinq_2yrs | earliest_cr_line | inq_last_6mths | mths_since_last_delinq | mths_since_last_record | open_acc | pub_rec | revol_bal | revol_util | total_acc | initial_list_status | out_prncp | out_prncp_inv | total_pymnt | total_pymnt_inv | total_rec_prncp | total_rec_int | total_rec_late_fee | recoveries | collection_recovery_fee | last_pymnt_d | last_pymnt_amnt | next_pymnt_d | last_credit_pull_d | collections_12_mths_ex_med | mths_since_last_major_derog | policy_code |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1.0775e+06 | 1.2966e+06 | 5000 | 5000 | 4975 | 36 months | 10.65% | 162.87 | B | B2 | 10+ years | RENT | 24000 | VERIFIED - income | 2011-12-01 00:00:00 | Fully Paid | n | https://www.lendingclub.com/browse/loanDetail.action?loan_id=1077501 | Borrower added on 12/22/11 > I need to upgrade my business technologies. | credit_card | Computer | 860xx | AZ | 27.65 | 0 | 1985-01-01 00:00:00 | 1 | 3 | 0 | 13648 | 83.7% | 9 | f | 0 | 0 | 5861.07 | 5831.78 | 5000 | 861.07 | 0 | 0 | 0 | 2015-01-01 00:00:00 | 171.62 | 2015-01-01 00:00:00 | 0 | 1 | |||||
1.07743e+06 | 1.31417e+06 | 2500 | 2500 | 2500 | 60 months | 15.27% | 59.83 | C | C4 | Ryder | < 1 year | RENT | 30000 | VERIFIED - income source | 2011-12-01 00:00:00 | Charged Off | n | https://www.lendingclub.com/browse/loanDetail.action?loan_id=1077430 | Borrower added on 12/22/11 > I plan to use this money to finance the motorcycle i am looking at. I plan to have it paid off as soon as possible/when i sell my old bike. I only need this money because the deal im looking at is to good to pass up. Borrower added on 12/22/11 > I plan to use this money to finance the motorcycle i am looking at. I plan to have it paid off as soon as possible/when i sell my old bike.I only need this money because the deal im looking at is to good to pass up. I have finished college with an associates degree in business and its takingmeplaces | car | bike | 309xx | GA | 1 | 0 | 1999-04-01 00:00:00 | 5 | 3 | 0 | 1687 | 9.4% | 4 | f | 0 | 0 | 1008.71 | 1008.71 | 456.46 | 435.17 | 0 | 117.08 | 1.11 | 2013-04-01 00:00:00 | 119.66 | 2013-09-01 00:00:00 | 0 | 1 | ||||
1.07718e+06 | 1.31352e+06 | 2400 | 2400 | 2400 | 36 months | 15.96% | 84.33 | C | C5 | 10+ years | RENT | 12252 | not verified | 2011-12-01 00:00:00 | Fully Paid | n | https://www.lendingclub.com/browse/loanDetail.action?loan_id=1077175 | small_business | real estate business | 606xx | IL | 8.72 | 0 | 2001-11-01 00:00:00 | 2 | 2 | 0 | 2956 | 98.5% | 10 | f | 0 | 0 | 3003.65 | 3003.65 | 2400 | 603.65 | 0 | 0 | 0 | 2014-06-01 00:00:00 | 649.91 | 2015-06-01 00:00:00 | 0 | 1 | ||||||
1.07686e+06 | 1.27718e+06 | 10000 | 10000 | 10000 | 36 months | 13.49% | 339.31 | C | C1 | AIR RESOURCES BOARD | 10+ years | RENT | 49200 | VERIFIED - income source | 2011-12-01 00:00:00 | Fully Paid | n | https://www.lendingclub.com/browse/loanDetail.action?loan_id=1076863 | Borrower added on 12/21/11 > to pay for property tax (borrow from friend, need to pay back) & central A/C need to be replace. I'm very sorry to let my loan expired last time. | other | personel | 917xx | CA | 20 | 0 | 1996-02-01 00:00:00 | 1 | 35 | 10 | 0 | 5598 | 21% | 37 | f | 0 | 0 | 12226.3 | 12226.3 | 10000 | 2209.33 | 16.97 | 0 | 0 | 2015-01-01 00:00:00 | 357.48 | 2015-01-01 00:00:00 | 0 | 1 | |||
1.07527e+06 | 1.31144e+06 | 5000 | 5000 | 5000 | 36 months | 7.90% | 156.46 | A | A4 | Veolia Transportaton | 3 years | RENT | 36000 | VERIFIED - income source | 2011-12-01 00:00:00 | Fully Paid | n | https://www.lendingclub.com/browse/loanDetail.action?loan_id=1075269 | wedding | My wedding loan I promise to pay back | 852xx | AZ | 11.2 | 0 | 2004-11-01 00:00:00 | 3 | 9 | 0 | 7963 | 28.3% | 12 | f | 0 | 0 | 5631.38 | 5631.38 | 5000 | 631.38 | 0 | 0 | 0 | 2015-01-01 00:00:00 | 161.03 | 2015-06-01 00:00:00 | 0 | 1 | |||||
1.07205e+06 | 1.28869e+06 | 3000 | 3000 | 3000 | 36 months | 18.64% | 109.43 | E | E1 | MKC Accounting | 9 years | RENT | 48000 | VERIFIED - income source | 2011-12-01 00:00:00 | Fully Paid | n | https://www.lendingclub.com/browse/loanDetail.action?loan_id=1072053 | Borrower added on 12/16/11 > Downpayment for a car. | car | Car Downpayment | 900xx | CA | 5.35 | 0 | 2007-01-01 00:00:00 | 2 | 4 | 0 | 8221 | 87.5% | 4 | f | 0 | 0 | 3938.14 | 3938.14 | 3000 | 938.14 | 0 | 0 | 0 | 2015-01-01 00:00:00 | 111.34 | 2014-12-01 00:00:00 | 0 | 1 | ||||
1.0718e+06 | 1.30696e+06 | 5600 | 5600 | 5600 | 60 months | 21.28% | 152.39 | F | F2 | 4 years | OWN | 40000 | VERIFIED - income source | 2011-12-01 00:00:00 | Charged Off | n | https://www.lendingclub.com/browse/loanDetail.action?loan_id=1071795 | Borrower added on 12/21/11 > I own a small home-based judgment collection business. I have 5 years experience collecting debts. I am now going from a home office to a small office. I also plan to buy a small debt portfolio (eg. $10K for $1M of debt) My score is not A+ because I own my home and have no mortgage. | small_business | Expand Business & Buy Debt Portfolio | 958xx | CA | 5.55 | 0 | 2004-04-01 00:00:00 | 2 | 11 | 0 | 5210 | 32.6% | 13 | f | 0 | 0 | 646.02 | 646.02 | 162.02 | 294.94 | 0 | 189.06 | 2.09 | 2012-04-01 00:00:00 | 152.39 | 2012-08-01 00:00:00 | 0 | 1 | |||||
1.07157e+06 | 1.30672e+06 | 5375 | 5375 | 5350 | 60 months | 12.69% | 121.45 | B | B5 | Starbucks | < 1 year | RENT | 15000 | VERIFIED - income | 2011-12-01 00:00:00 | Charged Off | n | https://www.lendingclub.com/browse/loanDetail.action?loan_id=1071570 | Borrower added on 12/16/11 > I'm trying to build up my credit history. I live with my brother and have no car payment or credit cards. I am in community college and work full time. Im going to use the money to make some repairs around the house and get some maintenance done on my car. Borrower added on 12/20/11 > $1000 down only $4375 to go. Thanks to everyone that invested so far, looking forward to surprising my brother with the fixes around the house. | other | Building my credit history. | 774xx | TX | 18.08 | 0 | 2004-09-01 00:00:00 | 0 | 2 | 0 | 9279 | 36.5% | 3 | f | 0 | 0 | 1476.19 | 1469.34 | 673.48 | 533.42 | 0 | 269.29 | 2.52 | 2012-11-01 00:00:00 | 121.45 | 2013-03-01 00:00:00 | 0 | 1 | ||||
1.07008e+06 | 1.3052e+06 | 6500 | 6500 | 6500 | 60 months | 14.65% | 153.45 | C | C3 | Southwest Rural metro | 5 years | OWN | 72000 | not verified | 2011-12-01 00:00:00 | Fully Paid | n | https://www.lendingclub.com/browse/loanDetail.action?loan_id=1070078 | Borrower added on 12/15/11 > I had recived a loan from Citi Financial about a year ago, I was paying 29.99 intrest, so the refinance is to cut that rate since cleaning up my credit I have been paying everything on time as shown on my credit report | debt_consolidation | High intrest Consolidation | 853xx | AZ | 16.12 | 0 | 1998-01-01 00:00:00 | 2 | 14 | 0 | 4032 | 20.6% | 23 | f | 0 | 0 | 7677.52 | 7677.52 | 6500 | 1177.52 | 0 | 0 | 0 | 2013-06-01 00:00:00 | 1655.54 | 2013-07-01 00:00:00 | 0 | 1 | ||||
1.06991e+06 | 1.30501e+06 | 12000 | 12000 | 12000 | 36 months | 12.69% | 402.54 | B | B5 | UCLA | 10+ years | OWN | 75000 | VERIFIED - income source | 2011-12-01 00:00:00 | Fully Paid | n | https://www.lendingclub.com/browse/loanDetail.action?loan_id=1069908 | debt_consolidation | Consolidation | 913xx | CA | 10.78 | 0 | 1989-10-01 00:00:00 | 0 | 12 | 0 | 23336 | 67.1% | 34 | f | 0 | 0 | 13943.1 | 13943.1 | 12000 | 1943.08 | 0 | 0 | 0 | 2013-09-01 00:00:00 | 6315.3 | 2013-08-01 00:00:00 | 0 | 1 |
# Task 4: Bin the response variable to good/bad oans only, use your best judgement for what qualifies as a good/bad loan
# Create a new column called "bad_loan" which should be a binary variable
# Hint: You can turn the bad_loan columm into a factor using .asfactor()
# Task 5: String munging to clearn string columns before converting to numeric
# Hint: Column that need munging include "int_rate," "revol_util," "emp_length"
#### Example for int_rate using gsub, trim, asnumeric ####
loan_stats["int_rate"] = loan_stats["int_rate"].gsub(pattern = "%", replacement = "") # strip %
loan_stats["int_rate"] = loan_stats["int_rate"].trim() # trim ws
loan_stats["int_rate"] = loan_stats["int_rate"].asnumeric() #change to a numeric
loan_stats["int_rate"].show()
int_rate |
---|
10.65 |
15.27 |
15.96 |
13.49 |
7.9 |
18.64 |
21.28 |
12.69 |
14.65 |
12.69 |
# Now try for revol_util yourself
revol_util |
---|
83.7 |
9.4 |
98.5 |
21 |
28.3 |
87.5 |
32.6 |
36.5 |
20.6 |
67.1 |
# Now we're going to clean up emp_length
# Use gsub to remove " year" and " years" also translate n/a to ""
loan_stats["emp_length"] = loan_stats["emp_length"].gsub(pattern = "([ ]*+[a-zA-Z].*)|(n/a)",
replacement = "")
# Use trim to remove any trailing spaces
loan_stats["emp_length"] = loan_stats["emp_length"].trim()
# Use sub to convert < 1 to 0 years and do the same for 10 + to 10
# Hint: Be mindful of spaces between characters
emp_length |
---|
10 |
0 |
10 |
10 |
3 |
9 |
4 |
0 |
5 |
10 |
# Task 6: Create new column called credit_length
# Hint: Do this by subtracting the earliest_cr year from the issue_d year
credit_length |
---|
26 |
12 |
10 |
15 |
7 |
4 |
7 |
7 |
13 |
22 |
# Task 7: Use the sub function to create two levels from the verification_status column. Ie "verified" and "not verified"
# Task 8: Do a test-train split (80-20)
# Task 9: Define your response and predictor variables
y="bad_loan"
x=[]
# Task 10: Train GBM Model
# Set parameters for GBM model
# Build your model
gbm Model Build progress: |███████████████████████████████████████████████| 100%
# Task 11: Plot the scoring history to make sure you're not overfitting
# Hint: Use plot function on the model object
# Task 12: Plot the ROC curve for the binomial models and get auc using h2o.auc
# Hint: Use h2o.performance and plot to grab the modelmetrics and then plotting the modelmetrics
Training AUC = 0.833931838694 Validation AUC = 0.793780377998
# Task 13: Check the variable importance and generate confusion matrix for max F1 threshold
# Hint: Use h2o.varimp for non-GLM model and use h2o.confusionMatrix
variable relative_importance scaled_importance percentage 0 inq_last_6mths 8205.389648 1.000000 0.484600 1 addr_state 1844.628906 0.224807 0.108941 2 term 1479.474487 0.180305 0.087376 3 credit_length 1132.209595 0.137984 0.066867 4 revol_util 1127.222534 0.137376 0.066572 5 revol_bal 896.188965 0.109220 0.052928 6 purpose 609.911682 0.074331 0.036021 7 annual_inc 556.461426 0.067817 0.032864 8 loan_amnt 197.390945 0.024056 0.011658 9 dti 164.333130 0.020027 0.009705 10 delinq_2yrs 154.244186 0.018798 0.009109 11 open_acc 129.676529 0.015804 0.007659 12 total_acc 126.753441 0.015448 0.007486 13 pub_rec 119.910492 0.014614 0.007082 14 verification_status 68.559044 0.008355 0.004049 15 emp_length 64.321648 0.007839 0.003799 16 home_ownership 55.632797 0.006780 0.003286 Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.224251521214:
0 | 1 | Error | Rate | |
0 | 5027.0 | 1061.0 | 0.1743 | (1061.0/6088.0) |
1 | 640.0 | 977.0 | 0.3958 | (640.0/1617.0) |
Total | 5667.0 | 2038.0 | 0.2208 | (1701.0/7705.0) |
# Task 14: Score the entire data set using the model
# Hint: Use h2o.predict.
gbm prediction progress: |████████████████████████████████████████████████| 100%
# Extra: Calculate the money gain/loss if model is implemented
# Calculate the total amount of money earned or lost per loan
loan_stats["earned"] = loan_stats["total_pymnt"] - loan_stats["loan_amnt"]
# Calculate how much money will be lost to false negative, vs how much will be saved due to true positives
loan_stats["pred"] = pred["predict"]
grouped = loan_stats.group_by(["bad_loan", "pred"])
net = grouped.sum(col = "earned", na = "ignore").get_frame()
n1 = net[(net["bad_loan"] == "0") & (net["pred"] == "0")]["sum_earned"].round(digits = 0).max()
n2 = net[(net["bad_loan"] == "0") & (net["pred"] == "1")]["sum_earned"].round(digits = 0).max()
n3 = net[(net["bad_loan"] == "1") & (net["pred"] == "1")]["sum_earned"].round(digits = 0).max()
n4 = net[(net["bad_loan"] == "1") & (net["pred"] == "0")]["sum_earned"].round(digits = 0).max()
# Calculate the amount earned
print "Total amount of profit still earned using the model : %s" % '${:0,.0f}'.format(n1)
print "Total amount of profit forfeitted using the model : %s" % '${:0,.0f}'.format(n2)
print "Total amount of loss that could have been prevented : %s" % '${:0,.0f}'.format(n3)
print "Total amount of loss that still would've accrued : %s" % '${:0,.0f}'.format(n4)
# Calculate Net
print "Total profit by implementing model : $ %s" %'${:0,.0f}'.format((n1 - n2 + (-1*n3) - (-1*n4)))
Total amount of profit still earned using the model : $37,436,802 Total amount of profit forfeitted using the model : $12,838,849 Total amount of loss that could have been prevented : $-19,328,621 Total amount of loss that still would've accrued : $-12,083,433 Total profit by implementing model : $ $31,843,141
# Shutdown h2o instance
H2O session _sid_90d0 closed.