import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

try:
    from numpy import math
except ImportError:
    # NumPy 2.0 removed the ``numpy.math`` alias of the standard-library module.
    import math
# Load the loans dataset directly from its public URL into a DataFrame.
# Schema reference:
# https://github.com/herrfz/dataanalysis/blob/master/assignment1/Assignment1.pdf
loansData = pd.read_csv(
    filepath_or_buffer='https://spark-public.s3.amazonaws.com/dataanalysis/loansData.csv',
)

# Column labels as a NumPy array (shown when evaluated interactively).
loansData.columns.values
# Output:
# array(['Amount.Requested', 'Amount.Funded.By.Investors', 'Interest.Rate',
#        'Loan.Length', 'Loan.Purpose', 'Debt.To.Income.Ratio', 'State',
#        'Home.Ownership', 'Monthly.Income', 'FICO.Range', 'Open.CREDIT.Lines',
#        'Revolving.CREDIT.Balance', 'Inquiries.in.the.Last.6.Months',
#        'Employment.Length'], dtype=object)
# Peek at the first five rows of Monthly.Income. ``head`` is explicitly
# positional, unlike an integer slice through ``[]`` on a Series whose index
# is itself made of integers.
loansData['Monthly.Income'].head(5)
# Output:
# 81174     6541.67
# 99592     4583.33
# 80059    11500.00
# 15825     3833.33
# 33182     3195.00
# Name: Monthly.Income, dtype: float64
# Pull out the monthly income column; ``inc`` is reused further down.
inc = loansData['Monthly.Income']

# Plot its distribution on a fresh figure and display it.
plt.figure()
h = inc.hist()
h.set_title('Histogram of Monthly Income')
plt.show()
# Log-transform monthly income with one vectorized call instead of a
# Python-level loop. ``np.log`` returns a Series on the same index as ``inc``
# and passes NaN through, so missing incomes stay missing.
# NOTE: unlike ``math.log``, a zero or negative income yields -inf/NaN (with a
# RuntimeWarning) rather than raising ValueError.
loansData['Monthly.LogIncome'] = np.log(inc)

# Histogram of the transformed values on a fresh figure.
plt.figure()
h = loansData['Monthly.LogIncome'].hist()
plt.title('Histogram of Log(Monthly Income)')
plt.show()

# Summary statistics of the log-income column (shown when evaluated
# interactively).
loansData['Monthly.LogIncome'].describe()
# Output:
# count    2499.000000
# mean        8.501915
# std         0.523019
# min         6.377577
# 25%         8.160518
# 50%         8.517193
# 75%         8.824678
# max        11.540054
# dtype: float64