import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
# Read the train and test CSV files from the URLs below.
train = pd.read_csv(
    "https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/train.csv"
)
test = pd.read_csv(
    "https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/test.csv"
)

# Tag every row with its origin so the combined frame can be split again later.
for frame, origin in ((train, 'Train'), (test, 'Test')):
    frame['Type'] = origin

# Stack train on top of test (row-wise is pd.concat's default axis).
fullData = pd.concat([train, test])

# Per-column count of missing values in the combined data (notebook display).
fullData.isnull().sum()
# Output:
# ApplicantIncome        0
# CoapplicantIncome      0
# Credit_History        79
# Dependents            25
# Education              0
# Gender                24
# LoanAmount            27
# Loan_Amount_Term      20
# Loan_ID                0
# Loan_Status          367
# Married                3
# Property_Area          0
# Self_Employed         55
# Type                   0
# dtype: int64
# Group the columns by role: identifier, target, categorical, origin flag.
ID_col = ['Loan_ID']
target_col = ["Loan_Status"]
cat_cols = [
    'Credit_History', 'Dependents', 'Gender', 'Married',
    'Education', 'Property_Area', 'Self_Employed',
]
other_col = ['Type']  # Train/Test origin flag
# Every column not claimed by one of the groups above is treated as continuous.
num_cols = list(
    set(fullData.columns).difference(cat_cols, ID_col, target_col, other_col)
)
# Impute missing values in the continuous columns with each column's mean.
# NOTE: fillna(..., inplace=True) returns None, so assigning that result back
# replaces the columns instead of filling them. Use the returned frame.
fullData[num_cols] = fullData[num_cols].fillna(fullData[num_cols].mean())

# Impute missing values in the categorical columns with each column's mode.
# mode() may return more than one row when values tie; the first row is used.
# .iloc[0] yields a Series already indexed by the column names, which is the
# mapping fillna needs to pick a fill value per column.
cat_imput = fullData[cat_cols].mode().iloc[0]
fullData[cat_cols] = fullData[cat_cols].fillna(cat_imput)
# Output (warning): C:\Users\abc\Anaconda2\lib\site-packages\pandas\core\generic.py:3178: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy self._update_inplace(new_data)
# Create a new column as Total Income (applicant plus co-applicant income).
fullData['TotalIncome'] = fullData['ApplicantIncome'] + fullData['CoapplicantIncome']
# Take the log of TotalIncome + 1. The +1 keeps the result finite if a zero
# TotalIncome exists (np.log(0) is -inf).
fullData['Log_TotalIncome'] = np.log(fullData['TotalIncome'] + 1)
# Label-encode every categorical feature into integer codes.
# Each column gets a fresh encoder and its values are cast to str before
# fitting. `number` stays bound to the last encoder after the loop.
for var in cat_cols:
    number = LabelEncoder()
    fullData[var] = number.fit_transform(fullData[var].astype('str'))
# Split the combined frame back into its train and test parts using the Type
# flag. .copy() gives each part its own data, so assigning columns on
# train_modified / test_modified no longer writes to a slice of fullData
# (the cause of SettingWithCopyWarning).
train_modified = fullData[fullData['Type'] == 'Train'].copy()
test_modified = fullData[fullData['Type'] == 'Test'].copy()

# Encode the target with a dedicated encoder. It is kept under the name
# `number` because the prediction code decodes with number.inverse_transform.
number = LabelEncoder()
train_modified["Loan_Status"] = number.fit_transform(train_modified["Loan_Status"].astype('str'))
# Output (warning): C:\Users\abc\Anaconda2\lib\site-packages\ipykernel\__main__.py:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
from sklearn.linear_model import LogisticRegression

# Model 1: logistic regression on three of the encoded categorical features.
predictors = ['Credit_History', 'Education', 'Gender']
x_train = train_modified[predictors].values
y_train = train_modified["Loan_Status"].values
x_test = test_modified[predictors].values

# Build the classifier and fit it on the training rows.
model = LogisticRegression()
model.fit(x_train, y_train)

# Predict encoded labels for the test rows, then map them back to the
# original Loan_Status values with the target encoder.
predicted = number.inverse_transform(model.predict(x_test))

# Attach the decoded predictions to the test frame and write the submission.
test_modified['Loan_Status'] = predicted
test_modified.to_csv("Submission1.csv", columns=['Loan_ID', 'Loan_Status'])
# Output (warning): C:\Users\abc\Anaconda2\lib\site-packages\ipykernel\__main__.py:14: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
from sklearn.tree import DecisionTreeClassifier

# Model 2: decision tree on the same three encoded categorical features.
predictors = ['Credit_History', 'Education', 'Gender']
x_train = train_modified[predictors].values
y_train = train_modified["Loan_Status"].values
x_test = test_modified[predictors].values

# Build the tree classifier and fit it on the training rows.
model = DecisionTreeClassifier()
model.fit(x_train, y_train)

# Predict encoded labels for the test rows, then decode them back to the
# original Loan_Status values with the target encoder.
predicted = number.inverse_transform(model.predict(x_test))

# Attach the decoded predictions to the test frame and write the submission.
test_modified['Loan_Status'] = predicted
test_modified.to_csv("Submission2.csv", columns=['Loan_ID', 'Loan_Status'])
# Output (warning): C:\Users\abc\Anaconda2\lib\site-packages\ipykernel\__main__.py:16: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
# LogisticRegression is imported here as in the original cell, although the
# model fitted below is a random forest.
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Model 3: random forest on all encoded/imputed features plus the two
# engineered income columns.
predictors = [
    'ApplicantIncome', 'CoapplicantIncome', 'Credit_History', 'Dependents',
    'Education', 'Gender', 'LoanAmount', 'Loan_Amount_Term', 'Married',
    'Property_Area', 'Self_Employed', 'TotalIncome', 'Log_TotalIncome',
]
x_train = train_modified[predictors].values
y_train = train_modified["Loan_Status"].values
x_test = test_modified[predictors].values

# Build the random forest classifier and fit it on the training rows.
model = RandomForestClassifier()
model.fit(x_train, y_train)

# Predict encoded labels for the test rows, then decode them back to the
# original Loan_Status values with the target encoder.
predicted = number.inverse_transform(model.predict(x_test))

# Attach the decoded predictions to the test frame and write the submission.
test_modified['Loan_Status'] = predicted
test_modified.to_csv("Submission3.csv", columns=['Loan_ID', 'Loan_Status'])
# Output (warning): C:\Users\abc\Anaconda2\lib\site-packages\ipykernel\__main__.py:16: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
# Create a series with the fitted model's feature importances, labelled by
# predictor name and sorted from most to least important.
featimp = pd.Series(model.feature_importances_, index=predictors).sort_values(ascending=False)
# print(...) with a single argument behaves the same on Python 2 and 3,
# whereas the bare `print featimp` statement is a syntax error on Python 3.
print(featimp)
# Output:
# Credit_History       0.232724
# TotalIncome          0.146955
# LoanAmount           0.128687
# ApplicantIncome      0.114424
# Log_TotalIncome      0.113866
# CoapplicantIncome    0.082272
# Dependents           0.038125
# Property_Area        0.036118
# Loan_Amount_Term     0.032650
# Married              0.022713
# Self_Employed        0.022481
# Education            0.016459
# Gender               0.012527
# dtype: float64
# Re-encode Gender directly on the raw train frame with a new encoder.
# The values are cast to str before fitting. Note that this rebinds `number`.
number = LabelEncoder()
gender_as_text = train['Gender'].astype('str')
train['Gender'] = number.fit_transform(gender_as_text)
# Show the encoded column (notebook display).
train.Gender
# Output (index value pairs): 0 1 1 1 2 1 3 1 4 1 5 1 6 1 7 1 8 1 9 1 10 1 11 1 12 1 13 1 14 1 15 1 16 1 17 0 18 1 19 1 20 1 21 1 22 1 23 2 24 1 25 1 26 1 27 1 28 1 29 0 .. 584 1 585 1 586 1 587 0 588 2 589 1 590 1 591 1 592 2 593 1 594 1 595 1 596 1 597 1 598 1 599 1 600 0 601 1 602 1 603 1 604 0 605 1 606 1 607 1 608 1 609 0 610 1 611 1 612 1 613 0 Name: Gender, dtype: int64