#!/usr/bin/env python
# coding: utf-8

# # Preprocessing of data set

# In[3]:

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Load the loan-prediction train/test sets from the public S3 bucket.
train = pd.read_csv("https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/train.csv")
test = pd.read_csv("https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/test.csv")


# In[4]:

# Combine train and test so imputation and encoding are consistent across both.
train['Type'] = 'Train'  # flag to split the combined frame back later
test['Type'] = 'Test'
fullData = pd.concat([train, test], axis=0)

# Look at the available missing values in the dataset
fullData.isnull().sum()


# In[5]:

# Identify categorical and continuous variables
ID_col = ['Loan_ID']
target_col = ["Loan_Status"]
cat_cols = ['Credit_History', 'Dependents', 'Gender', 'Married', 'Education',
            'Property_Area', 'Self_Employed']
other_col = ['Type']  # Test/Train identifier
# Everything that is not an ID, target, categorical or flag column is numeric.
num_cols = list(set(fullData.columns) - set(cat_cols) - set(ID_col)
                - set(target_col) - set(other_col))


# In[6]:

# Impute missing values with the mean for continuous variables.
# BUG FIX: the original used fillna(..., inplace=True) AND assigned the result
# back — fillna returns None when inplace=True, so the columns were wiped.
# Use the returned frame (no inplace) instead.
fullData[num_cols] = fullData[num_cols].fillna(fullData[num_cols].mean())

# Impute missing values with the mode for categorical variables
# (mode().values[0] is the first modal row; indexed by column name so fillna
# can match values to columns). Same inplace/assignment bug fixed here.
cat_imput = pd.Series(fullData[cat_cols].mode().values[0], index=cat_cols)
fullData[cat_cols] = fullData[cat_cols].fillna(cat_imput)


# In[7]:

# Create a new column as Total Income
fullData['TotalIncome'] = fullData['ApplicantIncome'] + fullData['CoapplicantIncome']

# Take log(TotalIncome + 1); the +1 guards against zero incomes.
# BUG FIX: the original comment promised "+1" but the code used np.log(x),
# which yields -inf for a zero income. np.log1p does what was intended.
fullData['Log_TotalIncome'] = np.log1p(fullData['TotalIncome'])


# In[8]:

# Label-encode categorical features; fitting on the combined frame guarantees
# train and test share the same integer codes.
for var in cat_cols:
    number = LabelEncoder()
    fullData[var] = number.fit_transform(fullData[var].astype('str'))

# Split back into train/test; .copy() so later column assignments do not
# operate on a view of fullData (avoids SettingWithCopy problems).
train_modified = fullData[fullData['Type'] == 'Train'].copy()
test_modified = fullData[fullData['Type'] == 'Test'].copy()

# Encode the target on the train split only (test rows have no labels).
# NOTE: `number` is deliberately left as the Loan_Status encoder — the model
# cells below call number.inverse_transform on their predictions.
train_modified["Loan_Status"] = number.fit_transform(
    train_modified["Loan_Status"].astype('str'))
# # Building Logistic Regression

# In[9]:

from sklearn.linear_model import LogisticRegression

# Feature subset used by both the logistic-regression and decision-tree runs.
predictors = ['Credit_History', 'Education', 'Gender']

x_train = train_modified[list(predictors)].values
y_train = train_modified["Loan_Status"].values
x_test = test_modified[list(predictors)].values


# In[10]:

# Create logistic regression object
model = LogisticRegression()

# Train the model using the training sets
model.fit(x_train, y_train)

# Predict Output
predicted = model.predict(x_test)

# Reverse encoding for predicted outcome ('number' is the Loan_Status
# LabelEncoder fitted in the preprocessing section).
predicted = number.inverse_transform(predicted)

# Store it to test dataset
test_modified['Loan_Status'] = predicted

# Output file to make submission.
# BUG FIX: index=False — the original also wrote the DataFrame index as an
# extra unnamed column, which breaks the expected submission format.
test_modified.to_csv("Submission1.csv", columns=['Loan_ID', 'Loan_Status'],
                     index=False)


# # Building Decision Tree Classifier

# In[11]:

predictors = ['Credit_History', 'Education', 'Gender']

x_train = train_modified[list(predictors)].values
y_train = train_modified["Loan_Status"].values
x_test = test_modified[list(predictors)].values


# In[12]:

from sklearn.tree import DecisionTreeClassifier

# Create Decision Tree object
model = DecisionTreeClassifier()

# Train the model using the training sets
model.fit(x_train, y_train)

# Predict Output
predicted = model.predict(x_test)

# Reverse encoding for predicted outcome
predicted = number.inverse_transform(predicted)

# Store it to test dataset
test_modified['Loan_Status'] = predicted

# Output file to make submission (index=False: same format fix as above).
test_modified.to_csv("Submission2.csv", columns=['Loan_ID', 'Loan_Status'],
                     index=False)


# # Building Random Forest Classifier

# In[13]:

# Random forest uses the full engineered feature set, not just the three
# columns used above. (The LogisticRegression import here is kept from the
# notebook but unused in this section.)
from sklearn.linear_model import LogisticRegression

predictors = ['ApplicantIncome', 'CoapplicantIncome', 'Credit_History',
              'Dependents', 'Education', 'Gender', 'LoanAmount',
              'Loan_Amount_Term', 'Married', 'Property_Area',
              'Self_Employed', 'TotalIncome', 'Log_TotalIncome']

x_train = train_modified[list(predictors)].values
y_train = train_modified["Loan_Status"].values
x_test = test_modified[list(predictors)].values


# In[14]:

from sklearn.ensemble import RandomForestClassifier
# Create Random Forest object
# (comment fix: the original said "Decision Tree" for RandomForestClassifier)
model = RandomForestClassifier()

# Train the model using the training sets
model.fit(x_train, y_train)

# Predict Output
predicted = model.predict(x_test)

# Reverse encoding for predicted outcome ('number' is the Loan_Status
# LabelEncoder fitted during preprocessing).
predicted = number.inverse_transform(predicted)

# Store it to test dataset
test_modified['Loan_Status'] = predicted

# Output file to make submission.
# BUG FIX: index=False keeps the spurious index column out of the submission.
test_modified.to_csv("Submission3.csv", columns=['Loan_ID', 'Loan_Status'],
                     index=False)


# In[15]:

# Create a series with feature importances, largest first.
featimp = pd.Series(model.feature_importances_,
                    index=predictors).sort_values(ascending=False)
# BUG FIX: `print featimp` is Python 2 statement syntax and is a SyntaxError
# on Python 3; the call form works on both.
print(featimp)


# In[16]:

# Standalone demo: label-encode Gender on the raw train frame.
number = LabelEncoder()
train['Gender'] = number.fit_transform(train['Gender'].astype('str'))


# In[17]:

train.Gender


# In[ ]: