#!/usr/bin/env python
# coding: utf-8

# # Preprocessing of data set

# In[3]:

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Load the loan-prediction train/test sets from the public S3 bucket.
train = pd.read_csv("https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/train.csv")
test = pd.read_csv("https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/test.csv")


# In[4]:

# Combine train and test so imputation and encoding are consistent across both.
train['Type'] = 'Train'  # flag to split the combined frame back later
test['Type'] = 'Test'
fullData = pd.concat([train, test], axis=0)

# Look at the available missing values in the dataset
fullData.isnull().sum()


# In[5]:

# Identify categorical and continuous variables
ID_col = ['Loan_ID']
target_col = ["Loan_Status"]
cat_cols = ['Credit_History', 'Dependents', 'Gender', 'Married', 'Education',
            'Property_Area', 'Self_Employed']
other_col = ['Type']  # Test/Train identifier
# Everything that is not an ID, target, categorical or flag column is numeric.
num_cols = list(set(fullData.columns) - set(cat_cols) - set(ID_col)
                - set(target_col) - set(other_col))


# In[6]:

# Impute missing values with the mean for continuous variables.
# BUG FIX: the original used fillna(..., inplace=True) AND assigned the result
# back — fillna returns None when inplace=True, so the columns were wiped.
# Use the returned frame (no inplace) instead.
fullData[num_cols] = fullData[num_cols].fillna(fullData[num_cols].mean())

# Impute missing values with the mode for categorical variables
# (mode().values[0] is the first modal row; indexed by column name so fillna
# can match values to columns). Same inplace/assignment bug fixed here.
cat_imput = pd.Series(fullData[cat_cols].mode().values[0], index=cat_cols)
fullData[cat_cols] = fullData[cat_cols].fillna(cat_imput)


# In[7]:

# Create a new column as Total Income
fullData['TotalIncome'] = fullData['ApplicantIncome'] + fullData['CoapplicantIncome']

# Take log(TotalIncome + 1); the +1 guards against zero incomes.
# BUG FIX: the original comment promised "+1" but the code used np.log(x),
# which yields -inf for a zero income. np.log1p does what was intended.
fullData['Log_TotalIncome'] = np.log1p(fullData['TotalIncome'])


# In[8]:

# Label-encode categorical features; fitting on the combined frame guarantees
# train and test share the same integer codes.
for var in cat_cols:
    number = LabelEncoder()
    fullData[var] = number.fit_transform(fullData[var].astype('str'))

# Split back into train/test; .copy() so later column assignments do not
# operate on a view of fullData (avoids SettingWithCopy problems).
train_modified = fullData[fullData['Type'] == 'Train'].copy()
test_modified = fullData[fullData['Type'] == 'Test'].copy()

# Encode the target on the train split only (test rows have no labels).
# NOTE: `number` is deliberately left as the Loan_Status encoder — the model
# cells below call number.inverse_transform on their predictions.
train_modified["Loan_Status"] = number.fit_transform(
    train_modified["Loan_Status"].astype('str'))
# # Building Logistic Regression

# In[9]:

from sklearn.linear_model import LogisticRegression

# Feature subset used by both the logistic-regression and decision-tree runs.
predictors = ['Credit_History', 'Education', 'Gender']

x_train = train_modified[list(predictors)].values
y_train = train_modified["Loan_Status"].values
x_test = test_modified[list(predictors)].values


# In[10]:

# Create logistic regression object
model = LogisticRegression()

# Train the model using the training sets
model.fit(x_train, y_train)

# Predict Output
predicted = model.predict(x_test)

# Reverse encoding for predicted outcome ('number' is the Loan_Status
# LabelEncoder fitted in the preprocessing section).
predicted = number.inverse_transform(predicted)

# Store it to test dataset
test_modified['Loan_Status'] = predicted

# Output file to make submission.
# BUG FIX: index=False — the original also wrote the DataFrame index as an
# extra unnamed column, which breaks the expected submission format.
test_modified.to_csv("Submission1.csv", columns=['Loan_ID', 'Loan_Status'],
                     index=False)


# # Building Decision Tree Classifier

# In[11]:

predictors = ['Credit_History', 'Education', 'Gender']

x_train = train_modified[list(predictors)].values
y_train = train_modified["Loan_Status"].values
x_test = test_modified[list(predictors)].values


# In[12]:

from sklearn.tree import DecisionTreeClassifier

# Create Decision Tree object
model = DecisionTreeClassifier()

# Train the model using the training sets
model.fit(x_train, y_train)

# Predict Output
predicted = model.predict(x_test)

# Reverse encoding for predicted outcome
predicted = number.inverse_transform(predicted)

# Store it to test dataset
test_modified['Loan_Status'] = predicted

# Output file to make submission (index=False: same format fix as above).
test_modified.to_csv("Submission2.csv", columns=['Loan_ID', 'Loan_Status'],
                     index=False)


# # Building Random Forest Classifier

# In[13]:

# Random forest uses the full engineered feature set, not just the three
# columns used above. (The LogisticRegression import here is kept from the
# notebook but unused in this section.)
from sklearn.linear_model import LogisticRegression

predictors = ['ApplicantIncome', 'CoapplicantIncome', 'Credit_History',
              'Dependents', 'Education', 'Gender', 'LoanAmount',
              'Loan_Amount_Term', 'Married', 'Property_Area',
              'Self_Employed', 'TotalIncome', 'Log_TotalIncome']

x_train = train_modified[list(predictors)].values
y_train = train_modified["Loan_Status"].values
x_test = test_modified[list(predictors)].values


# In[14]:

from sklearn.ensemble import RandomForestClassifier
# Create Random Forest object
# (comment fix: the original said "Decision Tree" for RandomForestClassifier)
model = RandomForestClassifier()

# Train the model using the training sets
model.fit(x_train, y_train)

# Predict Output
predicted = model.predict(x_test)

# Reverse encoding for predicted outcome ('number' is the Loan_Status
# LabelEncoder fitted during preprocessing).
predicted = number.inverse_transform(predicted)

# Store it to test dataset
test_modified['Loan_Status'] = predicted

# Output file to make submission.
# BUG FIX: index=False keeps the spurious index column out of the submission.
test_modified.to_csv("Submission3.csv", columns=['Loan_ID', 'Loan_Status'],
                     index=False)


# In[15]:

# Create a series with feature importances, largest first.
featimp = pd.Series(model.feature_importances_,
                    index=predictors).sort_values(ascending=False)
# BUG FIX: `print featimp` is Python 2 statement syntax and is a SyntaxError
# on Python 3; the call form works on both.
print(featimp)


# In[16]:

# Standalone demo: label-encode Gender on the raw train frame.
number = LabelEncoder()
train['Gender'] = number.fit_transform(train['Gender'].astype('str'))


# In[17]:

train.Gender


# In[ ]: