#!/usr/bin/env python
# coding: utf-8

# ## Imports

# In[1]:


import pandas as pd
import os
import sklearn
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from mlxtend.classifier import StackingClassifier
from sklearn.model_selection import train_test_split, cross_val_score

print(sklearn.__version__)
print(pd.__version__)


# In[2]:


# read the dataset into a DataFrame
DATA_DIR = '../data'

df = pd.read_csv(os.path.abspath(os.path.join(DATA_DIR, 'day15/jobclass.csv')))
df.head(5)
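

# A quick look at df.shape and df.dtypes is an optional sanity check that the
# load worked as expected (plain pandas, nothing assumed beyond df above):

# In[ ]:


print(df.shape)
print(df.dtypes)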


# In[3]:


# the goal is to predict the PG column from all the remaining features
target = df['PG']

# drop the identifier/description columns and the target, keeping only the features
df.drop(['ID', 'JobFamilyDescription', 'JobClassDescription', 'PG'], axis=1, inplace=True)


# In[4]:


df.head(2)


# In[5]:


# check for NaN (Missing values)
df.isnull().sum()

# Luckily, there is not a single missing value
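

# Had any column contained NaNs, a minimal pandas-only fix would be to fill
# each column with its mode (a no-op on this dataset, shown only as a sketch):

# In[ ]:


df = df.fillna(df.mode().iloc[0])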


# In[6]:


X = df.values
Y = target.values
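

# If PG holds string class labels, note that recent xgboost releases expect
# integer-encoded targets; a minimal sketch with sklearn's LabelEncoder (an
# assumption here; skip this, or use Y_enc downstream, if PG is numeric):

# In[ ]:


from sklearn.preprocessing import LabelEncoder
Y_enc = LabelEncoder().fit_transform(Y)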


# In[7]:


# common practice is to hold out 20% - 30% of the data for testing,
# controlled by test_size in train_test_split()
# fixing random_state seeds the shuffle, making the split reproducible
# (the shuffle itself is what prevents any sequential bias in the data order)
def data_split(X, Y):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=10)
    return X_train, X_test, Y_train, Y_test

X_train, X_test, Y_train, Y_test = data_split(X, Y)
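

# Since this is classification, an optional variant stratifies the split on Y
# so that class proportions match across train and test (train_test_split
# supports this via the stratify argument):

# In[ ]:


X_tr_s, X_te_s, Y_tr_s, Y_te_s = train_test_split(
    X, Y, test_size=0.2, random_state=10, stratify=Y)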


# ## Model - 1 [Random Forest]

# In[8]:


clf1 = RandomForestClassifier(random_state=1)
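

# The defaults are used above; a hypothetical tuned variant (illustrative
# values only, not the result of a search) would raise n_estimators for more
# stable averaging across trees:

# In[ ]:


clf1_tuned = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=1)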


# ## Model - 2 [XGBoost]

# In[9]:


clf2 = XGBClassifier()
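

# Likewise, a hypothetical hand-tuned XGBoost (illustrative values only); the
# stack below still uses the default clf2:

# In[ ]:


clf2_tuned = XGBClassifier(n_estimators=200, learning_rate=0.1, max_depth=4)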


# ## Meta Model [Logit]

# In[10]:


lr = LogisticRegression()

# use_probas=True feeds the base models' class-probability vectors to the
# meta-model; average_probas=False concatenates them per model instead of
# averaging, so the logit sees n_classifiers * n_classes meta-features
sclf = StackingClassifier(classifiers=[clf1, clf2],
                          meta_classifier=lr,
                          use_probas=True,
                          average_probas=False)


# In[11]:


print('5-fold cross validation:\n')

for clf, label in zip([clf1, clf2, sclf], 
                      ['Random Forest', 
                       'XGBoost',
                       'StackingClassifier']):

    scores = cross_val_score(clf, X_train, Y_train,
                             cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" 
          % (scores.mean(), scores.std(), label))
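

# The CV scores above come from the training folds only; as a final sketch,
# fit each model once and check accuracy on the untouched 20% test split:

# In[ ]:


for clf, label in zip([clf1, clf2, sclf],
                      ['Random Forest', 'XGBoost', 'StackingClassifier']):
    clf.fit(X_train, Y_train)
    print("Test accuracy: %0.2f [%s]" % (clf.score(X_test, Y_test), label))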