#!/usr/bin/env python
# coding: utf-8

# ## Imports

# In[1]:

import pandas as pd
import os
import sklearn
import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from mlxtend.classifier import StackingClassifier
# sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split and cross_val_score live in sklearn.model_selection
from sklearn.model_selection import train_test_split, cross_val_score

print(sklearn.__version__)
print(pd.__version__)


# In[2]:

# read the data into a DataFrame
DATA_DIR = '../data'
df = pd.read_csv(os.path.abspath(os.path.join(DATA_DIR, 'day15/jobclass.csv')))
df.head(5)


# In[3]:

# the goal is to predict the column PG from all the remaining features
target = df['PG']

# drop the identifier/free-text columns and the target, keeping only the features
df.drop(['ID', 'JobFamilyDescription', 'JobClassDescription', 'PG'],
        axis=1, inplace=True)


# In[4]:

df.head(2)


# In[5]:

# check for NaN (missing values)
df.isnull().sum()
# Luckily, there is not a single missing value


# In[6]:

X = df.values
Y = target.values


# In[7]:

# common practice is to hold out 20%-30% of the data as the test set,
# controlled by test_size in train_test_split()
# fixing random_state makes the split reproducible, and the shuffling
# avoids any ordering bias in the data
def data_split(X, Y):
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.2, random_state=10)
    return X_train, X_test, Y_train, Y_test

X_train, X_test, Y_train, Y_test = data_split(X, Y)


# ## Model - 1 [Random Forest]

# In[8]:

clf1 = RandomForestClassifier(random_state=1)


# ## Model - 2 [XGBoost]

# In[9]:

clf2 = XGBClassifier()


# ## Meta Model [Logit]

# In[10]:

lr = LogisticRegression()
# stack the two base learners; with use_probas=True the meta learner is fit
# on their predicted class probabilities rather than on hard labels, and
# average_probas=False keeps each base learner's probabilities as separate features
sclf = StackingClassifier(
    classifiers=[clf1, clf2],
    meta_classifier=lr,
    use_probas=True,
    average_probas=False,
)


# In[11]:

print('5-fold cross validation:\n')

for clf, label in zip([clf1, clf2, sclf],
                      ['Random Forest', 'XGBoost', 'StackingClassifier']):
    scores = cross_val_score(clf, X_train, Y_train, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
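

# ## Evaluating the stack on the held-out test set

# The cross-validation above scores the models on the training split only.
# A minimal sketch of a final check: refit the stacked classifier on the full
# training split and score it once on the held-out 20% test split created earlier.

# In[12]:

from sklearn.metrics import accuracy_score, classification_report

sclf.fit(X_train, Y_train)
Y_pred = sclf.predict(X_test)

print("Test accuracy: %0.2f" % accuracy_score(Y_test, Y_pred))
print(classification_report(Y_test, Y_pred))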
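

# ## Tuning the stack with GridSearchCV

# The base learners and the meta learner can also be tuned jointly, since
# mlxtend's StackingClassifier exposes nested parameters through get_params().
# The exact key names (e.g. 'randomforestclassifier__n_estimators',
# 'meta_classifier__C') vary with the installed mlxtend version, so the grid
# below is a rough sketch: confirm the keys via sclf.get_params().keys() first.

# In[13]:

from sklearn.model_selection import GridSearchCV

# hypothetical grid; check the key names against sclf.get_params().keys()
params = {
    'randomforestclassifier__n_estimators': [50, 100],
    'meta_classifier__C': [0.1, 1.0, 10.0],
}

grid = GridSearchCV(estimator=sclf, param_grid=params, cv=5, scoring='accuracy')
grid.fit(X_train, Y_train)

print("Best parameters: %s" % grid.best_params_)
print("Best CV accuracy: %0.2f" % grid.best_score_)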