#!/usr/bin/env python
# coding: utf-8

# # Imports

# In[13]:

import numpy as np
import pandas as pd
import sklearn.metrics
import os

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


# In[14]:

# read the data into a pandas DataFrame
DATA_DIR = 'data'
df = pd.read_csv(os.path.abspath(os.path.join(DATA_DIR, 'day1/iris.csv')))
df.head(5)


# In[15]:

# split features and labels
X = df.iloc[:, :-1].values
Y = df.iloc[:, -1].values

# encode the class labels as integers
le = preprocessing.LabelEncoder()
Y = le.fit_transform(Y)


# In[16]:

# common practice is to hold out 20%-30% of the data as the test set,
# controlled by test_size in train_test_split()
# the data is shuffled before splitting to avoid ordering bias;
# random_state fixes the shuffle so the split is reproducible
def data_split(X, Y):
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.30, random_state=10)
    return X_train, X_test, Y_train, Y_test

X_train, X_test, Y_train, Y_test = data_split(X, Y)
print(X_train.shape, X_test.shape)


# In[17]:

# wrapper around StandardScaler: fit the scaler on the training data, then
# apply the same transformation to the test data
# StandardScaler standardizes each feature to zero mean and unit variance,
# which keeps the features on comparable scales for models such as
# logistic regression
class Normalizer:
    def __init__(self):
        self.sc = StandardScaler()

    def scale(self, X, dtype):
        if dtype == 'train':
            XX = self.sc.fit_transform(X)
        elif dtype == 'test':
            XX = self.sc.transform(X)
        else:
            return None
        return XX


# In[18]:

norm = Normalizer()
X_train = norm.scale(X_train, 'train')
X_test = norm.scale(X_test, 'test')


# # Model 1 (Logistic)

# In[19]:

from sklearn.linear_model import LogisticRegression

# train the model and predict class probabilities for the test set
classifier = LogisticRegression()
model = classifier.fit(X_train, Y_train)
predictions_lr = model.predict_proba(X_test)
print(sklearn.metrics.accuracy_score(Y_test, np.argmax(predictions_lr, axis=1)))


# # Model 2 (Decision Tree)

# In[20]:

from sklearn import tree

# train the model and predict class probabilities for the test set
classifier = tree.DecisionTreeClassifier()
model = classifier.fit(X_train, Y_train)
predictions_dtree = model.predict_proba(X_test)
print(sklearn.metrics.accuracy_score(Y_test, np.argmax(predictions_dtree, axis=1)))


# # Model 3 (KNN)

# In[21]:

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3)
model = knn.fit(X_train, Y_train)
predictions_knn = model.predict_proba(X_test)
print(sklearn.metrics.accuracy_score(Y_test, np.argmax(predictions_knn, axis=1)))


# # Meta Model (Ensemble)

# In[22]:

class Ensemble(object):
    """
    Implements a soft-voting (probability-averaging) ensemble.
    Each model is given equal weight.
    """
    def __init__(self, samples=None, classes=None, classifiers=None):
        self.classes = classes
        self.samples = samples
        self.classifiers = classifiers

    def mixmatch(self, predictions):
        # infer the dimensions from the predictions if they were not given
        if not self.classifiers:
            self.classifiers = len(predictions)
        if not self.samples:
            self.samples = len(predictions[0])
        if not self.classes:
            self.classes = len(predictions[0][0])

        final_pred = np.zeros(self.classes)
        for s in range(self.samples):
            # stack the per-classifier probabilities for sample s and average them
            s_pred = np.zeros(self.classes)
            for c in range(self.classifiers):
                pred = predictions[c][s]
                s_pred = np.vstack((s_pred, pred))
            s_pred = s_pred[1:, :]
            s_pred_avg = np.average(s_pred, axis=0)
            final_pred = np.vstack((final_pred, s_pred_avg))
        return final_pred[1:, :]


# In[23]:

ensemble = Ensemble(45, 3, 3)
pred = np.argmax(ensemble.mixmatch([predictions_lr, predictions_dtree, predictions_knn]), axis=1)
print(sklearn.metrics.accuracy_score(Y_test, pred))
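

# In[24]:

# A minimal alternative sketch of the same equal-weight soft voting, done in
# one vectorized NumPy step instead of the explicit loops in Ensemble.mixmatch:
# stack the per-classifier probability matrices into a
# (classifiers, samples, classes) array and average over the classifier axis.
# The variable names below (stacked, avg_proba, pred_vec) are illustrative only.
stacked = np.stack([predictions_lr, predictions_dtree, predictions_knn])
avg_proba = stacked.mean(axis=0)            # shape: (samples, classes)
pred_vec = np.argmax(avg_proba, axis=1)     # predicted class per sample
print(sklearn.metrics.accuracy_score(Y_test, pred_vec))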