#!/usr/bin/env python
# coding: utf-8

"""Train and evaluate a logistic regression classifier on the Iris dataset.

Notebook export: loads iris.csv, label-encodes the class column, splits
train/test, standardizes features, fits sklearn LogisticRegression, and
prints a confusion matrix and accuracy.
"""

import os

import numpy as np
import pandas as pd
import sklearn
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
# sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

print(sklearn.__version__)
print(np.__version__)
print(pd.__version__)


# ---------------------------------------------------------------------------
# Loading and describing
# ---------------------------------------------------------------------------

DATA_DIR = '../data'

df = pd.read_table(
    os.path.abspath(os.path.join(DATA_DIR, 'day1/iris.csv')),
    sep=','
)
df.head(5)  # notebook-cell remnant; no effect when run as a script

# examples per class
df.groupby('class')['class'].count()

# feat1/2/3/4 are the model inputs (features); the last column, 'class',
# is the target.
X = df.iloc[:, :-1].values
Y = df.iloc[:, -1].values

# encode the class labels as integers 0..n_classes-1
le = preprocessing.LabelEncoder()
Y = le.fit_transform(Y)


# ---------------------------------------------------------------------------
# Train/Test split
# ---------------------------------------------------------------------------

def data_split(X, Y, test_size=0.2, random_state=10):
    """Split features/labels into train and test subsets.

    Typical practice holds out 20%-30% of the data for testing
    (``test_size``); a fixed ``random_state`` shuffles the rows
    reproducibly and avoids sequential bias in the class ordering.
    Defaults reproduce the original hard-coded split.

    Returns
    -------
    X_train, X_test, Y_train, Y_test
    """
    return train_test_split(X, Y, test_size=test_size,
                            random_state=random_state)


X_train, X_test, Y_train, Y_test = data_split(X, Y)
X_train.shape, X_test.shape  # notebook-cell remnant


# ---------------------------------------------------------------------------
# Normalizer
# ---------------------------------------------------------------------------

class Normalizer:
    """Standardize features with a shared ``StandardScaler``.

    The scaler is fit on the training data only and then reused to
    transform the test data, so no test-set statistics leak into
    training.

    NOTE(review): the original comment claimed scaling "to the range
    0-1", but ``StandardScaler`` produces z-scores (zero mean, unit
    variance) — the output is NOT bounded to [0, 1].
    """

    def __init__(self):
        self.sc = StandardScaler()

    def scale(self, X, dtype):
        """Fit-and-transform on ``dtype='train'``; transform-only on ``dtype='test'``.

        Raises ``ValueError`` on any other mode instead of silently
        returning None (the original behavior), which would surface
        later as a confusing downstream failure.
        """
        if dtype == 'train':
            return self.sc.fit_transform(X)
        if dtype == 'test':
            return self.sc.transform(X)
        raise ValueError("dtype must be 'train' or 'test', got %r" % (dtype,))


# ---------------------------------------------------------------------------
# Logistic model
# ---------------------------------------------------------------------------

class LogisticModel:
    """Thin wrapper around sklearn ``LogisticRegression``: train / predict / evaluate."""

    def __init__(self):
        self.classifier = LogisticRegression()

    def train(self, X_train, Y_train):
        """Fit the classifier and return the fitted model."""
        return self.classifier.fit(X_train, Y_train)

    def predict(self, model, X_test):
        """Return predicted class labels for ``X_test`` using a fitted model."""
        return model.predict(X_test)

    def evaluate(self, Y_test, Y_pred, measure):
        """Score predictions.

        measure='matrix'   -> confusion matrix over the three encoded classes
        measure='accuracy' -> accuracy as a percentage (0-100)

        Raises ``ValueError`` for any other measure instead of silently
        returning None.
        """
        if measure == 'matrix':
            # labels fixed to the three encoded iris classes so the
            # matrix shape is stable even if a class is absent from Y_test
            return confusion_matrix(Y_test, Y_pred, labels=[0, 1, 2])
        if measure == 'accuracy':
            return accuracy_score(Y_test, Y_pred) * 100
        raise ValueError("measure must be 'matrix' or 'accuracy', got %r"
                         % (measure,))


# ---------------------------------------------------------------------------
# Train and predict
# ---------------------------------------------------------------------------

norm = Normalizer()
X_train = norm.scale(X_train, 'train')
X_test = norm.scale(X_test, 'test')

logit = LogisticModel()
model = logit.train(X_train, Y_train)
predictions = logit.predict(model, X_test)


# ---------------------------------------------------------------------------
# Evaluating
# ---------------------------------------------------------------------------

print(logit.evaluate(Y_test, predictions, 'matrix'))
print()
print(logit.evaluate(Y_test, predictions, 'accuracy'))

# Sample confusion matrix recorded from one run
# (rows = true class, cols = predicted class):
#                 iris   versicolor   verginica
# iris       -     10         10          0
# versicolor -      0         10          3
# verginica  -      0          0          7