#!/usr/bin/env python
# coding: utf-8
"""Decision-tree classifier for the ``day11/credit.csv`` dataset.

Notebook export. The script loads the CSV, drops every row containing the
'?' placeholder, integer-encodes the string-valued columns, trains a
``DecisionTreeClassifier`` on an 80/20 split and prints the accuracy (in
percent) on the held-out part.

Bare expressions such as ``df.shape`` are notebook cell outputs; they are
kept so the file still displays them when run cell-by-cell, and they print
nothing when the file is run as a plain script.
"""

# ## Imports

# In[1]:

import os

import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # Old scikit-learn releases only ship the pre-0.18 module name.
    from sklearn.cross_validation import train_test_split

# Single-argument call form works on both Python 2 and Python 3.
print(pd.__version__)


# ## Loading Data

# In[2]:

# reading data to pandas dataframe; header=None means the file has no header
# row, so the columns are addressed by integer position below
DATA_DIR = '../data'
df = pd.read_csv(
    os.path.abspath(os.path.join(DATA_DIR, 'day11/credit.csv')),
    sep=',', header=None
)
df.head(5)


# In[3]:

# (rows, columns)
df.shape


# In[4]:

# checking for NaN in the entire df
df.isnull().sum()
# None of the columns have missing values in them


# In[5]:

# object in pandas means string; we need to convert all to numerical
df.dtypes


# ## Prune rows

# In[6]:

# Missing values are not NaN in this file; they are written as a literal '?'.
# Drop every row that has '?' in any column.
MISSING_MARKER = '?'
rows_with_marker = df.isin([MISSING_MARKER]).any(axis=1)
# .copy() gives an independent frame, so the column assignments below modify
# `df` itself rather than a view/slice of the unfiltered data.
df = df[~rows_with_marker].copy()


# In[7]:

# new data after removing the rows having '?' in them
df.shape


# ## Transform datapoints

# In[8]:

# Encode the string-valued feature columns as integer codes. Codes are
# assigned in order of first appearance within each column.
CATEGORICAL_COLUMNS = [0, 3, 4, 5, 6, 8, 9, 11, 12]
for column in CATEGORICAL_COLUMNS:
    possible_values = df[column].unique().tolist()
    encoded_inp = {v: idx for idx, v in enumerate(possible_values)}
    # Assign the result back instead of a chained in-place replace, which
    # may silently operate on a temporary copy.
    df[column] = df[column].map(encoded_inp)

# Encode the target (column 15): '+' -> 1, '-' -> 0
encoded_inp = {'+': 1, '-': 0}
df[15] = df[15].map(encoded_inp)

# We will now handle the remaining object-dtype columns, 1 and 13.
# i am not sure what 13 column is about; will drop it for now
df = df.drop([13], axis=1)
# Column 1 holds numbers read in as strings; convert it to float.
df[1] = df[1].astype(float)


# ## Separate features and target

# In[9]:

# every column except the last is a feature; the last column is the target
X = df.iloc[:, :-1].values
Y = df.iloc[:, -1].values


# ## Train/Test split

# In[10]:

# common practice is to hold out 20% - 30% of the data for testing,
# controlled by test_size in train_test_split()
def data_split(X, Y, test_size=0.2, random_state=10):
    """Split features and targets into train and test parts.

    Args:
        X: feature matrix.
        Y: target vector, aligned with ``X``.
        test_size: share of the samples held out for testing.
        random_state: seed passed to ``train_test_split`` so that the same
            split is produced on every run.

    Returns:
        Tuple ``(X_train, X_test, Y_train, Y_test)``.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state)
    return X_train, X_test, Y_train, Y_test


X_train, X_test, Y_train, Y_test = data_split(X, Y)


# In[11]:

X_train.shape, X_test.shape


# ## Train / Evaluate

# In[12]:

class DecisionTrees(object):
    """Thin wrapper around a seeded ``DecisionTreeClassifier``."""

    def __init__(self):
        # Fixed seed so repeated runs build the same tree.
        self.classifier = DecisionTreeClassifier(random_state=10)

    def train(self, X_train, Y_train):
        """Fit the classifier and return the fitted model."""
        model = self.classifier.fit(X_train, Y_train)
        return model

    def predict(self, model, X_test):
        """Return the predictions of ``model`` for ``X_test``."""
        return model.predict(X_test)

    def evaluate(self, Y_test, Y_pred):
        """Return the accuracy of ``Y_pred`` against ``Y_test`` in percent."""
        return accuracy_score(Y_test, Y_pred) * 100


# In[13]:

# train the model with default hyper-parameters and report the accuracy (%)
# on the held-out test split
dtree = DecisionTrees()
model_dtree = dtree.train(X_train, Y_train)
predictions = dtree.predict(model_dtree, X_test)
print(dtree.evaluate(Y_test, predictions))