#!/usr/bin/env python
# coding: utf-8

# ## Imports

# In[1]:


import os

import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

try:
    # scikit-learn >= 0.18 provides train_test_split in model_selection
    from sklearn.model_selection import train_test_split
except ImportError:
    # older scikit-learn releases only ship it in cross_validation
    from sklearn.cross_validation import train_test_split

# Report the pandas version in use. The parenthesized form is valid on both
# Python 2 and Python 3, unlike the bare print statement.
print(pd.__version__)


# ## Loading Data

# In[2]:


# Load the credit dataset into a pandas DataFrame.
DATA_DIR = '../data'

# header=None: the first line of the file is read as data, not as column names.
df = pd.read_table(
    os.path.abspath(os.path.join(DATA_DIR, 'day11/credit.csv')),
    header=None,
    sep=',',
)
# Notebook leftover: previews the first 5 rows when run interactively.
df.head(5)


# In[3]:


# (rows, columns) of the freshly loaded frame.
# NOTE: this and the expressions below are bare expressions exported from
# notebook cells; they display a value interactively but print nothing when
# the file is run as a script.
df.shape


# In[4]:


# Count the NaN entries per column across the entire df.
df.isnull().sum()

# None of the columns have missing values in them


# In[5]:


# Show the dtype of each column. The object dtype in pandas typically means
# the column holds strings; those columns need to be converted to numbers
# before training.
df.dtypes


# ## Prune rows

# In[6]:


# Some cells hold the placeholder '?' instead of a real value. Remove every
# row in which any column contains '?'.

# Iterate over the columns that are actually present instead of a hard-coded
# count, so the pruning still covers every column if the layout changes.
for column in df.columns:
    if '?' in df[column].unique().tolist():
        df = df[df[column] != '?']

# Take an independent copy so later column assignments modify this frame
# rather than a filtered view of the originally loaded one.
df = df.copy()


# In[7]:


# (rows, columns) after removing the rows that contained '?'
df.shape


# ## Transform datapoints

# In[8]:


# Work on an independent copy so the column assignments below never write
# into a filtered view of an earlier frame.
df = df.copy()

# Encode the string-valued feature columns as integer codes.
# Columns: 0, 3, 4, 5, 6, 8, 9, 11, 12
# Each distinct value is mapped to the position of its first appearance in the
# column (0, 1, 2, ...).
for column in [0, 3, 4, 5, 6, 8, 9, 11, 12]:
    possible_values = df[column].unique().tolist()
    encoded_inp = {v: idx for idx, v in enumerate(possible_values)}
    # Assign the mapped column back to the frame. Calling
    # replace(..., inplace=True) on df[column] is chained assignment: it acts
    # on a temporary selection and is not guaranteed to update df itself.
    df[column] = df[column].map(encoded_inp)

# Encode the target (column 15): '+' -> 1, '-' -> 0.
# Any other label would become NaN, making unexpected values easy to spot.
encoded_inp = {'+': 1, '-': 0}
df[15] = df[15].map(encoded_inp)

# Handle the remaining object-dtype columns, 1 and 13.
# The meaning of column 13 is unclear, so it is dropped for now; column 1 is
# converted to float.
df = df.drop([13], axis=1)
df[1] = df[1].astype(float)


# ## Sep. features and target

# In[9]:


# Positional split: every column except the last one is a feature, the last
# column is the target.
n_features = df.shape[1] - 1
X = df.iloc[:, :n_features].values
Y = df.iloc[:, n_features].values


# ## Train/Test split

# In[10]:


# Common practice is to hold out 20% - 30% of the data as the test set; the
# fraction is controlled by test_size. random_state seeds the shuffling done
# before the split, so the same rows land in train/test on every run.
def data_split(X, Y, test_size=0.2, random_state=10):
    """Split features and targets into train and test subsets.

    Thin wrapper around ``train_test_split``.

    Args:
        X: feature matrix.
        Y: target values aligned with the rows of ``X``.
        test_size: forwarded to ``train_test_split``; the default of 0.2
            holds out 20% of the rows for testing.
        random_state: seed forwarded to ``train_test_split`` so the split is
            reproducible; defaults to 10.

    Returns:
        The tuple ``(X_train, X_test, Y_train, Y_test)``.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, Y_train, Y_test

# Split the full dataset once into train and test parts using data_split
# (test_size=0.2, random_state=10).
X_train, X_test, Y_train, Y_test = data_split(X, Y)


# In[11]:


# Shapes of the train and test feature matrices. This is a bare notebook
# expression: it displays a value interactively but prints nothing in a script.
X_train.shape, X_test.shape


# ## Train / Evaluate

# In[12]:


class DecisionTrees(object):
    """Convenience wrapper around a scikit-learn ``DecisionTreeClassifier``."""

    def __init__(self):
        # The classifier is created with random_state pinned to 10.
        self.classifier = DecisionTreeClassifier(random_state=10)

    def train(self, X_train, Y_train):
        """Fit the wrapped classifier and return the result of ``fit``."""
        return self.classifier.fit(X_train, Y_train)

    def predict(self, model, X_test):
        """Return ``model.predict(X_test)``.

        The ``model`` passed in is used for prediction, not
        ``self.classifier``.
        """
        predictions = model.predict(X_test)
        return predictions

    def evaluate(self, Y_test, Y_pred):
        """Return ``accuracy_score(Y_test, Y_pred)`` multiplied by 100."""
        accuracy = accuracy_score(Y_test, Y_pred)
        return accuracy * 100


# In[13]:


# Train the decision tree on the training split and print its accuracy (in
# percent) on the test split. No hyper-parameter tuning is done here: the
# classifier is built without a depth limit being set.
dtree = DecisionTrees()
model_dtree = dtree.train(X_train, Y_train)
predictions = dtree.predict(model_dtree, X_test)
# The parenthesized form is valid on both Python 2 and Python 3, unlike the
# bare print statement.
print(dtree.evaluate(Y_test, predictions))