#!/usr/bin/env python
# coding: utf-8

# # DecisionTreeEncoder
# 
# The DecisionTreeEncoder() encodes categorical variables with predictions of a decision tree model.
# 

# In[1]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from feature_engine.encoding import DecisionTreeEncoder


# In[2]:


# Load titanic dataset from OpenML

def load_titanic():
    data = pd.read_csv('https://www.openml.org/data/get_csv/16826755/phpMYEkMl')
    data = data.replace('?', np.nan)
    data['cabin'] = data['cabin'].astype(str).str[0]
    data['pclass'] = data['pclass'].astype('O')
    data['age'] = data['age'].astype('float')
    data['fare'] = data['fare'].astype('float')
    data['embarked'].fillna('C', inplace=True)
    data.drop(labels=['boat', 'body', 'home.dest'], axis=1, inplace=True)
    return data


# In[3]:


data = load_titanic()
data.head()


# In[4]:


X = data.drop(['survived', 'name', 'ticket'], axis=1)
y = data.survived


# In[5]:


# we will encode the below variables, they have no missing values
X[['cabin', 'pclass', 'embarked']].isnull().sum()


# In[6]:


''' Make sure that the variables are type (object).
if not, cast it as object , otherwise the transformer will either send an error (if we pass it as argument) 
or not pick it up (if we leave variables=None). '''

X[['cabin', 'pclass', 'embarked']].dtypes


# In[7]:


# let's separate into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

X_train.shape, X_test.shape


# The categorical variable will be first encoded into integers with the
# OrdinalEncoder(). The integers can be assigned arbitrarily to the
# categories or following the mean value of the target in each category.
# 
# Then a decision tree will be fit using the resulting numerical variable to predict
# the target  variable. Finally, the original categorical variable values will be
# replaced by the predictions of the decision tree.

# In[8]:


'''
Parameters
    ----------

    encoding_method: str, default='arbitrary'
        The categorical encoding method that will be used to encode the original
        categories to numerical values.

        'ordered': the categories are numbered in ascending order according to
        the target mean value per category.

        'arbitrary' : categories are numbered arbitrarily.

    cv : int, default=3
        Desired number of cross-validation fold to be used to fit the decision
        tree.

    scoring: str, default='neg_mean_squared_error'
        Desired metric to optimise the performance for the tree. Comes from
        sklearn metrics. See the DecisionTreeRegressor or DecisionTreeClassifier
        model evaluation documentation for more options:
        https://scikit-learn.org/stable/modules/model_evaluation.html

    regression : boolean, default=True
        Indicates whether the encoder should train a regression or a classification
        decision tree.

    param_grid : dictionary, default=None
        The list of parameters over which the decision tree should be optimised
        during the grid search. The param_grid can contain any of the permitted
        parameters for Scikit-learn's DecisionTreeRegressor() or
        DecisionTreeClassifier().

        If None, then param_grid = {'max_depth': [1, 2, 3, 4]}.

    random_state : int, default=None
        The random_state to initialise the training of the decision tree. It is one
        of the parameters of the Scikit-learn's DecisionTreeRegressor() or
        DecisionTreeClassifier(). For reproducibility it is recommended to set
        the random_state to an integer.

    variables : list, default=None
        The list of categorical variables that will be encoded. If None, the
        encoder will find and select all object type variables.
'''


# In[9]:


tree_enc = DecisionTreeEncoder(encoding_method='arbitrary',
                               cv=3,
                               scoring = 'roc_auc',
                               param_grid = {'max_depth': [1, 2, 3, 4]},
                               regression = False,
                               variables=['cabin', 'pclass', 'embarked']
                              )

tree_enc.fit(X_train,y_train) # to fit you need to pass the target y


# In[10]:


tree_enc.encoder_


# In[11]:


# transform and visualise the data

train_t = tree_enc.transform(X_train)
test_t = tree_enc.transform(X_test)

test_t.sample(5)


# ### Automatically select the variables
# 
# This encoder will select all categorical variables to encode, when no variables are specified when calling the encoder.

# In[12]:


tree_enc = DecisionTreeEncoder(encoding_method='arbitrary',
                               cv=3,
                               scoring = 'roc_auc',
                               param_grid = {'max_depth': [1, 2, 3, 4]},
                               regression = False,
                              )

tree_enc.fit(X_train,y_train) # to fit you need to pass the target y


# In[13]:


tree_enc.encoder_


# In[14]:


# transform and visualise the data

train_t = tree_enc.transform(X_train)
test_t = tree_enc.transform(X_test)

test_t.sample(5)


# In[ ]: