#!/usr/bin/env python
# coding: utf-8

# ## WoEEncoder (weight of evidence)
# 
# This encoder replaces the categories with the weight of evidence.
# #### It only works for binary classification.
# 
# The weight of evidence is given by: log( p(1) / p(0) ), where p(1) and p(0)
# are the distributions of the positive and negative cases across the
# categories of the variable.

# In[1]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from feature_engine.encoding import WoEEncoder
from feature_engine.encoding import RareLabelEncoder  # to reduce cardinality


# In[2]:


# load the titanic dataset from OpenML
def load_titanic():
    data = pd.read_csv('https://www.openml.org/data/get_csv/16826755/phpMYEkMl')
    data = data.replace('?', np.nan)
    data['cabin'] = data['cabin'].astype(str).str[0]
    data['pclass'] = data['pclass'].astype('O')
    data['age'] = data['age'].astype('float')
    data['fare'] = data['fare'].astype('float')
    data['embarked'].fillna('C', inplace=True)
    data.drop(labels=['boat', 'body', 'home.dest'], axis=1, inplace=True)
    return data


# In[3]:


data = load_titanic()
data.head()


# In[4]:


X = data.drop(['survived', 'name', 'ticket'], axis=1)
y = data.survived


# In[5]:


# we will encode the variables below; they have no missing values
X[['cabin', 'pclass', 'embarked']].isnull().sum()


# In[6]:


'''
Make sure that the variables are of type object. If they are not,
cast them as object; otherwise the transformer will either raise an
error (if we pass them as an argument) or not pick them up (if we
leave variables=None).
'''

X[['cabin', 'pclass', 'embarked']].dtypes


# In[7]:


# let's separate into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

X_train.shape, X_test.shape


# In[8]:


## rare value encoder first, to reduce the cardinality
# see the RareLabelEncoder jupyter notebook for more details on this encoder

rare_encoder = RareLabelEncoder(tol=0.03, n_categories=2,
                                variables=['cabin', 'pclass', 'embarked'])

rare_encoder.fit(X_train)

# transform
train_t = rare_encoder.transform(X_train)
test_t = rare_encoder.transform(X_test)
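# Before moving on, it helps to check what the rare label grouping did. A
# quick sketch (our addition, assuming the encoder used its default
# replacement label, the string 'Rare', for the infrequent categories):

# In[ ]:


# infrequent labels should now be grouped under the string 'Rare'
train_t['cabin'].value_counts(normalize=True)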
# The WoEEncoder() replaces the categories with the weight of evidence, which
# is the logarithm of the ratio between the probability of the category given
# that the target = 1 and the probability of the category given that the
# target = 0.
# 
# The weight of evidence is given by: log( P(X=xj|Y=1) / P(X=xj|Y=0) )
# 
# Note: this categorical encoding is exclusive to binary classification.
# 
# For example, for the category blue of the variable colour, if
# P(colour=blue|Y=1) = 0.8 and P(colour=blue|Y=0) = 0.2, blue will be
# replaced by: np.log(0.8/0.2) = 1.386
# 
# #### Note:
# Division by 0 and log(0) are not defined. Thus, if p(0) = 0 or p(1) = 0
# for any category in any of the variables, the encoder will raise an error.
# 
# The encoder will encode only categorical variables (type 'object'). A list
# of variables can be passed as an argument. If no variables are passed as
# an argument, the encoder will find and encode all categorical variables
# (object type).
# 
# For details on the calculation of the weight of evidence visit:
# https://www.listendata.com/2015/03/weight-of-evidence-woe-and-information.html
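# To make the formula concrete, the sketch below reproduces the calculation
# by hand with pandas for the `cabin` variable. This is a minimal sketch and
# our own addition: the helper `manual_woe` is hypothetical, not part of
# feature_engine, and it assumes the rare-label-encoded training set from the
# cells above. Its values should coincide with those learned by the encoder
# in the next cells.

# In[ ]:


def manual_woe(X, y, variable):
    # hypothetical helper: computes log( P(X=xj|Y=1) / P(X=xj|Y=0) )
    # for every category xj of the given variable
    tmp = pd.concat([X[variable], y], axis=1)
    total_pos = (y == 1).sum()
    total_neg = (y == 0).sum()
    # distribution of positive and negative cases across the categories
    p1 = tmp.groupby(variable)[y.name].apply(lambda s: (s == 1).sum() / total_pos)
    p0 = tmp.groupby(variable)[y.name].apply(lambda s: (s == 0).sum() / total_neg)
    return np.log(p1 / p0)

manual_woe(train_t, y_train, 'cabin')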
# ### Weight of evidence

# In[9]:


woe_enc = WoEEncoder(variables=['cabin', 'pclass', 'embarked'])

# to fit, we need to pass the target y
woe_enc.fit(train_t, y_train)


# In[10]:


woe_enc.encoder_dict_


# In[11]:


# transform and visualise the data

train_t = woe_enc.transform(train_t)
test_t = woe_enc.transform(test_t)

test_t.sample(5)


# In[12]:


'''
A characteristic of the WoEEncoder is that it returns monotonic
variables, that is, encoded variables whose values rise (or fall)
steadily with the mean of the target.
'''

# let's explore the monotonic relationship
plt.figure(figsize=(7, 5))
pd.concat([test_t, y_test], axis=1).groupby("pclass")["survived"].mean().plot()
plt.yticks(np.arange(0, 1.1, 0.1))
plt.title("Relationship between pclass and target")
plt.xlabel("Pclass")
plt.ylabel("Mean of target")
plt.show()


# ### Automatically select the variables
# 
# When no variables are specified, the encoder will find and encode all remaining categorical (object type) variables.

# In[13]:


woe_enc_auto = WoEEncoder()

# to fit, we need to pass the target y
woe_enc_auto.fit(train_t, y_train)


# In[14]:


# transform and visualise the data

train_t = woe_enc_auto.transform(train_t)
test_t = woe_enc_auto.transform(test_t)

test_t.head()
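# As a final check (our addition): the dictionary of learned mappings shows
# which variables the encoder picked up automatically. At this point `sex`
# should be the only remaining object-type column, since `cabin`, `pclass`
# and `embarked` were already encoded above.

# In[ ]:


# which variables did the encoder select and encode automatically?
woe_enc_auto.encoder_dict_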