#!/usr/bin/env python
# coding: utf-8

# # CountFrequencyEncoder

# The CountFrequencyEncoder() replaces categories by the count of
# observations per category or by the percentage of observations per category.
#
# For example, in the variable colour, if 10 observations are blue, blue will
# be replaced by 10. Alternatively, if 10% of the observations are blue, blue
# will be replaced by 0.1.

# In[1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from feature_engine.encoding import CountFrequencyEncoder


# In[2]:

def load_titanic():
    """Load the Titanic dataset from OpenML and apply light clean-up.

    The CSV is downloaded over the network on every call.

    Returns
    -------
    pandas.DataFrame
        The dataset with '?' placeholders turned into NaN, 'cabin' reduced to
        its first character, 'pclass' cast to object, 'age' and 'fare' cast to
        float, missing 'embarked' filled with 'C', and the 'boat', 'body' and
        'home.dest' columns removed.
    """
    data = pd.read_csv('https://www.openml.org/data/get_csv/16826755/phpMYEkMl')

    # the raw file marks missing values with '?'
    data = data.replace('?', np.nan)

    # keep only the first character of the cabin; after astype(str) a missing
    # cabin reads 'nan', so it ends up as the category 'n'
    data['cabin'] = data['cabin'].astype(str).str[0]

    # cast to object so the encoder treats the class as categorical
    data['pclass'] = data['pclass'].astype('O')
    data['age'] = data['age'].astype('float')
    data['fare'] = data['fare'].astype('float')

    # re-assign instead of calling fillna(inplace=True) on a column selection:
    # the in-place form is a chained assignment that pandas warns about and
    # that does not modify `data` when Copy-on-Write is enabled
    data['embarked'] = data['embarked'].fillna('C')

    data = data.drop(labels=['boat', 'body', 'home.dest'], axis=1)
    return data


# In[3]:

data = load_titanic()
data.head()


# In[4]:

X = data.drop(['survived', 'name', 'ticket'], axis=1)
y = data.survived


# In[5]:

# we will encode the below variables, they have no missing values
X[['cabin', 'pclass', 'embarked']].isnull().sum()


# In[6]:

# Make sure that the variables are of type object. If not, cast them as
# object, otherwise the transformer will either raise an error (if we pass
# them as argument) or not pick them up (if we leave variables=None).
X[['cabin', 'pclass', 'embarked']].dtypes


# In[7]:

# let's separate into training and testing set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

X_train.shape, X_test.shape


# The CountFrequencyEncoder() replaces the categories by the count or
# frequency of the observations in the train set for that category.
#
# If we select "count" in the encoding_method, then for the variable colour,
# if there are 10 observations in the train set that show colour blue, blue
# will be replaced by 10.

# Alternatively, if we select "frequency" in the encoding_method, if 10% of
# the observations in the train set show blue colour, then blue will be
# replaced by 0.1.

# ### Frequency
#
# Labels are replaced by the percentage of the observations that show that
# label in the train set.

# In[8]:

# Parameters
# ----------
# encoding_method : str, default='count'
#     Desired method of encoding.
#     'count': number of observations per category
#     'frequency': percentage of observations per category
#
# variables : list
#     The list of categorical variables that will be encoded. If None, the
#     encoder will find and transform all object type variables.
count_encoder = CountFrequencyEncoder(
    encoding_method='frequency',
    variables=['cabin', 'pclass', 'embarked'],
)

count_encoder.fit(X_train)


# In[9]:

# we can explore the encoder_dict_ to find out the category replacements.
count_encoder.encoder_dict_


# In[10]:

# transform the data: see the change in the head view
train_t = count_encoder.transform(X_train)
test_t = count_encoder.transform(X_test)

test_t.head()


# In[11]:

test_t['pclass'].value_counts().plot.bar()
plt.show()


# In[12]:

test_orig = count_encoder.inverse_transform(test_t)
test_orig.head()


# ### Count
#
# Labels are replaced by the number of the observations that show that label
# in the train set.

# In[13]:

# this time we encode only 1 variable
count_enc = CountFrequencyEncoder(encoding_method='count', variables='cabin')

count_enc.fit(X_train)


# In[14]:

# we can find the mappings in the encoder_dict_ attribute.
count_enc.encoder_dict_


# In[15]:

# transform the data: see the change in the head view for Cabin
train_t = count_enc.transform(X_train)
test_t = count_enc.transform(X_test)

test_t.head()


# ### Select categorical variables automatically
#
# If we don't indicate which variables we want to encode, the encoder will
# find all categorical variables

# In[16]:

# this time we omit the argument for variable
count_enc = CountFrequencyEncoder(encoding_method='count')

count_enc.fit(X_train)


# In[17]:

# we can see that the encoder selected automatically all the categorical
# variables
# NOTE(review): newer feature_engine releases appear to store the selected
# columns in `variables_` and leave `variables` as the constructor argument
# (None here); fall back to `variables` for older releases - confirm against
# the installed version.
getattr(count_enc, 'variables_', count_enc.variables)


# In[18]:

# transform the data: see the change in the head view
train_t = count_enc.transform(X_train)
test_t = count_enc.transform(X_test)

test_t.head()


# ### Note
# if there are labels in the test set that were not present in the train set,
# the transformer will introduce NaN, and raise a warning.

# In[ ]: