#!/usr/bin/env python
# coding: utf-8

# # RareLabelEncoder
#
# The RareLabelEncoder() groups labels that show a small number of
# observations in the dataset into a new category called 'Rare'. This helps
# to avoid overfitting.
#
# The argument `tol` indicates the minimum fraction of observations
# (e.g. 0.05 = 5%) that a label needs to have in order not to be re-grouped
# into the 'Rare' label.
#
# The argument `n_categories` indicates the minimum number of distinct
# categories that a variable needs to have for any of its labels to be
# re-grouped into 'Rare'.

# #### Note
# If the number of labels is smaller than n_categories, then the encoder will
# not group the labels for that variable.

# In[5]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from feature_engine.encoding import RareLabelEncoder


# In[6]:

# Load titanic dataset from OpenML
def load_titanic():
    """Download the Titanic dataset from OpenML and apply light cleaning.

    Returns
    -------
    pandas.DataFrame
        The dataset with '?' placeholders turned into NaN, 'cabin' reduced to
        its first character, 'pclass' cast to object, 'age' and 'fare' cast
        to float, missing 'embarked' values filled with 'C', and the 'boat',
        'body' and 'home.dest' columns removed.
    """
    data = pd.read_csv('https://www.openml.org/data/get_csv/16826755/phpMYEkMl')
    # The raw file marks missing values with '?'.
    data = data.replace('?', np.nan)
    # Keep only the first character of the cabin. Missing cabins become the
    # string 'nan' after astype(str), so they end up labelled 'n'.
    data['cabin'] = data['cabin'].astype(str).str[0]
    data['pclass'] = data['pclass'].astype('O')
    data['age'] = data['age'].astype('float')
    data['fare'] = data['fare'].astype('float')
    # Assign the result back instead of calling fillna(inplace=True) on a
    # column selection: the chained in-place form is deprecated and does not
    # modify `data` when pandas copy-on-write is enabled.
    data['embarked'] = data['embarked'].fillna('C')
    data = data.drop(labels=['boat', 'body', 'home.dest'], axis=1)
    return data


# In[7]:

data = load_titanic()
data.head()


# In[8]:

X = data.drop(['survived', 'name', 'ticket'], axis=1)
y = data.survived


# In[9]:

# we will encode the variables below; they have no missing values
X[['cabin', 'pclass', 'embarked']].isnull().sum()


# In[10]:

# Make sure that the variables are of type object. If not, cast them as
# object, otherwise the transformer will either raise an error (if we pass
# them as argument) or not pick them up (if we leave variables=None).
X[['cabin', 'pclass', 'embarked']].dtypes


# In[17]:

# let's separate into training and testing set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

X_train.shape, X_test.shape


# The RareLabelEncoder() groups rare / infrequent categories in
# a new category called "Rare", or any other name entered by the user.
#
# For example in the variable colour, if the percentage of observations
# for the categories magenta, cyan and burgundy are < 5%, all those
# categories will be replaced by the new label "Rare".
#
# Note, infrequent labels can also be grouped under a user defined name, for
# example 'Other'. The name to replace infrequent categories is defined
# with the parameter replace_with.
#
# The encoder will encode only categorical variables (type 'object'). A list
# of variables can be passed as an argument. If no variables are passed as
# argument, the encoder will find and encode all categorical variables
# (object type).

# In[8]:

## Rare value encoder
#
# Parameters
# ----------
# tol: float, default=0.05
#     the minimum frequency a label should have to be considered frequent.
#     Categories with frequencies lower than tol will be grouped.
#
# n_categories: int, default=10
#     the minimum number of categories a variable should have for the encoder
#     to find frequent labels. If the variable contains less categories, all
#     of them will be considered frequent.
#
# max_n_categories: int, default=None
#     the maximum number of categories that should be considered frequent.
#     If None, all categories with frequency above the tolerance (tol) will be
#     considered.
#
# variables : list, default=None
#     The list of categorical variables that will be encoded. If None, the
#     encoder will find and select all object type variables.
#
# replace_with : string, default='Rare'
#     The category name that will be used to replace infrequent categories.

rare_encoder = RareLabelEncoder(tol=0.05,
                                n_categories=5,
                                variables=['cabin', 'pclass', 'embarked'])

rare_encoder.fit(X_train)


# In[9]:

rare_encoder.encoder_dict_


# In[16]:

train_t = rare_encoder.transform(X_train)
# Apply the encoder fitted on the training set to the held-out test set
# (previously X_train was transformed a second time by mistake).
test_t = rare_encoder.transform(X_test)

test_t.head()


# In[11]:

test_t.cabin.value_counts()


# #### The user can change the string from 'Rare' to something else.
# In[20]:

## Rare value encoder
rare_encoder = RareLabelEncoder(tol=0.03,
                                replace_with='Other',  # replacing 'Rare' with 'Other'
                                variables=['cabin', 'pclass', 'embarked'],
                                n_categories=2)

rare_encoder.fit(X_train)

train_t = rare_encoder.transform(X_train)
# Transform the held-out test set (previously X_train was transformed a
# second time by mistake, so test_t was just a copy of train_t).
test_t = rare_encoder.transform(X_test)

test_t.sample(5)


# In[21]:

rare_encoder.encoder_dict_


# In[22]:

test_t.cabin.value_counts()


# #### The user can choose to retain only the most popular categories with the argument max_n_categories.

# In[25]:

## Rare value encoder
rare_encoder = RareLabelEncoder(tol=0.03,
                                variables=['cabin', 'pclass', 'embarked'],
                                n_categories=2,
                                # keeps only the most popular 3 categories in every variable.
                                max_n_categories=3)

rare_encoder.fit(X_train)

train_t = rare_encoder.transform(X_train)
test_t = rare_encoder.transform(X_test)  # encode the test set, not X_train again

test_t.sample(5)


# In[26]:

rare_encoder.encoder_dict_


# ### Automatically select all categorical variables
#
# If no variable list is passed as argument, it selects all the categorical variables.

# In[27]:

## Rare value encoder
rare_encoder = RareLabelEncoder(tol=0.03, n_categories=3)

rare_encoder.fit(X_train)

rare_encoder.encoder_dict_


# In[13]:

train_t = rare_encoder.transform(X_train)
test_t = rare_encoder.transform(X_test)  # encode the test set, not X_train again

test_t.sample(5)


# In[ ]: