#!/usr/bin/env python
# coding: utf-8

# # Missing value imputation: CategoricalImputer
#
# CategoricalImputer performs imputation of categorical variables. It replaces missing values with an arbitrary label, "Missing" by default, or any other label entered by the user. Alternatively, it imputes missing data with the most frequent category.
#
# **For this demonstration, we use the Ames House Prices dataset produced by Professor Dean De Cock:**
#
# [Dean De Cock (2011) Ames, Iowa: Alternative to the Boston Housing Data as an End of Semester Regression Project, Journal of Statistics Education, Vol. 19, No. 3](http://jse.amstat.org/v19n3/decock.pdf)
#
# The version of the dataset used in this notebook can be obtained from [Kaggle](https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data).

# ## Version

# In[1]:

# Make sure you are using this
# Feature-engine version.
import feature_engine

feature_engine.__version__

# In[2]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from feature_engine.imputation import CategoricalImputer

# ## Load data

# In[3]:

# Download the data from Kaggle and store it
# in the same folder as this notebook.
data = pd.read_csv('houseprice.csv')

data.head()

# In[4]:

# Separate the data into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(['Id', 'SalePrice'], axis=1),
    data['SalePrice'],
    test_size=0.3,
    random_state=0,
)

X_train.shape, X_test.shape

# ## Check missing data

# In[5]:

# These are categorical variables with missing data.
X_train[['Alley', 'MasVnrType']].isnull().mean()

# In[6]:

# Number of observations per category.
X_train['MasVnrType'].value_counts().plot.bar()
plt.ylabel('Number of observations')
plt.title('MasVnrType')

# ## Impute with the string "Missing"
#
# We replace missing data with the string "Missing".

# In[7]:

imputer = CategoricalImputer(
    imputation_method='missing',
    variables=['Alley', 'MasVnrType'],
)

imputer.fit(X_train)

# In[8]:

# We impute all variables with the
# string 'Missing'.
imputer.imputer_dict_

# In[9]:

# Perform the imputation.
train_t = imputer.transform(X_train)
test_t = imputer.transform(X_test)

# In[10]:

# Observe the new category 'Missing'.
test_t['MasVnrType'].value_counts().plot.bar()
plt.ylabel('Number of observations')
plt.title('Imputed MasVnrType')

# In[11]:

test_t['Alley'].value_counts().plot.bar()
plt.ylabel('Number of observations')
plt.title('Imputed Alley')

# ## Impute with another string
#
# We can also enter a specific string for the imputation instead of the default 'Missing'.

# In[12]:

imputer = CategoricalImputer(
    variables='MasVnrType',
    fill_value="this_is_missing",
)

# In[13]:

# We can also fit and transform the train set
# in one line of code.
train_t = imputer.fit_transform(X_train)

# In[14]:

# And then transform the test set.
test_t = imputer.transform(X_test)

# In[15]:

# Let's check the current imputation
# dictionary.
imputer.imputer_dict_

# In[16]:

# After the imputation we see the new category.
test_t['MasVnrType'].value_counts().plot.bar()
plt.ylabel('Number of observations')
plt.title('Imputed MasVnrType')
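# As a quick aside: if you do not have the Kaggle file at hand, the same
# behaviour can be reproduced on a small made-up DataFrame. The sketch
# below is purely illustrative; the `toy` frame and its 'color' and
# 'size' columns are invented for this example and are not part of the
# Ames data.

# In[ ]:

# Toy data: two object columns with missing values
# (hypothetical example, not from the Ames dataset).
toy = pd.DataFrame({
    'color': ['blue', 'red', np.nan, 'red', np.nan],
    'size': ['S', np.nan, 'M', 'M', 'M'],
})

# With variables left as None (the default), the imputer
# picks up all categorical variables automatically.
toy_imputer = CategoricalImputer(fill_value='this_is_missing')

# Every NaN in the object columns becomes 'this_is_missing'.
toy_imputer.fit_transform(toy)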
# ## Frequent Category Imputation
#
# We can also replace missing values with the most frequent category.

# In[17]:

imputer = CategoricalImputer(
    imputation_method='frequent',
    variables=['Alley', 'MasVnrType'],
)

# In[18]:

# Find the most frequent category.
imputer.fit(X_train)

# In[19]:

# In this attribute we find the most frequent category
# per variable, which will be used to impute.
imputer.imputer_dict_

# In[20]:

# Impute the variables.
train_t = imputer.transform(X_train)
test_t = imputer.transform(X_test)

# In[21]:

# Let's count the number of observations per category
# in the original variable.
X_train['MasVnrType'].value_counts()

# In[22]:

# Note that after the transformation we have a few more
# observations in the most frequent category, which for
# this variable is 'None'.
train_t['MasVnrType'].value_counts()

# The number of observations for `None` in `MasVnrType` increased from 609 to 614, because the NA were replaced with this label.

# ## Automatically select categorical variables
#
# We can impute all categorical variables automatically, either with a string or with the most frequent category.
#
# To do so, we leave the parameter `variables` set to `None` (the default).

# In[23]:

# Impute all categorical variables with
# the most frequent category.
imputer = CategoricalImputer(imputation_method='frequent')

# In[24]:

# With fit, the transformer identifies the categorical variables
# in the train set and their most frequent category.
imputer.fit(X_train)

# Here we find the imputation values for each
# categorical variable.
imputer.imputer_dict_

# In[25]:

# With transform we replace the missing data.
train_t = imputer.transform(X_train)
test_t = imputer.transform(X_test)

# In[26]:

# Sanity check:
# no categorical variable with NA is left in the
# transformed data.
[
    v for v in train_t.columns
    if train_t[v].dtypes == 'O' and train_t[v].isnull().sum() > 0
]

# In[27]:

# We can also return the names of the final features in
# the transformed data.
imputer.get_feature_names_out()
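# Finally, because Feature-engine transformers follow the scikit-learn
# API, CategoricalImputer can also be used as a step inside a
# scikit-learn Pipeline. The sketch below is illustrative: the pipeline
# composition and the step names are our own, not part of the original
# notebook.

# In[ ]:

from sklearn.pipeline import Pipeline

# Chain two imputers: 'Alley' gets the label 'Missing',
# 'MasVnrType' gets its most frequent category.
pipe = Pipeline([
    ('impute_missing', CategoricalImputer(
        imputation_method='missing', variables=['Alley'])),
    ('impute_frequent', CategoricalImputer(
        imputation_method='frequent', variables=['MasVnrType'])),
])

# Fit on the train set, then transform both sets.
train_t = pipe.fit_transform(X_train)
test_t = pipe.transform(X_test)

# Both variables should now be free of NA.
test_t[['Alley', 'MasVnrType']].isnull().sum()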