AddMissingIndicator adds additional binary variables indicating missing data (thus, called missing indicators). The binary variables take the value 1 if the observation's value is missing, or 0 otherwise. AddMissingIndicator adds 1 binary variable per variable.
For this demonstration, we use the Ames House Prices dataset produced by Professor Dean De Cock:
The version of the dataset used in this notebook can be obtained from Kaggle
# Make sure you are using this
# Feature-engine version; the API shown
# in this notebook matches release 1.2.0.
import feature_engine
feature_engine.__version__
'1.2.0'
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from feature_engine.imputation import (
AddMissingIndicator,
MeanMedianImputer,
CategoricalImputer,
)
# Download the data from Kaggle and store it
# in the same folder as this notebook.
# (Ames House Prices dataset, 81 columns.)
data = pd.read_csv('houseprice.csv')
data.head()
Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | ... | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 60 | RL | 65.0 | 8450 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal | 208500 |
1 | 2 | 20 | RL | 80.0 | 9600 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal | 181500 |
2 | 3 | 60 | RL | 68.0 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal | 223500 |
3 | 4 | 70 | RL | 60.0 | 9550 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2006 | WD | Abnorml | 140000 |
4 | 5 | 60 | RL | 84.0 | 14260 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 12 | 2008 | WD | Normal | 250000 |
5 rows × 81 columns
# Separate the data into train (70%) and test (30%) sets,
# leaving the identifier and the target out of the predictors.
features = data.drop(columns=['Id', 'SalePrice'])
target = data['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=0,
)

X_train.shape, X_test.shape
((1022, 79), (438, 79))
We will add indicators to 4 variables with missing data.
# Inspect the fraction of missing values in the
# four variables we want to flag.
na_check = ['Alley', 'MasVnrType', 'LotFrontage', 'MasVnrArea']
X_train[na_check].isna().mean()
Alley 0.939335 MasVnrType 0.004892 LotFrontage 0.184932 MasVnrArea 0.004892 dtype: float64
# Set up the imputer, listing the variables
# that should receive a missing indicator.
indicator_vars = ['Alley', 'MasVnrType', 'LotFrontage', 'MasVnrArea']

imputer = AddMissingIndicator(variables=indicator_vars)
imputer.fit(X_train)
AddMissingIndicator(variables=['Alley', 'MasVnrType', 'LotFrontage', 'MasVnrArea'])
# The attribute variables_ stores the variables for
# which missing indicators will be added.
imputer.variables_
['Alley', 'MasVnrType', 'LotFrontage', 'MasVnrArea']
# Add the indicators; each new column takes the name
# of its source variable plus the suffix '_na'.
test_t = imputer.transform(X_test)
train_t = imputer.transform(X_train)

indicator_cols = ['Alley_na', 'MasVnrType_na', 'LotFrontage_na', 'MasVnrArea_na']
train_t[indicator_cols].head()
Alley_na | MasVnrType_na | LotFrontage_na | MasVnrArea_na | |
---|---|---|---|---|
64 | 1 | 0 | 1 | 0 |
682 | 1 | 0 | 1 | 0 |
960 | 1 | 0 | 0 | 0 |
1384 | 1 | 0 | 0 | 0 |
1100 | 1 | 0 | 0 | 0 |
# AddMissingIndicator only flags missing values; the original
# variables still contain NaN. The indicator means below equal
# the missing-data fractions computed earlier, confirming the
# flags line up with the NaNs.
train_t[['Alley_na', 'MasVnrType_na', 'LotFrontage_na', 'MasVnrArea_na']].mean()
Alley_na 0.939335 MasVnrType_na 0.004892 LotFrontage_na 0.184932 MasVnrArea_na 0.004892 dtype: float64
We normally add missing indicators and impute the original variables with the mean or median if the variable is numerical, or with the mode if the variable is categorical. So let's do that.
# Check variable types: the imputation method must match the
# type (median for numerical, mode for categorical variables).
X_train[['Alley', 'MasVnrType', 'LotFrontage', 'MasVnrArea']].dtypes
Alley object MasVnrType object LotFrontage float64 MasVnrArea float64 dtype: object
The first 2 variables are categorical, so I will impute them with the most frequent category. The last 2 variables are numerical, so I will impute those with the median.
# Imputation pipeline: first flag the missing values, then fill
# the numerical variables with the median and the categorical
# ones with the most frequent category.
to_flag = ['Alley', 'MasVnrType', 'LotFrontage', 'MasVnrArea']

pipe = Pipeline([
    ('indicators', AddMissingIndicator(variables=to_flag)),
    ('imputer_num', MeanMedianImputer(
        imputation_method='median',
        variables=['LotFrontage', 'MasVnrArea'],
    )),
    ('imputer_cat', CategoricalImputer(
        imputation_method='frequent',
        variables=['Alley', 'MasVnrType'],
    )),
])
# With fit() each transformer in the pipeline learns its
# required parameters (medians, modes, variable lists).
pipe.fit(X_train)
Pipeline(steps=[('indicators', AddMissingIndicator(variables=['Alley', 'MasVnrType', 'LotFrontage', 'MasVnrArea'])), ('imputer_num', MeanMedianImputer(variables=['LotFrontage', 'MasVnrArea'])), ('imputer_cat', CategoricalImputer(imputation_method='frequent', variables=['Alley', 'MasVnrType']))])
# We can look into the attributes of the
# different transformers.
# variables_ holds the variables that will take indicators.
pipe.named_steps['indicators'].variables_
['Alley', 'MasVnrType', 'LotFrontage', 'MasVnrArea']
# Check the median values learned for the numerical imputation.
pipe.named_steps['imputer_num'].imputer_dict_
{'LotFrontage': 69.0, 'MasVnrArea': 0.0}
# Check the mode values learned for the categorical imputation.
pipe.named_steps['imputer_cat'].imputer_dict_
{'Alley': 'Pave', 'MasVnrType': 'None'}
# Now, we transform the data.
train_t = pipe.transform(X_train)
test_t = pipe.transform(X_test)

# Let's look at the transformed variables:
# the originals plus their missing indicators.
originals = ['Alley', 'MasVnrType', 'LotFrontage', 'MasVnrArea']
vars_ = originals + [v + '_na' for v in originals]
train_t[vars_].head()
Alley | MasVnrType | LotFrontage | MasVnrArea | Alley_na | MasVnrType_na | LotFrontage_na | MasVnrArea_na | |
---|---|---|---|---|---|---|---|---|
64 | Pave | BrkFace | 69.0 | 573.0 | 1 | 0 | 1 | 0 |
682 | Pave | None | 69.0 | 0.0 | 1 | 0 | 1 | 0 |
960 | Pave | None | 50.0 | 0.0 | 1 | 0 | 0 | 0 |
1384 | Pave | None | 60.0 | 0.0 | 1 | 0 | 0 | 0 |
1100 | Pave | None | 60.0 | 0.0 | 1 | 0 | 0 | 0 |
# After the transformation, none of these variables
# show missing data any more.
train_t[vars_].isnull().sum()
Alley 0 MasVnrType 0 LotFrontage 0 MasVnrArea 0 Alley_na 0 MasVnrType_na 0 LotFrontage_na 0 MasVnrArea_na 0 dtype: int64
We have the option to add indicators to all variables in the dataset, or to all variables with missing data. AddMissingIndicator can select which variables to transform automatically.
When the parameter variables
is left to None and the parameter missing_only
is left as True, the imputer adds indicators to all variables with missing data.
When the parameter variables
is left to None and the parameter missing_only
is switched to False, the imputer adds indicators to all variables.
It is good practice to use missing_only=True
when we set variables=None
, so that the transformer handles the imputation automatically in a meaningful way.
# With missing_only=True (and variables=None), missing indicators
# are added only to the variables that show missing data in the
# train set when fit() is called.
imputer = AddMissingIndicator(missing_only=True, variables=None)

# fit() scans the train set for variables with missing data.
imputer.fit(X_train)
AddMissingIndicator()
# The variables parameter passed by the user was None,
# so nothing is displayed here.
imputer.variables
# The learned attribute variables_ lists the variables
# found to contain NA in the train set.
imputer.variables_
['LotFrontage', 'Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']
len(imputer.variables_)
19
We've got 19 variables with NA in the train set.
# Transforming the dataset appends one indicator column
# per variable with missing data (79 + 19 = 98 columns).
test_t = imputer.transform(X_test)
train_t = imputer.transform(X_train)

X_train.shape, train_t.shape
((1022, 79), (1022, 98))
# Towards the right of the dataframe, we find
# the newly added missing indicators.
train_t.head()
MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | LotConfig | ... | Electrical_na | FireplaceQu_na | GarageType_na | GarageYrBlt_na | GarageFinish_na | GarageQual_na | GarageCond_na | PoolQC_na | Fence_na | MiscFeature_na | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
64 | 60 | RL | NaN | 9375 | Pave | NaN | Reg | Lvl | AllPub | Inside | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 |
682 | 120 | RL | NaN | 2887 | Pave | NaN | Reg | HLS | AllPub | Inside | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 |
960 | 20 | RL | 50.0 | 7207 | Pave | NaN | IR1 | Lvl | AllPub | Inside | ... | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
1384 | 50 | RL | 60.0 | 9060 | Pave | NaN | Reg | Lvl | AllPub | Inside | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 |
1100 | 30 | RL | 60.0 | 8400 | Pave | NaN | Reg | Bnk | AllPub | Inside | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 |
5 rows × 98 columns
# Alternatively, we can flag every variable in the dataset,
# whether or not it shows missing data in the train set.
imputer = AddMissingIndicator(missing_only=False, variables=None)
imputer.fit(X_train)
AddMissingIndicator(missing_only=False)
# The attribute variables_ now holds every variable
# in the train set (all 79 of them).
len(imputer.variables_)
79
# Transforming now doubles the number of columns:
# one indicator per original variable.
test_t = imputer.transform(X_test)
train_t = imputer.transform(X_train)

X_train.shape, train_t.shape
((1022, 79), (1022, 158))
We can automatically impute missing data in numerical and categorical variables, letting the imputers find out which variables to impute.
We need to set the parameter variables to None in all imputers. None is the default value, so we can simply omit the parameter when initialising the transformers.
# Fully automatic imputation pipeline: every transformer is left
# with variables=None (the default), so each one selects which
# variables to work on by itself.
steps = [
    # flag the variables that show NA in the train set
    ('indicators', AddMissingIndicator(missing_only=True)),
    # fill every numerical variable with its median
    ('imputer_num', MeanMedianImputer(imputation_method='median')),
    # fill every categorical variable with its mode
    ('imputer_cat', CategoricalImputer(imputation_method='frequent')),
]

pipe = Pipeline(steps)
# With fit() the transformers learn the required
# parameters (variable lists, medians, modes).
pipe.fit(X_train)
Pipeline(steps=[('indicators', AddMissingIndicator()), ('imputer_num', MeanMedianImputer()), ('imputer_cat', CategoricalImputer(imputation_method='frequent'))])
# We can look into the attributes of the
# different transformers.
# Check the variables that will take indicators.
pipe.named_steps['indicators'].variables_
['LotFrontage', 'Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']
# Check the median values learned for the numerical imputation
# (note the indicator columns themselves appear here, because
# they exist by the time the numerical imputer is fitted).
pipe.named_steps['imputer_num'].imputer_dict_
{'MSSubClass': 50.0, 'LotFrontage': 69.0, 'LotArea': 9536.0, 'OverallQual': 6.0, 'OverallCond': 5.0, 'YearBuilt': 1972.0, 'YearRemodAdd': 1993.0, 'MasVnrArea': 0.0, 'BsmtFinSF1': 386.0, 'BsmtFinSF2': 0.0, 'BsmtUnfSF': 486.5, 'TotalBsmtSF': 992.0, '1stFlrSF': 1095.0, '2ndFlrSF': 0.0, 'LowQualFinSF': 0.0, 'GrLivArea': 1479.0, 'BsmtFullBath': 0.0, 'BsmtHalfBath': 0.0, 'FullBath': 2.0, 'HalfBath': 0.0, 'BedroomAbvGr': 3.0, 'KitchenAbvGr': 1.0, 'TotRmsAbvGrd': 6.0, 'Fireplaces': 1.0, 'GarageYrBlt': 1979.0, 'GarageCars': 2.0, 'GarageArea': 477.0, 'WoodDeckSF': 0.0, 'OpenPorchSF': 25.0, 'EnclosedPorch': 0.0, '3SsnPorch': 0.0, 'ScreenPorch': 0.0, 'PoolArea': 0.0, 'MiscVal': 0.0, 'MoSold': 6.0, 'YrSold': 2008.0, 'LotFrontage_na': 0.0, 'Alley_na': 1.0, 'MasVnrType_na': 0.0, 'MasVnrArea_na': 0.0, 'BsmtQual_na': 0.0, 'BsmtCond_na': 0.0, 'BsmtExposure_na': 0.0, 'BsmtFinType1_na': 0.0, 'BsmtFinType2_na': 0.0, 'Electrical_na': 0.0, 'FireplaceQu_na': 0.0, 'GarageType_na': 0.0, 'GarageYrBlt_na': 0.0, 'GarageFinish_na': 0.0, 'GarageQual_na': 0.0, 'GarageCond_na': 0.0, 'PoolQC_na': 1.0, 'Fence_na': 1.0, 'MiscFeature_na': 1.0}
# Check the mode values learned for the categorical imputation.
pipe.named_steps['imputer_cat'].imputer_dict_
{'MSZoning': 'RL', 'Street': 'Pave', 'Alley': 'Pave', 'LotShape': 'Reg', 'LandContour': 'Lvl', 'Utilities': 'AllPub', 'LotConfig': 'Inside', 'LandSlope': 'Gtl', 'Neighborhood': 'NAmes', 'Condition1': 'Norm', 'Condition2': 'Norm', 'BldgType': '1Fam', 'HouseStyle': '1Story', 'RoofStyle': 'Gable', 'RoofMatl': 'CompShg', 'Exterior1st': 'VinylSd', 'Exterior2nd': 'VinylSd', 'MasVnrType': 'None', 'ExterQual': 'TA', 'ExterCond': 'TA', 'Foundation': 'PConc', 'BsmtQual': 'TA', 'BsmtCond': 'TA', 'BsmtExposure': 'No', 'BsmtFinType1': 'Unf', 'BsmtFinType2': 'Unf', 'Heating': 'GasA', 'HeatingQC': 'Ex', 'CentralAir': 'Y', 'Electrical': 'SBrkr', 'KitchenQual': 'TA', 'Functional': 'Typ', 'FireplaceQu': 'Gd', 'GarageType': 'Attchd', 'GarageFinish': 'Unf', 'GarageQual': 'TA', 'GarageCond': 'TA', 'PavedDrive': 'Y', 'PoolQC': 'Gd', 'Fence': 'MnPrv', 'MiscFeature': 'Shed', 'SaleType': 'WD', 'SaleCondition': 'Normal'}
# Now, we transform the data.
test_t = pipe.transform(X_test)
train_t = pipe.transform(X_train)

# After imputation we should be left with a complete-case dataset.
train_t.isnull().sum()
MSSubClass 0 MSZoning 0 LotFrontage 0 LotArea 0 Street 0 .. GarageQual_na 0 GarageCond_na 0 PoolQC_na 0 Fence_na 0 MiscFeature_na 0 Length: 98, dtype: int64
# Sanity check: list any column that still contains missing values.
# The comparison must be > 0, not > 1: with > 1, a column holding
# exactly one remaining NaN would slip through the check unnoticed.
[v for v in train_t.columns if train_t[v].isnull().sum() > 0]
[]