#!/usr/bin/env python
# coding: utf-8

# # EndTailImputer
#
# The EndTailImputer() replaces missing data with a value at either tail of the distribution. It automatically determines the imputation value using the mean plus or minus a factor of the standard deviation, or using the inter-quartile range proximity rule. Alternatively, it can use a factor of the maximum value.
#
# The EndTailImputer() is, in essence, very similar to the ArbitraryNumberImputer, but it selects the value to use for the imputation automatically, instead of having the user pre-define it.
#
# It works only with numerical variables.
#
# **For this demonstration, we use the Ames House Prices dataset produced by Professor Dean De Cock:**
#
# [Dean De Cock (2011) Ames, Iowa: Alternative to the Boston Housing Data as an End of Semester Regression Project, Journal of Statistics Education, Vol. 19, No. 3](http://jse.amstat.org/v19n3/decock.pdf)
#
# The version of the dataset used in this notebook can be obtained from [Kaggle](https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data).

# ## Version

# In[1]:

# Make sure you are using this
# Feature-engine version.
import feature_engine

feature_engine.__version__

# In[2]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from feature_engine.imputation import EndTailImputer

# ## Load data

# In[3]:

# Download the data from Kaggle and store it
# in the same folder as this notebook.

data = pd.read_csv('houseprice.csv')

data.head()

# In[4]:

# Separate the data into train and test sets.

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(['Id', 'SalePrice'], axis=1),
    data['SalePrice'],
    test_size=0.3,
    random_state=0,
)

X_train.shape, X_test.shape

# ## Check missing data

# In[5]:

# numerical variables with missing data
X_train[['LotFrontage', 'MasVnrArea']].isnull().mean()

# The EndTailImputer() can replace NA with a value at the left or right end of the distribution.
#
# In addition, it offers 3 different methods to identify the imputation values.
#
# In the following cells, we show how to use each method.

# ## Gaussian, right tail
#
# Let's begin by determining the imputation values at the right tail, using the mean and the standard deviation.

# In[6]:

imputer = EndTailImputer(
    # uses mean and standard deviation to determine the value
    imputation_method='gaussian',
    # value at the right tail of the distribution
    tail='right',
    # multiply the std by 3
    fold=3,
    # the variables to impute
    variables=['LotFrontage', 'MasVnrArea'],
)

# In[7]:

# find the imputation values
imputer.fit(X_train)

# In[8]:

# The values for the imputation
imputer.imputer_dict_

# Note that we use different values for different variables.

# In[9]:

# impute the data
train_t = imputer.transform(X_train)
test_t = imputer.transform(X_test)

# In[10]:

# check we no longer have NA
train_t['LotFrontage'].isnull().sum()

# In[11]:

# The variable distribution changed slightly, with
# more values accumulating towards the right tail.

fig = plt.figure()
ax = fig.add_subplot(111)
X_train['LotFrontage'].plot(kind='kde', ax=ax)
train_t['LotFrontage'].plot(kind='kde', ax=ax, color='red')
lines, labels = ax.get_legend_handles_labels()
ax.legend(lines, labels, loc='best')

# ## IQR, left tail
#
# Now, we will impute the variables with values at the left tail. The values are identified using the inter-quartile range proximity rule.
#
# The IQR rule is better suited for skewed variables.
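# Before fitting the imputer, we can reproduce the rule by hand. For the left tail, the IQR proximity rule gives Q1 - fold * IQR, where IQR = Q3 - Q1. The cell below is a minimal sketch of that calculation for 'LotFrontage' (it assumes pandas' default quantile interpolation); the result should match the value the imputer stores after fitting.

# In[ ]:

# Hand-computed IQR proximity rule, left tail:
# value = Q1 - fold * (Q3 - Q1).
# A reference point only; the imputer below
# performs this calculation for us.
q1 = X_train['LotFrontage'].quantile(0.25)
q3 = X_train['LotFrontage'].quantile(0.75)
q1 - 3 * (q3 - q1)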
# In[12]:

imputer = EndTailImputer(
    # uses the inter-quartile range proximity rule
    imputation_method='iqr',
    # determines values at the left tail of the distribution
    tail='left',
    # multiplies the IQR by 3
    fold=3,
    # the variables to impute
    variables=['LotFrontage', 'MasVnrArea'],
)

# In[13]:

# finds the imputation values
imputer.fit(X_train)

# In[14]:

# imputation values per variable
imputer.imputer_dict_

# In[15]:

# transform the data
train_t = imputer.transform(X_train)
test_t = imputer.transform(X_test)

# In[16]:

# Check we have no NA after the transformation
train_t[['LotFrontage', 'MasVnrArea']].isnull().sum()

# In[17]:

# The variable distribution changed with the
# transformation, with more values
# accumulating towards the left tail.

fig = plt.figure()
ax = fig.add_subplot(111)
X_train['LotFrontage'].plot(kind='kde', ax=ax)
train_t['LotFrontage'].plot(kind='kde', ax=ax, color='red')
lines, labels = ax.get_legend_handles_labels()
ax.legend(lines, labels, loc='best')

# ## Impute with the maximum value
#
# We can also determine the imputation values as a factor of the maximum value of the variable.

# In[18]:

imputer = EndTailImputer(
    # imputes beyond the maximum value
    imputation_method='max',
    # multiplies the maximum value by 3
    fold=3,
    # the variables to impute
    variables=['LotFrontage', 'MasVnrArea'],
)

# In[19]:

# find the imputation values
imputer.fit(X_train)

# In[20]:

# The imputation values.
imputer.imputer_dict_

# In[21]:

# The maximum values of the variables;
# note how the imputer multiplied them by 3
# to determine the imputation values.
X_train[imputer.variables_].max()

# In[22]:

# impute the data
train_t = imputer.transform(X_train)
test_t = imputer.transform(X_test)

# In[23]:

# Check we have no NA in the imputed data
train_t[['LotFrontage', 'MasVnrArea']].isnull().sum()

# In[24]:

# The variable distribution changed with the
# transformation, with more values now
# beyond the original maximum.

fig = plt.figure()
ax = fig.add_subplot(111)
X_train['LotFrontage'].plot(kind='kde', ax=ax)
train_t['LotFrontage'].plot(kind='kde', ax=ax, color='red')
lines, labels = ax.get_legend_handles_labels()
ax.legend(lines, labels, loc='best')

# ## Automatically impute all variables
#
# As with all Feature-engine transformers, the EndTailImputer() can also find and impute all numerical variables in the data.

# In[25]:

# Start the imputer
imputer = EndTailImputer()

# In[26]:

# Check the default parameters

# how to find the imputation value
imputer.imputation_method

# In[27]:

# which tail to use
imputer.tail

# In[28]:

# how far out to place the imputation value
imputer.fold

# In[29]:

# Find the variables and imputation values
imputer.fit(X_train)

# In[30]:

# The variables to impute
imputer.variables_

# In[31]:

# The imputation values
imputer.imputer_dict_

# In[32]:

# impute the data
train_t = imputer.transform(X_train)
test_t = imputer.transform(X_test)

# In[33]:

# Sanity check:
# no numerical variable with NA is left in the
# transformed data.
[v for v in train_t.columns
 if train_t[v].dtypes != 'O' and train_t[v].isnull().sum() > 0]
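# As a final check, the Gaussian rule is just as easy to reproduce by hand: with the defaults (gaussian, right tail, fold=3), the imputation value is mean + 3 * std. The cell below is a minimal sketch comparing the hand-computed value against the one stored by the imputer fitted above; it assumes the imputer relies on pandas' default (sample) standard deviation, and uses 'LotFrontage' only as an example variable.

# In[ ]:

# Hand-computed Gaussian rule, right tail:
# value = mean + fold * std.
# Assumes pandas' default std (ddof=1); compare the
# two numbers rather than testing exact equality.
var = 'LotFrontage'
manual = X_train[var].mean() + imputer.fold * X_train[var].std()
manual, imputer.imputer_dict_[var]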