#!/usr/bin/env python
# coding: utf-8

# # Missing value imputation: RandomSampleImputer
#
#
# The RandomSampleImputer extracts a random sample of observations where data
# is available, and uses it to replace the NA. It is suitable for numerical
# and categorical variables.
#
# To control the random sample extraction, there are various ways to set a
# seed and ensure or maximize reproducibility.
#
#
# **For this demonstration, we use the Ames House Prices dataset produced by
# Professor Dean De Cock:**
#
# [Dean De Cock (2011) Ames, Iowa: Alternative to the Boston Housing
# Data as an End of Semester Regression Project, Journal of Statistics Education, Vol.19, No. 3](http://jse.amstat.org/v19n3/decock.pdf)
#
# The version of the dataset used in this notebook can be obtained from
# [Kaggle](https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data)

# ## Version

# In[1]:


# Make sure you are using this
# Feature-engine version.

import feature_engine

feature_engine.__version__


# In[2]:


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from feature_engine.imputation import RandomSampleImputer


# In[3]:


# Download the data from Kaggle and store it
# in the same folder as this notebook.

data = pd.read_csv('houseprice.csv')

data.head()


# In[4]:


# Separate the data into train and test sets.

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(['Id', 'SalePrice'], axis=1),
    data['SalePrice'],
    test_size=0.3,
    random_state=0,
)

X_train.shape, X_test.shape


# ## Imputation in batch
#
# We can set the imputer to impute several observations in batch with a
# unique seed. This is the equivalent of setting the `random_state` to an
# integer in `pandas.sample()`.

# In[5]:


# Start the imputer

imputer = RandomSampleImputer(

    # the variables to impute
    variables=['Alley', 'MasVnrType', 'LotFrontage', 'MasVnrArea'],

    # the random state for reproducibility
    random_state=10,

    # equivalent to setting random_state in
    # pandas.sample()
    seed='general',
)


# In[6]:


# Stores a copy of the train set variables

imputer.fit(X_train)


# In[7]:


# the imputer saves a copy of the variables
# from the training set to impute new data.

imputer.X_.head()


# In[8]:


# Check missing data in train set

X_train[['Alley', 'MasVnrType', 'LotFrontage', 'MasVnrArea']].isnull().mean()


# In[9]:


# impute data

train_t = imputer.transform(X_train)
test_t = imputer.transform(X_test)


# In[10]:


# Check missing data after the transformation

train_t[['Alley', 'MasVnrType', 'LotFrontage', 'MasVnrArea']].isnull().mean()


# In[11]:


# when using the random sample imputer,
# the distribution of the variable does not change.

# This imputation method is useful for models that
# are sensitive to changes in the variable distributions.

fig = plt.figure()
ax = fig.add_subplot(111)

# Explicit labels: both Series are named 'LotFrontage', so without them
# the legend would show two identical entries.
X_train['LotFrontage'].plot(kind='kde', ax=ax, label='original')
train_t['LotFrontage'].plot(kind='kde', ax=ax, color='red', label='imputed')

lines, labels = ax.get_legend_handles_labels()
ax.legend(lines, labels, loc='best')


# ## Specific seeds for each observation
#
# Sometimes, we want to guarantee that the same observation is imputed with
# the same value, run after run.
#
# To achieve this, we need to always use the same seed for every particular
# observation.
#
# To do this, we can use the values in neighboring variables as seed.
#
# In this case, the seed will be calculated observation per observation,
# either by adding or multiplying the seeding variable values, and passed to
# the random_state of pandas.sample(), which is used under the hood by the
# imputer.
# Then, a value will be extracted from the train set using that seed and used
# to replace the NaN in that particular observation.
#
# **To know more about how the seed per observation is used, check this
# [notebook](https://github.com/solegalli/feature-engineering-for-machine-learning/blob/master/Section-04-Missing-Data-Imputation/04.07-Random-Sample-Imputation.ipynb)**

# In[12]:


imputer = RandomSampleImputer(

    # the values of these variables will be used as seed
    random_state=['MSSubClass', 'YrSold'],

    # 1 seed per observation
    seed='observation',

    # how to combine the values of the seeding variables
    seeding_method='add',

    # impute all variables, numerical and categorical
    variables=None,
)


# In[13]:


# Stores a copy of the train set.

imputer.fit(X_train)


# In[14]:


# takes a copy of the entire train set

imputer.X_


# In[15]:


# imputes all variables.

# this procedure takes a while because it is
# done observation per observation.

train_t = imputer.transform(X_train)
test_t = imputer.transform(X_test)


# In[16]:


# No missing data in any variable
# after the imputation.

test_t.isnull().sum()


# In[17]:


# when using the random sample imputer,
# the distribution of the variable does not change

fig = plt.figure()
ax = fig.add_subplot(111)

# Explicit labels: both Series are named 'LotFrontage', so without them
# the legend would show two identical entries.
X_train['LotFrontage'].plot(kind='kde', ax=ax, label='original')
train_t['LotFrontage'].plot(kind='kde', ax=ax, color='red', label='imputed')

lines, labels = ax.get_legend_handles_labels()
ax.legend(lines, labels, loc='best')


# In[ ]: