import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from feature_engine.wrappers import SklearnTransformerWrapper
# Read the Kaggle house-prices data set from the local CSV file and
# preview its first five rows.
data = pd.read_csv(filepath_or_buffer='houseprice.csv')
data.head(n=5)
Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | ... | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 60 | RL | 65.0 | 8450 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal | 208500 |
1 | 2 | 20 | RL | 80.0 | 9600 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal | 181500 |
2 | 3 | 60 | RL | 68.0 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal | 223500 |
3 | 4 | 70 | RL | 60.0 | 9550 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2006 | WD | Abnorml | 140000 |
4 | 5 | 60 | RL | 84.0 | 14260 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 12 | 2008 | WD | Normal | 250000 |
5 rows × 81 columns
# Hold out 30% of the rows for testing. 'Id' and the target 'SalePrice'
# are removed from the predictors; random_state pins the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['Id', 'SalePrice']),
    data['SalePrice'],
    test_size=0.3,
    random_state=0,
)

# show the (rows, columns) of both partitions
X_train.shape, X_test.shape
((1022, 79), (438, 79))
# fraction of missing values per numerical variable in the training set
X_train.loc[:, ['LotFrontage', 'MasVnrArea']].isna().mean()
LotFrontage 0.184932 MasVnrArea 0.004892 dtype: float64
# Wrap scikit-learn's SimpleImputer so that mean imputation is applied
# only to the two listed numerical variables.
imputer = SklearnTransformerWrapper(
    transformer=SimpleImputer(strategy='mean'),
    variables=['LotFrontage', 'MasVnrArea'],
)

# learn the imputation values from the training set only
imputer.fit(X_train)
SklearnTransformerWrapper(transformer=SimpleImputer(), variables=['LotFrontage', 'MasVnrArea'])
# The fitted SimpleImputer is exposed by the wrapper as `transformer_`.
# Its `statistics_` attribute holds the learned fill values, here the
# training-set mean of each variable passed to the wrapper.
imputer.transformer_.statistics_
array([ 69.66866747, 103.55358899])
# Replace NA with the learned means in both partitions; the values were
# fitted on the training set and are reused unchanged for the test set.
X_train, X_test = imputer.transform(X_train), imputer.transform(X_test)

# confirm that no missing values remain in the imputed variables
X_train[['LotFrontage', 'MasVnrArea']].isna().mean()
LotFrontage 0.0 MasVnrArea 0.0 dtype: float64
# Categorical (object-dtype) variables that contain at least one missing
# value, determined on the full data set.
cols = [
    name
    for name, column in data.items()
    if column.dtype == 'O' and column.isnull().any()
]

# preview the selected variables
data[cols].head()
Alley | MasVnrType | BsmtQual | BsmtCond | BsmtExposure | BsmtFinType1 | BsmtFinType2 | Electrical | FireplaceQu | GarageType | GarageFinish | GarageQual | GarageCond | PoolQC | Fence | MiscFeature | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | NaN | BrkFace | Gd | TA | No | GLQ | Unf | SBrkr | NaN | Attchd | RFn | TA | TA | NaN | NaN | NaN |
1 | NaN | None | Gd | TA | Gd | ALQ | Unf | SBrkr | TA | Attchd | RFn | TA | TA | NaN | NaN | NaN |
2 | NaN | BrkFace | Gd | TA | Mn | GLQ | Unf | SBrkr | TA | Attchd | RFn | TA | TA | NaN | NaN | NaN |
3 | NaN | None | TA | Gd | No | ALQ | Unf | SBrkr | Gd | Detchd | Unf | TA | TA | NaN | NaN | NaN |
4 | NaN | BrkFace | Gd | TA | Av | GLQ | Unf | SBrkr | TA | Attchd | RFn | TA | TA | NaN | NaN | NaN |
# Frequent-category imputation: wrap SimpleImputer so that it fills NA
# in the selected categorical variables with their most frequent value.
imputer = SklearnTransformerWrapper(
    transformer=SimpleImputer(strategy='most_frequent'),
    variables=cols,
)

# learn the most frequent category of each variable from the training set
imputer.fit(X_train)
SklearnTransformerWrapper(transformer=SimpleImputer(strategy='most_frequent'), variables=['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature'])
# The fitted SimpleImputer is exposed by the wrapper as `transformer_`.
# Its `statistics_` attribute holds the learned fill values, here the
# most frequent category of each variable in `cols`.
imputer.transformer_.statistics_
array(['Pave', 'None', 'TA', 'TA', 'No', 'Unf', 'Unf', 'SBrkr', 'Gd', 'Attchd', 'Unf', 'TA', 'TA', 'Gd', 'MnPrv', 'Shed'], dtype=object)
# Replace NA with the learned most frequent categories in both
# partitions, reusing the values fitted on the training set.
X_train, X_test = imputer.transform(X_train), imputer.transform(X_test)

# confirm that no missing values remain in the imputed variables
X_train[cols].isna().mean()
Alley 0.0 MasVnrType 0.0 BsmtQual 0.0 BsmtCond 0.0 BsmtExposure 0.0 BsmtFinType1 0.0 BsmtFinType2 0.0 Electrical 0.0 FireplaceQu 0.0 GarageType 0.0 GarageFinish 0.0 GarageQual 0.0 GarageCond 0.0 PoolQC 0.0 Fence 0.0 MiscFeature 0.0 dtype: float64
# preview the imputed categorical variables in the test set
X_test.loc[:, cols].head()
Alley | MasVnrType | BsmtQual | BsmtCond | BsmtExposure | BsmtFinType1 | BsmtFinType2 | Electrical | FireplaceQu | GarageType | GarageFinish | GarageQual | GarageCond | PoolQC | Fence | MiscFeature | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
529 | Pave | None | TA | TA | No | Rec | Unf | SBrkr | TA | Attchd | RFn | TA | TA | Gd | MnPrv | Shed |
491 | Pave | None | TA | TA | No | BLQ | Rec | FuseA | TA | Attchd | Unf | TA | TA | Gd | MnPrv | Shed |
459 | Pave | BrkCmn | TA | TA | No | LwQ | Unf | SBrkr | TA | Detchd | Unf | TA | TA | Gd | MnPrv | Shed |
279 | Pave | BrkFace | Gd | TA | No | BLQ | Unf | SBrkr | TA | Attchd | Fin | TA | TA | Gd | MnPrv | Shed |
655 | Pave | BrkFace | TA | TA | No | Unf | Unf | SBrkr | Gd | Detchd | Unf | TA | TA | Gd | MnPrv | Shed |