In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

from feature_engine.wrappers import SklearnTransformerWrapper
from feature_engine.encoding import RareLabelEncoder
In [2]:
# Read the house prices dataset from Kaggle. The CSV file is looked up
# relative to the directory the notebook is run from.

data = pd.read_csv(filepath_or_buffer='houseprice.csv')

# preview the first 5 rows
data.head(5)
Out[2]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
0 1 60 RL 65.0 8450 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 2 2008 WD Normal 208500
1 2 20 RL 80.0 9600 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 5 2007 WD Normal 181500
2 3 60 RL 68.0 11250 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 9 2008 WD Normal 223500
3 4 70 RL 60.0 9550 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 2 2006 WD Abnorml 140000
4 5 60 RL 84.0 14260 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 12 2008 WD Normal 250000

5 rows × 81 columns

In [3]:
# Split the data into a training set and a testing set (30% of the rows
# go to the test set). 'SalePrice' is the target; 'Id' and 'SalePrice'
# are dropped from the predictors. random_state makes the split
# reproducible.

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['Id', 'SalePrice']),
    data['SalePrice'],
    random_state=0,
    test_size=0.3,
)

# display the (rows, columns) of both sets
(X_train.shape, X_test.shape)
Out[3]:
((1022, 79), (438, 79))

OrdinalEncoder

In [4]:
# Variables that are rare-label grouped and then ordinal encoded in the
# cells below. The order matters: the fitted categories are reported in
# this same order.
cols = [
    'Alley',
    'MasVnrType',
    # basement
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    'Electrical',
    'FireplaceQu',
    # garage
    'GarageType',
    'GarageFinish',
    'GarageQual',
]
In [5]:
# let's remove rare labels to avoid errors when encoding
#
# Missing values are replaced by the string 'Missing' only in the
# variables listed in `cols`. Calling fillna('Missing') on the whole
# dataframe would also write that string into any numerical column that
# contains NaN, silently turning it into an object column.

missing_fill = {col: 'Missing' for col in cols}

rare_label_enc = RareLabelEncoder(n_categories=2, variables=cols)

# the encoder is fitted on the train set only and then applied to both sets
X_train = rare_label_enc.fit_transform(X_train.fillna(missing_fill))
X_test = rare_label_enc.transform(X_test.fillna(missing_fill))
In [6]:
# Replace the categories by integers: the wrapper applies scikit-learn's
# OrdinalEncoder only to the variables in `cols`.

encoder = SklearnTransformerWrapper(variables=cols, transformer=OrdinalEncoder())

# learn the category -> integer mappings from the train set
encoder.fit(X_train)
Out[6]:
SklearnTransformerWrapper(transformer=OrdinalEncoder(),
                          variables=['Alley', 'MasVnrType', 'BsmtQual',
                                     'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
                                     'BsmtFinType2', 'Electrical',
                                     'FireplaceQu', 'GarageType',
                                     'GarageFinish', 'GarageQual'])
In [7]:
# The fitted scikit-learn transformer is stored in the wrapper's
# `transformer_` attribute, so we can navigate to its learned parameters
# like this. `categories_` holds one array of categories per encoded
# variable.

encoder.transformer_.categories_
Out[7]:
[array(['Missing', 'Rare'], dtype=object),
 array(['BrkFace', 'None', 'Rare', 'Stone'], dtype=object),
 array(['Ex', 'Gd', 'Rare', 'TA'], dtype=object),
 array(['Rare', 'TA'], dtype=object),
 array(['Av', 'Gd', 'Mn', 'No', 'Rare'], dtype=object),
 array(['ALQ', 'BLQ', 'GLQ', 'Rare', 'Rec', 'Unf'], dtype=object),
 array(['Rare', 'Unf'], dtype=object),
 array(['FuseA', 'Rare', 'SBrkr'], dtype=object),
 array(['Gd', 'Missing', 'Rare', 'TA'], dtype=object),
 array(['Attchd', 'BuiltIn', 'Detchd', 'Missing', 'Rare'], dtype=object),
 array(['Fin', 'Missing', 'RFn', 'Unf'], dtype=object),
 array(['Missing', 'Rare', 'TA'], dtype=object)]
In [8]:
# Apply the fitted encoder to both sets. The same names are reused, so
# from here on X_train and X_test hold the encoded variables.

X_train, X_test = (encoder.transform(frame) for frame in (X_train, X_test))

# fraction of missing values in each encoded variable of the train set
X_train[cols].isna().mean()
Out[8]:
Alley           0.0
MasVnrType      0.0
BsmtQual        0.0
BsmtCond        0.0
BsmtExposure    0.0
BsmtFinType1    0.0
BsmtFinType2    0.0
Electrical      0.0
FireplaceQu     0.0
GarageType      0.0
GarageFinish    0.0
GarageQual      0.0
dtype: float64
In [9]:
# preview the encoded variables in the first 5 rows of the test set
X_test.loc[:, cols].head(5)
Out[9]:
Alley MasVnrType BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinType2 Electrical FireplaceQu GarageType GarageFinish GarageQual
529 0.0 2.0 3.0 1.0 3.0 4.0 1.0 2.0 3.0 0.0 2.0 2.0
491 0.0 1.0 3.0 1.0 3.0 1.0 0.0 0.0 3.0 0.0 3.0 2.0
459 0.0 2.0 3.0 1.0 3.0 3.0 1.0 2.0 3.0 2.0 3.0 2.0
279 0.0 0.0 1.0 1.0 3.0 1.0 1.0 2.0 3.0 0.0 0.0 2.0
655 0.0 0.0 3.0 1.0 3.0 5.0 1.0 2.0 1.0 2.0 3.0 2.0
In [ ]: