In [1]:

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

from feature_engine.wrappers import SklearnTransformerWrapper
from feature_engine.encoding import RareLabelEncoder

In [2]:

# read the Kaggle house-prices CSV from the working directory
# and preview its first rows

data = pd.read_csv(filepath_or_buffer='houseprice.csv')
data.head(n=5)

Out[2]:

	Id	MSSubClass	MSZoning	LotFrontage	LotArea	Street	Alley	LotShape	LandContour	Utilities	...	PoolQC	Fence	MiscFeature	MoSold	YrSold	SaleType	SaleCondition	SalePrice
0	1	60	RL	65.0	8450	Pave	NaN	Reg	Lvl	AllPub	...	NaN	NaN	NaN	2	2008	WD	Normal	208500
1	2	20	RL	80.0	9600	Pave	NaN	Reg	Lvl	AllPub	...	NaN	NaN	NaN	5	2007	WD	Normal	181500
2	3	60	RL	68.0	11250	Pave	NaN	IR1	Lvl	AllPub	...	NaN	NaN	NaN	9	2008	WD	Normal	223500
3	4	70	RL	60.0	9550	Pave	NaN	IR1	Lvl	AllPub	...	NaN	NaN	NaN	2	2006	WD	Abnorml	140000
4	5	60	RL	84.0	14260	Pave	NaN	IR1	Lvl	AllPub	...	NaN	NaN	NaN	12	2008	WD	Normal	250000

5 rows × 81 columns

In [3]:

# hold out 30% of the rows as a test set; the fixed random_state
# makes the split repeatable across runs

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['Id', 'SalePrice']),  # predictors: all but the id and the target
    data.loc[:, 'SalePrice'],                # target
    test_size=0.3,
    random_state=0,
)

# (rows, columns) of each partition
X_train.shape, X_test.shape

Out[3]:

((1022, 79), (438, 79))

OrdinalEncoder

In [4]:

# categorical variables that the cells below encode
cols = [
    'Alley',
    'MasVnrType',
    # basement
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'Electrical',
    'FireplaceQu',
    # garage
    'GarageType', 'GarageFinish', 'GarageQual',
]

In [5]:

# let's remove rare labels to avoid errors when encoding
#
# Missing values in the variables to encode are first labelled 'Missing', so
# that they become a category of their own. The fill is restricted to `cols`:
# a blanket fillna('Missing') would also write the string 'Missing' into every
# other column that contains NaN, including the numeric ones, turning them
# into mixed-type object columns.

# per-column fill values; columns that are not keys of this dict are left untouched
missing_fill = dict.fromkeys(cols, 'Missing')

rare_label_enc = RareLabelEncoder(n_categories=2, variables=cols)

# learn the frequent labels on the train set only, then apply them to both sets
X_train = rare_label_enc.fit_transform(X_train.fillna(missing_fill))
X_test = rare_label_enc.transform(X_test.fillna(missing_fill))

In [6]:

# wrap scikit-learn's OrdinalEncoder so that it is applied to `cols` only,
# then fit it on the training set

encoder = SklearnTransformerWrapper(transformer=OrdinalEncoder(), variables=cols)

encoder.fit(X_train)

Out[6]:

SklearnTransformerWrapper(transformer=OrdinalEncoder(),
                          variables=['Alley', 'MasVnrType', 'BsmtQual',
                                     'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
                                     'BsmtFinType2', 'Electrical',
                                     'FireplaceQu', 'GarageType',
                                     'GarageFinish', 'GarageQual'])

In [7]:

# the fitted scikit-learn transformer is available on the wrapper as
# `transformer_`, so its learned attributes can be read from there;
# `categories_` holds one array of categories per encoded variable:

encoder.transformer_.categories_

Out[7]:

[array(['Missing', 'Rare'], dtype=object),
 array(['BrkFace', 'None', 'Rare', 'Stone'], dtype=object),
 array(['Ex', 'Gd', 'Rare', 'TA'], dtype=object),
 array(['Rare', 'TA'], dtype=object),
 array(['Av', 'Gd', 'Mn', 'No', 'Rare'], dtype=object),
 array(['ALQ', 'BLQ', 'GLQ', 'Rare', 'Rec', 'Unf'], dtype=object),
 array(['Rare', 'Unf'], dtype=object),
 array(['FuseA', 'Rare', 'SBrkr'], dtype=object),
 array(['Gd', 'Missing', 'Rare', 'TA'], dtype=object),
 array(['Attchd', 'BuiltIn', 'Detchd', 'Missing', 'Rare'], dtype=object),
 array(['Fin', 'Missing', 'RFn', 'Unf'], dtype=object),
 array(['Missing', 'Rare', 'TA'], dtype=object)]

In [8]:

# replace the category labels with their integer codes in both sets

X_train = encoder.transform(X_train)
X_test = encoder.transform(X_test)

# fraction of null values per encoded variable in the train set
X_train.loc[:, cols].isna().mean()

Out[8]:

Alley           0.0
MasVnrType      0.0
BsmtQual        0.0
BsmtCond        0.0
BsmtExposure    0.0
BsmtFinType1    0.0
BsmtFinType2    0.0
Electrical      0.0
FireplaceQu     0.0
GarageType      0.0
GarageFinish    0.0
GarageQual      0.0
dtype: float64

In [9]:

# first rows of the encoded variables in the test set
X_test.loc[:, cols].head(n=5)

Out[9]:

	MasVnrType	BsmtQual	BsmtCond	BsmtExposure	BsmtFinType1	BsmtFinType2	Electrical	FireplaceQu	GarageType	GarageFinish	GarageQual
529	2.0	3.0	1.0	3.0	4.0	1.0	2.0	3.0	0.0	2.0	2.0
491	1.0	3.0	1.0	3.0	1.0	0.0	0.0	3.0	0.0	3.0	2.0
459	2.0	3.0	1.0	3.0	3.0	1.0	2.0	3.0	2.0	3.0	2.0
279	0.0	1.0	1.0	3.0	1.0	1.0	2.0	3.0	0.0	0.0	2.0
655	0.0	3.0	1.0	3.0	5.0	1.0	2.0	1.0	2.0	3.0	2.0

In [ ]:

	MasVnrType	BsmtQual	BsmtCond	BsmtExposure	BsmtFinType1	BsmtFinType2	Electrical	FireplaceQu	GarageType	GarageFinish	GarageQual
529	2.0	3.0	1.0	3.0	4.0	1.0	2.0	3.0	0.0	2.0	2.0
491	1.0	3.0	1.0	3.0	1.0	0.0	0.0	3.0	0.0	3.0	2.0
459	2.0	3.0	1.0	3.0	3.0	1.0	2.0	3.0	2.0	3.0	2.0
279	0.0	1.0	1.0	3.0	1.0	1.0	2.0	3.0	0.0	0.0	2.0
655	0.0	3.0	1.0	3.0	5.0	1.0	2.0	1.0	2.0	3.0	2.0

	MasVnrType	BsmtQual	BsmtCond	BsmtExposure	BsmtFinType1	BsmtFinType2	Electrical	FireplaceQu	GarageType	GarageFinish	GarageQual
529	2.0	3.0	1.0	3.0	4.0	1.0	2.0	3.0	0.0	2.0	2.0
491	1.0	3.0	1.0	3.0	1.0	0.0	0.0	3.0	0.0	3.0	2.0
459	2.0	3.0	1.0	3.0	3.0	1.0	2.0	3.0	2.0	3.0	2.0
279	0.0	1.0	1.0	3.0	1.0	1.0	2.0	3.0	0.0	0.0	2.0
655	0.0	3.0	1.0	3.0	5.0	1.0	2.0	1.0	2.0	3.0	2.0

	MasVnrType	BsmtQual	BsmtCond	BsmtExposure	BsmtFinType1	BsmtFinType2	Electrical	FireplaceQu	GarageType	GarageFinish	GarageQual
529	2.0	3.0	1.0	3.0	4.0	1.0	2.0	3.0	0.0	2.0	2.0
491	1.0	3.0	1.0	3.0	1.0	0.0	0.0	3.0	0.0	3.0	2.0
459	2.0	3.0	1.0	3.0	3.0	1.0	2.0	3.0	2.0	3.0	2.0
279	0.0	1.0	1.0	3.0	1.0	1.0	2.0	3.0	0.0	0.0	2.0
655	0.0	3.0	1.0	3.0	5.0	1.0	2.0	1.0	2.0	3.0	2.0