import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from feature_engine.wrappers import SklearnTransformerWrapper
from feature_engine.encoding import RareLabelEncoder
# Read the Kaggle house-price CSV into a DataFrame and preview the first rows.
data = pd.read_csv(filepath_or_buffer='houseprice.csv')
data.head(n=5)
|  | Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | ... | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 60 | RL | 65.0 | 8450 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal | 208500 |
1 | 2 | 20 | RL | 80.0 | 9600 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal | 181500 |
2 | 3 | 60 | RL | 68.0 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal | 223500 |
3 | 4 | 70 | RL | 60.0 | 9550 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2006 | WD | Abnorml | 140000 |
4 | 5 | 60 | RL | 84.0 | 14260 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 12 | 2008 | WD | Normal | 250000 |
5 rows × 81 columns
# Hold out 30% of the rows as a test set; the fixed seed keeps the split
# reproducible. 'Id' and the target 'SalePrice' are removed from the predictors.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['Id', 'SalePrice']),
    data['SalePrice'],
    random_state=0,
    test_size=0.3,
)
(X_train.shape, X_test.shape)
((1022, 79), (438, 79))
# Categorical variables that will be rare-label grouped and ordinal-encoded.
cols = [
    'Alley', 'MasVnrType',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'Electrical', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual',
]
# Group infrequent labels under a common label so the ordinal encoder does not
# fail on categories it has not seen.
#
# Missing values are replaced by the string 'Missing' only in the columns being
# encoded. A frame-wide fillna('Missing') would also write that string into any
# numeric column that contains NaNs, silently turning it into an object column.
missing_fill = dict.fromkeys(cols, 'Missing')

rare_label_enc = RareLabelEncoder(n_categories=2, variables=cols)
# Learn the frequent labels on the training set only, then apply to both splits.
X_train = rare_label_enc.fit_transform(X_train.fillna(value=missing_fill))
X_test = rare_label_enc.transform(X_test.fillna(value=missing_fill))
# Wrap sklearn's OrdinalEncoder so it is applied to the variables in `cols`,
# then learn the category-to-integer mapping from the training set.
encoder = SklearnTransformerWrapper(variables=cols, transformer=OrdinalEncoder())
encoder.fit(X_train)
SklearnTransformerWrapper(transformer=OrdinalEncoder(), variables=['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual'])
# After fitting, the wrapped sklearn transformer is reachable through the
# wrapper's `transformer_` attribute, so its learned parameters can be
# inspected directly -- here, the arrays of categories found by OrdinalEncoder:
encoder.transformer_.categories_
[array(['Missing', 'Rare'], dtype=object), array(['BrkFace', 'None', 'Rare', 'Stone'], dtype=object), array(['Ex', 'Gd', 'Rare', 'TA'], dtype=object), array(['Rare', 'TA'], dtype=object), array(['Av', 'Gd', 'Mn', 'No', 'Rare'], dtype=object), array(['ALQ', 'BLQ', 'GLQ', 'Rare', 'Rec', 'Unf'], dtype=object), array(['Rare', 'Unf'], dtype=object), array(['FuseA', 'Rare', 'SBrkr'], dtype=object), array(['Gd', 'Missing', 'Rare', 'TA'], dtype=object), array(['Attchd', 'BuiltIn', 'Detchd', 'Missing', 'Rare'], dtype=object), array(['Fin', 'Missing', 'RFn', 'Unf'], dtype=object), array(['Missing', 'Rare', 'TA'], dtype=object)]
# Replace the labels in `cols` with their integer codes in both splits,
# then report the fraction of nulls left in the encoded training columns.
X_train, X_test = encoder.transform(X_train), encoder.transform(X_test)
X_train[cols].isna().mean()
Alley 0.0 MasVnrType 0.0 BsmtQual 0.0 BsmtCond 0.0 BsmtExposure 0.0 BsmtFinType1 0.0 BsmtFinType2 0.0 Electrical 0.0 FireplaceQu 0.0 GarageType 0.0 GarageFinish 0.0 GarageQual 0.0 dtype: float64
# Preview the encoded columns of the test set.
X_test.loc[:, cols].head(n=5)
|  | Alley | MasVnrType | BsmtQual | BsmtCond | BsmtExposure | BsmtFinType1 | BsmtFinType2 | Electrical | FireplaceQu | GarageType | GarageFinish | GarageQual |
---|---|---|---|---|---|---|---|---|---|---|---|---|
529 | 0.0 | 2.0 | 3.0 | 1.0 | 3.0 | 4.0 | 1.0 | 2.0 | 3.0 | 0.0 | 2.0 | 2.0 |
491 | 0.0 | 1.0 | 3.0 | 1.0 | 3.0 | 1.0 | 0.0 | 0.0 | 3.0 | 0.0 | 3.0 | 2.0 |
459 | 0.0 | 2.0 | 3.0 | 1.0 | 3.0 | 3.0 | 1.0 | 2.0 | 3.0 | 2.0 | 3.0 | 2.0 |
279 | 0.0 | 0.0 | 1.0 | 1.0 | 3.0 | 1.0 | 1.0 | 2.0 | 3.0 | 0.0 | 0.0 | 2.0 |
655 | 0.0 | 0.0 | 3.0 | 1.0 | 3.0 | 5.0 | 1.0 | 2.0 | 1.0 | 2.0 | 3.0 | 2.0 |