import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import (
f_regression,
SelectKBest,
SelectFromModel,
)
from sklearn.linear_model import Lasso
from feature_engine.wrappers import SklearnTransformerWrapper
# read the house price data from disk and preview the first rows
data = pd.read_csv('houseprice.csv')
data.head(5)
Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | ... | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 60 | RL | 65.0 | 8450 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal | 208500 |
1 | 2 | 20 | RL | 80.0 | 9600 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal | 181500 |
2 | 3 | 60 | RL | 68.0 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal | 223500 |
3 | 4 | 70 | RL | 60.0 | 9550 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2006 | WD | Abnorml | 140000 |
4 | 5 | 60 | RL | 84.0 | 14260 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 12 | 2008 | WD | Normal | 250000 |
5 rows × 81 columns
# hold out 30% of the rows for testing; the identifier and the target
# (SalePrice) are removed from the predictors
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['Id', 'SalePrice']),
    data['SalePrice'],
    test_size=0.3,
    random_state=0,
)

# (rows, columns) of each partition
X_train.shape, X_test.shape
((1022, 79), (438, 79))
# candidate variables for selection: every column whose dtype is not object
cols = [name for name, dtype in X_train.dtypes.items() if dtype != 'O']
cols
['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']
# wrap SelectKBest so that it only examines the variables in `cols`,
# keeping the 5 with the highest f_regression score
selector = SklearnTransformerWrapper(
    transformer=SelectKBest(score_func=f_regression, k=5),
    variables=cols,
)

# missing values are replaced by 0 before fitting
selector.fit(X_train.fillna(0), y_train)
SklearnTransformerWrapper(transformer=SelectKBest(k=5, score_func=<function f_regression at 0x0000007EFF7D7F70>), variables=['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', ...])
# integer positions of the features kept by the fitted SelectKBest
# NOTE(review): the wrapper was given variables=cols, so these positions
# presumably index into `cols`, not into X_train.columns -- confirm
selector.transformer_.get_support(indices=True)
array([ 3, 11, 15, 25, 26], dtype=int64)
# selected features
# get_support(indices=True) returns positions within the variables the wrapped
# transformer was fitted on (`cols`). X_train.columns also contains the
# categorical columns, so indexing it with these positions returns the wrong
# names; map the positions back through `cols` instead.
pd.Index(cols)[selector.transformer_.get_support(indices=True)]
Index(['LotArea', 'Neighborhood', 'HouseStyle', 'MasVnrArea', 'ExterQual'], dtype='object')
# the output keeps the variables chosen among those passed to the
# transformer, together with every other column of the dataframe
# that was not part of the evaluation
X_train_t, X_test_t = (
    selector.transform(frame.fillna(0)) for frame in (X_train, X_test)
)

X_test_t.head()
LotArea | Neighborhood | HouseStyle | MasVnrArea | ExterQual | MSZoning | Street | Alley | LotShape | LandContour | ... | GarageType | GarageFinish | GarageQual | GarageCond | PavedDrive | PoolQC | Fence | MiscFeature | SaleType | SaleCondition | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
529 | 32668 | Crawfor | 1Story | 0.0 | Gd | RL | Pave | 0 | IR1 | Lvl | ... | Attchd | RFn | TA | TA | Y | 0 | 0 | 0 | WD | Alloca |
491 | 9490 | NAmes | 1.5Fin | 0.0 | TA | RL | Pave | 0 | Reg | Lvl | ... | Attchd | Unf | TA | TA | Y | 0 | MnPrv | 0 | WD | Normal |
459 | 7015 | BrkSide | 1.5Fin | 161.0 | TA | RL | Pave | 0 | IR1 | Bnk | ... | Detchd | Unf | TA | TA | Y | 0 | 0 | 0 | WD | Normal |
279 | 10005 | ClearCr | 2Story | 299.0 | TA | RL | Pave | 0 | Reg | Lvl | ... | Attchd | Fin | TA | TA | Y | 0 | 0 | 0 | WD | Normal |
655 | 1680 | BrDale | 2Story | 381.0 | TA | RM | Pave | 0 | Reg | Lvl | ... | Detchd | Unf | TA | TA | Y | 0 | 0 | 0 | WD | Family |
5 rows × 48 columns
# feature selection driven by a Lasso model: SelectFromModel wraps the
# (not yet fitted) Lasso estimator, and the wrapper restricts it to `cols`
lasso = Lasso(alpha=10000, random_state=0)
sfm = SelectFromModel(estimator=lasso, prefit=False)

selector = SklearnTransformerWrapper(
    transformer=sfm,
    variables=cols,
)

# missing values are replaced by 0 before fitting
selector.fit(X_train.fillna(0), y_train)
SklearnTransformerWrapper(transformer=SelectFromModel(estimator=Lasso(alpha=10000, random_state=0)), variables=['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', ...])
# integer positions of the features kept by the fitted SelectFromModel
# NOTE(review): the wrapper was given variables=cols, so these positions
# presumably index into `cols`, not into X_train.columns -- confirm
selector.transformer_.get_support(indices=True)
array([ 0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 26, 27, 28, 29, 30, 31, 32, 33], dtype=int64)
# number of features retained by the Lasso-based selector
len(selector.transformer_.get_support(indices=True))
24
# number of candidate variables that were passed to the selector
len(cols)
36
# the output keeps the variables chosen among those passed to the
# transformer, together with every other column of the dataframe
# that was not part of the evaluation
X_train_t, X_test_t = (
    selector.transform(frame.fillna(0)) for frame in (X_train, X_test)
)

X_test_t.head()
MSSubClass | MSZoning | LotFrontage | LotArea | Alley | LotShape | LandContour | Utilities | LotConfig | LandSlope | ... | GarageType | GarageFinish | GarageQual | GarageCond | PavedDrive | PoolQC | Fence | MiscFeature | SaleType | SaleCondition | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
529 | 20 | RL | 0.0 | 32668 | 0 | IR1 | Lvl | AllPub | CulDSac | Gtl | ... | Attchd | RFn | TA | TA | Y | 0 | 0 | 0 | WD | Alloca |
491 | 50 | RL | 79.0 | 9490 | 0 | Reg | Lvl | AllPub | Inside | Gtl | ... | Attchd | Unf | TA | TA | Y | 0 | MnPrv | 0 | WD | Normal |
459 | 50 | RL | 0.0 | 7015 | 0 | IR1 | Bnk | AllPub | Corner | Gtl | ... | Detchd | Unf | TA | TA | Y | 0 | 0 | 0 | WD | Normal |
279 | 60 | RL | 83.0 | 10005 | 0 | Reg | Lvl | AllPub | Inside | Gtl | ... | Attchd | Fin | TA | TA | Y | 0 | 0 | 0 | WD | Normal |
655 | 160 | RM | 21.0 | 1680 | 0 | Reg | Lvl | AllPub | Inside | Gtl | ... | Detchd | Unf | TA | TA | Y | 0 | 0 | 0 | WD | Family |
5 rows × 67 columns