Winsorizer finds maximum and minimum values following a Gaussian or skewed distribution as indicated. It can also cap the right, left or both ends of the distribution.
The Winsorizer() caps maximum and / or minimum values of a variable.
The Winsorizer() works only with numerical variables. A list of variables can be indicated. Alternatively, the Winsorizer() will select all numerical variables in the train set.
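The Winsorizer() first calculates the capping values at the end of the distribution. The values are determined using one of three methods, selected with capping_method: a Gaussian approximation (mean plus or minus fold times the standard deviation), the inter-quartile range (IQR) proximity rule, or quantiles.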
# importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from feature_engine.outliers import Winsorizer
# Load titanic dataset from OpenML
def load_titanic(source='https://www.openml.org/data/get_csv/16826755/phpMYEkMl'):
    """Load the Titanic dataset and apply light cleaning.

    Parameters
    ----------
    source : str or file-like, default=OpenML Titanic CSV URL
        Anything accepted by ``pd.read_csv``. The default downloads the
        dataset from OpenML, which requires network access.

    Returns
    -------
    pandas.DataFrame
        The dataset with:
        - '?' placeholders replaced by NaN,
        - 'cabin' reduced to the first character of its string form,
        - 'pclass' cast to object,
        - missing 'embarked' filled with 'C',
        - 'fare' and 'age' cast to float, missing values filled with the
          column median,
        - the 'name' and 'ticket' columns dropped.
    """
    data = pd.read_csv(source)
    data = data.replace('?', np.nan)

    # Keep only the first character of the cabin code
    data['cabin'] = data['cabin'].astype(str).str[0]
    data['pclass'] = data['pclass'].astype('O')

    # Assign the result back instead of `data[col].fillna(..., inplace=True)`:
    # the chained in-place form operates on a temporary under pandas
    # Copy-on-Write and would leave the DataFrame unchanged.
    data['embarked'] = data['embarked'].fillna('C')

    data['fare'] = data['fare'].astype('float')
    data['fare'] = data['fare'].fillna(data['fare'].median())

    data['age'] = data['age'].astype('float')
    data['age'] = data['age'].fillna(data['age'].median())

    data = data.drop(['name', 'ticket'], axis=1)
    return data
# To plot histogram of given numerical feature
def plot_hist(data, col, bins=30):
    """Draw and display a histogram of one column.

    Parameters
    ----------
    data : pandas.DataFrame
        Any object supporting ``data[col]`` works; the notebook passes
        DataFrames.
    col : hashable
        Label of the column to plot. It is also shown in the figure title.
    bins : int, default=30
        Number of histogram bins, forwarded to ``plt.hist``.

    Returns
    -------
    The return value of ``plt.show()``.
    """
    plt.figure(figsize=(8, 5))
    plt.hist(data[col], bins=bins)
    # f-string rather than "..." + col, so non-string labels do not raise
    plt.title(f"Distribution of {col}")
    return plt.show()
# Load the cleaned Titanic dataset and preview 5 randomly sampled rows
data = load_titanic()
data.sample(5)
pclass | survived | sex | age | sibsp | parch | fare | cabin | embarked | boat | body | home.dest | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
157 | 1 | 0 | male | 28.0 | 0 | 0 | 51.8625 | E | S | NaN | NaN | Brighton, MA |
400 | 2 | 1 | female | 34.0 | 1 | 1 | 32.5000 | n | S | 10 | NaN | Greenport, NY |
546 | 2 | 1 | female | 28.0 | 0 | 0 | 13.0000 | n | S | 9 | NaN | Spain |
618 | 3 | 0 | male | 35.0 | 0 | 0 | 8.0500 | n | S | NaN | NaN | Lower Clapton, Middlesex or Erdington, Birmingham |
1208 | 3 | 0 | female | 9.0 | 3 | 2 | 27.9000 | n | S | NaN | NaN | NaN |
# Separate into training and testing sets: 'survived' is the target,
# 30% of the rows are held out for testing, and random_state=0 fixes the split
X_train, X_test, y_train, y_test = train_test_split(data.drop('survived', axis=1),
data['survived'],
test_size=0.3,
random_state=0)
# Report the (rows, columns) shape of each split
print("train data:", X_train.shape)
print("test data:", X_test.shape)
train data: (916, 11) test data: (393, 11)
# Maximum age and maximum fare in the full dataset, before any capping
print("Max age:", data.age.max())
print("Max fare:", data.fare.max())
Max age: 80.0 Max fare: 512.3292
# Histogram of the age feature in the full dataset, before capping outliers
plot_hist(data, 'age')
# Histogram of the fare feature in the full dataset, before capping outliers
plot_hist(data, 'fare')
'''Parameters
----------
capping_method : str, default=gaussian
Desired capping method. Can take 'gaussian', 'iqr' or 'quantiles'.
tail : str, default=right
Whether to cap outliers on the right, left or both tails of the distribution.
Can take 'left', 'right' or 'both'.
fold: int or float, default=3
How far out to place the capping values. The number that will multiply
the std or IQR to calculate the capping values. Recommended values, 2
or 3 for the gaussian approximation, or 1.5 or 3 for the IQR proximity
rule.
variables: list, default=None
missing_values: string, default='raise'
Indicates if missing values should be ignored or raised.
'''
# Cap only the right tail of 'age' and 'fare' using the gaussian capping
# method, with fold=3 as the multiplier of the standard deviation
capper = Winsorizer(
capping_method='gaussian', tail='right', fold=3, variables=['age', 'fare'])
# Learn the capping values from the training data
capper.fit(X_train)
Winsorizer(variables=['age', 'fare'])
# Upper capping value learned for each variable, keyed by variable name
capper.right_tail_caps_
{'age': 67.49048447470315, 'fare': 174.78162171790441}
# This dictionary is empty because tail='right' was selected,
# so no lower caps were learned
capper.left_tail_caps_
{}
# Histogram of the age feature after capping outliers
plot_hist(capper.transform(X_train), 'age')
# Apply the learned caps to both the training and the testing data
train_t = capper.transform(X_train)
test_t = capper.transform(X_test)
# New maximum age and fare in the capped training set; they match the
# right-tail caps reported by capper.right_tail_caps_
train_t.age.max(), train_t.fare.max()
(67.49048447470315, 174.78162171790441)
# Cap the outliers at both tails of 'fare' using the gaussian capping method
# with fold=2. A single variable name is passed here as a plain string; the
# fitted estimator's repr shows it as variables=['fare'].
winsor = Winsorizer(capping_method='gaussian',
tail='both', fold=2, variables='fare')
# Learn the capping values from the training data
winsor.fit(X_train)
Winsorizer(fold=2, tail='both', variables=['fare'])
# Lower and upper capping values learned for 'fare'
print("Minimum caps :", winsor.left_tail_caps_)
print("Maximum caps :", winsor.right_tail_caps_)
Minimum caps : {'fare': -62.30099726608475} Maximum caps : {'fare': 127.36509792110658}
# Histogram of the fare feature after capping outliers
plot_hist(winsor.transform(X_train), 'fare')
# Apply the learned caps to both the training and the testing data
train_t = winsor.transform(X_train)
test_t = winsor.transform(X_test)
# Fare range of the capped training set. The learned lower cap is negative,
# so only the maximum is affected and the minimum stays at 0.0.
print("Max fare:", train_t.fare.max())
print("Min fare:", train_t.fare.min())
Max fare: 127.36509792110658 Min fare: 0.0
IQR limits:
right tail: 75th quantile + fold * IQR
left tail: 25th quantile - fold * IQR
where IQR is the inter-quartile range: 75th quantile - 25th quantile.
# Cap both tails of 'age' and 'fare' using the iqr capping method;
# fold is not passed, so its default is used
winsor = Winsorizer(capping_method='iqr', tail='both',
variables=['age', 'fare'])
# Learn the capping values from the training data
winsor.fit(X_train)
Winsorizer(capping_method='iqr', tail='both', variables=['age', 'fare'])
# Lower capping value learned for each variable
winsor.left_tail_caps_
{'age': -13.0, 'fare': -62.24179999999999}
# Upper capping value learned for each variable
winsor.right_tail_caps_
{'age': 71.0, 'fare': 101.4126}
# Apply the learned caps to both the training and the testing data
train_t = winsor.transform(X_train)
test_t = winsor.transform(X_test)
# Fare range of the capped training set
print("Max fare:", train_t.fare.max())
print("Min fare", train_t.fare.min())
Max fare: 101.4126 Min fare 0.0
# Cap both tails of 'age' and 'fare' using the quantiles capping method.
# NOTE(review): presumably fold=0.02 places the caps at the 2nd and 98th
# percentiles - confirm against the feature_engine documentation.
winsor = Winsorizer(capping_method='quantiles', tail='both',
fold=0.02, variables=['age', 'fare'])
# Learn the capping values from the training data
winsor.fit(X_train)
Winsorizer(capping_method='quantiles', fold=0.02, tail='both', variables=['age', 'fare'])
# Lower and upper capping values learned for each variable
print("Minimum caps :", winsor.left_tail_caps_)
print("Maximum caps :", winsor.right_tail_caps_)
Minimum caps : {'age': 2.0, 'fare': 6.44125} Maximum caps : {'age': 61.69999999999993, 'fare': 211.5}
# Apply the learned caps to both the training and the testing data
train_t = winsor.transform(X_train)
test_t = winsor.transform(X_test)
# Age range of the capped training set
print("Max age:", train_t.age.max())
print("Min age", train_t.age.min())
Max age: 61.69999999999993 Min age 2.0
# Histogram of the age feature in the capped training set
plot_hist(train_t, 'age')