DropCorrelatedFeatures
In this tutorial we show how to pass a custom association measure to DropCorrelatedFeatures, using Distance Correlation from the Python package dcor.
import pandas as pd
import dcor
import warnings
from sklearn.datasets import make_classification
from feature_engine.selection import DropCorrelatedFeatures
warnings.filterwarnings('ignore')
# create a toy dataset with 12 numerical features, 6 of them redundant
X, _ = make_classification(
    n_samples=1000,
    n_features=12,
    n_redundant=6,
    n_clusters_per_class=1,
    weights=[0.50],
    class_sep=2,
    random_state=1,
)
colnames = ["var_" + str(i) for i in range(12)]
X = pd.DataFrame(X, columns=colnames)
X
| | var_0 | var_1 | var_2 | var_3 | var_4 | var_5 | var_6 | var_7 | var_8 | var_9 | var_10 | var_11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -0.718421 | -0.306430 | 0.477337 | 1.662651 | 1.621889 | -0.226039 | 2.089741 | -2.145033 | 2.616778 | 0.074477 | 1.402662 | 1.599289 |
1 | 0.584286 | -0.871870 | 1.490290 | 3.644921 | 3.584239 | -0.750463 | -0.024631 | -4.525042 | 5.518534 | 1.788593 | 3.077793 | 3.188758 |
2 | -1.644619 | -0.391961 | 0.891121 | 2.232705 | 2.175168 | -0.278656 | -1.145170 | -2.897788 | 3.535246 | -0.796662 | 1.883299 | 2.178584 |
3 | 1.795776 | -2.645368 | 1.568321 | 1.449491 | 1.754788 | -3.226923 | 0.626374 | 0.238043 | -0.310298 | 1.247212 | 1.256478 | -2.376344 |
4 | -0.683522 | -1.420178 | -0.120177 | 1.019803 | 1.171396 | -1.708503 | -0.114110 | -0.223424 | 0.262247 | 0.322612 | 0.877768 | -0.972715 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
995 | 0.379855 | -0.529128 | -0.093361 | 2.668557 | 2.608481 | -0.410322 | -1.343059 | -3.409712 | 4.159278 | -1.287548 | 2.251801 | 2.507712 |
996 | 0.410435 | -1.590386 | 0.301589 | 0.962002 | 1.140932 | -1.931062 | 0.010015 | 0.011464 | -0.025811 | -1.124970 | 0.831563 | -1.315063 |
997 | 0.562542 | -0.173591 | -0.551323 | 1.456996 | 1.407670 | -0.077131 | -1.215225 | -1.963863 | 2.396559 | 1.678760 | 1.227821 | 1.551989 |
998 | 0.187248 | -0.355866 | -1.385539 | 1.304138 | 1.288720 | -0.324460 | 0.260543 | -1.580115 | 1.926655 | -1.330030 | 1.101843 | 1.071300 |
999 | 0.105134 | -2.982815 | 0.309657 | 2.085668 | 2.406926 | -3.593946 | -0.339890 | -0.387522 | 0.451001 | -0.221839 | 1.796291 | -2.113529 |
1000 rows × 12 columns
dcor_tr = DropCorrelatedFeatures(
    variables=None, method=dcor.distance_correlation, threshold=0.8
)
X_dcor = dcor_tr.fit_transform(X)
X_dcor
| | var_0 | var_1 | var_2 | var_3 | var_6 | var_7 | var_9 |
---|---|---|---|---|---|---|---|
0 | -0.718421 | -0.306430 | 0.477337 | 1.662651 | 2.089741 | -2.145033 | 0.074477 |
1 | 0.584286 | -0.871870 | 1.490290 | 3.644921 | -0.024631 | -4.525042 | 1.788593 |
2 | -1.644619 | -0.391961 | 0.891121 | 2.232705 | -1.145170 | -2.897788 | -0.796662 |
3 | 1.795776 | -2.645368 | 1.568321 | 1.449491 | 0.626374 | 0.238043 | 1.247212 |
4 | -0.683522 | -1.420178 | -0.120177 | 1.019803 | -0.114110 | -0.223424 | 0.322612 |
... | ... | ... | ... | ... | ... | ... | ... |
995 | 0.379855 | -0.529128 | -0.093361 | 2.668557 | -1.343059 | -3.409712 | -1.287548 |
996 | 0.410435 | -1.590386 | 0.301589 | 0.962002 | 0.010015 | 0.011464 | -1.124970 |
997 | 0.562542 | -0.173591 | -0.551323 | 1.456996 | -1.215225 | -1.963863 | 1.678760 |
998 | 0.187248 | -0.355866 | -1.385539 | 1.304138 | 0.260543 | -1.580115 | -1.330030 |
999 | 0.105134 | -2.982815 | 0.309657 | 2.085668 | -0.339890 | -0.387522 | -0.221839 |
1000 rows × 7 columns
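Five of the twelve variables were dropped. To see which features were removed and how they grouped together, we can inspect the transformer's fitted attributes; a minimal sketch, assuming the standard DropCorrelatedFeatures attributes features_to_drop_ and correlated_feature_sets_:
# groups of variables whose association exceeded the threshold
print(dcor_tr.correlated_feature_sets_)
# variables removed from the dataframe (one variable per group is kept)
print(dcor_tr.features_to_drop_)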
In the next example, we use the function sklearn.feature_selection.mutual_info_regression to calculate the Mutual Information between two numerical variables, dropping one feature from each pair whose score exceeds the 0.8 threshold.
Remember that the callable should take two 1d ndarrays as input and return a float, so we define a custom function that wraps the sklearn method.
from sklearn.feature_selection import mutual_info_regression
def custom_mi(x, y):
    x = x.reshape(-1, 1)  # mutual_info_regression expects a 2d array of features
    # y stays 1d; mutual_info_regression returns an array with one score per feature
    return mutual_info_regression(x, y)[0]  # extract the single score as a float
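As a quick, optional sanity check (not part of the original flow), we can confirm the callable returns a single float for a pair of columns from X:
# should print one float: the MI score between var_3 and var_4
print(custom_mi(X["var_3"].to_numpy(), X["var_4"].to_numpy()))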
mi_tr = DropCorrelatedFeatures(
    variables=None, method=custom_mi, threshold=0.8
)
X_mi = mi_tr.fit_transform(X)
X_mi
| | var_0 | var_1 | var_2 | var_3 | var_6 | var_7 | var_9 |
---|---|---|---|---|---|---|---|
0 | -0.718421 | -0.306430 | 0.477337 | 1.662651 | 2.089741 | -2.145033 | 0.074477 |
1 | 0.584286 | -0.871870 | 1.490290 | 3.644921 | -0.024631 | -4.525042 | 1.788593 |
2 | -1.644619 | -0.391961 | 0.891121 | 2.232705 | -1.145170 | -2.897788 | -0.796662 |
3 | 1.795776 | -2.645368 | 1.568321 | 1.449491 | 0.626374 | 0.238043 | 1.247212 |
4 | -0.683522 | -1.420178 | -0.120177 | 1.019803 | -0.114110 | -0.223424 | 0.322612 |
... | ... | ... | ... | ... | ... | ... | ... |
995 | 0.379855 | -0.529128 | -0.093361 | 2.668557 | -1.343059 | -3.409712 | -1.287548 |
996 | 0.410435 | -1.590386 | 0.301589 | 0.962002 | 0.010015 | 0.011464 | -1.124970 |
997 | 0.562542 | -0.173591 | -0.551323 | 1.456996 | -1.215225 | -1.963863 | 1.678760 |
998 | 0.187248 | -0.355866 | -1.385539 | 1.304138 | 0.260543 | -1.580115 | -1.330030 |
999 | 0.105134 | -2.982815 | 0.309657 | 2.085668 | -0.339890 | -0.387522 | -0.221839 |
1000 rows × 7 columns
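Note that with a threshold of 0.8 both association measures retained the same seven variables. We can confirm that the two selections match:
# both transformers kept the same columns
print(list(X_dcor.columns) == list(X_mi.columns))  # True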