#!/usr/bin/env python
# coding: utf-8

# ## Univariate Single Feature Performance
#
# - Train a machine learning model using each individual feature
# - Determine the performance of each model
# - Select the features whose model performance is above a certain threshold

# In[2]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import roc_auc_score, mean_squared_error

from feature_engine.selection import SelectBySingleFeaturePerformance


# ## Classification

# In[2]:


# load the classification dataset into `data`
# (a DataFrame with the predictor columns and a binary 'target' column)


# In[3]:


data.head()


# **Important**
#
# In all feature selection procedures, it is good practice to select the
# features by examining only the training set. This is to avoid overfitting.

# In[4]:


# separate train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['target'], axis=1),
    data['target'],
    test_size=0.3,
    random_state=0)

X_train.shape, X_test.shape


# In[5]:


# set up a machine learning model
rf = RandomForestClassifier(
    n_estimators=10, random_state=1, n_jobs=4)

# set up the selector
sel = SelectBySingleFeaturePerformance(
    variables=None, estimator=rf, scoring="roc_auc", cv=3, threshold=0.5)

# find predictive features
sel.fit(X_train, y_train)


# In[6]:


# the transformer stores a dictionary of feature:metric pairs,
# in this case the roc-auc of each individual model
sel.feature_performance_
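# To make explicit what the selector computes, the same scores can be
# reproduced manually: train one model per feature with cross-validation and
# record the mean roc-auc. This is a minimal sketch of the idea, not
# feature_engine's actual implementation.

# In[ ]:


from sklearn.model_selection import cross_val_score

# one cross-validated score per individual feature
manual_performance = {
    feature: cross_val_score(
        rf, X_train[[feature]], y_train, cv=3, scoring="roc_auc").mean()
    for feature in X_train.columns
}

# these values should match sel.feature_performance_
manual_performance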
# In[7]:


# we can plot each model's performance, sorted by value
pd.Series(sel.feature_performance_).sort_values(
    ascending=False).plot.bar(figsize=(20, 5))
plt.title('Performance of ML models trained with individual features')
plt.ylabel('roc-auc')


# In[8]:


# the number of features that will be removed
len(sel.features_to_drop_)


# In[9]:


# remove non-predictive features
X_train = sel.transform(X_train)
X_test = sel.transform(X_test)

X_train.shape, X_test.shape


# ## Regression
#
# ### With r2 and a user-specified threshold

# In[3]:


# load dataset
data = pd.read_csv('../houseprice.csv')
data.shape


# In[4]:


# I will use only the numerical variables

# select numerical columns:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerical_vars = list(data.select_dtypes(include=numerics).columns)
data = data[numerical_vars]

data.shape


# In[5]:


data.head()


# In[6]:


# fill missing values
data.fillna(0, inplace=True)


# In[7]:


# separate train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['Id', 'SalePrice'], axis=1),
    data['SalePrice'],
    test_size=0.3,
    random_state=0)

X_train.shape, X_test.shape


# In[8]:


# set up the machine learning model
rf = RandomForestRegressor(
    n_estimators=10, max_depth=2, random_state=1, n_jobs=4)

# set up the selector
sel = SelectBySingleFeaturePerformance(
    variables=None, estimator=rf, scoring="r2", cv=3, threshold=0.5)

# find predictive features
sel.fit(X_train, y_train)


# In[9]:


# the transformer stores a dictionary of feature:metric pairs;
# notice that the r2 can be positive or negative,
# and the selector selects based on the absolute value
sel.feature_performance_


# In[10]:


pd.Series(sel.feature_performance_).sort_values(
    ascending=False).plot.bar(figsize=(20, 5))
plt.title('Performance of ML models trained with individual features')
plt.ylabel('r2')


# In[11]:


# same plot, but taking the absolute value of the r2
np.abs(pd.Series(sel.feature_performance_)).sort_values(
    ascending=False).plot.bar(figsize=(20, 5))
plt.title('Performance of ML models trained with individual features')
plt.ylabel('r2 - absolute value')


# In[12]:


# the number of features that will be removed
len(sel.features_to_drop_)


# In[13]:


# select the features in the dataframes
X_train = sel.transform(X_train)
X_test = sel.transform(X_test)

X_train.shape, X_test.shape


# ### Automatically select the threshold
#
# If we leave the threshold as None, it will automatically be set to the mean
# performance across all features.

# In[14]:


# separate train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['Id', 'SalePrice'], axis=1),
    data['SalePrice'],
    test_size=0.3,
    random_state=0)

X_train.shape, X_test.shape


# In[15]:


# set up the machine learning model
rf = RandomForestRegressor(
    n_estimators=10, max_depth=2, random_state=1, n_jobs=4)

# set up the selector
sel = SelectBySingleFeaturePerformance(
    variables=None,
    estimator=rf,
    scoring="neg_mean_squared_error",
    cv=3,
    threshold=None)

# find predictive features
sel.fit(X_train, y_train)


# In[16]:


# the transformer stores a dictionary of feature:metric pairs;
# the selector will keep those features whose negative mean squared error
# is greater than the mean of the negative mean squared errors of all features
sel.feature_performance_


# In[17]:


pd.Series(sel.feature_performance_).sort_values(
    ascending=False).plot.bar(figsize=(20, 5))
plt.title('Performance of ML models trained with individual features')
plt.ylabel('Negative Mean Squared Error')


# In[18]:


# the features that will be dropped
sel.features_to_drop_


# In[20]:


# note that these are the features with the worst (most negative) scores
pd.Series(sel.feature_performance_)[sel.features_to_drop_].sort_values(
    ascending=False).plot.bar(figsize=(20, 5))
plt.title('Performance of the features that will be dropped')
plt.ylabel('Negative Mean Squared Error')
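# As a sanity check, the automatic threshold can be reproduced by hand. This
# is a minimal sketch based on the behaviour described above (the threshold
# is assumed to be the mean of all feature performances), not feature_engine's
# actual implementation.

# In[ ]:


# compute the mean performance across all features
mean_performance = np.mean(list(sel.feature_performance_.values()))

# features scoring below the mean should be the ones the selector drops
manual_drop = [
    feature for feature, performance in sel.feature_performance_.items()
    if performance < mean_performance
]

set(manual_drop) == set(sel.features_to_drop_)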