#!/usr/bin/env python
# coding: utf-8

# Sebastian Raschka, 2015-2022
# `mlxtend`, a library of extension and helper modules for Python's data analysis and machine learning libraries
#
# - GitHub repository: https://github.com/rasbt/mlxtend
# - Documentation: https://rasbt.github.io/mlxtend/

# In[28]:


get_ipython().run_line_magic('load_ext', 'watermark')
get_ipython().run_line_magic('watermark', "-a 'Sebastian Raschka' -u -d -v -p matplotlib,numpy,scipy,mlxtend")


# In[29]:


get_ipython().run_line_magic('matplotlib', 'inline')


# # ExhaustiveFeatureSelector: Optimal feature sets by considering all possible feature combinations

# Implementation of an *exhaustive feature selector* for sampling and evaluating all possible feature combinations in a specified range.

# > from mlxtend.feature_selection import ExhaustiveFeatureSelector

# ## Overview

# This exhaustive feature selection algorithm is a wrapper approach for brute-force evaluation of feature subsets; the best subset is selected by optimizing a specified performance metric given an arbitrary regressor or classifier. For instance, if the classifier is a logistic regression and the dataset consists of 4 features, the algorithm will evaluate all 15 feature combinations (if `min_features=1` and `max_features=4`):
#
# - {0}
# - {1}
# - {2}
# - {3}
# - {0, 1}
# - {0, 2}
# - {0, 3}
# - {1, 2}
# - {1, 3}
# - {2, 3}
# - {0, 1, 2}
# - {0, 1, 3}
# - {0, 2, 3}
# - {1, 2, 3}
# - {0, 1, 2, 3}
#
# and select the one that results in the best performance (e.g., classification accuracy) of the logistic regression classifier.

# ## Example 1 - A simple Iris example

# Initializing a simple classifier from scikit-learn:

# In[30]:


from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

iris = load_iris()
X = iris.data
y = iris.target

knn = KNeighborsClassifier(n_neighbors=3)

efs1 = EFS(knn,
           min_features=1,
           max_features=4,
           scoring='accuracy',
           print_progress=True,
           cv=5)

efs1 = efs1.fit(X, y)

print('Best accuracy score: %.2f' % efs1.best_score_)
print('Best subset (indices):', efs1.best_idx_)
print('Best subset (corresponding names):', efs1.best_feature_names_)


# ### Feature Names

# When working with large datasets, the feature indices might be hard to interpret. In this case, we recommend using pandas DataFrames with distinct column names as input:

# In[31]:


import pandas as pd

df_X = pd.DataFrame(X, columns=["Sepal length", "Sepal width", "Petal length", "Petal width"])
df_X.head()


# In[32]:


efs1 = efs1.fit(df_X, y)

print('Best accuracy score: %.2f' % efs1.best_score_)
print('Best subset (indices):', efs1.best_idx_)
print('Best subset (corresponding names):', efs1.best_feature_names_)


# ### Detailed Outputs

# Via the `subsets_` attribute, we can take a look at the selected feature indices at each step:

# In[33]:


efs1.subsets_


# ## Example 2 - Visualizing the feature selection results

# For our convenience, we can visualize the output from the feature selection in a pandas DataFrame format using the `get_metric_dict` method of the `ExhaustiveFeatureSelector` object. The columns `std_dev` and `std_err` represent the standard deviation and standard error of the cross-validation scores, respectively.
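# For intuition, the following minimal NumPy sketch shows how a mean score, standard deviation, and standard error could be computed from a list of cross-validation scores; the fold scores below are made up for illustration, and mlxtend's internal computation may differ in detail.

# In[ ]:


import numpy as np

cv_scores = np.array([0.93, 0.97, 0.93, 0.97, 1.00])  # hypothetical 5-fold CV scores

avg_score = cv_scores.mean()                 # average score across folds
std_dev = cv_scores.std()                    # spread of the individual fold scores
std_err = std_dev / np.sqrt(len(cv_scores))  # uncertainty of the average score

print('avg: %.3f, std_dev: %.3f, std_err: %.3f' % (avg_score, std_dev, std_err))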
# Below, we see the DataFrame of the exhaustive feature selector from Example 1:

# In[34]:


import pandas as pd

iris = load_iris()
X = iris.data
y = iris.target

knn = KNeighborsClassifier(n_neighbors=3)

efs1 = EFS(knn,
           min_features=1,
           max_features=4,
           scoring='accuracy',
           print_progress=True,
           cv=5)

feature_names = ('sepal length', 'sepal width', 'petal length', 'petal width')

df_X = pd.DataFrame(
    X, columns=["Sepal length", "Sepal width", "Petal length", "Petal width"])

efs1 = efs1.fit(df_X, y)

df = pd.DataFrame.from_dict(efs1.get_metric_dict()).T
df.sort_values('avg_score', inplace=True, ascending=False)
df


# In[35]:


import matplotlib.pyplot as plt

metric_dict = efs1.get_metric_dict()

fig = plt.figure()
k_feat = sorted(metric_dict.keys())
avg = [metric_dict[k]['avg_score'] for k in k_feat]

upper, lower = [], []
for k in k_feat:
    upper.append(metric_dict[k]['avg_score'] + metric_dict[k]['std_dev'])
    lower.append(metric_dict[k]['avg_score'] - metric_dict[k]['std_dev'])

plt.fill_between(k_feat,
                 upper,
                 lower,
                 alpha=0.2,
                 color='blue',
                 lw=1)

plt.plot(k_feat, avg, color='blue', marker='o')
plt.ylabel('Accuracy +/- Standard Deviation')
plt.xlabel('Number of Features')
feature_min = len(metric_dict[k_feat[0]]['feature_idx'])
feature_max = len(metric_dict[k_feat[-1]]['feature_idx'])
plt.xticks(k_feat,
           [str(metric_dict[k]['feature_names']) for k in k_feat],
           rotation=90)
plt.show()


# ## Example 3 - Exhaustive feature selection for regression analysis

# Similar to the classification examples above, the `ExhaustiveFeatureSelector` also supports scikit-learn's estimators for regression.

# In[36]:


from sklearn.linear_model import LinearRegression
from sklearn.datasets import load_boston  # note: load_boston was removed in scikit-learn 1.2

boston = load_boston()
X, y = boston.data, boston.target

lr = LinearRegression()

efs = EFS(lr,
          min_features=10,
          max_features=12,
          scoring='neg_mean_squared_error',
          cv=10)

efs.fit(X, y)

print('Best MSE score: %.2f' % (efs.best_score_ * (-1)))
print('Best subset:', efs.best_idx_)


# ## Example 4 - Regression and adjusted R2

# As shown in Example 3, the exhaustive feature selector can be used for selecting features via a regression model. In regression analysis, there is the common phenomenon that the $R^2$ score can become spuriously inflated the more features we choose. Hence, and this is especially true for feature selection, it is useful to compare models based on the adjusted $R^2$ value rather than the regular $R^2$. The adjusted $R^2$, $\bar{R}^{2}$, accounts for the number of features and examples as follows:
#
# $$\bar{R}^{2}=1-\left(1-R^{2}\right) \frac{n-1}{n-p-1},$$
#
# where $n$ is the number of examples and $p$ is the number of features.
#
# One of the advantages of scikit-learn's API is that it's consistent, intuitive, and simple to use. However, one downside of this API design is that it can be a bit restrictive for certain scenarios. For instance, scikit-learn scoring functions only take two inputs, the predicted and the true target values. Hence, we cannot use scikit-learn's scoring API to compute the adjusted $R^2$, which also requires the number of features.
#
# However, as a workaround, we can compute the $R^2$ for the different feature subsets and then do a post-hoc computation to obtain the adjusted $R^2$.
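# As a quick sanity check of the formula (using hypothetical numbers, not results from the dataset above): with $R^2 = 0.80$, $n = 50$ examples, and $p = 10$ features, we get
#
# $$\bar{R}^{2}=1-(1-0.80)\,\frac{50-1}{50-10-1}=1-0.2 \cdot \frac{49}{39} \approx 0.75,$$
#
# i.e., the adjusted score is noticeably lower than the raw $R^2$ once the number of features is taken into account.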
# **Step 1: Compute $R^2$:**

# In[37]:


from sklearn.linear_model import LinearRegression
from sklearn.datasets import load_boston

boston = load_boston()
X, y = boston.data, boston.target

lr = LinearRegression()

efs = EFS(lr,
          min_features=10,
          max_features=12,
          scoring='r2',
          cv=10)

efs.fit(X, y)

print('Best R2 score: %.2f' % efs.best_score_)
print('Best subset:', efs.best_idx_)


# **Step 2: Compute adjusted $R^2$:**

# In[38]:


def adjust_r2(r2, num_examples, num_features):
    coef = (num_examples - 1) / (num_examples - num_features - 1)
    return 1 - (1 - r2) * coef


# In[39]:


for i in efs.subsets_:
    efs.subsets_[i]['adjusted_avg_score'] = (
        adjust_r2(r2=efs.subsets_[i]['avg_score'],
                  num_examples=X.shape[0]/10,  # number of examples per validation fold (cv=10)
                  num_features=len(efs.subsets_[i]['feature_idx']))
    )


# **Step 3: Select the best subset based on the adjusted $R^2$:**

# In[40]:


score = -99e10

for i in efs.subsets_:
    if (efs.subsets_[i]['adjusted_avg_score'] == score and
            len(efs.subsets_[i]['feature_idx']) < len(efs.best_idx_)) \
            or efs.subsets_[i]['adjusted_avg_score'] > score:
        efs.best_idx_ = efs.subsets_[i]['feature_idx']
        score = efs.subsets_[i]['adjusted_avg_score']


# In[41]:


print('Best adjusted R2 score: %.2f' % score)
print('Best subset:', efs.best_idx_)


# ## Example 5 - Using the selected feature subset for making new predictions

# In[42]:


# Initialize the dataset

from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=1)

knn = KNeighborsClassifier(n_neighbors=3)


# In[43]:


# Select the "best" feature subset (between 1 and 4 features) via
# 5-fold cross-validation on the training set.

from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

efs1 = EFS(knn,
           min_features=1,
           max_features=4,
           scoring='accuracy',
           cv=5)

efs1 = efs1.fit(X_train, y_train)


# In[44]:


print('Selected features:', efs1.best_idx_)


# In[45]:


# Generate the new subsets based on the selected features
# Note that the transform call is equivalent to
# X_train[:, efs1.best_idx_]

X_train_efs = efs1.transform(X_train)
X_test_efs = efs1.transform(X_test)

# Fit the estimator using the new feature subset
# and make a prediction on the test data
knn.fit(X_train_efs, y_train)
y_pred = knn.predict(X_test_efs)

# Compute the accuracy of the prediction
acc = float((y_test == y_pred).sum()) / y_pred.shape[0]
print('Test set accuracy: %.2f %%' % (acc * 100))


# ## Example 6 - Exhaustive feature selection and GridSearch

# In[46]:


# Initialize the dataset

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=1)


# Use scikit-learn's `GridSearchCV` to tune the hyperparameters of the `LogisticRegression` estimator inside the `ExhaustiveFeatureSelector` and use it for prediction in the pipeline.
# **Note that the `clone_estimator` attribute needs to be set to `False`.**

# In[47]:


from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

lr = LogisticRegression(multi_class='multinomial',
                        solver='newton-cg',
                        random_state=123)

efs1 = EFS(estimator=lr,
           min_features=2,
           max_features=3,
           scoring='accuracy',
           print_progress=False,
           clone_estimator=False,
           cv=5,
           n_jobs=1)

pipe = make_pipeline(efs1, lr)

param_grid = {'exhaustivefeatureselector__estimator__C': [0.1, 1.0, 10.0]}

gs = GridSearchCV(estimator=pipe,
                  param_grid=param_grid,
                  scoring='accuracy',
                  n_jobs=1,
                  cv=2,
                  verbose=1,
                  refit=False)

# run grid search
gs = gs.fit(X_train, y_train)


# ... and the "best" parameters determined by GridSearch are ...

# In[48]:


print("Best parameters via GridSearch", gs.best_params_)


# #### Obtaining the best *k* feature indices after GridSearch

# If we are interested in the best *k* feature indices via `ExhaustiveFeatureSelector.best_idx_`, we have to initialize a `GridSearchCV` object with `refit=True`. Now, the grid search object will take the complete training dataset and the best parameters, which it found via cross-validation, to train the estimator pipeline.

# In[49]:


gs = GridSearchCV(estimator=pipe,
                  param_grid=param_grid,
                  scoring='accuracy',
                  n_jobs=1,
                  cv=2,
                  verbose=1,
                  refit=True)


# After running the grid search, we can access the individual pipeline objects of the `best_estimator_` via the `steps` attribute.

# In[50]:


gs = gs.fit(X_train, y_train)
gs.best_estimator_.steps


# Via sub-indexing, we can then obtain the best-selected feature subset:

# In[51]:


print('Best features:', gs.best_estimator_.steps[0][1].best_idx_)


# During cross-validation, this feature combination had a CV accuracy of:

# In[52]:


print('Best score:', gs.best_score_)


# In[53]:


gs.best_params_


# **Alternatively**, we can set the "best grid search parameters" in our pipeline manually if we ran `GridSearchCV` with `refit=False`. It should yield the same results:

# In[54]:


pipe.set_params(**gs.best_params_).fit(X_train, y_train)
print('Best features:', pipe.steps[0][1].best_idx_)


# ## Example 7 - Exhaustive Feature Selection with LOOCV

# The `ExhaustiveFeatureSelector` is not restricted to k-fold cross-validation. You can use any type of cross-validation method that supports the general scikit-learn cross-validation API.
#
# The following example illustrates the use of scikit-learn's `LeaveOneOut` cross-validation method in combination with the exhaustive feature selector.

# In[55]:


from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from sklearn.model_selection import LeaveOneOut

iris = load_iris()
X = iris.data
y = iris.target

knn = KNeighborsClassifier(n_neighbors=3)

efs1 = EFS(knn,
           min_features=1,
           max_features=4,
           scoring='accuracy',
           print_progress=True,
           cv=LeaveOneOut())  ### Use cross-validation generator here

efs1 = efs1.fit(X, y)

print('Best accuracy score: %.2f' % efs1.best_score_)
print('Best subset (indices):', efs1.best_idx_)
print('Best subset (corresponding names):', efs1.best_feature_names_)


# ## Example 8 - Interrupting Long Runs for Intermediate Results

# If your run is taking too long, it is possible to trigger a `KeyboardInterrupt` (e.g., via `ctrl`+`c` in a terminal or by interrupting the cell in a Jupyter notebook) to obtain temporary results.
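# Before starting a long run, it can also help to estimate how many feature subsets the exhaustive search will have to evaluate. The following is a small, self-contained sketch in plain Python (it is not part of the mlxtend API) that counts the subsets for the 6-feature toy dataset used below with `min_features=1` and `max_features=4`:

# In[ ]:


from math import comb

n_features = 6
min_features, max_features = 1, 4

# Sum of "n choose k" over all subset sizes in the selected range:
# 6 + 15 + 20 + 15 = 56 candidate feature subsets,
# each of which is evaluated via cross-validation.
n_subsets = sum(comb(n_features, k)
                for k in range(min_features, max_features + 1))
print('Number of feature subsets to evaluate:', n_subsets)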
# **Toy dataset**

# In[61]:


from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(
    n_samples=200000,
    n_features=6,
    n_informative=2,
    n_redundant=1,
    n_repeated=1,
    n_clusters_per_class=2,
    flip_y=0.05,
    class_sep=0.5,
    random_state=123,
)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)


# **Long run with interruption**

# In[62]:


from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=10000)

efs1 = EFS(model,
           min_features=1,
           max_features=4,
           print_progress=True,
           scoring='accuracy')

efs1 = efs1.fit(X_train, y_train)


# **Finalizing the fit**

# Note that the feature selection run hasn't finished, so certain attributes may not be available. In order to use the EFS instance, it is recommended to call `finalize_fit`, which will make the EFS estimator appear as "fitted" and process the temporary results:

# In[63]:


efs1.finalize_fit()


# In[64]:


print('Best accuracy score: %.2f' % efs1.best_score_)
print('Best subset (indices):', efs1.best_idx_)


# ## Example 9 - Working with Feature Groups

# Since mlxtend v0.21.0, it is possible to specify feature groups. Feature groups allow you to group certain features together, such that they are always selected as a group. This can be very useful in contexts similar to one-hot encoding -- if you want to treat the one-hot encoded feature as a single feature:
#
# ![](SequentialFeatureSelector_files/feature_groups.jpeg)

# In the following example, we specify sepal length and sepal width as a feature group so that they are always selected together:

# In[29]:


from sklearn.datasets import load_iris
import pandas as pd

iris = load_iris()
X = iris.data
y = iris.target

X_df = pd.DataFrame(X, columns=['sepal len', 'petal len', 'sepal wid', 'petal wid'])
X_df.head()


# In[30]:


from sklearn.neighbors import KNeighborsClassifier
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

knn = KNeighborsClassifier(n_neighbors=3)

efs1 = EFS(knn,
           min_features=2,
           max_features=2,
           scoring='accuracy',
           feature_groups=[['sepal len', 'sepal wid'], ['petal len'], ['petal wid']],
           cv=3)

efs1 = efs1.fit(X_df, y)

print('Best accuracy score: %.2f' % efs1.best_score_)
print('Best subset (indices):', efs1.best_idx_)
print('Best subset (corresponding names):', efs1.best_feature_names_)


# Notice that the returned number of features is 3, since `min_features` and `max_features` refer to the number of feature *groups* here. That is, the selected subset consists of the 2 feature groups `['sepal len', 'sepal wid']` and `['petal wid']`, which expand to 3 individual features.

# ## API

# In[31]:


with open('../../api_modules/mlxtend.feature_selection/ExhaustiveFeatureSelector.md', 'r') as f:
    print(f.read())