#!/usr/bin/env python
# coding: utf-8

# Sebastian Raschka, 2015-2022
# `mlxtend`, a library of extension and helper modules for Python's data analysis and machine learning libraries
#
# - GitHub repository: https://github.com/rasbt/mlxtend
# - Documentation: https://rasbt.github.io/mlxtend/

# In[28]:


get_ipython().run_line_magic('load_ext', 'watermark')
get_ipython().run_line_magic('watermark', "-a 'Sebastian Raschka' -u -d -v -p matplotlib,numpy,scipy,mlxtend")


# In[29]:


get_ipython().run_line_magic('matplotlib', 'inline')


# # ExhaustiveFeatureSelector: Optimal feature sets by considering all possible feature combinations

# Implementation of an *exhaustive feature selector* for sampling and evaluating all possible feature combinations in a specified range.

# > from mlxtend.feature_selection import ExhaustiveFeatureSelector

# ## Overview

# This exhaustive feature selection algorithm is a wrapper approach for brute-force evaluation of feature subsets; the best subset is selected by optimizing a specified performance metric given an arbitrary regressor or classifier. For instance, if the classifier is a logistic regression and the dataset consists of 4 features, the algorithm will evaluate all 15 feature combinations (if `min_features=1` and `max_features=4`):
#
# - {0}
# - {1}
# - {2}
# - {3}
# - {0, 1}
# - {0, 2}
# - {0, 3}
# - {1, 2}
# - {1, 3}
# - {2, 3}
# - {0, 1, 2}
# - {0, 1, 3}
# - {0, 2, 3}
# - {1, 2, 3}
# - {0, 1, 2, 3}
#
# and select the one that results in the best performance (e.g., classification accuracy) of the logistic regression classifier.

# ## Example 1 - A simple Iris example

# Initializing a simple classifier from scikit-learn:

# In[30]:


from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

iris = load_iris()
X = iris.data
y = iris.target

knn = KNeighborsClassifier(n_neighbors=3)

efs1 = EFS(knn,
           min_features=1,
           max_features=4,
           scoring='accuracy',
           print_progress=True,
           cv=5)

efs1 = efs1.fit(X, y)

print('Best accuracy score: %.2f' % efs1.best_score_)
print('Best subset (indices):', efs1.best_idx_)
print('Best subset (corresponding names):', efs1.best_feature_names_)


# ### Feature Names

# When working with large datasets, the feature indices might be hard to interpret. In this case, we recommend using pandas DataFrames with distinct column names as input:

# In[31]:


import pandas as pd

df_X = pd.DataFrame(X, columns=["Sepal length", "Sepal width", "Petal length", "Petal width"])
df_X.head()


# In[32]:


efs1 = efs1.fit(df_X, y)

print('Best accuracy score: %.2f' % efs1.best_score_)
print('Best subset (indices):', efs1.best_idx_)
print('Best subset (corresponding names):', efs1.best_feature_names_)


# ### Detailed Outputs

# Via the `subsets_` attribute, we can take a look at the selected feature indices at each step:

# In[33]:


efs1.subsets_


# ## Example 2 - Visualizing the feature selection results

# For our convenience, we can visualize the output from the feature selection in a pandas DataFrame format using the `get_metric_dict` method of the `ExhaustiveFeatureSelector` object. The columns `std_dev` and `std_err` represent the standard deviation and standard error of the cross-validation scores, respectively.
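# For intuition, the following minimal NumPy sketch shows how a mean score, standard deviation, and standard error could be computed from a list of cross-validation scores; the fold scores below are made up for illustration, and mlxtend's internal computation may differ in detail.

# In[ ]:


import numpy as np

cv_scores = np.array([0.93, 0.97, 0.93, 0.97, 1.00])  # hypothetical 5-fold CV scores

avg_score = cv_scores.mean()                 # average score across folds
std_dev = cv_scores.std()                    # spread of the individual fold scores
std_err = std_dev / np.sqrt(len(cv_scores))  # uncertainty of the average score

print('avg: %.3f, std_dev: %.3f, std_err: %.3f' % (avg_score, std_dev, std_err))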
# Below, we see the DataFrame of the exhaustive feature selector from Example 1:

# In[34]:


import pandas as pd

iris = load_iris()
X = iris.data
y = iris.target

knn = KNeighborsClassifier(n_neighbors=3)

efs1 = EFS(knn,
           min_features=1,
           max_features=4,
           scoring='accuracy',
           print_progress=True,
           cv=5)

feature_names = ('sepal length', 'sepal width', 'petal length', 'petal width')

df_X = pd.DataFrame(
    X, columns=["Sepal length", "Sepal width", "Petal length", "Petal width"])

efs1 = efs1.fit(df_X, y)

df = pd.DataFrame.from_dict(efs1.get_metric_dict()).T
df.sort_values('avg_score', inplace=True, ascending=False)
df


# In[35]:


import matplotlib.pyplot as plt

metric_dict = efs1.get_metric_dict()

fig = plt.figure()
k_feat = sorted(metric_dict.keys())
avg = [metric_dict[k]['avg_score'] for k in k_feat]

upper, lower = [], []
for k in k_feat:
    upper.append(metric_dict[k]['avg_score'] + metric_dict[k]['std_dev'])
    lower.append(metric_dict[k]['avg_score'] - metric_dict[k]['std_dev'])

plt.fill_between(k_feat,
                 upper,
                 lower,
                 alpha=0.2,
                 color='blue',
                 lw=1)

plt.plot(k_feat, avg, color='blue', marker='o')
plt.ylabel('Accuracy +/- Standard Deviation')
plt.xlabel('Number of Features')
feature_min = len(metric_dict[k_feat[0]]['feature_idx'])
feature_max = len(metric_dict[k_feat[-1]]['feature_idx'])
plt.xticks(k_feat,
           [str(metric_dict[k]['feature_names']) for k in k_feat],
           rotation=90)
plt.show()


# ## Example 3 - Exhaustive feature selection for regression analysis

# Similar to the classification examples above, the `ExhaustiveFeatureSelector` also supports scikit-learn's estimators for regression.

# In[36]:


from sklearn.linear_model import LinearRegression
from sklearn.datasets import load_boston  # note: load_boston was removed in scikit-learn 1.2

boston = load_boston()
X, y = boston.data, boston.target

lr = LinearRegression()

efs = EFS(lr,
          min_features=10,
          max_features=12,
          scoring='neg_mean_squared_error',
          cv=10)

efs.fit(X, y)

print('Best MSE score: %.2f' % (efs.best_score_ * (-1)))
print('Best subset:', efs.best_idx_)


# ## Example 4 - Regression and adjusted R2

# As shown in Example 3, the exhaustive feature selector can be used for selecting features via a regression model. In regression analysis, there is the common phenomenon that the $R^2$ score can become spuriously inflated the more features we choose. Hence, and this is especially true for feature selection, it is useful to compare models based on the adjusted $R^2$ value rather than the regular $R^2$. The adjusted $R^2$, $\bar{R}^{2}$, accounts for the number of features and examples as follows:
#
# $$\bar{R}^{2}=1-\left(1-R^{2}\right) \frac{n-1}{n-p-1},$$
#
# where $n$ is the number of examples and $p$ is the number of features.
#
# One of the advantages of scikit-learn's API is that it's consistent, intuitive, and simple to use. However, one downside of this API design is that it can be a bit restrictive for certain scenarios. For instance, scikit-learn scoring functions only take two inputs, the predicted and the true target values. Hence, we cannot use scikit-learn's scoring API to compute the adjusted $R^2$, which also requires the number of features.
#
# However, as a workaround, we can compute the $R^2$ for the different feature subsets and then do a post-hoc computation to obtain the adjusted $R^2$.
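# As a quick sanity check of the formula (using hypothetical numbers, not results from the dataset above): with $R^2 = 0.80$, $n = 50$ examples, and $p = 10$ features, we get
#
# $$\bar{R}^{2}=1-(1-0.80)\,\frac{50-1}{50-10-1}=1-0.2 \cdot \frac{49}{39} \approx 0.75,$$
#
# i.e., the adjusted score is noticeably lower than the raw $R^2$ once the number of features is taken into account.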
# **Step 1: Compute $R^2$:**

# In[37]:


from sklearn.linear_model import LinearRegression
from sklearn.datasets import load_boston

boston = load_boston()
X, y = boston.data, boston.target

lr = LinearRegression()

efs = EFS(lr,
          min_features=10,
          max_features=12,
          scoring='r2',
          cv=10)

efs.fit(X, y)

print('Best R2 score: %.2f' % efs.best_score_)
print('Best subset:', efs.best_idx_)


# **Step 2: Compute adjusted $R^2$:**

# In[38]:


def adjust_r2(r2, num_examples, num_features):
    coef = (num_examples - 1) / (num_examples - num_features - 1)
    return 1 - (1 - r2) * coef


# In[39]:


for i in efs.subsets_:
    efs.subsets_[i]['adjusted_avg_score'] = (
        adjust_r2(r2=efs.subsets_[i]['avg_score'],
                  num_examples=X.shape[0]/10,  # number of examples per validation fold (cv=10)
                  num_features=len(efs.subsets_[i]['feature_idx']))
    )


# **Step 3: Select the best subset based on the adjusted $R^2$:**

# In[40]:


score = -99e10

for i in efs.subsets_:
    if (efs.subsets_[i]['adjusted_avg_score'] == score and
            len(efs.subsets_[i]['feature_idx']) < len(efs.best_idx_)) \
            or efs.subsets_[i]['adjusted_avg_score'] > score:
        efs.best_idx_ = efs.subsets_[i]['feature_idx']
        score = efs.subsets_[i]['adjusted_avg_score']


# In[41]:


print('Best adjusted R2 score: %.2f' % score)
print('Best subset:', efs.best_idx_)


# ## Example 5 - Using the selected feature subset for making new predictions

# In[42]:


# Initialize the dataset

from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=1)

knn = KNeighborsClassifier(n_neighbors=3)


# In[43]:


# Select the "best" feature subset (between 1 and 4 features) via
# 5-fold cross-validation on the training set.

from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

efs1 = EFS(knn,
           min_features=1,
           max_features=4,
           scoring='accuracy',
           cv=5)

efs1 = efs1.fit(X_train, y_train)


# In[44]:


print('Selected features:', efs1.best_idx_)


# In[45]:


# Generate the new subsets based on the selected features
# Note that the transform call is equivalent to
# X_train[:, efs1.best_idx_]

X_train_efs = efs1.transform(X_train)
X_test_efs = efs1.transform(X_test)

# Fit the estimator using the new feature subset
# and make a prediction on the test data
knn.fit(X_train_efs, y_train)
y_pred = knn.predict(X_test_efs)

# Compute the accuracy of the prediction
acc = float((y_test == y_pred).sum()) / y_pred.shape[0]
print('Test set accuracy: %.2f %%' % (acc * 100))


# ## Example 6 - Exhaustive feature selection and GridSearch

# In[46]:


# Initialize the dataset

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=1)


# Use scikit-learn's `GridSearchCV` to tune the hyperparameters of the `LogisticRegression` estimator inside the `ExhaustiveFeatureSelector` and use it for prediction in the pipeline.
# **Note that the `clone_estimator` attribute needs to be set to `False`.**

# In[47]:


from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

lr = LogisticRegression(multi_class='multinomial',
                        solver='newton-cg',
                        random_state=123)

efs1 = EFS(estimator=lr,
           min_features=2,
           max_features=3,
           scoring='accuracy',
           print_progress=False,
           clone_estimator=False,
           cv=5,
           n_jobs=1)

pipe = make_pipeline(efs1, lr)

param_grid = {'exhaustivefeatureselector__estimator__C': [0.1, 1.0, 10.0]}

gs = GridSearchCV(estimator=pipe,
                  param_grid=param_grid,
                  scoring='accuracy',
                  n_jobs=1,
                  cv=2,
                  verbose=1,
                  refit=False)

# run grid search
gs = gs.fit(X_train, y_train)


# ... and the "best" parameters determined by GridSearch are ...

# In[48]:


print("Best parameters via GridSearch", gs.best_params_)


# #### Obtaining the best *k* feature indices after GridSearch

# If we are interested in the best *k* feature indices via `ExhaustiveFeatureSelector.best_idx_`, we have to initialize a `GridSearchCV` object with `refit=True`. Now, the grid search object will take the complete training dataset and the best parameters, which it found via cross-validation, to train the estimator pipeline.

# In[49]:


gs = GridSearchCV(estimator=pipe,
                  param_grid=param_grid,
                  scoring='accuracy',
                  n_jobs=1,
                  cv=2,
                  verbose=1,
                  refit=True)


# After running the grid search, we can access the individual pipeline objects of the `best_estimator_` via the `steps` attribute.

# In[50]:


gs = gs.fit(X_train, y_train)
gs.best_estimator_.steps


# Via sub-indexing, we can then obtain the best-selected feature subset:

# In[51]:


print('Best features:', gs.best_estimator_.steps[0][1].best_idx_)


# During cross-validation, this feature combination had a CV accuracy of:

# In[52]:


print('Best score:', gs.best_score_)


# In[53]:


gs.best_params_


# **Alternatively**, we can set the "best grid search parameters" in our pipeline manually if we ran `GridSearchCV` with `refit=False`. It should yield the same results:

# In[54]:


pipe.set_params(**gs.best_params_).fit(X_train, y_train)
print('Best features:', pipe.steps[0][1].best_idx_)


# ## Example 7 - Exhaustive Feature Selection with LOOCV

# The `ExhaustiveFeatureSelector` is not restricted to k-fold cross-validation. You can use any type of cross-validation method that supports the general scikit-learn cross-validation API.
#
# The following example illustrates the use of scikit-learn's `LeaveOneOut` cross-validation method in combination with the exhaustive feature selector.

# In[55]:


from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from sklearn.model_selection import LeaveOneOut

iris = load_iris()
X = iris.data
y = iris.target

knn = KNeighborsClassifier(n_neighbors=3)

efs1 = EFS(knn,
           min_features=1,
           max_features=4,
           scoring='accuracy',
           print_progress=True,
           cv=LeaveOneOut())  ### Use cross-validation generator here

efs1 = efs1.fit(X, y)

print('Best accuracy score: %.2f' % efs1.best_score_)
print('Best subset (indices):', efs1.best_idx_)
print('Best subset (corresponding names):', efs1.best_feature_names_)


# ## Example 8 - Interrupting Long Runs for Intermediate Results

# If your run is taking too long, it is possible to trigger a `KeyboardInterrupt` (e.g., via `ctrl`+`c` in a terminal or by interrupting the cell in a Jupyter notebook) to obtain temporary results.
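# Before starting a long run, it can also help to estimate how many feature subsets the exhaustive search will have to evaluate. The following is a small, self-contained sketch in plain Python (it is not part of the mlxtend API) that counts the subsets for the 6-feature toy dataset used below with `min_features=1` and `max_features=4`:

# In[ ]:


from math import comb

n_features = 6
min_features, max_features = 1, 4

# Sum of "n choose k" over all subset sizes in the selected range:
# 6 + 15 + 20 + 15 = 56 candidate feature subsets,
# each of which is evaluated via cross-validation.
n_subsets = sum(comb(n_features, k)
                for k in range(min_features, max_features + 1))
print('Number of feature subsets to evaluate:', n_subsets)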
# **Toy dataset**

# In[61]:


from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(
    n_samples=200000,
    n_features=6,
    n_informative=2,
    n_redundant=1,
    n_repeated=1,
    n_clusters_per_class=2,
    flip_y=0.05,
    class_sep=0.5,
    random_state=123,
)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)


# **Long run with interruption**

# In[62]:


from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=10000)

efs1 = EFS(model,
           min_features=1,
           max_features=4,
           print_progress=True,
           scoring='accuracy')

efs1 = efs1.fit(X_train, y_train)


# **Finalizing the fit**

# Note that the feature selection run hasn't finished, so certain attributes may not be available. In order to use the EFS instance, it is recommended to call `finalize_fit`, which will make the EFS estimator appear as "fitted" and process the temporary results:

# In[63]:


efs1.finalize_fit()


# In[64]:


print('Best accuracy score: %.2f' % efs1.best_score_)
print('Best subset (indices):', efs1.best_idx_)


# ## Example 9 - Working with Feature Groups

# Since mlxtend v0.21.0, it is possible to specify feature groups. Feature groups allow you to group certain features together, such that they are always selected as a group. This can be very useful in contexts similar to one-hot encoding -- if you want to treat the one-hot encoded feature as a single feature:
#
# ![](SequentialFeatureSelector_files/feature_groups.jpeg)

# In the following example, we specify sepal length and sepal width as a feature group so that they are always selected together:

# In[29]:


from sklearn.datasets import load_iris
import pandas as pd

iris = load_iris()
X = iris.data
y = iris.target

X_df = pd.DataFrame(X, columns=['sepal len', 'petal len', 'sepal wid', 'petal wid'])
X_df.head()


# In[30]:


from sklearn.neighbors import KNeighborsClassifier
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

knn = KNeighborsClassifier(n_neighbors=3)

efs1 = EFS(knn,
           min_features=2,
           max_features=2,
           scoring='accuracy',
           feature_groups=[['sepal len', 'sepal wid'], ['petal len'], ['petal wid']],
           cv=3)

efs1 = efs1.fit(X_df, y)

print('Best accuracy score: %.2f' % efs1.best_score_)
print('Best subset (indices):', efs1.best_idx_)
print('Best subset (corresponding names):', efs1.best_feature_names_)


# Notice that the returned number of features is 3, since `min_features` and `max_features` refer to the number of feature *groups* here. That is, the selected subset consists of the 2 feature groups `['sepal len', 'sepal wid']` and `['petal wid']`, which expand to 3 individual features.

# ## API

# In[31]:


with open('../../api_modules/mlxtend.feature_selection/ExhaustiveFeatureSelector.md', 'r') as f:
    print(f.read())