#!/usr/bin/env python
# coding: utf-8

# In[1]:

## Why do Feature Selection ??
# Reduces Overfitting: less redundant data means less opportunity to make decisions based on noise.
# Improves Accuracy: less misleading data means modeling accuracy improves.
# Reduces Training Time: less data means that algorithms train faster.

# In[2]:

# Imports
from sklearn.utils import shuffle
import pandas as pd
import numpy as np
np.random.seed(10)
import sklearn
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler

print(sklearn.__version__)
print(pd.__version__)
print(np.__version__)

# In[3]:

# This class scales the features to the 0-1 range.
# We cap the scale at 0-1 because logistic regression uses the sigmoid activation function,
# whose output also lies in the 0-1 range.
class Normalizer:
    def __init__(self):
        self.sc = MinMaxScaler()

    def scale(self, X, dtype):
        if dtype == 'train':
            XX = self.sc.fit_transform(X)
        elif dtype == 'test':
            XX = self.sc.transform(X)
        else:
            return None
        return XX

# In[4]:

#### METHOD-1: Univariate Feature Selection ###
# Select those features that have the strongest relationship with the output variable.
df = pd.read_csv('../data/day1/iris.csv')
df = shuffle(df)
print(df.head(2))
print('*'*20)

norm = Normalizer()
X = norm.scale(df.iloc[:, :-1].values, 'train')
Y = df.iloc[:, -1].values
print(X[:2], Y[:2])
print('*'*20)

# feature extraction
test = SelectKBest(score_func=chi2, k=2)
fit = test.fit(X, Y)

# summarize scores
np.set_printoptions(precision=3)
print(fit.scores_)
print('*'*20)

features = fit.transform(X)
# summarize selected features
print(features[0:2, :])
# We see that feat3 and feat4 are good enough for this problem as per this feature selection technique.

# ### Suggestions before you see the next cell
#
# * Read about the chi-square test and get the intuition.
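# In[ ]:

# A minimal sketch (not part of the original notebook): map the chi2 scores of the fitted
# SelectKBest above back to the column names, assuming the feature columns are every column
# of df except the last (the label). get_support() returns a boolean mask of the selected
# features, so this makes the "feat3 and feat4" observation explicit.
feature_names = df.columns[:-1]
for name, score, keep in zip(feature_names, fit.scores_, fit.get_support()):
    print('%s: chi2 score = %.3f, selected = %s' % (name, score, keep))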
# In[5]:

#### METHOD-2: Recursive Feature Elimination (Backwards) ###
# Recursive Feature Elimination (RFE) works by recursively removing the weakest attributes
# (as judged by the model's coefficients or feature importances) and re-building the model on
# those attributes that remain. This identifies which attributes (and combinations of
# attributes) contribute the most to predicting the target attribute.
df = pd.read_csv('../data/day1/iris.csv')
df = shuffle(df)
print(df.head(2))
print('*'*20)

norm = Normalizer()
X = norm.scale(df.iloc[:, :-1].values, 'train')
Y = df.iloc[:, -1].values
print(X[:2], Y[:2])
print('*'*20)

model = LogisticRegression()
rfe = RFE(model, n_features_to_select=2)
fit = rfe.fit(X, Y)
print("Selected Features: %s" % fit.support_)
# We see that feat3 and feat4 are good enough for this problem as per this feature selection technique.

# ### Suggestions before you see the next cell
#
# * http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html

# In[6]:

#### METHOD-3: Using feature importance from any tree-ensemble (bagging) algorithm ###
df = pd.read_csv('../data/day1/iris.csv')
df = shuffle(df)
print(df.head(2))
print('*'*20)

norm = Normalizer()
X = norm.scale(df.iloc[:, :-1].values, 'train')
Y = df.iloc[:, -1].values
print(X[:2], Y[:2])
print('*'*20)

# feature extraction
model = ExtraTreesClassifier()
model.fit(X, Y)
print(model.feature_importances_)
# We see that feat3 and feat4 are good enough for this problem as per this feature selection technique.

# In[7]:

# Feature Selection and Feature Elimination are two complementary routes to the right set of
# features for your ML model: selection keeps the informative features, elimination drops the
# uninformative ones.
# A few techniques for elimination are (a sketch follows in the last cell below):
## Remove zero/low variance columns
## Remove columns with many missing values
## Remove highly +ve/-ve correlated features

# In[8]:

# Other techniques for feature selection include:
## PCA Decomposition
## Auto Encoders
### We will see them in a few upcoming days!!
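# In[ ]:

# A minimal sketch (not part of the original notebook) of the elimination steps listed in
# In[7] above: drop columns with many missing values, near-zero variance, and high +ve/-ve
# correlation. The helper name `eliminate_features` and the thresholds are illustrative
# choices, not recommendations. Pass only numeric feature columns, e.g. df.iloc[:, :-1].
def eliminate_features(feats, missing_thresh=0.5, var_thresh=1e-3, corr_thresh=0.9):
    # 1. drop columns where more than missing_thresh of the values are NaN
    feats = feats.loc[:, feats.isnull().mean() <= missing_thresh]
    # 2. drop (near-)zero variance columns
    #    (sklearn.feature_selection.VarianceThreshold does the same for NaN-free arrays)
    feats = feats.loc[:, feats.var() > var_thresh]
    # 3. drop one column out of every highly correlated (|corr| > corr_thresh) pair
    corr = feats.corr().abs()
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    to_drop = [col for col in upper.columns if (upper[col] > corr_thresh).any()]
    return feats.drop(to_drop, axis=1)

# usage on the iris features loaded above
print(eliminate_features(df.iloc[:, :-1]).columns)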