#!/usr/bin/env python
# coding: utf-8

# In[1]:

## Why do Feature Selection ??
# Reduces Overfitting: less redundant data means less opportunity to make decisions based on noise.
# Improves Accuracy: less misleading data means modeling accuracy improves.
# Reduces Training Time: less data means that algorithms train faster.

# In[2]:

# Imports
from sklearn.utils import shuffle
import pandas as pd
import numpy as np
np.random.seed(10)
import sklearn
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler

print(sklearn.__version__)
print(pd.__version__)
print(np.__version__)

# In[3]:

# This class scales the features to the 0-1 range.
# We cap the scale at 0-1 because logistic regression uses the sigmoid activation function,
# whose output also lies in the 0-1 range.
class Normalizer:
    def __init__(self):
        self.sc = MinMaxScaler()

    def scale(self, X, dtype):
        if dtype == 'train':
            XX = self.sc.fit_transform(X)
        elif dtype == 'test':
            XX = self.sc.transform(X)
        else:
            return None
        return XX

# In[4]:

#### METHOD-1: Univariate Feature Selection ###
# Select those features that have the strongest relationship with the output variable.
df = pd.read_csv('../data/day1/iris.csv')
df = shuffle(df)
print(df.head(2))
print('*'*20)

norm = Normalizer()
X = norm.scale(df.iloc[:, :-1].values, 'train')
Y = df.iloc[:, -1].values
print(X[:2], Y[:2])
print('*'*20)

# feature extraction
test = SelectKBest(score_func=chi2, k=2)
fit = test.fit(X, Y)

# summarize scores
np.set_printoptions(precision=3)
print(fit.scores_)
print('*'*20)

features = fit.transform(X)
# summarize selected features
print(features[0:2, :])
# We see that feat3 and feat4 are good enough for this problem as per this feature selection technique.

# ### Suggestions before you see the next cell
#
# * Read about the chi-square test and get the intuition.
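# In[ ]:

# A minimal sketch (not part of the original notebook): map the chi2 scores of the fitted
# SelectKBest above back to the column names, assuming the feature columns are every column
# of df except the last (the label). get_support() returns a boolean mask of the selected
# features, so this makes the "feat3 and feat4" observation explicit.
feature_names = df.columns[:-1]
for name, score, keep in zip(feature_names, fit.scores_, fit.get_support()):
    print('%s: chi2 score = %.3f, selected = %s' % (name, score, keep))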
# In[5]:

#### METHOD-2: Recursive Feature Elimination (Backwards) ###
# Recursive Feature Elimination (RFE) works by recursively removing the weakest attributes
# (as judged by the model's coefficients or feature importances) and re-building the model on
# those attributes that remain. This identifies which attributes (and combinations of
# attributes) contribute the most to predicting the target attribute.
df = pd.read_csv('../data/day1/iris.csv')
df = shuffle(df)
print(df.head(2))
print('*'*20)

norm = Normalizer()
X = norm.scale(df.iloc[:, :-1].values, 'train')
Y = df.iloc[:, -1].values
print(X[:2], Y[:2])
print('*'*20)

model = LogisticRegression()
rfe = RFE(model, n_features_to_select=2)
fit = rfe.fit(X, Y)
print("Selected Features: %s" % fit.support_)
# We see that feat3 and feat4 are good enough for this problem as per this feature selection technique.

# ### Suggestions before you see the next cell
#
# * http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html

# In[6]:

#### METHOD-3: Using feature importance from any tree-ensemble (bagging) algorithm ###
df = pd.read_csv('../data/day1/iris.csv')
df = shuffle(df)
print(df.head(2))
print('*'*20)

norm = Normalizer()
X = norm.scale(df.iloc[:, :-1].values, 'train')
Y = df.iloc[:, -1].values
print(X[:2], Y[:2])
print('*'*20)

# feature extraction
model = ExtraTreesClassifier()
model.fit(X, Y)
print(model.feature_importances_)
# We see that feat3 and feat4 are good enough for this problem as per this feature selection technique.

# In[7]:

# Feature Selection and Feature Elimination are two complementary routes to the right set of
# features for your ML model: selection keeps the informative features, elimination drops the
# uninformative ones.
# A few techniques for elimination are (a sketch follows in the last cell below):
## Remove zero/low variance columns
## Remove columns with many missing values
## Remove highly +ve/-ve correlated features

# In[8]:

# Other techniques for feature selection include:
## PCA Decomposition
## Auto Encoders
### We will see them in a few upcoming days!!
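# In[ ]:

# A minimal sketch (not part of the original notebook) of the elimination steps listed in
# In[7] above: drop columns with many missing values, near-zero variance, and high +ve/-ve
# correlation. The helper name `eliminate_features` and the thresholds are illustrative
# choices, not recommendations. Pass only numeric feature columns, e.g. df.iloc[:, :-1].
def eliminate_features(feats, missing_thresh=0.5, var_thresh=1e-3, corr_thresh=0.9):
    # 1. drop columns where more than missing_thresh of the values are NaN
    feats = feats.loc[:, feats.isnull().mean() <= missing_thresh]
    # 2. drop (near-)zero variance columns
    #    (sklearn.feature_selection.VarianceThreshold does the same for NaN-free arrays)
    feats = feats.loc[:, feats.var() > var_thresh]
    # 3. drop one column out of every highly correlated (|corr| > corr_thresh) pair
    corr = feats.corr().abs()
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    to_drop = [col for col in upper.columns if (upper[col] > corr_thresh).any()]
    return feats.drop(to_drop, axis=1)

# usage on the iris features loaded above
print(eliminate_features(df.iloc[:, :-1]).columns)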