In this lab we'll look at:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import entropy
import os
%matplotlib inline
First we'll load the dataset and take a quick peak at its columns and size
#load dataset
cwd = os.getcwd()
datadir = '/'.join(cwd.split('/')[0:-1]) + '/data/'
f = datadir + 'ads_dataset_cut.txt'
data = pd.read_csv(f, sep = '\t')
data.columns, data.shape
(Index(['isbuyer', 'buy_freq', 'visit_freq', 'buy_interval', 'sv_interval', 'expected_time_buy', 'expected_time_visit', 'last_buy', 'last_visit', 'multiple_buy', 'multiple_visit', 'uniq_urls', 'num_checkins', 'y_buy'], dtype='object'), (54584, 14))
In the next step we'll use the Decision Tree classifier's built in feature importance attribute to compute the normalized Mutual Information/Information Gain of each feature. Note a few things about this approach: 1). With extremely high dimensional data, one may want to calculate the normalized MI directly for each feature (the code to do that is a bit more complex so we used the DT instead), 2). The DT is a greedy algorithm, so the feature importance ranks it produces may not be equal to the rank of normalized MI calculated individually.
#import the decision tree module from sklearn
from sklearn.tree import DecisionTreeClassifier
#build a decision tree with max_depth = 20 using entropy
Y = data['y_buy']
X = data.drop('y_buy', 1)
dt = DecisionTreeClassifier(criterion = 'entropy', max_depth = 5)
dt.fit(X, Y)
#Now use built in feature importance attribute to get MI of each feature and Y
feature_mi = dt.feature_importances_
Now we'll add the feature importances to a dictionary where key-values are: {feature_name:dt_feature_importance}. This can be done in one line using the zip and dict functions.
#Add features and their importances to a dictionary
feature_mi_dict = dict(zip(X.columns.values, feature_mi))
Now we are going to compute feature ranks using AUC. We can do this without fitting a model, by just seeing how well the individual feature ranks the positives and negatives.
#define a function to print ROC curves.
#It should take in only arrays/lists of predictions and outcomes
from sklearn.metrics import roc_curve, auc
def plotUnivariateROC(preds, truth, label_string):
'''
preds is an nx1 array of predictions
truth is an nx1 array of truth labels
label_string is text to go into the plotting label
'''
#Student input code here
#1. call the roc_curve function to get the ROC X and Y values
fpr, tpr, thresholds = roc_curve(truth, preds)
#2. Input fpr and tpr into the auc function to get the AUC
roc_auc = auc(fpr, tpr)
#we are doing this as a special case because we are sending unfitted predictions
#into the function
if roc_auc < 0.5:
fpr, tpr, thresholds = roc_curve(truth, -1 * preds)
roc_auc = auc(fpr, tpr)
#chooses a random color for plotting
c = (np.random.rand(), np.random.rand(), np.random.rand())
#create a plot and set some options
plt.plot(fpr, tpr, color = c, label = label_string + ' (AUC = %0.3f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC')
plt.legend(loc="lower right")
return roc_auc
Next we'll run each feature through the above function to get its invdividual AUC and also plot on a chart. We add some extra lines of matplotlib code to control the formatting and position of the legend. We also want to add each to a dictionary of the format {feature_name:feature_auc}, similar to what we did above (though not using the same one liner). Take some time to review the chart and think about why different features produce differently shaped curves.
fig = plt.figure(figsize = (12, 6))
ax = plt.subplot(111)
#Plot the univariate AUC on the training data. Store the AUC
#Student put code here
feature_auc_dict = {}
for col in data.drop('y_buy',1).columns:
feature_auc_dict[col] = plotUnivariateROC(data[col], Y, col)
# Put a legend below current axis
box = ax.get_position()
ax.set_position([box.x0, box.y0 + box.height * 0.0 , box.width, box.height * 1])
ax.legend(loc = 'upper center', bbox_to_anchor = (0.5, -0.15), fancybox = True,
shadow = True, ncol = 4, prop = {'size':10})
<matplotlib.legend.Legend at 0x117969e80>
Next we want to add both of the dictionaries created above into a data frame.
#Add auc and mi each to a single dataframe
df_auc = pd.DataFrame(pd.Series(feature_auc_dict), columns = ['auc'])
df_mi = pd.DataFrame(pd.Series(feature_mi_dict), columns = ['mi'])
#Now merge the two on the feature name
feat_imp_df = df_auc.merge(df_mi, left_index = True, right_index = True)
feat_imp_df
auc | mi | |
---|---|---|
buy_freq | 0.668931 | 0.000000 |
buy_interval | 0.564296 | 0.000000 |
expected_time_buy | 0.526401 | 0.031809 |
expected_time_visit | 0.594029 | 0.016068 |
isbuyer | 0.667206 | 0.000000 |
last_buy | 0.660148 | 0.191752 |
last_visit | 0.814216 | 0.466288 |
multiple_buy | 0.564313 | 0.000000 |
multiple_visit | 0.733904 | 0.000000 |
num_checkins | 0.574946 | 0.037052 |
sv_interval | 0.717967 | 0.063482 |
uniq_urls | 0.588509 | 0.025836 |
visit_freq | 0.780138 | 0.167713 |
To put the different metrics on the same scale, we'll use pandas rank() method for each feature.
#Now create a df that holds the ranks of auc and mi
feat_ranks = feat_imp_df.rank(axis = 0, ascending = False)
#Plot the two ranks
plt.plot(feat_ranks.auc, feat_ranks.mi, '.')
#Plot a y=x reference line
plt.plot(feat_ranks.auc, feat_ranks.auc,)
[<matplotlib.lines.Line2D at 0x1052bfc50>]
Do the feature importance metrics above completely agree with each other (in terms of rank order)? If not, why do you suppose?
Some discussion points:
#Now create lists of top 5 features for both auc and mi
top5_auc = list(feat_ranks[(feat_ranks.auc <= 5)].index.values)
top5_mi = list(feat_ranks[(feat_ranks.mi <= 5)].index.values)
top5_auc, top5_mi
(['buy_freq', 'last_visit', 'multiple_visit', 'sv_interval', 'visit_freq'], ['last_buy', 'last_visit', 'num_checkins', 'sv_interval', 'visit_freq'])
The next step is the conclusive step from all the analysis done above. We want to test which method above can be used to produce the best subset of features. What we'll do is use the top 5 features ranked by both AUC and the decision tree feature importance and compare them against each other with different algorithms.
'''
Now do the following
1. Split the data into 80/20 train/test
2. For each set of features:
- build two decision trees (max_depth in {10, 20})
- build two logistic regression (C in {10**-2, 10**2})
- get the auc and log-loss on the test set
'''
from sklearn.linear_model import LogisticRegression
def getLogLoss(Ps, Ys, eps = 10**-6):
return ((Ys == 1) * np.log(Ps + eps) + (Ys == 0) * np.log(1 - Ps + eps)).mean()
#Split into train and test randomly without using sklearn package
#Note, there are many ways to do this:
train_pct = 0.8
#1. create an array of n random uniform variables drawn on [0,1] range
rand = np.random.rand(data.shape[0])
#2. Convert to boolean where True = random number < train_pct
rand_filt = (rand < train_pct)
#Use the filter to index data
train = data[rand_filt]
test = data[(rand_filt == False)]
fsets = [top5_auc, top5_mi]
fset_descr = ['auc', 'mi']
mxdepths = [5, 10]
Cs = [10**-2, 10**2]
#Set up plotting box
fig = plt.figure(figsize = (15, 8))
ax = plt.subplot(111)
for i, fset in enumerate(fsets):
descr = fset_descr[i]
#set training and testing data
Y_train = train['y_buy']
X_train = train[fset]
Y_test = test['y_buy']
X_test = test[fset]
for d in mxdepths:
dt = DecisionTreeClassifier(criterion = 'entropy', max_depth = d)
dt.fit(X_train, Y_train)
preds_dt = dt.predict_proba(X_test)[:, 1]
ll_dt = getLogLoss(preds_dt, Y_test)
plotUnivariateROC(preds_dt, Y_test, '{}:DT:md={}:(LL={})'.format(descr, d, round(ll_dt, 3)))
for C in Cs:
lr = LogisticRegression(C = C)
lr.fit(X_train, Y_train)
preds_lr = lr.predict_proba(X_test)[:, 1]
ll_lr = getLogLoss(preds_lr, Y_test)
plotUnivariateROC(preds_lr, Y_test, '{}:LR:C={}:(LL={})'.format(descr, C, round(ll_lr, 3)))
# Put a legend below current axis
box = ax.get_position()
ax.set_position([box.x0, box.y0 + box.height * 0.0 , box.width, box.height * 1])
ax.legend(loc = 'upper center', bbox_to_anchor = (0.5, -0.15), fancybox = True,
shadow = True, ncol = 2, prop = {'size':10})
<matplotlib.legend.Legend at 0x116a78b70>
So which feature selection method produced a better model? Any intuition as to why?