The Receiver Operating Characteristic (ROC) curve is a useful tool for understanding how well a classification model ranks instances.
Defining the ROC
Assume some function $f(x)$ that gives real-valued output. We define a classifier based on $f(x)$ by comparing it to a threshold $t$:
$$h_t(x) = \begin{cases} 1 & \text{if } f(x) > t \\ 0 & \text{otherwise.} \end{cases}$$
Each choice of $t$ yields a true positive rate $TPR(t)$ and a false positive rate $FPR(t)$; the ROC curve is the plot of $TPR$ against $FPR$ as $t$ sweeps over all possible thresholds.
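To make the construction concrete, here is a minimal sketch (using made-up scores and labels, not the ads data used later) that computes a single point of the ROC for one choice of threshold:
import numpy as np
#hypothetical scores f(x) and true labels, for illustration only
scores = np.array([0.9, 0.8, 0.7, 0.55, 0.4, 0.3, 0.2, 0.1])
labels = np.array([1, 1, 0, 1, 0, 1, 0, 0])
t = 0.5  #one choice of threshold
pred = (scores > t).astype(int)
#rates at this single threshold; sweeping t traces out the full ROC curve
tpr = np.sum((pred == 1) & (labels == 1))/float(np.sum(labels == 1))
fpr = np.sum((pred == 1) & (labels == 0))/float(np.sum(labels == 0))
print(t, fpr, tpr)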
The Area Under the Curve
The area under the curve (AUC) captures the ROC information as a single metric. It is useful for both model and feature selection. With the ROC defined, we simply integrate across all values of $FPR$ to get the area:
$$AUC = \int_0^1 TPR \; dFPR$$
While this is the exact definition, and is how one might measure it empirically using numerical integration, it is not necessarily intuitive to think of a classifier in the $FPR$ domain. After all, we choose the threshold $t$ and not the $FPR$. We can get around this, but first let's define a few things. Let $s^+(v)$ and $s^-(v)$ be the PDFs of $v=f(x)$ for positive and negative labeled examples, respectively. Similarly, let $S^+(v)$ and $S^-(v)$ be the CDFs of $v=f(x)$ for positive and negative labeled examples, respectively.
We can use these to define the $FPR$ and $TPR$ as functions of $t$: a negative is a false positive when its score exceeds the threshold, and a positive is a true positive when its score exceeds the threshold, so
$$FPR(t) = 1 - S^-(t), \qquad TPR(t) = 1 - S^+(t).$$
With $FPR$ and $TPR$ defined as functions of $t$, we can use the fact that $\frac{dFPR(t)}{dt}=-s^-(t)$ to change variables and show that:
$$AUC = \int_0^1 TPR \; dFPR = \int_{+\infty}^{-\infty} TPR(t)\,\frac{dFPR(t)}{dt}\,dt = \int_{-\infty}^{+\infty} \left(1 - S^+(t)\right) s^-(t)\,dt.$$
(The limits flip because $FPR$ runs from $1$ at $t=-\infty$ down to $0$ at $t=+\infty$.)
This last expression writes the $AUC$ in terms of the distributions of $f(x)$ for positive and negative instances, which gives the AUC a probabilistic interpretation.
For a given threshold $t$, define an infinitesimal interval $[t, t+dt]$. The probability that a negative has $v$ in that interval is $s^-(t)\:dt$. The probability that a positive has a score greater than $t$ is $TPR(t)=1-S^+(t)$, so the probability that a positive scores higher than a negative, given that the negative's score lies in $[t, t+dt]$, is $(1-S^+(t))\:s^-(t)\:dt$.
Marginalize out $t$ by integrating over all possible values to get the probability that a randomly chosen positive ranks higher than a randomly chosen negative:
$$P(v^+ > v^-) = \int_{-\infty}^{+\infty} \left(1 - S^+(t)\right) s^-(t)\,dt,$$
which, as we showed above, is exactly the AUC.
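Before moving on, here is a small sketch (on synthetic scores, unrelated to the models built below) that estimates the AUC directly as the fraction of positive/negative pairs in which the positive outranks the negative, and checks it against sklearn's roc_auc_score:
import numpy as np
from sklearn.metrics import roc_auc_score
#synthetic scores: positives tend to score higher than negatives
rng = np.random.RandomState(0)
pos = rng.normal(1.0, 1.0, 500)  #scores f(x) for positive instances
neg = rng.normal(0.0, 1.0, 500)  #scores f(x) for negative instances
#fraction of (positive, negative) pairs where the positive outranks the negative
pairwise = np.mean(pos[:, None] > neg[None, :])
y = np.concatenate([np.ones(500), np.zeros(500)])
v = np.concatenate([pos, neg])
print(pairwise, roc_auc_score(y, v))  #the two values agree (up to ties)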
That's enough theory for now; let's do a model bakeoff and show how the ROC can be used for model selection. First we'll build models with four different algorithms and compare their ROC curves.
#Build the datasets
from sklearn.metrics import roc_curve, auc
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm, linear_model
from sklearn.neighbors import KNeighborsClassifier
import course_utils as bd
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
reload(bd)  #pick up any edits to the course utilities (Python 2 builtin; use importlib.reload in Python 3)
#Load data and downsample for a 50/50 split, then split into a train/test
f='/Users/briand/Desktop/ds course/datasets/ads_dataset_cut.txt'
train_split = 0.5
tdat = pd.read_csv(f,header=0,sep='\t')
lab = 'y_buy'
moddat = bd.downSample(tdat,lab,9)
#We know the dataset is sorted so we can just split by index
train = moddat[:int(math.floor(moddat.shape[0]*train_split))]
test = moddat[int(math.floor(moddat.shape[0]*train_split)):]
#Train the models
dt = DecisionTreeClassifier(criterion='entropy',min_samples_leaf = 10,max_depth = 4)
dt = dt.fit(train.drop(lab,1),train[lab])
lr = linear_model.LogisticRegression(C=1000)
lr.fit(train.drop(lab,1), train[lab])
mm = svm.SVC(kernel='linear', C=1)
mm.fit(train.drop(lab,1), 2*train[lab]-1)  #map the {0,1} labels to {-1,+1} for the SVM
knn = KNeighborsClassifier(n_neighbors=10, p=2)
knn.fit(train.drop(lab,1), train[lab])
#plt.clf()
def plotAUC(truth, pred, lab):
    #compute the ROC and its AUC, then add the curve to the current plot
    fpr, tpr, thresholds = roc_curve(truth, pred)
    roc_auc = auc(fpr, tpr)
    c = (np.random.rand(), np.random.rand(), np.random.rand())  #random color so repeated calls stay distinguishable
    plt.plot(fpr, tpr, color=c, label=lab+' (AUC = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')  #diagonal of a random ranker
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.title('ROC')
    plt.legend(loc="lower right")
plotAUC(test[lab], dt.predict_proba(test.drop(lab,1))[:,1], 'DT')
plotAUC(test[lab], lr.predict_proba(test.drop(lab,1))[:,1], 'LR')
plotAUC(test[lab], mm.decision_function(test.drop(lab,1)), 'SVM')
plotAUC(test[lab], knn.predict_proba(test.drop(lab,1))[:,1], 'kNN')
plt.show()
A calibration chart
A calibration chart compares predicted probabilities to observed outcomes: we bin the predictions and plot the mean prediction in each bin against the actual positive rate in that bin. A well-calibrated model's points fall near the diagonal.
def getMAE(pred, truth):
    #mean absolute error of the predicted probabilities
    return np.abs(truth - pred).mean()

def getLL(pred, truth):
    #average log-likelihood, clipping predictions of exactly 0 or 1 to avoid log(0)
    ll_sum = 0
    for i in range(len(pred)):
        if (pred[i] == 0):
            p = 0.0001
        elif (pred[i] == 1):
            p = 0.9999
        else:
            p = pred[i]
        ll_sum += truth[i]*np.log(p)+(1-truth[i])*np.log(1-p)
    return (ll_sum)/len(pred)
def plotCalib(truth, pred, bins, f, l):
    #bin the predictions and plot mean prediction vs. actual positive rate per bin
    mae = np.round(getMAE(pred, truth), 3)
    ll = np.round(getLL(pred, truth), 3)
    d = pd.DataFrame({'p':pred, 'y':truth})
    d['p_bin'] = np.floor(d['p']*bins)/bins
    d_bin = d.groupby(['p_bin']).agg([np.mean, len])
    filt = (d_bin['p']['len'] > f)  #only plot bins with more than f observations
    plt.plot(d_bin['p']['mean'][filt], d_bin['y']['mean'][filt], 'b.', label=l+': '+'ll={},mae={}'.format(ll, mae))
    plt.plot([0.0, 1.0], [0.0, 1.0], 'k-')  #perfect calibration line
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('prediction P(Y|X)')
    plt.ylabel('actual P(Y|X)')
    plt.legend(loc=4)
plt.clf()
fig = plt.figure()
#plt.title('Calibration Charts for DT vs. LR')
ax = fig.add_subplot(2,1,1)
plotCalib(test[lab].values, dt.predict_proba(test.drop(lab,1))[:,1], 50, 10, 'DT')
ax = fig.add_subplot(2,1,2)
plotCalib(test[lab].values, lr.predict_proba(test.drop(lab,1))[:,1], 50, 10, 'LR')
plt.show()
Loop through features to get AUC and LL
from sklearn.metrics import roc_auc_score
def evalFeat(x_train, y_train, x_test, y_test):
    #fit a single-feature logistic regression and score it on the test set
    lr_f = linear_model.LogisticRegression(C=1e30)
    lr_f.fit(x_train, y_train)
    p = lr_f.predict_proba(x_test)[:,1]
    ll = -1*getLL(p, y_test.values)
    auc = roc_auc_score(y_test, p)
    return [ll, auc]
lls = []
aucs = []
feats = train.drop(lab,1).columns.values
for f in feats:
    ll_f, auc_f = evalFeat(train[[f]], train[lab], test[[f]], test[lab])
    lls.append(ll_f)
    aucs.append(auc_f)
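With the per-feature scores collected, one natural next step is to rank the features by their single-variable AUC. A minimal sketch, assuming a pandas version that has sort_values and using the feats, aucs, and lls lists from the loop above:
feat_scores = pd.DataFrame({'feature': feats, 'auc': aucs, 'neg_ll': lls})
print(feat_scores.sort_values('auc', ascending=False).head(10))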
A quick Lift chart
Lift measures how much better the model does than random targeting: the positive rate among the top-ranked instances divided by the overall base rate.
def liftTable(pred, truth, b):
    #add a tiny random jitter so qcut can form b unique score bins
    df = pd.DataFrame({'p':pred+np.random.rand(len(pred))*0.000001, 'y':truth})
    df['b'] = b - pd.qcut(df['p'], b, labels=False)  #bin 1 holds the highest scores
    df['n'] = np.ones(df.shape[0])
    df_grp = df.groupby(['b']).sum()
    base = np.sum(df_grp['y'])/float(df.shape[0])  #overall positive rate
    df_grp['n_cum'] = np.cumsum(df_grp['n'])/float(df.shape[0])
    df_grp['y_cum'] = np.cumsum(df_grp['y'])
    df_grp['p_y_b'] = df_grp['y']/df_grp['n']  #positive rate within the bin
    df_grp['lift_b'] = df_grp['p_y_b']/base
    df_grp['cum_lift_b'] = (df_grp['y_cum']/(float(df.shape[0])*df_grp['n_cum']))/base
    return df_grp
lifts_lr = liftTable(lr.predict_proba(test.drop(lab,1))[:,1], test[lab], 25)
#lifts_svm = liftTable(mm.decision_function(test.drop(lab,1)), test[lab], 25)
lifts_dt = liftTable(dt.predict_proba(test.drop(lab,1))[:,1]+np.random.rand(test.shape[0])*0.00001, test[lab], 25)  #extra jitter: the tree emits few distinct scores
lifts_knn = liftTable(knn.predict_proba(test.drop(lab,1))[:,1], test[lab], 25)
plt.title('A Lift Chart on Ads Data with LR')
plt.plot(lifts_lr['n_cum'], lifts_lr['cum_lift_b'], label='LR')
plt.plot(lifts_knn['n_cum'], lifts_knn['cum_lift_b'], label='kNN')
plt.plot(lifts_dt['n_cum'], lifts_dt['cum_lift_b'], label='DT')
plt.plot(lifts_lr['n_cum'], np.ones(lifts_lr.shape[0]))
plt.xlim([lifts_lr['n_cum'].min(), lifts_lr['n_cum'].max()])
plt.ylim([0.0, lifts_lr['cum_lift_b'].max()+1])
plt.xlabel('Cumulative Ranked Users')
plt.ylabel('Cumulative Lift')
plt.legend()
plt.show()