#Build the datasets
from importlib import reload
import math

from sklearn.metrics import roc_curve, auc
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm, linear_model
from sklearn.neighbors import KNeighborsClassifier
import course_utils as bd
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

reload(bd)  #pick up any local edits to course_utils

#Load data and downsample for a 50/50 split, then split into a train/test
f = '/Users/briand/Desktop/ds course/datasets/ads_dataset_cut.txt'
train_split = 0.5
tdat = pd.read_csv(f, header=0, sep='\t')
lab = 'y_buy'
moddat = bd.downSample(tdat, lab, 9)

#We know the dataset is sorted, so we can just split by index
split_idx = int(math.floor(moddat.shape[0] * train_split))
train = moddat[:split_idx]
test = moddat[split_idx:]

#Train the models
dt = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=10, max_depth=4)
dt = dt.fit(train.drop(lab, axis=1), train[lab])

lr = linear_model.LogisticRegression(C=1000)
lr.fit(train.drop(lab, axis=1), train[lab])

#Train the SVM on the target rescaled from {0, 1} to {-1, +1}
mm = svm.SVC(kernel='linear', C=1)
mm.fit(train.drop(lab, axis=1), 2 * train[lab] - 1)

knn = KNeighborsClassifier(n_neighbors=10, p=2)
knn.fit(train.drop(lab, axis=1), train[lab])

def plotAUC(truth, pred, lab):
    #Plot a single ROC curve, labeled with its AUC, in a random color
    fpr, tpr, thresholds = roc_curve(truth, pred)
    roc_auc = auc(fpr, tpr)
    c = (np.random.rand(), np.random.rand(), np.random.rand())
    plt.plot(fpr, tpr, color=c, label=lab + ' (AUC = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.title('ROC')
    plt.legend(loc='lower right')

plotAUC(test[lab], dt.predict_proba(test.drop(lab, axis=1))[:, 1], 'DT')
plotAUC(test[lab], lr.predict_proba(test.drop(lab, axis=1))[:, 1], 'LR')
plotAUC(test[lab], mm.decision_function(test.drop(lab, axis=1)), 'SVM')
plotAUC(test[lab], knn.predict_proba(test.drop(lab, axis=1))[:, 1], 'kNN')
plt.show()

def getMAE(pred, truth):
    #Mean absolute error between predicted probabilities and 0/1 outcomes
    return np.abs(truth - pred).mean()

def getLL(pred, truth):
    #Mean Bernoulli log-likelihood, clipping predictions away from 0 and 1
    ll_sum = 0
    for i in range(len(pred)):
        if pred[i] == 0:
            p = 0.0001
        elif pred[i] == 1:
            p = 0.9999
        else:
            p = pred[i]
        ll_sum += truth[i] * np.log(p) + (1 - truth[i]) * np.log(1 - p)
    return ll_sum / len(pred)

def plotCalib(truth, pred, bins, f, l):
    #Bin predictions, then plot mean prediction vs. mean outcome per bin;
    #bins with fewer than f observations are dropped
    mae = np.round(getMAE(pred, truth), 3)
    ll = np.round(getLL(pred, truth), 3)
    d = pd.DataFrame({'p': pred, 'y': truth})
    d['p_bin'] = np.floor(d['p'] * bins) / bins
    d_bin = d.groupby(['p_bin']).agg([np.mean, len])
    filt = (d_bin['p']['len'] > f)
    plt.plot(d_bin['p']['mean'][filt], d_bin['y']['mean'][filt], 'b.',
             label=l + ': ' + 'll={},mae={}'.format(ll, mae))
    plt.plot([0.0, 1.0], [0.0, 1.0], 'k-')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('prediction P(Y|X)')
    plt.ylabel('actual P(Y|X)')
    plt.legend(loc='lower right')
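#A quick sanity check of the metric helpers (a sketch; the arrays below are
#made-up illustrative values, not part of the ads dataset). A perfect
#prediction should give MAE of 0 and a log-likelihood near 0, while a
#constant 0.5 prediction scores log(0.5), roughly -0.693.
_y = np.array([0, 1, 0, 1])
print(getMAE(np.array([0.0, 1.0, 0.0, 1.0]), _y))   #0.0
print(getLL(np.array([0.0, 1.0, 0.0, 1.0]), _y))    #~-0.0001 (probs are clipped)
print(getLL(np.array([0.5, 0.5, 0.5, 0.5]), _y))    #~-0.693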
plt.clf()
fig = plt.figure()
#plt.title('Calibration Charts for DT vs. LR')
ax = fig.add_subplot(2, 1, 1)
plotCalib(test[lab].values, dt.predict_proba(test.drop(lab, axis=1))[:, 1], 50, 10, 'DT')
ax = fig.add_subplot(2, 1, 2)
plotCalib(test[lab].values, lr.predict_proba(test.drop(lab, axis=1))[:, 1], 50, 10, 'LR')
plt.show()

from sklearn.metrics import roc_auc_score

def evalFeat(x_train, y_train, x_test, y_test):
    #Fit a (nearly) unregularized single-feature logistic regression and
    #return its negative log-likelihood and AUC on the test set
    lr_f = linear_model.LogisticRegression(C=1e30)
    lr_f.fit(x_train, y_train)
    p = lr_f.predict_proba(x_test)[:, 1]
    ll = -1 * getLL(p, y_test.values)
    auc = roc_auc_score(y_test, p)
    return [ll, auc]

#Evaluate each feature on its own
lls = []
aucs = []
feats = train.drop(lab, axis=1).columns.values
for feat in feats:
    ll_f, auc_f = evalFeat(train[[feat]], train[lab], test[[feat]], test[lab])
    lls.append(ll_f)
    aucs.append(auc_f)

def liftTable(pred, truth, b):
    #Rank by prediction into b quantile buckets (adding tiny noise to break
    #ties), then compute per-bucket and cumulative lift over the base rate
    df = pd.DataFrame({'p': pred + np.random.rand(len(pred)) * 0.000001, 'y': truth})
    df['b'] = b - pd.qcut(df['p'], b, labels=False)
    df['n'] = np.ones(df.shape[0])
    df_grp = df.groupby(['b']).sum()
    base = np.sum(df_grp['y']) / float(df.shape[0])
    df_grp['n_cum'] = np.cumsum(df_grp['n']) / float(df.shape[0])
    df_grp['y_cum'] = np.cumsum(df_grp['y'])
    df_grp['p_y_b'] = df_grp['y'] / df_grp['n']
    df_grp['lift_b'] = df_grp['p_y_b'] / base
    df_grp['cum_lift_b'] = (df_grp['y_cum'] / (float(df.shape[0]) * df_grp['n_cum'])) / base
    return df_grp

lifts_lr = liftTable(lr.predict_proba(test.drop(lab, axis=1))[:, 1], test[lab], 25)
#lifts_svm = liftTable(mm.decision_function(test.drop(lab, axis=1)), test[lab], 25)
lifts_dt = liftTable(dt.predict_proba(test.drop(lab, axis=1))[:, 1] + np.random.rand(test.shape[0]) * 0.00001, test[lab], 25)
lifts_knn = liftTable(knn.predict_proba(test.drop(lab, axis=1))[:, 1], test[lab], 25)

plt.title('Lift Chart on Ads Data: LR vs. DT vs. kNN')
plt.plot(lifts_lr['n_cum'], lifts_lr['cum_lift_b'], label='LR')
plt.plot(lifts_knn['n_cum'], lifts_knn['cum_lift_b'], label='kNN')
plt.plot(lifts_dt['n_cum'], lifts_dt['cum_lift_b'], label='DT')
plt.plot(lifts_lr['n_cum'], np.ones(lifts_lr.shape[0]))  #baseline: lift = 1
plt.xlim([lifts_lr['n_cum'].min(), lifts_lr['n_cum'].max()])
plt.ylim([0.0, lifts_lr['cum_lift_b'].max() + 1])
plt.xlabel('Cumulative Ranked Users')
plt.ylabel('Cumulative Lift')
plt.legend()
plt.show()
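#One way to inspect the single-feature results computed in the evalFeat loop
#above (a sketch; the feat_eval name is ours, not from the course code):
#collect the per-feature scores into a frame and rank features by test AUC.
#Note lls holds negative log-likelihoods, so lower values are better there.
feat_eval = pd.DataFrame({'feature': feats, 'neg_ll': lls, 'auc': aucs})
print(feat_eval.sort_values('auc', ascending=False).head(10))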