#Build the datasets
from importlib import reload
import math

from sklearn.metrics import roc_curve, auc
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm, linear_model
from sklearn.neighbors import KNeighborsClassifier
import course_utils as bd
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

reload(bd)  #pick up any local edits to course_utils

#Load data and downsample for a 50/50 split, then split into a train/test
f = '/Users/briand/Desktop/ds course/datasets/ads_dataset_cut.txt'
train_split = 0.5
tdat = pd.read_csv(f, header=0, sep='\t')
lab = 'y_buy'
moddat = bd.downSample(tdat, lab, 9)

#We know the dataset is sorted, so we can just split by index
split_idx = int(math.floor(moddat.shape[0] * train_split))
train = moddat[:split_idx]
test = moddat[split_idx:]

#Train the models
dt = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=10, max_depth=4)
dt = dt.fit(train.drop(lab, axis=1), train[lab])

lr = linear_model.LogisticRegression(C=1000)
lr.fit(train.drop(lab, axis=1), train[lab])

#Train the SVM on the target rescaled from {0, 1} to {-1, +1}
mm = svm.SVC(kernel='linear', C=1)
mm.fit(train.drop(lab, axis=1), 2 * train[lab] - 1)

knn = KNeighborsClassifier(n_neighbors=10, p=2)
knn.fit(train.drop(lab, axis=1), train[lab])

def plotAUC(truth, pred, lab):
    #Plot a single ROC curve, labeled with its AUC, in a random color
    fpr, tpr, thresholds = roc_curve(truth, pred)
    roc_auc = auc(fpr, tpr)
    c = (np.random.rand(), np.random.rand(), np.random.rand())
    plt.plot(fpr, tpr, color=c, label=lab + ' (AUC = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.title('ROC')
    plt.legend(loc='lower right')

plotAUC(test[lab], dt.predict_proba(test.drop(lab, axis=1))[:, 1], 'DT')
plotAUC(test[lab], lr.predict_proba(test.drop(lab, axis=1))[:, 1], 'LR')
plotAUC(test[lab], mm.decision_function(test.drop(lab, axis=1)), 'SVM')
plotAUC(test[lab], knn.predict_proba(test.drop(lab, axis=1))[:, 1], 'kNN')
plt.show()

def getMAE(pred, truth):
    #Mean absolute error between predicted probabilities and 0/1 outcomes
    return np.abs(truth - pred).mean()

def getLL(pred, truth):
    #Mean Bernoulli log-likelihood, clipping predictions away from 0 and 1
    ll_sum = 0
    for i in range(len(pred)):
        if pred[i] == 0:
            p = 0.0001
        elif pred[i] == 1:
            p = 0.9999
        else:
            p = pred[i]
        ll_sum += truth[i] * np.log(p) + (1 - truth[i]) * np.log(1 - p)
    return ll_sum / len(pred)

def plotCalib(truth, pred, bins, f, l):
    #Bin predictions, then plot mean prediction vs. mean outcome per bin;
    #bins with fewer than f observations are dropped
    mae = np.round(getMAE(pred, truth), 3)
    ll = np.round(getLL(pred, truth), 3)
    d = pd.DataFrame({'p': pred, 'y': truth})
    d['p_bin'] = np.floor(d['p'] * bins) / bins
    d_bin = d.groupby(['p_bin']).agg([np.mean, len])
    filt = (d_bin['p']['len'] > f)
    plt.plot(d_bin['p']['mean'][filt], d_bin['y']['mean'][filt], 'b.',
             label=l + ': ' + 'll={},mae={}'.format(ll, mae))
    plt.plot([0.0, 1.0], [0.0, 1.0], 'k-')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('prediction P(Y|X)')
    plt.ylabel('actual P(Y|X)')
    plt.legend(loc='lower right')
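#A quick sanity check of the metric helpers (a sketch; the arrays below are
#made-up illustrative values, not part of the ads dataset). A perfect
#prediction should give MAE of 0 and a log-likelihood near 0, while a
#constant 0.5 prediction scores log(0.5), roughly -0.693.
_y = np.array([0, 1, 0, 1])
print(getMAE(np.array([0.0, 1.0, 0.0, 1.0]), _y))   #0.0
print(getLL(np.array([0.0, 1.0, 0.0, 1.0]), _y))    #~-0.0001 (probs are clipped)
print(getLL(np.array([0.5, 0.5, 0.5, 0.5]), _y))    #~-0.693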
plt.clf()
fig = plt.figure()
#plt.title('Calibration Charts for DT vs. LR')
ax = fig.add_subplot(2, 1, 1)
plotCalib(test[lab].values, dt.predict_proba(test.drop(lab, axis=1))[:, 1], 50, 10, 'DT')
ax = fig.add_subplot(2, 1, 2)
plotCalib(test[lab].values, lr.predict_proba(test.drop(lab, axis=1))[:, 1], 50, 10, 'LR')
plt.show()

from sklearn.metrics import roc_auc_score

def evalFeat(x_train, y_train, x_test, y_test):
    #Fit a (nearly) unregularized single-feature logistic regression and
    #return its negative log-likelihood and AUC on the test set
    lr_f = linear_model.LogisticRegression(C=1e30)
    lr_f.fit(x_train, y_train)
    p = lr_f.predict_proba(x_test)[:, 1]
    ll = -1 * getLL(p, y_test.values)
    auc = roc_auc_score(y_test, p)
    return [ll, auc]

#Evaluate each feature on its own
lls = []
aucs = []
feats = train.drop(lab, axis=1).columns.values
for feat in feats:
    ll_f, auc_f = evalFeat(train[[feat]], train[lab], test[[feat]], test[lab])
    lls.append(ll_f)
    aucs.append(auc_f)

def liftTable(pred, truth, b):
    #Rank by prediction into b quantile buckets (adding tiny noise to break
    #ties), then compute per-bucket and cumulative lift over the base rate
    df = pd.DataFrame({'p': pred + np.random.rand(len(pred)) * 0.000001, 'y': truth})
    df['b'] = b - pd.qcut(df['p'], b, labels=False)
    df['n'] = np.ones(df.shape[0])
    df_grp = df.groupby(['b']).sum()
    base = np.sum(df_grp['y']) / float(df.shape[0])
    df_grp['n_cum'] = np.cumsum(df_grp['n']) / float(df.shape[0])
    df_grp['y_cum'] = np.cumsum(df_grp['y'])
    df_grp['p_y_b'] = df_grp['y'] / df_grp['n']
    df_grp['lift_b'] = df_grp['p_y_b'] / base
    df_grp['cum_lift_b'] = (df_grp['y_cum'] / (float(df.shape[0]) * df_grp['n_cum'])) / base
    return df_grp

lifts_lr = liftTable(lr.predict_proba(test.drop(lab, axis=1))[:, 1], test[lab], 25)
#lifts_svm = liftTable(mm.decision_function(test.drop(lab, axis=1)), test[lab], 25)
lifts_dt = liftTable(dt.predict_proba(test.drop(lab, axis=1))[:, 1] + np.random.rand(test.shape[0]) * 0.00001, test[lab], 25)
lifts_knn = liftTable(knn.predict_proba(test.drop(lab, axis=1))[:, 1], test[lab], 25)

plt.title('Lift Chart on Ads Data: LR vs. DT vs. kNN')
plt.plot(lifts_lr['n_cum'], lifts_lr['cum_lift_b'], label='LR')
plt.plot(lifts_knn['n_cum'], lifts_knn['cum_lift_b'], label='kNN')
plt.plot(lifts_dt['n_cum'], lifts_dt['cum_lift_b'], label='DT')
plt.plot(lifts_lr['n_cum'], np.ones(lifts_lr.shape[0]))  #baseline: lift = 1
plt.xlim([lifts_lr['n_cum'].min(), lifts_lr['n_cum'].max()])
plt.ylim([0.0, lifts_lr['cum_lift_b'].max() + 1])
plt.xlabel('Cumulative Ranked Users')
plt.ylabel('Cumulative Lift')
plt.legend()
plt.show()
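#One way to inspect the single-feature results computed in the evalFeat loop
#above (a sketch; the feat_eval name is ours, not from the course code):
#collect the per-feature scores into a frame and rank features by test AUC.
#Note lls holds negative log-likelihoods, so lower values are better there.
feat_eval = pd.DataFrame({'feature': feats, 'neg_ll': lls, 'auc': aucs})
print(feat_eval.sort_values('auc', ascending=False).head(10))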