#!/usr/bin/env python
# coding: utf-8

# In[1]:

import numpy as np
import synapseclient
import os
import sys
import pandas
import pylab as pl
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
from sklearn import svm, datasets
from sklearn.utils import shuffle
from sklearn.metrics import roc_curve, auc, precision_recall_curve
import warnings
warnings.filterwarnings('ignore')


# In[2]:

syn = synapseclient.login()


# In[3]:

def get_truePhenotype_testSamples_RA_challenge(syn):
    """Read the true phenotype values for the test samples of the RA challenge."""
    # get the phenotype data
    phenotype_data_synId = 'syn2324939'
    phenotype_data_entity = syn.get(phenotype_data_synId)
    phenotype_df = pandas.read_csv(phenotype_data_entity.path, header=0, sep=" ", index_col=0)
    phenotype_df = phenotype_df.reset_index()
    # rename column 'ID_1' to 'ID'
    phenotype_df['ID'] = phenotype_df.ID_1
    phenotype_df = phenotype_df.drop(['ID_1'], axis=1)
    # get the partitions and extract the test sample IDs (test1 == 1)
    partition_data_synId = 'syn2351768'
    partition_data_entity = syn.get(partition_data_synId)
    partitions_df = pandas.read_csv(partition_data_entity.path, header=0, sep=" ", index_col=0)
    partitions_df = partitions_df.reset_index()
    # boolean indexing instead of query(), which generated a warning
    # testSamples_ID = partitions_df.query('test1 == 1')['ID']
    testSamples_ID = partitions_df[partitions_df['test1'] == 1]['ID']
    # return only the phenotypes for the test sample set
    # testSamples_true_phenotype = phenotype_df.query('ID == testSamples_ID')
    testSamples_true_phenotype = phenotype_df[phenotype_df.ID.isin(testSamples_ID)]
    return testSamples_true_phenotype


# In[4]:

truth_df = get_truePhenotype_testSamples_RA_challenge(syn)


# #### Creating a binary class for precision-recall calculation
# * 1 = non-responder
# * 0 = responder

# In[5]:

truth_df['true_class'] = np.nan
truth_df.loc[truth_df.Response.isin(['Intermediate', 'Good']), 'true_class'] = 0
truth_df.loc[truth_df.Response.isin(['Non', 'Supernon']), 'true_class'] = 1


# In[6]:

def __get_blockWise_stats(sub_stats):
    """Group the ranked predictions into blocks of tied values and compute per-block
    true-positive counts and densities plus their cumulative sums."""
    # group to calculate block-wise stats (one block per distinct predicted value)
    grouped = sub_stats.groupby(['predict'], sort=False)
    # instantiate a dataframe to store the results for each group (tied values)
    result = pandas.DataFrame.from_dict({'block': range(len(grouped)),
                                         'block_numElements': np.nan,
                                         'block_truePos_density': np.nan,
                                         'block_truePos': np.nan,
                                         'blockValue': np.nan})
    for block, (name, grp) in enumerate(grouped):
        truePositive = sum(grp.truth == 1)
        grp_truePositive_density = truePositive / float(len(grp))
        idxs = result.block == block
        result.loc[idxs, 'block_truePos_density'] = grp_truePositive_density
        result.loc[idxs, 'block_numElements'] = len(grp)
        result.loc[idxs, 'block_truePos'] = truePositive
        result.loc[idxs, 'blockValue'] = name
    # switch to 1-based block numbering and add cumulative counts
    result.block = result.block + 1
    result['cum_numElements'] = result.block_numElements.cumsum()
    result['cum_truePos'] = result.block_truePos.cumsum()
    return result
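
# The cell below is not from the original analysis; it is a minimal sketch (using made-up
# `toy_truth` / `toy_pred` vectors) of what `__get_blockWise_stats` computes: one row per
# block of tied prediction values, with that block's true-positive count and density and the
# cumulative totals used later for the interpolation.

# In[ ]:

# hypothetical toy data: three blocks of tied scores (0.9, 0.5, 0.1), ranked high to low
toy_truth = [1, 0, 1, 1, 0, 0, 1, 0]
toy_pred = [0.9, 0.9, 0.5, 0.5, 0.5, 0.1, 0.1, 0.1]
toy_stats = pandas.DataFrame.from_dict({'predict': toy_pred, 'truth': toy_truth}, dtype='float64')
toy_stats = toy_stats.sort_values(by=['predict'], ascending=False)
# one row per block of tied predictions
print(__get_blockWise_stats(toy_stats))
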
# In[7]:

def get_precision_recall_fpr(truth, pred):
    """Compute non-linearly interpolated precision, recall and FPR for a set of predictions,
    handling tied prediction values block-wise."""
    sub_stats = pandas.DataFrame.from_dict({'predict': pred, 'truth': truth}, dtype='float64')
    sub_stats = sub_stats.sort_values(by=['predict'], ascending=False)
    blockWise_stats = __get_blockWise_stats(sub_stats)
    grouped = sub_stats.groupby(['predict'], sort=False)
    sub_stats = grouped.apply(__nonlinear_interpolated_evalStats, blockWise_stats)
    precision, recall, fpr, threshold = (sub_stats.precision.values, sub_stats.recall.values,
                                         sub_stats.fpr.values, sub_stats.predict.values)

    # YFG suggestion - for the case when truth == prediction
    # REF - https://github.com/Sage-Bionetworks/DARPA_Challenge/blob/master/challenge_config.py#L162
    # Fixes the PR-curve AUC error when prediction == truth by anchoring the curve at recall = 0
    # and repeating the first precision/FPR values.
    # Implemented numpy-style // Abhishek Pratap - 08/31/2016
    recall_mod = np.insert(recall, 0, 0)                   # add recall = 0 at the beginning
    precision_mod = np.insert(precision, 0, precision[0])  # repeat the first precision value
    fpr_mod = np.insert(fpr, 0, fpr[0])                    # repeat the first FPR value
    return (precision_mod, recall_mod, fpr_mod, threshold)


def __nonlinear_interpolated_evalStats(block_df, blockWise_stats):
    """For one block of tied predictions, interpolate precision, recall and FPR at each depth
    within the block by spreading the block's true positives uniformly across it."""
    blockValue = block_df.predict.unique()
    if len(blockValue) != 1:
        raise Exception("grouping by the predict column did not yield a unique predicted value per group")
    blockValue = blockValue[0]
    # squeeze converts the one-row dataframe to a series
    blockStats = blockWise_stats[blockWise_stats.blockValue == blockValue].squeeze()
    block_precision = []
    block_recall = []
    block_fpr = []
    test_FP = []  # debug accumulators; collected but not used downstream
    test_TP = []
    total_elements = blockWise_stats.cum_numElements.max()
    total_truePos = blockWise_stats.cum_truePos.max()
    total_trueNeg = total_elements - total_truePos
    for block_depth, _ in enumerate(block_df.iterrows(), start=1):
        # cumulative true positives, elements and true negatives seen up to the previous block
        if blockStats.block == 1:
            # first block: nothing seen before it
            cum_truePos_till_lastBlock = 0
            cum_numElements_till_lastBlock = 0
            cum_trueNeg_till_lastBlock = 0
        else:
            last_blockStats = blockWise_stats[blockWise_stats.block == (blockStats.block - 1)].squeeze()
            cum_truePos_till_lastBlock = last_blockStats['cum_truePos']
            cum_numElements_till_lastBlock = last_blockStats['cum_numElements']
            cum_trueNeg_till_lastBlock = cum_numElements_till_lastBlock - cum_truePos_till_lastBlock
        truePos = cum_truePos_till_lastBlock + (blockStats.block_truePos_density * block_depth)
        falsePos = cum_trueNeg_till_lastBlock + ((1 - blockStats.block_truePos_density) * block_depth)
        test_FP.append(falsePos)
        test_TP.append(truePos)
        # precision
        interpolated_precision = truePos / float(cum_numElements_till_lastBlock + block_depth)
        block_precision.append(interpolated_precision)
        # recall == true positive rate
        interpolated_recall = truePos / float(total_truePos)
        block_recall.append(interpolated_recall)
        # fpr == false positive rate
        interpolated_fpr = falsePos / float(total_trueNeg)
        block_fpr.append(interpolated_fpr)
    block_df['precision'] = block_precision
    block_df['recall'] = block_recall
    block_df['fpr'] = block_fpr
    block_df['block_depth'] = np.arange(1, block_df.shape[0] + 1)
    block_df['block'] = blockStats.block
    return block_df
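
# Another sketch that is not part of the original notebook: it reuses the hypothetical
# `toy_truth` / `toy_pred` vectors from above to compare the block-wise (non-linear)
# interpolation with sklearn's `precision_recall_curve`, which places a single point per
# distinct threshold and therefore interpolates across ties linearly.

# In[ ]:

precision_nl, recall_nl, fpr_nl, thresh_nl = get_precision_recall_fpr(toy_truth, toy_pred)
precision_sk, recall_sk, thresh_sk = precision_recall_curve(toy_truth, toy_pred)
print('non-linear (block-wise) AUPRC : %0.3f' % auc(recall_nl, precision_nl))
print('sklearn linear AUPRC          : %0.3f' % auc(recall_sk, precision_sk))
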
# In[45]:

def __temp_plot(truth, pred, debug=False):
    """Plot precision-recall and ROC curves using both sklearn's linear interpolation and the
    block-wise non-linear interpolation; optionally print the intermediate block statistics."""
    fpr1, tpr1, threshold1 = roc_curve(truth, pred)
    precision1, recall1, thresholds = precision_recall_curve(truth, pred)
    print('linear interpolation (AUC: %0.3f)' % auc(recall1, precision1))
    precision2, recall2, fpr2, threshold2 = get_precision_recall_fpr(truth, pred)
    print('Non-linear interpolation (AUC: %0.3f)' % auc(recall2, precision2))

    plt.clf()
    fig = plt.figure(figsize=(8, 6))
    fig.subplots_adjust(hspace=.5)

    ax1 = fig.add_subplot(211)
    ax1.plot(recall1, precision1, label='linear (AUC: %0.3f)' % auc(recall1, precision1))
    ax1.plot(recall2, precision2, label='Non-linear (AUC: %0.3f)' % auc(recall2, precision2))
    ax1.set_xlabel('Recall')
    ax1.set_ylabel('Precision')
    ax1.set_ylim([0.0, 1.2])
    ax1.set_xlim([0.0, 1.2])
    ax1.set_title('Precision-Recall')
    ax1.legend(loc="upper right", fontsize='small')

    ax2 = fig.add_subplot(212)
    ax2.plot(fpr1, tpr1, label='linear (AUC: %0.3f)' % auc(fpr1, tpr1))
    ax2.plot(fpr2, recall2, label='Non-linear (AUC: %0.3f)' % auc(fpr2, recall2))
    ax2.plot(np.arange(0, 1.1, .1), np.arange(0, 1.1, .1), '--', label='TPR=FPR')
    ax2.set_xlabel('FPR')
    ax2.set_ylabel('TPR')
    ax2.set_ylim([0.0, 1.2])
    ax2.set_xlim([0.0, 1.2])
    ax2.set_title('ROC')
    ax2.legend(loc="lower right", fontsize='small')
    plt.show()

    if debug:
        sub_stats = pandas.DataFrame.from_dict({'predict': pred, 'truth': truth}, dtype='float64')
        sub_stats = sub_stats.sort_values(by=['predict'], ascending=False)
        blockWise_stats = __get_blockWise_stats(sub_stats)
        grouped = sub_stats.groupby(['predict'], sort=False)
        print('-- Block-wise true positive densities -----\n')
        print(blockWise_stats)
        print('\n\n--- Stats per observation ---\n')
        sub_stats = grouped.apply(__nonlinear_interpolated_evalStats, blockWise_stats)
        print(sub_stats)


# ## Examples

# ### Example 1 : single belief score

# In[46]:

sub = syn.getSubmission(2363293)
df = pandas.read_csv(sub.filePath, index_col=0).reset_index()
df = df.merge(truth_df[['true_class', 'ID', 'Response']], left_on="ID", right_on="ID", how="outer")
__temp_plot(df.true_class, df.belief_gen)


# ### Example 2 : binary predictions

# In[17]:

sub = syn.getSubmission(2363303)
df = pandas.read_csv(sub.filePath, index_col=0).reset_index()
df = df.merge(truth_df[['true_class', 'ID', 'Response']], left_on="ID", right_on="ID", how="outer")
__temp_plot(df.true_class, df.belief_gen)


# ### Example 3 : less skewed scores

# In[18]:

sub = syn.getSubmission(2364533)
df = pandas.read_csv(sub.filePath, index_col=0).reset_index()
df = df.merge(truth_df[['true_class', 'ID', 'Response']], left_on="ID", right_on="ID", how="outer")
__temp_plot(df.true_class, df.belief_gen)


# ### Example 4 : 670 unique scores

# In[19]:

sub = syn.getSubmission(2368533)
df = pandas.read_csv(sub.filePath, index_col=0).reset_index()
df = df.merge(truth_df[['true_class', 'ID', 'Response']], left_on="ID", right_on="ID", how="outer")
__temp_plot(df.true_class, df.belief_gen)


# ### Testing -- When truth == prediction (May 9, 2016)
#
# Case reported by Tom.

# In[22]:

pred = np.concatenate([np.repeat(1, 25), np.repeat(0, 30)])
truth = pred
__temp_plot(truth, pred)


# #### 8/9/2016 - Testing another case
# Reported by Tom - possible bug.

# In[48]:

sub1 = syn.getSubmission(7180273)
sub1 = pandas.read_csv(sub1.filePath, index_col=0).reset_index()
sub1['pred'] = sub1.SHEDDING_SC1

sub2 = syn.getSubmission(7154754)
sub2 = pandas.read_csv(sub2.filePath, index_col=0).reset_index()
sub2['pred'] = sub2.SHEDDING_SC1

truth = syn.get('syn5705153')
truth = pandas.read_csv(truth.path, index_col=0).reset_index()
truth['true_class'] = truth.SHEDDING_SC1


# In[49]:

data = sub1.merge(truth, left_on="SUBJECTID", right_on="SUBJECTID", how="outer")
__temp_plot(data.true_class, data.pred, debug=True)


# In[50]:

data = sub2.merge(truth, left_on="SUBJECTID", right_on="SUBJECTID", how="outer")
__temp_plot(data.true_class, data.pred, debug=True)


# In[ ]:
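
# Closing sanity check, not an original cell: with a constant (fully tied) score vector the
# block-wise interpolation gives an AUPRC matching the class prevalence (the random-classifier
# baseline), whereas linear interpolation over sklearn's single threshold point overstates it.
# The `const_truth` / `const_pred` vectors are made up for illustration.

# In[ ]:

const_truth = np.concatenate([np.repeat(1, 20), np.repeat(0, 80)])
const_pred = np.repeat(0.5, 100)  # every prediction tied at the same value

prec_lin, rec_lin, _ = precision_recall_curve(const_truth, const_pred)
prec_nl, rec_nl, _, _ = get_precision_recall_fpr(const_truth, const_pred)

print('class prevalence (random baseline) : %0.3f' % const_truth.mean())
print('linear interpolation AUPRC         : %0.3f' % auc(rec_lin, prec_lin))
print('non-linear interpolation AUPRC     : %0.3f' % auc(rec_nl, prec_nl))
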