Notebook

In [1]:

import pandas as pd
import scipy.io
from os import listdir
from sklearn.utils import shuffle
from sklearn import preprocessing
#from nilearn import plotting
#from proteus.io import util
from proteus.visu import sbp_visu

import glob,os
#import nibabel as nib


import pickle
from proteus.predic import high_confidence_at
import numpy as np
import pandas as pd
from proteus.predic import prediction
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit

from sklearn import metrics

import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns

import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.stats.multicomp import pairwise_tukeyhsd

from copy import deepcopy

from itertools import cycle
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from scipy import interp

from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve

/home/angela/anaconda3/envs/vcog_paper_py35/lib/python3.5/site-packages/sklearn/ensemble/weight_boosting.py:29: DeprecationWarning: numpy.core.umath_tests is an internal NumPy module and should not be imported. It will be removed in a future NumPy release.
  from numpy.core.umath_tests import inner1d
/home/angela/anaconda3/envs/vcog_paper_py35/lib/python3.5/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools

In [2]:

def stats_mask(y_true, y_pred, mask_selected=None):
    if mask_selected is None:
        mask_selected = np.ones(y_pred.shape).astype(bool)
    print('------------------------')
    print('Ratio:', y_true[mask_selected].sum()/y_true.sum()) 
    print('#    : ', y_true[mask_selected].sum()) 
    print('# true values: ',mask_selected.sum())
    print('ACC  : ', np.mean((y_true == y_pred)[mask_selected])) 

In [3]:

def predic_stats(y_, y_pred, lr_decision):
    # number of AD subjects
    n_ad = sum(y_)
    print('Total number of TARGET subjects: ', n_ad)

    # number of CN subjects
    n_cn = len(y_) - sum(y_)
    print('Total number of NON-TARGET subjects: ', n_cn)
    
    # number of subjects predicted as AD at stage 1
    n_pos = sum(y_pred)
    print('Stage 1 number of hits (true and false positives): ', n_pos)
    
    # true positives at stage 1
    n_pos_ad = sum(y_pred[y_.astype(bool)])
    print('Stage 1 TRUE positives: ', n_pos_ad)
    
    # false positives at stage 1
    n_pos_cn = n_pos - n_pos_ad
    print('Stage 1 FALSE positives: ', n_pos_cn)
    
    # number of CN subjects not identified as positive (true negatives)
    n_neg1_cn = n_cn - n_pos_cn
    print('Stage 1 TRUE negatives: ', n_neg1_cn)

    # number of all flagged HPC-AD subjects
    n_flag = sum(y_pred[lr_decision>0])
    print('Total number of flagged HPC-AD subjects: ', n_flag)

    # number of flagged HPC-AD subjects who are actually AD (true positives)
    y_pred_true = y_ + y_pred
    y_pred_true = y_pred_true==2
    n_flag_ad = sum(y_pred_true[lr_decision>0])
    print('Number of flagged HPC-AD subjects that are TRUE positives: ', n_flag_ad)

    # number of flagged HPC-AD subjects that are actually CN (false positives)
    n_flag_cn = n_flag - n_flag_ad
    print('Number of flagged HPC-AD subjects that are FALSE positives: ', n_flag_cn)

    # number of CN subjects that were not flagged (true negatives)
    n_neg_cn = n_cn - n_flag_cn
    print('Number of true negatives: ', n_neg_cn)
    
    print('#############################')
    print('Stage 1 stats for TARGET vs NON-TARGET')
    print('Precision for AD: ', n_pos_ad/(n_pos_ad + n_pos_cn))
    prec = n_pos_ad/(n_pos_ad + n_pos_cn)
    print('Recall (or sensitivity)  for AD: ', n_pos_ad/n_ad)
    sens = n_pos_ad/n_ad
    print('Specificity: ', n_neg1_cn/n_cn)
    spec = n_neg1_cn/n_cn
    fp = (1-spec)*664
    tp = sens*336
    adj_prec = tp/(tp+fp)
    print('Adjusted precision for 33.6% baseline rate: ', adj_prec)
    print('Accuracy: ', (n_pos_ad + n_neg1_cn)/(n_ad + n_cn))
    acc = (n_pos_ad + n_neg1_cn)/(n_ad + n_cn)

    print('#############################')
    print('Stage 2 stats for TARGET vs NON-TARGET')
    print('Precision for HPC-AD: ', n_flag_ad/n_flag)
    prec_2 = n_flag_ad/n_flag
    print('Recall (or sensitivity) for HPC-AD: ', n_flag_ad/n_ad)
    sens_2 = n_flag_ad/n_ad
    print('Specificity: ', n_neg_cn/n_cn)
    spec_2 = n_neg_cn/n_cn
    fp_2 = (1-spec_2)*664
    tp_2 = sens_2*336
    adj_prec_2 = tp_2/(tp_2 + fp_2)
    print('Adjusted precision for 33.6% baseline rate: ', adj_prec_2)
    print('Accuracy: ', (n_flag_ad + n_neg_cn)/(n_ad + n_cn))
    acc_2 = (n_flag_ad + n_neg_cn)/(n_ad + n_cn)
    
    return sens, spec, prec, acc, sens_2, spec_2, prec_2, acc_2

In [4]:

sns.set(font_scale=2)
sns.set_style("white")
#sns.set_context("paper")
#sns.set_palette("colorblind")
#sns.set_palette("GnBu_d")

#sns.set_palette(sns.cubehelix_palette(n_colors=8))
#sns.set_palette(sns.color_palette("BrBG", 6))

cpal = ["#F0DFB2", "#CFA255", "#995D12", "#B3E2DB", "#58B0A6", "#0D7068"]
sns.set_palette(cpal)

In [5]:

path_results = '/home/angela/Desktop/vcog_paper/gigascience/third_submission/roc/cog/'

Load longitudinal data¶

In [6]:

np.random.seed(1)
#np.random.RandomState(1)

In [7]:

long_data = pd.read_csv('/home/angela/Documents/adni_csv/adnimerge_upenn_unw_av45_neurobat.csv')

/home/angela/anaconda3/envs/vcog_paper_py35/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2785: DtypeWarning: Columns (101) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

Load data for the training set (ADNI1)¶

In [8]:

data = pd.read_csv('/home/angela/Desktop/vcog_paper/adni1_vbm_adcn_subtypes_20171209/7clus/adni1_model_weights.csv')

In [9]:

data.dropna(subset=['sub1','age_scan','gender','mean_gm','tiv', 'ADAS13','ADNI_MEM','ADNI_EF','BNTTOTAL','CLOCKSCOR'],
            inplace=True)

In [10]:

data = data[['RID','age_scan','gender','mean_gm','tiv',
             'ADAS13','ADNI_MEM','ADNI_EF','BNTTOTAL','CLOCKSCOR',
             'sub1','sub2','sub3','sub4','sub5','sub6','sub7',
             'ABETA','TAU','conv_2_ad','AD','MCI','CN','APOE4_bin','DX']]

Organize the AD & CN data for classification task¶

In [11]:

# Mask of the CN and AD subjects only
mask_cnad = data.loc[:,['CN','AD']].values.sum(1).astype(bool)

In [12]:

#reload(high_confidence)
scalerX = preprocessing.StandardScaler()
scaler = preprocessing.StandardScaler()

x_ = data.iloc[mask_cnad,data.columns.get_loc("ADAS13"):data.columns.get_loc("CLOCKSCOR")+1].values
#x_ = scalerX.fit_transform(x_)
y_ = data[['AD']].values.ravel()[mask_cnad]

confounds = data[['gender','age_scan','mean_gm','tiv']].values[mask_cnad,:]
#confounds = data[['sex','age_r']].values[mask_cnad,:]
#confounds[:, 1:] = scaler.fit_transform(confounds[:, 1:])
#confounds[:, 0] = preprocessing.binarize(confounds[:, 0].reshape(-1, 1), threshold=1)[:, 0]

#crm = prediction.ConfoundsRm(confounds, x_)
#x_ = crm.transform(confounds, x_)

x_ = scaler.fit_transform(np.hstack((x_,confounds)))
#x_ = np.hstack((x_,confounds))


x_.shape, y_.shape, confounds.shape

Out[12]:

((370, 9), (370,), (370, 4))

Cross-validation HPC in ADNI1 AD & CN¶

In [13]:

scores_ad_cn=[]
scores_s2 = []
ad_precision = []
cn_precision = []
ad_recall = []
cn_recall = []
ad_f1_score = []
cn_f1_score = []

s1_spec = []
s1_sens = []
s1_prec = []
s1_acc = []

s2_spec = []
s2_sens = []
s2_prec = []
s2_acc = []


skf = StratifiedKFold(n_splits=3)
for train_index, val_index in skf.split(x_,y_):
    X_training, X_val = x_[train_index], x_[val_index]
    y_training, y_val = y_[train_index], y_[val_index]
    
    hpc = high_confidence_at.TwoStagesPrediction(
        n_iter=500,
        shuffle_test_split=0.5,
        min_gamma=.99,
        thresh_ratio=0.1)
    
    hpc.fit(X_training, X_training, y_training)
    
    _, dic_results = hpc.predict(X_val, X_val)
    
    # test in validation sample
    acc = metrics.accuracy_score(y_val, (dic_results['s1df'][:,0]>0).astype(float))
    tmp_mask = (dic_results['s2df'][:,1]>0)
    acc_s2 = metrics.accuracy_score(y_val[tmp_mask], (dic_results['s1df'][:,0]>0).astype(float)[tmp_mask])
    scores_ad_cn.append(acc)
    scores_s2.append(acc_s2)
    print('Classifying AD vs CN...')
    print((dic_results['s1df'][:,0]>0).astype(float))
    
    y_pred = (dic_results['s1df'][:,0]>0).astype(float)
    lr_decision = dic_results['s2df'][:,1]
    
    # BASE SVM PERFORMANCE
    ad_p = metrics.precision_score(y_val, y_pred)
    ad_precision.append(ad_p)
    cn_p = metrics.precision_score(y_val, y_pred, pos_label=0)
    cn_precision.append(cn_p)
    ad_r = metrics.recall_score(y_val, y_pred)
    ad_recall.append(ad_r)
    cn_r = metrics.recall_score(y_val, y_pred, pos_label=0)
    cn_recall.append(cn_r)
    ad_f1 = metrics.f1_score(y_val, y_pred)
    ad_f1_score.append(ad_f1)
    cn_f1 = metrics.f1_score(y_val, y_pred, pos_label=0)
    cn_f1_score.append(cn_f1)
    
    
    sens, spec, prec, acc, sens_2, spec_2, prec_2, acc_2 = predic_stats(y_val, y_pred, lr_decision)
    s1_spec.append(spec)
    s1_sens.append(sens)
    s1_prec.append(prec)
    s1_acc.append(acc)
    s2_spec.append(spec_2)
    s2_sens.append(sens_2)
    s2_prec.append(prec_2)
    s2_acc.append(acc_2)

Stage 1
Proba:
[1.         1.         1.         1.         1.         0.98755187
 1.         1.         1.         1.         0.87649402 1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 0.99595142 1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         0.9958159  1.
 0.98387097 1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         0.84063745
 1.         1.         0.06995885 1.         1.         1.
 1.         1.         0.99242424 0.26848249 1.         1.
 1.         0.93562232 0.93951613 1.         0.06934307 1.
 0.61666667 0.30379747 1.         1.         1.         1.
 1.         1.         0.8458498  1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         0.91304348 1.         1.         1.
 1.         1.         0.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         0.95801527 1.         1.
 0.50420168 1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 0.98418972 1.         1.         1.         0.67916667 1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         0.98275862 1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         0.99615385 1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.        ]
Average hm score 0.9065040650406504
Stage 2
Adjusted gamma:  1.0
Adjusted gamma:  1.0
Classifying AD vs CN...
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 1. 1. 0. 1. 1. 0. 1. 0. 0. 1. 1. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0.
 0. 1. 0. 1. 1. 1. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 1. 1. 0.
 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0.
 1. 0. 1. 1. 1. 1. 0. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 0. 1. 1.]
Total number of TARGET subjects:  55.0
Total number of NON-TARGET subjects:  69.0
Stage 1 number of hits (true and false positives):  53.0
Stage 1 TRUE positives:  52.0
Stage 1 FALSE positives:  1.0
Stage 1 TRUE negatives:  68.0
Total number of flagged HPC-AD subjects:  48.0
Number of flagged HPC-AD subjects that are TRUE positives:  48
Number of flagged HPC-AD subjects that are FALSE positives:  0.0
Number of true negatives:  69.0
#############################
Stage 1 stats for TARGET vs NON-TARGET
Precision for AD:  0.9811320754716981
Recall (or sensitivity)  for AD:  0.9454545454545454
Specificity:  0.9855072463768116
Adjusted precision for 33.6% baseline rate:  0.9705978964453406
Accuracy:  0.967741935483871
#############################
Stage 2 stats for TARGET vs NON-TARGET
Precision for HPC-AD:  1.0
Recall (or sensitivity) for HPC-AD:  0.8727272727272727
Specificity:  1.0
Adjusted precision for 33.6% baseline rate:  1.0
Accuracy:  0.9435483870967742
Stage 1
Proba:
[0.47808765 1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         0.99173554 1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 0.99595142 1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         0.07630522 1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         0.9922179  1.
 1.         1.         1.         0.66666667 1.         1.
 1.         0.98015873 1.         1.         1.         1.
 1.         1.         0.27667984 1.         1.         0.76226415
 0.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         0.8
 1.         0.02521008 1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         0.         1.         1.         1.         1.
 1.         0.34008097 1.         1.         0.22510823 1.
 1.         0.1902834  0.83921569 1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         0.70881226 1.         1.
 1.         1.         1.         1.         1.         1.
 1.         0.99588477 1.         1.         1.         0.66945607
 1.         1.         1.         1.         1.         0.9916318
 1.         0.99570815 1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         0.90041494 1.         1.         1.
 1.         1.         1.         1.         1.         1.
 0.9561753  1.         1.         1.         1.         1.
 1.         1.         1.         1.         0.80237154 1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.        ]
Average hm score 0.8987854251012146
Stage 2
Adjusted gamma:  1.0
Adjusted gamma:  1.0
Classifying AD vs CN...
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0.
 0. 0. 1. 0. 1. 0. 1. 1. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1.
 1. 0. 1. 0. 1. 0. 1. 1. 0. 1. 0. 1. 0. 1. 0. 0. 1. 1. 1. 1. 1. 1. 0. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1.
 1. 1. 1.]
Total number of TARGET subjects:  55.0
Total number of NON-TARGET subjects:  68.0
Stage 1 number of hits (true and false positives):  52.0
Stage 1 TRUE positives:  52.0
Stage 1 FALSE positives:  0.0
Stage 1 TRUE negatives:  68.0
Total number of flagged HPC-AD subjects:  45.0
Number of flagged HPC-AD subjects that are TRUE positives:  45
Number of flagged HPC-AD subjects that are FALSE positives:  0.0
Number of true negatives:  68.0
#############################
Stage 1 stats for TARGET vs NON-TARGET
Precision for AD:  1.0
Recall (or sensitivity)  for AD:  0.9454545454545454
Specificity:  1.0
Adjusted precision for 33.6% baseline rate:  1.0
Accuracy:  0.975609756097561
#############################
Stage 2 stats for TARGET vs NON-TARGET
Precision for HPC-AD:  1.0
Recall (or sensitivity) for HPC-AD:  0.8181818181818182
Specificity:  1.0
Adjusted precision for 33.6% baseline rate:  1.0
Accuracy:  0.9186991869918699
Stage 1
Proba:
[0.54581673 1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 0.91696751 0.97107438 0.99578059 1.         1.         1.
 1.         1.         1.         1.         1.         1.
 0.99190283 1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         0.76893939 1.         0.65863454 1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         0.99610895 1.
 1.         1.         1.         0.61382114 1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         0.24505929 1.         1.         0.23018868
 0.044      0.97142857 1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         0.9245283  1.         0.81512605 1.         1.
 1.         1.         1.         1.         0.99264706 1.
 1.         1.         1.         1.         1.         0.10196078
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         0.99579832 1.         1.         1.
 1.         0.99095023 0.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         0.99137931 1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         0.99601594 1.
 1.         1.         0.99166667 1.         0.98367347 1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         0.48790323 1.         1.         1.
 1.         1.         1.         1.         1.         1.
 0.675      1.         1.         1.         1.         1.
 0.98076923 1.         1.         0.         1.         1.
 1.         1.         0.925      1.         1.         1.
 1.        ]
Average hm score 0.8825910931174089
Stage 2
Adjusted gamma:  1.0
Adjusted gamma:  1.0
Classifying AD vs CN...
[0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 0. 1.
 1. 1. 0. 1. 0. 1. 0. 0. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1.
 0. 0. 0. 0. 1. 0. 0. 1. 1. 1. 0. 0. 1. 0. 0. 0. 1. 1. 0. 1. 0. 1. 1. 0.
 0. 0. 1. 1. 0. 1. 1. 0. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1.]
Total number of TARGET subjects:  55.0
Total number of NON-TARGET subjects:  68.0
Stage 1 number of hits (true and false positives):  58.0
Stage 1 TRUE positives:  55.0
Stage 1 FALSE positives:  3.0
Stage 1 TRUE negatives:  65.0
Total number of flagged HPC-AD subjects:  51.0
Number of flagged HPC-AD subjects that are TRUE positives:  51
Number of flagged HPC-AD subjects that are FALSE positives:  0.0
Number of true negatives:  68.0
#############################
Stage 1 stats for TARGET vs NON-TARGET
Precision for AD:  0.9482758620689655
Recall (or sensitivity)  for AD:  1.0
Specificity:  0.9558823529411765
Adjusted precision for 33.6% baseline rate:  0.9198067632850243
Accuracy:  0.975609756097561
#############################
Stage 2 stats for TARGET vs NON-TARGET
Precision for HPC-AD:  1.0
Recall (or sensitivity) for HPC-AD:  0.9272727272727272
Specificity:  1.0
Adjusted precision for 33.6% baseline rate:  1.0
Accuracy:  0.967479674796748

In [14]:

print('Stage 1')
print('Mean sensitivity: ', np.mean(s1_sens))
print('Mean specificity: ', np.mean(s1_spec))
print('Mean precision: ', np.mean(s1_prec))
print('Mean accuracy: ', np.mean(s1_acc))
print('#'*10)
print('Stage 2')
print('Mean sensitivity: ', np.mean(s2_sens))
print('Mean specificity: ', np.mean(s2_spec))
print('Mean precision: ', np.mean(s2_prec))
print('Mean accuracy: ', np.mean(s2_acc))

Stage 1
Mean sensitivity:  0.9636363636363635
Mean specificity:  0.9804631997726627
Mean precision:  0.9764693125135545
Mean accuracy:  0.972987149226331
##########
Stage 2
Mean sensitivity:  0.8727272727272727
Mean specificity:  1.0
Mean precision:  1.0
Mean accuracy:  0.9432424162951308

Train HPC (on whole training set of ADNI1 AD & CN)¶

In [15]:

#reload(high_confidence)
hpc = high_confidence_at.TwoStagesPrediction(
    n_iter=500,
    shuffle_test_split=0.5,
    min_gamma=.99,
    thresh_ratio=0.1)

hpc.fit(x_, x_, y_)

Stage 1
Proba:
[0.70416667 1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 0.9766537  0.95454545 1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         0.97165992 1.         0.36       1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         0.47983871 1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         0.63052209 1.         1.         0.12840467
 0.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         0.97233202 1.         0.68379447 1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         0.58364312
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         0.99578059 1.         1.
 1.         1.         0.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         0.97647059 1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         0.49367089 1.         1.
 0.29365079 1.         1.         1.         1.         1.
 1.         0.1042471  1.         1.         1.         1.
 1.         1.         0.01626016 1.         0.82170543 0.61316872
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 0.95256917 1.         1.         1.         1.         1.
 0.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         0.96124031 1.         1.         0.92765957 1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         0.94488189 1.         1.         1.
 1.         1.         0.99111111 1.         0.99601594 1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         0.99607843
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         0.99606299 1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.        ]
Average hm score 0.918918918918919
Stage 2
Adjusted gamma:  1.0
Adjusted gamma:  1.0

In [16]:

array_results, dic_results = hpc.predict(x_, x_)

# Level 1
print('Level 1')
stats_mask(y_, (dic_results['s1df'][:,0]>0).astype(float))

print('Level 2')
stats_mask(y_, (dic_results['s1df'][:,0]>0).astype(float), dic_results['s2df'][:,1]>0)
#stats_mask(dic_results['s2df'][:,2]>0)
#stats_mask(dic_results['s2df'][:,3]>0)

Level 1
------------------------
Ratio: 1.0
#    :  165.0
# true values:  370
ACC  :  0.9756756756756757
Level 2
------------------------
Ratio: 0.9090909090909091
#    :  150.0
# true values:  150
ACC  :  1.0

In [17]:

y_pred = (dic_results['s1df'][:,0]>0).astype(float)
lr_decision = dic_results['s2df'][:,1]

In [18]:

print('Stage 1 stats for AD vs CN')
print(metrics.classification_report(y_, y_pred))

Stage 1 stats for AD vs CN
             precision    recall  f1-score   support

        0.0       0.98      0.98      0.98       205
        1.0       0.98      0.97      0.97       165

avg / total       0.98      0.98      0.98       370

In [19]:

predic_stats(y_, y_pred, lr_decision)

Total number of TARGET subjects:  165.0
Total number of NON-TARGET subjects:  205.0
Stage 1 number of hits (true and false positives):  164.0
Stage 1 TRUE positives:  160.0
Stage 1 FALSE positives:  4.0
Stage 1 TRUE negatives:  201.0
Total number of flagged HPC-AD subjects:  150.0
Number of flagged HPC-AD subjects that are TRUE positives:  150
Number of flagged HPC-AD subjects that are FALSE positives:  0.0
Number of true negatives:  205.0
#############################
Stage 1 stats for TARGET vs NON-TARGET
Precision for AD:  0.975609756097561
Recall (or sensitivity)  for AD:  0.9696969696969697
Specificity:  0.9804878048780488
Adjusted precision for 33.6% baseline rate:  0.9617559586143343
Accuracy:  0.9756756756756757
#############################
Stage 2 stats for TARGET vs NON-TARGET
Precision for HPC-AD:  1.0
Recall (or sensitivity) for HPC-AD:  0.9090909090909091
Specificity:  1.0
Adjusted precision for 33.6% baseline rate:  1.0
Accuracy:  0.9594594594594594

Out[19]:

(0.9696969696969697,
 0.9804878048780488,
 0.975609756097561,
 0.9756756756756757,
 0.9090909090909091,
 1.0,
 1.0,
 0.9594594594594594)

compare with other methods¶

ROC curve¶

base¶

In [20]:

base = high_confidence_at.BaseSvc()
base.fit(x_, y_)
y_predicted = base.predict(x_)
y_score = base.decision_function(x_)
y_score.shape

Out[20]:

(370,)

In [21]:

y_true = y_.astype(int)
y_true = label_binarize(y_true, classes=[0, 1])
n_classes = y_true.shape[1]
y_score = np.reshape(y_score, (y_score.shape[0],1))

# Compute ROC curve and ROC area for each class
fpr_b = dict()
tpr_b = dict()
roc_auc_b = dict()
for i in range(n_classes):
    fpr_b[i], tpr_b[i], _ = roc_curve(y_true[:, i], y_score[:, i])
    roc_auc_b[i] = auc(fpr_b[i], tpr_b[i])
    
# Compute micro-average ROC curve and ROC area
fpr_b["micro"], tpr_b["micro"], _ = roc_curve(y_true.ravel(), y_score.ravel())
roc_auc_b["micro"] = auc(fpr_b["micro"], tpr_b["micro"])

In [22]:

average_precision_b = average_precision_score(y_true, y_score)

print('Average precision-recall score: {0:0.2f}'.format(
      average_precision_b))

precision_b, recall_b, _ = precision_recall_curve(y_true, y_score)

Average precision-recall score: 1.00

HPS¶

In [23]:

y_true = y_.astype(int)
y_true = label_binarize(y_, classes=[0, 1])
n_classes = y_true.shape[1]
y_score = np.reshape(lr_decision, (y_score.shape[0],1))

In [24]:

# Compute ROC curve and ROC area for each class
fpr_h = dict()
tpr_h = dict()
roc_auc_h = dict()
for i in range(n_classes):
    fpr_h[i], tpr_h[i], _ = roc_curve(y_true[:, i], y_score[:, i])
    roc_auc_h[i] = auc(fpr_h[i], tpr_h[i])

# Compute micro-average ROC curve and ROC area
fpr_h["micro"], tpr_h["micro"], _ = roc_curve(y_true.ravel(), y_score.ravel())
roc_auc_h["micro"] = auc(fpr_h["micro"], tpr_h["micro"])

In [25]:

average_precision_h = average_precision_score(y_true, y_score)

print('Average precision-recall score: {0:0.2f}'.format(
      average_precision_h))

precision_h, recall_h, _ = precision_recall_curve(y_true, y_score)

Average precision-recall score: 1.00

rbf kernel svm¶

In [26]:

svm_param_grid=dict(C=(np.logspace(-2, 1, 15)))

clf_svm = SVC(kernel='rbf', class_weight='balanced', decision_function_shape='ovr', random_state=1)
grclf_svm = GridSearchCV(clf_svm, param_grid=svm_param_grid, 
                         cv=StratifiedShuffleSplit(n_splits=50, test_size=.2, random_state=1))               

grclf_svm.fit(x_, y_)
y_predicted = grclf_svm.predict(x_)
y_score = grclf_svm.decision_function(x_)
y_score.shape

Out[26]:

(370,)

In [27]:

y_true = y_.astype(int)
y_true = label_binarize(y_true, classes=[0, 1])
n_classes = y_true.shape[1]
y_score = np.reshape(y_score, (y_score.shape[0],1))

# Compute ROC curve and ROC area for each class
fpr_svcrbf = dict()
tpr_svcrbf = dict()
roc_auc_svcrbf = dict()
for i in range(n_classes):
    fpr_svcrbf[i], tpr_svcrbf[i], _ = roc_curve(y_true[:, i], y_score[:, i])
    roc_auc_svcrbf[i] = auc(fpr_svcrbf[i], tpr_svcrbf[i])
    
# Compute micro-average ROC curve and ROC area
fpr_svcrbf["micro"], tpr_svcrbf["micro"], _ = roc_curve(y_true.ravel(), y_score.ravel())
roc_auc_svcrbf["micro"] = auc(fpr_svcrbf["micro"], tpr_svcrbf["micro"])

k nearest neighbours¶

In [28]:

k_range = list(range(3,7))
weight_opt = ["uniform", "distance"]
knn_param_grid = dict(n_neighbors = k_range, weights = weight_opt)

clf_knn = KNeighborsClassifier(algorithm='auto')
grclf_knn = GridSearchCV(clf_knn, param_grid=knn_param_grid, 
                         cv=StratifiedShuffleSplit(n_splits=50, test_size=.2, random_state=1))
grclf_knn.fit(x_, y_)
y_predicted = grclf_knn.predict(x_)
#y_score = clf.decision_function(x_)
y_score = grclf_knn.predict_proba(x_)
y_score = y_score[:,1] # take positive class
y_score.shape

Out[28]:

(370,)

In [29]:

y_true = y_.astype(int)
y_true = label_binarize(y_true, classes=[0, 1])
n_classes = y_true.shape[1]
y_score = np.reshape(y_score, (y_score.shape[0],1))

# Compute ROC curve and ROC area for each class
fpr_knn = dict()
tpr_knn = dict()
roc_auc_knn = dict()
for i in range(n_classes):
    fpr_knn[i], tpr_knn[i], _ = roc_curve(y_true[:, i], y_score[:, i])
    roc_auc_knn[i] = auc(fpr_knn[i], tpr_knn[i])
    
# Compute micro-average ROC curve and ROC area
fpr_knn["micro"], tpr_knn["micro"], _ = roc_curve(y_true.ravel(), y_score.ravel())
roc_auc_knn["micro"] = auc(fpr_knn["micro"], tpr_knn["micro"])

random forest¶

In [30]:

n_features = x_.shape[1]

In [31]:

rf_param_grid = [
{'n_estimators': [10, 25], 'max_features': [5, n_features], 
 'max_depth': [10, 50, None], 'bootstrap': [True, False]}
]

clf_rf = RandomForestClassifier(random_state=1)
grclf_rf = GridSearchCV(clf_rf, param_grid=rf_param_grid, 
                       cv=StratifiedShuffleSplit(n_splits=50, test_size=.2, random_state=1))
grclf_rf.fit(x_, y_)
y_predicted = grclf_rf.predict(x_)
#y_score = clf.decision_function(x_)
y_score = grclf_rf.predict_proba(x_)
y_score = y_score[:,1] # take positive class
y_score.shape

Out[31]:

(370,)

In [32]:

y_true = y_.astype(int)
y_true = label_binarize(y_true, classes=[0, 1])
n_classes = y_true.shape[1]
y_score = np.reshape(y_score, (y_score.shape[0],1))

# Compute ROC curve and ROC area for each class
fpr_rf = dict()
tpr_rf = dict()
roc_auc_rf = dict()
for i in range(n_classes):
    fpr_rf[i], tpr_rf[i], _ = roc_curve(y_true[:, i], y_score[:, i])
    roc_auc_rf[i] = auc(fpr_rf[i], tpr_rf[i])
    
# Compute micro-average ROC curve and ROC area
fpr_rf["micro"], tpr_rf["micro"], _ = roc_curve(y_true.ravel(), y_score.ravel())
roc_auc_rf["micro"] = auc(fpr_rf["micro"], tpr_rf["micro"])

gaussian naive bayes¶

In [33]:

clf_gnb = GaussianNB()
clf_gnb.fit(x_, y_)
y_predicted = clf_gnb.predict(x_)
#y_score = clf.decision_function(x_)
y_score = clf_gnb.predict_proba(x_)
y_score = y_score[:,1] # take positive class
y_score.shape

Out[33]:

(370,)

In [34]:

y_true = y_.astype(int)
y_true = label_binarize(y_true, classes=[0, 1])
n_classes = y_true.shape[1]
y_score = np.reshape(y_score, (y_score.shape[0],1))

# Compute ROC curve and ROC area for each class
fpr_nb = dict()
tpr_nb = dict()
roc_auc_nb = dict()
for i in range(n_classes):
    fpr_nb[i], tpr_nb[i], _ = roc_curve(y_true[:, i], y_score[:, i])
    roc_auc_nb[i] = auc(fpr_nb[i], tpr_nb[i])
    
# Compute micro-average ROC curve and ROC area
fpr_nb["micro"], tpr_nb["micro"], _ = roc_curve(y_true.ravel(), y_score.ravel())
roc_auc_nb["micro"] = auc(fpr_nb["micro"], tpr_nb["micro"])

In [35]:

#plt.figure()
fig, ax = plt.subplots()
fig.set_size_inches(10,8)
lw = 4
plt.rc('xtick', labelsize=40)
plt.rc('ytick', labelsize=40)
plt.plot(fpr_svcrbf[0], tpr_svcrbf[0], color='green',
         lw=lw, label='RBF SVM (AUC=%0.3f)' % roc_auc_svcrbf[0])
plt.plot(fpr_knn[0], tpr_knn[0], color='pink',
         lw=lw, label='KNN (AUC=%0.3f)' % roc_auc_knn[0])
plt.plot(fpr_rf[0], tpr_rf[0], color='brown',
         lw=lw, label='RF (AUC=%0.3f)' % roc_auc_rf[0])
plt.plot(fpr_nb[0], tpr_nb[0], color='orange',
         lw=lw, label='GNB (AUC=%0.3f)' % roc_auc_nb[0])
plt.plot(fpr_b[0], tpr_b[0], color='blue',
         lw=lw, label='Base (AUC=%0.3f)' % roc_auc_b[0])
plt.plot(fpr_h[0], tpr_h[0], color='red',
         lw=lw, label='HPS (AUC=%0.3f)' % roc_auc_h[0])
plt.plot([0, 1], [0, 1], color='grey', lw=lw, linestyle='--')
plt.xlim([-0.05, 1.00])
plt.ylim([0.0, 1.05])
plt.xlabel('FPR', fontdict={'size': 40})
plt.ylabel('TPR', fontdict={'size': 40})
plt.title('ADNI1 AD vs CN', fontdict={'size': 40})
plt.legend(loc="lower right", prop={'size': 25})
plt.show()
fig.savefig(path_results + 'adni1_ad_roc_multi.pdf', bbox_inches='tight')

Test on ADNI1 MCI¶

In [36]:

# load the data
adni1_mci = pd.read_csv('/home/angela/Desktop/vcog_paper/adni1_vbm_adcn_subtypes_20171209/7clus/adni1_mci_bl_demog_weights.csv')

In [37]:

mask_mci = adni1_mci.loc[:,'MCI'].values.astype(bool)
adni1_mci = adni1_mci.iloc[mask_mci]

In [38]:

adni1_mci.dropna(subset=['sub1','age_scan','gender','mean_gm','tiv',
                         'ADAS13','ADNI_MEM','ADNI_EF','BNTTOTAL','CLOCKSCOR'],inplace=True)

In [39]:

adni1_mci = adni1_mci[['RID','age_scan','gender','mean_gm','tiv',
             'ADAS13','ADNI_MEM','ADNI_EF','BNTTOTAL','CLOCKSCOR',
             'sub1','sub2','sub3','sub4','sub5','sub6','sub7',
             'ABETA','TAU','conv_2_ad','AD','MCI','CN','APOE4_bin','DX','Month_conv']]

In [40]:

len(adni1_mci)

Out[40]:

In [41]:

x_ = adni1_mci.iloc[:,adni1_mci.columns.get_loc("ADAS13"):adni1_mci.columns.get_loc("CLOCKSCOR")+1].values
#x_ = scalerX.transform(x_)
y_ = adni1_mci['conv_2_ad'].values.ravel()


confounds = adni1_mci[['gender','age_scan','mean_gm','tiv']].values
#confounds = data[['sex','age_r']].values[mask_mci,:]
#confounds[:, 1:] = scaler.transform(confounds[:, 1:])
#confounds[:, 0] = preprocessing.binarize(confounds[:, 0].reshape(-1, 1), threshold=1)[:, 0]
#confounds = scaler.transform(confounds)
#x_ = crm.transform(confounds, x_)

x_ = scaler.transform(np.hstack((x_,confounds)))

x_.shape, y_.shape, confounds.shape

Out[41]:

((235, 9), (235,), (235, 4))

In [42]:

array_results, dic_results = hpc.predict(x_, x_)

# Level 1
print('Level 1')
stats_mask(y_, (dic_results['s1df'][:,0]>0).astype(float))

print('Level 2')
stats_mask(y_, (dic_results['s1df'][:,0]>0).astype(float), dic_results['s2df'][:,1]>0)
#stats_mask(dic_results['s2df'][:,2]>0)
#stats_mask(dic_results['s2df'][:,3]>0)

Level 1
------------------------
Ratio: 1.0
#    :  147.0
# true values:  235
ACC  :  0.7617021276595745
Level 2
------------------------
Ratio: 0.6462585034013606
#    :  95.0
# true values:  106
ACC  :  0.8962264150943396

In [43]:

y_pred = (dic_results['s1df'][:,0]>0).astype(float)
lr_decision = dic_results['s2df'][:,1]

In [44]:

predic_stats(y_, y_pred, lr_decision)

Total number of TARGET subjects:  147.0
Total number of NON-TARGET subjects:  88.0
Stage 1 number of hits (true and false positives):  143.0
Stage 1 TRUE positives:  117.0
Stage 1 FALSE positives:  26.0
Stage 1 TRUE negatives:  62.0
Total number of flagged HPC-AD subjects:  106.0
Number of flagged HPC-AD subjects that are TRUE positives:  95
Number of flagged HPC-AD subjects that are FALSE positives:  11.0
Number of true negatives:  77.0
#############################
Stage 1 stats for TARGET vs NON-TARGET
Precision for AD:  0.8181818181818182
Recall (or sensitivity)  for AD:  0.7959183673469388
Specificity:  0.7045454545454546
Adjusted precision for 33.6% baseline rate:  0.5768390386016025
Accuracy:  0.7617021276595745
#############################
Stage 2 stats for TARGET vs NON-TARGET
Precision for HPC-AD:  0.8962264150943396
Recall (or sensitivity) for HPC-AD:  0.6462585034013606
Specificity:  0.875
Adjusted precision for 33.6% baseline rate:  0.723465016658734
Accuracy:  0.7319148936170212

Out[44]:

(0.7959183673469388,
 0.7045454545454546,
 0.8181818181818182,
 0.7617021276595745,
 0.6462585034013606,
 0.875,
 0.8962264150943396,
 0.7319148936170212)

ROC curve¶

base¶

In [45]:

#base = high_confidence_at.BaseSvc()
#base.fit(x_, y_)
y_predicted = base.predict(x_)
y_score = base.decision_function(x_)
y_score.shape

Out[45]:

(235,)

In [46]:

y_true = y_.astype(int)
y_true = label_binarize(y_true, classes=[0, 1])
n_classes = y_true.shape[1]
y_score = np.reshape(y_score, (y_score.shape[0],1))

# Compute ROC curve and ROC area for each class
fpr_b = dict()
tpr_b = dict()
roc_auc_b = dict()
for i in range(n_classes):
    fpr_b[i], tpr_b[i], _ = roc_curve(y_true[:, i], y_score[:, i])
    roc_auc_b[i] = auc(fpr_b[i], tpr_b[i])
    
# Compute micro-average ROC curve and ROC area
fpr_b["micro"], tpr_b["micro"], _ = roc_curve(y_true.ravel(), y_score.ravel())
roc_auc_b["micro"] = auc(fpr_b["micro"], tpr_b["micro"])

In [47]:

average_precision_b = average_precision_score(y_true, y_score)

print('Average precision-recall score: {0:0.2f}'.format(
      average_precision_b))

precision_b, recall_b, _ = precision_recall_curve(y_true, y_score)

Average precision-recall score: 0.89

HPS¶

In [48]:

y_true = y_.astype(int)
y_true = label_binarize(y_, classes=[0, 1])
n_classes = y_true.shape[1]
y_score = np.reshape(lr_decision, (y_score.shape[0],1))

In [49]:

# Compute ROC curve and ROC area for each class
fpr_h = dict()
tpr_h = dict()
roc_auc_h = dict()
for i in range(n_classes):
    fpr_h[i], tpr_h[i], _ = roc_curve(y_true[:, i], y_score[:, i])
    roc_auc_h[i] = auc(fpr_h[i], tpr_h[i])

# Compute micro-average ROC curve and ROC area
fpr_h["micro"], tpr_h["micro"], _ = roc_curve(y_true.ravel(), y_score.ravel())
roc_auc_h["micro"] = auc(fpr_h["micro"], tpr_h["micro"])

rbf kernel svm¶

In [50]:

y_predicted = grclf_svm.predict(x_)
y_score = grclf_svm.decision_function(x_)
y_score.shape

Out[50]:

(235,)

In [51]:

y_true = y_.astype(int)
y_true = label_binarize(y_true, classes=[0, 1])
n_classes = y_true.shape[1]
y_score = np.reshape(y_score, (y_score.shape[0],1))

# Compute ROC curve and ROC area for each class
fpr_svcrbf = dict()
tpr_svcrbf = dict()
roc_auc_svcrbf = dict()
for i in range(n_classes):
    fpr_svcrbf[i], tpr_svcrbf[i], _ = roc_curve(y_true[:, i], y_score[:, i])
    roc_auc_svcrbf[i] = auc(fpr_svcrbf[i], tpr_svcrbf[i])
    
# Compute micro-average ROC curve and ROC area
fpr_svcrbf["micro"], tpr_svcrbf["micro"], _ = roc_curve(y_true.ravel(), y_score.ravel())
roc_auc_svcrbf["micro"] = auc(fpr_svcrbf["micro"], tpr_svcrbf["micro"])

k nearest neighbours¶

In [52]:

y_predicted = grclf_knn.predict(x_)
y_score = grclf_knn.predict_proba(x_)
y_score = y_score[:,1] # take positive class
y_score.shape

Out[52]:

(235,)

In [53]:

y_true = y_.astype(int)
y_true = label_binarize(y_true, classes=[0, 1])
n_classes = y_true.shape[1]
y_score = np.reshape(y_score, (y_score.shape[0],1))

# Compute ROC curve and ROC area for each class
fpr_knn = dict()
tpr_knn = dict()
roc_auc_knn = dict()
for i in range(n_classes):
    fpr_knn[i], tpr_knn[i], _ = roc_curve(y_true[:, i], y_score[:, i])
    roc_auc_knn[i] = auc(fpr_knn[i], tpr_knn[i])
    
# Compute micro-average ROC curve and ROC area
fpr_knn["micro"], tpr_knn["micro"], _ = roc_curve(y_true.ravel(), y_score.ravel())
roc_auc_knn["micro"] = auc(fpr_knn["micro"], tpr_knn["micro"])

random forest¶

In [54]:

y_predicted = grclf_rf.predict(x_)
y_score = grclf_rf.predict_proba(x_)
y_score = y_score[:,1] # take positive class
y_score.shape

Out[54]:

(235,)

In [55]:

y_true = y_.astype(int)
y_true = label_binarize(y_true, classes=[0, 1])
n_classes = y_true.shape[1]
y_score = np.reshape(y_score, (y_score.shape[0],1))

# Compute ROC curve and ROC area for each class
fpr_rf = dict()
tpr_rf = dict()
roc_auc_rf = dict()
for i in range(n_classes):
    fpr_rf[i], tpr_rf[i], _ = roc_curve(y_true[:, i], y_score[:, i])
    roc_auc_rf[i] = auc(fpr_rf[i], tpr_rf[i])
    
# Compute micro-average ROC curve and ROC area
fpr_rf["micro"], tpr_rf["micro"], _ = roc_curve(y_true.ravel(), y_score.ravel())
roc_auc_rf["micro"] = auc(fpr_rf["micro"], tpr_rf["micro"])

gaussian naive bayes¶

In [56]:

y_predicted = clf_gnb.predict(x_)
y_score = clf_gnb.predict_proba(x_)
y_score = y_score[:,1] # take positive class
y_score.shape

Out[56]:

(235,)

In [57]:

y_true = y_.astype(int)
y_true = label_binarize(y_true, classes=[0, 1])
n_classes = y_true.shape[1]
y_score = np.reshape(y_score, (y_score.shape[0],1))

# Compute ROC curve and ROC area for each class
fpr_nb = dict()
tpr_nb = dict()
roc_auc_nb = dict()
for i in range(n_classes):
    fpr_nb[i], tpr_nb[i], _ = roc_curve(y_true[:, i], y_score[:, i])
    roc_auc_nb[i] = auc(fpr_nb[i], tpr_nb[i])
    
# Compute micro-average ROC curve and ROC area
fpr_nb["micro"], tpr_nb["micro"], _ = roc_curve(y_true.ravel(), y_score.ravel())
roc_auc_nb["micro"] = auc(fpr_nb["micro"], tpr_nb["micro"])

In [58]:

#plt.figure()
fig, ax = plt.subplots()
fig.set_size_inches(10,8)
lw = 4
plt.rc('xtick', labelsize=40)
plt.rc('ytick', labelsize=40)
plt.plot(fpr_svcrbf[0], tpr_svcrbf[0], color='green',
         lw=lw, label='RBF SVM (AUC=%0.3f)' % roc_auc_svcrbf[0])
plt.plot(fpr_knn[0], tpr_knn[0], color='pink',
         lw=lw, label='KNN (AUC=%0.3f)' % roc_auc_knn[0])
plt.plot(fpr_rf[0], tpr_rf[0], color='brown',
         lw=lw, label='RF (AUC=%0.3f)' % roc_auc_rf[0])
plt.plot(fpr_nb[0], tpr_nb[0], color='orange',
         lw=lw, label='GNB (AUC=%0.3f)' % roc_auc_nb[0])
plt.plot(fpr_b[0], tpr_b[0], color='blue',
         lw=lw, label='Base (AUC=%0.3f)' % roc_auc_b[0])
plt.plot(fpr_h[0], tpr_h[0], color='red',
         lw=lw, label='HPS (AUC=%0.3f)' % roc_auc_h[0])
plt.plot([0, 1], [0, 1], color='grey', lw=lw, linestyle='--')
plt.xlim([-0.05, 1.00])
plt.ylim([0.0, 1.05])
plt.xlabel('FPR', fontdict={'size': 40})
plt.ylabel('TPR', fontdict={'size': 40})
plt.title('ADNI1 pMCI vs sMCI', fontdict={'size': 40})
plt.legend(loc="lower right", prop={'size': 25})
#ax.legend(loc='center left', bbox_to_anchor=(1,0.5), prop={'size': 30})
plt.show()
fig.savefig(path_results + 'adni1_mci_roc_multi.pdf', bbox_inches='tight')

REPLICATION IN ADNI2¶

In [59]:

adni2_df = pd.read_csv('/home/angela/Desktop/vcog_paper/adni1_vbm_adcn_subtypes_20171209/7clus/adni2_model_weights.csv')

In [60]:

adni2_df.drop(adni2_df[adni2_df.RID < 2000].index,inplace=True)

In [61]:

# get rid of NaNs
adni2_df.dropna(axis=0,how='any',subset=['sub1','gender','age_scan','mean_gm','tiv','conv_2_ad',
                                        'ADAS13','ADNI_MEM','ADNI_EF','BNTTOTAL','CLOCKSCOR'],inplace=True)

In [62]:

adni2_df = adni2_df[['RID','age_scan','gender','mean_gm','tiv',
             'ADAS13','ADNI_MEM','ADNI_EF','BNTTOTAL','CLOCKSCOR',
             'sub1','sub2','sub3','sub4','sub5','sub6','sub7',
             'ABETA','TAU','conv_2_ad','AD','MCI','CN','APOE4_bin','DX']]

AD vs CN¶

In [63]:

# Mask of the AD and CN subjects only
mask_cnad = adni2_df.loc[:,['CN','AD']].values.sum(1).astype(bool)

x_ = adni2_df.iloc[mask_cnad,adni2_df.columns.get_loc("ADAS13"):adni2_df.columns.get_loc("CLOCKSCOR")+1].values
#x_ = scalerX.transform(x_)
y_ = adni2_df[['AD']].values.ravel()[mask_cnad]


confounds = adni2_df[['gender','age_scan','mean_gm','tiv']].values[mask_cnad,:]
#confounds = data[['sex','age_r']].values[mask_mci,:]
#confounds[:, 1:] = scaler.transform(confounds[:, 1:])
#confounds[:, 0] = preprocessing.binarize(confounds[:, 0].reshape(-1, 1), threshold=1)[:, 0]
#confounds = scaler.transform(confounds)
#x_ = crm.transform(confounds, x_)

x_ = scaler.transform(np.hstack((x_,confounds)))

x_.shape, y_.shape, confounds.shape

Out[63]:

((276, 9), (276,), (276, 4))

In [64]:

array_results, dic_results = hpc.predict(x_, x_)

# Level 1
print('Level 1')
stats_mask(y_, (dic_results['s1df'][:,0]>0).astype(float))

print('Level 2')
stats_mask(y_, (dic_results['s1df'][:,0]>0).astype(float), dic_results['s2df'][:,1]>0)
#stats_mask(dic_results['s2df'][:,2]>0)
#stats_mask(dic_results['s2df'][:,3]>0)

Level 1
------------------------
Ratio: 1.0
#    :  88.0
# true values:  276
ACC  :  0.9565217391304348
Level 2
------------------------
Ratio: 0.8863636363636364
#    :  78.0
# true values:  79
ACC  :  0.9873417721518988

In [65]:

y_pred = (dic_results['s1df'][:,0]>0).astype(float)
lr_decision = dic_results['s2df'][:,1]

In [66]:

predic_stats(y_, y_pred, lr_decision)

Total number of TARGET subjects:  88.0
Total number of NON-TARGET subjects:  188.0
Stage 1 number of hits (true and false positives):  90.0
Stage 1 TRUE positives:  83.0
Stage 1 FALSE positives:  7.0
Stage 1 TRUE negatives:  181.0
Total number of flagged HPC-AD subjects:  79.0
Number of flagged HPC-AD subjects that are TRUE positives:  78
Number of flagged HPC-AD subjects that are FALSE positives:  1.0
Number of true negatives:  187.0
#############################
Stage 1 stats for TARGET vs NON-TARGET
Precision for AD:  0.9222222222222223
Recall (or sensitivity)  for AD:  0.9431818181818182
Specificity:  0.9627659574468085
Adjusted precision for 33.6% baseline rate:  0.9276315789473684
Accuracy:  0.9565217391304348
#############################
Stage 2 stats for TARGET vs NON-TARGET
Precision for HPC-AD:  0.9873417721518988
Recall (or sensitivity) for HPC-AD:  0.8863636363636364
Specificity:  0.9946808510638298
Adjusted precision for 33.6% baseline rate:  0.9882796955031514
Accuracy:  0.9601449275362319

Out[66]:

(0.9431818181818182,
 0.9627659574468085,
 0.9222222222222223,
 0.9565217391304348,
 0.8863636363636364,
 0.9946808510638298,
 0.9873417721518988,
 0.9601449275362319)

ROC curve¶

base¶

In [67]:

y_predicted = base.predict(x_)
y_score = base.decision_function(x_)
y_score.shape

Out[67]:

(276,)

In [68]:

y_true = y_.astype(int)
y_true = label_binarize(y_true, classes=[0, 1])
n_classes = y_true.shape[1]
y_score = np.reshape(y_score, (y_score.shape[0],1))

# Compute ROC curve and ROC area for each class
fpr_b = dict()
tpr_b = dict()
roc_auc_b = dict()
for i in range(n_classes):
    fpr_b[i], tpr_b[i], _ = roc_curve(y_true[:, i], y_score[:, i])
    roc_auc_b[i] = auc(fpr_b[i], tpr_b[i])
    
# Compute micro-average ROC curve and ROC area
fpr_b["micro"], tpr_b["micro"], _ = roc_curve(y_true.ravel(), y_score.ravel())
roc_auc_b["micro"] = auc(fpr_b["micro"], tpr_b["micro"])

In [69]:

average_precision_b = average_precision_score(y_true, y_score)

print('Average precision-recall score: {0:0.2f}'.format(
      average_precision_b))

precision_b, recall_b, _ = precision_recall_curve(y_true, y_score)

Average precision-recall score: 0.99

HPS¶

In [70]:

y_true = y_.astype(int)
y_true = label_binarize(y_, classes=[0, 1])
n_classes = y_true.shape[1]
y_score = np.reshape(lr_decision, (y_score.shape[0],1))

In [71]:

# Compute ROC curve and ROC area for each class
fpr_h = dict()
tpr_h = dict()
roc_auc_h = dict()
for i in range(n_classes):
    fpr_h[i], tpr_h[i], _ = roc_curve(y_true[:, i], y_score[:, i])
    roc_auc_h[i] = auc(fpr_h[i], tpr_h[i])

# Compute micro-average ROC curve and ROC area
fpr_h["micro"], tpr_h["micro"], _ = roc_curve(y_true.ravel(), y_score.ravel())
roc_auc_h["micro"] = auc(fpr_h["micro"], tpr_h["micro"])

In [72]:

average_precision_h = average_precision_score(y_true, y_score)

print('Average precision-recall score: {0:0.2f}'.format(
      average_precision_h))

precision_h, recall_h, _ = precision_recall_curve(y_true, y_score)

Average precision-recall score: 0.99

rbf kernel svm¶

In [73]:

y_predicted = grclf_svm.predict(x_)
y_score = grclf_svm.decision_function(x_)
y_score.shape

Out[73]:

(276,)

In [74]:

y_true = y_.astype(int)
y_true = label_binarize(y_true, classes=[0, 1])
n_classes = y_true.shape[1]
y_score = np.reshape(y_score, (y_score.shape[0],1))

# Compute ROC curve and ROC area for each class
fpr_svcrbf = dict()
tpr_svcrbf = dict()
roc_auc_svcrbf = dict()
for i in range(n_classes):
    fpr_svcrbf[i], tpr_svcrbf[i], _ = roc_curve(y_true[:, i], y_score[:, i])
    roc_auc_svcrbf[i] = auc(fpr_svcrbf[i], tpr_svcrbf[i])
    
# Compute micro-average ROC curve and ROC area
fpr_svcrbf["micro"], tpr_svcrbf["micro"], _ = roc_curve(y_true.ravel(), y_score.ravel())
roc_auc_svcrbf["micro"] = auc(fpr_svcrbf["micro"], tpr_svcrbf["micro"])

k nearest neighbours¶

In [75]:

y_predicted = grclf_knn.predict(x_)
y_score = grclf_knn.predict_proba(x_)
y_score = y_score[:,1] # take positive class
y_score.shape

Out[75]:

(276,)

In [76]:

y_true = y_.astype(int)
y_true = label_binarize(y_true, classes=[0, 1])
n_classes = y_true.shape[1]
y_score = np.reshape(y_score, (y_score.shape[0],1))

# Compute ROC curve and ROC area for each class
fpr_knn = dict()
tpr_knn = dict()
roc_auc_knn = dict()
for i in range(n_classes):
    fpr_knn[i], tpr_knn[i], _ = roc_curve(y_true[:, i], y_score[:, i])
    roc_auc_knn[i] = auc(fpr_knn[i], tpr_knn[i])
    
# Compute micro-average ROC curve and ROC area
fpr_knn["micro"], tpr_knn["micro"], _ = roc_curve(y_true.ravel(), y_score.ravel())
roc_auc_knn["micro"] = auc(fpr_knn["micro"], tpr_knn["micro"])

random forest¶

In [77]:

y_predicted = grclf_rf.predict(x_)
y_score = grclf_rf.predict_proba(x_)
y_score = y_score[:,1] # take positive class
y_score.shape

Out[77]:

(276,)

In [78]:

y_true = y_.astype(int)
y_true = label_binarize(y_true, classes=[0, 1])
n_classes = y_true.shape[1]
y_score = np.reshape(y_score, (y_score.shape[0],1))

# Compute ROC curve and ROC area for each class
fpr_rf = dict()
tpr_rf = dict()
roc_auc_rf = dict()
for i in range(n_classes):
    fpr_rf[i], tpr_rf[i], _ = roc_curve(y_true[:, i], y_score[:, i])
    roc_auc_rf[i] = auc(fpr_rf[i], tpr_rf[i])
    
# Compute micro-average ROC curve and ROC area
fpr_rf["micro"], tpr_rf["micro"], _ = roc_curve(y_true.ravel(), y_score.ravel())
roc_auc_rf["micro"] = auc(fpr_rf["micro"], tpr_rf["micro"])

gaussian naive bayes¶

In [79]:

y_predicted = clf_gnb.predict(x_)
y_score = clf_gnb.predict_proba(x_)
y_score = y_score[:,1] # take positive class
y_score.shape

Out[79]:

(276,)

In [80]:

y_true = y_.astype(int)
y_true = label_binarize(y_true, classes=[0, 1])
n_classes = y_true.shape[1]
y_score = np.reshape(y_score, (y_score.shape[0],1))

# Compute ROC curve and ROC area for each class
fpr_nb = dict()
tpr_nb = dict()
roc_auc_nb = dict()
for i in range(n_classes):
    fpr_nb[i], tpr_nb[i], _ = roc_curve(y_true[:, i], y_score[:, i])
    roc_auc_nb[i] = auc(fpr_nb[i], tpr_nb[i])
    
# Compute micro-average ROC curve and ROC area
fpr_nb["micro"], tpr_nb["micro"], _ = roc_curve(y_true.ravel(), y_score.ravel())
roc_auc_nb["micro"] = auc(fpr_nb["micro"], tpr_nb["micro"])

In [81]:

#plt.figure()
fig, ax = plt.subplots()
fig.set_size_inches(10,8)
lw = 4
plt.rc('xtick', labelsize=40)
plt.rc('ytick', labelsize=40)
plt.plot(fpr_svcrbf[0], tpr_svcrbf[0], color='green',
         lw=lw, label='RBF SVM (AUC=%0.3f)' % roc_auc_svcrbf[0])
plt.plot(fpr_knn[0], tpr_knn[0], color='pink',
         lw=lw, label='KNN (AUC=%0.3f)' % roc_auc_knn[0])
plt.plot(fpr_rf[0], tpr_rf[0], color='brown',
         lw=lw, label='RF (AUC=%0.3f)' % roc_auc_rf[0])
plt.plot(fpr_nb[0], tpr_nb[0], color='orange',
         lw=lw, label='GNB (AUC=%0.3f)' % roc_auc_nb[0])
plt.plot(fpr_b[0], tpr_b[0], color='blue',
         lw=lw, label='Base (AUC=%0.3f)' % roc_auc_b[0])
plt.plot(fpr_h[0], tpr_h[0], color='red',
         lw=lw, label='HPS (AUC=%0.3f)' % roc_auc_h[0])
plt.plot([0, 1], [0, 1], color='grey', lw=lw, linestyle='--')
plt.xlim([-0.05, 1.00])
plt.ylim([0.0, 1.05])
plt.xlabel('FPR', fontdict={'size': 40})
plt.ylabel('TPR', fontdict={'size': 40})
plt.title('ADNI2 AD vs CN', fontdict={'size': 40})
plt.legend(loc="lower right", prop={'size': 25})
#ax.legend(loc='center left', bbox_to_anchor=(1,0.5), prop={'size': 30})
plt.show()
fig.savefig(path_results + 'adni2_ad_roc_multi.pdf', bbox_inches='tight')

MCI stable vs converters¶

In [82]:

adni2_mci = pd.read_csv('/home/angela/Desktop/vcog_paper/adni1_vbm_adcn_subtypes_20171209/7clus/adni2_mci_bl_demog_weights.csv')

In [83]:

mask_mci = adni2_mci.loc[:,'MCI'].values.astype(bool)
adni2_mci = adni2_mci.iloc[mask_mci]

In [84]:

adni2_mci.dropna(subset=['sub1','age_scan','gender','mean_gm','tiv',
                        'ADAS13','ADNI_MEM','ADNI_EF','BNTTOTAL','CLOCKSCOR'],inplace=True)

In [85]:

adni2_mci = adni2_mci[['RID','age_scan','gender','mean_gm','tiv',
             'ADAS13','ADNI_MEM','ADNI_EF','BNTTOTAL','CLOCKSCOR',
             'sub1','sub2','sub3','sub4','sub5','sub6','sub7',
             'ABETA','TAU','conv_2_ad','AD','MCI','CN','APOE4_bin','DX','SUMMARYSUVR_WHOLECEREBNORM_1.11CUTOFF',
                       'Month_conv']]

In [86]:

x_ = adni2_mci.iloc[:, adni2_mci.columns.get_loc("ADAS13"):adni2_mci.columns.get_loc("CLOCKSCOR")+1].values
#x_ = scalerX.transform(x_)
y_ = adni2_mci[['conv_2_ad']].values.ravel()


confounds = adni2_mci[['gender','age_scan','mean_gm','tiv']].values
#confounds = data[['sex','age_r']].values[mask_mci,:]
#confounds[:, 1:] = scaler.transform(confounds[:, 1:])
#confounds[:, 0] = preprocessing.binarize(confounds[:, 0].reshape(-1, 1), threshold=1)[:, 0]
#confounds = scaler.transform(confounds)
#x_ = crm.transform(confounds, x_)

x_ = scaler.transform(np.hstack((x_,confounds)))

x_.shape, y_.shape, confounds.shape

Out[86]:

((235, 9), (235,), (235, 4))

In [87]:

array_results, dic_results = hpc.predict(x_, x_)

# Level 1
print('Level 1')
stats_mask(y_, (dic_results['s1df'][:,0]>0).astype(float))

print('Level 2')
stats_mask(y_, (dic_results['s1df'][:,0]>0).astype(float), dic_results['s2df'][:,1]>0)
#stats_mask(dic_results['s2df'][:,2]>0)
#stats_mask(dic_results['s2df'][:,3]>0)

Level 1
------------------------
Ratio: 1.0
#    :  55.0
# true values:  235
ACC  :  0.8425531914893617
Level 2
------------------------
Ratio: 0.5636363636363636
#    :  31.0
# true values:  40
ACC  :  0.775

In [88]:

y_pred = (dic_results['s1df'][:,0]>0).astype(float)
lr_decision = dic_results['s2df'][:,1]

In [89]:

predic_stats(y_, y_pred, lr_decision)

Total number of TARGET subjects:  55.0
Total number of NON-TARGET subjects:  180.0
Stage 1 number of hits (true and false positives):  60.0
Stage 1 TRUE positives:  39.0
Stage 1 FALSE positives:  21.0
Stage 1 TRUE negatives:  159.0
Total number of flagged HPC-AD subjects:  40.0
Number of flagged HPC-AD subjects that are TRUE positives:  31
Number of flagged HPC-AD subjects that are FALSE positives:  9.0
Number of true negatives:  171.0
#############################
Stage 1 stats for TARGET vs NON-TARGET
Precision for AD:  0.65
Recall (or sensitivity)  for AD:  0.7090909090909091
Specificity:  0.8833333333333333
Adjusted precision for 33.6% baseline rate:  0.7546358505778016
Accuracy:  0.8425531914893617
#############################
Stage 2 stats for TARGET vs NON-TARGET
Precision for HPC-AD:  0.775
Recall (or sensitivity) for HPC-AD:  0.5636363636363636
Specificity:  0.95
Adjusted precision for 33.6% baseline rate:  0.8508413657899034
Accuracy:  0.8595744680851064

Out[89]:

(0.7090909090909091,
 0.8833333333333333,
 0.65,
 0.8425531914893617,
 0.5636363636363636,
 0.95,
 0.775,
 0.8595744680851064)

ROC curve¶

base¶

In [90]:

y_predicted = base.predict(x_)
y_score = base.decision_function(x_)
y_score.shape

Out[90]:

(235,)

In [91]:

y_true = y_.astype(int)
y_true = label_binarize(y_true, classes=[0, 1])
n_classes = y_true.shape[1]
y_score = np.reshape(y_score, (y_score.shape[0],1))

# Compute ROC curve and ROC area for each class
fpr_b = dict()
tpr_b = dict()
roc_auc_b = dict()
for i in range(n_classes):
    fpr_b[i], tpr_b[i], _ = roc_curve(y_true[:, i], y_score[:, i])
    roc_auc_b[i] = auc(fpr_b[i], tpr_b[i])
    
# Compute micro-average ROC curve and ROC area
fpr_b["micro"], tpr_b["micro"], _ = roc_curve(y_true.ravel(), y_score.ravel())
roc_auc_b["micro"] = auc(fpr_b["micro"], tpr_b["micro"])

In [92]:

average_precision_b = average_precision_score(y_true, y_score)

print('Average precision-recall score: {0:0.2f}'.format(
      average_precision_b))

precision_b, recall_b, _ = precision_recall_curve(y_true, y_score)

Average precision-recall score: 0.76

HPS¶

In [93]:

y_true = y_.astype(int)
y_true = label_binarize(y_, classes=[0, 1])
n_classes = y_true.shape[1]
y_score = np.reshape(lr_decision, (y_score.shape[0],1))

In [94]:

# Compute ROC curve and ROC area for each class
fpr_h = dict()
tpr_h = dict()
roc_auc_h = dict()
for i in range(n_classes):
    fpr_h[i], tpr_h[i], _ = roc_curve(y_true[:, i], y_score[:, i])
    roc_auc_h[i] = auc(fpr_h[i], tpr_h[i])

# Compute micro-average ROC curve and ROC area
fpr_h["micro"], tpr_h["micro"], _ = roc_curve(y_true.ravel(), y_score.ravel())
roc_auc_h["micro"] = auc(fpr_h["micro"], tpr_h["micro"])

In [95]:

average_precision_h = average_precision_score(y_true, y_score)

print('Average precision-recall score: {0:0.2f}'.format(
      average_precision_h))

precision_h, recall_h, _ = precision_recall_curve(y_true, y_score)

Average precision-recall score: 0.71

rbf kernel svm¶

In [96]:

y_predicted = grclf_svm.predict(x_)
y_score = grclf_svm.decision_function(x_)
y_score.shape

Out[96]:

(235,)

In [97]:

y_true = y_.astype(int)
y_true = label_binarize(y_true, classes=[0, 1])
n_classes = y_true.shape[1]
y_score = np.reshape(y_score, (y_score.shape[0],1))

# Compute ROC curve and ROC area for each class
fpr_svcrbf = dict()
tpr_svcrbf = dict()
roc_auc_svcrbf = dict()
for i in range(n_classes):
    fpr_svcrbf[i], tpr_svcrbf[i], _ = roc_curve(y_true[:, i], y_score[:, i])
    roc_auc_svcrbf[i] = auc(fpr_svcrbf[i], tpr_svcrbf[i])
    
# Compute micro-average ROC curve and ROC area
fpr_svcrbf["micro"], tpr_svcrbf["micro"], _ = roc_curve(y_true.ravel(), y_score.ravel())
roc_auc_svcrbf["micro"] = auc(fpr_svcrbf["micro"], tpr_svcrbf["micro"])

k nearest neighbours¶

In [98]:

y_predicted = grclf_knn.predict(x_)
y_score = grclf_knn.predict_proba(x_)
y_score = y_score[:,1] # take positive class
y_score.shape

Out[98]:

(235,)

In [99]:

y_true = y_.astype(int)
y_true = label_binarize(y_true, classes=[0, 1])
n_classes = y_true.shape[1]
y_score = np.reshape(y_score, (y_score.shape[0],1))

# Compute ROC curve and ROC area for each class
fpr_knn = dict()
tpr_knn = dict()
roc_auc_knn = dict()
for i in range(n_classes):
    fpr_knn[i], tpr_knn[i], _ = roc_curve(y_true[:, i], y_score[:, i])
    roc_auc_knn[i] = auc(fpr_knn[i], tpr_knn[i])
    
# Compute micro-average ROC curve and ROC area
fpr_knn["micro"], tpr_knn["micro"], _ = roc_curve(y_true.ravel(), y_score.ravel())
roc_auc_knn["micro"] = auc(fpr_knn["micro"], tpr_knn["micro"])

random forest¶

In [100]:

y_predicted = grclf_rf.predict(x_)
y_score = grclf_rf.predict_proba(x_)
y_score = y_score[:,1] # take positive class
y_score.shape

Out[100]:

(235,)

In [101]:

y_true = y_.astype(int)
y_true = label_binarize(y_true, classes=[0, 1])
n_classes = y_true.shape[1]
y_score = np.reshape(y_score, (y_score.shape[0],1))

# Compute ROC curve and ROC area for each class
fpr_rf = dict()
tpr_rf = dict()
roc_auc_rf = dict()
for i in range(n_classes):
    fpr_rf[i], tpr_rf[i], _ = roc_curve(y_true[:, i], y_score[:, i])
    roc_auc_rf[i] = auc(fpr_rf[i], tpr_rf[i])
    
# Compute micro-average ROC curve and ROC area
fpr_rf["micro"], tpr_rf["micro"], _ = roc_curve(y_true.ravel(), y_score.ravel())
roc_auc_rf["micro"] = auc(fpr_rf["micro"], tpr_rf["micro"])

gaussian naive bayes¶

In [102]:

y_predicted = clf_gnb.predict(x_)
y_score = clf_gnb.predict_proba(x_)
y_score = y_score[:,1] # take positive class
y_score.shape

Out[102]:

(235,)

In [103]:

y_true = y_.astype(int)
y_true = label_binarize(y_true, classes=[0, 1])
n_classes = y_true.shape[1]
y_score = np.reshape(y_score, (y_score.shape[0],1))

# Compute ROC curve and ROC area for each class
fpr_nb = dict()
tpr_nb = dict()
roc_auc_nb = dict()
for i in range(n_classes):
    fpr_nb[i], tpr_nb[i], _ = roc_curve(y_true[:, i], y_score[:, i])
    roc_auc_nb[i] = auc(fpr_nb[i], tpr_nb[i])
    
# Compute micro-average ROC curve and ROC area
fpr_nb["micro"], tpr_nb["micro"], _ = roc_curve(y_true.ravel(), y_score.ravel())
roc_auc_nb["micro"] = auc(fpr_nb["micro"], tpr_nb["micro"])

In [104]:

#plt.figure()
fig, ax = plt.subplots()
fig.set_size_inches(10,8)
lw = 4
plt.rc('xtick', labelsize=40)
plt.rc('ytick', labelsize=40)
plt.plot(fpr_svcrbf[0], tpr_svcrbf[0], color='green',
         lw=lw, label='RBF SVM (AUC=%0.3f)' % roc_auc_svcrbf[0])
plt.plot(fpr_knn[0], tpr_knn[0], color='pink',
         lw=lw, label='KNN (AUC=%0.3f)' % roc_auc_knn[0])
plt.plot(fpr_rf[0], tpr_rf[0], color='brown',
         lw=lw, label='RF (AUC=%0.3f)' % roc_auc_rf[0])
plt.plot(fpr_nb[0], tpr_nb[0], color='orange',
         lw=lw, label='GNB (AUC=%0.3f)' % roc_auc_nb[0])
plt.plot(fpr_b[0], tpr_b[0], color='blue',
         lw=lw, label='Base (AUC=%0.3f)' % roc_auc_b[0])
plt.plot(fpr_h[0], tpr_h[0], color='red',
         lw=lw, label='HPS (AUC=%0.3f)' % roc_auc_h[0])
plt.plot([0, 1], [0, 1], color='grey', lw=lw, linestyle='--')
plt.xlim([-0.05, 1.00])
plt.ylim([0.0, 1.05])
plt.xlabel('FPR', fontdict={'size': 40})
plt.ylabel('TPR', fontdict={'size': 40})
plt.title('ADNI2 pMCI vs sMCI', fontdict={'size': 40})
plt.legend(loc="lower right", prop={'size': 25})
#ax.legend(loc='center left', bbox_to_anchor=(1,0.5), prop={'size': 30})
plt.show()
fig.savefig(path_results + 'adni2_mci_roc_multi.pdf', bbox_inches='tight')

In [ ]: