Analysis of the ABIDE dataset¶

Imports¶

In [1]:

import warnings
import os
from os.path import join
import numpy as np
import pandas as pd
from scipy.stats import kurtosis, skew

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import quantile_transform
from sklearn.metrics import roc_curve, RocCurveDisplay
from nilearn.datasets.utils import _uncompress_file, _fetch_file
from nilearn.connectome import ConnectivityMeasure

from neurocombat_sklearn import CombatModel

import statsmodels.api as sm
from statsmodels.regression.linear_model import OLS
from statsmodels.formula.api import ols as ols_f

from mlconfound.stats import full_confound_test, partial_confound_test
from mlconfound.plot import plot_graph
from mlconfound.stats import _r2_cat_cont, _r2_cont_cont, _r2_cat_cat

/home/tspisak/src/mlconfound/venv/lib/python3.8/site-packages/nilearn/datasets/__init__.py:86: FutureWarning: Fetchers from the nilearn.datasets module will be updated in version 0.9 to return python strings instead of bytes and Pandas dataframes instead of Numpy arrays.
  warn("Fetchers from the nilearn.datasets module will be "

Load data¶

In [2]:

data_dir = '../data_in/ABIDE'

url = 'https://osf.io/hc4md/download'

# Download the zip file, first
dl_file = _fetch_file(url, data_dir=data_dir)

# Second, uncompress the downloaded zip file
_uncompress_file(dl_file, verbose=2)

Downloading data from https://osf.io/hc4md/download ...

Downloaded 1803198464 of 1811491701 bytes (99.5%,    0.7s remaining) ...done. (162 seconds, 2 min)
Extracting data from ../data_in/ABIDE/download..... done.

In [3]:

def _get_paths(phenotypic, atlas, timeseries_dir):
    """
    """
    timeseries = []
    IDs_subject = []
    diagnosis = []
    subject_ids = phenotypic['SUB_ID']
    mean_fd = []
    num_fd = []
    perc_fd = []
    site = []
    for index, subject_id in enumerate(subject_ids):
        this_pheno = phenotypic[phenotypic['SUB_ID'] == subject_id]
        this_timeseries = join(timeseries_dir, atlas,
                               str(subject_id) + '_timeseries.txt')
        if os.path.exists(this_timeseries):
            timeseries.append(np.loadtxt(this_timeseries))
            IDs_subject.append(subject_id)
            diagnosis.append(this_pheno['DX_GROUP'].values[0])
            mean_fd.append(this_pheno['func_mean_fd'].values[0])
            num_fd.append(this_pheno['func_num_fd'].values[0])
            perc_fd.append(this_pheno['func_perc_fd'].values[0])
            site.append(this_pheno['SITE_ID'].values[0])
    return timeseries, diagnosis, IDs_subject, mean_fd, num_fd, perc_fd, site

Download the phenotypic summary information file form the preprocessed connectomes project.

First read: http://preprocessed-connectomes-project.org/abide/download.html
Then download: https://s3.amazonaws.com/fcp-indi/data/Projects/ABIDE_Initiative/Phenotypic_V1_0b_preprocessed1.csv
Copy the csv file into the data_in/ABIDE directory

In [4]:

phenotypic = pd.read_csv('../data_in/ABIDE/Phenotypic_V1_0b_preprocessed1.csv').iloc[:,2:]
timeseries, diagnosis, IDs_subject, mean_fd, num_fd, perc_fd, site = _get_paths(phenotypic, "BASC/regions", '../data_in/ABIDE/')
sites, site_int = np.unique(site, return_inverse=True)
phenotypic
    

Out[4]:

	SUB_ID	X	subject	SITE_ID	FILE_ID	DX_GROUP	DSM_IV_TR	AGE_AT_SCAN	SEX	HANDEDNESS_CATEGORY	...	qc_notes_rater_1	qc_anat_rater_2	qc_anat_notes_rater_2	qc_func_rater_2	qc_func_notes_rater_2	qc_anat_rater_3	qc_anat_notes_rater_3	qc_func_rater_3	qc_func_notes_rater_3	SUB_IN_SMP
0	50002	1	50002	PITT	no_filename	1	1	16.77	1	Ambi	...	NaN	OK	NaN	fail	ic-parietal-cerebellum	OK	NaN	fail	ERROR #24	1
1	50003	2	50003	PITT	Pitt_0050003	1	1	24.45	1	R	...	NaN	OK	NaN	OK	NaN	OK	NaN	OK	NaN	1
2	50004	3	50004	PITT	Pitt_0050004	1	1	19.09	1	R	...	NaN	OK	NaN	OK	NaN	OK	NaN	OK	NaN	1
3	50005	4	50005	PITT	Pitt_0050005	1	1	13.73	2	R	...	NaN	OK	NaN	maybe	ic-parietal-cerebellum	OK	NaN	OK	NaN	0
4	50006	5	50006	PITT	Pitt_0050006	1	1	13.37	1	L	...	NaN	OK	NaN	maybe	ic-parietal slight	OK	NaN	OK	NaN	1
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1107	51583	1108	51583	SBL	SBL_0051583	1	2	35.00	1	NaN	...	NaN	OK	NaN	OK	ic-cerebellum-temporal_lobe	OK	NaN	OK	NaN	0
1108	51584	1109	51584	SBL	SBL_0051584	1	2	49.00	1	NaN	...	NaN	OK	NaN	maybe	vmpfc dropout	OK	NaN	OK	NaN	0
1109	51585	1110	51585	SBL	SBL_0051585	1	1	27.00	1	NaN	...	NaN	OK	NaN	maybe	ic-cerebellum-temporal_lobe	OK	NaN	OK	NaN	0
1110	51606	1111	51606	MAX_MUN	MaxMun_a_0051606	1	2	29.00	2	R	...	NaN	OK	NaN	maybe	ic-cerebellum	OK	NaN	OK	NaN	0
1111	51607	1112	51607	MAX_MUN	MaxMun_a_0051607	1	2	26.00	1	R	...	NaN	OK	NaN	maybe	ic-cerebellum	OK	NaN	OK	NaN	1

1112 rows × 104 columns

In [5]:

sns.histplot(mean_fd, color='gray')
plt.savefig('../data_out/fig/abide_motion_hist.pdf')
kurtosis(mean_fd), skew(mean_fd)

Out[5]:

(41.170464362909215, 5.142594423450439)

In [6]:

sns.histplot(np.log(mean_fd), color='gray')
kurtosis(np.log(mean_fd)), skew(np.log(mean_fd))

Out[6]:

(0.492398885189945, 0.6060249878053774)

In [7]:

rng = np.random.default_rng(42)
mean_fd_trf = quantile_transform(np.array([mean_fd]).T, output_distribution='normal',
                                 n_quantiles=len(mean_fd)).flatten()

sns.histplot(mean_fd_trf, color='gray')
plt.savefig('../data_out/fig/abide_motion_quanttrf_hist.pdf')

kurtosis(mean_fd_trf), skew(mean_fd_trf)
mean_fd = mean_fd_trf

Binning motion data (to be used later)¶

In [8]:

# binning mean_fd
bins = 10 # approximately 80 subject per motion group

limits = np.quantile(mean_fd, np.arange(0, 1, 1/bins))
mean_fd_binned = np.digitize(mean_fd, limits)

Calculate connectivity¶

In [9]:

connections = ConnectivityMeasure(kind='tangent', vectorize=True, discard_diagonal=True)
conn_coefs = connections.fit_transform(timeseries)

In [10]:

_, y = np.unique(diagnosis, return_inverse=True)
X = conn_coefs

Machine learning (raw)¶

In [11]:

outer_cv = StratifiedKFold(10)
inner_cv = StratifiedKFold(10) 
model = Pipeline([
    ('varthr', VarianceThreshold(0)),   # omit zero variance columns (diagonal)
    ('model', LogisticRegression())])

p_grid = {'model__C': [0.1, 1, 10]}

clf = GridSearchCV(estimator=model, param_grid=p_grid, cv=StratifiedKFold(10),
                   scoring="roc_auc", return_train_score=False,
                   n_jobs=-1)

all_models = []
best_params = []
predicted = np.zeros(len(y))
predicted_prob = np.zeros(len(y))
nested_scores_train = np.zeros(outer_cv.get_n_splits(X))
nested_scores_test = np.zeros(outer_cv.get_n_splits(X))   
                                    
print("model\tinner_cv mean score\touter vc score")
i=0
for train, test in outer_cv.split(X, y):

    clf.fit(X[train], y[train])
                                    
    print('cv:', i, str(clf.best_params_) + " " + str(clf.best_score_) + " " + str(clf.score(X[test], y[test])))
                                    
    all_models.append(clf.best_estimator_)
    best_params.append(clf.best_params_)
    
    predicted[test] = clf.predict(X[test])
    predicted_prob[test] = clf.predict_proba(X[test])[:,0]
                                    
    nested_scores_train[i] = clf.best_score_
    nested_scores_test[i] = clf.score(X[test], y[test])
    i = i+1

model	inner_cv mean score	outer vc score
cv: 0 {'model__C': 0.1} 0.7100334219236659 0.690880169671262
cv: 1 {'model__C': 0.1} 0.7120691684716075 0.6458112407211029
cv: 2 {'model__C': 0.1} 0.7018831113343309 0.6851063829787234
cv: 3 {'model__C': 0.1} 0.683804187462724 0.8202127659574467
cv: 4 {'model__C': 1} 0.6986948621094962 0.7351063829787234
cv: 5 {'model__C': 0.1} 0.7201791512767122 0.6872340425531914
cv: 6 {'model__C': 0.1} 0.6974297468199907 0.7592391304347826
cv: 7 {'model__C': 0.1} 0.7031858949541876 0.7597826086956522
cv: 8 {'model__C': 0.1} 0.7127732261878603 0.7440217391304348
cv: 9 {'model__C': 0.1} 0.7333955035174546 0.6059782608695652

In [12]:

print("** Mean score in the inner crossvaludation (inner_cv):\t" + str(nested_scores_train.mean()))
print("** Mean Nested Crossvalidation Score (outer_cv):\t" + str(nested_scores_test.mean()))

fpr, tpr, _ = roc_curve(y, predicted_prob, pos_label=0)
fig, ax = plt.subplots(figsize=(5,2))
RocCurveDisplay(fpr=fpr, tpr=tpr).plot(ax=ax) 
plt.savefig('../data_out/fig/abide_raw_rocplot.pdf')

** Mean score in the inner crossvaludation (inner_cv):	0.7073448274058031
** Mean Nested Crossvalidation Score (outer_cv):	0.7133372723990885

In [13]:

plt.figure(figsize=(5,1.5))

pal=[sns.color_palette("coolwarm", 10)[-1], sns.color_palette("coolwarm", 10)[0]]
sns.stripplot(x=site, y=predicted_prob, hue=y, palette=pal, alpha=0.2, jitter=0.2, dodge=True)
ax=sns.boxplot(x=site, y=predicted_prob, hue=y, showfliers = False)
for box in ax.artists:
    box.set_facecolor((1,1,1,0))
plt.xticks(rotation=90)
for i in range(len(np.unique(site))):
    plt.axvline(i+0.5, color="gray", alpha=0.5, linewidth=0.5)
plt.savefig('../data_out/fig/abide_site_raw_striplot.pdf')

In [14]:

plt.figure(figsize=(5,2))
sns.stripplot(x=y, y=predicted_prob, hue=mean_fd_binned,
              palette="coolwarm", alpha=0.4, jitter=0.4, dodge=True)
sns.boxplot(x=y, y=predicted_prob, color=(1,1,1,1))
plt.legend([],[], frameon=False)
plt.savefig('../data_out/fig/abide_motion_raw_stripplot.pdf')

In [15]:

plot_graph(partial_confound_test(y, predicted_prob, mean_fd, cat_y=True, cat_yhat=False,
                                     random_state=42),
          outfile_base='../data_out/fig/abide_motion_raw_partial')

Permuting: 100%|██████████| 1000/1000 [00:01<00:00, 652.30it/s]

Out[15]:

In [16]:

plot_graph(full_confound_test(y, predicted_prob, mean_fd, cat_y=True, cat_yhat=False,
                                 random_state=42),
          outfile_base='../data_out/fig/abide_motion_raw_full')

Permuting: 100%|██████████| 1000/1000 [00:01<00:00, 511.06it/s]

Out[16]:

In [17]:

nulldist = []
unpermuted = _r2_cont_cont(predicted_prob, mean_fd)

print(unpermuted)

for i in range(1000):
    yperm = np.random.permutation(predicted_prob)
    nulldist.append(_r2_cont_cont(yperm, mean_fd))
(nulldist >= unpermuted).sum()/len(nulldist)

0.11075648305937133

Out[17]:

0.0

In [18]:

nulldist = []
unpermuted = _r2_cat_cont(y, predicted_prob)

print(unpermuted)

for i in range(1000):
    yperm = np.random.permutation(predicted_prob)
    nulldist.append(_r2_cat_cont(y, yperm))
(nulldist >= unpermuted).sum()/len(nulldist)

0.12636970891102528

Out[18]:

0.0

In [19]:

plot_graph(partial_confound_test(y, predicted_prob, site_int,
                                     cat_y=True, cat_yhat=False, cat_c=True,
                                     random_state=42),
          outfile_base='../data_out/fig/abide_site_raw_partial')

Permuting: 100%|██████████| 1000/1000 [00:02<00:00, 466.23it/s]

Out[19]:

In [20]:

plot_graph(full_confound_test(y, predicted_prob, site_int,
                                 cat_y=True, cat_yhat=False, cat_c=True,
                                 random_state=42),
          outfile_base='../data_out/fig/abide_site_raw_full')

Permuting: 100%|██████████| 1000/1000 [00:02<00:00, 496.97it/s]

Out[20]:

In [21]:

nulldist = []
unpermuted = _r2_cat_cont(site_int, predicted_prob)

print(unpermuted)

for i in range(1000):
    yperm = np.random.permutation(predicted_prob)
    nulldist.append(_r2_cat_cont(site_int, yperm))
(nulldist >= unpermuted).sum()/len(nulldist)

0.16862432442215103

Out[21]:

0.0

In [22]:

nulldist = []
unpermuted = _r2_cat_cont(y, predicted_prob)

print(unpermuted)

for i in range(1000):
    yperm = np.random.permutation(predicted_prob)
    nulldist.append(_r2_cat_cont(y, yperm))
(nulldist >= unpermuted).sum()/len(nulldist)

0.12636970891102528

Out[22]:

0.0

Regress out motion from the feature¶

In [23]:

# regress-out motion from connectivity
X_adj = np.zeros_like(X)
for i in range(X.shape[1]):
    OLS_model = OLS(X[:,i], sm.add_constant(mean_fd)).fit()  # training the model
    X_adj[:, i] = OLS_model.resid

In [24]:

outer_cv = StratifiedKFold(10)
inner_cv = StratifiedKFold(10) 
model = Pipeline([
    ('varthr', VarianceThreshold(0)),   # omit zero variance columns (diagonal)
    #('fsel', SelectKBest(f_regression)),
    ('model', LogisticRegression())])

p_grid = {#'fsel__k': [500, 1000, 2000],
          'model__C': [0.1, 1, 10]}

clf = GridSearchCV(estimator=model, param_grid=p_grid, cv=StratifiedKFold(10),
                   scoring="roc_auc", return_train_score=False,
                   n_jobs=-1)

all_models = []
best_params = []
predicted = np.zeros(len(y))
predicted_prob = np.zeros(len(y))
nested_scores_train = np.zeros(outer_cv.get_n_splits(X_adj))
nested_scores_test = np.zeros(outer_cv.get_n_splits(X_adj))   
                                    
print("model\tinner_cv mean score\touter vc score")
i=0
for train, test in outer_cv.split(X, y):

    clf.fit(X_adj[train], y[train])
                                    
    print('cv:', i, str(clf.best_params_) + " " + str(clf.best_score_) + " " + str(clf.score(X_adj[test], y[test])))
                                    
    all_models.append(clf.best_estimator_)
    best_params.append(clf.best_params_)
    
    predicted[test] = clf.predict(X_adj[test])
    predicted_prob[test] = clf.predict_proba(X_adj[test])[:,0]
                                    
    nested_scores_train[i] = clf.best_score_
    nested_scores_test[i] = clf.score(X_adj[test], y[test])
    i = i+1

model	inner_cv mean score	outer vc score
cv: 0 {'model__C': 0.1} 0.575732134573598 0.679745493107105
cv: 1 {'model__C': 0.1} 0.5862695012085257 0.5922587486744432
cv: 2 {'model__C': 0.1} 0.5971689023518292 0.5494680851063829
cv: 3 {'model__C': 0.1} 0.5795941847771116 0.6558510638297872
cv: 4 {'model__C': 0.1} 0.5901033873594849 0.6446808510638298
cv: 5 {'model__C': 0.1} 0.6414546515156271 0.45
cv: 6 {'model__C': 0.1} 0.5846736980883322 0.6701086956521739
cv: 7 {'model__C': 0.1} 0.6039699542748325 0.5945652173913043
cv: 8 {'model__C': 0.1} 0.6184660706002169 0.5201086956521739
cv: 9 {'model__C': 0.1} 0.6278473595546766 0.5266304347826086

In [25]:

print("** Mean score in the inner crossvaludation (inner_cv):\t" + str(nested_scores_train.mean()))
print("** Mean Nested Crossvalidation Score (outer_cv):\t" + str(nested_scores_test.mean()))

fpr, tpr, _ = roc_curve(y, predicted_prob, pos_label=0)
fig, ax = plt.subplots(figsize=(5,2))
RocCurveDisplay(fpr=fpr, tpr=tpr).plot(ax=ax) 
plt.savefig('../data_out/fig/abide_motion_reg_rocplot.pdf')
    

** Mean score in the inner crossvaludation (inner_cv):	0.6005279844304235
** Mean Nested Crossvalidation Score (outer_cv):	0.5883417285259809

In [26]:

plt.figure(figsize=(5,2))
sns.stripplot(x=y, y=predicted_prob, hue=mean_fd_binned,
              palette="coolwarm", alpha=0.4, jitter=0.4, dodge=True)
sns.boxplot(x=y, y=predicted_prob, color=(1,1,1,1))
plt.legend([],[], frameon=False)
plt.savefig('../data_out/fig/abide_motion_reg_stripplot.pdf')

In [27]:

plot_graph(partial_confound_test(y, predicted_prob, mean_fd, cat_y=True, cat_yhat=False,
                                     random_state=42),
          outfile_base='../data_out/fig/abide_motion_reg_partial')

Permuting: 100%|██████████| 1000/1000 [00:01<00:00, 792.96it/s]

Out[27]:

In [28]:

plot_graph(full_confound_test(y, predicted_prob, mean_fd, cat_y=True, cat_yhat=False,
                                 random_state=42),
          outfile_base='../data_out/fig/abide_motion_reg_full')

Permuting: 100%|██████████| 1000/1000 [00:02<00:00, 498.13it/s]

Out[28]:

In [29]:

nulldist = []
unpermuted = _r2_cont_cont(predicted_prob, mean_fd)

print(unpermuted)

for i in range(1000):
    yperm = np.random.permutation(predicted_prob)
    nulldist.append(_r2_cont_cont(yperm, mean_fd))
(nulldist >= unpermuted).sum()/len(nulldist)

0.35940432840757963

Out[29]:

0.0

In [30]:

nulldist = []
unpermuted = _r2_cat_cont(y, predicted_prob)

print(unpermuted)

for i in range(1000):
    yperm = np.random.permutation(predicted_prob)
    nulldist.append(_r2_cat_cont(y, yperm))
(nulldist >= unpermuted).sum()/len(nulldist)

0.01983830774607609

Out[30]:

0.0

Regress out site¶

In [31]:

# regress-out acquisition from connectivity
X_adj = np.zeros_like(X)
for i in range(X.shape[1]):
    tmp = pd.DataFrame({
        'x': site_int,
        'y': X[:,i]
    })
    OLS_model = ols_f("y ~ C(x)", tmp).fit()  # training the model
    X_adj[:, i] = OLS_model.resid.values

In [32]:

outer_cv = StratifiedKFold(10)
inner_cv = StratifiedKFold(10) 
model = Pipeline([
    ('varthr', VarianceThreshold(0)),   # omit zero variance columns (diagonal)
    #('fsel', SelectKBest(f_regression)),
    ('model', LogisticRegression())])

p_grid = {#'fsel__k': [500, 1000, 2000],
          'model__C': [0.1, 1, 10]}

clf = GridSearchCV(estimator=model, param_grid=p_grid, cv=StratifiedKFold(10),
                   scoring="roc_auc", return_train_score=False,
                   n_jobs=-1)

all_models = []
best_params = []
predicted = np.zeros(len(y))
predicted_prob = np.zeros(len(y))
nested_scores_train = np.zeros(outer_cv.get_n_splits(X_adj))
nested_scores_test = np.zeros(outer_cv.get_n_splits(X_adj))   
                                    
print("model\tinner_cv mean score\touter vc score")
i=0
for train, test in outer_cv.split(X, y):

    clf.fit(X_adj[train], y[train])
                                    
    print('cv:', i, str(clf.best_params_) + " " + str(clf.best_score_) + " " + str(clf.score(X_adj[test], y[test])))
                                    
    all_models.append(clf.best_estimator_)
    best_params.append(clf.best_params_)
    
    predicted[test] = clf.predict(X_adj[test])
    predicted_prob[test] = clf.predict_proba(X_adj[test])[:,0]
                                    
    nested_scores_train[i] = clf.best_score_
    nested_scores_test[i] = clf.score(X_adj[test], y[test])
    i = i+1

model	inner_cv mean score	outer vc score
cv: 0 {'model__C': 0.1} 0.7512998189827458 0.6956521739130435
cv: 1 {'model__C': 0.1} 0.7602222431490724 0.653764581124072
cv: 2 {'model__C': 0.1} 0.7382248101760297 0.7797872340425532
cv: 3 {'model__C': 10} 0.7533220502732698 0.8308510638297872
cv: 4 {'model__C': 0.1} 0.7311433384604116 0.7856382978723404
cv: 5 {'model__C': 0.1} 0.7137365152609055 0.8664893617021276
cv: 6 {'model__C': 0.1} 0.7290465552660674 0.7478260869565218
cv: 7 {'model__C': 0.1} 0.7255997722461138 0.8108695652173913
cv: 8 {'model__C': 0.1} 0.737317238841629 0.771195652173913
cv: 9 {'model__C': 0.1} 0.7495317617268836 0.6157608695652174

In [33]:

print("** Mean score in the inner crossvaludation (inner_cv):\t" + str(nested_scores_train.mean()))
print("** Mean Nested Crossvalidation Score (outer_cv):\t" + str(nested_scores_test.mean()))

fpr, tpr, _ = roc_curve(y, predicted_prob, pos_label=0)
fig, ax = plt.subplots(figsize=(5,2))
RocCurveDisplay(fpr=fpr, tpr=tpr).plot(ax=ax) 
plt.savefig('../data_out/fig/abide_site_reg_rocplot.pdf')
    

** Mean score in the inner crossvaludation (inner_cv):	0.7389444104383128
** Mean Nested Crossvalidation Score (outer_cv):	0.7557834886396967

In [34]:

plt.figure(figsize=(5,1.5))

pal=[sns.color_palette("coolwarm", 10)[-1], sns.color_palette("coolwarm", 10)[0]]
sns.stripplot(x=site, y=predicted_prob, hue=y, palette=pal, alpha=0.2, jitter=0.2, dodge=True)
ax=sns.boxplot(x=site, y=predicted_prob, hue=y, showfliers = False)
for box in ax.artists:
    box.set_facecolor((1,1,1,0))
plt.xticks(rotation=90)
for i in range(len(np.unique(site))):
    plt.axvline(i+0.5, color="gray", alpha=0.5, linewidth=0.5)
plt.savefig('../data_out/fig/abide_site_reg_striplot.pdf')

In [35]:

plot_graph(partial_confound_test(y, predicted_prob, site_int,
                                     cat_y=True, cat_yhat=False, cat_c=True,
                                     random_state=42),
          outfile_base='../data_out/fig/abide_site_reg_partial')

Permuting: 100%|██████████| 1000/1000 [00:02<00:00, 484.24it/s]

Out[35]:

In [36]:

plot_graph(full_confound_test(y, predicted_prob, site_int,
                                 cat_y=True, cat_yhat=False, cat_c=True,
                                 random_state=42),
          outfile_base='../data_out/fig/abide_site_reg_full')

Permuting: 100%|██████████| 1000/1000 [00:01<00:00, 509.17it/s]

Out[36]:

In [37]:

nulldist = []
unpermuted = _r2_cat_cont(site_int, predicted_prob)

print(unpermuted)

for i in range(1000):
    yperm = np.random.permutation(predicted_prob)
    nulldist.append(_r2_cat_cont(site_int, yperm))
(nulldist >= unpermuted).sum()/len(nulldist)

0.04243881759371848

Out[37]:

0.004

In [38]:

nulldist = []
unpermuted = _r2_cat_cont(y, predicted_prob)

print(unpermuted)

for i in range(1000):
    yperm = np.random.permutation(predicted_prob)
    nulldist.append(_r2_cat_cont(y, yperm))
(nulldist >= unpermuted).sum()/len(nulldist)

0.1730211479165208

Out[38]:

0.0

Combat on binned motion data¶

In [39]:

outer_cv = StratifiedKFold(10)
inner_cv = StratifiedKFold(10) 
model = Pipeline([
    ('varthr', VarianceThreshold(0)),   # omit zero variance columns (diagonal)
    #('fsel', SelectKBest(f_regression)),
    ('model', LogisticRegression())])

p_grid = {#'fsel__k': [500, 1000, 2000],
          'model__C': [0.1, 1, 10]}

clf = GridSearchCV(estimator=model, param_grid=p_grid, cv=StratifiedKFold(10),
                   scoring="roc_auc", return_train_score=False,
                   n_jobs=-1)

all_models = []
best_params = []
predicted = np.zeros(len(y))
predicted_prob = np.zeros(len(y))
nested_scores_train = np.zeros(outer_cv.get_n_splits(X))
nested_scores_test = np.zeros(outer_cv.get_n_splits(X))   
                                    
print("model\tinner_cv mean score\touter vc score")
i=0
for train, test in outer_cv.split(X, y):
    
    comb = CombatModel()
    X_train_combat = comb.fit_transform(X[:,np.sum(X,0)!=0][train],
                                   np.array([mean_fd_binned[train]]).transpose()
                                  )


    clf.fit(X_train_combat, y[train])
    
    X_test_combat = comb.transform(X[:,np.sum(X,0)!=0][test],
                                   np.array([mean_fd_binned[test]]).transpose())

                                    
    print('cv:', i, str(clf.best_params_) + " " + str(clf.best_score_) + " " + str(clf.score(X_test_combat, y[test])))
                                    
    all_models.append(clf.best_estimator_)
    best_params.append(clf.best_params_)
    
    predicted[test] = clf.predict(X_test_combat)
    predicted_prob[test] = clf.predict_proba(X_test_combat)[:,0]
                                    
    nested_scores_train[i] = clf.best_score_
    nested_scores_test[i] = clf.score(X_test_combat, y[test])
    i = i+1

model	inner_cv mean score	outer vc score
cv: 0 {'model__C': 0.1} 0.6645641721861234 0.7067868504772005
cv: 1 {'model__C': 0.1} 0.6695913247742515 0.6813361611876988
cv: 2 {'model__C': 0.1} 0.6594625984869887 0.6930851063829787
cv: 3 {'model__C': 0.1} 0.6608973956534931 0.8074468085106383
cv: 4 {'model__C': 0.1} 0.6629321394565297 0.7287234042553191
cv: 5 {'model__C': 0.1} 0.7064102865932134 0.6425531914893616
cv: 6 {'model__C': 0.1} 0.6678523035230352 0.7581521739130436
cv: 7 {'model__C': 0.1} 0.6746544453861528 0.7472826086956521
cv: 8 {'model__C': 0.1} 0.6961607077460735 0.6646739130434782
cv: 9 {'model__C': 0.1} 0.6957503670918306 0.5815217391304348

In [40]:

print("** Mean score in the inner crossvaludation (inner_cv):\t" + str(nested_scores_train.mean()))
print("** Mean Nested Crossvalidation Score (outer_cv):\t" + str(nested_scores_test.mean()))

fpr, tpr, _ = roc_curve(y, predicted_prob, pos_label=0)
fig, ax = plt.subplots(figsize=(5,2))
RocCurveDisplay(fpr=fpr, tpr=tpr).plot(ax=ax) 
plt.savefig('../data_out/fig/abide_motion_comb_rocplot.pdf')  
    

** Mean score in the inner crossvaludation (inner_cv):	0.6758275740897691
** Mean Nested Crossvalidation Score (outer_cv):	0.7011561957085806

In [41]:

plt.figure(figsize=(5,2))
sns.stripplot(x=y, y=predicted_prob, hue=mean_fd_binned,
              palette="coolwarm", alpha=0.4, jitter=0.4, dodge=True)
plt.legend([],[], frameon=False)
sns.boxplot(x=y, y=predicted_prob, color=(1,1,1,1))
plt.savefig('../data_out/fig/abide_motion_comb_stripplot.pdf')

In [42]:

plot_graph(partial_confound_test(y, predicted_prob, mean_fd, cat_y=True, cat_yhat=False,
                                     random_state=42),
          outfile_base='../data_out/fig/abide_motion_comb_partial')

Permuting: 100%|██████████| 1000/1000 [00:01<00:00, 906.33it/s]

Out[42]:

In [43]:

plot_graph(full_confound_test(y, predicted_prob, mean_fd, cat_y=True, cat_yhat=False,
                                 random_state=42),
          outfile_base='../data_out/fig/abide_motion_comb_full')

Permuting: 100%|██████████| 1000/1000 [00:01<00:00, 503.86it/s]

Out[43]:

In [44]:

nulldist = []
unpermuted = _r2_cont_cont(predicted_prob, mean_fd)

print(unpermuted)

for i in range(1000):
    yperm = np.random.permutation(predicted_prob)
    nulldist.append(_r2_cont_cont(yperm, mean_fd))
(nulldist >= unpermuted).sum()/len(nulldist)

0.001963925089027347

Out[44]:

0.187

In [45]:

nulldist = []
unpermuted = _r2_cat_cont(y, predicted_prob)

print(unpermuted)

for i in range(1000):
    yperm = np.random.permutation(predicted_prob)
    nulldist.append(_r2_cat_cont(y, yperm))
(nulldist >= unpermuted).sum()/len(nulldist)

0.11138396090241687

Out[45]:

0.0

Combat on site¶

In [46]:

outer_cv = StratifiedKFold(10, shuffle=True, random_state=42)
inner_cv = StratifiedKFold(10, shuffle=True, random_state=42) 
model = Pipeline([
    ('varthr', VarianceThreshold(0)),   # omit zero variance columns (diagonal)
    #('fsel', SelectKBest(f_regression)),
    ('model', LogisticRegression())])

p_grid = {#'fsel__k': [500, 1000, 2000],
          'model__C': [0.1, 1, 10]}

clf = GridSearchCV(estimator=model, param_grid=p_grid, cv=StratifiedKFold(10),
                   scoring="roc_auc", return_train_score=False,
                   n_jobs=-1)

all_models = []
best_params = []
predicted = np.zeros(len(y))
predicted_prob = np.zeros(len(y))
nested_scores_train = np.zeros(outer_cv.get_n_splits(X))
nested_scores_test = np.zeros(outer_cv.get_n_splits(X))   
                                    
print("model\tinner_cv mean score\touter vc score")
i=0
for train, test in outer_cv.split(X, y):
    
    comb = CombatModel()
    X_train_combat = comb.fit_transform(X[:,np.sum(X,0)!=0][train],
                                   np.array([site_int[train]]).transpose()
                                  )


    clf.fit(X_train_combat, y[train])
    
    X_test_combat = comb.transform(X[:,np.sum(X,0)!=0][test],
                                   np.array([site_int[test]]).transpose())

                                    
    print('cv:', i, str(clf.best_params_) + " " + str(clf.best_score_) + " " + str(clf.score(X_test_combat, y[test])))
                                    
    all_models.append(clf.best_estimator_)
    best_params.append(clf.best_params_)
    
    predicted[test] = clf.predict(X_test_combat)
    predicted_prob[test] = clf.predict_proba(X_test_combat)[:,0]
                                    
    nested_scores_train[i] = clf.best_score_
    nested_scores_test[i] = clf.score(X_test_combat, y[test])
    i = i+1

model	inner_cv mean score	outer vc score
cv: 0 {'model__C': 0.1} 0.7501508477118233 0.7290562036055144
cv: 1 {'model__C': 0.1} 0.7358902020487387 0.7576882290562036
cv: 2 {'model__C': 0.1} 0.7447366355902941 0.7026595744680851
cv: 3 {'model__C': 0.1} 0.7346093044263775 0.7696808510638298
cv: 4 {'model__C': 0.1} 0.7258576084795597 0.7973404255319149
cv: 5 {'model__C': 0.1} 0.751687881261052 0.6675531914893617
cv: 6 {'model__C': 1} 0.7394399626106943 0.7913043478260869
cv: 7 {'model__C': 1} 0.7423432578920385 0.7369565217391304
cv: 8 {'model__C': 0.1} 0.7492881383125284 0.7266304347826087
cv: 9 {'model__C': 0.1} 0.7345418589321029 0.7728260869565218

In [47]:

print("** Mean score in the inner crossvaludation (inner_cv):\t" + str(nested_scores_train.mean()))
print("** Mean Nested Crossvalidation Score (outer_cv):\t" + str(nested_scores_test.mean()))

fpr, tpr, _ = roc_curve(y, predicted_prob, pos_label=0)
fig, ax = plt.subplots(figsize=(5,2))
RocCurveDisplay(fpr=fpr, tpr=tpr).plot(ax=ax) 
plt.savefig('../data_out/fig/abide_site_comb_rocplot.pdf')  

** Mean score in the inner crossvaludation (inner_cv):	0.740854569726521
** Mean Nested Crossvalidation Score (outer_cv):	0.7451695866519257

In [48]:

plt.figure(figsize=(5,1.5))

pal=[sns.color_palette("coolwarm", 10)[-1], sns.color_palette("coolwarm", 10)[0]]
sns.stripplot(x=site, y=predicted_prob, hue=y, palette=pal, alpha=0.2, jitter=0.2, dodge=True)
ax=sns.boxplot(x=site, y=predicted_prob, hue=y, showfliers = False)
for box in ax.artists:
    box.set_facecolor((1,1,1,0))
plt.xticks(rotation=90)
for i in range(len(np.unique(site))):
    plt.axvline(i+0.5, color="gray", alpha=0.5, linewidth=0.5)
plt.savefig('../data_out/fig/abide_site_comb_striplot.pdf')

In [49]:

plot_graph(partial_confound_test(y, predicted_prob, site_int,
                                     cat_y=True, cat_yhat=False, cat_c=True,
                                     random_state=42),
          outfile_base='../data_out/fig/abide_site_comb_partial')

Permuting: 100%|██████████| 1000/1000 [00:02<00:00, 490.00it/s]

Out[49]:

In [50]:

plot_graph(full_confound_test(y, predicted_prob, site_int,
                                 cat_y=True, cat_yhat=False, cat_c=True,
                                 random_state=42),
          outfile_base='../data_out/fig/abide_site_comb_full')

Permuting: 100%|██████████| 1000/1000 [00:01<00:00, 527.23it/s]

Out[50]:

In [51]:

nulldist = []
unpermuted = _r2_cat_cont(site_int, predicted_prob)

print(unpermuted)

for i in range(1000):
    yperm = np.random.permutation(predicted_prob)
    nulldist.append(_r2_cat_cont(site_int, yperm))
(nulldist >= unpermuted).sum()/len(nulldist)

0.04978966375151461

Out[51]:

0.001

In [52]:

nulldist = []
unpermuted = _r2_cat_cont(y, predicted_prob)

print(unpermuted)

for i in range(1000):
    yperm = np.random.permutation(predicted_prob)
    nulldist.append(_r2_cat_cont(y, yperm))
(nulldist >= unpermuted).sum()/len(nulldist)

0.17032841128735843

Out[52]:

0.0

In [53]:

sns.color_palette("coolwarm", as_cmap=True)

Out[53]:

coolwarm

under

bad

over

In [54]:

np.max(mean_fd)

Out[54]:

5.19933758270342

In [54]:

In [55]:

nulldist = []
unpermuted = _r2_cat_cont(y, mean_fd)

print(unpermuted)

for i in range(1000):
    yperm = np.random.permutation(y)
    nulldist.append(_r2_cat_cont(yperm, mean_fd))
(nulldist >= unpermuted).sum()/len(nulldist)

0.02798282700643162

Out[55]:

0.0

In [56]:

nulldist = []
unpermuted = _r2_cat_cat(site_int, y)

print(unpermuted)

for i in range(1000):
    yperm = np.random.permutation(y)
    nulldist.append(_r2_cat_cat(site_int, yperm))
(nulldist >= unpermuted).sum()/len(nulldist)

0.019007371520746053

/home/tspisak/src/mlconfound/venv/lib/python3.8/site-packages/statsmodels/base/model.py:547: HessianInversionWarning: Inverting hessian failed, no bse or cov_params available
  warnings.warn('Inverting hessian failed, no bse or cov_params '

Out[56]:

0.284

In [56]: