Notebook

Replicability of functional connectivity-based multivariate BWAS - figures¶

Plots from this notebook are saved into the directory fig.

Imports¶

In [10]:

import math
import numpy as np
from scipy import stats
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Notebook-wide plotting defaults: 3 x 3 figure size on seaborn's "white" style.
# Individual cells below override these with their own sns.set(...) calls.
sns.set(rc={"figure.figsize":(3, 3)})
sns.set_style("white")

Define Plotting function¶

This function takes a dataframe created in the notebook multivariate_BWAS_replicability_analysis.ipynb and recreates the "Replication plots" from the target paper (Fig.4) for a given target variable and feature set.

In [11]:

def _print_replication_probabilities(tmp, p_discovery_col, alpha):
    """Print, per sample size in `tmp.n`, the share of significant discoveries that replicate.

    A row is a discovery when `tmp[p_discovery_col] < alpha`; it replicates when
    `tmp['p_replication'] < alpha` as well. Sample sizes without any discovery
    are reported as 'no discovery'.
    """
    for n in tmp.n.unique():
        tmp2 = tmp[tmp.n == n]
        discovered = tmp2[p_discovery_col] < alpha
        n_discoveries = discovered.sum()

        if n_discoveries == 0:
            replication_prob = 'no discovery'
        else:
            # #(significant replications among significant discoveries) / #(significant discoveries)
            replication_prob = (tmp2.loc[discovered, 'p_replication'] < alpha).sum() / n_discoveries * 100

        # Printed for every n, including the 'no discovery' case.
        print("Replication probability at n =", n, ':', replication_prob, '%')


def _replication_scatter(tmp, y_col, title, savepath, ylim, xlim):
    """Draw one replication-vs-discovery scatter plot, save it if `savepath` is set, and show it."""
    sns.scatterplot(x='r_replication', y=y_col, hue='n', data=tmp, palette='Greens_r', linewidth=0, hue_norm=(0,tmp.n.max()*1.5))
    # Mean point at n == 200 (red) and at the largest sample size (purple).
    at_200 = tmp.loc[tmp.n==200]
    at_max = tmp.loc[tmp.n==tmp.n.max()]
    sns.scatterplot(x=[at_200['r_replication'].mean()], y=[at_200[y_col].mean()], color='red')
    sns.scatterplot(x=[at_max['r_replication'].mean()], y=[at_max[y_col].mean()], color='purple').set(title=title)
    plt.axhline(0, color='gray')
    plt.axvline(0, color='gray')
    # Dotted reference lines at r = 0.088 (hard-coded; presumably a benchmark
    # effect size from the target paper -- TODO confirm).
    plt.axhline(0.088, linestyle='dotted')
    plt.axvline(0.088, linestyle='dotted')
    plt.legend(bbox_to_anchor=(1.2, 1), loc=2, borderaxespad=0.)
    if ylim:
        plt.ylim(ylim)
    if xlim:
        plt.xlim(xlim)
    sns.despine()
    if savepath:
        plt.savefig(savepath)
    plt.show()


def plot(target, feature, df, alpha=0.05, cv_only=True, filetag=None, ylim=None, xlim=None):
    """Scatter-plot discovery vs. replication effect sizes and print replication probabilities.

    Parameters
    ----------
    target : str
        Value of `df.target` to select.
    feature : str
        Value of `df.connectivity` to select.
    df : pandas.DataFrame
        Results table; the columns read here are `target`, `connectivity`, `n`,
        `r_replication`, `p_replication`, `r_discovery_cv`, `p_discovery_cv` and,
        when `cv_only` is False, `r_discovery_overfit` and `p_discovery_overfit`.
    alpha : float
        Threshold applied to the discovery and replication p-values.
    cv_only : bool
        If False, the plot and printout for the non-cross-validated ("overfit")
        discovery estimates are produced before the cross-validated ones.
    filetag : str or None
        If given, figures are saved as
        'fig/[overfit_]scatter_<target>_<feature>_<filetag>.pdf'.
    ylim, xlim : tuple or None
        Optional axis limits applied to every plot.

    Returns
    -------
    dict
        Mean `r_replication` at n == 200 ('r_rep_200') and at the largest n ('r_rep_max').
    """
    sns.set(rc={"figure.figsize":(3, 3)})
    sns.set_style("white")
    tmp = df.loc[(df.target==target) & (df.connectivity==feature)]

    if not cv_only:
        overfit_path = 'fig/overfit_scatter_' + target + '_' + feature + '_' + filetag + '.pdf' if filetag else None
        _replication_scatter(tmp, 'r_discovery_overfit', 'Discovery without CV (overfit)', overfit_path, ylim, xlim)
        _print_replication_probabilities(tmp, 'p_discovery_overfit', alpha)

    cv_path = 'fig/scatter_' + target + '_' + feature + '_' + filetag + '.pdf' if filetag else None
    _replication_scatter(tmp, 'r_discovery_cv', 'Discovery with CV', cv_path, ylim, xlim)
    _print_replication_probabilities(tmp, 'p_discovery_cv', alpha)

    return {'r_rep_200': tmp.loc[tmp.n==200, 'r_replication'].mean(),
            'r_rep_max': tmp.loc[tmp.n==tmp.n.max(), 'r_replication'].mean()}

Possible targets:¶

'age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj'

Possible features:¶

'netmats_parcor', 'netmats_pearson'

Cognitive ability, PCA+SVR, pearson correlation¶

This basically reproduces Fig. 4 of the target paper with a highly similar methodology.

In [14]:

# PCA+SVR results; draw both the overfit and the cross-validated scatter plot.
df = pd.read_csv('res/results_PCA_SVR.csv')
plot(
    target='CogTotalComp_AgeAdj',
    feature='netmats_pearson',
    df=df,
    cv_only=False,
    filetag='PCA-SVR_ylim',
    ylim=(-0.6,0.8),
)

Replication probability at n = 50 : 11.0 %
Replication probability at n = 100 : 17.0 %
Replication probability at n = 200 : 43.0 %
Replication probability at n = 300 : 72.0 %
Replication probability at n = 495 : 100.0 %

Replication probability at n = 50 : no discovery %
Replication probability at n = 100 : 0.0 %
Replication probability at n = 200 : 25.0 %
Replication probability at n = 300 : 62.16216216216216 %
Replication probability at n = 495 : 100.0 %

Out[14]:

{'r_rep_200': 0.10612941902239685, 'r_rep_max': 0.17595802104381736}

In [15]:

# Same data as the previous cell, cross-validated plot only, with fixed x-limits.
plot(
    target='CogTotalComp_AgeAdj',
    feature='netmats_pearson',
    df=df,
    cv_only=True,
    filetag='PCA-SVR',
    ylim=(-0.6,0.8),
    xlim=(-0.5, 0.6),
)

Replication probability at n = 50 : no discovery %
Replication probability at n = 100 : 0.0 %
Replication probability at n = 200 : 25.0 %
Replication probability at n = 300 : 62.16216216216216 %
Replication probability at n = 495 : 100.0 %

Out[15]:

{'r_rep_200': 0.10612941902239685, 'r_rep_max': 0.17595802104381736}

Cognitive ability, PCA+SVR, partial correlation¶

In [16]:

# PCA+SVR results, partial-correlation features (cross-validated plot only).
df = pd.read_csv('res/results_PCA_SVR.csv')
plot(
    target='CogTotalComp_AgeAdj',
    feature='netmats_parcor',
    df=df,
    filetag='PCA-SVR',
    ylim=(-0.6,0.8),
    xlim=(-0.5, 0.6),
)

Replication probability at n = 50 : no discovery %
Replication probability at n = 100 : 90.9090909090909 %
Replication probability at n = 200 : 100.0 %
Replication probability at n = 300 : 100.0 %
Replication probability at n = 495 : 100.0 %

Out[16]:

{'r_rep_200': 0.2713589974517626, 'r_rep_max': 0.34611199458096925}

Cognitive ability, Ridge, pearson correlation¶

In [19]:

# Ridge results, Pearson-correlation features (cross-validated plot only).
df = pd.read_csv('res/results_Ridge.csv')
plot(
    target='CogTotalComp_AgeAdj',
    feature='netmats_pearson',
    df=df,
    filetag='Ridge',
    ylim=(-0.6,0.8),
    xlim=(-0.5, 0.6),
)

Replication probability at n = 50 : 40.625 %
Replication probability at n = 100 : 78.66666666666666 %
Replication probability at n = 200 : 100.0 %
Replication probability at n = 300 : 100.0 %
Replication probability at n = 495 : 100.0 %

Out[19]:

{'r_rep_200': 0.31597434384989187, 'r_rep_max': 0.38191979503949747}

Cognitive ability, Ridge, partial correlation¶

In [20]:

# Ridge results, partial-correlation features (cross-validated plot only).
df = pd.read_csv('res/results_Ridge.csv')
plot(
    target='CogTotalComp_AgeAdj',
    feature='netmats_parcor',
    df=df,
    filetag='Ridge',
    ylim=(-0.6,0.8),
    xlim=(-0.5, 0.6),
)

Replication probability at n = 50 : 64.86486486486487 %
Replication probability at n = 100 : 97.5 %
Replication probability at n = 200 : 100.0 %
Replication probability at n = 300 : 100.0 %
Replication probability at n = 495 : 100.0 %

Out[20]:

{'r_rep_200': 0.4062055329971098, 'r_rep_max': 0.47903946590450225}

In [24]:

# Ridge results, partial-correlation features, overfit and cross-validated plots.
# A separate file tag is used because plot() also saves the cross-validated
# figure: with filetag='Ridge' it would overwrite the PDF written by the
# previous cell with a version using different axis limits.
df = pd.read_csv('res/results_Ridge.csv')
plot(target='CogTotalComp_AgeAdj', feature='netmats_parcor', df=df, cv_only=False, filetag='Ridge_ylim', ylim=(0,1.1), xlim=(-0.3, 0.7))

Replication probability at n = 50 : 64.0 %
Replication probability at n = 100 : 97.0 %
Replication probability at n = 200 : 100.0 %
Replication probability at n = 300 : 100.0 %
Replication probability at n = 495 : 100.0 %

Replication probability at n = 50 : 64.86486486486487 %
Replication probability at n = 100 : 97.5 %
Replication probability at n = 200 : 100.0 %
Replication probability at n = 300 : 100.0 %
Replication probability at n = 495 : 100.0 %

Out[24]:

{'r_rep_200': 0.4062055329971098, 'r_rep_max': 0.47903946590450225}

In [ ]:

# Null-model Ridge results, overfit and cross-validated plots.
# The file tag must differ from the real-data cells: with filetag='Ridge' the
# null figures would overwrite 'fig/overfit_scatter_..._Ridge.pdf' and
# 'fig/scatter_..._Ridge.pdf'. It also differs from 'Ridge_null', which the
# next cell uses with other axis limits.
df = pd.read_csv('res/results_null_Ridge.csv')
plot(target='CogTotalComp_AgeAdj', feature='netmats_parcor', df=df, cv_only=False, filetag='Ridge_null_ylim', ylim=(0,1.1), xlim=(-0.4, 0.4))

In [ ]:

# Null-model Ridge results, cross-validated plot only.
df = pd.read_csv('res/results_null_Ridge.csv')
plot(
    target='CogTotalComp_AgeAdj',
    feature='netmats_parcor',
    df=df,
    cv_only=True,
    filetag='Ridge_null',
    ylim=(-0.45,0.45),
    xlim=(-0.4, 0.4),
)

Inflation histograms¶

In [ ]:

sns.set(rc={"figure.figsize":(3, 1.5)})
sns.set_style("white")

# One filled histogram plus a black outline per model, drawn on the same axes:
# replication minus overfit discovery effect size (PCA+SVR in orange, Ridge in red).
for results_csv, fill_color in (('res/results_PCA_SVR.csv', 'orange'),
                                ('res/results_Ridge.csv', 'red')):
    df = pd.read_csv(results_csv)
    tmp = df.loc[(df.target=='CogTotalComp_AgeAdj') & (df.connectivity=='netmats_pearson')]
    r_diff = tmp.r_replication - tmp.r_discovery_overfit
    sns.histplot(r_diff, color=fill_color, alpha=0.5)
    sns.histplot(r_diff, color='black',fill=False)
sns.despine()

plt.savefig('fig/hist_inflation_overfit.pdf')
plt.show()

In [ ]:

sns.set(rc={"figure.figsize":(3, 1.5)})
sns.set_style("white")

# One filled histogram plus a black outline per model, drawn on the same axes:
# replication minus cross-validated discovery effect size (PCA+SVR in orange, Ridge in red).
for results_csv, fill_color in (('res/results_PCA_SVR.csv', 'orange'),
                                ('res/results_Ridge.csv', 'red')):
    df = pd.read_csv(results_csv)
    tmp = df.loc[(df.target=='CogTotalComp_AgeAdj') & (df.connectivity=='netmats_pearson')]
    r_diff = tmp.r_replication - tmp.r_discovery_cv
    sns.histplot(r_diff, color=fill_color, alpha=0.5)
    sns.histplot(r_diff, color='black',fill=False)
sns.despine()

plt.savefig('fig/hist_inflation_cv.pdf')
plt.show()

In [ ]:

sns.set(rc={"figure.figsize":(3, 1.5)})
sns.set_style("white")
df = pd.read_csv('res/results_PCA_SVR.csv')
tmp = df.loc[(df.target=='CogTotalComp_AgeAdj') & (df.connectivity=='netmats_pearson')]

# Discovery minus replication effect size: cross-validated estimate in blue,
# overfit estimate in red, each with a black outline on top.
for discovery_col, fill_color in (('r_discovery_cv', 'blue'),
                                  ('r_discovery_overfit', 'red')):
    inflation = tmp[discovery_col] - tmp.r_replication
    sns.histplot(inflation, color=fill_color, alpha=0.5)
    sns.histplot(inflation, color='black',fill=False)
sns.despine()

plt.axvline(0, color='gray')

plt.savefig('fig/hist_inflation_cv_vs_overfit.pdf')
plt.show()

False positives with biased and unbiased estimates¶

Evaluated via a null model

In [ ]:

# Figure size and style for the false-positive-rate curves drawn below.
sns.set(rc={"figure.figsize":(3, 1.5)})
sns.set_style("white")

# Results file for the PCA+SVR pipeline under the null model (per the section text).
df = pd.read_csv('res/results_null_PCA_SVR.csv')

def fpr(x, alpha=0.05):
    """Return the fraction of entries of `x` that are smaller than `alpha`.

    `x` must support element-wise comparison and `len()` (e.g. a pandas Series
    of p-values). The function name doubles as the column label when it is
    passed to `DataFrame.agg([...])`.
    """
    n_below = (x < alpha).sum()
    n_total = len(x)
    return n_below / n_total

# One curve per discovery p-value column: the share of p-values below the
# default alpha of fpr(), computed separately for every sample size n.
for p_col in ('p_discovery_cv', 'p_discovery_overfit'):
    df_fpr = df.groupby('n')[p_col].agg([fpr])
    sns.lineplot(x='n', y='fpr', data=df_fpr)
sns.despine()

In [ ]:

sns.set(rc={"figure.figsize":(2, 1)})
sns.set_style("white")

# PCA+SVR results, restricted to the cognitive-ability target (all feature sets).
df = pd.read_csv('res/results_PCA_SVR.csv')
tmp = df.loc[df.target == 'CogTotalComp_AgeAdj']

# NOTE(review): not referenced anywhere else in the visible part of the notebook.
num_perm = 1000


# Helper functions to compute confidence intervals for a correlation coefficient
def r_to_z(r):
    """Fisher z-transform of a correlation coefficient: 0.5 * ln((1 + r) / (1 - r)) = atanh(r).

    Raises ValueError for |r| >= 1, where the transform is undefined.
    """
    return math.atanh(r)

def z_to_r(z):
    """Inverse Fisher transform: (e^(2z) - 1) / (e^(2z) + 1) = tanh(z).

    `math.tanh` saturates at +/-1 for large |z|, where evaluating `exp(2 * z)`
    directly would raise OverflowError.
    """
    return math.tanh(z)

def r_confidence_interval(r, alpha, n):
    """Two-sided confidence interval for a correlation coefficient via the Fisher z-transform.

    Parameters
    ----------
    r : float
        Correlation coefficient the interval is centred on (in z-space).
    alpha : float
        Confidence LEVEL (e.g. 0.95 for a 95% interval), not a significance
        level: the critical value is the normal quantile at (1 + alpha) / 2.
    n : int
        Sample size; the standard error in z-space is 1 / sqrt(n - 3).

    Returns
    -------
    tuple
        (lower, upper) bounds on the correlation scale.

    Raises
    ------
    ValueError
        If n <= 3, for which the standard error is undefined.
    """
    if n <= 3:
        raise ValueError("n must be greater than 3 to compute the confidence interval")
    z = r_to_z(r)
    se = 1.0 / math.sqrt(n - 3)
    z_crit = stats.norm.ppf((1 + alpha)/2)  # 2-tailed z critical value

    lo = z - z_crit * se
    hi = z + z_crit * se
    # Return a sequence
    return (z_to_r(lo), z_to_r(hi))

# Interval around r = 0 at confidence level 0.95 for every sample size; the
# interval is computed once per n and then split into lower / upper bounds.
sample_sizes = df.n.unique()
r_ci = [r_confidence_interval(0, 0.95, n=n) for n in sample_sizes]
r_ci_lo = [ci[0] for ci in r_ci]
r_ci_hi = [ci[1] for ci in r_ci]

# `tmp` is a filtered view of `df`; assign() returns a new frame instead of
# writing columns into that slice (which triggers SettingWithCopyWarning).
tmp = tmp.assign(
    inflation_cv=tmp.r_discovery_cv - tmp.r_replication,
    inflation_overfit=tmp.r_discovery_overfit - tmp.r_replication,
)

# Discovery minus replication effect size per n (band = standard deviation):
# cross-validated estimate in blue, overfit estimate in red.
sns.lineplot(x='n', y='inflation_cv', data=tmp, ci='sd', color="blue")
sns.lineplot(x='n', y='inflation_overfit', data=tmp, ci='sd', color="red")
sns.lineplot(x=sample_sizes, y=r_ci_lo, linestyle='dotted', color='gray')
sns.lineplot(x=sample_sizes, y=r_ci_hi, linestyle='dotted', color='gray')
sns.despine()
plt.savefig('fig/curves_biased_vs_unbiased.pdf')

In [ ]:

sns.set(rc={"figure.figsize":(2, 1)})
sns.set_style("white")

# Null-model PCA+SVR results, restricted to the cognitive-ability target (all feature sets).
df = pd.read_csv('res/results_null_PCA_SVR.csv')
tmp = df.loc[df.target == 'CogTotalComp_AgeAdj']

# NOTE(review): not referenced anywhere else in the visible part of the notebook.
num_perm = 1000


# Helper functions to compute confidence intervals for a correlation coefficient
# NOTE(review): these redefine the identically named helpers of the previous
# cell; keep both copies in sync or define them once.
def r_to_z(r):
    """Fisher z-transform of a correlation coefficient: 0.5 * ln((1 + r) / (1 - r)) = atanh(r).

    Raises ValueError for |r| >= 1, where the transform is undefined.
    """
    return math.atanh(r)

def z_to_r(z):
    """Inverse Fisher transform: (e^(2z) - 1) / (e^(2z) + 1) = tanh(z).

    `math.tanh` saturates at +/-1 for large |z|, where evaluating `exp(2 * z)`
    directly would raise OverflowError.
    """
    return math.tanh(z)

def r_confidence_interval(r, alpha, n):
    """Two-sided confidence interval for a correlation coefficient via the Fisher z-transform.

    Parameters
    ----------
    r : float
        Correlation coefficient the interval is centred on (in z-space).
    alpha : float
        Confidence LEVEL (e.g. 0.95 for a 95% interval), not a significance
        level: the critical value is the normal quantile at (1 + alpha) / 2.
    n : int
        Sample size; the standard error in z-space is 1 / sqrt(n - 3).

    Returns
    -------
    tuple
        (lower, upper) bounds on the correlation scale.

    Raises
    ------
    ValueError
        If n <= 3, for which the standard error is undefined.
    """
    if n <= 3:
        raise ValueError("n must be greater than 3 to compute the confidence interval")
    z = r_to_z(r)
    se = 1.0 / math.sqrt(n - 3)
    z_crit = stats.norm.ppf((1 + alpha)/2)  # 2-tailed z critical value

    lo = z - z_crit * se
    hi = z + z_crit * se
    # Return a sequence
    return (z_to_r(lo), z_to_r(hi))

# Interval around r = 0 at confidence level 0.95 for every sample size.
sample_sizes = df.n.unique()
r_ci = [r_confidence_interval(0, 0.95, n=n) for n in sample_sizes]
r_ci_lo = [ci[0] for ci in r_ci]
r_ci_hi = [ci[1] for ci in r_ci]

# Discovery effect size per n (band = standard deviation): cross-validated
# estimate in blue, overfit estimate in red, interval bounds dotted in gray.
for estimate_col, line_color in (('r_discovery_cv', "blue"), ('r_discovery_overfit', "red")):
    sns.lineplot(x='n', y=estimate_col, data=tmp, ci='sd', color=line_color)
for bound in (r_ci_lo, r_ci_hi):
    sns.lineplot(x=sample_sizes, y=bound, linestyle='dotted', color='gray')
sns.despine()
plt.savefig('fig/null_biased_vs_unbiased.pdf')

In [ ]:

# Null-model PCA+SVR results for the cognitive-ability target.
df = pd.read_csv('res/results_null_PCA_SVR.csv')
tmp = df[df.target == 'CogTotalComp_AgeAdj']

binwidth = 0.05

# Filled histogram plus black outline per estimate: cross-validated in blue, overfit in red.
for estimate_col, fill_color in (('r_discovery_cv', 'blue'), ('r_discovery_overfit', 'red')):
    sns.histplot(tmp[estimate_col], color=fill_color, alpha=0.5, binwidth=binwidth)
    sns.histplot(tmp[estimate_col], color='black',fill=False, binwidth=binwidth)

plt.axvline(0, color='gray')

Replication Probabilities¶

PCA-SVR, pearson

In [ ]:

import numpy as np
sns.set(rc={"figure.figsize":(4, 4)})
sns.set_style("white")
# Replication probability per target for this feature set and model:
feature = 'netmats_pearson'
model = "PCA_SVR"
df = pd.read_csv('res/results_' + model + '.csv')
df_null = pd.read_csv('res/results_null_' + model + '.csv')

alpha=0.05
fig, axes = plt.subplots(6, sharex=True, sharey=True)
cols = ['tab:blue', 'tab:green', 'tab:orange', 'tab:purple', 'tab:red', 'tab:cyan']
for i_target, target in enumerate(['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']):
    tmp = df.loc[(df.target==target) & (df.connectivity==feature)]
    tmp_null = df_null.loc[(df_null.target==target) & (df_null.connectivity==feature)]

    sample_sizes = tmp.n.unique()
    # NaN marks sample sizes at which no discovery was significant.
    replication_prob = np.full(len(sample_sizes), np.nan)
    for i, n in enumerate(sample_sizes):
        tmp2 = tmp[tmp.n == n]
        tmp2_null = tmp_null[tmp_null.n == n]
        # A discovery is significant when its cross-validated r reaches the
        # (1 - alpha) quantile of the null-model r values at the same n.
        r_discovery_threshold = np.quantile(tmp2_null.r_discovery_cv.dropna(), 1-alpha)
        discovered = tmp2['r_discovery_cv']>=r_discovery_threshold

        if discovered.any():
            # #(significant replications among significant discoveries) / #(significant discoveries)
            replication_prob[i] = (tmp2.loc[discovered,'p_replication']<alpha).sum() / discovered.sum() * 100
    sns.lineplot(x=sample_sizes, y=replication_prob, color=cols[i_target], ax=axes[i_target]).set(title=target)
    axes[i_target].fill_between(sample_sizes, replication_prob, color=cols[i_target])

# Finish and save the figure once, after all panels are drawn.
sns.despine()
fig.suptitle('PCA-SVR, pearson')
plt.savefig('fig/replication_' + feature + '_' + model + '.pdf')

Ridge, pearson

In [ ]:

import numpy as np
sns.set(rc={"figure.figsize":(4, 4)})
sns.set_style("white")
# Replication probability per target for this feature set and model:
feature = 'netmats_pearson'
model = "Ridge"
df = pd.read_csv('res/results_' + model + '.csv')
df_null = pd.read_csv('res/results_null_' + model + '.csv')

alpha=0.05
fig, axes = plt.subplots(6, sharex=True, sharey=True)
cols = ['tab:blue', 'tab:green', 'tab:orange', 'tab:purple', 'tab:red', 'tab:cyan']
for i_target, target in enumerate(['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']):
    tmp = df.loc[(df.target==target) & (df.connectivity==feature)]
    tmp_null = df_null.loc[(df_null.target==target) & (df_null.connectivity==feature)]

    sample_sizes = tmp.n.unique()
    # NaN marks sample sizes at which no discovery was significant.
    replication_prob = np.full(len(sample_sizes), np.nan)
    for i, n in enumerate(sample_sizes):
        tmp2 = tmp[tmp.n == n]
        tmp2_null = tmp_null[tmp_null.n == n]
        # A discovery is significant when its cross-validated r reaches the
        # (1 - alpha) quantile of the null-model r values at the same n.
        r_discovery_threshold = np.quantile(tmp2_null.r_discovery_cv.dropna(), 1-alpha)
        discovered = tmp2['r_discovery_cv']>=r_discovery_threshold

        if discovered.any():
            # #(significant replications among significant discoveries) / #(significant discoveries)
            replication_prob[i] = (tmp2.loc[discovered,'p_replication']<alpha).sum() / discovered.sum() * 100
    sns.lineplot(x=sample_sizes, y=replication_prob, color=cols[i_target], ax=axes[i_target]).set(title=target)
    axes[i_target].fill_between(sample_sizes, replication_prob, color=cols[i_target])

# Finish and save the figure once, after all panels are drawn.
sns.despine()
fig.suptitle('Ridge, pearson')
plt.savefig('fig/replication_' + feature + '_' + model + '.pdf')

PCA-SVR parcor

In [ ]:

import numpy as np
sns.set(rc={"figure.figsize":(4, 4)})
sns.set_style("white")
# Replication probability per target for this feature set and model:
feature = 'netmats_parcor'
model = "PCA_SVR"
df = pd.read_csv('res/results_' + model + '.csv')
df_null = pd.read_csv('res/results_null_' + model + '.csv')

alpha=0.05
fig, axes = plt.subplots(6, sharex=True, sharey=True)
cols = ['tab:blue', 'tab:green', 'tab:orange', 'tab:purple', 'tab:red', 'tab:cyan']
for i_target, target in enumerate(['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']):
    tmp = df.loc[(df.target==target) & (df.connectivity==feature)]
    tmp_null = df_null.loc[(df_null.target==target) & (df_null.connectivity==feature)]

    sample_sizes = tmp.n.unique()
    # NaN marks sample sizes at which no discovery was significant.
    replication_prob = np.full(len(sample_sizes), np.nan)
    for i, n in enumerate(sample_sizes):
        tmp2 = tmp[tmp.n == n]
        tmp2_null = tmp_null[tmp_null.n == n]
        # A discovery is significant when its cross-validated r reaches the
        # (1 - alpha) quantile of the null-model r values at the same n.
        r_discovery_threshold = np.quantile(tmp2_null.r_discovery_cv.dropna(), 1-alpha)
        discovered = tmp2['r_discovery_cv']>=r_discovery_threshold

        if discovered.any():
            # #(significant replications among significant discoveries) / #(significant discoveries)
            replication_prob[i] = (tmp2.loc[discovered,'p_replication']<alpha).sum() / discovered.sum() * 100
    sns.lineplot(x=sample_sizes, y=replication_prob, color=cols[i_target], ax=axes[i_target]).set(title=target)
    axes[i_target].fill_between(sample_sizes, replication_prob, color=cols[i_target])

# Finish and save the figure once, after all panels are drawn.
sns.despine()
fig.suptitle('PCA-SVR, parcor')
plt.savefig('fig/replication_' + feature + '_' + model + '.pdf')

Ridge, parcor

In [ ]:

import numpy as np
sns.set(rc={"figure.figsize":(4, 4)})
sns.set_style("white")
# Replication probability per target for this feature set and model:
feature = 'netmats_parcor'
model = "Ridge"
df = pd.read_csv('res/results_' + model + '.csv')
df_null = pd.read_csv('res/results_null_' + model + '.csv')

alpha=0.05
fig, axes = plt.subplots(6, sharex=True, sharey=True)
cols = ['tab:blue', 'tab:green', 'tab:orange', 'tab:purple', 'tab:red', 'tab:cyan']
for i_target, target in enumerate(['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']):
    tmp = df.loc[(df.target==target) & (df.connectivity==feature)]
    tmp_null = df_null.loc[(df_null.target==target) & (df_null.connectivity==feature)]

    sample_sizes = tmp.n.unique()
    # NaN marks sample sizes at which no discovery was significant.
    replication_prob = np.full(len(sample_sizes), np.nan)
    for i, n in enumerate(sample_sizes):
        tmp2 = tmp[tmp.n == n]
        tmp2_null = tmp_null[tmp_null.n == n]
        # A discovery is significant when its cross-validated r reaches the
        # (1 - alpha) quantile of the null-model r values at the same n.
        r_discovery_threshold = np.quantile(tmp2_null.r_discovery_cv.dropna(), 1-alpha)
        discovered = tmp2['r_discovery_cv']>=r_discovery_threshold

        if discovered.any():
            # #(significant replications among significant discoveries) / #(significant discoveries)
            replication_prob[i] = (tmp2.loc[discovered,'p_replication']<alpha).sum() / discovered.sum() * 100
    sns.lineplot(x=sample_sizes, y=replication_prob, color=cols[i_target], ax=axes[i_target]).set(title=target)
    axes[i_target].fill_between(sample_sizes, replication_prob, color=cols[i_target])

# Finish and save the figure once, after all panels are drawn.
sns.despine()
fig.suptitle('Ridge, parcor')
plt.savefig('fig/replication_' + feature + '_' + model + '.pdf')

Multivariate Statistical Power¶

In [ ]:

import numpy as np
sns.set(rc={"figure.figsize":(4, 4)})
sns.set_style("whitegrid")
# Statistical power per target for this feature set and model:
feature = 'netmats_pearson'
model = "PCA_SVR"
df = pd.read_csv('res/results_' + model + '.csv')
df_null = pd.read_csv('res/results_null_' + model + '.csv')

alpha=0.05
fig, axes = plt.subplots(6, sharex=True, sharey=True)
cols = ['tab:blue', 'tab:green', 'tab:orange', 'tab:purple', 'tab:red', 'tab:cyan']
for i_target, target in enumerate(['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']):
    tmp = df.loc[(df.target==target) & (df.connectivity==feature)]
    tmp_null = df_null.loc[(df_null.target==target) & (df_null.connectivity==feature)]

    sample_sizes = tmp.n.unique()
    power = np.zeros(len(sample_sizes))
    for i, n in enumerate(sample_sizes):
        tmp2 = tmp[tmp.n == n]
        tmp2_null = tmp_null[tmp_null.n == n]
        # Significance threshold: (1 - alpha) quantile of the null-model
        # cross-validated r values at the same n.
        r_discovery_threshold = np.quantile(tmp2_null.r_discovery_cv.dropna(), 1-alpha)

        # Power = percentage of rows whose discovery r reaches the threshold.
        power[i] = (tmp2['r_discovery_cv']>=r_discovery_threshold).sum() / len(tmp2['r_discovery_cv']) * 100

    sns.lineplot(x=sample_sizes, y=power, color=cols[i_target], ax=axes[i_target]).set(title=target)
    axes[i_target].fill_between(sample_sizes, power, color=cols[i_target])

# Finish and save the figure once, after all panels are drawn.
sns.despine()
fig.suptitle('PCA-SVR, pearson')
plt.savefig('fig/power_' + feature + '_' + model + '.pdf')

In [ ]:

import numpy as np
sns.set(rc={"figure.figsize":(4, 4)})
sns.set_style("whitegrid")
# Statistical power per target for this feature set and model:
feature = 'netmats_pearson'
model = "Ridge"
df = pd.read_csv('res/results_' + model + '.csv')
df_null = pd.read_csv('res/results_null_' + model + '.csv')

alpha=0.05
fig, axes = plt.subplots(6, sharex=True, sharey=True)
cols = ['tab:blue', 'tab:green', 'tab:orange', 'tab:purple', 'tab:red', 'tab:cyan']
for i_target, target in enumerate(['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']):
    tmp = df.loc[(df.target==target) & (df.connectivity==feature)]
    tmp_null = df_null.loc[(df_null.target==target) & (df_null.connectivity==feature)]

    sample_sizes = tmp.n.unique()
    power = np.zeros(len(sample_sizes))
    for i, n in enumerate(sample_sizes):
        tmp2 = tmp[tmp.n == n]
        tmp2_null = tmp_null[tmp_null.n == n]
        # Significance threshold: (1 - alpha) quantile of the null-model
        # cross-validated r values at the same n.
        r_discovery_threshold = np.quantile(tmp2_null.r_discovery_cv.dropna(), 1-alpha)

        # Power = percentage of rows whose discovery r reaches the threshold.
        power[i] = (tmp2['r_discovery_cv']>=r_discovery_threshold).sum() / len(tmp2['r_discovery_cv']) * 100

    sns.lineplot(x=sample_sizes, y=power, color=cols[i_target], ax=axes[i_target]).set(title=target)
    axes[i_target].fill_between(sample_sizes, power, color=cols[i_target])

# Finish and save the figure once, after all panels are drawn.
sns.despine()
fig.suptitle('Ridge, pearson')
plt.savefig('fig/power_' + feature + '_' + model + '.pdf')

In [ ]:

import numpy as np
sns.set(rc={"figure.figsize":(4, 4)})
sns.set_style("whitegrid")
# Statistical power per target for this feature set and model:
feature = 'netmats_parcor'
model = "PCA_SVR"
df = pd.read_csv('res/results_' + model + '.csv')
df_null = pd.read_csv('res/results_null_' + model + '.csv')

alpha=0.05
fig, axes = plt.subplots(6, sharex=True, sharey=True)
cols = ['tab:blue', 'tab:green', 'tab:orange', 'tab:purple', 'tab:red', 'tab:cyan']
for i_target, target in enumerate(['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']):
    tmp = df.loc[(df.target==target) & (df.connectivity==feature)]
    tmp_null = df_null.loc[(df_null.target==target) & (df_null.connectivity==feature)]

    sample_sizes = tmp.n.unique()
    power = np.zeros(len(sample_sizes))
    for i, n in enumerate(sample_sizes):
        tmp2 = tmp[tmp.n == n]
        tmp2_null = tmp_null[tmp_null.n == n]
        # Significance threshold: (1 - alpha) quantile of the null-model
        # cross-validated r values at the same n.
        r_discovery_threshold = np.quantile(tmp2_null.r_discovery_cv.dropna(), 1-alpha)

        # Power = percentage of rows whose discovery r reaches the threshold.
        power[i] = (tmp2['r_discovery_cv']>=r_discovery_threshold).sum() / len(tmp2['r_discovery_cv']) * 100

    sns.lineplot(x=sample_sizes, y=power, color=cols[i_target], ax=axes[i_target]).set(title=target)
    axes[i_target].fill_between(sample_sizes, power, color=cols[i_target])

# Finish and save the figure once, after all panels are drawn.
sns.despine()
fig.suptitle('PCA-SVR, parcor')
plt.savefig('fig/power_' + feature + '_' + model + '.pdf')

In [ ]:

import numpy as np
sns.set(rc={"figure.figsize":(4, 4)})
sns.set_style("whitegrid")
# Statistical power per target for this feature set and model:
feature = 'netmats_parcor'
model = "Ridge"
df = pd.read_csv('res/results_' + model + '.csv')
df_null = pd.read_csv('res/results_null_' + model + '.csv')

alpha=0.05
fig, axes = plt.subplots(6, sharex=True, sharey=True)
cols = ['tab:blue', 'tab:green', 'tab:orange', 'tab:purple', 'tab:red', 'tab:cyan']
for i_target, target in enumerate(['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']):
    tmp = df.loc[(df.target==target) & (df.connectivity==feature)]
    tmp_null = df_null.loc[(df_null.target==target) & (df_null.connectivity==feature)]

    sample_sizes = tmp.n.unique()
    power = np.zeros(len(sample_sizes))
    for i, n in enumerate(sample_sizes):
        tmp2 = tmp[tmp.n == n]
        tmp2_null = tmp_null[tmp_null.n == n]
        # Significance threshold: (1 - alpha) quantile of the null-model
        # cross-validated r values at the same n.
        r_discovery_threshold = np.quantile(tmp2_null.r_discovery_cv.dropna(), 1-alpha)

        # Power = percentage of rows whose discovery r reaches the threshold.
        power[i] = (tmp2['r_discovery_cv']>=r_discovery_threshold).sum() / len(tmp2['r_discovery_cv']) * 100

    sns.lineplot(x=sample_sizes, y=power, color=cols[i_target], ax=axes[i_target]).set(title=target)
    axes[i_target].fill_between(sample_sizes, power, color=cols[i_target])

# Finish and save the figure once, after all panels are drawn.
sns.despine()
fig.suptitle('Ridge, parcor')
plt.savefig('fig/power_' + feature + '_' + model + '.pdf')

Learning Curves¶

In [ ]:

def plot_learning_curve(target, feature, data=None, filetag=None, df=None):
    """Plot discovery (CV and overfit) and replication effect sizes against sample size.

    Parameters
    ----------
    target : str
        Value of the `target` column to select.
    feature : str
        Value of the `connectivity` column to select.
    data : pandas.DataFrame
        Results table with the columns `target`, `connectivity`, `n`,
        `r_discovery_cv`, `r_discovery_overfit` and `r_replication`.
    filetag : str or None
        If given, the figure is saved as
        'fig/learning_curve_<target>_<feature>_<filetag>.pdf'.
    df : pandas.DataFrame, optional
        Alias for `data`, accepted because the calling cells in this notebook
        pass the table as `df=`. Ignored when `data` is given.

    Raises
    ------
    TypeError
        If neither `data` nor `df` is supplied.
    """
    if data is None:
        data = df
    if data is None:
        raise TypeError("plot_learning_curve() requires a results DataFrame (pass `data`)")

    sns.set(rc={"figure.figsize":(3, 2)})
    sns.set_style("white")
    # Filter the table that was passed in, not whatever global `df` exists in the kernel.
    tmp = data.loc[(data.target==target) & (data.connectivity==feature)]

    # One curve per effect-size estimate; the band is the standard deviation per n.
    sns.lineplot(x='n', y='r_discovery_cv', data=tmp, ci="sd")
    sns.lineplot(x='n', y='r_discovery_overfit', data=tmp, ci="sd")
    sns.lineplot(x='n', y='r_replication', data=tmp, ci="sd")
    plt.ylim((-0.2, 1.01))
    sns.despine()
    if filetag:
        plt.savefig('fig/learning_curve_' + target + '_' + feature + '_' + filetag + '.pdf')

In [ ]:

# Learning curve for PCA+SVR on Pearson-correlation features.
# plot_learning_curve names its DataFrame parameter `data`; passing `df=` raises a TypeError.
df = pd.read_csv('res/results_PCA_SVR.csv')
plot_learning_curve(target='CogTotalComp_AgeAdj', feature='netmats_pearson', data=df, filetag='pca-svr')

In [ ]:

# Learning curve for Ridge on Pearson-correlation features.
# plot_learning_curve names its DataFrame parameter `data`; passing `df=` raises a TypeError.
df = pd.read_csv('res/results_Ridge.csv')
plot_learning_curve(target='CogTotalComp_AgeAdj', feature='netmats_pearson', data=df, filetag='ridge')

In [ ]:

# Learning curve for PCA+SVR on partial-correlation features.
# plot_learning_curve names its DataFrame parameter `data`; passing `df=` raises a TypeError.
df = pd.read_csv('res/results_PCA_SVR.csv')
plot_learning_curve(target='CogTotalComp_AgeAdj', feature='netmats_parcor', data=df, filetag='pca-svr')

In [ ]:

# Learning curve for Ridge on partial-correlation features.
# plot_learning_curve names its DataFrame parameter `data`; passing `df=` raises a TypeError.
df = pd.read_csv('res/results_Ridge.csv')
plot_learning_curve(target='CogTotalComp_AgeAdj', feature='netmats_parcor', data=df, filetag='ridge')

Composite figures to summarize improvement with Ridge and partial correlation¶

Power: Pearson + PCA-SVR¶

In [ ]:

# Empty accumulators filled by the two power cells below and read by the bar plot.
sample_size_needed, variable, method = [], [], []

In [ ]:

import numpy as np
sns.set(rc={"figure.figsize":(1, 1)})
sns.set_style("white")
# Power curves of all targets on one axes, for this feature set and model:
feature = 'netmats_pearson'
model = "PCA_SVR"
df = pd.read_csv('res/hires_results_' + model + '.csv')
df_null = pd.read_csv('res/hires_results_null_' + model + '.csv')

alpha=0.05
cols = ['tab:blue', 'tab:green', 'tab:orange', 'tab:purple', 'tab:red', 'tab:brown']
for i_target, target in enumerate(['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']):
    tmp = df.loc[(df.target==target) & (df.connectivity==feature)]
    tmp_null = df_null.loc[(df_null.target==target) & (df_null.connectivity==feature)]

    sample_sizes = tmp.n.unique()
    power = np.zeros(len(sample_sizes))
    # 600 doubles as the "not reached yet" marker and as the value recorded
    # when no sample size attains 80% power.
    n_req = 600
    for i, n in enumerate(sample_sizes):
        tmp2 = tmp[tmp.n == n]
        tmp2_null = tmp_null[tmp_null.n == n]
        r_discovery_threshold = np.quantile(tmp2_null.r_discovery_cv.dropna(), 1-alpha)

        n_significant = (tmp2['r_discovery_cv']>=r_discovery_threshold).sum()
        power[i] = n_significant / len(tmp2['r_discovery_cv']) * 100
        # Keep the first n (in iteration order) with at least 80% power.
        if n_req == 600 and power[i] >= 80:
            n_req = n

    # Record the result for the summary bar plot.
    sample_size_needed.append(n_req)
    variable.append(target)
    method.append(feature + '_' + model)

    sns.lineplot(x=sample_sizes, y=power, color=cols[i_target]).set(title=target)
sns.despine()
plt.axhline(80, linestyle = 'dashed')
plt.axhline(100)
plt.xlim(0,500)
plt.ylim(0,100)
plt.savefig('fig/power_all_' + feature + '_' + model + '.pdf')

Power: Partial corr + Ridge¶

In [ ]:

import numpy as np
sns.set(rc={"figure.figsize":(1, 1)})
sns.set_style("white")
# Power curves of all targets on one axes, for this feature set and model:
feature = 'netmats_parcor'
model = "Ridge"
df = pd.read_csv('res/hires_results_' + model + '.csv')
df_null = pd.read_csv('res/hires_results_null_' + model + '.csv')

alpha=0.05
cols = ['tab:blue', 'tab:green', 'tab:orange', 'tab:purple', 'tab:red', 'tab:brown']
for i_target, target in enumerate(['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']):
    tmp = df.loc[(df.target==target) & (df.connectivity==feature)]
    tmp_null = df_null.loc[(df_null.target==target) & (df_null.connectivity==feature)]

    sample_sizes = tmp.n.unique()
    power = np.zeros(len(sample_sizes))
    # 600 doubles as the "not reached yet" marker and as the value recorded
    # when no sample size attains 80% power.
    n_req = 600
    for i, n in enumerate(sample_sizes):
        tmp2 = tmp[tmp.n == n]
        tmp2_null = tmp_null[tmp_null.n == n]
        r_discovery_threshold = np.quantile(tmp2_null.r_discovery_cv.dropna(), 1-alpha)

        n_significant = (tmp2['r_discovery_cv']>=r_discovery_threshold).sum()
        power[i] = n_significant / len(tmp2['r_discovery_cv']) * 100
        # Keep the first n (in iteration order) with at least 80% power.
        if n_req == 600 and power[i] >= 80:
            n_req = n

    # Record the result for the summary bar plot.
    sample_size_needed.append(n_req)
    variable.append(target)
    method.append(feature + '_' + model)

    sns.lineplot(x=sample_sizes, y=power, color=cols[i_target]).set(title=target)
sns.despine()
plt.axhline(80, linestyle = 'dashed')
plt.axhline(100)
plt.xlim(0,500)
plt.ylim(0,100)
plt.savefig('fig/power_all_' + feature + '_' + model + '.pdf')

In [ ]:

sns.set(rc={"figure.figsize":(1, 2.3)})
sns.set_style("white")

# One row per (target, method) pair collected by the preceding power cells.
bar_df = pd.DataFrame({
    'sample size needed': sample_size_needed,
    'target': variable,
    'method' : method,
})

# Bar colour per target.
palette = {
    'age' : 'tab:blue',
    'CogTotalComp_AgeAdj' : 'tab:green',
    'PMAT24_A_CR' : 'tab:orange',
    'Flanker_AgeAdj' : 'tab:purple',
    'CardSort_AgeAdj' : 'tab:red',
    'PicSeq_AgeAdj' : 'tab:brown'
}
# Order in which the targets appear within each method group.
target_order = ['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'PicSeq_AgeAdj', 'CardSort_AgeAdj', 'Flanker_AgeAdj']

sns.barplot(x='sample size needed', y='method', hue='target', data=bar_df,
            palette=palette, ci=None, hue_order=target_order)
plt.legend(bbox_to_anchor=(1.2, 1), loc=2, borderaxespad=0.)
sns.despine()
plt.savefig('fig/power_bar_all.pdf')
bar_df

Replication: Pearson + PCA-SVR¶

In [58]:

# Reset the accumulators before collecting the replication results below.
sample_size_needed, variable, method = [], [], []

In [59]:

import numpy as np
sns.set(rc={"figure.figsize":(1, 1)})
sns.set_style("white")
# Replicability of:
feature = 'netmats_pearson'
model = "PCA_SVR"
df = pd.read_csv('res/hires_results_' + model + '.csv')
df_null = pd.read_csv('res/hires_results_null_' + model + '.csv')

cols = ['tab:blue', 'tab:green', 'tab:orange', 'tab:purple', 'tab:red', 'tab:brown']
for i_target, target in enumerate(['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']):
    # Observed and null result rows for this target / feature set.
    tmp = df.loc[(df.target==target) & (df.connectivity==feature)]
    tmp_null = df_null.loc[(df_null.target==target) & (df_null.connectivity==feature)]

    alpha=0.05
    sample_sizes = tmp.n.unique()
    replication_prob = np.zeros(len(sample_sizes))
    # 600 doubles as the "80% never reached" marker.
    n_req = 600
    for i, n in enumerate(sample_sizes):
        tmp2 = tmp[tmp.n == n]
        tmp2_null = tmp_null[tmp_null.n == n]
        # A discovery is significant when its cross-validated r reaches the
        # (1 - alpha) quantile of the null distribution at the same sample size.
        r_discovery_threshold = np.quantile(tmp2_null.r_discovery_cv.dropna(), 1-alpha)
        significant = tmp2['r_discovery_cv']>=r_discovery_threshold
        n_significant = significant.sum()

        if n_significant == 0:
            # No significant discovery at this n: the probability is undefined.
            replication_prob[i] = np.nan
        else:
            # #(significant replications among significant discoveries) / #(significant discoveries)
            replication_prob[i] = (tmp2.loc[significant, 'p_replication']<alpha).sum() / n_significant * 100

        # NaN compares False, so undefined entries never set n_req.
        if replication_prob[i] >= 80 and n_req == 600:
            n_req = n

    # Record each target exactly once so bar_df holds one row per (target, method).
    sample_size_needed.append(n_req)
    variable.append(target)
    method.append(feature + '_' + model)

    sns.lineplot(x=sample_sizes, y=replication_prob, color=cols[i_target]).set(title=target)
sns.despine()
plt.axhline(80, linestyle = 'dashed')
plt.axhline(100)
plt.xlim(0,500)
plt.ylim(0,100)
plt.savefig('fig/replication_all_' + feature + '_' + model + '.pdf')

In [ ]:

import numpy as np
sns.set(rc={"figure.figsize": (1, 1)})
sns.set_style("white")
# Replication probability for this feature set / model:
feature = 'netmats_parcor'
model = "Ridge"
df = pd.read_csv('res/hires_results_' + model + '.csv')
df_null = pd.read_csv('res/hires_results_null_' + model + '.csv')

cols = ['tab:blue', 'tab:green', 'tab:orange', 'tab:purple', 'tab:red', 'tab:brown']
for i_target, target in enumerate(['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']):
    # Observed and null result rows for this target / feature set.
    tmp = df[(df['target'] == target) & (df['connectivity'] == feature)]
    tmp_null = df_null[(df_null['target'] == target) & (df_null['connectivity'] == feature)]

    alpha = 0.05
    sample_sizes = tmp['n'].unique()
    replication_prob = np.zeros(len(sample_sizes))
    # 600 doubles as the "80% never reached" marker.
    n_req = 600
    for i, n in enumerate(sample_sizes):
        tmp2 = tmp[tmp['n'] == n]
        tmp2_null = tmp_null[tmp_null['n'] == n]
        r_discovery_threshold = np.quantile(tmp2_null['r_discovery_cv'].dropna(), 1 - alpha)

        # Discoveries whose cross-validated r clears the null threshold.
        significant = tmp2['r_discovery_cv'] >= r_discovery_threshold
        n_significant = significant.sum()
        if n_significant == 0:
            # Nothing was discovered, so the ratio is undefined.
            replication_prob[i] = np.nan
        else:
            # #(significant replications among significant discoveries) / #(significant discoveries)
            n_replicated = (tmp2.loc[significant, 'p_replication'] < alpha).sum()
            replication_prob[i] = n_replicated / n_significant * 100

        # Keep only the first sample size reaching 80%; NaN compares False.
        if n_req == 600 and replication_prob[i] >= 80:
            n_req = n

    sample_size_needed.append(n_req)
    variable.append(target)
    method.append(feature + '_' + model)

    ax = sns.lineplot(x=sample_sizes, y=replication_prob, color=cols[i_target])
    ax.set(title=target)

sns.despine()
plt.axhline(80, linestyle='dashed')  # 80% reference line
plt.axhline(100)
plt.xlim(0, 500)
plt.ylim(0, 100)
plt.savefig('fig/replication_all_' + feature + '_' + model + '.pdf')

In [ ]:

# Bar chart of the required sample size per target, grouped by method.
sns.set(rc={"figure.figsize": (1, 2.3)})
sns.set_style("white")

# One row per entry collected in the three accumulator lists.
bar_df = pd.DataFrame({
    'sample size needed': sample_size_needed,
    'target': variable,
    'method': method,
})

# Fixed colour for every target.
palette = dict(
    age='tab:blue',
    CogTotalComp_AgeAdj='tab:green',
    PMAT24_A_CR='tab:orange',
    Flanker_AgeAdj='tab:purple',
    CardSort_AgeAdj='tab:red',
    PicSeq_AgeAdj='tab:brown',
)

# Bar / legend order (not the same as the palette's insertion order).
hue_order = ['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'PicSeq_AgeAdj', 'CardSort_AgeAdj', 'Flanker_AgeAdj']
sns.barplot(data=bar_df, x='sample size needed', y='method', hue='target',
            hue_order=hue_order, palette=palette, ci=None)
plt.legend(bbox_to_anchor=(1.2, 1), loc=2, borderaxespad=0.)
sns.despine()
plt.savefig('fig/replication_bar_all.pdf')
bar_df

True inflation¶

In [ ]:

# Start from empty accumulators; the analysis cells below append one entry per target.
sample_size_needed, variable, method = [], [], []

In [ ]:

import numpy as np
sns.set(rc={"figure.figsize":(1, 1)})
sns.set_style("white")
# Effect-size inflation (discovery r minus replication r) for:
feature = 'netmats_pearson'
model = "PCA_SVR"
df = pd.read_csv('res/hires_results_' + model + '.csv')
df_null = pd.read_csv('res/hires_results_null_' + model + '.csv')

# Flat accumulators: one entry per significant discovery, across all targets and n.
r1 = []       # r_discovery_cv of each significant discovery
r2 = []       # r_replication of the same rows
targets = []
ns = []

for target in ['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']:
    # Observed and null result rows for this target / feature set.
    tmp = df.loc[(df.target==target) & (df.connectivity==feature)]
    tmp_null = df_null.loc[(df_null.target==target) & (df_null.connectivity==feature)]

    alpha=0.05
    # 600 doubles as the "criterion never met" marker.
    n_req = 600
    for n in tmp.n.unique():
        tmp2 = tmp[tmp.n == n]
        tmp2_null = tmp_null[tmp_null.n == n]
        # Significance threshold: (1 - alpha) quantile of the null r at this sample size.
        r_discovery_threshold = np.quantile(tmp2_null.r_discovery_cv.dropna(), 1-alpha)

        significant = tmp2['r_discovery_cv']>=r_discovery_threshold
        num = significant.sum()
        if num > 0:

            _r1 = tmp2.loc[significant, 'r_discovery_cv'].values.tolist()
            _r2 = tmp2.loc[significant, 'r_replication'].values.tolist()

            r1 += _r1
            r2 += _r2
            ns += [n]*num
            targets += [target]*num

            # Mean inflation relative to the mean replication effect size.
            relative_inflation = np.mean(np.array(_r1)-np.array(_r2))/np.mean(_r2)
            print(target, n, relative_inflation)
            # less than 10% inflation
            if 0 < relative_inflation < 0.10 and n_req == 600:
                print('!')
                n_req = n

    print('*', target, n_req)
    sample_size_needed.append(n_req)
    variable.append(target)
    method.append(feature + '_' + model)

inf_df = pd.DataFrame({
    'r_discovery': r1,
    'r_replication': r2,
    'inflation' : np.array(r1)-np.array(r2),
    'n': ns,
    'target': targets
})

# `palette` comes from a bar-plot cell that runs earlier in the notebook.
sns.lineplot(x='n', y='inflation', data=inf_df, hue="target", palette=palette, ci=None,
             hue_order=['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'PicSeq_AgeAdj', 'CardSort_AgeAdj', 'Flanker_AgeAdj'])
plt.xlim(0,500)
plt.ylim(0,1)
sns.despine()
plt.legend(bbox_to_anchor=(1.2, 1), loc=2, borderaxespad=0.)
plt.savefig('fig/inflation_' + feature + '_' + model + '.pdf')

In [ ]:

import numpy as np
sns.set(rc={"figure.figsize":(1, 1)})
sns.set_style("white")
# Effect-size inflation (discovery r minus replication r) for:
feature = 'netmats_parcor'
model = "Ridge"
df = pd.read_csv('res/hires_results_' + model + '.csv')
df_null = pd.read_csv('res/hires_results_null_' + model + '.csv')

# Flat accumulators: one entry per significant discovery, across all targets and n.
r1 = []       # r_discovery_cv of each significant discovery
r2 = []       # r_replication of the same rows
targets = []
ns = []

for target in ['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']:
    # Observed and null result rows for this target / feature set.
    tmp = df.loc[(df.target==target) & (df.connectivity==feature)]
    tmp_null = df_null.loc[(df_null.target==target) & (df_null.connectivity==feature)]

    alpha=0.05
    # 600 doubles as the "criterion never met" marker.
    n_req = 600
    for n in tmp.n.unique():
        tmp2 = tmp[tmp.n == n]
        tmp2_null = tmp_null[tmp_null.n == n]
        # Significance threshold: (1 - alpha) quantile of the null r at this sample size.
        r_discovery_threshold = np.quantile(tmp2_null.r_discovery_cv.dropna(), 1-alpha)

        significant = tmp2['r_discovery_cv']>=r_discovery_threshold
        num = significant.sum()
        if num > 0:

            _r1 = tmp2.loc[significant, 'r_discovery_cv'].values.tolist()
            _r2 = tmp2.loc[significant, 'r_replication'].values.tolist()

            r1 += _r1
            r2 += _r2
            ns += [n]*num
            targets += [target]*num

            # Mean inflation relative to the mean replication effect size.
            relative_inflation = np.mean(np.array(_r1)-np.array(_r2))/np.mean(_r2)
            print(target, n, relative_inflation)
            # less than 10% inflation
            if 0 < relative_inflation < 0.10 and n_req == 600:
                print('!')
                n_req = n

    sample_size_needed.append(n_req)
    variable.append(target)
    method.append(feature + '_' + model)

inf_df = pd.DataFrame({
    'r_discovery': r1,
    'r_replication': r2,
    'inflation' : np.array(r1)-np.array(r2),
    'n': ns,
    'target': targets
})

# `palette` comes from a bar-plot cell that runs earlier in the notebook.
sns.lineplot(x='n', y='inflation', data=inf_df, hue="target", palette=palette, ci=None,
             hue_order=['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'PicSeq_AgeAdj', 'CardSort_AgeAdj', 'Flanker_AgeAdj'])
plt.legend(bbox_to_anchor=(1.2, 1), loc=2, borderaxespad=0.)
plt.xlim(0,500)
plt.ylim(0,1)
sns.despine()
plt.savefig('fig/inflation_' + feature + '_' + model + '.pdf')

In [ ]:

# Inflation against sample size, one panel per target, drawn from the current inf_df.
sns.set_style("white")

# Fixed colour for every target.
palette = dict(
    age='tab:blue',
    CogTotalComp_AgeAdj='tab:green',
    PMAT24_A_CR='tab:orange',
    Flanker_AgeAdj='tab:purple',
    CardSort_AgeAdj='tab:red',
    PicSeq_AgeAdj='tab:brown',
)

g = sns.FacetGrid(
    inf_df,
    col="target",
    hue="target",
    xlim=(0, 500),
    ylim=(-0.2, 0.7),
    height=1.8,
    aspect=0.8,
    palette=palette,
    col_order=['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'PicSeq_AgeAdj', 'CardSort_AgeAdj', 'Flanker_AgeAdj'],
)
# Line per panel; ci='sd' asks seaborn for a standard-deviation band.
g.map(sns.lineplot, 'n', 'inflation', ci='sd')
# Horizontal reference at zero inflation.
g.refline(y=0)
plt.savefig('fig/true_inflation_all_' + feature + '_' + model + '.pdf')

In [ ]:

# Bar chart of the required sample size per target, grouped by method.
sns.set(rc={"figure.figsize": (1, 2.3)})
sns.set_style("white")

# One row per entry collected in the three accumulator lists.
bar_df = pd.DataFrame({
    'sample size needed': sample_size_needed,
    'target': variable,
    'method': method,
})

# Fixed colour for every target.
palette = dict(
    age='tab:blue',
    CogTotalComp_AgeAdj='tab:green',
    PMAT24_A_CR='tab:orange',
    Flanker_AgeAdj='tab:purple',
    CardSort_AgeAdj='tab:red',
    PicSeq_AgeAdj='tab:brown',
)

# Bar / legend order (not the same as the palette's insertion order).
hue_order = ['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'PicSeq_AgeAdj', 'CardSort_AgeAdj', 'Flanker_AgeAdj']
sns.barplot(data=bar_df, x='sample size needed', y='method', hue='target',
            hue_order=hue_order, palette=palette, ci=None)
plt.legend(bbox_to_anchor=(1.2, 1), loc=2, borderaxespad=0.)
sns.despine()
plt.savefig('fig/inflation_bar_all.pdf')
bar_df

In [ ]:

# Start from empty accumulators; the analysis cells below append one entry per target.
sample_size_needed, variable, method = [], [], []

In [ ]:

import numpy as np
sns.set(rc={"figure.figsize": (1, 1)})
sns.set_style("white")
# Replication probability for this feature set / model:
feature = 'netmats_parcor'
model = "Ridge"
df = pd.read_csv('res/hires_results_' + model + '.csv')
df_null = pd.read_csv('res/hires_results_null_' + model + '.csv')

cols = ['tab:blue', 'tab:green', 'tab:orange', 'tab:purple', 'tab:red', 'tab:brown']
for i_target, target in enumerate(['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']):
    # Observed and null result rows for this target / feature set.
    tmp = df[(df['target'] == target) & (df['connectivity'] == feature)]
    tmp_null = df_null[(df_null['target'] == target) & (df_null['connectivity'] == feature)]

    alpha = 0.05
    sample_sizes = tmp['n'].unique()
    replication_prob = np.zeros(len(sample_sizes))
    # 600 doubles as the "80% never reached" marker.
    n_req = 600
    for i, n in enumerate(sample_sizes):
        tmp2 = tmp[tmp['n'] == n]
        tmp2_null = tmp_null[tmp_null['n'] == n]
        r_discovery_threshold = np.quantile(tmp2_null['r_discovery_cv'].dropna(), 1 - alpha)

        # Discoveries whose cross-validated r clears the null threshold.
        significant = tmp2['r_discovery_cv'] >= r_discovery_threshold
        n_significant = significant.sum()
        if n_significant == 0:
            # Nothing was discovered, so the ratio is undefined.
            replication_prob[i] = np.nan
        else:
            # #(significant replications among significant discoveries) / #(significant discoveries)
            n_replicated = (tmp2.loc[significant, 'p_replication'] < alpha).sum()
            replication_prob[i] = n_replicated / n_significant * 100

        # Keep only the first sample size reaching 80%; NaN compares False.
        if n_req == 600 and replication_prob[i] >= 80:
            n_req = n

    sample_size_needed.append(n_req)
    variable.append(target)
    method.append(feature + '_' + model)

    ax = sns.lineplot(x=sample_sizes, y=replication_prob, color=cols[i_target])
    ax.set(title=target)

sns.despine()
plt.axhline(80, linestyle='dashed')  # 80% reference line
plt.axhline(100)
plt.xlim(0, 500)
plt.ylim(0, 100)
plt.savefig('fig/replication_all_' + feature + '_' + model + '.pdf')

In [ ]:

# Bar chart of the required sample size per target, grouped by method.
sns.set(rc={"figure.figsize": (1, 2.3)})
sns.set_style("white")

# One row per entry collected in the three accumulator lists.
bar_df = pd.DataFrame({
    'sample size needed': sample_size_needed,
    'target': variable,
    'method': method,
})

# Fixed colour for every target.
palette = dict(
    age='tab:blue',
    CogTotalComp_AgeAdj='tab:green',
    PMAT24_A_CR='tab:orange',
    Flanker_AgeAdj='tab:purple',
    CardSort_AgeAdj='tab:red',
    PicSeq_AgeAdj='tab:brown',
)

# Bar / legend order (not the same as the palette's insertion order).
hue_order = ['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'PicSeq_AgeAdj', 'CardSort_AgeAdj', 'Flanker_AgeAdj']
sns.barplot(data=bar_df, x='sample size needed', y='method', hue='target',
            hue_order=hue_order, palette=palette, ci=None)
plt.legend(bbox_to_anchor=(1.2, 1), loc=2, borderaxespad=0.)
sns.despine()
plt.savefig('fig/replication_bar_all.pdf')
bar_df

In [ ]:

# Histograms of the null-model discovery correlations (Ridge on partial correlations).
sns.set(rc={"figure.figsize": (3, 2)})
sns.set_style("white")
feature = 'netmats_parcor'
model = "Ridge"
df = pd.read_csv('res/hires_results_' + model + '.csv')
df_null = pd.read_csv('res/hires_results_null_' + model + '.csv')

# Cross-validated null r, binned on [-0.5, 0.5].
cv_bins = np.linspace(-0.5, 0.5, 20)
sns.histplot(df_null['r_discovery_cv'], bins=cv_bins)
plt.show()

# Overfit (no-CV) null r, with the bin range shifted by +1 so it is centred on r = 1.
overfit_bins = np.linspace(-0.6+1, 0.6+1, 20)
sns.histplot(df_null['r_discovery_overfit'], bins=overfit_bins)

In [ ]:

# Summary statistics of the overfit (no-CV) discovery r under the null.
df_null['r_discovery_overfit'].describe()

In [ ]:

# Scratch check: 11 evenly spaced edges from -1 to 1 (step 0.2).
np.linspace(-1.0, 1.0, num=11)

In [ ]:

In [405]:

# Bar chart of the required sample size per target, grouped by method.
sns.set(rc={"figure.figsize": (1, 2.3)})
sns.set_style("white")

# One row per entry collected in the three accumulator lists.
bar_df = pd.DataFrame({
    'sample size needed': sample_size_needed,
    'target': variable,
    'method': method,
})

# Fixed colour for every target.
palette = dict(
    age='tab:blue',
    CogTotalComp_AgeAdj='tab:green',
    PMAT24_A_CR='tab:orange',
    Flanker_AgeAdj='tab:purple',
    CardSort_AgeAdj='tab:red',
    PicSeq_AgeAdj='tab:brown',
)

# Bar / legend order (not the same as the palette's insertion order).
hue_order = ['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'PicSeq_AgeAdj', 'CardSort_AgeAdj', 'Flanker_AgeAdj']
sns.barplot(data=bar_df, x='sample size needed', y='method', hue='target',
            hue_order=hue_order, palette=palette, ci=None)
plt.legend(bbox_to_anchor=(1.2, 1), loc=2, borderaxespad=0.)
sns.despine()
plt.savefig('fig/inflation_bar_all.pdf')
bar_df

Out[405]:

	sample size needed	target	method
0	375	age	netmats_pearson_PCA_SVR
1	400	CogTotalComp_AgeAdj	netmats_pearson_PCA_SVR
2	375	PMAT24_A_CR	netmats_pearson_PCA_SVR
3	600	Flanker_AgeAdj	netmats_pearson_PCA_SVR
4	600	CardSort_AgeAdj	netmats_pearson_PCA_SVR
5	600	PicSeq_AgeAdj	netmats_pearson_PCA_SVR
6	100	age	netmats_parcor_Ridge
7	75	CogTotalComp_AgeAdj	netmats_parcor_Ridge
8	150	PMAT24_A_CR	netmats_parcor_Ridge
9	600	Flanker_AgeAdj	netmats_parcor_Ridge
10	350	CardSort_AgeAdj	netmats_parcor_Ridge
11	475	PicSeq_AgeAdj	netmats_parcor_Ridge

In [29]:

# Start from empty accumulators; the analysis cells below append one entry per target.
sample_size_needed, variable, method = [], [], []

In [30]:

import numpy as np
sns.set(rc={"figure.figsize": (1, 1)})
sns.set_style("white")
# Replication probability for this feature set / model:
feature = 'netmats_parcor'
model = "Ridge"
df = pd.read_csv('res/hires_results_' + model + '.csv')
df_null = pd.read_csv('res/hires_results_null_' + model + '.csv')

cols = ['tab:blue', 'tab:green', 'tab:orange', 'tab:purple', 'tab:red', 'tab:brown']
for i_target, target in enumerate(['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']):
    # Observed and null result rows for this target / feature set.
    tmp = df[(df['target'] == target) & (df['connectivity'] == feature)]
    tmp_null = df_null[(df_null['target'] == target) & (df_null['connectivity'] == feature)]

    alpha = 0.05
    sample_sizes = tmp['n'].unique()
    replication_prob = np.zeros(len(sample_sizes))
    # 600 doubles as the "80% never reached" marker.
    n_req = 600
    for i, n in enumerate(sample_sizes):
        tmp2 = tmp[tmp['n'] == n]
        tmp2_null = tmp_null[tmp_null['n'] == n]
        r_discovery_threshold = np.quantile(tmp2_null['r_discovery_cv'].dropna(), 1 - alpha)

        # Discoveries whose cross-validated r clears the null threshold.
        significant = tmp2['r_discovery_cv'] >= r_discovery_threshold
        n_significant = significant.sum()
        if n_significant == 0:
            # Nothing was discovered, so the ratio is undefined.
            replication_prob[i] = np.nan
        else:
            # #(significant replications among significant discoveries) / #(significant discoveries)
            n_replicated = (tmp2.loc[significant, 'p_replication'] < alpha).sum()
            replication_prob[i] = n_replicated / n_significant * 100

        # Keep only the first sample size reaching 80%; NaN compares False.
        if n_req == 600 and replication_prob[i] >= 80:
            n_req = n

    sample_size_needed.append(n_req)
    variable.append(target)
    method.append(feature + '_' + model)

    ax = sns.lineplot(x=sample_sizes, y=replication_prob, color=cols[i_target])
    ax.set(title=target)

sns.despine()
plt.axhline(80, linestyle='dashed')  # 80% reference line
plt.axhline(100)
plt.xlim(0, 500)
plt.ylim(0, 100)
plt.savefig('fig/replication_all_' + feature + '_' + model + '.pdf')

In [31]:

# Bar chart of the required sample size per target, grouped by method.
sns.set(rc={"figure.figsize": (1, 2.3)})
sns.set_style("white")

# One row per entry collected in the three accumulator lists.
bar_df = pd.DataFrame({
    'sample size needed': sample_size_needed,
    'target': variable,
    'method': method,
})

# Fixed colour for every target.
palette = dict(
    age='tab:blue',
    CogTotalComp_AgeAdj='tab:green',
    PMAT24_A_CR='tab:orange',
    Flanker_AgeAdj='tab:purple',
    CardSort_AgeAdj='tab:red',
    PicSeq_AgeAdj='tab:brown',
)

# Bar / legend order (not the same as the palette's insertion order).
hue_order = ['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'PicSeq_AgeAdj', 'CardSort_AgeAdj', 'Flanker_AgeAdj']
sns.barplot(data=bar_df, x='sample size needed', y='method', hue='target',
            hue_order=hue_order, palette=palette, ci=None)
plt.legend(bbox_to_anchor=(1.2, 1), loc=2, borderaxespad=0.)
sns.despine()
plt.savefig('fig/replication_bar_all.pdf')
bar_df

Out[31]:

	sample size needed	target	method
0	75	age	netmats_parcor_Ridge
1	75	CogTotalComp_AgeAdj	netmats_parcor_Ridge
2	125	PMAT24_A_CR	netmats_parcor_Ridge
3	375	Flanker_AgeAdj	netmats_parcor_Ridge
4	275	CardSort_AgeAdj	netmats_parcor_Ridge
5	200	PicSeq_AgeAdj	netmats_parcor_Ridge

In [57]:

# Histograms of the null-model discovery correlations (Ridge on partial correlations).
sns.set(rc={"figure.figsize": (3, 2)})
sns.set_style("white")
feature = 'netmats_parcor'
model = "Ridge"
df = pd.read_csv('res/hires_results_' + model + '.csv')
df_null = pd.read_csv('res/hires_results_null_' + model + '.csv')

# Cross-validated null r, binned on [-0.5, 0.5].
cv_bins = np.linspace(-0.5, 0.5, 20)
sns.histplot(df_null['r_discovery_cv'], bins=cv_bins)
plt.show()

# Overfit (no-CV) null r, with the bin range shifted by +1 so it is centred on r = 1.
overfit_bins = np.linspace(-0.6+1, 0.6+1, 20)
sns.histplot(df_null['r_discovery_overfit'], bins=overfit_bins)

Out[57]:

<AxesSubplot:xlabel='r_discovery_overfit', ylabel='Count'>

In [46]:

# Summary statistics of the overfit (no-CV) discovery r under the null.
df_null['r_discovery_overfit'].describe()

Out[46]:

count    1.200000e+04
mean     1.000000e+00
std      3.644590e-10
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      1.000000e+00
Name: r_discovery_overfit, dtype: float64

In [51]:

# Scratch check: 11 evenly spaced edges from -1 to 1 (step 0.2).
np.linspace(-1.0, 1.0, num=11)

Out[51]:

array([-1. , -0.8, -0.6, -0.4, -0.2,  0. ,  0.2,  0.4,  0.6,  0.8,  1. ])

In [13]:

Traceback (most recent call last):
  File "/home/tspisak/miniconda3/envs/rapids-0.18/lib/python3.7/site-packages/matplotlib/cbook/__init__.py", line 196, in process
    func(*args, **kwargs)
  File "/home/tspisak/miniconda3/envs/rapids-0.18/lib/python3.7/site-packages/matplotlib/animation.py", line 1467, in _stop
    self.event_source.remove_callback(self._loop_delay)
AttributeError: 'NoneType' object has no attribute 'remove_callback'

In [ ]: