The HCP dataset comprises resting-state and task-based fMRI from a large sample of human subjects. The NMA-curated dataset includes time series data that has been preprocessed and spatially-downsampled by aggregating within 360 regions of interest.
# @title Install dependencies
!pip install nilearn --quiet
import os
import numpy as np
import matplotlib.pyplot as plt
# Necessary for visualization
from nilearn import plotting, datasets
#@title Figure settings
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.style.use("https://raw.githubusercontent.com/NeuromatchAcademy/course-content/master/nma.mplstyle")
# The download cells will store the data in nested directories starting here:
HCP_DIR = "./DATA"
if not os.path.isdir(HCP_DIR):
os.mkdir(HCP_DIR)
# The data shared for NMA projects is a subset of the full HCP dataset
N_SUBJECTS = 339
# The data have already been aggregated into ROIs from the Glasser parcellation
N_PARCELS = 360
# The acquisition parameters for all tasks were identical
TR = 0.72  # Repetition time (time between samples), in seconds
# The parcels are matched across hemispheres with the same order
HEMIS = ["Right", "Left"]
# Each experiment was repeated multiple times in each subject
N_RUNS_REST = 4
N_RUNS_TASK = 2
# Time series data are organized by experiment, with each experiment
# having an LR and RL (phase-encode direction) acquisition
BOLD_NAMES = [
"rfMRI_REST1_LR", "rfMRI_REST1_RL",
"rfMRI_REST2_LR", "rfMRI_REST2_RL",
"tfMRI_MOTOR_RL", "tfMRI_MOTOR_LR",
"tfMRI_WM_RL", "tfMRI_WM_LR",
"tfMRI_EMOTION_RL", "tfMRI_EMOTION_LR",
"tfMRI_GAMBLING_RL", "tfMRI_GAMBLING_LR",
"tfMRI_LANGUAGE_RL", "tfMRI_LANGUAGE_LR",
"tfMRI_RELATIONAL_RL", "tfMRI_RELATIONAL_LR",
"tfMRI_SOCIAL_RL", "tfMRI_SOCIAL_LR"
]
# You may want to limit the subjects used during code development.
# This will use all subjects:
subjects = range(N_SUBJECTS)
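# During development, a smaller range keeps iteration fast; for example,
# uncomment to work with only the first 10 subjects (an arbitrary cutoff):
# subjects = range(10)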
The rest and task data are shared in different files, but they will unpack into the same directory structure.
Each file is fairly large and will take some time to download. If you are focusing on only rest or only task analyses, you may want to download just that dataset.
We also separately provide some potentially useful behavioral covariate information.
# @title Download the data
# @markdown Task data in `HCP_DIR/hcp_task`, rest in `HCP_DIR/hcp_rest`, covariates in `HCP_DIR/hcp_covariates`
import os, requests, tarfile
fnames = ["hcp_rest.tgz",
"hcp_task.tgz",
"hcp_covariates.tgz",
"atlas.npz"]
urls = ["https://osf.io/bqp7m/download",
"https://osf.io/s4h8j/download",
"https://osf.io/x5p4g/download",
"https://osf.io/j5kuc/download"]
for fname, url in zip(fnames, urls):
if not os.path.isfile(fname):
try:
r = requests.get(url)
except requests.ConnectionError:
print("!!! Failed to download data !!!")
else:
if r.status_code != requests.codes.ok:
print("!!! Failed to download data !!!")
else:
print(f"Downloading {fname}...")
with open(fname, "wb") as fid:
fid.write(r.content)
print(f"Download {fname} completed!")
Downloading hcp_rest.tgz...
Download hcp_rest.tgz completed!
Downloading hcp_task.tgz...
Download hcp_task.tgz completed!
Downloading hcp_covariates.tgz...
Download hcp_covariates.tgz completed!
Downloading atlas.npz...
Download atlas.npz completed!
# @title Extract the data in `HCP_DIR`
fnames = ["hcp_covariates", "hcp_rest", "hcp_task"]
for fname in fnames:
# open file
path_name = os.path.join(HCP_DIR, fname)
if not os.path.exists(path_name):
print(f"Extracting {fname}.tgz...")
with tarfile.open(f"{fname}.tgz") as fzip:
fzip.extractall(HCP_DIR)
else:
print(f"File {fname}.tgz has already been extracted.")
Extracting hcp_covariates.tgz...
Extracting hcp_rest.tgz...
Extracting hcp_task.tgz...
Downloading either dataset will create the regions.npy
file, which contains the region name and network assignment for each parcel.
Detailed information about the name used for each region is provided in the Supplement to Glasser et al. (2016).
Information about the network parcellation is provided in Ji et al. (2019).
data_dir = os.path.join(HCP_DIR, "hcp_task")  # choose the data directory (either dataset contains regions.npy)
regions = np.load(os.path.join(data_dir, "regions.npy")).T
region_info = dict(name=regions[0].tolist(),
                   network=regions[1],
                   myelin=regions[2].astype(float)  # np.float is deprecated; use the builtin float
                   )
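As a quick sanity check, here is a minimal sketch for inspecting `region_info` (it uses only the fields built above):
networks, counts = np.unique(region_info["network"], return_counts=True)
print(f"{len(region_info['name'])} parcels across {len(networks)} networks")
print(dict(zip(networks, counts)))  # number of parcels per network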
We also provide the parcellation on the fsaverage5 surface and approximate MNI coordinates of each region, which can be useful for visualization:
with np.load("atlas.npz") as dobj:
    atlas = dict(**dobj)
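The exact arrays in the archive can be listed directly. The sketch below prints them and then, assuming a `coords` array of per-parcel MNI coordinates is present (an assumption to verify against the printed keys), renders the parcel centers with nilearn's `view_markers`:
print(list(atlas.keys()))  # inspect what the archive provides
if "coords" in atlas:  # "coords" is an assumed key name; check the printout above
    view = plotting.view_markers(atlas["coords"], marker_size=3)
    # view.open_in_browser()  # or display `view` inline in a notebook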
def get_image_ids(name):
"""Get the 1-based image indices for runs in a given experiment.
Args:
name (str) : Name of experiment ("rest" or name of task) to load
Returns:
run_ids (list of int) : Numeric ID for experiment image files
"""
run_ids = [
i for i, code in enumerate(BOLD_NAMES, 1) if name.upper() in code
]
if not run_ids:
raise ValueError(f"Found no data for '{name}'")
return run_ids
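Given the `BOLD_NAMES` list above, for example, the motor task maps to the fifth and sixth image files, while "rest" matches all four resting-state runs:
print(get_image_ids("motor"))  # [5, 6]
print(get_image_ids("rest"))   # [1, 2, 3, 4]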
def load_timeseries(subject, name, dir,
runs=None, concat=True, remove_mean=True):
"""Load timeseries data for a single subject.
Args:
subject (int): 0-based subject ID to load
name (str) : Name of experiment ("rest" or name of task) to load
dir (str) : data directory
runs (None or int or list of ints): 0-based run(s) of the task to load,
or None to load all runs.
concat (bool) : If True, concatenate multiple runs in time
remove_mean (bool) : If True, subtract the parcel-wise mean
Returns
ts (n_parcel x n_tp array): Array of BOLD data values
"""
# Get the list of relative 0-based run indices to use
if runs is None:
runs = range(N_RUNS_REST) if name == "rest" else range(N_RUNS_TASK)
elif isinstance(runs, int):
runs = [runs]
# Get the first (1-based) run id for this experiment
offset = get_image_ids(name)[0]
# Load each run's data
bold_data = [
load_single_timeseries(subject,
offset + run,
dir,
remove_mean) for run in runs
]
# Optionally concatenate in time
if concat:
bold_data = np.concatenate(bold_data, axis=-1)
return bold_data
def load_single_timeseries(subject, bold_run, dir, remove_mean=True):
"""Load timeseries data for a single subject and single run.
Args:
subject (int): 0-based subject ID to load
bold_run (int): 1-based run index, across all tasks
dir (str) : data directory
remove_mean (bool): If True, subtract the parcel-wise mean
Returns
ts (n_parcel x n_timepoint array): Array of BOLD data values
"""
bold_path = os.path.join(dir, "subjects", str(subject), "timeseries")
bold_file = f"bold{bold_run}_Atlas_MSMAll_Glasser360Cortical.npy"
ts = np.load(os.path.join(bold_path, bold_file))
if remove_mean:
ts -= ts.mean(axis=1, keepdims=True)
return ts
def load_evs(subject, name, condition, dir):
"""Load EV (explanatory variable) data for one task condition.
Args:
subject (int): 0-based subject ID to load
name (str) : Name of task
condition (str) : Name of condition
dir (str) : data directory
Returns
evs (list of dicts): One dict per run, with the onset, duration, and
  amplitude of each event for the given condition.
"""
evs = []
for id in get_image_ids(name):
task_key = BOLD_NAMES[id - 1]
ev_file = os.path.join(dir, "subjects", str(subject), "EVs",
task_key, f"{condition}.txt")
ev_array = np.loadtxt(ev_file, ndmin=2, unpack=True)
ev = dict(zip(["onset", "duration", "amplitude"], ev_array))
evs.append(ev)
return evs
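A hedged usage sketch: condition names correspond to the EV text files shipped with each task. For the motor task these include body-part conditions such as "lf" (left foot), but verify the exact names against the files in each subject's `EVs` directory:
evs = load_evs(subject=0, name="motor", condition="lf",  # condition name assumed
               dir=os.path.join(HCP_DIR, "hcp_task"))
print(len(evs))         # one dict per run
print(evs[0]["onset"])  # trial onsets in seconds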
def condition_frames(run_evs, skip=0):
"""Identify timepoints corresponding to a given condition in each run.
Args:
run_evs (list of dicts) : Onset and duration of the event, per run
skip (int) : Ignore this many frames at the start of each trial, to account
for hemodynamic lag
Returns:
frames_list (list of 1D arrays): Flat arrays of frame indices, per run
"""
frames_list = []
for ev in run_evs:
# Determine when trial starts, rounded down
start = np.floor(ev["onset"] / TR).astype(int)
# Use trial duration to determine how many frames to include for trial
duration = np.ceil(ev["duration"] / TR).astype(int)
# Take the range of frames that correspond to this specific trial
frames = [s + np.arange(skip, d) for s, d in zip(start, duration)]
frames_list.append(np.concatenate(frames))
return frames_list
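To make the rounding concrete: with TR = 0.72, a trial with onset 12.0 s and duration 12.0 s starts at frame floor(12.0 / 0.72) = 16 and spans ceil(12.0 / 0.72) = 17 frames, so with `skip=0` it contributes frames 16 through 32.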
def selective_average(timeseries_data, ev, skip=0):
"""Take the temporal mean across frames for a given condition.
Args:
timeseries_data (array or list of arrays): n_parcel x n_tp arrays
ev (dict or list of dicts): Condition timing information
skip (int) : Ignore this many frames at the start of each trial, to account
for hemodynamic lag
Returns:
avg_data (1D array): Data averaged across selected image frames based
on condition timing
"""
# Ensure that we have lists of the same length
if not isinstance(timeseries_data, list):
timeseries_data = [timeseries_data]
if not isinstance(ev, list):
ev = [ev]
if len(timeseries_data) != len(ev):
raise ValueError("Length of `timeseries_data` and `ev` must match.")
# Identify the indices of relevant frames
frames = condition_frames(ev, skip)
# Select the frames from each image
selected_data = []
for run_data, run_frames in zip(timeseries_data, frames):
run_frames = run_frames[run_frames < run_data.shape[1]]
selected_data.append(run_data[:, run_frames])
# Take the average in each parcel
avg_data = np.concatenate(selected_data, axis=-1).mean(axis=-1)
return avg_data
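Putting the helpers together, here is a minimal sketch of the intended workflow (the "lf" condition name is an assumption, as above): load a task timeseries with runs kept separate, load the matching EVs, and average the frames for that condition:
task_dir = os.path.join(HCP_DIR, "hcp_task")
ts_motor = load_timeseries(subject=0, name="motor", dir=task_dir,
                           concat=False)  # keep runs separate for selective_average
evs = load_evs(subject=0, name="motor", condition="lf", dir=task_dir)
avg = selective_average(ts_motor, evs, skip=2)  # skip 2 frames (~1.4 s) for HRF lag
print(avg.shape)  # (360,): one value per parcel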
Load a single run of resting-state data:
help(load_timeseries)
Help on function load_timeseries in module __main__:

load_timeseries(subject, name, dir, runs=None, concat=True, remove_mean=True)
    Load timeseries data for a single subject.
    Args:
      subject (int): 0-based subject ID to load
      name (str) : Name of experiment ("rest" or name of task) to load
      dir (str) : data directory
      runs (None or int or list of ints): 0-based run(s) of the task to load,
        or None to load all runs.
      concat (bool) : If True, concatenate multiple runs in time
      remove_mean (bool) : If True, subtract the parcel-wise mean
    Returns
      ts (n_parcel x n_tp array): Array of BOLD data values
timeseries = load_timeseries(subject=0,
name="rest",
dir=os.path.join(HCP_DIR, "hcp_rest"),
runs=1)
print(timeseries.shape) # n_parcel x n_timepoint
(360, 1200)
Load a concatenated resting-state timeseries (using all runs' data) for each subject:
timeseries_rest = []
for subject in subjects:
ts_concat = load_timeseries(subject, name="rest",
dir=os.path.join(HCP_DIR, "hcp_rest"))
timeseries_rest.append(ts_concat)
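Each resting run has 1200 timepoints (see the single-run shape above), so the four concatenated runs should give a 360 x 4800 array per subject:
print(timeseries_rest[0].shape)  # (360, 4800)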
Generate a correlation matrix (showing "functional connectivity" or FC) for each subject and plot the group average:
fc = np.zeros((N_SUBJECTS, N_PARCELS, N_PARCELS))
for sub, ts in enumerate(timeseries_rest):
fc[sub] = np.corrcoef(ts)
group_fc = fc.mean(axis=0)
plt.figure()
plt.imshow(group_fc, interpolation="none", cmap="bwr", vmin=-1, vmax=1)
plt.colorbar()
plt.show()
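With `region_info` in hand, one natural next step is to summarize FC by network; here is a minimal sketch (using only names defined above) that compares mean within-network and between-network correlations:
network = np.asarray(region_info["network"])
same_net = network[:, None] == network[None, :]  # mask of same-network parcel pairs
off_diag = ~np.eye(N_PARCELS, dtype=bool)        # drop self-correlations
within = group_fc[same_net & off_diag].mean()
between = group_fc[~same_net].mean()
print(f"within-network FC: {within:.3f}, between-network FC: {between:.3f}")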
Plot the profile of FC values between a particular "seed" parcel and every parcel in the dataset, separated by hemisphere:
seed_roi = "R_FEF" # name of seed parcel
ind = region_info["name"].index(seed_roi)
hemi_fcs = np.split(group_fc, 2)
# Plot the FC profile across the right and left hemisphere target regions
plt.figure()
for i, hemi_fc in enumerate(hemi_fcs):
plt.plot(hemi_fc[:, ind], label=f"{HEMIS[i]} hemisphere")
plt.title(f"FC for region {seed_roi}")
plt.xlabel("Target region")
plt.ylabel("Correlation (FC)")
plt.legend()
plt.show()
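If the atlas archive does provide per-parcel MNI coordinates (see the key check earlier), the same seed profile can also be sketched on a glass brain with nilearn's `plot_markers`, coloring each parcel by its FC with the seed; the `coords` key remains an assumption:
if "coords" in atlas:  # assumed key; verify against atlas.keys()
    plotting.plot_markers(group_fc[:, ind], atlas["coords"],
                          node_cmap="bwr", node_vmin=-1, node_vmax=1)
    plt.show()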