To ensure that our cell type labels are as accurate as possible, we'll subset our dataset based on our AIFI_L2
labels for review.
Previously, we split our dataset up into 8 subsets based on cohort, biological sex, and CMV status. Here, we'll combine these per L2 cell type for less abundant cell types to simplify review. For very abundant cell types (e.g. Naive CD4 T cells), we'll keep them separated into the 8 groups and review each.
This should reduce the burden of trying to cluster >2M cells (which can be very slow) without significantly reducing our power to identify doublets or mislabeled cells, as we'll still have >100k cells for these large classes.
In a later step, we'll generate rules to filter these clustered subsets of cells to help us identify doublets, contaminated clusters (e.g. clusters with erythrocyte content), and mislabeled cells.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
from datetime import date
import hisepy
import os
import pandas as pd
import re
import scanpy as sc
import scanpy.external as sce
# Directory for output files written by this notebook;
# exist_ok makes this safe to re-run.
out_dir = 'output'
os.makedirs(out_dir, exist_ok=True)

# Sample-group selectors for this run of the notebook
cohort = 'BR2'
subject_sex = 'Female'
These helper functions make it a bit simpler to cache and read in files from HISE.
def cache_uuid_path(uuid):
    """Download a HISE file by UUID into the local cache (if not already
    present) and return the path to the cached file.

    Parameters
    ----------
    uuid : str
        HISE file UUID.

    Returns
    -------
    str
        Path to the single cached file for this UUID.
    """
    cache_path = '/home/jupyter/cache/{u}'.format(u = uuid)
    if not os.path.isdir(cache_path):
        # Only contact HISE when the file isn't cached yet. Caching is a
        # side effect; the return value isn't needed (previously bound to
        # an unused local).
        hisepy.reader.cache_files([uuid])
    # Each UUID's cache directory is expected to hold exactly one file
    filename = os.listdir(cache_path)[0]
    cache_file = '{p}/{f}'.format(p = cache_path, f = filename)
    return cache_file
def read_parquet_uuid(uuid):
    """Cache (if needed) and read the HISE parquet file with this UUID
    into a pandas DataFrame."""
    return pd.read_parquet(cache_uuid_path(uuid))
def sort_adata_uuid(uuid, sort_cols = None):
    """Sort the cells of a cached .h5ad file by the given obs columns and
    overwrite the cached copy in place.

    Sorting by AIFI_L2 places cells of the same type next to each other in
    the sparse matrix, which makes later per-type reads much faster.

    Parameters
    ----------
    uuid : str
        HISE file UUID of the .h5ad file.
    sort_cols : list of str, optional
        obs columns to sort by; defaults to
        ['AIFI_L2', 'sample.sampleKitGuid']. A None sentinel is used
        instead of a mutable (list) default argument.
    """
    if sort_cols is None:
        sort_cols = ['AIFI_L2', 'sample.sampleKitGuid']
    cache_file = cache_uuid_path(uuid)
    adata = sc.read_h5ad(cache_file)
    obs = adata.obs
    obs = obs.sort_values(sort_cols)
    # Reindex the AnnData object into sorted order, then overwrite the cache
    adata = adata[obs.index]
    adata.write_h5ad(cache_file)
This function will enable us to connect to our .h5ad files without loading the entire thing into memory. We'll then load only the cells that we want for each cell class to assemble them for writing. This should save us some overhead as we do our subsetting.
def read_adata_backed_uuid(uuid):
    """Open the cached .h5ad file for this UUID in backed ('r') mode so
    the expression matrix stays on disk until cells are requested."""
    return sc.read_h5ad(cache_uuid_path(uuid), backed = 'r')
This function will apply a standard normalization, nearest neighbors, clustering, and UMAP process to our cell subsets:
def process_adata(adata):
    """Apply the standard scanpy pipeline to a cell subset: normalization,
    log transform, HVG selection, scaling, PCA, nearest neighbors, Leiden
    clustering, and UMAP.

    Parameters
    ----------
    adata : anndata.AnnData
        In-memory AnnData object; .X is assumed to hold raw counts
        (TODO confirm against upstream files).

    Returns
    -------
    anndata.AnnData
        New object restricted to highly variable genes, with the input
        preserved in .raw, clusters in .obs['leiden_2'], and the UMAP
        embedding available for plotting.
    """
    # Keep a copy of the raw data
    adata.raw = adata
    print('Normalizing')
    # Normalize and log transform
    sc.pp.normalize_total(adata)
    sc.pp.log1p(adata)
    print('Finding HVGs')
    # Restrict downstream steps to variable genes
    sc.pp.highly_variable_genes(adata)
    # .copy() materializes the subset so later in-place ops don't hit a view
    adata = adata[:, adata.var_names[adata.var['highly_variable']]].copy()
    print(adata.shape)
    print('Scaling')
    # Scale variable genes
    sc.pp.scale(adata)
    print('PCA')
    # Run PCA
    sc.tl.pca(adata, svd_solver = 'arpack')
    print('Neighbors')
    # Find nearest neighbors
    sc.pp.neighbors(
        adata,
        n_neighbors = 50,
        n_pcs = 30
    )
    print('Leiden')
    # Find clusters; resolution 2 gives fine-grained clusters for QC review
    sc.tl.leiden(
        adata,
        resolution = 2,
        key_added = 'leiden_2',
        n_iterations = 2
    )
    print('UMAP')
    # Run UMAP; small min_dist packs related cells tightly
    sc.tl.umap(adata, min_dist = 0.05)
    return adata
def format_cell_type(cell_type):
    """Make a cell type label filename-safe.

    Replaces '+' with 'pos', '-' with 'neg', and spaces with underscores.
    These are literal, fixed substitutions, so plain str.replace is used
    instead of re.sub (which also required escaping '+').

    Parameters
    ----------
    cell_type : str
        Cell type label, e.g. 'CD8+ T cell'.

    Returns
    -------
    str
        Filename-safe label, e.g. 'CD8pos_T_cell'.
    """
    return (cell_type
            .replace('+', 'pos')
            .replace('-', 'neg')
            .replace(' ', '_'))
def element_id(n = 3):
    """Generate a random, human-readable ID by joining n random chemical
    element names with hyphens, e.g. 'chromium-meitnerium-europium'.

    Parameters
    ----------
    n : int, default 3
        Number of element names to join.

    Returns
    -------
    str
        Hyphen-joined element names.
    """
    import periodictable
    from random import randrange

    rand_el = []
    for _ in range(n):
        # Atomic numbers run 1..118. periodictable.elements[0] is the
        # neutron (not an element), so draw from 1..118 inclusive rather
        # than the previous randrange(0, 118), which included index 0 and
        # excluded element 118.
        el = randrange(1, 119)
        rand_el.append(periodictable.elements[el].name)
    return '-'.join(rand_el)
# Random element ID produced by the previous notebook; used below to find
# the files that notebook uploaded to the project store.
search_id = 'chromium-meitnerium-europium'
Retrieve files stored in our HISE project store
# List all files in our HISE project store, keeping only UUID and filename
ps_df = hisepy.list_files_in_project_store('cohorts')[['id', 'name']]
Filter for files from the previous notebook using our search_id
# Files from the previous notebook carry search_id in their name
search_df = ps_df[ps_df['name'].str.contains(search_id, regex = False)]
search_df = search_df.sort_values('name')

# Map sample group names (cohort_sex_CMV) to parquet file UUIDs.
# regex = False makes '.' match literally instead of as a regex wildcard.
pq_df = search_df[search_df['name'].str.contains('.parquet', regex = False)]
pq_uuids = {}
# zip the columns directly rather than calling .tolist() on every iteration
for uuid, fn in zip(pq_df['id'], pq_df['name']):
    # Strip everything through '_PBMC_' and the '_qc...' suffix to get the
    # group name, e.g. 'BR1_Female_Negative'
    group_name = re.sub('.+_PBMC_', '', fn)
    group_name = re.sub('_qc.+', '', group_name)
    pq_uuids[group_name] = uuid
pq_uuids
{'BR1_Female_Negative': 'a0ce94a0-de3f-46a1-99bc-3acf24dae0ea', 'BR1_Female_Positive': 'd0b5c470-2481-455f-9b8e-b6ca28ea4bbe', 'BR1_Male_Negative': 'd9cffce7-4018-43b8-afea-9176b1b54532', 'BR1_Male_Positive': '28a22e8e-0aca-45db-9e28-19907d297a38', 'BR2_Female_Negative': 'ee89cb24-7439-486c-8f50-983895e6d4b7', 'BR2_Female_Positive': 'bb47f341-d036-4054-8783-0a66b44e5c08', 'BR2_Male_Negative': '6331f471-6bc3-49a8-9b4b-d83c7dd95320', 'BR2_Male_Positive': 'bdaef83d-76da-4bc7-acc6-5db2006908fa'}
# Read the QC metadata parquet for every sample group
meta_dict = {
    group_name: read_parquet_uuid(uuid)
    for group_name, uuid in pq_uuids.items()
}
downloading fileID: a0ce94a0-de3f-46a1-99bc-3acf24dae0ea Files have been successfully downloaded! downloading fileID: d0b5c470-2481-455f-9b8e-b6ca28ea4bbe Files have been successfully downloaded! downloading fileID: d9cffce7-4018-43b8-afea-9176b1b54532 Files have been successfully downloaded! downloading fileID: 28a22e8e-0aca-45db-9e28-19907d297a38 Files have been successfully downloaded! downloading fileID: ee89cb24-7439-486c-8f50-983895e6d4b7 Files have been successfully downloaded! downloading fileID: bb47f341-d036-4054-8783-0a66b44e5c08 Files have been successfully downloaded! downloading fileID: 6331f471-6bc3-49a8-9b4b-d83c7dd95320 Files have been successfully downloaded! downloading fileID: bdaef83d-76da-4bc7-acc6-5db2006908fa Files have been successfully downloaded!
# Count cells per AIFI_L2 label within each sample group,
# then sum the per-group Series to get totals across all groups
l2_counts = {
    group_name: meta['AIFI_L2'].value_counts()
    for group_name, meta in meta_dict.items()
}
total_l2_counts = sum(l2_counts.values())
total_l2_counts
AIFI_L2 ASDC 4008 CD14 monocyte 2268111 CD16 monocyte 405852 CD56bright NK cell 96063 CD56dim NK cell 1123560 CD8aa 17893 DN T cell 17648 Effector B cell 86541 Erythrocyte 28362 ILC 5734 Intermediate monocyte 76160 MAIT 381960 Memory B cell 389804 Memory CD4 T cell 2812203 Memory CD8 T cell 1539618 Naive B cell 725113 Naive CD4 T cell 2986280 Naive CD8 T cell 833785 Plasma cell 16569 Platelet 72385 Progenitor cell 12457 Proliferating NK cell 20542 Proliferating T cell 18632 Transitional B cell 79851 Treg 308864 cDC1 8142 cDC2 113465 gdT 318356 pDC 62064 Name: count, dtype: int64
These L2 labels have > 1M cells each. We'll keep them separated per sample group.
# L2 labels with more than 1M total cells stay separated per sample group
large_types = total_l2_counts[total_l2_counts > 1e6].index
large_types.tolist()
['CD14 monocyte', 'CD56dim NK cell', 'Memory CD4 T cell', 'Memory CD8 T cell', 'Naive CD4 T cell']
Sorting the .h5ad files by AIFI_L2 will make reading each cell type much faster by placing cells of the same type next to each other in the sparse matrix.
# Select the .h5ad files for our chosen cohort and sex.
# regex = False makes '.' (and any other metacharacters) match literally.
h5ad_df = search_df[search_df['name'].str.contains('.h5ad', regex = False)]
h5ad_df = h5ad_df[h5ad_df['name'].str.contains(cohort, regex = False)]
h5ad_df = h5ad_df[h5ad_df['name'].str.contains(subject_sex, regex = False)]

# Map sample group names to .h5ad file UUIDs, mirroring the parquet mapping.
# zip the columns directly rather than calling .tolist() on every iteration.
h5ad_uuids = {}
for uuid, fn in zip(h5ad_df['id'], h5ad_df['name']):
    group_name = re.sub('.+_PBMC_', '', fn)
    group_name = re.sub('_qc.+', '', group_name)
    h5ad_uuids[group_name] = uuid
h5ad_uuids
{'BR2_Female_Negative': '74755a15-f0a1-411e-b071-7fd802b60e80', 'BR2_Female_Positive': '277489fd-12fe-48ed-b3a0-7494858fa512'}
# Sort each cached .h5ad file on disk so cells of the same L2 type
# are stored contiguously (fast per-type reads later)
for h5ad_uuid in h5ad_uuids.values():
    sort_adata_uuid(h5ad_uuid, sort_cols = ['AIFI_L2', 'sample.sampleKitGuid'])
downloading fileID: 74755a15-f0a1-411e-b071-7fd802b60e80 Files have been successfully downloaded! downloading fileID: 277489fd-12fe-48ed-b3a0-7494858fa512 Files have been successfully downloaded!
Now that they're sorted, we can open these files with on-disk backing so we don't have to read the entire file at once.
# Open a backed (on-disk) connection to each sorted .h5ad file
h5ad_conn = {
    group_name: read_adata_backed_uuid(uuid)
    for group_name, uuid in h5ad_uuids.items()
}

# This per-group notebook only processes the large cell types
l2_types = large_types
def read_l2_type(adata, cell_type):
    """Select all cells carrying the given AIFI_L2 label from a backed
    AnnData object and load just that subset into memory."""
    mask = adata.obs['AIFI_L2'] == cell_type
    return adata[mask].to_memory()
# Subset, process, and save each large L2 cell type, one type at a time.
# Processed .h5ad paths are collected in out_files for the upload step below.
out_files = []
for cell_type in l2_types:
    print(cell_type)
    # Read data from each group for this type in parallel
    print('Loading data')
    type_adata_dict = {}
    with ThreadPoolExecutor(max_workers = 8) as executor:
        # One future per sample group; map each future back to its group name
        futures = {
            executor.submit(
                read_l2_type,
                h5ad_conn[group_name],
                cell_type): group_name
            for group_name in h5ad_conn.keys()
        }
        for future in concurrent.futures.as_completed(futures):
            future_group = futures[future]
            type_adata_dict[future_group] = future.result()
    # NOTE(review): l2_types is assigned from large_types above, so this
    # check is currently always True; it presumably leaves room for small
    # types to be combined across groups if l2_types is expanded — confirm.
    if cell_type in large_types:
        # If large, process and save separately
        for group_name, type_adata in type_adata_dict.items():
            print('Processing {g}'.format(g = group_name))
            print(type_adata.shape)
            # Normalize, cluster, and embed this subset (see process_adata)
            type_adata = process_adata(type_adata)
            print('Saving processed data')
            # Build an output name from the group, safe type label, and date
            out_type = format_cell_type(cell_type)
            out_file = 'output/diha_qc_{g}_{c}_{d}.h5ad'.format(
                g = group_name,
                c = out_type,
                d = date.today()
            )
            type_adata.write_h5ad(out_file)
            out_files.append(out_file)
CD14 monocyte Loading data
/opt/conda/lib/python3.10/site-packages/anndata/_core/anndata.py:1179: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_sub[k] = df_sub[k].cat.remove_unused_categories()
Processing BR2_Female_Negative (219053, 33538) Normalizing Finding HVGs (219053, 869) Scaling PCA Neighbors Leiden UMAP Saving processed data Processing BR2_Female_Positive (399151, 33538) Normalizing Finding HVGs (399151, 812) Scaling PCA Neighbors Leiden UMAP Saving processed data CD56dim NK cell Loading data Processing BR2_Female_Negative (129551, 33538) Normalizing Finding HVGs (129551, 1000) Scaling PCA Neighbors Leiden UMAP Saving processed data Processing BR2_Female_Positive (163751, 33538) Normalizing Finding HVGs (163751, 965) Scaling PCA Neighbors Leiden UMAP Saving processed data Memory CD4 T cell Loading data Processing BR2_Female_Negative (307239, 33538) Normalizing Finding HVGs (307239, 1241) Scaling PCA Neighbors Leiden UMAP Saving processed data Processing BR2_Female_Positive (593363, 33538) Normalizing Finding HVGs (593363, 1324) Scaling PCA Neighbors Leiden UMAP Saving processed data Memory CD8 T cell Loading data Processing BR2_Female_Negative (81669, 33538) Normalizing Finding HVGs (81669, 1259) Scaling PCA Neighbors Leiden UMAP Saving processed data Processing BR2_Female_Positive (413233, 33538) Normalizing Finding HVGs (413233, 1049) Scaling PCA Neighbors Leiden UMAP Saving processed data Naive CD4 T cell Loading data Processing BR2_Female_Negative (350194, 33538) Normalizing Finding HVGs (350194, 514) Scaling PCA Neighbors Leiden UMAP Saving processed data Processing BR2_Female_Positive (571529, 33538) Normalizing Finding HVGs (571529, 465) Scaling PCA Neighbors Leiden UMAP Saving processed data
Finally, we'll use hisepy.upload.upload_files()
to send a copy of our output to HISE to use for downstream analysis steps.
# HISE study space that will receive the uploaded files
study_space_uuid = 'de025812-5e73-4b3c-9c3b-6d0eac412f2a'
# Human-readable title describing this batch of outputs
title = 'DIHA PBMC L2 Pre-cleanup {c} {s} {d}'.format(
    c = cohort, s = subject_sex, d = date.today())
# Generate a fresh random ID so the next notebook can locate these outputs
search_id = element_id()
search_id
'iodine-titanium-livermorium'
in_files = list(h5ad_uuids.values()) + list(pq_uuids.values())
in_files
['74755a15-f0a1-411e-b071-7fd802b60e80', '277489fd-12fe-48ed-b3a0-7494858fa512', 'a0ce94a0-de3f-46a1-99bc-3acf24dae0ea', 'd0b5c470-2481-455f-9b8e-b6ca28ea4bbe', 'd9cffce7-4018-43b8-afea-9176b1b54532', '28a22e8e-0aca-45db-9e28-19907d297a38', 'ee89cb24-7439-486c-8f50-983895e6d4b7', 'bb47f341-d036-4054-8783-0a66b44e5c08', '6331f471-6bc3-49a8-9b4b-d83c7dd95320', 'bdaef83d-76da-4bc7-acc6-5db2006908fa']
# Display the list of processed files we'll upload
out_files
['output/diha_qc_BR2_Female_Negative_CD14_monocyte_2024-04-20.h5ad', 'output/diha_qc_BR2_Female_Positive_CD14_monocyte_2024-04-20.h5ad', 'output/diha_qc_BR2_Female_Negative_CD56dim_NK_cell_2024-04-20.h5ad', 'output/diha_qc_BR2_Female_Positive_CD56dim_NK_cell_2024-04-20.h5ad', 'output/diha_qc_BR2_Female_Negative_Memory_CD4_T_cell_2024-04-20.h5ad', 'output/diha_qc_BR2_Female_Positive_Memory_CD4_T_cell_2024-04-20.h5ad', 'output/diha_qc_BR2_Female_Negative_Memory_CD8_T_cell_2024-04-20.h5ad', 'output/diha_qc_BR2_Female_Positive_Memory_CD8_T_cell_2024-04-20.h5ad', 'output/diha_qc_BR2_Female_Negative_Naive_CD4_T_cell_2024-04-20.h5ad', 'output/diha_qc_BR2_Female_Positive_Naive_CD4_T_cell_2024-04-20.h5ad']
# Send the processed files to HISE, linking the input file UUIDs for
# provenance. This call prompts for confirmation before uploading.
hisepy.upload.upload_files(
    files = out_files,
    study_space_id = study_space_uuid,
    title = title,
    input_file_ids = in_files,
    destination = search_id
)
you are trying to upload file_ids... ['output/diha_qc_BR2_Female_Negative_CD14_monocyte_2024-04-20.h5ad', 'output/diha_qc_BR2_Female_Positive_CD14_monocyte_2024-04-20.h5ad', 'output/diha_qc_BR2_Female_Negative_CD56dim_NK_cell_2024-04-20.h5ad', 'output/diha_qc_BR2_Female_Positive_CD56dim_NK_cell_2024-04-20.h5ad', 'output/diha_qc_BR2_Female_Negative_Memory_CD4_T_cell_2024-04-20.h5ad', 'output/diha_qc_BR2_Female_Positive_Memory_CD4_T_cell_2024-04-20.h5ad', 'output/diha_qc_BR2_Female_Negative_Memory_CD8_T_cell_2024-04-20.h5ad', 'output/diha_qc_BR2_Female_Positive_Memory_CD8_T_cell_2024-04-20.h5ad', 'output/diha_qc_BR2_Female_Negative_Naive_CD4_T_cell_2024-04-20.h5ad', 'output/diha_qc_BR2_Female_Positive_Naive_CD4_T_cell_2024-04-20.h5ad']. Do you truly want to proceed?
{'trace_id': 'b7c64bad-c906-45b2-a17e-d59df16eb69c', 'files': ['output/diha_qc_BR2_Female_Negative_CD14_monocyte_2024-04-20.h5ad', 'output/diha_qc_BR2_Female_Positive_CD14_monocyte_2024-04-20.h5ad', 'output/diha_qc_BR2_Female_Negative_CD56dim_NK_cell_2024-04-20.h5ad', 'output/diha_qc_BR2_Female_Positive_CD56dim_NK_cell_2024-04-20.h5ad', 'output/diha_qc_BR2_Female_Negative_Memory_CD4_T_cell_2024-04-20.h5ad', 'output/diha_qc_BR2_Female_Positive_Memory_CD4_T_cell_2024-04-20.h5ad', 'output/diha_qc_BR2_Female_Negative_Memory_CD8_T_cell_2024-04-20.h5ad', 'output/diha_qc_BR2_Female_Positive_Memory_CD8_T_cell_2024-04-20.h5ad', 'output/diha_qc_BR2_Female_Negative_Naive_CD4_T_cell_2024-04-20.h5ad', 'output/diha_qc_BR2_Female_Positive_Naive_CD4_T_cell_2024-04-20.h5ad']}
# Record package versions for reproducibility
import session_info
session_info.show()
----- anndata 0.10.3 hisepy 0.3.0 pandas 2.1.4 scanpy 1.9.6 session_info 1.0.0 -----
PIL 10.0.1 anyio NA arrow 1.3.0 asttokens NA attr 23.2.0 attrs 23.2.0 babel 2.14.0 beatrix_jupyterlab NA brotli NA cachetools 5.3.1 certifi 2024.02.02 cffi 1.16.0 charset_normalizer 3.3.2 cloudpickle 2.2.1 colorama 0.4.6 comm 0.1.4 cryptography 41.0.7 cycler 0.10.0 cython_runtime NA dateutil 2.8.2 db_dtypes 1.1.1 debugpy 1.8.0 decorator 5.1.1 defusedxml 0.7.1 deprecated 1.2.14 exceptiongroup 1.2.0 executing 2.0.1 fastjsonschema NA fqdn NA google NA greenlet 2.0.2 grpc 1.58.0 grpc_status NA h5py 3.10.0 idna 3.6 igraph 0.10.8 importlib_metadata NA ipykernel 6.28.0 ipython_genutils 0.2.0 ipywidgets 8.1.1 isoduration NA jedi 0.19.1 jinja2 3.1.2 joblib 1.3.2 json5 NA jsonpointer 2.4 jsonschema 4.20.0 jsonschema_specifications NA jupyter_events 0.9.0 jupyter_server 2.12.1 jupyterlab_server 2.25.2 jwt 2.8.0 kiwisolver 1.4.5 leidenalg 0.10.1 llvmlite 0.41.0 lz4 4.3.2 markupsafe 2.1.3 matplotlib 3.8.0 matplotlib_inline 0.1.6 mpl_toolkits NA mpmath 1.3.0 natsort 8.4.0 nbformat 5.9.2 numba 0.58.0 numpy 1.24.0 opentelemetry NA overrides NA packaging 23.2 parso 0.8.3 periodictable 1.5.2 pexpect 4.8.0 pickleshare 0.7.5 pkg_resources NA platformdirs 4.1.0 plotly 5.18.0 prettytable 3.9.0 prometheus_client NA prompt_toolkit 3.0.42 proto NA psutil NA ptyprocess 0.7.0 pure_eval 0.2.2 pyarrow 13.0.0 pycparser 2.21 pydev_ipython NA pydevconsole NA pydevd 2.9.5 pydevd_file_utils NA pydevd_plugins NA pydevd_tracing NA pygments 2.17.2 pynndescent 0.5.11 pynvml NA pyparsing 3.1.1 pyreadr 0.5.0 pythonjsonlogger NA pytz 2023.3.post1 referencing NA requests 2.31.0 rfc3339_validator 0.1.4 rfc3986_validator 0.1.1 rpds NA scipy 1.11.4 send2trash NA shapely 1.8.5.post1 six 1.16.0 sklearn 1.3.2 sniffio 1.3.0 socks 1.7.1 sparse 0.14.0 sql NA sqlalchemy 2.0.21 sqlparse 0.4.4 stack_data 0.6.2 sympy 1.12 termcolor NA texttable 1.7.0 threadpoolctl 3.2.0 torch 2.1.2+cu121 torchgen NA tornado 6.3.3 tqdm 4.66.1 traitlets 5.9.0 typing_extensions NA umap 0.5.5 uri_template NA urllib3 1.26.18 wcwidth 0.2.12 
webcolors 1.13 websocket 1.7.0 wrapt 1.15.0 xarray 2023.12.0 yaml 6.0.1 zipp NA zmq 25.1.2 zoneinfo NA
----- IPython 8.19.0 jupyter_client 8.6.0 jupyter_core 5.6.1 jupyterlab 4.1.5 notebook 6.5.4 ----- Python 3.10.13 | packaged by conda-forge | (main, Dec 23 2023, 15:36:39) [GCC 12.3.0] Linux-5.15.0-1042-gcp-x86_64-with-glibc2.31 ----- Session information updated at 2024-04-20 19:51