For use in plotting, we'll compute "pseudobulk" summary statistics for gene expression based on cell metadata grouping.
Note that this approach doesn't use the careful pseudobulk approaches implemented in packages like scran
- we're instead taking the fraction of cells in which gene expression was detected (pct_exp
) and the median of expression in those non-zero cells (median_expression
).
For use in our visualization tools, we'll group by each of our cell type labeling levels (AIFI_L1, _L2, and _L3), as well as by both cell type and the originating, age-related cohort for these cells.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)
from datetime import date
import hisepy
import numpy as np
import os
import pandas as pd
import pickle
import re
import scanpy as sc
import scipy.sparse as scs
if not os.path.exists('output'):
os.mkdir('output')
out_files = []
def read_adata_uuid(h5ad_uuid):
h5ad_path = '/home/jupyter/cache/{u}'.format(u = h5ad_uuid)
if not os.path.isdir(h5ad_path):
hise_res = hisepy.cache_files([h5ad_uuid])
h5ad_filename = os.listdir(h5ad_path)[0]
h5ad_file = '{p}/{f}'.format(p = h5ad_path, f = h5ad_filename)
adata = sc.read_h5ad(h5ad_file)
return adata
def sparse_nz_median(sparse_mat, feat_names):
# transpose the matrix so values for each gene are together in memory
sparse_mat = sparse_mat.transpose().tocsr()
# get dimensions for calculations
n_cells = sparse_mat.shape[1]
# compute the fraction non-zero per gene (row = axis 1)
pct_exp = sparse_mat.getnnz(axis = 1) / n_cells
# split non-zero values from the matrix directly
split_values = np.split(sparse_mat.data, sparse_mat.indptr)
# remove first and last entry in split - indptr starts with 0 and ends with the last value
# so the first and last split entries are not meaningful
split_values = split_values[1:-1].copy()
# compute medians for every split
median_expression = [np.median(x) for x in split_values]
# assemble a DataFrame with results
res_df = pd.DataFrame({
'gene': feat_names,
'pct_exp': pct_exp,
'median_expression': median_expression
})
# replace missing values from splits without any values
# these were not detected so median expression is 0
res_df['median_expression'] = res_df['median_expression'].fillna(0)
return res_df
h5ad_uuid = '5b3a0cc8-1811-4126-90c7-e9cdd41459fd'
adata = read_adata_uuid(h5ad_uuid)
adata.shape
(1823666, 33538)
obs = adata.obs
For our UMAP viewer, we plot plot expression at each level of cell type resolution
levels = ['AIFI_L1', 'AIFI_L2', 'AIFI_L3']
%%time
level_results = {}
for level in levels:
print('>>> {l}'.format(l = level))
level_obs = obs.groupby(level)
level_dict = {}
for cell_type, type_obs in level_obs:
type_bc = type_obs['barcodes']
type_adata = adata[adata.obs['barcodes'].isin(type_bc)]
type_res = sparse_nz_median(type_adata.X, type_adata.var_names.tolist())
level_dict[cell_type] = type_res
level_results[level] = level_dict
>>> AIFI_L1 >>> AIFI_L2 >>> AIFI_L3 CPU times: user 7min 2s, sys: 1min 29s, total: 8min 31s Wall time: 8min 31s
level_combined = {}
for level in levels:
all_level = pd.concat(level_results[level])
all_level = all_level.reset_index(drop = False, names = [level, 'row_num'])
all_level = all_level.drop('row_num', axis = 1)
level_combined[level] = all_level
for level, df in level_combined.items():
out_csv = 'output/pbmc-ref_{l}_nz-pct-median_pseudobulk_{d}.csv'.format(l = level, d = date.today())
df.to_csv(out_csv)
out_files.append(out_csv)
out_parquet = 'output/pbmc-ref_{l}_nz-pct-median_pseudobulk_{d}.parquet'.format(l = level, d = date.today())
df.to_parquet(out_parquet)
out_files.append(out_parquet)
gene_results = {}
for level in levels:
all_res = pd.concat(level_results[level])
split_res = all_res.groupby('gene')
split_dict = {}
for gene, df in split_res:
df = df.reset_index(drop = False)
df = df.drop('level_1', axis = 1)
df = df.rename({'level_0': 'obs_group'}, axis = 1)
split_dict[gene] = df
gene_results[level] = split_dict
Check that a gene or two look accurate
gene_results['AIFI_L1']['CD79A']
obs_group | gene | pct_exp | median_expression | |
---|---|---|---|---|
0 | B cell | CD79A | 0.988365 | 2.726919 |
1 | DC | CD79A | 0.079959 | 0.725910 |
2 | Erythrocyte | CD79A | 0.013926 | 1.230366 |
3 | ILC | CD79A | 0.120853 | 1.526564 |
4 | Monocyte | CD79A | 0.017343 | 0.818575 |
5 | NK cell | CD79A | 0.019809 | 1.260133 |
6 | Platelet | CD79A | 0.011135 | 2.423751 |
7 | Progenitor cell | CD79A | 0.150721 | 0.677979 |
8 | T cell | CD79A | 0.034470 | 1.093967 |
gene_results['AIFI_L2']['FCN1']
obs_group | gene | pct_exp | median_expression | |
---|---|---|---|---|
0 | ASDC | FCN1 | 0.055556 | 0.595097 |
1 | CD8aa | FCN1 | 0.035210 | 1.298121 |
2 | CD14 monocyte | FCN1 | 0.996254 | 3.177890 |
3 | CD16 monocyte | FCN1 | 0.933907 | 2.601019 |
4 | CD56bright NK cell | FCN1 | 0.048847 | 1.143188 |
5 | CD56dim NK cell | FCN1 | 0.049148 | 1.298217 |
6 | DN T cell | FCN1 | 0.054066 | 1.043778 |
7 | Effector B cell | FCN1 | 0.052255 | 0.973340 |
8 | Erythrocyte | FCN1 | 0.070955 | 1.265223 |
9 | ILC | FCN1 | 0.046209 | 1.412349 |
10 | Intermediate monocyte | FCN1 | 0.997159 | 3.025013 |
11 | MAIT | FCN1 | 0.048519 | 1.132813 |
12 | Memory B cell | FCN1 | 0.048281 | 1.045450 |
13 | Memory CD4 T cell | FCN1 | 0.049716 | 1.050017 |
14 | Memory CD8 T cell | FCN1 | 0.048406 | 1.210995 |
15 | Naive B cell | FCN1 | 0.041079 | 1.271097 |
16 | Naive CD4 T cell | FCN1 | 0.047838 | 1.111647 |
17 | Naive CD8 T cell | FCN1 | 0.044030 | 1.023475 |
18 | Plasma cell | FCN1 | 0.103208 | 0.351336 |
19 | Platelet | FCN1 | 0.053651 | 2.532171 |
20 | Progenitor cell | FCN1 | 0.053735 | 0.564411 |
21 | Proliferating NK cell | FCN1 | 0.054159 | 0.986173 |
22 | Proliferating T cell | FCN1 | 0.064655 | 0.571081 |
23 | Transitional B cell | FCN1 | 0.042692 | 1.196304 |
24 | Treg | FCN1 | 0.048686 | 1.184835 |
25 | cDC1 | FCN1 | 0.083775 | 0.453340 |
26 | cDC2 | FCN1 | 0.796488 | 2.183593 |
27 | gdT | FCN1 | 0.046415 | 1.194536 |
28 | pDC | FCN1 | 0.076183 | 0.794052 |
type_pkl = 'output/pbmc-ref_type_nz-pct-median_pseudobulk_{d}.pkl'.format(d = date.today())
with open(type_pkl, 'wb') as out_file:
pickle.dump(gene_results, out_file)
out_files.append(type_pkl)
For our Gene Expression Summary viewer, we plot plot expression at each level of cell type resolution partitioned by each cohort in the reference.
%%time
level_results = {}
for level in levels:
print('>>> {l}'.format(l = level))
level_obs = obs.groupby(['cohort.cohortGuid', level])
level_dict = {}
for group_tuple, type_obs in level_obs:
cohort = group_tuple[0]
cell_type = group_tuple[1]
c_t = '{c}_{t}'.format(c = cohort, t = cell_type)
type_bc = type_obs['barcodes']
type_adata = adata[adata.obs['barcodes'].isin(type_bc)]
type_res = sparse_nz_median(type_adata.X, type_adata.var_names.tolist())
type_res['cohort'] = [cohort] * type_res.shape[0]
level_dict[c_t] = type_res
level_results[level] = level_dict
>>> AIFI_L1 >>> AIFI_L2 >>> AIFI_L3 CPU times: user 10min 21s, sys: 1min 33s, total: 11min 55s Wall time: 11min 55s
level_combined = {}
for level in levels:
all_level = pd.concat(level_results[level])
all_level = all_level.reset_index(drop = False, names = ['group', 'row_num'])
all_level[level] = [re.sub('.+_', '', x) for x in all_level['group']]
all_level = all_level.drop(['group','row_num'], axis = 1)
all_level = all_level[['cohort', level, 'gene', 'pct_exp', 'median_expression']]
level_combined[level] = all_level
for level, df in level_combined.items():
out_csv = 'output/pbmc-ref_cohort-{l}_nz-pct-median_pseudobulk_{d}.csv'.format(l = level, d = date.today())
df.to_csv(out_csv)
out_files.append(out_csv)
out_parquet = 'output/pbmc-ref_cohort-{l}_nz-pct-median_pseudobulk_{d}.parquet'.format(l = level, d = date.today())
df.to_parquet(out_parquet)
out_files.append(out_parquet)
gene_results = {}
for level in levels:
all_res = pd.concat(level_results[level])
split_res = all_res.groupby('gene')
split_dict = {}
for gene, df in split_res:
df = df.reset_index(drop = False)
df = df.drop('level_1', axis = 1)
df['obs_group'] = [re.sub('.+_', '', x) for x in df['level_0']]
df = df.drop('level_0', axis = 1)
df = df[['obs_group', 'gene', 'pct_exp', 'median_expression', 'cohort']]
split_dict[gene] = df
gene_results[level] = split_dict
Check that a gene or two look accurate
gene_results['AIFI_L1']['CD79A']
obs_group | gene | pct_exp | median_expression | cohort | |
---|---|---|---|---|---|
0 | B cell | CD79A | 0.990822 | 2.746202 | BR1 |
1 | DC | CD79A | 0.087709 | 0.714458 | BR1 |
2 | Erythrocyte | CD79A | 0.020000 | 2.958432 | BR1 |
3 | ILC | CD79A | 0.119444 | 1.399105 | BR1 |
4 | Monocyte | CD79A | 0.018308 | 0.798966 | BR1 |
5 | NK cell | CD79A | 0.020623 | 1.232686 | BR1 |
6 | Platelet | CD79A | 0.011520 | 2.419073 | BR1 |
7 | Progenitor cell | CD79A | 0.157895 | 0.659168 | BR1 |
8 | T cell | CD79A | 0.041634 | 1.068730 | BR1 |
9 | B cell | CD79A | 0.988087 | 2.716038 | BR2 |
10 | DC | CD79A | 0.072828 | 0.710909 | BR2 |
11 | Erythrocyte | CD79A | 0.007937 | 1.125242 | BR2 |
12 | ILC | CD79A | 0.113139 | 1.515654 | BR2 |
13 | Monocyte | CD79A | 0.016689 | 0.844489 | BR2 |
14 | NK cell | CD79A | 0.019250 | 1.271189 | BR2 |
15 | Platelet | CD79A | 0.009580 | 2.481542 | BR2 |
16 | Progenitor cell | CD79A | 0.128664 | 0.651074 | BR2 |
17 | T cell | CD79A | 0.023565 | 1.097746 | BR2 |
18 | B cell | CD79A | 0.983492 | 2.706248 | UP1 |
19 | DC | CD79A | 0.077181 | 0.821561 | UP1 |
20 | Erythrocyte | CD79A | 0.015625 | 1.007309 | UP1 |
21 | ILC | CD79A | 0.133333 | 1.595389 | UP1 |
22 | Monocyte | CD79A | 0.015871 | 0.794617 | UP1 |
23 | NK cell | CD79A | 0.018834 | 1.324052 | UP1 |
24 | Platelet | CD79A | 0.015686 | 2.211980 | UP1 |
25 | Progenitor cell | CD79A | 0.198830 | 0.790927 | UP1 |
26 | T cell | CD79A | 0.042297 | 1.163522 | UP1 |
cohort_type_pkl = 'output/pbmc-ref_cohort-type_nz-pct-median_pseudobulk_{d}.pkl'.format(d = date.today())
with open(cohort_type_pkl, 'wb') as out_file:
pickle.dump(gene_results, out_file)
out_files.append(cohort_type_pkl)
Finally, we'll use hisepy.upload.upload_files()
to send a copy of our output to HISE to use for downstream analysis steps.
study_space_uuid = '64097865-486d-43b3-8f94-74994e0a72e0'
title = 'PBMC Reference Pseudobulk Frac Medians {d}'.format(d = date.today())
in_files = [h5ad_uuid]
in_files
['5b3a0cc8-1811-4126-90c7-e9cdd41459fd']
out_files
['output/pbmc-ref_AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.csv', 'output/pbmc-ref_AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.parquet', 'output/pbmc-ref_AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.csv', 'output/pbmc-ref_AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.parquet', 'output/pbmc-ref_AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.csv', 'output/pbmc-ref_AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.parquet', 'output/pbmc-ref_type_nz-pct-median_pseudobulk_2024-03-27.pkl', 'output/pbmc-ref_cohort-AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.csv', 'output/pbmc-ref_cohort-AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.parquet', 'output/pbmc-ref_cohort-AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.csv', 'output/pbmc-ref_cohort-AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.parquet', 'output/pbmc-ref_cohort-AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.csv', 'output/pbmc-ref_cohort-AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.parquet', 'output/pbmc-ref_cohort-type_nz-pct-median_pseudobulk_2024-03-27.pkl']
hisepy.upload.upload_files(
files = out_files,
study_space_id = study_space_uuid,
title = title,
input_file_ids = in_files
)
output/pbmc-ref_AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.csv output/pbmc-ref_AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.parquet output/pbmc-ref_AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.csv output/pbmc-ref_AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.parquet output/pbmc-ref_AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.csv output/pbmc-ref_AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.parquet output/pbmc-ref_type_nz-pct-median_pseudobulk_2024-03-27.pkl output/pbmc-ref_cohort-AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.csv output/pbmc-ref_cohort-AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.parquet output/pbmc-ref_cohort-AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.csv output/pbmc-ref_cohort-AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.parquet output/pbmc-ref_cohort-AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.csv output/pbmc-ref_cohort-AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.parquet output/pbmc-ref_cohort-type_nz-pct-median_pseudobulk_2024-03-27.pkl you are trying to upload file_ids... ['output/pbmc-ref_AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.csv', 'output/pbmc-ref_AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.parquet', 'output/pbmc-ref_AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.csv', 'output/pbmc-ref_AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.parquet', 'output/pbmc-ref_AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.csv', 'output/pbmc-ref_AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.parquet', 'output/pbmc-ref_type_nz-pct-median_pseudobulk_2024-03-27.pkl', 'output/pbmc-ref_cohort-AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.csv', 'output/pbmc-ref_cohort-AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.parquet', 'output/pbmc-ref_cohort-AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.csv', 'output/pbmc-ref_cohort-AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.parquet', 'output/pbmc-ref_cohort-AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.csv', 'output/pbmc-ref_cohort-AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.parquet', 'output/pbmc-ref_cohort-type_nz-pct-median_pseudobulk_2024-03-27.pkl']. Do you truly want to proceed?
{'trace_id': '78f8b150-b1a4-48af-a87c-648dcd2d7ad5', 'files': ['output/pbmc-ref_AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.csv', 'output/pbmc-ref_AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.parquet', 'output/pbmc-ref_AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.csv', 'output/pbmc-ref_AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.parquet', 'output/pbmc-ref_AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.csv', 'output/pbmc-ref_AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.parquet', 'output/pbmc-ref_type_nz-pct-median_pseudobulk_2024-03-27.pkl', 'output/pbmc-ref_cohort-AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.csv', 'output/pbmc-ref_cohort-AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.parquet', 'output/pbmc-ref_cohort-AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.csv', 'output/pbmc-ref_cohort-AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.parquet', 'output/pbmc-ref_cohort-AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.csv', 'output/pbmc-ref_cohort-AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.parquet', 'output/pbmc-ref_cohort-type_nz-pct-median_pseudobulk_2024-03-27.pkl']}
import session_info
session_info.show()
----- anndata 0.10.3 hisepy 0.3.0 numpy 1.24.0 pandas 2.1.4 scanpy 1.9.6 scipy 1.11.4 session_info 1.0.0 -----
PIL 10.0.1 anyio NA arrow 1.3.0 asttokens NA attr 23.2.0 attrs 23.2.0 babel 2.14.0 beatrix_jupyterlab NA brotli NA cachetools 5.3.1 certifi 2024.02.02 cffi 1.16.0 charset_normalizer 3.3.2 cloudpickle 2.2.1 colorama 0.4.6 comm 0.1.4 cryptography 41.0.7 cycler 0.10.0 cython_runtime NA dateutil 2.8.2 db_dtypes 1.1.1 debugpy 1.8.0 decorator 5.1.1 defusedxml 0.7.1 deprecated 1.2.14 exceptiongroup 1.2.0 executing 2.0.1 fastjsonschema NA fqdn NA google NA greenlet 2.0.2 grpc 1.58.0 grpc_status NA h5py 3.10.0 idna 3.6 igraph 0.10.8 importlib_metadata NA ipykernel 6.28.0 ipython_genutils 0.2.0 ipywidgets 8.1.1 isoduration NA jedi 0.19.1 jinja2 3.1.2 joblib 1.3.2 json5 NA jsonpointer 2.4 jsonschema 4.20.0 jsonschema_specifications NA jupyter_events 0.9.0 jupyter_server 2.12.1 jupyterlab_server 2.25.2 jwt 2.8.0 kiwisolver 1.4.5 leidenalg 0.10.1 llvmlite 0.41.0 lz4 4.3.2 markupsafe 2.1.3 matplotlib 3.8.0 matplotlib_inline 0.1.6 mpl_toolkits NA mpmath 1.3.0 natsort 8.4.0 nbformat 5.9.2 numba 0.58.0 opentelemetry NA overrides NA packaging 23.2 parso 0.8.3 pexpect 4.8.0 pickleshare 0.7.5 pkg_resources NA platformdirs 4.1.0 plotly 5.18.0 prettytable 3.9.0 prometheus_client NA prompt_toolkit 3.0.42 proto NA psutil NA ptyprocess 0.7.0 pure_eval 0.2.2 pyarrow 13.0.0 pydev_ipython NA pydevconsole NA pydevd 2.9.5 pydevd_file_utils NA pydevd_plugins NA pydevd_tracing NA pygments 2.17.2 pynvml NA pyparsing 3.1.1 pyreadr 0.5.0 pythonjsonlogger NA pytz 2023.3.post1 referencing NA requests 2.31.0 rfc3339_validator 0.1.4 rfc3986_validator 0.1.1 rpds NA send2trash NA shapely 1.8.5.post1 six 1.16.0 sklearn 1.3.2 sniffio 1.3.0 socks 1.7.1 sql NA sqlalchemy 2.0.21 sqlparse 0.4.4 stack_data 0.6.2 sympy 1.12 termcolor NA texttable 1.7.0 threadpoolctl 3.2.0 torch 2.1.2+cu121 torchgen NA tornado 6.3.3 tqdm 4.66.1 traitlets 5.9.0 typing_extensions NA uri_template NA urllib3 1.26.18 wcwidth 0.2.12 webcolors 1.13 websocket 1.7.0 wrapt 1.15.0 xarray 2023.12.0 yaml 6.0.1 zipp NA zmq 25.1.2 zoneinfo NA zstandard 0.22.0
----- IPython 8.19.0 jupyter_client 8.6.0 jupyter_core 5.6.1 jupyterlab 4.1.2 notebook 6.5.4 ----- Python 3.10.13 | packaged by conda-forge | (main, Dec 23 2023, 15:36:39) [GCC 12.3.0] Linux-5.15.0-1054-gcp-x86_64-with-glibc2.31 ----- Session information updated at 2024-03-27 23:45