#!/usr/bin/env python
# coding: utf-8

# # Pseudobulk data with fraction and median non-zero
# 
# For use in plotting, we'll compute "pseudobulk" summary statistics for gene expression based on groupings defined by cell metadata.
# 
# Note that this approach doesn't use the more careful pseudobulk aggregation implemented in packages like `scran` - instead, we take the fraction of cells in which each gene was detected (`pct_exp`) and the median expression across those non-zero cells (`median_expression`).
# 
# For use in our visualization tools, we'll group cells by each of our cell type labeling levels (AIFI_L1, AIFI_L2, and AIFI_L3), as well as by both cell type and the age-related cohort from which the cells originate.

# ## Load packages

# In[1]:


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

from datetime import date
import hisepy
import numpy as np
import os
import pandas as pd
import pickle
import re
import scanpy as sc
import scipy.sparse as scs


# In[2]:


if not os.path.exists('output'):
    os.mkdir('output')


# In[3]:


out_files = []


# ## Helper functions

# In[4]:


def read_adata_uuid(h5ad_uuid):
    # cache the file from HISE if it isn't already available locally
    h5ad_path = '/home/jupyter/cache/{u}'.format(u = h5ad_uuid)
    if not os.path.isdir(h5ad_path):
        hise_res = hisepy.cache_files([h5ad_uuid])
    
    # read the cached .h5ad file with scanpy
    h5ad_filename = os.listdir(h5ad_path)[0]
    h5ad_file = '{p}/{f}'.format(p = h5ad_path, f = h5ad_filename)
    adata = sc.read_h5ad(h5ad_file)
    return adata


# In[5]:


def sparse_nz_median(sparse_mat, feat_names):
    # transpose the cell x gene matrix so values for each gene are together in memory
    sparse_mat = sparse_mat.transpose().tocsr()
    
    # get dimensions for calculations
    n_cells = sparse_mat.shape[1]
    
    # compute the fraction of non-zero cells per gene (rows are genes after transposing)
    pct_exp = sparse_mat.getnnz(axis = 1) / n_cells
    
    # split the non-zero values from the matrix directly, one array per gene
    split_values = np.split(sparse_mat.data, sparse_mat.indptr)
    
    # drop the first and last entries - indptr starts at 0 and ends at the length of data,
    # so np.split yields an empty leading and trailing segment that don't correspond to genes
    split_values = split_values[1:-1].copy()
    
    # compute the median of the non-zero values for each gene (genes with no values yield NaN here)
    median_expression = [np.median(x) for x in split_values]
    
    # assemble a DataFrame with results
    res_df = pd.DataFrame({
        'gene': feat_names,
        'pct_exp': pct_exp,
        'median_expression': median_expression
    })
    
    # replace missing values for genes without any non-zero cells
    # these were not detected, so median expression is 0
    res_df['median_expression'] = res_df['median_expression'].fillna(0)
    
    return res_df
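# As a small, illustrative check (not part of the original workflow), the cell below runs
# `sparse_nz_median` on a made-up cells x genes matrix where the expected values are easy to
# verify by eye. The matrix `toy_mat` and the gene names are hypothetical, for demonstration only.

# In[ ]:


# toy matrix with cells as rows and genes as columns, matching the layout of adata.X:
# geneA is detected in 2 of 4 cells (non-zero values 1 and 3, median 2.0),
# geneB in all 4 cells (median 5.0), and geneC in none (pct_exp 0, median filled with 0)
toy_mat = scs.csr_matrix(np.array([
    [1., 2., 0.],
    [0., 4., 0.],
    [3., 6., 0.],
    [0., 8., 0.]
]))

sparse_nz_median(toy_mat, ['geneA', 'geneB', 'geneC'])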
# ## Retrieve atlas data from HISE

# In[6]:


h5ad_uuid = '5b3a0cc8-1811-4126-90c7-e9cdd41459fd'


# In[7]:


adata = read_adata_uuid(h5ad_uuid)


# In[8]:


adata.shape


# In[9]:


obs = adata.obs


# ### Summarized expression by cell type
# 
# For our UMAP viewer, we plot expression at each level of cell type resolution.

# In[10]:


levels = ['AIFI_L1', 'AIFI_L2', 'AIFI_L3']


# In[11]:


get_ipython().run_cell_magic('time', '', "level_results = {}\n\nfor level in levels:\n    print('>>> {l}'.format(l = level))\n    \n    level_obs = obs.groupby(level)\n    \n    level_dict = {}\n    for cell_type, type_obs in level_obs:\n        type_bc = type_obs['barcodes']\n        type_adata = adata[adata.obs['barcodes'].isin(type_bc)]\n        \n        type_res = sparse_nz_median(type_adata.X, type_adata.var_names.tolist())\n        \n        level_dict[cell_type] = type_res\n    \n    level_results[level] = level_dict\n")


# ### Save assembled results for each level

# In[12]:


level_combined = {}
for level in levels:
    all_level = pd.concat(level_results[level])
    all_level = all_level.reset_index(drop = False, names = [level, 'row_num'])
    all_level = all_level.drop('row_num', axis = 1)
    level_combined[level] = all_level


# In[13]:


for level, df in level_combined.items():
    out_csv = 'output/pbmc-ref_{l}_nz-pct-median_pseudobulk_{d}.csv'.format(l = level, d = date.today())
    df.to_csv(out_csv)
    out_files.append(out_csv)
    
    out_parquet = 'output/pbmc-ref_{l}_nz-pct-median_pseudobulk_{d}.parquet'.format(l = level, d = date.today())
    df.to_parquet(out_parquet)
    out_files.append(out_parquet)


# ### Restructure for use in visualization apps

# In[14]:


gene_results = {}
for level in levels:
    all_res = pd.concat(level_results[level])
    
    split_res = all_res.groupby('gene')
    
    split_dict = {}
    for gene, df in split_res:
        df = df.reset_index(drop = False)
        df = df.drop('level_1', axis = 1)
        df = df.rename({'level_0': 'obs_group'}, axis = 1)
        split_dict[gene] = df
    
    gene_results[level] = split_dict


# Check that a gene or two look as expected

# In[15]:


gene_results['AIFI_L1']['CD79A']


# In[16]:


gene_results['AIFI_L2']['FCN1']


# In[17]:


type_pkl = 'output/pbmc-ref_type_nz-pct-median_pseudobulk_{d}.pkl'.format(d = date.today())
with open(type_pkl, 'wb') as out_file:
    pickle.dump(gene_results, out_file)
out_files.append(type_pkl)


# ### Summarized expression by cell type and cohort
# 
# For our Gene Expression Summary viewer, we plot expression at each level of cell type resolution, partitioned by each cohort in the reference.

# In[18]:


get_ipython().run_cell_magic('time', '', "level_results = {}\n\nfor level in levels:\n    print('>>> {l}'.format(l = level))\n    \n    level_obs = obs.groupby(['cohort.cohortGuid', level])\n    \n    level_dict = {}\n    for group_tuple, type_obs in level_obs:\n        cohort = group_tuple[0]\n        cell_type = group_tuple[1]\n        c_t = '{c}_{t}'.format(c = cohort, t = cell_type)\n        \n        type_bc = type_obs['barcodes']\n        type_adata = adata[adata.obs['barcodes'].isin(type_bc)]\n        \n        type_res = sparse_nz_median(type_adata.X, type_adata.var_names.tolist())\n        \n        type_res['cohort'] = [cohort] * type_res.shape[0]\n        level_dict[c_t] = type_res\n    \n    level_results[level] = level_dict\n")


# ### Save assembled results for each level

# In[19]:


level_combined = {}
for level in levels:
    all_level = pd.concat(level_results[level])
    all_level = all_level.reset_index(drop = False, names = ['group', 'row_num'])
    all_level[level] = [re.sub('.+_', '', x) for x in all_level['group']]
    all_level = all_level.drop(['group', 'row_num'], axis = 1)
    all_level = all_level[['cohort', level, 'gene', 'pct_exp', 'median_expression']]
    level_combined[level] = all_level


# In[20]:


for level, df in level_combined.items():
    out_csv = 'output/pbmc-ref_cohort-{l}_nz-pct-median_pseudobulk_{d}.csv'.format(l = level, d = date.today())
    df.to_csv(out_csv)
    out_files.append(out_csv)
    
    out_parquet = 'output/pbmc-ref_cohort-{l}_nz-pct-median_pseudobulk_{d}.parquet'.format(l = level, d = date.today())
    df.to_parquet(out_parquet)
    out_files.append(out_parquet)


# ### Restructure for use in visualization apps

# In[21]:


gene_results = {}
for level in levels:
    all_res = pd.concat(level_results[level])
    
    split_res = all_res.groupby('gene')
    
    split_dict = {}
    for gene, df in split_res:
        df = df.reset_index(drop = False)
        df = df.drop('level_1', axis = 1)
        df['obs_group'] = [re.sub('.+_', '', x) for x in df['level_0']]
        df = df.drop('level_0', axis = 1)
        df = df[['obs_group', 'gene', 'pct_exp', 'median_expression', 'cohort']]
        split_dict[gene] = df
    
    gene_results[level] = split_dict


# Check that a gene or two look as expected

# In[22]:


gene_results['AIFI_L1']['CD79A']


# In[23]:


cohort_type_pkl = 'output/pbmc-ref_cohort-type_nz-pct-median_pseudobulk_{d}.pkl'.format(d = date.today())
with open(cohort_type_pkl, 'wb') as out_file:
    pickle.dump(gene_results, out_file)
out_files.append(cohort_type_pkl)
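# As a quick read-back check (not part of the original workflow), we can reload the pickle we
# just wrote and pull a single gene's table at one labeling level, the way a viewer would
# consume it. The level and gene below are examples, and `reloaded_results` is a hypothetical name.

# In[ ]:


# reload the cohort-partitioned results and inspect one gene at the AIFI_L3 level
with open(cohort_type_pkl, 'rb') as in_file:
    reloaded_results = pickle.load(in_file)

reloaded_results['AIFI_L3']['CD79A'].head()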
# ## Upload Results to HISE
# 
# Finally, we'll use `hisepy.upload.upload_files()` to send a copy of our output to HISE for use in downstream analysis steps.

# In[24]:


study_space_uuid = '64097865-486d-43b3-8f94-74994e0a72e0'
title = 'PBMC Reference Pseudobulk Frac Medians {d}'.format(d = date.today())


# In[25]:


in_files = [h5ad_uuid]


# In[26]:


in_files


# In[27]:


out_files


# In[28]:


hisepy.upload.upload_files(
    files = out_files,
    study_space_id = study_space_uuid,
    title = title,
    input_file_ids = in_files
)


# In[29]:


import session_info
session_info.show()


# In[ ]: