#!/usr/bin/env python
# coding: utf-8

# # Generate "clean" reference dataset
#
# In this notebook, we'll read the assembled PBMC reference dataset and remove cell types flagged as Doublets, Contamination, or with high mitochondrial content. We'll then re-project the clean reference for use in visualization and for construction of reference models.

# In[1]:


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

from datetime import date
import hisepy
import os
import pandas as pd
import scanpy as sc
import scanpy.external as sce


# In[2]:


def read_adata_uuid(h5ad_uuid):
    """Read the cached .h5ad file for a HISE file UUID into an AnnData object.

    The file is looked up under ``/home/jupyter/cache/<h5ad_uuid>``. If that
    directory does not exist, ``hisepy.reader.cache_files`` is called for the
    UUID first (presumably this populates the cache directory -- confirm
    against the hisepy documentation).

    Args:
        h5ad_uuid: UUID string of the file to read.

    Returns:
        The object returned by ``scanpy.read_h5ad`` for the first entry
        listed in the cache directory.

    Raises:
        FileNotFoundError: if the cache directory is missing or empty after
            the caching attempt.
    """
    h5ad_path = os.path.join('/home/jupyter/cache', h5ad_uuid)
    if not os.path.isdir(h5ad_path):
        hisepy.reader.cache_files([h5ad_uuid])

    cached_files = os.listdir(h5ad_path)
    if not cached_files:
        # Fail with a readable message instead of an IndexError on [0].
        raise FileNotFoundError(
            'No cached file found in {p}'.format(p = h5ad_path)
        )

    h5ad_file = os.path.join(h5ad_path, cached_files[0])
    return sc.read_h5ad(h5ad_file)


# In[3]:


out_dir = 'output'
# exist_ok avoids the separate isdir() check.
os.makedirs(out_dir, exist_ok = True)


# In[4]:


def element_id(n = 3):
    """Return ``n`` randomly chosen element names joined by '-'.

    Args:
        n: number of names to draw (default 3).

    Returns:
        A string such as ``'<name>-<name>-<name>'``, built from the ``name``
        attribute of randomly indexed ``periodictable.elements`` entries.
    """
    import periodictable
    from random import randrange

    # NOTE(review): randrange(0, 118) draws indices 0..117; confirm that
    # index 0 of periodictable.elements is an entry you want included.
    return '-'.join(
        periodictable.elements[randrange(0, 118)].name
        for _ in range(n)
    )


# ## Read annotated dataset

# In[5]:


h5ad_uuid = '157bd496-0f1e-4239-83bc-a9616696b63a'


# In[6]:


adata = read_adata_uuid(h5ad_uuid)


# In[7]:


adata.shape


# ## Filter cell types

# In[8]:


# Substrings that flag an AIFI_L3 label for removal (matching is
# case-sensitive, hence the paired capitalizations).
exclude_terms = [
    'Contamination', 'contamination',
    'Doublet', 'doublet',
    'HBB+',
    'Mito', 'mito'
]


# In[9]:


all_types = adata.obs['AIFI_L3'].unique().tolist()


# In[10]:


# Test each label once with any(): a label that matches several terms must
# appear only once, otherwise it is double-counted when exclude_types is
# used to index the per-type cell counts below.
exclude_types = [
    cell_type for cell_type in all_types
    if any(term in cell_type for term in exclude_terms)
]


# In[11]:


exclude_types


# In[12]:


keep_types = [
    cell_type for cell_type in all_types
    if cell_type not in exclude_types
]


# ## How many are excluded?
# In[13]:


# Number of cells per AIFI_L3 label, and the total across all labels.
all_counts = adata.obs['AIFI_L3'].value_counts()
n_all = all_counts.sum()
n_all


# In[14]:


exclude_counts = all_counts[exclude_types]


# In[15]:


exclude_counts


# In[16]:


n_exclude = exclude_counts.sum()
n_exclude


# Percent removed:

# In[17]:


n_exclude / n_all * 100


# ## Generate filtered object

# In[18]:


# Compute the keep mask once; it is reused for the plot flag and the subset.
keep_mask = adata.obs['AIFI_L3'].isin(keep_types)
adata.obs['keep'] = keep_mask.astype('category')


# In[19]:


sc.pl.umap(
    adata,
    color = ['AIFI_L3', 'keep'],
    ncols = 1
)


# In[20]:


# Subsetting yields a view of `adata`; take an explicit copy so that the
# .obs edits below modify a real object instead of relying on an implicit
# view-to-copy conversion.
adata_subset = adata[keep_mask].copy()
adata_subset.shape


# In[21]:


# Drop category levels that no longer occur after filtering.
for label_col in ['AIFI_L1', 'AIFI_L2', 'AIFI_L3']:
    adata_subset.obs[label_col] = (
        adata_subset.obs[label_col].cat.remove_unused_categories()
    )


# In[22]:


sc.pl.umap(
    adata_subset,
    color = ['AIFI_L1', 'AIFI_L2', 'AIFI_L3'],
    ncols = 1
)


# ## Output final annotations

# In[23]:


obs = adata_subset.obs.reset_index(drop = True)

# Resolve the date once so every output file carries the same stamp, even
# if the run crosses midnight.
run_date = date.today()


# In[24]:


obs_out_csv = '{p}/ref_clean_pbmc_labeled_meta_{d}.csv'.format(p = out_dir, d = run_date)
obs.to_csv(obs_out_csv, index = False)


# In[25]:


obs_out_parquet = '{p}/ref_clean_pbmc_labeled_meta_{d}.parquet'.format(p = out_dir, d = run_date)
obs.to_parquet(obs_out_parquet, index = False)


# In[26]:


# Barcode-to-label table: one row per cell with its three label levels.
bc_anno = obs[['barcodes', 'AIFI_L1', 'AIFI_L2', 'AIFI_L3']]


# In[27]:


label_out_csv = '{p}/ref_clean_pbmc_barcode_labels_{d}.csv'.format(p = out_dir, d = run_date)
bc_anno.to_csv(label_out_csv, index = False)


# In[28]:


label_out_parquet = '{p}/ref_clean_pbmc_barcode_labels_{d}.parquet'.format(p = out_dir, d = run_date)
bc_anno.to_parquet(label_out_parquet, index = False)


# ## Output labeled AnnData

# In[29]:


out_h5ad = '{p}/ref_clean_pbmc_labeled_{d}.h5ad'.format(p = out_dir, d = run_date)
adata_subset.write_h5ad(out_h5ad)


# ## Upload results to HISE
#
# Finally, we'll use `hisepy.upload.upload_files()` to send a copy of our output to HISE to use for downstream analysis steps.
# In[30]:


# Destination study space and the title under which the outputs are uploaded.
study_space_uuid = '64097865-486d-43b3-8f94-74994e0a72e0'
title = f'10x 3-prime PBMC Clean Reference {date.today()}'


# In[31]:


# Random identifier passed as the upload destination below.
search_id = element_id()
search_id


# In[32]:


# The input dataset UUID is passed to the upload as input_file_ids.
in_files = [
    h5ad_uuid,
]


# In[33]:


in_files


# In[34]:


out_files = [
    out_h5ad,
    obs_out_csv,
    obs_out_parquet,
    label_out_csv,
    label_out_parquet,
]


# In[35]:


out_files


# In[36]:


upload_args = {
    'files': out_files,
    'study_space_id': study_space_uuid,
    'title': title,
    'input_file_ids': in_files,
    'destination': search_id,
}
hisepy.upload.upload_files(**upload_args)


# In[37]:


# Show session information for this run.
import session_info
session_info.show()


# In[ ]: