#!/usr/bin/env python
# coding: utf-8

# # Assign B cell annotations
#
# To assemble our annotations, we'll read our clustered B cell data and assign
# our expert annotations to those clusters. We'll then inspect the annotations
# in our UMAP projections, and output final labels for these cells.
#
# For B cells, we have two groups of cells to label: most of the B cells were
# assigned labels at one resolution, and the non-effector memory B cells were
# assigned labels after additional, iterative clustering. So, we'll load both
# of these sets, remove the memory cells from the rest of the B cells, assign
# identities based on clusters in each, and finally concatenate all of the
# cell barcodes.

# In[1]:

import warnings

# Filters are installed before the remaining imports so that they are already
# active while those packages load.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

from datetime import date
import hisepy
import os
import pandas as pd
import scanpy as sc


# ### Helper function
#
# This function makes it easy to pull csv files stored in HISE as a pandas
# DataFrame.

# In[2]:

def read_csv_uuid(csv_uuid):
    """Read a CSV file stored in HISE into a pandas DataFrame.

    The file is looked up under ``/home/jupyter/cache/<csv_uuid>``. If that
    directory does not exist yet, ``hisepy.reader.cache_files`` is called
    with the UUID first.

    Parameters
    ----------
    csv_uuid : str
        HISE file UUID of the CSV file.

    Returns
    -------
    pandas.DataFrame
        Contents of the CSV, with its first column used as the index.
    """
    csv_path = f'/home/jupyter/cache/{csv_uuid}'
    if not os.path.isdir(csv_path):
        hisepy.reader.cache_files([csv_uuid])
    # Use the first entry listed in the cache directory.
    csv_filename = os.listdir(csv_path)[0]
    csv_file = f'{csv_path}/{csv_filename}'
    return pd.read_csv(csv_file, index_col=0)


# ## Read subclustering results from HISE

# In[3]:

cell_class = 'b-cells'

# In[4]:

h5ad_uuid = '99f83994-26ee-49af-a882-c1f2558daed2'
h5ad_path = f'/home/jupyter/cache/{h5ad_uuid}'

# In[5]:

if not os.path.isdir(h5ad_path):
    hise_res = hisepy.reader.cache_files([h5ad_uuid])

# In[6]:

h5ad_filename = os.listdir(h5ad_path)[0]
h5ad_file = f'{h5ad_path}/{h5ad_filename}'

# In[7]:

adata = sc.read_h5ad(h5ad_file)

# In[8]:

adata.shape

# ## Read memory cell subclustering results from HISE

# In[9]:

mem_uuid = '51838754-b378-4f13-b447-82511bcd0a66'
mem_path = f'/home/jupyter/cache/{mem_uuid}'

# In[10]:

if not os.path.isdir(mem_path):
    # cache_files() is given file UUIDs everywhere else in this script; the
    # local cache path is only where the result is read from afterwards.
    hise_res = hisepy.reader.cache_files([mem_uuid])

# In[11]:

mem_filename = os.listdir(mem_path)[0]
mem_file = f'{mem_path}/{mem_filename}'

# In[12]:

mem_adata = sc.read_h5ad(mem_file)

# In[13]:

mem_adata

# ## Subset non-memory cells

# In[14]:

# True for cells whose barcode is NOT in the memory subclustering object,
# i.e. the cells that are kept in `nonmem_adata` below.
drop_lgl = (~adata.obs['barcodes'].isin(mem_adata.obs['barcodes'])).to_numpy()

# In[15]:

nonmem_adata = adata[drop_lgl].copy()

# In[16]:

nonmem_adata.shape

# ## Read non-memory annotations

# In[17]:

anno_uuid = 'aed5e5f4-1166-48c1-ad84-c92aee8edcf8'

# In[18]:

anno = read_csv_uuid(anno_uuid)

# ## Assign non-memory labels

# In[19]:

join_col = 'ms_leiden_2'

# In[20]:

# Cast the cluster ids read from CSV to string categories before joining them
# to the cluster column in `obs`.
anno[join_col] = anno[join_col].astype('string').astype('category')

# In[21]:

obs = nonmem_adata.obs

# In[22]:

# Number of cells whose cluster has an entry in the annotation table.
obs[join_col].isin(anno[join_col]).sum()

# In[23]:

nonmem_anno = obs.merge(anno, how='left', on=join_col)

# In[24]:

nonmem_anno.head()

# ## Read memory annotations

# In[25]:

mem_anno_uuid = 'afc4fe7f-6426-41ce-a10c-0dcdce422dd1'
mem_anno = read_csv_uuid(mem_anno_uuid)

# ## Assign memory labels

# In[26]:

join_col = 'ms_leiden_2.5'

# In[27]:

mem_anno[join_col] = mem_anno[join_col].astype('string').astype('category')

# In[28]:

obs = mem_adata.obs

# In[29]:

# Number of memory cells whose cluster has an entry in the annotation table.
obs[join_col].isin(mem_anno[join_col]).sum()

# In[30]:

mem_anno = obs.merge(mem_anno, how='left', on=join_col)

# In[31]:

mem_anno.head()

# ## Concatenate annotations

# In[32]:

anno = pd.concat([nonmem_anno, mem_anno], axis=0)

# In[33]:

anno = anno[['barcodes', 'AIFI_L1', 'AIFI_L2', 'AIFI_L3']]

# ## Add to AnnData to preview assignments

# In[34]:

anno = anno.set_index('barcodes')

# In[35]:

# Join the labels back onto the full object's metadata by index.
obs = adata.obs
obs = obs.merge(anno, how='left', left_index=True, right_index=True)

# In[36]:

adata.obs = obs

# In[37]:

adata.obs.head()

# In[38]:

sc.pl.umap(adata, color=['AIFI_L1', 'AIFI_L2', 'AIFI_L3'], ncols=1)

# In[39]:

sc.pl.umap(
    adata,
    color=[
        'leiden_resolution_1',
        'leiden_resolution_1.5',
        'leiden_resolution_2',
        'ms_leiden_2',
    ],
    ncols=1,
)

# ## Output final annotations

# In[40]:

# reset_index() returns a new frame, so adata.obs itself is left untouched.
obs = adata.obs
obs = obs.reset_index(drop=True)

# In[41]:

# Both frames carry a default 0..n-1 index here, so the coordinate columns
# line up with `obs` row by row.
umap_mat = adata.obsm['X_umap']
umap_df = pd.DataFrame(umap_mat, columns=['umap_1', 'umap_2'])
obs['umap_1'] = umap_df['umap_1']
obs['umap_2'] = umap_df['umap_2']

# In[42]:

obs.head()

# In[43]:

out_dir = 'output'
os.makedirs(out_dir, exist_ok=True)

# Resolve the date once so every output file name and the upload title agree,
# even if the run crosses midnight.
run_date = date.today()

# In[44]:

obs_out_csv = f'{out_dir}/ref_pbmc_{cell_class}_labeled_meta_umap_{run_date}.csv'
obs.to_csv(obs_out_csv, index=False)

# In[45]:

obs_out_parquet = f'{out_dir}/ref_pbmc_{cell_class}_labeled_meta_umap_{run_date}.parquet'
obs.to_parquet(obs_out_parquet, index=False)

# In[46]:

bc_anno = obs[['barcodes', 'AIFI_L1', 'AIFI_L2', 'AIFI_L3']]

# In[47]:

label_out_csv = f'{out_dir}/ref_pbmc_{cell_class}_barcode_labels_{run_date}.csv'
bc_anno.to_csv(label_out_csv, index=False)

# In[48]:

label_out_parquet = f'{out_dir}/ref_pbmc_{cell_class}_barcode_labels_{run_date}.parquet'
bc_anno.to_parquet(label_out_parquet, index=False)

# ## Upload annotations to HISE
#
# Finally, we'll use `hisepy.upload.upload_files()` to send a copy of our
# output to HISE to use for downstream analysis steps.

# In[50]:

study_space_uuid = '64097865-486d-43b3-8f94-74994e0a72e0'
title = f'B cell barcode annotations {run_date}'

# In[51]:

in_files = [h5ad_uuid, mem_uuid, anno_uuid, mem_anno_uuid]

# In[52]:

in_files

# In[53]:

out_files = [obs_out_csv, obs_out_parquet, label_out_csv, label_out_parquet]

# In[54]:

out_files

# In[55]:

hisepy.upload.upload_files(
    files=out_files,
    study_space_id=study_space_uuid,
    title=title,
    input_file_ids=in_files,
)

# In[56]:

import session_info

session_info.show()

# In[ ]: