#!/usr/bin/env python
# coding: utf-8

# # Partition L3 Cell Types
# 
# After doublet filtering, we'll partition cells based on their CellTypist L3 label assignments so each type can be reviewed. Later, we'll perform clustering within each group for inspection.

# In[1]:

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

from datetime import date
import hisepy
import os
import pandas as pd
import re
import scanpy as sc

# In[2]:

out_dir = 'output/l3_types'
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)

# ## Helper functions

# In[3]:

# Cache a file from HISE (if not already cached) and return its local path
def cache_uuid_path(uuid):
    cache_path = '/home/jupyter/cache/{u}'.format(u = uuid)
    if not os.path.isdir(cache_path):
        hise_res = hisepy.reader.cache_files([uuid])
    filename = os.listdir(cache_path)[0]
    cache_file = '{p}/{f}'.format(p = cache_path, f = filename)
    return cache_file

# In[4]:

# Read a .parquet file from HISE as a pandas DataFrame
def read_parquet_uuid(uuid):
    cache_file = cache_uuid_path(uuid)
    res = pd.read_parquet(cache_file)
    return res

# In[5]:

# Read a .h5ad file from HISE as an AnnData object
def read_adata_uuid(uuid):
    cache_file = cache_uuid_path(uuid)
    res = sc.read_h5ad(cache_file)
    return res

# In[6]:

# Remove a cached file to free local storage
def rm_cache_uuid(uuid):
    cache_path = '/home/jupyter/cache/{u}'.format(u = uuid)
    rm_call = 'rm -r {d}'.format(d = cache_path)
    os.system(rm_call)

# In[7]:

# Make cell type labels filesystem-friendly: '+' -> 'pos', '-' -> 'neg', spaces -> '_'
def format_cell_type(cell_type):
    cell_type = re.sub('\\+', 'pos', cell_type)
    cell_type = re.sub('-', 'neg', cell_type)
    cell_type = re.sub(' ', '_', cell_type)
    return cell_type

# In[8]:

# Generate a random, element-based identifier used to tag output files for later retrieval
def element_id(n = 3):
    import periodictable
    from random import randrange
    rand_el = []
    for i in range(n):
        el = randrange(0, 118)
        rand_el.append(periodictable.elements[el].name)
    rand_str = '-'.join(rand_el)
    return rand_str

# ## Read L3 labels from HISE

# In[9]:

l3_uuids = ['20e97e8f-2cb9-4fa6-bd84-ddcaf6a2c28b',
            'd595ab6f-d1ad-4c7c-a8f6-395642927262',
            '56e0840c-d432-45e3-ac57-4302b0e350a4',
            '8bfe1a92-35c9-433a-9640-4d7cdd7cbad6']

l3_list = []
for l3_uuid in l3_uuids:
    l3_list.append(read_parquet_uuid(l3_uuid))
l3_labels = pd.concat(l3_list)

# In[10]:

l3_labels = l3_labels[['barcodes', 'AIFI_L3', 'AIFI_L3_score']]

# ## Identify files for use in HISE
# 
# These files contain cells that were filtered in a previous notebook to remove most doublets and low-quality cells.

# In[11]:

search_id = 'lawrencium-chromium-vanadium'

# Retrieve files stored in our HISE project store

# In[12]:

ps_df = hisepy.list_files_in_project_store('cohorts')
ps_df = ps_df[['id', 'name']]

# Filter for files from the previous notebook using our search_id

# In[13]:

search_df = ps_df[ps_df['name'].str.contains(search_id)]
search_df = search_df.sort_values('name')

# In[14]:

search_df.shape

# In[15]:

h5ad_uuids = {}
for i in range(search_df.shape[0]):
    name = search_df['name'].tolist()[i]
    if '.h5ad' in name:
        group_name = re.sub('.+diha_', '', name)
        group_name = re.sub('_filtered.+', '', group_name)
        h5ad_uuids[group_name] = search_df['id'].tolist()[i]

# In[16]:

h5ad_uuids

# ## Separate files

# In[19]:

for group_name, uuid in h5ad_uuids.items():
    adata = read_adata_uuid(uuid)

    # Integrate L3 labels
    obs = adata.obs
    obs = obs.reset_index(drop = True)
    obs = obs.merge(l3_labels, on = 'barcodes', how = 'left')
    obs = obs.set_index('barcodes', drop = False)
    adata.obs = obs

    # Write one .h5ad file per L3 type for this group
    l3_types = adata.obs['AIFI_L3'].unique().tolist()
    for l3_type in l3_types:
        out_type = format_cell_type(l3_type)
        type_dir = 'output/l3_types/{ct}'.format(ct = out_type)
        if not os.path.isdir(type_dir):
            os.makedirs(type_dir)
        out_file = '{td}/diha_celltypist_L3_{g}_{ct}.h5ad'.format(td = type_dir, g = group_name, ct = out_type)
        type_adata = adata[adata.obs['AIFI_L3'] == l3_type].copy()
        type_adata.write_h5ad(out_file)

    rm_cache_uuid(uuid)
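# As an optional sanity check (not part of the original workflow), we can list the per-type
# directories just written and count the files in each one. With the layout used above, each
# cell type should have at most one file per input group. This is a minimal sketch that only
# assumes the `output/l3_types/<type>/` structure created in the previous cell.

# In[ ]:

check_counts = {}
for type_dir in sorted(os.listdir('output/l3_types')):
    type_path = 'output/l3_types/{td}'.format(td = type_dir)
    # Count the per-group .h5ad files written for this cell type
    check_counts[type_dir] = len([f for f in os.listdir(type_path) if f.endswith('.h5ad')])
check_counts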
# ## Merge files for the same type

# In[20]:

type_dirs = os.listdir('output/l3_types')

type_h5ads = []
for type_dir in type_dirs:
    type_path = 'output/l3_types/{td}'.format(td = type_dir)
    type_files = os.listdir(type_path)

    adata_list = []
    for type_file in type_files:
        adata = sc.read_h5ad('{tp}/{tf}'.format(tp = type_path, tf = type_file))
        adata_list.append(adata)
    type_adata = sc.concat(adata_list)

    cell_type = type_adata.obs['AIFI_L3'].iloc[0]
    out_type = format_cell_type(cell_type)
    out_file = 'output/diha_celltypist_L3_{ct}.h5ad'.format(ct = out_type)
    type_adata.write_h5ad(out_file)
    type_h5ads.append(out_file)

# ## Upload assembled results to HISE

# In[26]:

study_space_uuid = 'de025812-5e73-4b3c-9c3b-6d0eac412f2a'
title = 'DIHA CellTypist L3 .h5ads {d}'.format(d = date.today())

# In[27]:

search_id = element_id()
search_id

# In[28]:

in_files = []
for group_name, uuid in h5ad_uuids.items():
    in_files.append(uuid)
in_files

# In[29]:

out_files = type_h5ads

# In[30]:

out_files

# In[31]:

hisepy.upload.upload_files(
    files = out_files,
    study_space_id = study_space_uuid,
    title = title,
    input_file_ids = in_files,
    destination = search_id
)

# In[32]:

import session_info
session_info.show()

# In[ ]:
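# The introduction notes that we'll later perform clustering within each group for inspection.
# As a forward-looking illustration only, the cell below sketches how one of the merged per-type
# files written above could be loaded and clustered with standard scanpy steps. It assumes the
# .h5ad files contain raw counts and that the `leidenalg` package is available; the preprocessing
# and clustering parameters are placeholder assumptions, not the settings used downstream.

# In[ ]:

# Load the first merged per-type file produced above (assumes type_h5ads is non-empty)
example = sc.read_h5ad(type_h5ads[0])

# Standard scanpy workflow on raw counts; parameter values are illustrative only
sc.pp.normalize_total(example, target_sum = 1e4)
sc.pp.log1p(example)
sc.pp.highly_variable_genes(example, n_top_genes = 2000)
sc.pp.pca(example, n_comps = 30)
sc.pp.neighbors(example, n_neighbors = 15)
sc.tl.leiden(example, resolution = 1.0)
example.obs['leiden'].value_counts()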