#!/usr/bin/env python
# coding: utf-8

# # Detect doublets using Scrublet
# 
# Our cell hashing process catches and removes many doublets generated by mixing of cells from different samples, but some percentage of doublets (~7-8%) arise from collisions of cells from the same sample and will not be detected this way.
# 
# To detect and remove these, we'll use the Scrublet package. Scrublet's approach to doublet identification is described in this publication:
# 
# Wolock, S. L., Lopez, R. & Klein, A. M. Scrublet: Computational Identification of Cell Doublets in Single-Cell Transcriptomic Data. Cell Syst 8, 281–291.e9 (2019)
# 
# We'll use Scrublet's integration into the scanpy package's [scanpy.external tools](https://scanpy.readthedocs.io/en/stable/generated/scanpy.external.pp.scrublet.html#scanpy.external.pp.scrublet).
# 
# Here, we apply Scrublet to each of our .h5 files and store the results in HISE for downstream analysis steps.

# ## Load Packages
# 
# `anndata`: Data structures for scRNA-seq
# `concurrent.futures`: Parallelization methods
# `datetime`: Date and time functions
# `h5py`: HDF5 file I/O
# `hisepy`: The HISE SDK for Python
# `numpy`: Mathematical data structures and computation
# `os`: Operating system calls
# `pandas`: DataFrame data structures
# `re`: Regular expressions
# `scanpy`: scRNA-seq analysis
# `scipy.sparse`: Sparse matrix data structures

# In[1]:

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

import anndata
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
from datetime import date
import h5py
import hisepy
import numpy as np
import os
import pandas as pd
import re
import scanpy as sc
import scanpy.external as sce
import scipy.sparse as scs

# ## Read sample metadata from HISE

# In[2]:

sample_meta_file_uuid = '2da66a1a-17cc-498b-9129-6858cf639caf'
file_query = hisepy.reader.read_files(
    [sample_meta_file_uuid]
)

# In[3]:

meta_data = file_query['values']

# ## Helper functions
# 
# These functions retrieve data from HISE, read each .h5 file as an AnnData object for use with Scrublet, and apply Scrublet to each file.

# In[4]:

# define a function to read count data
def read_mat(h5_con):
    mat = scs.csc_matrix(
        (h5_con['matrix']['data'][:],       # Count values
         h5_con['matrix']['indices'][:],    # Row indices
         h5_con['matrix']['indptr'][:]),    # Pointers for column positions
        shape = tuple(h5_con['matrix']['shape'][:]) # Matrix dimensions
    )
    return mat

# define a function to read observation metadata (i.e. cell metadata)
def read_obs(h5_con):
    bc = h5_con['matrix']['barcodes'][:]
    bc = [x.decode('UTF-8') for x in bc]

    # Initialize the DataFrame with cell barcodes
    obs_df = pd.DataFrame({ 'barcodes' : bc })
    obs_df = obs_df.set_index('barcodes', drop = False)
    obs_df['barcodes'] = obs_df['barcodes'].astype("category")

    return obs_df

# define a function to construct an anndata object from an h5 file
def read_h5_anndata(h5_con):
    #h5_con = h5py.File(h5_file, mode = 'r')
    # extract the expression matrix
    mat = read_mat(h5_con)
    # extract gene names
    genes = h5_con['matrix']['features']['name'][:]
    genes = [x.decode('UTF-8') for x in genes]
    # extract metadata
    obs_df = read_obs(h5_con)
    # construct anndata
    adata = anndata.AnnData(mat.T, obs = obs_df)
    # make sure the gene names are aligned
    adata.var_names = genes
    adata.var_names_make_unique()
    return adata

# In[5]:

def get_adata(uuid):
    # Load the file using HISE
    res = hisepy.reader.read_files([uuid])

    # If there's an error, read_files returns a list instead of a dictionary.
    # We should raise an exception with the message when this happens.
    if(isinstance(res, list)):
        error_message = res[0]['message']
        raise Exception(error_message)

    # Read the file to adata
    h5_con = res['values'][0]
    adata = read_h5_anndata(h5_con)

    # Clean up the file now that we're done with it
    h5_con.close()
    return(adata)

# In[6]:

def process_file(file_uuid):
    adata = get_adata(file_uuid)
    sc.external.pp.scrublet(
        adata,
        random_state = 3030,
        verbose = False
    )
    result = adata.obs[['barcodes','predicted_doublet','doublet_score']]
    return result
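# Note: `scanpy.external.pp.scrublet()` stores its per-cell outputs in
# `adata.obs['doublet_score']` and `adata.obs['predicted_doublet']`, and records
# its run parameters in `adata.uns['scrublet']`; `process_file()` keeps only the
# per-cell columns alongside the cell barcodes. As an optional single-sample
# check (an illustration, not part of the original workflow, so it is left
# commented out):
# 
#     example_result = process_file(meta_data['file.id'].tolist()[0])
#     example_result['predicted_doublet'].value_counts()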
# ## Apply to each sample in parallel
# 
# Here, we'll use `concurrent.futures` to apply the function above to our samples in parallel.

# In[7]:

results = []
file_uuids = meta_data['file.id'].tolist()
with ThreadPoolExecutor(max_workers = 20) as executor:
    for result in executor.map(process_file, file_uuids):
        results.append(result)

# In[8]:

final_result = pd.concat(results, ignore_index = True)

# In[9]:

prediction_counts = final_result['predicted_doublet'].value_counts()
prediction_counts

# In[10]:

# Fraction of all cells predicted to be doublets
prediction_counts[True] / prediction_counts.sum()

# ## Save results to CSV

# In[12]:

out_dir = 'output'
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)

# In[13]:

out_file = 'output/ref_scrublet_results_{d}.csv'.format(d = date.today())
final_result.to_csv(out_file)

# ## Upload assembled data to HISE
# 
# Finally, we'll use `hisepy.upload.upload_files()` to send a copy of our output to HISE to use for downstream analysis steps.

# In[14]:

study_space_uuid = '64097865-486d-43b3-8f94-74994e0a72e0'
title = 'Reference Scrublet results'

# In[15]:

in_files = [sample_meta_file_uuid] + meta_data['file.id'].to_list()

# In[16]:

out_files = [out_file]

# In[17]:

hisepy.upload.upload_files(
    files = out_files,
    study_space_id = study_space_uuid,
    title = title,
    input_file_ids = in_files
)

# In[18]:

import session_info
session_info.show()

# In[ ]:
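# ## Downstream use
# 
# Later analysis steps read the uploaded CSV back in and use `predicted_doublet`
# to remove likely doublets. A minimal sketch of that join, assuming a combined
# AnnData object `adata` whose obs index contains the same barcodes (this is an
# illustration only, not part of this notebook's pipeline, so it is commented out):
# 
#     doublet_calls = pd.read_csv(out_file, index_col = 0).set_index('barcodes')
#     adata.obs = adata.obs.join(doublet_calls, how = 'left')
#     adata = adata[~adata.obs['predicted_doublet'].fillna(False).astype(bool)].copy()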