#!/usr/bin/env python
# coding: utf-8

# # Detect doublets using Scrublet
# 
# Our cell hashing process catches and removes many doublets generated by mixing of cells from different samples, but some percentage of doublets (~7-8%) arise from collisions of cells from the same sample and will not be detected this way.
# 
# To detect and remove these, we'll use the Scrublet package. Scrublet's approach to doublet identification is described in this publication:
# 
# Wolock, S. L., Lopez, R. & Klein, A. M. Scrublet: Computational Identification of Cell Doublets in Single-Cell Transcriptomic Data. Cell Syst 8, 281–291.e9 (2019)
# 
# We'll use Scrublet's integration into the scanpy package's [scanpy.external tools](https://scanpy.readthedocs.io/en/stable/generated/scanpy.external.pp.scrublet.html#scanpy.external.pp.scrublet).
# 
# Here, we apply Scrublet to each of our .h5 files and store the results in HISE for downstream analysis steps.

# ## Load Packages
# 
# `anndata`: Data structures for scRNA-seq
# `concurrent.futures`: Parallelization methods
# `datetime`: Date and time functions
# `h5py`: HDF5 file I/O
# `hisepy`: The HISE SDK for Python
# `numpy`: Mathematical data structures and computation
# `os`: Operating system calls
# `pandas`: DataFrame data structures
# `re`: Regular expressions
# `scanpy`: scRNA-seq analysis
# `scipy.sparse`: Sparse matrix data structures

# In[1]:

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

import anndata
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
from datetime import date
import h5py
import hisepy
import numpy as np
import os
import pandas as pd
import re
import scanpy as sc
import scanpy.external as sce
import scipy.sparse as scs

# ## Read sample metadata from HISE

# In[2]:

sample_meta_file_uuid = '2da66a1a-17cc-498b-9129-6858cf639caf'
file_query = hisepy.reader.read_files(
    [sample_meta_file_uuid]
)

# In[3]:

meta_data = file_query['values']

# ## Helper functions
# 
# These functions retrieve data from HISE, read each .h5 file as an AnnData object for use with Scrublet, and apply Scrublet to each file.

# In[4]:

# define a function to read count data
def read_mat(h5_con):
    mat = scs.csc_matrix(
        (h5_con['matrix']['data'][:],       # Count values
         h5_con['matrix']['indices'][:],    # Row indices
         h5_con['matrix']['indptr'][:]),    # Pointers for column positions
        shape = tuple(h5_con['matrix']['shape'][:]) # Matrix dimensions
    )
    return mat

# define a function to read observation metadata (i.e. cell metadata)
def read_obs(h5_con):
    bc = h5_con['matrix']['barcodes'][:]
    bc = [x.decode('UTF-8') for x in bc]

    # Initialize the DataFrame with cell barcodes
    obs_df = pd.DataFrame({ 'barcodes' : bc })
    obs_df = obs_df.set_index('barcodes', drop = False)
    obs_df['barcodes'] = obs_df['barcodes'].astype("category")

    return obs_df

# define a function to construct an anndata object from an h5 file
def read_h5_anndata(h5_con):
    #h5_con = h5py.File(h5_file, mode = 'r')
    # extract the expression matrix
    mat = read_mat(h5_con)
    # extract gene names
    genes = h5_con['matrix']['features']['name'][:]
    genes = [x.decode('UTF-8') for x in genes]
    # extract metadata
    obs_df = read_obs(h5_con)
    # construct anndata
    adata = anndata.AnnData(mat.T, obs = obs_df)
    # make sure the gene names are aligned
    adata.var_names = genes
    adata.var_names_make_unique()
    return adata

# In[5]:

def get_adata(uuid):
    # Load the file using HISE
    res = hisepy.reader.read_files([uuid])

    # If there's an error, read_files returns a list instead of a dictionary.
    # We should raise an exception with the message when this happens.
    if(isinstance(res, list)):
        error_message = res[0]['message']
        raise Exception(error_message)

    # Read the file to adata
    h5_con = res['values'][0]
    adata = read_h5_anndata(h5_con)

    # Clean up the file now that we're done with it
    h5_con.close()
    return(adata)

# In[6]:

def process_file(file_uuid):
    adata = get_adata(file_uuid)
    sc.external.pp.scrublet(
        adata,
        random_state = 3030,
        verbose = False
    )
    result = adata.obs[['barcodes','predicted_doublet','doublet_score']]
    return result
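# Note: `scanpy.external.pp.scrublet()` stores its per-cell outputs in
# `adata.obs['doublet_score']` and `adata.obs['predicted_doublet']`, and records
# its run parameters in `adata.uns['scrublet']`; `process_file()` keeps only the
# per-cell columns alongside the cell barcodes. As an optional single-sample
# check (an illustration, not part of the original workflow, so it is left
# commented out):
# 
#     example_result = process_file(meta_data['file.id'].tolist()[0])
#     example_result['predicted_doublet'].value_counts()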
# ## Apply to each sample in parallel
# 
# Here, we'll use `concurrent.futures` to apply the function above to our samples in parallel.

# In[7]:

results = []
file_uuids = meta_data['file.id'].tolist()
with ThreadPoolExecutor(max_workers = 20) as executor:
    for result in executor.map(process_file, file_uuids):
        results.append(result)

# In[8]:

final_result = pd.concat(results, ignore_index = True)

# In[9]:

prediction_counts = final_result['predicted_doublet'].value_counts()
prediction_counts

# In[10]:

# Fraction of all cells predicted to be doublets
prediction_counts[True] / prediction_counts.sum()

# ## Save results to CSV

# In[12]:

out_dir = 'output'
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)

# In[13]:

out_file = 'output/ref_scrublet_results_{d}.csv'.format(d = date.today())
final_result.to_csv(out_file)

# ## Upload assembled data to HISE
# 
# Finally, we'll use `hisepy.upload.upload_files()` to send a copy of our output to HISE to use for downstream analysis steps.

# In[14]:

study_space_uuid = '64097865-486d-43b3-8f94-74994e0a72e0'
title = 'Reference Scrublet results'

# In[15]:

in_files = [sample_meta_file_uuid] + meta_data['file.id'].to_list()

# In[16]:

out_files = [out_file]

# In[17]:

hisepy.upload.upload_files(
    files = out_files,
    study_space_id = study_space_uuid,
    title = title,
    input_file_ids = in_files
)

# In[18]:

import session_info
session_info.show()

# In[ ]:
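# ## Downstream use
# 
# Later analysis steps read the uploaded CSV back in and use `predicted_doublet`
# to remove likely doublets. A minimal sketch of that join, assuming a combined
# AnnData object `adata` whose obs index contains the same barcodes (this is an
# illustration only, not part of this notebook's pipeline, so it is commented out):
# 
#     doublet_calls = pd.read_csv(out_file, index_col = 0).set_index('barcodes')
#     adata.obs = adata.obs.join(doublet_calls, how = 'left')
#     adata = adata[~adata.obs['predicted_doublet'].fillna(False).astype(bool)].copy()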