#!/usr/bin/env python
# coding: utf-8

# # Generate "clean" reference dataset
# 
# In this notebook, we'll read the assembled PBMC reference dataset and remove cell types flagged as Doublets, Contamination, HBB+, or with high mitochondrial content. We'll then plot the remaining cells on the existing UMAP projection and save the clean reference for use in visualization and for construction of reference models.

# In[1]:


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

from datetime import date
import hisepy
import os
import pandas as pd
import scanpy as sc
import scanpy.external as sce


# In[2]:


def read_adata_uuid(h5ad_uuid):
    """Read the .h5ad file cached for a HISE file UUID.

    The file is looked up under ``/home/jupyter/cache/<h5ad_uuid>``. When
    that directory does not exist, ``hisepy.reader.cache_files()`` is called
    for the UUID before the directory is listed.

    Parameters
    ----------
    h5ad_uuid : str
        UUID of the file to read.

    Returns
    -------
    The object returned by ``sc.read_h5ad()`` for the first file listed in
    the cache directory.

    Raises
    ------
    FileNotFoundError
        If the cache directory is missing or contains no files.
    """
    h5ad_path = '/home/jupyter/cache/{u}'.format(u = h5ad_uuid)
    if not os.path.isdir(h5ad_path):
        # The return value is not needed here; presumably this call
        # populates the cache directory listed below (confirm in hisepy docs).
        hisepy.reader.cache_files([h5ad_uuid])

    cached_files = os.listdir(h5ad_path)
    if not cached_files:
        # Fail with a clear message instead of an IndexError on [0].
        raise FileNotFoundError(
            'No cached files found in {p}'.format(p = h5ad_path)
        )

    # Presumably each cache directory holds a single file; use the first entry.
    h5ad_file = os.path.join(h5ad_path, cached_files[0])
    return sc.read_h5ad(h5ad_file)


# In[3]:


# Directory that receives every file written by this notebook.
out_dir = 'output'
# exist_ok avoids the check-then-create race and does nothing when the
# directory is already present.
os.makedirs(out_dir, exist_ok = True)


# In[4]:


def element_id(n = 3):
    """Return a random identifier made of hyphen-joined element names.

    Draws ``n`` random indices with ``randrange(0, 118)``, looks each one up
    in ``periodictable.elements``, and joins the ``.name`` values with '-'.

    NOTE(review): ``randrange(0, 118)`` includes index 0 and excludes 118;
    confirm that this is the intended range of entries.
    """
    import periodictable
    from random import randrange

    names = [periodictable.elements[randrange(0, 118)].name for _ in range(n)]
    return '-'.join(names)


# ## Read annotated dataset

# In[5]:


# UUID of the annotated dataset to filter; it is also listed as the input
# file when the results are uploaded at the end of the notebook.
h5ad_uuid = '157bd496-0f1e-4239-83bc-a9616696b63a'


# In[6]:


# Load the dataset through the cache-aware reader defined above.
adata = read_adata_uuid(h5ad_uuid)


# In[7]:


# Notebook display of the object's shape before any filtering.
adata.shape


# ## Filter cell types

# In[8]:


# Substrings that mark a cell type label for removal. Matching below is
# case-sensitive, so both capitalizations are listed explicitly.
exclude_terms = [
    'Contamination', 'contamination',
    'Doublet', 'doublet', 'HBB+',
    'Mito', 'mito'
]


# In[9]:


# Every distinct AIFI_L3 label present in the dataset.
all_types = adata.obs['AIFI_L3'].unique().tolist()


# In[10]:


# any() records each label at most once, even when it contains several
# exclusion terms; duplicate entries would repeat rows when this list is
# used to index the per-type counts later in the notebook.
exclude_types = [
    cell_type for cell_type in all_types
    if any(term in cell_type for term in exclude_terms)
]


# In[11]:


exclude_types


# In[12]:


# Labels that matched none of the exclusion terms.
keep_types = [
    cell_type for cell_type in all_types
    if cell_type not in exclude_types
]


# ## How many are excluded?

# In[13]:


# Number of cells per AIFI_L3 label, and the total across all labels.
all_counts = adata.obs['AIFI_L3'].value_counts()
n_all = int(all_counts.sum())
n_all


# In[14]:


# Counts for the labels that will be removed.
exclude_counts = all_counts.loc[exclude_types]


# In[15]:


exclude_counts


# In[16]:


n_exclude = int(exclude_counts.sum())
n_exclude


# Percent removed:

# In[17]:


n_exclude / n_all * 100


# ## Generate filtered object

# In[18]:


# Boolean mask of cells whose label is retained; computed once and reused
# for both the plotting flag and the subset.
keep_mask = adata.obs['AIFI_L3'].isin(keep_types)
adata.obs['keep'] = keep_mask.astype('category')


# In[19]:


# Show the labels next to the keep flag on the full dataset.
sc.pl.umap(
    adata,
    color = ['AIFI_L3', 'keep'],
    ncols = 1
)


# In[20]:


# Take an explicit copy: .obs is replaced below, and assigning to a view
# would otherwise rely on AnnData converting it to a copy implicitly.
adata_subset = adata[keep_mask].copy()
adata_subset.shape


# In[21]:


# Drop categories that no longer have any cells, so removed labels do not
# remain as unused categories in the label columns.
obs = adata_subset.obs.copy()
for label_col in ['AIFI_L1', 'AIFI_L2', 'AIFI_L3']:
    obs[label_col] = obs[label_col].cat.remove_unused_categories()
adata_subset.obs = obs


# In[22]:


# Plot the filtered cells using the three label columns.
sc.pl.umap(
    adata_subset,
    color = ['AIFI_L1', 'AIFI_L2', 'AIFI_L3'],
    ncols = 1
)


# ## Output final annotations

# In[23]:


# Work on the filtered metadata with the obs index discarded; the tables
# below are written without an index column.
obs = adata_subset.obs
obs = obs.reset_index(drop = True)


# In[24]:


# Resolve the date once so all four annotation files carry the same stamp,
# even if the notebook happens to run across midnight.
meta_date = date.today()

obs_out_csv = '{p}/ref_clean_pbmc_labeled_meta_{d}.csv'.format(p = out_dir, d = meta_date)
obs.to_csv(obs_out_csv, index = False)


# In[25]:


obs_out_parquet = '{p}/ref_clean_pbmc_labeled_meta_{d}.parquet'.format(p = out_dir, d = meta_date)
obs.to_parquet(obs_out_parquet, index = False)


# In[26]:


# Reduced table: the barcodes column plus the three label columns.
bc_anno = obs[['barcodes', 'AIFI_L1', 'AIFI_L2', 'AIFI_L3']]


# In[27]:


label_out_csv = '{p}/ref_clean_pbmc_barcode_labels_{d}.csv'.format(p = out_dir, d = meta_date)
bc_anno.to_csv(label_out_csv, index = False)


# In[28]:


label_out_parquet = '{p}/ref_clean_pbmc_barcode_labels_{d}.parquet'.format(p = out_dir, d = meta_date)
bc_anno.to_parquet(label_out_parquet, index = False)


# ## Output labeled AnnData

# In[29]:


# Write the filtered AnnData object to the output directory, with today's
# date in the file name.
out_h5ad = '{p}/ref_clean_pbmc_labeled_{d}.h5ad'.format(p = out_dir, d = date.today())
adata_subset.write_h5ad(out_h5ad)


# ## Upload results to HISE
# 
# Finally, we'll use `hisepy.upload.upload_files()` to send a copy of our output to HISE to use for downstream analysis steps.

# In[30]:


# Study space UUID and title passed to the upload call below.
study_space_uuid = '64097865-486d-43b3-8f94-74994e0a72e0'
title = '10x 3-prime PBMC Clean Reference {d}'.format(d = date.today())


# In[31]:


# Random element-name string from element_id(); passed as `destination` to
# the upload below. The variable name suggests it is used to locate this
# upload later — TODO confirm against the hisepy documentation.
search_id = element_id()
search_id


# In[32]:


# The only input is the annotated dataset read at the top of the notebook.
in_files = [h5ad_uuid]


# In[33]:


in_files


# In[34]:


# Every file written above: the AnnData object plus both metadata tables
# in CSV and parquet form.
out_files = [out_h5ad,
             obs_out_csv, obs_out_parquet,
             label_out_csv, label_out_parquet]


# In[35]:


out_files


# In[36]:


# Send the output files to HISE together with the input file UUID.
hisepy.upload.upload_files(
    files = out_files,
    study_space_id = study_space_uuid,
    title = title,
    input_file_ids = in_files,
    destination = search_id
)


# In[37]:


# Display session information as the last step of the notebook.
import session_info
session_info.show()


# In[ ]: