#!/usr/bin/env python
# coding: utf-8

# # Assign Myeloid cell annotations
# 
# To assemble our annotations, we'll read our clustered Myeloid cell data and assign our expert annotations to those clusters. We'll then inspect the annotations in our UMAP projections, and output final labels for these cells.
# 
# For Myeloid cells, we have two groups of cells to label: most of the Myeloid cells were assigned labels at one resolution, and the Dendritic cells (DCs) were assigned labels after additional, iterative clustering. So, we'll load both of these sets, remove DCs from the rest of the Myeloid cells, assign identities based on clusters in each, and finally concatenate all of the cell barcodes.

# In[1]:


import warnings
# Ignore FutureWarning and RuntimeWarning for the rest of the session
# (presumably to keep notebook output uncluttered).
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

from datetime import date
import hisepy
import os
import pandas as pd
import scanpy as sc


# ### Helper function
# 
# This function makes it easy to pull csv files stored in HISE as a pandas DataFrame

# In[2]:


def read_csv_uuid(csv_uuid, cache_dir = '/home/jupyter/cache'):
    """Read a CSV file stored in HISE into a pandas DataFrame.

    The file is looked up in ``<cache_dir>/<csv_uuid>/``. When that
    directory does not exist, ``hisepy.reader.cache_files`` is called with
    the UUID first, and the directory is then expected to be present.

    Parameters
    ----------
    csv_uuid : str
        UUID of the CSV file in HISE.
    cache_dir : str, optional
        Root of the local file cache. The default is the location this
        script uses everywhere else. NOTE(review): a non-default value only
        helps for files that are already cached there; hisepy presumably
        always downloads into its own cache location -- confirm.

    Returns
    -------
    pandas.DataFrame
        Contents of the CSV, with its first column used as the index.

    Raises
    ------
    FileNotFoundError
        If the cache directory is still missing after the fetch attempt.
    IndexError
        If the cache directory exists but contains no files.
    """
    csv_path = os.path.join(cache_dir, csv_uuid)
    if not os.path.isdir(csv_path):
        # Not cached yet: ask hisepy to fetch the file by UUID.
        hisepy.reader.cache_files([csv_uuid])
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # choice deterministic should the directory ever hold more than one file.
    csv_filename = sorted(os.listdir(csv_path))[0]
    csv_file = os.path.join(csv_path, csv_filename)
    return pd.read_csv(csv_file, index_col = 0)


# ## Read subclustering results from HISE

# In[3]:


# Name of the cell class handled here; it is inserted into the output file
# names written later in this script.
cell_class = 'myeloid'


# In[4]:


# UUID passed to hisepy to fetch the clustered AnnData file, and the local
# cache directory expected to hold it.
h5ad_uuid = 'c38df326-662d-459b-982d-0186c022f70d'
h5ad_path = '/home/jupyter/cache/{u}'.format(u = h5ad_uuid)


# In[5]:


# Only call hisepy when the cache directory is not already present.
if not os.path.isdir(h5ad_path):
    hise_res = hisepy.reader.cache_files([h5ad_uuid])


# In[6]:


# Use the first entry listed in the cache directory as the .h5ad file.
# NOTE(review): assumes the directory holds exactly one file -- confirm.
h5ad_filename = os.listdir(h5ad_path)[0]
h5ad_file = '{p}/{f}'.format(p = h5ad_path, f = h5ad_filename)


# In[7]:


adata = sc.read_h5ad(h5ad_file)


# In[8]:


# Notebook display of the loaded object's dimensions.
adata.shape


# ## Read DC subclustering results from HISE

# In[9]:


# UUID passed to hisepy to fetch the DC subclustering AnnData file, and the
# local cache directory expected to hold it.
dc_uuid = '892e4fb0-8dad-4cb6-bcec-8f29b3dcd15e'
dc_path = '/home/jupyter/cache/{u}'.format(u = dc_uuid)


# In[10]:


# Fetch by file UUID (not by the local cache path) when the cache directory
# is missing, the same way the other HISE inputs in this script are fetched.
if not os.path.isdir(dc_path):
    hise_res = hisepy.reader.cache_files([dc_uuid])


# In[11]:


# Use the first entry listed in the cache directory as the .h5ad file.
# NOTE(review): assumes the directory holds exactly one file -- confirm.
dc_filename = os.listdir(dc_path)[0]
dc_file = '{p}/{f}'.format(p = dc_path, f = dc_filename)


# In[12]:


dc_adata = sc.read_h5ad(dc_file)


# In[13]:


# Notebook display of the loaded object's dimensions.
dc_adata.shape


# ## Subset non-DC cells

# In[14]:


# One boolean per cell in `adata`: True when the cell's barcode does not
# appear in the DC object, i.e. the cell should be kept as a non-DC cell.
drop_lgl = (~adata.obs['barcodes'].isin(dc_adata.obs['barcodes'])).tolist()


# In[15]:


# Materialise the selected cells as an independent AnnData object.
nondc_adata = adata[drop_lgl].copy()


# In[16]:


# Notebook display of the subset's dimensions.
nondc_adata.shape


# ## Read non-DC annotations

# In[17]:


# UUID of the CSV holding the cluster annotations for the non-DC cells.
anno_uuid = '9f7d59f2-7aa8-4c2a-86b9-fe6c46b1068f'


# In[18]:


# Load the annotation table through the read_csv_uuid() helper.
anno = read_csv_uuid(anno_uuid)


# ## Assign non-DC labels

# In[19]:


# Cluster column present in both the cell metadata and the annotation table.
join_col = 'leiden_resolution_3'


# In[20]:


# Convert the annotation table's cluster ids to string, then to categorical,
# before joining (presumably to match the cluster labels in .obs -- confirm).
anno[join_col] = anno[join_col].astype('string').astype('category')


# In[21]:


# Per-cell metadata of the non-DC cells.
obs = nondc_adata.obs


# In[22]:


# Notebook check: number of cells whose cluster id has an annotation row.
obs[join_col].isin(anno[join_col]).sum()


# In[23]:


# Left join keeps every cell; clusters without an annotation get missing labels.
nondc_anno = pd.merge(obs, anno, how = 'left', on = join_col)


# In[24]:


nondc_anno.head()


# ## Read DC annotations

# In[25]:


# UUID of the CSV holding the cluster annotations for the DC subclustering,
# loaded through the read_csv_uuid() helper.
dc_anno_uuid = '98c74523-e518-49f3-a021-f30b87a8f565'
dc_anno = read_csv_uuid(dc_anno_uuid)


# ## Assign DC labels

# In[26]:


# Cluster column present in both the DC cell metadata and the DC annotation table.
join_col = 'leiden_resolution_2_myeloid-dcs'


# In[27]:


# Convert the annotation table's cluster ids to string, then to categorical,
# before joining (presumably to match the cluster labels in .obs -- confirm).
dc_anno[join_col] = dc_anno[join_col].astype('string').astype('category')


# In[28]:


# Per-cell metadata of the DC cells.
obs = dc_adata.obs


# In[29]:


# Notebook check: number of DC cells whose cluster id has an annotation row.
obs[join_col].isin(dc_anno[join_col]).sum()


# In[30]:


# Left join keeps every DC cell; from here on `dc_anno` holds the per-cell
# labeled table rather than the per-cluster annotation table.
dc_anno = pd.merge(obs, dc_anno, how = 'left', on = join_col)


# In[31]:


dc_anno.head()


# ## Concatenate annotations

# In[32]:


# Stack the labeled non-DC rows on top of the labeled DC rows.
anno = pd.concat([nondc_anno, dc_anno], axis = 0)


# In[33]:


# Keep only the barcode and the three label columns.
anno = anno.loc[:, ['barcodes', 'AIFI_L1', 'AIFI_L2', 'AIFI_L3']]


# ## Add to AnnData to preview assignments

# In[34]:


# Index the labels by barcode so they can be joined to adata.obs on the index.
# NOTE(review): assumes adata.obs is indexed by cell barcode -- confirm.
anno = anno.set_index('barcodes')


# In[35]:


# Left join on the index keeps every row of adata.obs and adds the labels.
obs = pd.merge(adata.obs, anno, how = 'left', left_index = True, right_index = True)


# In[36]:


# Store the labeled metadata back on the AnnData object.
adata.obs = obs


# In[37]:


adata.obs.head()


# In[38]:


# UMAP colored by each of the three label columns, one panel per row.
sc.pl.umap(adata, ncols = 1, color = ['AIFI_L1', 'AIFI_L2', 'AIFI_L3'])


# In[39]:


# UMAP colored by the clustering columns, for comparison with the labels.
sc.pl.umap(
    adata,
    ncols = 1,
    color = [
        'leiden_resolution_1',
        'leiden_resolution_1.5',
        'leiden_resolution_2',
        'leiden_resolution_2.5',
        'leiden_resolution_3',
    ],
)


# ## Output final annotations

# In[40]:


# Labeled metadata with the existing index discarded in favor of 0..n-1.
obs = adata.obs.reset_index(drop = True)


# In[41]:


# Pull the stored UMAP coordinates and attach them as two named columns.
# Both frames carry a default integer index, so rows line up by position.
umap_mat = adata.obsm['X_umap']
umap_df = pd.DataFrame(umap_mat, columns = ['umap_1', 'umap_2'])
obs = obs.assign(umap_1 = umap_df['umap_1'], umap_2 = umap_df['umap_2'])


# In[42]:


obs.head()


# In[43]:


# Create the output directory if needed; exist_ok makes a separate
# "does it exist?" check unnecessary.
out_dir = 'output'
os.makedirs(out_dir, exist_ok = True)


# In[44]:


# Take the date once so that all four output files carry the same date stamp,
# even if the run straddles midnight.
out_date = date.today()

# Full labeled metadata (with UMAP coordinates) as CSV.
obs_out_csv = '{p}/ref_pbmc_{c}_labeled_meta_umap_{d}.csv'.format(p = out_dir, c = cell_class, d = out_date)
obs.to_csv(obs_out_csv, index = False)


# In[45]:


# The same table as Parquet.
obs_out_parquet = '{p}/ref_pbmc_{c}_labeled_meta_umap_{d}.parquet'.format(p = out_dir, c = cell_class, d = out_date)
obs.to_parquet(obs_out_parquet, index = False)


# In[46]:


# Compact table: barcode plus the three label columns only.
bc_anno = obs[['barcodes', 'AIFI_L1', 'AIFI_L2', 'AIFI_L3']]


# In[47]:


label_out_csv = '{p}/ref_pbmc_{c}_barcode_labels_{d}.csv'.format(p = out_dir, c = cell_class, d = out_date)
bc_anno.to_csv(label_out_csv, index = False)


# In[48]:


label_out_parquet = '{p}/ref_pbmc_{c}_barcode_labels_{d}.parquet'.format(p = out_dir, c = cell_class, d = out_date)
bc_anno.to_parquet(label_out_parquet, index = False)


# ## Upload annotations to HISE
# 
# Finally, we'll use `hisepy.upload.upload_files()` to send a copy of our output to HISE to use for downstream analysis steps.

# In[49]:


# Identifier passed as `study_space_id` to the upload call, and the title
# for the upload, stamped with today's date.
study_space_uuid = '64097865-486d-43b3-8f94-74994e0a72e0'
title = 'Myeloid cell barcode annotations {d}'.format(d = date.today())


# In[50]:


# UUIDs of the four HISE files read by this script (two AnnData files and
# two annotation CSVs), passed to the upload call as `input_file_ids`.
in_files = [h5ad_uuid, dc_uuid, anno_uuid, dc_anno_uuid]


# In[51]:


in_files


# In[52]:


# Paths of the four files written by this script.
out_files = [obs_out_csv, obs_out_parquet,
             label_out_csv, label_out_parquet]


# In[53]:


out_files


# In[54]:


# Send the output files to HISE together with the input file UUIDs.
hisepy.upload.upload_files(
    files = out_files,
    study_space_id = study_space_uuid,
    title = title,
    input_file_ids = in_files
)


# In[55]:


# Show session information as the final notebook output; session_info is
# imported here, at its point of use, rather than with the other imports.
import session_info
session_info.show()


# In[ ]: