#!/usr/bin/env python
# coding: utf-8

# # Assign B cell annotations
# 
# To assemble our annotations, we'll read our clustered B cell data and assign our expert annotations to those clusters. We'll then inspect the annotations in our UMAP projections, and output final labels for these cells.
# 
# For B cells, we have two groups of cells to label: most of the B cells were assigned labels at one resolution, and the non-effector memory B cells were assigned labels after additional, iterative clustering. So, we'll load both of these sets, remove the memory cells from the rest of the B cells, assign identities based on clusters in each, and finally concatenate all of the cell barcodes.

# In[1]:


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

from datetime import date
import hisepy
import os
import pandas as pd
import scanpy as sc


# ### Helper function
# 
# This function makes it easy to read CSV files stored in HISE into a pandas DataFrame.

# In[2]:


def read_csv_uuid(csv_uuid, cache_dir = '/home/jupyter/cache'):
    """Read a CSV file identified by a HISE UUID into a pandas DataFrame.

    The file is looked up in ``<cache_dir>/<csv_uuid>/``. When that
    directory does not exist, ``hisepy.reader.cache_files()`` is called with
    the UUID first (presumably this populates the directory).

    Parameters
    ----------
    csv_uuid : str
        UUID of the CSV file; also the name of its cache sub-directory.
    cache_dir : str, optional
        Directory holding one sub-directory per cached UUID. Defaults to the
        cache location used throughout this notebook.

    Returns
    -------
    pandas.DataFrame
        Contents of the CSV, with its first column used as the index.

    Raises
    ------
    FileNotFoundError
        If the UUID directory is missing or contains no files.
    """
    csv_path = os.path.join(cache_dir, csv_uuid)
    if not os.path.isdir(csv_path):
        hisepy.reader.cache_files([csv_uuid])

    cached_files = os.listdir(csv_path)
    if not cached_files:
        # Fail with a message naming the directory instead of a bare
        # IndexError from indexing an empty listing.
        raise FileNotFoundError(
            'No cached file found in {p}'.format(p = csv_path)
        )

    # The first entry listed in the UUID directory is the one that is read.
    csv_file = os.path.join(csv_path, cached_files[0])
    return pd.read_csv(csv_file, index_col = 0)


# ## Read subclustering results from HISE

# In[3]:


# Cell class label; it is inserted into the output file names further down.
cell_class = 'b-cells'


# In[4]:


# File UUID of the clustered B cell .h5ad and its local cache directory.
h5ad_uuid = '99f83994-26ee-49af-a882-c1f2558daed2'
h5ad_path = '/home/jupyter/cache/' + h5ad_uuid


# In[5]:


# Only ask hisepy to cache the file when its directory is not there yet.
h5ad_is_cached = os.path.isdir(h5ad_path)
if not h5ad_is_cached:
    hise_res = hisepy.reader.cache_files([h5ad_uuid])


# In[6]:


# The first entry listed in the cache directory is the file to read.
h5ad_filename = os.listdir(h5ad_path)[0]
h5ad_file = h5ad_path + '/' + h5ad_filename


# In[7]:


adata = sc.read_h5ad(filename = h5ad_file)


# In[8]:


# Notebook preview of the (cells, features) dimensions.
adata.shape


# ## Read memory cell subclustering results from HISE

# In[9]:


# File UUID of the memory B cell subclustering .h5ad and its local cache
# directory.
mem_uuid = '51838754-b378-4f13-b447-82511bcd0a66'
mem_path = '/home/jupyter/cache/{u}'.format(u = mem_uuid)


# In[10]:


# cache_files() must receive the file UUID, exactly as for the other cached
# inputs in this notebook. Passing the local path (mem_path) only went
# unnoticed when the directory was already cached.
if not os.path.isdir(mem_path):
    hise_res = hisepy.reader.cache_files([mem_uuid])


# In[11]:


# The first entry listed in the cache directory is the file to read.
mem_filename = os.listdir(mem_path)[0]
mem_file = '{p}/{f}'.format(p = mem_path, f = mem_filename)


# In[12]:


mem_adata = sc.read_h5ad(mem_file)


# In[13]:


# Notebook preview of the loaded object.
mem_adata


# ## Subset non-memory cells

# In[14]:


# One boolean per cell in adata: True when the barcode is NOT present in
# mem_adata. Despite its name, this mask marks the cells that are kept.
in_memory_set = adata.obs['barcodes'].isin(mem_adata.obs['barcodes'])
drop_lgl = list(~in_memory_set)


# In[15]:


# Copy so the subset is an independent object rather than a view of adata.
nonmem_adata = adata[drop_lgl].copy()


# In[16]:


# Notebook preview of the remaining dimensions.
nonmem_adata.shape


# ## Read non-memory annotations

# In[17]:


# File UUID of the annotation table for the non-memory clusters.
anno_uuid = 'aed5e5f4-1166-48c1-ad84-c92aee8edcf8'


# In[18]:


anno = read_csv_uuid(anno_uuid)


# ## Assign non-memory labels

# In[19]:


# Cluster column present in both the annotation table and the cell metadata.
join_col = 'ms_leiden_2'


# In[20]:


# Convert the cluster ids to string-valued categories before joining.
anno[join_col] = (
    anno[join_col]
    .astype('string')
    .astype('category')
)


# In[21]:


obs = nonmem_adata.obs


# In[22]:


# Notebook check: number of cells whose cluster id appears in the
# annotation table.
sum(obs[join_col].isin(anno[join_col]))


# In[23]:


# Left join so every cell in obs is retained.
# NOTE(review): assumes each cluster id occurs once in anno; duplicates
# would duplicate cells - confirm against the annotation file.
nonmem_anno = pd.merge(obs, anno, how = 'left', on = join_col)


# In[24]:


nonmem_anno.head()


# ## Read memory annotations

# In[25]:


# File UUID of the annotation table for the memory cell clusters.
mem_anno_uuid = 'afc4fe7f-6426-41ce-a10c-0dcdce422dd1'
mem_anno = read_csv_uuid(mem_anno_uuid)


# ## Assign memory labels

# In[26]:


# The memory cells are joined on a different cluster column.
join_col = 'ms_leiden_2.5'


# In[27]:


# Convert the cluster ids to string-valued categories before joining.
mem_anno[join_col] = (
    mem_anno[join_col]
    .astype('string')
    .astype('category')
)


# In[28]:


obs = mem_adata.obs


# In[29]:


# Notebook check: number of memory cells whose cluster id appears in the
# annotation table.
sum(obs[join_col].isin(mem_anno[join_col]))


# In[30]:


# Left join so every memory cell is retained. From here on mem_anno holds
# the merged per-cell table, not the cluster table that was read above.
mem_anno = pd.merge(obs, mem_anno, how = 'left', on = join_col)


# In[31]:


mem_anno.head()


# ## Concatenate annotations

# In[32]:


# Stack the non-memory and memory per-cell tables row-wise.
per_cell_tables = [nonmem_anno, mem_anno]
anno = pd.concat(per_cell_tables, axis = 0)


# In[33]:


# Keep only the barcode and the three label columns.
anno = anno.loc[:, ['barcodes', 'AIFI_L1', 'AIFI_L2', 'AIFI_L3']]


# ## Add to AnnData to preview assignments

# In[34]:


# Index the labels by barcode so they can be joined on the obs index.
anno = anno.set_index('barcodes')


# In[35]:


# Index-on-index left join of the labels onto the full cell metadata.
# NOTE(review): this relies on adata.obs being indexed by the same values
# as the 'barcodes' column - confirm.
obs = pd.merge(
    adata.obs, anno,
    how = 'left',
    left_index = True,
    right_index = True
)


# In[36]:


adata.obs = obs


# In[37]:


adata.obs.head()


# In[38]:


# One UMAP panel per label level, stacked in a single column.
sc.pl.umap(
    adata,
    color = ['AIFI_L1', 'AIFI_L2', 'AIFI_L3'],
    ncols = 1
)


# In[39]:


# The same layout for the clustering columns, for comparison with the labels.
sc.pl.umap(
    adata,
    color = [
        'leiden_resolution_1',
        'leiden_resolution_1.5',
        'leiden_resolution_2',
        'ms_leiden_2',
    ],
    ncols = 1
)


# ## Output final annotations

# In[40]:


# Take the labeled metadata with a fresh positional index; the old index is
# dropped rather than kept as a column.
obs = adata.obs.reset_index(drop = True)


# In[41]:


# Append the two UMAP coordinates. umap_df also carries a default positional
# index, so the column assignment lines up row for row with obs.
umap_mat = adata.obsm['X_umap']
umap_df = pd.DataFrame(umap_mat, columns = ['umap_1', 'umap_2'])
for umap_col in ['umap_1', 'umap_2']:
    obs[umap_col] = umap_df[umap_col]


# In[42]:


obs.head()


# In[43]:


# Create the output directory if it is not there yet.
out_dir = 'output'
os.makedirs(out_dir, exist_ok = True)


# In[44]:


# Full metadata plus UMAP coordinates, as CSV.
obs_out_csv = '{p}/ref_pbmc_{c}_labeled_meta_umap_{d}.csv'.format(
    p = out_dir,
    c = cell_class,
    d = date.today()
)
obs.to_csv(obs_out_csv, index = False)


# In[45]:


# The same table as Parquet.
obs_out_parquet = '{p}/ref_pbmc_{c}_labeled_meta_umap_{d}.parquet'.format(
    p = out_dir,
    c = cell_class,
    d = date.today()
)
obs.to_parquet(obs_out_parquet, index = False)


# In[46]:


# Barcode-to-label table only.
bc_anno = obs.loc[:, ['barcodes', 'AIFI_L1', 'AIFI_L2', 'AIFI_L3']]


# In[47]:


label_out_csv = '{p}/ref_pbmc_{c}_barcode_labels_{d}.csv'.format(
    p = out_dir,
    c = cell_class,
    d = date.today()
)
bc_anno.to_csv(label_out_csv, index = False)


# In[48]:


label_out_parquet = '{p}/ref_pbmc_{c}_barcode_labels_{d}.parquet'.format(
    p = out_dir,
    c = cell_class,
    d = date.today()
)
bc_anno.to_parquet(label_out_parquet, index = False)


# ## Upload annotations to HISE
# 
# Finally, we'll use `hisepy.upload.upload_files()` to send a copy of our output to HISE to use for downstream analysis steps.

# In[50]:


# Destination study space and a title stamped with today's date.
study_space_uuid = '64097865-486d-43b3-8f94-74994e0a72e0'
title = 'B cell barcode annotations {d}'.format(
    d = date.today()
)


# In[51]:


# UUIDs of every input read above: both .h5ad files and both annotation
# tables.
in_files = [
    h5ad_uuid,
    mem_uuid,
    anno_uuid,
    mem_anno_uuid,
]


# In[52]:


in_files


# In[53]:


# The four files written in the output section.
out_files = [
    obs_out_csv,
    obs_out_parquet,
    label_out_csv,
    label_out_parquet,
]


# In[54]:


out_files


# In[55]:


# Send the outputs to HISE together with the list of input file UUIDs.
hisepy.upload.upload_files(
    files = out_files,
    study_space_id = study_space_uuid,
    title = title,
    input_file_ids = in_files
)


# In[56]:


# Imported here, in the final cell, rather than with the imports at the top;
# show() is called to display session information for this run.
import session_info
session_info.show()


# In[ ]: