#!/usr/bin/env python
# coding: utf-8

# # Assign T cell annotations
# 
# To assemble our annotations, we'll read our clustered T cell data and assign our expert annotations to those clusters. We'll then inspect the annotations in our UMAP projections, and output final labels for these cells.
# 
# For T cells, we have multiple groups of cells to label. We clustered all T cells, then subset cell types for additional resolution. So, we'll load these sets, remove the subsets from the rest of the T cells, assign identities based on clusters in each, and finally concatenate all of the cell barcodes.

# In[1]:


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

from datetime import date
import hisepy
import os
import pandas as pd
import scanpy as sc


# ### Helper function
# 
# These functions make it easy to pull csv and h5ad files stored in HISE as pandas DataFrames

# In[2]:


def read_csv_uuid(csv_uuid, cache_dir = '/home/jupyter/cache'):
    """Read a CSV file stored in HISE as a pandas DataFrame.

    The file is looked up under ``<cache_dir>/<csv_uuid>/``. When that
    directory does not exist yet, ``hisepy.reader.cache_files()`` is called
    for the UUID before reading.

    Parameters
    ----------
    csv_uuid : str
        HISE file UUID; also the name of the cache sub-directory.
    cache_dir : str, optional
        Root of the local file cache. Defaults to the location this
        notebook has always used.

    Returns
    -------
    pandas.DataFrame
        The CSV contents, with the first CSV column used as the index.

    Raises
    ------
    FileNotFoundError
        If the cache directory for ``csv_uuid`` is missing or holds no files.
    """
    csv_path = os.path.join(cache_dir, csv_uuid)
    if not os.path.isdir(csv_path):
        hisepy.reader.cache_files([csv_uuid])

    # os.listdir() order is arbitrary; sort so the chosen file is deterministic
    csv_files = sorted(os.listdir(csv_path))
    if not csv_files:
        raise FileNotFoundError(
            'No files found in cache directory: {p}'.format(p = csv_path)
        )

    csv_file = os.path.join(csv_path, csv_files[0])
    return pd.read_csv(csv_file, index_col = 0)


# In[3]:


def read_obs_uuid(h5ad_uuid, cache_dir = '/home/jupyter/cache'):
    """Read the cell-level observations (``.obs``) of an h5ad file stored in HISE.

    The file is looked up under ``<cache_dir>/<h5ad_uuid>/``. When that
    directory does not exist yet, ``hisepy.reader.cache_files()`` is called
    for the UUID before reading. The h5ad is opened in backed (read-only)
    mode and only ``.obs`` is copied out.

    Parameters
    ----------
    h5ad_uuid : str
        HISE file UUID; also the name of the cache sub-directory.
    cache_dir : str, optional
        Root of the local file cache. Defaults to the location this
        notebook has always used.

    Returns
    -------
    pandas.DataFrame
        An in-memory copy of ``adata.obs``.

    Raises
    ------
    FileNotFoundError
        If the cache directory for ``h5ad_uuid`` is missing or holds no files.
    """
    h5ad_path = os.path.join(cache_dir, h5ad_uuid)
    if not os.path.isdir(h5ad_path):
        hisepy.reader.cache_files([h5ad_uuid])

    # os.listdir() order is arbitrary; sort so the chosen file is deterministic
    h5ad_files = sorted(os.listdir(h5ad_path))
    if not h5ad_files:
        raise FileNotFoundError(
            'No files found in cache directory: {p}'.format(p = h5ad_path)
        )

    h5ad_file = os.path.join(h5ad_path, h5ad_files[0])
    adata = sc.read_h5ad(h5ad_file, backed = 'r')
    try:
        obs = adata.obs.copy()
    finally:
        # Backed mode keeps the file open; release the handle once obs is
        # copied so repeated calls don't accumulate open files.
        adata.file.close()
    return obs


# ## Read subclustering results from HISE

# In[4]:


# Label used in the names of the output files written at the end of this notebook
cell_class = 't-cells'


# In[5]:


# UUID of the clustered T cell h5ad in HISE, and the directory of its cached copy
h5ad_uuid = 'd6ebc576-34ea-4394-a569-e35e16f20253'
h5ad_path = os.path.join('/home/jupyter/cache', h5ad_uuid)


# In[6]:


# Only ask hisepy to cache the file when its cache directory is not present yet
if not os.path.isdir(h5ad_path):
    hisepy.reader.cache_files([h5ad_uuid])


# In[7]:


# os.listdir() order is arbitrary; sort so the chosen file is deterministic,
# and fail with a clear message if nothing was cached.
h5ad_files = sorted(os.listdir(h5ad_path))
if not h5ad_files:
    raise FileNotFoundError(
        'No files found in cache directory: {p}'.format(p = h5ad_path)
    )
h5ad_filename = h5ad_files[0]
h5ad_file = os.path.join(h5ad_path, h5ad_filename)


# In[8]:


# Full read (not backed): the UMAP coordinates and plotting below need the data in memory
adata = sc.read_h5ad(h5ad_file)


# In[9]:


adata.shape


# ## Read iterative results from HISE

# In[10]:


# HISE UUIDs of the subclustered h5ad files, keyed by T cell subset name
iter_uuids = {
    't-cd4-naive': '70651e60-282b-4ed0-96f6-414547297232',
    't-cd8-mait': '0f821486-866b-4c08-b0b8-508a5c544547',
    't-cd8-cm': '6c1dff43-ddc5-437b-8e3d-dd5a32553b16',
    't-cd8-em': 'b671c53a-2698-41c1-a886-9ab939306716',
    'treg': '35b11bcf-7a45-4714-b470-9f7627d6fbbd',
    't-cd8-naive': '5ae29893-5a77-4081-86d1-523713a237e6',
    't-proliferating': '90a71622-5713-47f7-82e8-18e164ca9454',
    't-gd': '71d79aee-5600-4f3f-a3d1-e3f830e1c0ff',
    't-isg-high': 'd33ef147-59db-4fb6-950c-1dd8af242d4f',
    't-other': 'bda4fe2f-1d8a-4ec5-9ce7-6bee1a158d7b',
}


# In[11]:


# Load the cell-level observations (obs) of every subset, keyed like iter_uuids
iter_obs = {
    subset_name: read_obs_uuid(subset_uuid)
    for subset_name, subset_uuid in iter_uuids.items()
}


# ## Drop gdT cells from non-gdT data
# 
# For gdT cell subclustering, we included some cells that initially clustered with MAIT, CD8 CM, and CD8 EM cells. Here, we'll identify the barcodes in our gdT subclustering results, then drop those barcodes from the other subclustering results so we don't have duplicates.

# In[12]:


# Barcodes of every cell in the gdT subclustering results
gdt_bc = iter_obs["t-gd"]['barcodes'].tolist()
len(gdt_bc)


# In[13]:


# Subsets that contributed cells to the gdT subclustering
drop_set = ['t-cd8-mait', 't-cd8-cm', 't-cd8-em']


# In[14]:


for cell_type in drop_set:
    obs = iter_obs[cell_type]
    n_start = obs.shape[0]
    # Vectorized mask: keep only the cells whose barcode is NOT in the gdT set
    keep_bc = ~obs['barcodes'].isin(gdt_bc)
    obs = obs[keep_bc]
    n_end = obs.shape[0]
    # Report how many cells were removed from this subset
    print('{c}; N Start: {s}; N End: {e}'.format(c = cell_type, s = str(n_start), e=str(n_end)))

    iter_obs[cell_type] = obs


# ## Assign labels to cell barcodes
# 
# Now, we'll join cell type labels from our cluster annotations to our cell barcode-level observations.

# In[15]:


# HISE UUIDs of the cluster annotation CSVs, keyed by T cell subset name
anno_uuids = {
    't-gd':            '18df15b0-769e-4af5-a2b9-2df0413c4519',
    'treg':            '401874cf-6700-4721-803d-acf66d9db321',
    't-cd8-mait':      '6b77ef51-9b88-427c-bfce-a596a97610a8',
    't-cd4-naive':     '8dd06068-ec33-4bbd-ab10-00944db3d304',
    't-cd8-naive':     '921181ac-ceaf-4508-bc87-3165dddb0451',
    't-cd8-cm':        'ae96a58d-e0da-4838-a473-86c59d786e02',
    't-other':         'b0207abb-6e23-457e-aa63-72b0c04d57dd',
    't-proliferating': 'cc09cd70-2b40-44d5-836a-bbae87eb7e8f',
    't-isg-high':      'dbded4c8-644a-4b81-a6b2-c13e97f3733b',
    't-cd8-em':        'df6cef89-3647-4a67-87de-daa9d9bf8171',
}


# In[16]:


# Read each annotation table into a DataFrame, keyed like anno_uuids
iter_anno = {
    subset_name: read_csv_uuid(anno_uuid)
    for subset_name, anno_uuid in anno_uuids.items()
}


# In[17]:


# Per-subset tables of barcode + AIFI_L1/L2/L3 labels
iter_bc_anno = {}
for cell_type,sub_obs in iter_obs.items():
    sub_anno = iter_anno[cell_type]
    # The first column of the annotation table (after its index) names the
    # cluster column that is shared with the observations.
    join_col = sub_anno.columns[0]
    # Cast cluster ids to string categories before joining
    # (presumably to match the cluster column dtype in obs - verify if a join fails).
    sub_anno[join_col] = sub_anno[join_col].astype(str).astype('category')
    # many_to_one: each cluster may appear only once in the annotation table;
    # a duplicated cluster row would otherwise silently duplicate its cells.
    sub_obs = sub_obs.merge(sub_anno, on = join_col, how = 'left', validate = 'many_to_one')
    sub_obs = sub_obs[['barcodes', 'AIFI_L1', 'AIFI_L2', 'AIFI_L3']]
    iter_bc_anno[cell_type] = sub_obs


# ## Assemble all labels

# In[18]:


# Stack the per-subset label tables into one table
# (the dict keys become the outer index level)
all_anno = pd.concat(iter_bc_anno)


# In[19]:


all_anno.shape


# In[20]:


adata.shape


# In[21]:


# Number of cells in the full dataset whose barcode has a label row.
# Series.sum() counts the True values without a Python-level loop.
adata.obs['barcodes'].isin(all_anno['barcodes']).sum()


# ## Add to AnnData to preview assignments

# In[22]:


# Left-join the labels onto the full set of observations by barcode, then
# index the result by barcode.
obs = (
    adata.obs
    .reset_index(drop = True)
    .merge(all_anno, on = 'barcodes', how = 'left')
    .set_index('barcodes', drop = True)
)


# In[23]:


adata.obs = obs


# In[24]:


# UMAP colored by each level of the assigned labels
sc.pl.umap(
    adata,
    color = ['AIFI_L1', 'AIFI_L2', 'AIFI_L3'],
    ncols = 1
)


# In[25]:


# UMAP colored by the clustering results, for comparison with the labels
sc.pl.umap(
    adata,
    color = [
        'leiden_resolution_1',
        'leiden_resolution_1.5',
        'leiden_resolution_2'
    ],
    ncols = 1
)


# ## Output final annotations

# In[26]:


# Move the index of adata.obs back into a regular column so it is written out
# with the rest of the metadata.
obs = adata.obs.reset_index(drop = False)


# In[27]:


# Attach the two UMAP coordinates as columns. Both frames carry a default
# 0..n-1 index here, so rows line up by position.
umap_mat = adata.obsm['X_umap']
umap_df = pd.DataFrame(umap_mat, columns = ['umap_1', 'umap_2'])
obs = obs.assign(
    umap_1 = umap_df['umap_1'],
    umap_2 = umap_df['umap_2']
)


# In[28]:


obs.head()


# In[29]:


# Create the output directory if it is not there yet
out_dir = 'output'
os.makedirs(out_dir, exist_ok = True)


# In[30]:


# Take the date once so all four output files carry the same stamp, even if
# the notebook runs across midnight.
out_date = date.today()

# Full metadata with labels and UMAP coordinates, as CSV
obs_out_csv = '{p}/ref_pbmc_{c}_labeled_meta_umap_{d}.csv'.format(p = out_dir, c = cell_class, d = out_date)
obs.to_csv(obs_out_csv, index = False)


# In[31]:


# Same table, as parquet
obs_out_parquet = '{p}/ref_pbmc_{c}_labeled_meta_umap_{d}.parquet'.format(p = out_dir, c = cell_class, d = out_date)
obs.to_parquet(obs_out_parquet, index = False)


# In[32]:


# Reduced table: barcodes and their three label levels only
bc_anno = obs[['barcodes', 'AIFI_L1', 'AIFI_L2', 'AIFI_L3']]


# In[33]:


label_out_csv = '{p}/ref_pbmc_{c}_barcode_labels_{d}.csv'.format(p = out_dir, c = cell_class, d = out_date)
bc_anno.to_csv(label_out_csv, index = False)


# In[34]:


label_out_parquet = '{p}/ref_pbmc_{c}_barcode_labels_{d}.parquet'.format(p = out_dir, c = cell_class, d = out_date)
bc_anno.to_parquet(label_out_parquet, index = False)


# ## Upload annotations to HISE
# 
# Finally, we'll use `hisepy.upload.upload_files()` to send a copy of our output to HISE to use for downstream analysis steps.

# In[35]:


# Destination study space and the title for this upload
study_space_uuid = '64097865-486d-43b3-8f94-74994e0a72e0'
title = 'T cell barcode annotations {d}'.format(d = date.today())


# In[36]:


# UUIDs of every subset h5ad and every annotation CSV read above
iter_h5ad_uuids = [*iter_uuids.values()]
iter_anno_uuids = [*anno_uuids.values()]


# In[37]:


# All input files: the full T cell h5ad first, then the subsets, then the annotations
in_files = [h5ad_uuid, *iter_h5ad_uuids, *iter_anno_uuids]


# In[38]:


in_files


# We should have 21 input files: 11 h5ad's (the full T cell dataset plus 10 subsets) and 10 annotations

# In[39]:


len(in_files)


# In[40]:


# The four files written above: full metadata and barcode labels, each as CSV and parquet
out_files = [
    obs_out_csv,
    obs_out_parquet,
    label_out_csv,
    label_out_parquet
]


# In[41]:


out_files


# In[42]:


# Send the outputs to HISE, passing the input file UUIDs along with them
hisepy.upload.upload_files(
    files = out_files,
    study_space_id = study_space_uuid,
    title = title,
    input_file_ids = in_files
)


# In[43]:


# Final notebook cell: display the report produced by session_info.show()
# (presumably the versions of the packages used in this session - confirm
# against the session_info documentation).
import session_info
session_info.show()


# In[ ]: