#!/usr/bin/env python
# coding: utf-8

# # Assign Other annotations
# 
# To assemble our annotations, we'll read our Other cell data and assign our expert annotations to its Leiden clusters. We'll then inspect the annotations in our UMAP projections and output final labels for these cells.

# In[1]:


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

from datetime import date
import hisepy
import os
import pandas as pd
import scanpy as sc


# ### Helper function
# 
# This function makes it easy to read CSV files stored in HISE into a pandas DataFrame.

# In[2]:


def read_csv_uuid(csv_uuid, cache_dir = '/home/jupyter/cache'):
    """Read a CSV file stored in HISE into a pandas DataFrame.

    If no directory for the file exists under `cache_dir`, the file is
    first requested with `hisepy.reader.cache_files()`.

    Parameters
    ----------
    csv_uuid : str
        HISE file UUID of the CSV file to read.
    cache_dir : str, optional
        Directory holding one sub-directory per cached UUID.

    Returns
    -------
    pandas.DataFrame
        Contents of the CSV file, with its first column used as the index.

    Raises
    ------
    FileNotFoundError
        If the cache directory for `csv_uuid` is missing or contains no files.
    """
    csv_path = os.path.join(cache_dir, csv_uuid)
    if not os.path.isdir(csv_path):
        hisepy.reader.cache_files([csv_uuid])

    # os.listdir() returns entries in arbitrary order; sort so the file that
    # is read is reproducible if the directory ever holds more than one entry.
    csv_filenames = sorted(os.listdir(csv_path))
    if not csv_filenames:
        raise FileNotFoundError(
            'No cached file found in {p}'.format(p = csv_path)
        )

    csv_file = os.path.join(csv_path, csv_filenames[0])
    return pd.read_csv(csv_file, index_col = 0)


# ## Read subclustering results from HISE

# In[3]:


# Cell class label; reused below when naming output files
cell_class = 'other'


# In[4]:


# HISE file UUID of the h5ad file and the directory it is cached under
h5ad_uuid = '1eb6ca8c-b8ed-4968-b515-c954497441dc'
h5ad_path = f'/home/jupyter/cache/{h5ad_uuid}'


# In[5]:


# Only ask hisepy to cache the file when its cache directory is absent
h5ad_is_cached = os.path.isdir(h5ad_path)
if not h5ad_is_cached:
    hise_res = hisepy.reader.cache_files([h5ad_uuid])


# In[6]:


# Take the first entry of the cache directory as the h5ad file
cached_entries = os.listdir(h5ad_path)
h5ad_filename = cached_entries[0]
h5ad_file = f'{h5ad_path}/{h5ad_filename}'


# In[7]:


adata = sc.read_h5ad(filename = h5ad_file)


# In[8]:


adata


# ## Read annotations

# In[9]:


# HISE file UUID of the expert annotation table (one row per cluster)
anno_uuid = '03817547-e2e3-412b-b36e-538d2bc74c87'
anno = read_csv_uuid(anno_uuid)


# In[10]:


anno.head()


# In[11]:


# Cluster column shared by the annotation table and adata.obs
join_col = 'leiden_resolution_1'


# In[12]:


# Cast the cluster ids to categorical strings before joining on them
anno[join_col] = anno[join_col].astype('string').astype('category')


# In[13]:


obs = adata.obs


# In[14]:


# Count the cells whose cluster has an annotation. With a left join, cells in
# unannotated clusters get missing labels, so flag that instead of relying on
# someone reading the count.
n_cells = obs.shape[0]
n_annotated = int(obs[join_col].isin(anno[join_col]).sum())
if n_annotated < n_cells:
    warnings.warn(
        '{n} of {t} cells are in clusters without an annotation and will have missing labels'.format(
            n = n_cells - n_annotated, t = n_cells
        )
    )
n_annotated


# In[15]:


# validate = 'many_to_one' makes pandas raise a MergeError if a cluster id is
# repeated in anno; without it the join would duplicate cells and only fail
# later when the result no longer matches the number of cells in adata.
obs_anno = obs.merge(anno, how = 'left', on = join_col, validate = 'many_to_one')


# In[16]:


# A column-based merge does not keep the left index, so restore the barcodes
# as the index before handing the table back to adata.
adata.obs = obs_anno.set_index('barcodes', drop = False)


# In[17]:


adata.obs.head()


# In[18]:


# UMAP colored by each of the three annotation columns, one panel per row
label_cols = ['AIFI_L1', 'AIFI_L2', 'AIFI_L3']
sc.pl.umap(adata, color = label_cols, ncols = 1)


# In[19]:


# UMAP colored by the Leiden clustering columns, one panel per row
cluster_cols = [
    'leiden_resolution_1',
    'leiden_resolution_1.5',
    'leiden_resolution_2',
]
sc.pl.umap(adata, color = cluster_cols, ncols = 1)


# ## Output final annotations

# In[20]:


# Cell metadata with the barcode index replaced by a positional index
obs = adata.obs.reset_index(drop = True)


# In[21]:


# Append the UMAP coordinates as two columns; rows line up because both
# tables carry the default positional index.
umap_mat = adata.obsm['X_umap']
umap_df = pd.DataFrame(umap_mat, columns = ['umap_1', 'umap_2'])
for umap_col in umap_df.columns:
    obs[umap_col] = umap_df[umap_col]


# In[22]:


obs.head()


# In[23]:


# Create the output directory; exist_ok avoids a separate check-then-create step
out_dir = 'output'
os.makedirs(out_dir, exist_ok = True)

# Take the date once so all four output files carry the same stamp, even if
# the run crosses midnight.
out_date = date.today()


# In[24]:


# Full cell metadata with UMAP coordinates, as CSV
obs_out_csv = '{p}/ref_pbmc_{c}_labeled_meta_umap_{d}.csv'.format(p = out_dir, c = cell_class, d = out_date)
obs.to_csv(obs_out_csv, index = False)


# In[25]:


# Same table as parquet
obs_out_parquet = '{p}/ref_pbmc_{c}_labeled_meta_umap_{d}.parquet'.format(p = out_dir, c = cell_class, d = out_date)
obs.to_parquet(obs_out_parquet, index = False)


# In[26]:


# Barcode-to-label table: just the barcodes and the three annotation columns
bc_anno = obs[['barcodes', 'AIFI_L1', 'AIFI_L2', 'AIFI_L3']]


# In[27]:


label_out_csv = '{p}/ref_pbmc_{c}_barcode_labels_{d}.csv'.format(p = out_dir, c = cell_class, d = out_date)
bc_anno.to_csv(label_out_csv, index = False)


# In[28]:


label_out_parquet = '{p}/ref_pbmc_{c}_barcode_labels_{d}.parquet'.format(p = out_dir, c = cell_class, d = out_date)
bc_anno.to_parquet(label_out_parquet, index = False)


# ## Upload annotations to HISE
# 
# Finally, we'll use `hisepy.upload.upload_files()` to send a copy of our output to HISE to use for downstream analysis steps.

# In[29]:


# Destination study space and the title for this upload
study_space_uuid = '64097865-486d-43b3-8f94-74994e0a72e0'
title = f'Other barcode annotations {date.today()}'


# In[30]:


# HISE UUIDs of the files this notebook read
in_files = [
    h5ad_uuid,
    anno_uuid,
]


# In[31]:


in_files


# In[32]:


# Local paths of the files written above
out_files = [
    obs_out_csv,
    obs_out_parquet,
    label_out_csv,
    label_out_parquet,
]


# In[33]:


out_files


# In[34]:


upload_args = {
    'files': out_files,
    'study_space_id': study_space_uuid,
    'title': title,
    'input_file_ids': in_files,
}
hisepy.upload.upload_files(**upload_args)


# In[35]:


# Display session information at the end of the run via session_info
# (presumably the loaded package versions, for reproducibility).
import session_info
session_info.show()


# In[ ]: