#!/usr/bin/env python
# coding: utf-8

# # Partition L3 Cell Types
# 
# After doublet filtering, we'll partition cells based on their CellTypist L3 label assignments so each type can be reviewed. Later, we'll perform clustering within each group for inspection.

# In[1]:

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

from datetime import date
import hisepy
import os
import pandas as pd
import re
import scanpy as sc

# In[2]:

out_dir = 'output/l3_types'
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)

# ## Helper functions

# In[3]:

# Cache a file from HISE (if not already cached) and return its local path
def cache_uuid_path(uuid):
    cache_path = '/home/jupyter/cache/{u}'.format(u = uuid)
    if not os.path.isdir(cache_path):
        hise_res = hisepy.reader.cache_files([uuid])
    filename = os.listdir(cache_path)[0]
    cache_file = '{p}/{f}'.format(p = cache_path, f = filename)
    return cache_file

# In[4]:

# Read a .parquet file from HISE as a pandas DataFrame
def read_parquet_uuid(uuid):
    cache_file = cache_uuid_path(uuid)
    res = pd.read_parquet(cache_file)
    return res

# In[5]:

# Read a .h5ad file from HISE as an AnnData object
def read_adata_uuid(uuid):
    cache_file = cache_uuid_path(uuid)
    res = sc.read_h5ad(cache_file)
    return res

# In[6]:

# Remove a cached file to free local storage
def rm_cache_uuid(uuid):
    cache_path = '/home/jupyter/cache/{u}'.format(u = uuid)
    rm_call = 'rm -r {d}'.format(d = cache_path)
    os.system(rm_call)

# In[7]:

# Make cell type labels filesystem-friendly: '+' -> 'pos', '-' -> 'neg', spaces -> '_'
def format_cell_type(cell_type):
    cell_type = re.sub('\\+', 'pos', cell_type)
    cell_type = re.sub('-', 'neg', cell_type)
    cell_type = re.sub(' ', '_', cell_type)
    return cell_type

# In[8]:

# Generate a random, element-based identifier used to tag output files for later retrieval
def element_id(n = 3):
    import periodictable
    from random import randrange
    rand_el = []
    for i in range(n):
        el = randrange(0, 118)
        rand_el.append(periodictable.elements[el].name)
    rand_str = '-'.join(rand_el)
    return rand_str

# ## Read L3 labels from HISE

# In[9]:

l3_uuids = ['20e97e8f-2cb9-4fa6-bd84-ddcaf6a2c28b',
            'd595ab6f-d1ad-4c7c-a8f6-395642927262',
            '56e0840c-d432-45e3-ac57-4302b0e350a4',
            '8bfe1a92-35c9-433a-9640-4d7cdd7cbad6']

l3_list = []
for l3_uuid in l3_uuids:
    l3_list.append(read_parquet_uuid(l3_uuid))
l3_labels = pd.concat(l3_list)

# In[10]:

l3_labels = l3_labels[['barcodes', 'AIFI_L3', 'AIFI_L3_score']]

# ## Identify files for use in HISE
# 
# These files contain cells that were filtered in a previous notebook to remove most doublets and low-quality cells.

# In[11]:

search_id = 'lawrencium-chromium-vanadium'

# Retrieve files stored in our HISE project store

# In[12]:

ps_df = hisepy.list_files_in_project_store('cohorts')
ps_df = ps_df[['id', 'name']]

# Filter for files from the previous notebook using our search_id

# In[13]:

search_df = ps_df[ps_df['name'].str.contains(search_id)]
search_df = search_df.sort_values('name')

# In[14]:

search_df.shape

# In[15]:

h5ad_uuids = {}
for i in range(search_df.shape[0]):
    name = search_df['name'].tolist()[i]
    if '.h5ad' in name:
        group_name = re.sub('.+diha_', '', name)
        group_name = re.sub('_filtered.+', '', group_name)
        h5ad_uuids[group_name] = search_df['id'].tolist()[i]

# In[16]:

h5ad_uuids

# ## Separate files

# In[19]:

for group_name, uuid in h5ad_uuids.items():
    adata = read_adata_uuid(uuid)

    # Integrate L3 labels
    obs = adata.obs
    obs = obs.reset_index(drop = True)
    obs = obs.merge(l3_labels, on = 'barcodes', how = 'left')
    obs = obs.set_index('barcodes', drop = False)
    adata.obs = obs

    # Write one .h5ad file per L3 type for this group
    l3_types = adata.obs['AIFI_L3'].unique().tolist()
    for l3_type in l3_types:
        out_type = format_cell_type(l3_type)
        type_dir = 'output/l3_types/{ct}'.format(ct = out_type)
        if not os.path.isdir(type_dir):
            os.makedirs(type_dir)
        out_file = '{td}/diha_celltypist_L3_{g}_{ct}.h5ad'.format(td = type_dir, g = group_name, ct = out_type)
        type_adata = adata[adata.obs['AIFI_L3'] == l3_type].copy()
        type_adata.write_h5ad(out_file)

    rm_cache_uuid(uuid)
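# As an optional sanity check (not part of the original workflow), we can list the per-type
# directories just written and count the files in each one. With the layout used above, each
# cell type should have at most one file per input group. This is a minimal sketch that only
# assumes the `output/l3_types/<type>/` structure created in the previous cell.

# In[ ]:

check_counts = {}
for type_dir in sorted(os.listdir('output/l3_types')):
    type_path = 'output/l3_types/{td}'.format(td = type_dir)
    # Count the per-group .h5ad files written for this cell type
    check_counts[type_dir] = len([f for f in os.listdir(type_path) if f.endswith('.h5ad')])
check_counts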
# ## Merge files for the same type

# In[20]:

type_dirs = os.listdir('output/l3_types')

type_h5ads = []
for type_dir in type_dirs:
    type_path = 'output/l3_types/{td}'.format(td = type_dir)
    type_files = os.listdir(type_path)

    adata_list = []
    for type_file in type_files:
        adata = sc.read_h5ad('{tp}/{tf}'.format(tp = type_path, tf = type_file))
        adata_list.append(adata)
    type_adata = sc.concat(adata_list)

    cell_type = type_adata.obs['AIFI_L3'].iloc[0]
    out_type = format_cell_type(cell_type)
    out_file = 'output/diha_celltypist_L3_{ct}.h5ad'.format(ct = out_type)
    type_adata.write_h5ad(out_file)
    type_h5ads.append(out_file)

# ## Upload assembled results to HISE

# In[26]:

study_space_uuid = 'de025812-5e73-4b3c-9c3b-6d0eac412f2a'
title = 'DIHA CellTypist L3 .h5ads {d}'.format(d = date.today())

# In[27]:

search_id = element_id()
search_id

# In[28]:

in_files = []
for group_name, uuid in h5ad_uuids.items():
    in_files.append(uuid)
in_files

# In[29]:

out_files = type_h5ads

# In[30]:

out_files

# In[31]:

hisepy.upload.upload_files(
    files = out_files,
    study_space_id = study_space_uuid,
    title = title,
    input_file_ids = in_files,
    destination = search_id
)

# In[32]:

import session_info
session_info.show()

# In[ ]:
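# The introduction notes that we'll later perform clustering within each group for inspection.
# As a forward-looking illustration only, the cell below sketches how one of the merged per-type
# files written above could be loaded and clustered with standard scanpy steps. It assumes the
# .h5ad files contain raw counts and that the `leidenalg` package is available; the preprocessing
# and clustering parameters are placeholder assumptions, not the settings used downstream.

# In[ ]:

# Load the first merged per-type file produced above (assumes type_h5ads is non-empty)
example = sc.read_h5ad(type_h5ads[0])

# Standard scanpy workflow on raw counts; parameter values are illustrative only
sc.pp.normalize_total(example, target_sum = 1e4)
sc.pp.log1p(example)
sc.pp.highly_variable_genes(example, n_top_genes = 2000)
sc.pp.pca(example, n_comps = 30)
sc.pp.neighbors(example, n_neighbors = 15)
sc.tl.leiden(example, resolution = 1.0)
example.obs['leiden'].value_counts()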