#!/usr/bin/env python
# coding: utf-8

# # CD8 CM cell subclustering and markers
# 
# In this notebook, we use marker gene detection to select clusters that contain CD8 central memory cells, then subset our dataset and perform a round of iterative clustering.
# 
# The outputs of this analysis are used by our domain experts to assign cell type identities to our reference. 

# ## Load packages

# In[1]:


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import concurrent.futures
from concurrent.futures import ProcessPoolExecutor
import copy
from datetime import date
import hisepy
import os
import pandas as pd 
import re
import scanpy as sc
import scanpy.external as sce


# ## Helper functions
# 
# This helper function will help with selecting the clusters to subset.
# 
# `select_clusters_by_gene_frac()` allows us to compute the fraction of cells in each cluster that express the provided gene (> 0 UMIs). This fraction is provided by `scanpy`'s dotplot function, which calculates these fractions for use in display. We then filter clusters based on the cutoff provided as a parameter to this function.

# In[2]:


def select_clusters_by_gene_frac(adata, gene, cutoff, clusters = 'leiden'):
    """Return the cluster labels in which `gene` is detected in more than `cutoff` of cells.

    The per-cluster fractions are taken from the `dot_size_df` table of the
    object returned by `sc.pl.dotplot(..., return_fig = True)`.

    Parameters
    ----------
    adata
        Annotated data object passed through to `sc.pl.dotplot`.
    gene
        Name of a single gene; used both as `var_names` and as the column
        looked up in `dot_size_df`.
    cutoff
        Threshold on the fraction; the comparison is strictly greater-than.
    clusters
        Name of the grouping passed to dotplot as `groupby`.

    Returns
    -------
    list
        Index labels of `dot_size_df` (the groups) whose value for `gene`
        is above `cutoff`.
    """
    # Build the dotplot object only to borrow its precomputed fraction table
    dot_plot = sc.pl.dotplot(
        adata,
        var_names = gene,
        groupby = clusters,
        return_fig = True
    )
    frac_by_cluster = dot_plot.dot_size_df[gene]
    passes_cutoff = frac_by_cluster > cutoff

    return frac_by_cluster.index[passes_cutoff].tolist()


# ## Read full dataset from HISE

# In[3]:


# Label for this cell class; reused below in clustering keys and output file names
cell_class = 't-cd8-cm'


# In[4]:


# UUID of the input .h5ad and the directory it is expected to be cached in
h5ad_uuid = 'd6ebc576-34ea-4394-a569-e35e16f20253'
h5ad_path = f'/home/jupyter/cache/{h5ad_uuid}'


# In[5]:


# Only request the file from HISE when its cache directory is not present yet
if not os.path.isdir(h5ad_path):
    hise_res = hisepy.reader.cache_files([h5ad_uuid])


# In[6]:


# Use the first entry that os.listdir() reports for the cache directory
h5ad_filename = os.listdir(h5ad_path)[0]
h5ad_file = f'{h5ad_path}/{h5ad_filename}'


# In[7]:


adata = sc.read_h5ad(h5ad_file)


# In[8]:


adata


# ## Plot major T cell class markers
# 
# To get an overview of cluster identity, we'll use a set of marker genes that are expressed in major classes of T cell types:

# In[9]:


# Marker panel used for the overview dotplot, in display order
markers = [
    'CD4',      # CD4 T cells
    'CD8A',     # CD8 T cells
    'FHIT',     # Higher in CD4 Naive
    'IKZF2',    # Helios; Treg
    'LGALS3',   # Double-Negative
    'SLC4A10',  # MAIT
    'TRDC',     # Gamma-Delta
    # Higher in Effectors
    'KLRD1',
    'GZMK',
    'GZMB',
    'CCR7'      # Higher in Naive
]


# In[10]:


# Dotplot of the marker panel per existing cluster, with axes swapped
sc.pl.dotplot(
    adata,
    var_names = markers,
    groupby = 'leiden_resolution_1.5',
    swap_axes = True
)


# ## Select clusters to retain
# 
# To select clusters, we'll use `select_clusters_by_gene_frac()` to select clusters for our desired cell type (CD8A-positive). We also select clusters that express markers we want to exclude (here GZMB, GZMK, and CCR7), and use these to filter our list of clusters.

# In[11]:


# Reference view: all clusters labelled directly on the UMAP
sc.pl.umap(
    adata,
    color = 'leiden_resolution_1.5',
    legend_loc = 'on data'
)


# In[12]:


# Clusters where more than half of the cells have CD8A detected
cd8_pos_cl = select_clusters_by_gene_frac(
    adata,
    gene = 'CD8A',
    cutoff = 0.5,
    clusters = 'leiden_resolution_1.5'
)
sc.pl.umap(
    adata,
    color = 'leiden_resolution_1.5',
    groups = cd8_pos_cl
)


# In[13]:


# Clusters where more than half of the cells have GZMB detected
gzmb_pos_cl = select_clusters_by_gene_frac(
    adata,
    gene = 'GZMB',
    cutoff = 0.5,
    clusters = 'leiden_resolution_1.5'
)
sc.pl.umap(
    adata,
    color = 'leiden_resolution_1.5',
    groups = gzmb_pos_cl
)


# In[14]:


# Clusters where more than 70% of the cells have GZMK detected
gzmk_pos_cl = select_clusters_by_gene_frac(
    adata,
    gene = 'GZMK',
    cutoff = 0.7,
    clusters = 'leiden_resolution_1.5'
)
sc.pl.umap(
    adata,
    color = 'leiden_resolution_1.5',
    groups = gzmk_pos_cl
)


# In[15]:


# Clusters where more than 70% of the cells have CCR7 detected
ccr7_pos_cl = select_clusters_by_gene_frac(
    adata,
    gene = 'CCR7',
    cutoff = 0.7,
    clusters = 'leiden_resolution_1.5'
)
sc.pl.umap(
    adata,
    color = 'leiden_resolution_1.5',
    groups = ccr7_pos_cl
)


# ## Select clusters and subset data
# 
# Here, we use Python's `set` class to keep the clusters we want, and remove off-target hits.

# In[16]:


# Keep CD8A-positive clusters that are not in any of the GZMB, GZMK or CCR7
# selections, as a sorted list
keep_cl = sorted(
    set(cd8_pos_cl).difference(gzmb_pos_cl, gzmk_pos_cl, ccr7_pos_cl)
)
keep_cl


# Now, we can filter the dataset to get the subset we're after.

# In[17]:


# Retain only the cells whose cluster label is in keep_cl
adata_subset = adata[
    adata.obs['leiden_resolution_1.5'].isin(keep_cl)
]


# In[18]:


adata_subset.shape


# ## Normalize and harmonize subset
# 
# As in the original analysis of this dataset, we'll need to normalize, select highly variable genes, and run Harmony to integrate across our cohorts.
# 
# It's important that we redo this step for our subset, as gene variability may differ when computed within our subset of cells rather than across the entire set of PBMCs. This key feature selection step will affect our ability to cluster and identify cell types, so we do this iteratively for the subset we're using now.

# We previously stored raw counts in `adata.raw` - we can now recover these original count data for analysis of the selected cells:

# In[19]:


# Rebuild the subset from the contents of .raw
adata_subset = adata_subset.raw.to_adata()


# In[20]:


adata_subset.shape


# In[21]:


# Store the current (pre-normalization) state in .raw; it is read back later
# when markers are computed
adata_subset.raw = adata_subset


# In[22]:


sc.pp.normalize_total(
    adata_subset,
    target_sum = 1e4
)


# In[23]:


sc.pp.log1p(adata_subset)
sc.pp.highly_variable_genes(adata_subset)
# Restrict to the genes flagged in var['highly_variable']
adata_subset = adata_subset[
    :,
    adata_subset.var_names[adata_subset.var['highly_variable']]
]


# In[24]:


sc.pp.scale(adata_subset)


# In[25]:


sc.tl.pca(
    adata_subset,
    svd_solver = 'arpack'
)


# In[26]:


# Harmony integration keyed on the 'cohort.cohortGuid' column
sce.pp.harmony_integrate(
    adata_subset,
    'cohort.cohortGuid',
    max_iter_harmony = 30
)


# In[27]:


# Neighbor graph computed on the 'X_pca_harmony' representation
sc.pp.neighbors(
    adata_subset,
    use_rep = 'X_pca_harmony',
    n_pcs = 30,
    n_neighbors = 50
)


# In[28]:


sc.tl.umap(
    adata_subset,
    min_dist = 0.05
)


# In[29]:


# Make sure the output directory exists before anything is written to it
out_dir = 'output'
os.makedirs(out_dir, exist_ok = True)


# In[30]:


# Save the subset with its embedding, before clustering
subset_h5ad = f'output/pbmc_ref_{cell_class}_subset_{date.today()}.h5ad'
adata_subset.write_h5ad(subset_h5ad)


# ## Cluster with additional resolution
# 

# In[31]:


# Exported `%%time` cell: runs sc.tl.leiden at resolution 1.5 and stores the
# labels under 'leiden_resolution_1.5_<cell_class>'.
# NOTE(review): get_ipython() is not defined in this file, so this line
# presumably only runs under IPython/Jupyter - confirm before running as a script.
get_ipython().run_cell_magic('time', '', "sc.tl.leiden(\n    adata_subset, \n    resolution = 1.5, \n    key_added = 'leiden_resolution_1.5_{c}'.format(c = cell_class)\n)\n")


# In[32]:


# Save the subset again, now including the cluster labels
clustered_h5ad = f'output/pbmc_ref_{cell_class}_subclustered_{date.today()}.h5ad'
adata_subset.write_h5ad(clustered_h5ad)


# ## Plot reference labels and clustering
# 
# Now that we've clustered, it's helpful to plot reference labels and clusters on our UMAP projection to see how they fall relative to each other.

# In[33]:


# UMAP colored by the 'seurat.l2.5' column
sc.pl.umap(
    adata_subset,
    color = ['seurat.l2.5'],
    ncols = 1,
    size = 2,
    frameon = False,
    show = False
)


# In[34]:


# UMAP colored by the 'celltypist.low' column
sc.pl.umap(
    adata_subset,
    color = ['celltypist.low'],
    ncols = 1,
    size = 2,
    frameon = False,
    show = False
)


# CMV status is also helpful to view, as CMV can drive expansion of some cell types.

# In[35]:


# UMAP colored by the 'subject.cmv' column
sc.pl.umap(
    adata_subset,
    color = ['subject.cmv'],
    ncols = 1,
    size = 2,
    frameon = False,
    show = False
)


# In[36]:


# UMAP colored by the new subclustering
sc.pl.umap(
    adata_subset,
    color = f'leiden_resolution_1.5_{cell_class}',
    ncols = 1,
    size = 2,
    frameon = False,
    show = False
)


# ## Save UMAP coordinates and labels

# In[37]:


# UMAP coordinates stored on the subset by sc.tl.umap
umap_mat = adata_subset.obsm['X_umap']


# In[38]:


# Give the coordinate table the same index as obs. pandas aligns on index
# labels when a Series is assigned to a DataFrame column, so a default
# 0..n-1 index here would not match the obs index and the new columns would
# be filled with NaN.
umap_df = pd.DataFrame(
    umap_mat,
    columns = ['umap_1', 'umap_2'],
    index = adata_subset.obs.index
)


# In[39]:


# obs is the subset's own obs table, so the coordinate columns are added to
# adata_subset.obs as well
obs = adata_subset.obs
obs['umap_1'] = umap_df['umap_1']
obs['umap_2'] = umap_df['umap_2']


# In[40]:


out_csv = 'output/pbmc_ref_{c}_subclustered_umap_meta_{d}.csv'.format(c = cell_class, d = date.today())


# In[41]:


obs.to_csv(out_csv)


# In[42]:


out_parquet = 'output/pbmc_ref_{c}_subclustered_umap_meta_{d}.parquet'.format(c = cell_class, d = date.today())


# In[43]:


# The result of to_parquet() is deliberately not assigned back to obs, so
# obs keeps referring to the metadata table
obs.to_parquet(out_parquet)


# ## Compute markers for Leiden clustering

# In[44]:


# Rebuild from .raw, then normalize and log-transform before testing
adata_subset = adata_subset.raw.to_adata()
sc.pp.normalize_total(adata_subset, target_sum = 1e4)
sc.pp.log1p(adata_subset)

res_csv = f'{out_dir}/pbmc_ref_{cell_class}_res1.5_markers_{date.today()}.csv'

# Wilcoxon ranking of genes across the subclusters
sc.tl.rank_genes_groups(
    adata_subset,
    groupby = f'leiden_resolution_1.5_{cell_class}',
    method = 'wilcoxon'
)

# group = None requests the results for every group in one table
df = sc.get.rank_genes_groups_df(adata_subset, group = None)
df.to_csv(res_csv)
marker_files = res_csv


# ## Upload assembled data to HISE
# 
# Finally, we'll use `hisepy.upload.upload_files()` to send a copy of our output to HISE to use for downstream analysis steps.

# In[45]:


# Destination study space and the title to attach to this upload
study_space_uuid = '64097865-486d-43b3-8f94-74994e0a72e0'
title = f'PBMC Ref. CD8 CM subclustering {date.today()}'


# In[46]:


# The input .h5ad UUID is passed to the upload as input_file_ids
in_files = [h5ad_uuid]


# In[47]:


in_files


# In[48]:


# Files to upload: clustered h5ad, metadata CSV and parquet, marker table
out_files = [
    clustered_h5ad,
    out_csv,
    out_parquet,
    marker_files
]


# In[49]:


out_files


# In[50]:


hisepy.upload.upload_files(
    files = out_files,
    input_file_ids = in_files,
    study_space_id = study_space_uuid,
    title = title
)


# In[51]:


# Show session information for this run
import session_info
session_info.show()


# In[ ]: