#!/usr/bin/env python
# coding: utf-8

# In[6]:


import numpy as np
import scanpy as sc
import ToppCellPy as tp
import warnings
warnings.filterwarnings("ignore")


# ### 1. load data

# This PBMC dataset is from [Wilk et al.2020](https://www.nature.com/articles/s41591-020-0944-y), which includes two conditions (COVID-19 vs. healthy) and 20 cell types. It can be downloaded from [here](https://cellxgene.cziscience.com/collections/a72afd53-ab92-4511-88da-252fb0e26b9a).

# In[2]:


output_dir = "/Users/jinmr2/Dropbox/Code/data/toppcell_test/" # define output folder
adata = sc.read("/Users/jinmr2/Dropbox/Code/data/COVID-19_data_normalized_Blish.h5ad")


# In[3]:


adata


# In[4]:


adata.obs.head()


# In this case, we select two cell annotations, including COVID-19 Status ("Status") and cell type ("cell_type_fine") as metadata for ToppCell gene modules.

# ### 2. create shred object

# Create a shred object with anndata and shred plan. 
# Usually we use normalized data (e.g. log2(TPM+1)) for following differential expression analysis and visualization. There're two major components in ToppCell analysis, including:
# - shred plan: the user-defined way for differential expression analysis and gene module generation. In this package, users can define a global level-wise comparison, like "cell_type_fine", which means gene module for each cell type; Additionally, a more complicated plan can be defined in a hierarchical way as well, like "Status+cell_type_fine|Status" (avoid blanks in between), which means the gene module for one cell type in a specific status versus all other cell types in this status.
# - bin_group: the way to generate pseudo-bulk bins for single-cell data in the heatmap visualization.
# 
# More details can be found in ToppCell [tutorial](https://toppcell.cchmc.org/biosystems/go/docs/tutorial.md).

# In[8]:


# create shred object
shred = tp.Shred(adata = adata,
                shred_plan = ["Status", "cell_type_fine", "Status+cell_type_fine|Status"],
                bin_group = ["Status", "cell_type_fine"],
                order_bins = None,
                order_modules = None,
                method = "wilcoxon",
                output_dir = output_dir)


# In[9]:


# conduct differential expression based on the shred plan
shred.do_shredplan()


# ### 3. generate heatmap

# Create a heatmap that contains all gene modules from the shred plan.

# In[10]:


# create a heatmap view using top differential expressed genes in each gene module (default = 200)
shred.create_heatmap_matrix()


# In[11]:


# draw heatmap for these gene modules. Both normal view and superbin view are generated
shred.draw_heatmap()


# ### 4. gene module enrichment

# Gene module enrichment using [ToppGene enrichment](https://toppgene.cchmc.org/). Currently five categories of enrichment are included: GeneOntologyBiologicalProcess, GeneOntologyCellularComponent, GeneOntologyMolecularFunction, MousePheno, Pathway. 

# In[12]:


shred.enrich_modules(categories = ["GeneOntologyCellularComponent"])


# Create multiple gene set enrichment from enrichment results of all gene modules. The strategy is inspired by [ToppCluster](https://toppcluster.cchmc.org/).

# In[13]:


shred.toppcluster()


# ### 5. Create Morpheus input

# Create a [Morpheus](https://software.broadinstitute.org/morpheus/) input file in GCT3-format.
# Morpheus is a online interactive platform developped by Broad Institute, which is designed for flexible heatmap exploration.

# In[14]:


shred.createGCT()


# A view of [GCT file in Morpheus](https://github.com/KANG-BIOINFO/ToppCellPy/blob/main/test/Morpheus-GCT3.pdf) after uploading the gct file from output folder.

# ### 6. All code in one run

# All steps above, including differential analysis, heatmap generation, gene module enrichment, ToppCluster view and GCT file generation, can be run in one code after initialization of shred object.

# In[ ]:


# create shred object
shred = tp.Shred(adata = adata,
                shred_plan = ["Status", "cell_type_fine", "Status+cell_type_fine|Status"],
                bin_group = ["Status", "cell_type_fine"],
                order_bins = None,
                order_modules = None,
                method = "wilcoxon",
                output_dir = output_dir)


# In[ ]:


# run all steps above in one row of code after initialization of shred object
shred.toppcell_batchRun()


# ### 7. Output file overview

# In[1]:


import os
files = os.listdir("/Users/jinmr2/Dropbox/Code/data/toppcell_test/output_2021-10-05 00:46:31.793638")
files.sort(); files


# ##### Categories of output files

# - (super) bin matrix and metadata:\
# bin_matrix.txt\
# bin_metadata.txt\
# superbin_matrix.txt\
# superbin_metadata.txt
# - toppgene and toppcluster enrichment output:\
# enrichment.txt\
# enrichment_toppcluster.txt
# - figures including gene module heatmap and toppcluster view:\
# figures
# - all gene modules:\
# genemodules_all.txt
# - heatmap in txt and gct format:\
# genemodules_heatmap.txt\
# heatmap_matrix.txt\
# heatmap_matrix_GCT.gct\
# heatmap_matrix_superbin.txt\
# heatmap_matrix_superbin_GCT.gct

# In[ ]: