#!/usr/bin/env python # coding: utf-8 # In[6]: import numpy as np import scanpy as sc import ToppCellPy as tp import warnings warnings.filterwarnings("ignore") # ### 1. load data # This PBMC dataset is from [Wilk et al.2020](https://www.nature.com/articles/s41591-020-0944-y), which includes two conditions (COVID-19 vs. healthy) and 20 cell types. It can be downloaded from [here](https://cellxgene.cziscience.com/collections/a72afd53-ab92-4511-88da-252fb0e26b9a). # In[2]: output_dir = "/Users/jinmr2/Dropbox/Code/data/toppcell_test/" # define output folder adata = sc.read("/Users/jinmr2/Dropbox/Code/data/COVID-19_data_normalized_Blish.h5ad") # In[3]: adata # In[4]: adata.obs.head() # In this case, we select two cell annotations, including COVID-19 Status ("Status") and cell type ("cell_type_fine") as metadata for ToppCell gene modules. # ### 2. create shred object # Create a shred object with anndata and shred plan. # Usually we use normalized data (e.g. log2(TPM+1)) for following differential expression analysis and visualization. There're two major components in ToppCell analysis, including: # - shred plan: the user-defined way for differential expression analysis and gene module generation. In this package, users can define a global level-wise comparison, like "cell_type_fine", which means gene module for each cell type; Additionally, a more complicated plan can be defined in a hierarchical way as well, like "Status+cell_type_fine|Status" (avoid blanks in between), which means the gene module for one cell type in a specific status versus all other cell types in this status. # - bin_group: the way to generate pseudo-bulk bins for single-cell data in the heatmap visualization. # # More details can be found in ToppCell [tutorial](https://toppcell.cchmc.org/biosystems/go/docs/tutorial.md). # In[8]: # create shred object shred = tp.Shred(adata = adata, shred_plan = ["Status", "cell_type_fine", "Status+cell_type_fine|Status"], bin_group = ["Status", "cell_type_fine"], order_bins = None, order_modules = None, method = "wilcoxon", output_dir = output_dir) # In[9]: # conduct differential expression based on the shred plan shred.do_shredplan() # ### 3. generate heatmap # Create a heatmap that contains all gene modules from the shred plan. # In[10]: # create a heatmap view using top differential expressed genes in each gene module (default = 200) shred.create_heatmap_matrix() # In[11]: # draw heatmap for these gene modules. Both normal view and superbin view are generated shred.draw_heatmap() # ### 4. gene module enrichment # Gene module enrichment using [ToppGene enrichment](https://toppgene.cchmc.org/). Currently five categories of enrichment are included: GeneOntologyBiologicalProcess, GeneOntologyCellularComponent, GeneOntologyMolecularFunction, MousePheno, Pathway. # In[12]: shred.enrich_modules(categories = ["GeneOntologyCellularComponent"]) # Create multiple gene set enrichment from enrichment results of all gene modules. The strategy is inspired by [ToppCluster](https://toppcluster.cchmc.org/). # In[13]: shred.toppcluster() # ### 5. Create Morpheus input # Create a [Morpheus](https://software.broadinstitute.org/morpheus/) input file in GCT3-format. # Morpheus is a online interactive platform developped by Broad Institute, which is designed for flexible heatmap exploration. # In[14]: shred.createGCT() # A view of [GCT file in Morpheus](https://github.com/KANG-BIOINFO/ToppCellPy/blob/main/test/Morpheus-GCT3.pdf) after uploading the gct file from output folder. # ### 6. All code in one run # All steps above, including differential analysis, heatmap generation, gene module enrichment, ToppCluster view and GCT file generation, can be run in one code after initialization of shred object. # In[ ]: # create shred object shred = tp.Shred(adata = adata, shred_plan = ["Status", "cell_type_fine", "Status+cell_type_fine|Status"], bin_group = ["Status", "cell_type_fine"], order_bins = None, order_modules = None, method = "wilcoxon", output_dir = output_dir) # In[ ]: # run all steps above in one row of code after initialization of shred object shred.toppcell_batchRun() # ### 7. Output file overview # In[1]: import os files = os.listdir("/Users/jinmr2/Dropbox/Code/data/toppcell_test/output_2021-10-05 00:46:31.793638") files.sort(); files # ##### Categories of output files # - (super) bin matrix and metadata:\ # bin_matrix.txt\ # bin_metadata.txt\ # superbin_matrix.txt\ # superbin_metadata.txt # - toppgene and toppcluster enrichment output:\ # enrichment.txt\ # enrichment_toppcluster.txt # - figures including gene module heatmap and toppcluster view:\ # figures # - all gene modules:\ # genemodules_all.txt # - heatmap in txt and gct format:\ # genemodules_heatmap.txt\ # heatmap_matrix.txt\ # heatmap_matrix_GCT.gct\ # heatmap_matrix_superbin.txt\ # heatmap_matrix_superbin_GCT.gct # In[ ]: