#!/usr/bin/env python
# coding: utf-8

# # Trajectory inference on synthetic scRNA-seq data from cellular reprogramming network
# 
# We analyse the synthetic scRNA-Seq data generated by scBoolSeq from the Boolean simulations of the cellular reprogramming network in [2.3 - synthetic scRNA-Seq from cellular reprogramming network](2.3%20-%20synthetic%20scRNA-Seq%20from%20cellular%20reprogramming%20network.ipynb).
# 
# ⚠️ This notebook must be run in a STREAM environment

# In[1]:


import pandas as pd
import stream as st
st.__version__


# In[2]:


st.set_figure_params(dpi=80,style='white',figsize=[5.4,4.8],
                     rc={'image.cmap': 'viridis'})


# #### Read in data

# In[3]:


in_file = 'synthetic_data_reprogramming_counts.tsv'

adata = st.read(
    file_name=in_file,
    workdir='./stream_branching'
)
df = pd.read_csv(
    in_file, sep="\t", index_col=0, header=0
)
df


# > **To load and use 10x Genomics single cell RNA-seq data processed with Cell Ranger:**   
# (*The variable index can be reset by choosing a different column in `gene.tsv`*)
# ```python
# adata=st.read(file_name='./filtered_gene_bc_matrices/matrix.mtx',
#               file_feature='./filtered_gene_bc_matrices/genes.tsv',
#               file_sample='./filtered_gene_bc_matrices/barcodes.tsv',
#               file_format='mtx',workdir='./stream_result')    
# adata.var.index = adata.var[1].values   
# ```
# 
# > **If the Anndata object is already created, to run STREAM, please simply specify work directory:**
# ```python
# st.set_workdir(adata,'./stream_result')
# ```

# In[4]:


adata


# #### Read in metadata

# In[5]:


st.add_metadata(
    adata, 
    file_name='synthetic_data_reprogramming_metadata.tsv'
)
adata.obs.head()


# In[6]:


adata.obs.label.value_counts()


# #### Calculate QC

# In[7]:


st.cal_qc(adata,assay='rna')


# In[8]:


st.plot_qc(adata,jitter=0.3,)


# In[9]:


st.filter_cells(adata,min_n_features= 3)
st.filter_features(adata,min_n_cells = 300)


# Commented out because our simulated data is already lib_size normalised and log2 transformed
# ```python
# ###Normalize gene expression based on library size  
# st.normalize(adata,method='lib_size')
# ###Logarithmize gene expression 
# st.log_transform(adata)
# ```

# #### Feature selection

# Please check if the blue curve fits the points well. If not, please adjust the parameter **'loess_frac'** (usually by lowering it) until the blue curve fits well. 

# In[10]:


st.select_variable_genes(adata,loess_frac=0.7,n_genes=20)
adata.uns["var_genes"]


# **Alternatively, user can also select top  principal components using all genes or variable genes:**
# - use all genes   
# `st.select_top_principal_components(adata,n_pc=15,first_pc=True)`
# - use variable genes   
#     - users need to first run `st.select_variable_genes(adata,loess_frac=0.01, n_genes=2000)`  
#     - `st.select_top_principal_components(adata,feature='var_genes',n_pc=40,first_pc=True)`

# #### Dimension reduction

# In[11]:


st.dimension_reduction(
    adata,
    method='se',
    feature='var_genes',
    n_components=2,
    n_neighbors=15,
    n_jobs=12
)


# > **Alternatively, using top principal components as features:**
# `st.dimension_reduction(adata,method='se',feature='top_pcs',n_neighbors=15, n_components=2)`

# In[12]:


for gene in df.index:
    st.plot_dimension_reduction(adata,color=['label', gene],
                            n_components=2,show_graph=False,show_text=False)


# #### Trajectory inference

# In[13]:


st.seed_elastic_principal_graph(adata,n_clusters=10)


# In[14]:


df.head()


# In[15]:


st.plot_dimension_reduction(adata,color=['label','TF1','n_genes'],n_components=2,show_graph=True,show_text=False)
st.plot_branches(adata,show_text=True)


# **`epg_alpha`, `epg_mu`, `epg_lambda` are the three most influential parameters for learning elastic principal graph.**  
# - `epg_alpha`: penalizes spurious branching events. **The larger, the fewer branches the function will learn**. (by default, `epg_alpha=0.02`)
# - `epg_mu`: penalizes the deviation from harmonic embedding, where harmonicity assumes that each node is the mean of its neighbor nodes. **The larger, the more edges the function will use to fit into points(cells)** (by default, `epg_mu=0.1`) 
# - `epg_lambda`: penalizes the total length of edges. **The larger, the 'shorter' curves the function will use to fit into points(cells)** and the fewer points(cells) the curves will reach. (by default, `epg_lambda=0.02`)
# 
# > **'epg_trimmingradius'** can help get rid of noisy points (by defalut `epg_trimmingradius=Inf`)   
# e.g. `st.elastic_principal_graph(adata,epg_trimmingradius=0.1)`  

# In[16]:


st.elastic_principal_graph(adata,epg_alpha=0.07,epg_mu=0.07,epg_lambda=0.03)


# In[17]:


st.plot_dimension_reduction(adata,color=['label','TF1','n_genes'],n_components=2,show_graph=True,show_text=False)
st.plot_branches(adata,show_text=False)


# #### Adjusting trajectories (optional)

# * Finetune branching event:  
# ```python
# st.optimize_branching(adata,incr_n_nodes=30)
# st.plot_dimension_reduction(adata,show_graph=True,show_text=False)
# st.plot_branches(adata,show_text=False)
# ```
# * Prune trivial branches:  
# ```python
# st.prune_elastic_principal_graph(adata,epg_collapse_mode='EdgesNumber',epg_collapse_par=2)
# st.plot_dimension_reduction(adata,show_graph=True,show_text=False)
# st.plot_branches(adata,show_text=False)
# ```
# 
# * Shift branching node:  
# ```python
# st.shift_branching(adata,epg_shift_mode='NodeDensity',epg_shift_radius=0.1,epg_shift_max=3)
# st.plot_dimension_reduction(adata,show_graph=True,show_text=False)
# st.plot_branches(adata,show_text=False)
# ```

# In[18]:


###Extend leaf branch to reach further cells 
st.extend_elastic_principal_graph(adata, epg_ext_mode='WeigthedCentroid',epg_ext_par=0.8)
st.plot_dimension_reduction(adata,color=['label'],n_components=2,show_graph=True,show_text=True)
st.plot_branches(adata,show_text=True)


# #### Trajectory visualization

# ##### flat tree

# In[19]:


st.plot_flat_tree(adata,color=['label','branch_id_alias','S2_pseudotime'],
                  dist_scale=0.5,show_graph=True,show_text=True)


# In[20]:


st.plot_flat_tree(adata,color=['label','TF1','S2_pseudotime'],
                  dist_scale=0.5,show_graph=True,show_text=True)


# ##### stream plot at single cell level

# In[21]:


root = "S1"


# In[22]:


st.plot_stream_sc(adata,root=root,color=['label','TF1', 'TF2'],
                  dist_scale=0.3,show_graph=True,show_text=True)


# ##### stream plots

# In[24]:


st.plot_stream(adata,root=root,color=['label','TF1', 'TF2'], preference=["S3", "S2"])


# #### Marker genes detection

# `marker_list` defines the list of genes to scan. If not specified, by default it uses all available genes. It might be time-consuming.  
# 
# Here we only include variable genes.

# ##### 1) detect marker genes for each leaf branch

# In[27]:


st.detect_leaf_markers(adata,marker_list=adata.uns['var_genes'],cutoff_zscore=1.0,cutoff_pvalue=0.01,
                       root=root,n_jobs=10)


# In[28]:


adata.uns['leaf_markers_all'].head()


# ##### 2) detect transition genes for each branch

# In[29]:


st.detect_transition_markers(adata,marker_list=adata.uns['var_genes'],cutoff_spearman=0.4,cutoff_logfc=0.25,
                             root=root,n_jobs=4)


# In[30]:


for _key in adata.uns['transition_markers'].keys():
    print(f"Transition {_key}")
    print(adata.uns['transition_markers'][_key].head(), end="\n\n")


# ##### detect cell population-specific markers
# 
# ```python
# st.detect_markers(adata,ident='label',marker_list=adata.uns['var_genes'],cutoff_zscore=1.0,cutoff_pvalue=0.01)
# ```

# In[31]:


st.detect_markers(
    adata,
    ident='label',
    marker_list=adata.uns['var_genes'],
    cutoff_zscore=1.0,
    cutoff_pvalue=0.01
)


# In[32]:


adata.uns["markers_label"].keys()


# In[33]:


for key in adata.uns["markers_label"].keys():
    print(f"Detected population-specific markers for `{key}`")
    print(f"\t{adata.uns['markers_label'][key].index.to_list()}")