#!/usr/bin/env python
# coding: utf-8

# In[1]:


import dynamo as dyn
import scanpy as sc
import matplotlib.pyplot as plt 

import squidpy

import pysal

import cv2

import numpy as np 
import pandas as pd 
get_ipython().run_line_magic('matplotlib', 'inline')


# ### Either use previous processed Zebrafish data or process it in some basis again.
# 

# In[2]:


# Change to the result of zebrafish tutorial after fixing saving issue in the previous notebook
# adata = dyn.read_h5ad("zebrafish_tutorial.h5ad")
adata = dyn.sample_data.zebrafish()
dyn.pp.recipe_monocle(adata) # , exprs_frac_max=0.005)
dyn.tl.dynamics(adata, model='stochastic', cores=3) 


# In[3]:


dyn.tl.reduceDimension(adata)
dyn.tl.cell_velocities(adata, method='pearson', other_kernels_dict={'transform': 'sqrt'})
# dyn.vf.VectorField(adata, basis='pca', M=1000, pot_curl_div=True) 


# Lets use use umap basis for visualization on 2D plots.

# In[4]:


dyn.pl.streamline_plot(adata, color=['Cell_type'], basis='umap', show_legend='on data', 
                       quiver_length=6, quiver_size=6,  show_arrowed_spines=False,
                       figsize=(4, 4))


# In[5]:


dyn.pl.streamline_plot(adata, color=['Cell_type'], basis='umap', show_legend='on data', 
                       quiver_length=6, quiver_size=6,  show_arrowed_spines=False,
                       figsize=(4, 4)
                      )
dyn.pl.streamline_plot(adata, color=['Cluster'], basis='umap', show_legend='on data', 
                       quiver_length=6, quiver_size=6,  show_arrowed_spines=False,
                       figsize=(4, 4)
                      )


# In[6]:


adata.var.use_for_transition.sum()


# ## Vector field analysis
# In this part we will go through how to utilize Dynamo to do vector field analysis.
# The big picture is to learn the vector field function at first, and then compute Jacobian, acceleration, and curvature information to rank genes across all cells or in each cell group. Finally we do gene enrichment analysis and regulatory network construction with visualization.
# 

# Calculate cell velocities in PCA space.

# In[7]:


dyn.tl.cell_velocities(adata, basis='pca')


# Here the `dyn.vf.VectorField` function learns a function of high dimensional vector field from sparse single cell samples in the entire space robustly. Learned vector field related information fields are inserted to adata. You can try different methods to learn the vector field by changing the method parameter.

# In[8]:


dyn.vf.VectorField(adata, basis='pca', M=100, method = "SparseVFC")


# Here we use `dyn.vf.curvature` to calculate curvature for each cell with the reconstructed vector field function stored in adata. `dyn.vf.rank_curvature_genes` ranks genes based on their absolute curvature values in different cell groups.

# In[9]:


dyn.vf.curvature(adata, basis='pca')


# In[10]:


dyn.vf.rank_curvature_genes(adata, groups='Cell_type')


# `dyn.vf.acceleration` is used here to compute acceleration for each cell with the learned vector field in adata. Note that we use PCA basis to calculate acceleration, but `dyn.vf.acceleration` will project acceleration_pca back to high dimension version for each gene. You can check the resulted adata having both acceleration and acceleration_pca. We then rank acceleration in the same fashion as what we did to curvature.

# In[11]:


dyn.vf.acceleration(adata, basis='pca')


# In[12]:


dyn.vf.rank_acceleration_genes(adata, groups='Cell_type')


# Use `dyn.pp.top_pca_genes` to calculate "top_pca_genes" for adata, according to PCs in `adata.uns`.

# In[13]:


dyn.pp.top_pca_genes(adata)


# In[14]:


top_pca_genes = adata.var.index[adata.var.top_pca_genes]
print(top_pca_genes)
print(len(top_pca_genes))


# Here we calculate Jacobian for each cell with the reconstructed vector field. If we use some kind of reduced dimension such as PCA space, `dyn.vf.jacobian` will project the low dimension Jacobian results back to high dimension as we can check in the resulted adata object.

# In[15]:


dyn.vf.jacobian(adata, regulators=top_pca_genes, effectors=top_pca_genes)


# Rank genes based on their diagonal Jacobian for each cell group. The "divergence" we are talking about here is basically the diagonal elements of the Jacobian, i.e. the self-activation\inhibition terms. The results are stored in `adata.uns['rank_div_gene_jacobian_pca']`.  
# Run `dyn.vf.jacobian` before using this function.

# In[16]:


divergence_rank = dyn.vf.rank_divergence_genes(adata, groups='Cell_type')


# We can check the result of `dyn.vf.rank_divergence_genes`

# In[17]:


divergence_rank.head(10)


# We can also rank all other elements in the Jacobian. There are 5 parameters we provide in `dyn.vf.rank_jacobian_genes`'s argument list to rank the Jacobian:
# 
# * "full reg": top regulators are ranked for each effector for each cell group
# 
# * "full eff": top effectors are ranked for each regulator for each cell group
# 
# * "reg": top regulators in each cell group
# 
# * "eff": top effectors in each cell group
# 
# * "int": top effector-regulator pairs in each cell group
# 
# Note that the default mode is "full reg".

# In[18]:


full_reg_rank = dyn.vf.rank_jacobian_genes(adata, groups='Cell_type', mode="full reg", abs=True, output_values=True)

rank = full_reg_rank[list(full_reg_rank.keys())[0]]


# In[19]:


full_eff_rank = dyn.vf.rank_jacobian_genes(adata, groups='Cell_type', mode='full eff', abs=True, exclude_diagonal=True, output_values=True)


# In[20]:


eff_rank = dyn.vf.rank_jacobian_genes(adata, groups='Cell_type', mode='eff', abs=True, output_values=True)


# In[21]:


reg_rank = dyn.vf.rank_jacobian_genes(adata, groups='Cell_type', mode='reg', abs=True, exclude_diagonal=True)


# In[22]:


int_rank = dyn.vf.rank_jacobian_genes(adata, groups='Cell_type', mode='int', exclude_diagonal=True, output_values=True)


# Save results to the current workspace.

# In[23]:


import pickle
pickle.dump(adata.uns['rank_acceleration'], open('./zebrafish_vf_rank_acceleration.p', 'wb'))
pickle.dump(adata.uns['rank_curvature'], open('./zebrafish_vf_rank_curvature.p', 'wb'))

pickle.dump(adata.uns['rank_abs_acceleration'], open('./zebrafish_vf_rank_abs_acceleration.p', 'wb'))
pickle.dump(adata.uns['rank_abs_curvature'], open('./zebrafish_vf_rank_abs_curvature.p', 'wb'))

adata.uns['rank_acceleration'].to_csv('zebrafish_vf_rank_acceleration.csv')
adata.uns['rank_curvature'].to_csv('zebrafish_vf_rank_curvature.csv')
adata.uns['rank_abs_acceleration'].to_csv('zebrafish_vf_rank_abs_acceleration.csv')
adata.uns['rank_abs_curvature'].to_csv('zebrafish_vf_rank_abs_curvature.csv')

pickle.dump(divergence_rank, open('./zebrafish_vf_divergence_rank.p', 'wb'))

pickle.dump(full_reg_rank, open('./zebrafish_vf_full_reg_rank.p', 'wb'))
pickle.dump(full_eff_rank, open('./zebrafish_vf_full_eff_rank.p', 'wb'))
pickle.dump(rank, open('./zebrafish_vf_rank.p', 'wb'))
pickle.dump(eff_rank, open('./zebrafish_vf_eff_rank.p', 'wb'))
pickle.dump(reg_rank, open('./zebrafish_vf_reg_rank.p', 'wb'))
pickle.dump(int_rank, open('./zebrafish_vf_int_rank.p', 'wb'))

pickle.dump(divergence_rank, open('./zebrafish_vf_divergence_rank.p', 'wb'))

pickle.dump(full_reg_rank, open('./zebrafish_vf_full_reg_rank.p', 'wb'))
pickle.dump(full_eff_rank, open('./zebrafish_vf_full_eff_rank.p', 'wb'))
pickle.dump(rank, open('./zebrafish_vf_rank.p', 'wb'))
pickle.dump(eff_rank, open('./zebrafish_vf_eff_rank.p', 'wb'))
pickle.dump(reg_rank, open('./zebrafish_vf_reg_rank.p', 'wb'))
pickle.dump(int_rank, open('./zebrafish_vf_int_rank.p', 'wb'))


# In[24]:


int_rank = pickle.load(open('./zebrafish_vf_int_rank.p', 'rb'))
reg_rank = pickle.load(open('./zebrafish_vf_reg_rank.p', 'rb'))


# -----------TODO: REFINE TUTORIAL CODE&EXPLANATION PART BELOW-----------

# In[25]:


# int_rank.head(50)
print(type(rank))
print(type(full_eff_rank))
print(type(int_rank))
print(type(reg_rank))
reg_rank


# Now we can check the top regulators for each marker gene in each cell type. This is the same when the ranking results  is based on full ranking (`full reg` or `full eff`)  
# Here again we can choose a few genes from figure 3 (si 5) of Saunders, et al (2019) to see their expression dynamics over time. 

# In[26]:


fig3_si5 = ['mitfa', 'pax3a', 'tfec', 'dct', 'alx4b', 'tyrp1b', 'gpnmb', 'pmela', 'pnp4a']
# selected_genes = adata.var_names
selected_genes = top_pca_genes[:10]
# selected_genes = fig3_si5
print("selected genes:", selected_genes)


# In[27]:


rank.columns.intersection(selected_genes)


# ### Build Regulatory Network

# In[28]:


edges_list = dyn.vf.build_network_per_cluster(adata,
                                              cluster='Cell_type',
                                              cluster_names=None,
                                              full_reg_rank=full_reg_rank,
                                              full_eff_rank=full_eff_rank,
                                              genes=selected_genes,
                                              n_top_genes=100)

import networkx as nx


# In[29]:


for i, name in enumerate(edges_list):
    network = nx.from_pandas_edgelist(edges_list[name].drop_duplicates().query("weight > 1e-5"), 'regulator', 'target', edge_attr='weight', create_using=nx.DiGraph())
    ax=dyn.pl.arcPlot(adata, cluster="Cell_type", cluster_name="base", edges_list=None, network=network, color="M_s", figsize=(12, 9), save_show_or_return='show')
    plt.savefig('./Cell_type_' + str(i) + '.pdf')


# In[30]:


print(adata.obs.Cell_type)
dyn.pl.circosPlot(adata, cluster="Cell_type", cluster_name="Schwann Cell", edges_list=None, 
                  network=network, color="M_s", )
dyn.pl.circosPlot(adata, cluster="Cell_type", cluster_name="Unknown", edges_list=None, 
                  network=network, color="M_s", )


# In[31]:


dyn.pl.circosPlot(adata, cluster="Cell_type", cluster_name="Schwann Cell", edges_list=None, 
                  network=network, color="M_s", )
dyn.pl.circosPlot(adata, cluster="Cell_type", cluster_name="Unknown", edges_list=None, 
                  network=network, color="M_s", )


# In[32]:


rank_acceleration = adata.uns['rank_acceleration']
np.unique(rank_acceleration.head(25).values)


# In[33]:


edges_list[list(edges_list.keys())[0]]
edge_list = edges_list[list(edges_list.keys())[0]]


# In[34]:


# dyn.pl.jacobian(adata, regulators=selected_genes[:4], effectors=selected_genes[:4], basis='pca_corrected')
dyn.pl.jacobian(adata, regulators=selected_genes[:4], effectors=selected_genes[:4], basis='pca')
dyn.pl.jacobian(adata, regulators=selected_genes[:4], effectors=selected_genes[:4], basis='pca')


# In[35]:


# dyn.pl.jacobian(adata, regulators=selected_genes[:4], effectors=selected_genes[:4], basis='pca_corrected', layer='M_s')
dyn.pl.jacobian(adata, regulators=selected_genes[:4], effectors=selected_genes[:4], basis='pca', layer='M_s')
dyn.pl.jacobian(adata, regulators=selected_genes[:4], effectors=selected_genes[:4], basis='pca', layer='M_s')


# In[36]:


# print(dyn.ext.enrichr.get_library_name())
rank_abs_acceleration = adata.uns['rank_abs_acceleration']
enr = dyn.ext.enrichr(list(np.unique(rank_abs_acceleration.head(10).values)), 
                      organism='zebrafish', outdir='./enrichr', gene_sets='KEGG_2019', no_plot=True)
from gseapy.plot import barplot, dotplot


# In[37]:


dyn.vf.rank_velocity_genes(adata)


# In[38]:


rank_speed = adata.uns['rank_velocity_S']

rank_speed.head(5)


# Select gene sets from https://amp.pharm.mssm.edu/modEnrichr
# 

# In[39]:


fish_gene_sets = [
"Anatomy_AutoRIF",
"Anatomy_AutoRIF_Predicted_Z-score"
"Anatomy_GeneRIF",
"Anatomy_GeneRIF_Predicted_Z-score",
"Coexpression_Predicted_GO_Biological_Process_2018",
"Coexpression_Predicted_GO_Cellular_Component_2018",
"Coexpression_Predicted_GO_Molecular_Function_2018",
"GO_Biological_Process_2018",
"GO_Biological_Process_AutoRIF",
"GO_Biological_Process_AutoRIF_Predicted_Z-score",
"GO_Biological_Process_GeneRIF",
"GO_Biological_Process_GeneRIF_Predicted_Z-score",
"GO_Cellular_Component_2018",
"GO_Cellular_Component_AutoRIF",
"GO_Cellular_Component_AutoRIF_Predicted_Z-score",
"GO_Cellular_Component_GeneRIF",
"GO_Cellular_Component_GeneRIF_Predicted_Z-score",
"GO_Molecular_Function_2018",
"GO_Molecular_Function_AutoRIF",
"GO_Molecular_Function_AutoRIF_Predicted_Z-score",
"GO_Molecular_Function_GeneRIF",
"GO_Molecular_Function_GeneRIF_Predicted_Z-score",
"InterPro_Domains_2019",
"KEGG_2019",
"Pfam_Domains_2019",
"Phenotype_AutoRIF",
"Phenotype_AutoRIF_Predicted_Z-score",
"Phenotype_GeneRIF",
"Phenotype_GeneRIF_Predicted_Z-score",
"WikiPathways_2018",
]
# enr = dyn.ext.enrichr(list(rank_speed['all'].head(150)), organism='Fish', outdir='./enrichr', gene_sets=fish_gene_sets, no_plot=True)
enr = dyn.ext.enrichr(list(top_pca_genes[:150]), organism='Fish', outdir='./enrichr', gene_sets=fish_gene_sets, no_plot=True)
print(enr.results.head(5))
dotplot(enr.res2d, title='top pca gene', cmap='viridis_r')

enr = dyn.ext.enrichr(list(rank_speed['all'].head(150)), organism='Fish', outdir='./enrichr', gene_sets=fish_gene_sets, no_plot=True)
print(enr.results.head(5))
dotplot(enr.res2d, title='top pca gene', cmap='viridis_r')


# In[40]:


dyn.ext.ddhodge(adata, basis='pca')


# In[41]:


dyn.pl.scatters(adata, basis='pca', color='pca_ddhodge_potential')
dyn.pl.scatters(adata, basis='pca', color='pca_ddhodge_potential')


# In[42]:


adata.obs.batch


# In[43]:


dyn.pl.kinetic_heatmap(adata[:, :], genes=selected_genes, tkey='pca_ddhodge_potential',
                       gene_order_method='maximum', mode='pseudotime', color_map='viridis')


# ### Build state graphs of cell types or clusters.

# In[44]:


import numpy as np 
print(adata.obs.index)
dyn.pl.scatters(adata, x=np.repeat('pca_ddhodge_potential', 5), pointsize=0.25, alpha=0.8, 
                y=selected_genes, layer='M_s', color='Cell_type',
                ncols=2, background='white', figsize=(7, 4))


# In[45]:


adata.obs['ClusterStr'] = adata.obs['Cluster'].astype(str)


# In[46]:


get_ipython().run_cell_magic('capture', '', "\ndyn.pd.state_graph(adata, group='Cell_type', basis='pca', method='vf')\ndyn.pd.state_graph(adata, group='ClusterStr', basis='pca', method='vf')\n")


# In[47]:


dyn.pl.state_graph(adata, group='ClusterStr', basis='pca', method='vf')


# In[48]:


dyn.pl.state_graph(adata, group='Cell_type', basis='pca', method='vf')