#!/usr/bin/env python
# coding: utf-8

# ### Basic settings

# In[1]:


import numpy as np
import pandas as pd
import scanpy as sc


# In[2]:


sc.settings.verbosity = 3
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')


# In[35]:


save_dir='./' ## need to change the directory
sc.settings.figdir = save_dir


# ### Import data

# The data could be downloaded from https://drive.google.com/drive/folders/1MZEaM28PHW4FTSKzy-h_OqPnSlRKiYKW?usp=sharing.

# In[3]:


get_ipython().run_cell_magic('time', '', "adata=sc.read('./Hochgerner_dentate_gyrus_QC.h5ad') ## need to change the directory\n")


# In[4]:


np.unique(adata.obs['CellTypes'])


# In[5]:


len(np.unique(adata.obs['CellTypes']))


# In[6]:


adata


# Note: the data is already filtered after proper quality control.

# ### Normalization

# In[7]:


sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)


# ### HVG identification

# In[8]:


sc.pp.highly_variable_genes(adata,
                            n_top_genes=3000
                           )
print(sum(adata.var.highly_variable))
sc.pl.highly_variable_genes(adata)


# ### Run PCA

# In[9]:


use_highly_variable=True
from sklearn.preprocessing import StandardScaler
expr = adata[:, adata.var['highly_variable']].X if use_highly_variable else adata.X
expr=StandardScaler(with_mean=False).fit_transform(expr)
expr[expr > 10] = 10
from sklearn.decomposition import TruncatedSVD
transformer = TruncatedSVD(n_components=50, random_state=42)
adata.obsm['X_pca']= transformer.fit_transform(expr)


# In[10]:


sc.pl.pca(adata,color='CellTypes')


# ### Reorder the CellTypes

# In[11]:


get_ipython().run_cell_magic('time', '', "sc.tl.dendrogram(adata,groupby='CellTypes',use_rep='X_pca')\n")


# In[12]:


celltype=adata.obs['CellTypes'].values.copy()
new_order=adata.uns["dendrogram_['CellTypes']"]['categories_ordered']
celltype=celltype.reorder_categories(new_order)


# In[13]:


adata.obs['CellTypes']=celltype


# ### Run UMAP

# In[14]:


get_ipython().run_cell_magic('time', '', 'sc.pp.neighbors(adata,\n               n_neighbors=15,random_state=10,knn=True,\n                method="umap")\n')


# In[15]:


get_ipython().run_cell_magic('time', '', 'sc.tl.umap(adata)\n')


# In[16]:


get_ipython().run_cell_magic('time', '', "sc.pl.umap(adata,\n           color=['CellTypes'],\n           palette=sc.pl.palettes.default_102,\n           size=10,\n           frameon=True)\n")


# ### Marker gene identification

# #### 1. COSG

# Import COSG:

# In[46]:


import cosg as cosg
import importlib
importlib.reload(cosg)


# Print the usgae:

# In[47]:


help(cosg.cosg)


# In[49]:


get_ipython().run_cell_magic('time', '', "import time\nt0= time.clock()\ncosg.cosg(adata,\n    key_added='cosg',\n        mu=1,\n        n_genes_user=50,\n               groupby='CellTypes')\nruntime_cosg = time.clock() - t0\n")


# In[50]:


sc.tl.dendrogram(adata,groupby='CellTypes',use_rep='X_pca')


# In[51]:


sc.pl.rank_genes_groups_dotplot(adata,groupby='CellTypes',
                                cmap='Spectral_r',
                                 standard_scale='var',
                                       n_genes=3,key='cosg')


# #### 2. Logistic regression

# In[29]:


get_ipython().run_cell_magic('time', '', "t0= time.clock()\nsc.tl.rank_genes_groups(adata,\n                        groupby='CellTypes',\n                        method='logreg',\n                        key_added='logreg',\n                        n_genes=50)\nruntime_logreg = time.clock() - t0\n")


# In[30]:


sc.pl.rank_genes_groups_dotplot(adata,groupby='CellTypes',
                                cmap='Spectral_r',
                                 standard_scale='var',
                                       n_genes=3,key='logreg')


# #### 3. Wilcoxon-test

# In[31]:


get_ipython().run_cell_magic('time', '', "t0= time.clock()\nsc.tl.rank_genes_groups(adata, groupby='CellTypes', \n                        tie_correct=False,\n                        method='wilcoxon',\n                        key_added='wilcoxon',n_genes=50)\nruntime_wilcoxon= time.clock() - t0\n")


# In[32]:


sc.pl.rank_genes_groups_dotplot(adata,groupby='CellTypes',
                                cmap='Spectral_r',
                                 standard_scale='var',
                                       n_genes=3,key='wilcoxon')


# #### 4. Wilcoxon-test (TIE)

# In[33]:


get_ipython().run_cell_magic('time', '', "t0= time.clock()\nsc.tl.rank_genes_groups(adata, groupby='CellTypes', \n                        tie_correct=True,\n                        method='wilcoxon',\n                        key_added='wilcoxon_tie',n_genes=50)\nruntime_wilcoxon_tie = time.clock() - t0\n")


# In[34]:


sc.pl.rank_genes_groups_dotplot(adata,groupby='CellTypes',
                                cmap='Spectral_r',
                                 standard_scale='var',
                                       n_genes=3,key='wilcoxon_tie')


# ### Runtime comparison

# In[71]:


print( [runtime_logreg, runtime_wilcoxon, runtime_wilcoxon_tie, runtime_cosg] )


# ### Violin plots

# In[52]:


celltype_selected= 'GC-adult'


# In[53]:


adata_selected=adata[adata.obs['CellTypes'].isin(['CA3-Pyr',  'GC-adult', 'GC-juv'])]


# In[56]:


marker_violin=np.hstack([pd.DataFrame(adata.uns['logreg']['names'])[celltype_selected][:3].values,
pd.DataFrame(adata.uns['wilcoxon']['names'])[celltype_selected][:3].values,
pd.DataFrame(adata.uns['wilcoxon_tie']['names'])[celltype_selected][:3].values,
pd.DataFrame(adata.uns['cosg']['names'])[celltype_selected][:3].values])


# In[57]:


marker_violin


# In[66]:


vp=sc.pl.stacked_violin(adata_selected, 
                     marker_violin,
                     groupby='CellTypes', 
                     swap_axes=True,
                     figsize=(4, 9),
                     scale='width',
                     yticklabels=True,
                     size=0.2,
                     inner='box',
                     cut=True,              
                     return_fig=True,
                    )


# In[67]:


icolor=[ "#ffe119", # Logistic regression
                  "#e6194b",   # Wilcoxon-test
                 "#4363d8",  # Wilcoxon-test (TIE)
                    "#3cb44b" # COSG ,
       ]
icolor=np.hstack([np.repeat(i,3) for i in icolor])
icolor


# In[68]:


icolor=list(icolor)


# In[69]:


vp.style(ylim=(0,6),
         row_palette=icolor,yticklabels=True,).show()


# ### Save the data

# In[36]:


adata.write(save_dir+'/COSG_tutorial_Hochgerner_dentate_gyrus_QC.h5ad')


# ### Contact

# Min Dai, daimin@zju.edu.cn