#!/usr/bin/env python # coding: utf-8 # ### Basic settings # In[1]: import numpy as np import pandas as pd import scanpy as sc # In[2]: sc.settings.verbosity = 3 sc.logging.print_header() sc.settings.set_figure_params(dpi=80, facecolor='white') # In[35]: save_dir='./' ## need to change the directory sc.settings.figdir = save_dir # ### Import data # The data could be downloaded from https://drive.google.com/drive/folders/1MZEaM28PHW4FTSKzy-h_OqPnSlRKiYKW?usp=sharing. # In[3]: get_ipython().run_cell_magic('time', '', "adata=sc.read('./Hochgerner_dentate_gyrus_QC.h5ad') ## need to change the directory\n") # In[4]: np.unique(adata.obs['CellTypes']) # In[5]: len(np.unique(adata.obs['CellTypes'])) # In[6]: adata # Note: the data is already filtered after proper quality control. # ### Normalization # In[7]: sc.pp.normalize_total(adata, target_sum=1e4) sc.pp.log1p(adata) # ### HVG identification # In[8]: sc.pp.highly_variable_genes(adata, n_top_genes=3000 ) print(sum(adata.var.highly_variable)) sc.pl.highly_variable_genes(adata) # ### Run PCA # In[9]: use_highly_variable=True from sklearn.preprocessing import StandardScaler expr = adata[:, adata.var['highly_variable']].X if use_highly_variable else adata.X expr=StandardScaler(with_mean=False).fit_transform(expr) expr[expr > 10] = 10 from sklearn.decomposition import TruncatedSVD transformer = TruncatedSVD(n_components=50, random_state=42) adata.obsm['X_pca']= transformer.fit_transform(expr) # In[10]: sc.pl.pca(adata,color='CellTypes') # ### Reorder the CellTypes # In[11]: get_ipython().run_cell_magic('time', '', "sc.tl.dendrogram(adata,groupby='CellTypes',use_rep='X_pca')\n") # In[12]: celltype=adata.obs['CellTypes'].values.copy() new_order=adata.uns["dendrogram_['CellTypes']"]['categories_ordered'] celltype=celltype.reorder_categories(new_order) # In[13]: adata.obs['CellTypes']=celltype # ### Run UMAP # In[14]: get_ipython().run_cell_magic('time', '', 'sc.pp.neighbors(adata,\n n_neighbors=15,random_state=10,knn=True,\n method="umap")\n') # In[15]: get_ipython().run_cell_magic('time', '', 'sc.tl.umap(adata)\n') # In[16]: get_ipython().run_cell_magic('time', '', "sc.pl.umap(adata,\n color=['CellTypes'],\n palette=sc.pl.palettes.default_102,\n size=10,\n frameon=True)\n") # ### Marker gene identification # #### 1. COSG # Import COSG: # In[46]: import cosg as cosg import importlib importlib.reload(cosg) # Print the usgae: # In[47]: help(cosg.cosg) # In[49]: get_ipython().run_cell_magic('time', '', "import time\nt0= time.clock()\ncosg.cosg(adata,\n key_added='cosg',\n mu=1,\n n_genes_user=50,\n groupby='CellTypes')\nruntime_cosg = time.clock() - t0\n") # In[50]: sc.tl.dendrogram(adata,groupby='CellTypes',use_rep='X_pca') # In[51]: sc.pl.rank_genes_groups_dotplot(adata,groupby='CellTypes', cmap='Spectral_r', standard_scale='var', n_genes=3,key='cosg') # #### 2. Logistic regression # In[29]: get_ipython().run_cell_magic('time', '', "t0= time.clock()\nsc.tl.rank_genes_groups(adata,\n groupby='CellTypes',\n method='logreg',\n key_added='logreg',\n n_genes=50)\nruntime_logreg = time.clock() - t0\n") # In[30]: sc.pl.rank_genes_groups_dotplot(adata,groupby='CellTypes', cmap='Spectral_r', standard_scale='var', n_genes=3,key='logreg') # #### 3. Wilcoxon-test # In[31]: get_ipython().run_cell_magic('time', '', "t0= time.clock()\nsc.tl.rank_genes_groups(adata, groupby='CellTypes', \n tie_correct=False,\n method='wilcoxon',\n key_added='wilcoxon',n_genes=50)\nruntime_wilcoxon= time.clock() - t0\n") # In[32]: sc.pl.rank_genes_groups_dotplot(adata,groupby='CellTypes', cmap='Spectral_r', standard_scale='var', n_genes=3,key='wilcoxon') # #### 4. Wilcoxon-test (TIE) # In[33]: get_ipython().run_cell_magic('time', '', "t0= time.clock()\nsc.tl.rank_genes_groups(adata, groupby='CellTypes', \n tie_correct=True,\n method='wilcoxon',\n key_added='wilcoxon_tie',n_genes=50)\nruntime_wilcoxon_tie = time.clock() - t0\n") # In[34]: sc.pl.rank_genes_groups_dotplot(adata,groupby='CellTypes', cmap='Spectral_r', standard_scale='var', n_genes=3,key='wilcoxon_tie') # ### Runtime comparison # In[71]: print( [runtime_logreg, runtime_wilcoxon, runtime_wilcoxon_tie, runtime_cosg] ) # ### Violin plots # In[52]: celltype_selected= 'GC-adult' # In[53]: adata_selected=adata[adata.obs['CellTypes'].isin(['CA3-Pyr', 'GC-adult', 'GC-juv'])] # In[56]: marker_violin=np.hstack([pd.DataFrame(adata.uns['logreg']['names'])[celltype_selected][:3].values, pd.DataFrame(adata.uns['wilcoxon']['names'])[celltype_selected][:3].values, pd.DataFrame(adata.uns['wilcoxon_tie']['names'])[celltype_selected][:3].values, pd.DataFrame(adata.uns['cosg']['names'])[celltype_selected][:3].values]) # In[57]: marker_violin # In[66]: vp=sc.pl.stacked_violin(adata_selected, marker_violin, groupby='CellTypes', swap_axes=True, figsize=(4, 9), scale='width', yticklabels=True, size=0.2, inner='box', cut=True, return_fig=True, ) # In[67]: icolor=[ "#ffe119", # Logistic regression "#e6194b", # Wilcoxon-test "#4363d8", # Wilcoxon-test (TIE) "#3cb44b" # COSG , ] icolor=np.hstack([np.repeat(i,3) for i in icolor]) icolor # In[68]: icolor=list(icolor) # In[69]: vp.style(ylim=(0,6), row_palette=icolor,yticklabels=True,).show() # ### Save the data # In[36]: adata.write(save_dir+'/COSG_tutorial_Hochgerner_dentate_gyrus_QC.h5ad') # ### Contact # Min Dai, daimin@zju.edu.cn