#!/usr/bin/env python # coding: utf-8 # # Trajectory inference on synthetic scRNA-seq data from cellular reprogramming network # # We analyse the synthetic scRNA-Seq data generated by scBoolSeq from the Boolean simulations of the cellular reprogramming network in [2.3 - synthetic scRNA-Seq from cellular reprogramming network](2.3%20-%20synthetic%20scRNA-Seq%20from%20cellular%20reprogramming%20network.ipynb). # # ⚠️ This notebook must be run in a STREAM environment # In[1]: import pandas as pd import stream as st st.__version__ # In[2]: st.set_figure_params(dpi=80,style='white',figsize=[5.4,4.8], rc={'image.cmap': 'viridis'}) # #### Read in data # In[3]: in_file = 'synthetic_data_reprogramming_counts.tsv' adata = st.read( file_name=in_file, workdir='./stream_branching' ) df = pd.read_csv( in_file, sep="\t", index_col=0, header=0 ) df # > **To load and use 10x Genomics single cell RNA-seq data processed with Cell Ranger:** # (*The variable index can be reset by choosing a different column in `gene.tsv`*) # ```python # adata=st.read(file_name='./filtered_gene_bc_matrices/matrix.mtx', # file_feature='./filtered_gene_bc_matrices/genes.tsv', # file_sample='./filtered_gene_bc_matrices/barcodes.tsv', # file_format='mtx',workdir='./stream_result') # adata.var.index = adata.var[1].values # ``` # # > **If the Anndata object is already created, to run STREAM, please simply specify work directory:** # ```python # st.set_workdir(adata,'./stream_result') # ``` # In[4]: adata # #### Read in metadata # In[5]: st.add_metadata( adata, file_name='synthetic_data_reprogramming_metadata.tsv' ) adata.obs.head() # In[6]: adata.obs.label.value_counts() # #### Calculate QC # In[7]: st.cal_qc(adata,assay='rna') # In[8]: st.plot_qc(adata,jitter=0.3,) # In[9]: st.filter_cells(adata,min_n_features= 3) st.filter_features(adata,min_n_cells = 300) # Commented out because our simulated data is already lib_size normalised and log2 transformed # ```python # ###Normalize gene expression based on library size # st.normalize(adata,method='lib_size') # ###Logarithmize gene expression # st.log_transform(adata) # ``` # #### Feature selection # Please check if the blue curve fits the points well. If not, please adjust the parameter **'loess_frac'** (usually by lowering it) until the blue curve fits well. # In[10]: st.select_variable_genes(adata,loess_frac=0.7,n_genes=20) adata.uns["var_genes"] # **Alternatively, user can also select top principal components using all genes or variable genes:** # - use all genes # `st.select_top_principal_components(adata,n_pc=15,first_pc=True)` # - use variable genes # - users need to first run `st.select_variable_genes(adata,loess_frac=0.01, n_genes=2000)` # - `st.select_top_principal_components(adata,feature='var_genes',n_pc=40,first_pc=True)` # #### Dimension reduction # In[11]: st.dimension_reduction( adata, method='se', feature='var_genes', n_components=2, n_neighbors=15, n_jobs=12 ) # > **Alternatively, using top principal components as features:** # `st.dimension_reduction(adata,method='se',feature='top_pcs',n_neighbors=15, n_components=2)` # In[12]: for gene in df.index: st.plot_dimension_reduction(adata,color=['label', gene], n_components=2,show_graph=False,show_text=False) # #### Trajectory inference # In[13]: st.seed_elastic_principal_graph(adata,n_clusters=10) # In[14]: df.head() # In[15]: st.plot_dimension_reduction(adata,color=['label','TF1','n_genes'],n_components=2,show_graph=True,show_text=False) st.plot_branches(adata,show_text=True) # **`epg_alpha`, `epg_mu`, `epg_lambda` are the three most influential parameters for learning elastic principal graph.** # - `epg_alpha`: penalizes spurious branching events. **The larger, the fewer branches the function will learn**. (by default, `epg_alpha=0.02`) # - `epg_mu`: penalizes the deviation from harmonic embedding, where harmonicity assumes that each node is the mean of its neighbor nodes. **The larger, the more edges the function will use to fit into points(cells)** (by default, `epg_mu=0.1`) # - `epg_lambda`: penalizes the total length of edges. **The larger, the 'shorter' curves the function will use to fit into points(cells)** and the fewer points(cells) the curves will reach. (by default, `epg_lambda=0.02`) # # > **'epg_trimmingradius'** can help get rid of noisy points (by defalut `epg_trimmingradius=Inf`) # e.g. `st.elastic_principal_graph(adata,epg_trimmingradius=0.1)` # In[16]: st.elastic_principal_graph(adata,epg_alpha=0.07,epg_mu=0.07,epg_lambda=0.03) # In[17]: st.plot_dimension_reduction(adata,color=['label','TF1','n_genes'],n_components=2,show_graph=True,show_text=False) st.plot_branches(adata,show_text=False) # #### Adjusting trajectories (optional) # * Finetune branching event: # ```python # st.optimize_branching(adata,incr_n_nodes=30) # st.plot_dimension_reduction(adata,show_graph=True,show_text=False) # st.plot_branches(adata,show_text=False) # ``` # * Prune trivial branches: # ```python # st.prune_elastic_principal_graph(adata,epg_collapse_mode='EdgesNumber',epg_collapse_par=2) # st.plot_dimension_reduction(adata,show_graph=True,show_text=False) # st.plot_branches(adata,show_text=False) # ``` # # * Shift branching node: # ```python # st.shift_branching(adata,epg_shift_mode='NodeDensity',epg_shift_radius=0.1,epg_shift_max=3) # st.plot_dimension_reduction(adata,show_graph=True,show_text=False) # st.plot_branches(adata,show_text=False) # ``` # In[18]: ###Extend leaf branch to reach further cells st.extend_elastic_principal_graph(adata, epg_ext_mode='WeigthedCentroid',epg_ext_par=0.8) st.plot_dimension_reduction(adata,color=['label'],n_components=2,show_graph=True,show_text=True) st.plot_branches(adata,show_text=True) # #### Trajectory visualization # ##### flat tree # In[19]: st.plot_flat_tree(adata,color=['label','branch_id_alias','S2_pseudotime'], dist_scale=0.5,show_graph=True,show_text=True) # In[20]: st.plot_flat_tree(adata,color=['label','TF1','S2_pseudotime'], dist_scale=0.5,show_graph=True,show_text=True) # ##### stream plot at single cell level # In[21]: root = "S1" # In[22]: st.plot_stream_sc(adata,root=root,color=['label','TF1', 'TF2'], dist_scale=0.3,show_graph=True,show_text=True) # ##### stream plots # In[24]: st.plot_stream(adata,root=root,color=['label','TF1', 'TF2'], preference=["S3", "S2"]) # #### Marker genes detection # `marker_list` defines the list of genes to scan. If not specified, by default it uses all available genes. It might be time-consuming. # # Here we only include variable genes. # ##### 1) detect marker genes for each leaf branch # In[27]: st.detect_leaf_markers(adata,marker_list=adata.uns['var_genes'],cutoff_zscore=1.0,cutoff_pvalue=0.01, root=root,n_jobs=10) # In[28]: adata.uns['leaf_markers_all'].head() # ##### 2) detect transition genes for each branch # In[29]: st.detect_transition_markers(adata,marker_list=adata.uns['var_genes'],cutoff_spearman=0.4,cutoff_logfc=0.25, root=root,n_jobs=4) # In[30]: for _key in adata.uns['transition_markers'].keys(): print(f"Transition {_key}") print(adata.uns['transition_markers'][_key].head(), end="\n\n") # ##### detect cell population-specific markers # # ```python # st.detect_markers(adata,ident='label',marker_list=adata.uns['var_genes'],cutoff_zscore=1.0,cutoff_pvalue=0.01) # ``` # In[31]: st.detect_markers( adata, ident='label', marker_list=adata.uns['var_genes'], cutoff_zscore=1.0, cutoff_pvalue=0.01 ) # In[32]: adata.uns["markers_label"].keys() # In[33]: for key in adata.uns["markers_label"].keys(): print(f"Detected population-specific markers for `{key}`") print(f"\t{adata.uns['markers_label'][key].index.to_list()}")