#!/usr/bin/env python # coding: utf-8 # Ref : Sonia Nestorowa, Fiona K. Hamey, Blanca Pijuan Sala, Evangelia Diamanti, Mairi Shep- # herd, Elisa Laurenti, Nicola K. Wilson, David G. Kent, and Berthold G ̈ottgens. A single- # cell resolution map of mouse hematopoietic stem and progenitor cell differentiation. Blood, # 128(8):e20–e31, 2016. # # [GEO Accession GSE81682](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE81682) # In[1]: from pathlib import Path as path import warnings warnings.filterwarnings("ignore") # anndata deprecation warnings related to pandas and numba do not concern us. # In[2]: import stream as st st.__version__ # In[3]: st.set_figure_params( dpi=120, style='white', figsize=[5.4,4.8], rc={'image.cmap': 'viridis'} ) # In[4]: get_ipython().system('test -f GSE81682_data_nestorowa2016_raw.tsv || (curl -fOL https://github.com/bnediction/scBoolSeq-supplementary/releases/download/artifacts/GSE81682_data_nestorowa2016_raw.tsv.gz && gunzip GSE81682_data_nestorowa2016_raw.tsv.gz)') # In[5]: _in_dir = path(".").resolve() _publish_dir = path("data_filtered_vargenes/").resolve() if not _publish_dir.exists(): _publish_dir.mkdir() print(f"Reading data from: {_in_dir}") print(f"Saving processed data to: {_publish_dir}") # In[6]: infile = _in_dir / 'GSE81682_data_nestorowa2016_raw.tsv' infile # In[7]: outfile = _publish_dir / "GSE81682_Hematopoiesis.csv" outfile # #### Read in data # In[8]: get_ipython().run_cell_magic('time', '', "adata = st.read(file_name=infile.as_posix(), workdir='./stream_result')\nadata\n") # > **To load and use 10x Genomics single cell RNA-seq data processed with Cell Ranger:** # (*The variable index can be reset by choosing a different column in `gene.tsv`*) # ```python # adata=st.read(file_name='./filtered_gene_bc_matrices/matrix.mtx', # file_feature='./filtered_gene_bc_matrices/genes.tsv', # file_sample='./filtered_gene_bc_matrices/barcodes.tsv', # file_format='mtx',workdir='./stream_result') # adata.var.index = adata.var[1].values # ``` # # > **If the Anndata object is already created, to run STREAM, please simply specify work directory:** # ```python # st.set_workdir(adata,'./stream_result') # ``` # In[9]: #adata.obs_names_make_unique() adata.var_names_make_unique() # In[10]: adata # #### Calculate QC # In[11]: st.cal_qc(adata,assay='rna') # In[12]: st.plot_qc(adata,jitter=0.3,) # In[13]: ### histogram plots and log-scale are also supported st.plot_qc(adata,jitter=0.3,log_scale=[0,1,4,5],hist_plot=[0,1,4,5]) # In[14]: st.filter_cells(adata, min_n_features= 100) st.filter_features(adata, min_n_cells = 5) # In[15]: ###Normalize gene expression based on library size st.normalize(adata,method='lib_size') ###Logarithmize gene expression st.log_transform(adata) ###Remove mitochondrial genes st.remove_mt_genes(adata) # #### Feature selection # Please check if the blue curve fits the points well. If not, please adjust the parameter **'loess_frac'** (usually by lowering it) until the blue curve fits well. # In[16]: get_ipython().run_cell_magic('time', '', 'st.select_variable_genes(adata,loess_frac=0.01,percentile=95)\n') # In[17]: adata # #### Save most variable genes to a new dataframe # In[18]: # extract the genes index var_genes = adata.uns["var_genes"] var_genes # In[19]: list(_publish_dir.glob("*csv")) # In[20]: adata.to_df()[var_genes].to_csv(outfile.as_posix()) # In[21]: list(_publish_dir.glob("*csv")) # In[ ]: