#!/usr/bin/env python
# coding: utf-8

# Ref : Sonia Nestorowa, Fiona K. Hamey, Blanca Pijuan Sala, Evangelia Diamanti, Mairi Shep-
# herd, Elisa Laurenti, Nicola K. Wilson, David G. Kent, and Berthold G ̈ottgens. A single-
# cell resolution map of mouse hematopoietic stem and progenitor cell differentiation. Blood,
# 128(8):e20–e31, 2016.
# 
# [GEO Accession GSE81682](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE81682)

# In[1]:


from pathlib import Path as path
import warnings
warnings.filterwarnings("ignore") # anndata deprecation warnings related to pandas and numba do not concern us.


# In[2]:


import stream as st
st.__version__


# In[3]:


st.set_figure_params(
    dpi=120,
    style='white',
    figsize=[5.4,4.8],
    rc={'image.cmap': 'viridis'}
)


# In[4]:


get_ipython().system('test -f GSE81682_data_nestorowa2016_raw.tsv      || (curl -fOL https://github.com/bnediction/scBoolSeq-supplementary/releases/download/artifacts/GSE81682_data_nestorowa2016_raw.tsv.gz      && gunzip GSE81682_data_nestorowa2016_raw.tsv.gz)')


# In[5]:


_in_dir = path(".").resolve()
_publish_dir = path("data_filtered_vargenes/").resolve()
if not _publish_dir.exists():
    _publish_dir.mkdir()
print(f"Reading data from: {_in_dir}")
print(f"Saving processed data to: {_publish_dir}")


# In[6]:


infile = _in_dir / 'GSE81682_data_nestorowa2016_raw.tsv'
infile


# In[7]:


outfile = _publish_dir / "GSE81682_Hematopoiesis.csv"
outfile


# #### Read in data

# In[8]:


get_ipython().run_cell_magic('time', '', "adata = st.read(file_name=infile.as_posix(), workdir='./stream_result')\nadata\n")


# > **To load and use 10x Genomics single cell RNA-seq data processed with Cell Ranger:**   
# (*The variable index can be reset by choosing a different column in `gene.tsv`*)
# ```python
# adata=st.read(file_name='./filtered_gene_bc_matrices/matrix.mtx',
#               file_feature='./filtered_gene_bc_matrices/genes.tsv',
#               file_sample='./filtered_gene_bc_matrices/barcodes.tsv',
#               file_format='mtx',workdir='./stream_result')    
# adata.var.index = adata.var[1].values   
# ```
# 
# > **If the Anndata object is already created, to run STREAM, please simply specify work directory:**
# ```python
# st.set_workdir(adata,'./stream_result')
# ```

# In[9]:


#adata.obs_names_make_unique()
adata.var_names_make_unique()


# In[10]:


adata


# #### Calculate QC

# In[11]:


st.cal_qc(adata,assay='rna')


# In[12]:


st.plot_qc(adata,jitter=0.3,)


# In[13]:


### histogram plots and log-scale are also supported
st.plot_qc(adata,jitter=0.3,log_scale=[0,1,4,5],hist_plot=[0,1,4,5]) 


# In[14]:


st.filter_cells(adata, min_n_features= 100)
st.filter_features(adata, min_n_cells = 5)


# In[15]:


###Normalize gene expression based on library size  
st.normalize(adata,method='lib_size')
###Logarithmize gene expression 
st.log_transform(adata)
###Remove mitochondrial genes 
st.remove_mt_genes(adata)


# #### Feature selection

# Please check if the blue curve fits the points well. If not, please adjust the parameter **'loess_frac'** (usually by lowering it) until the blue curve fits well. 

# In[16]:


get_ipython().run_cell_magic('time', '', 'st.select_variable_genes(adata,loess_frac=0.01,percentile=95)\n')


# In[17]:


adata


# #### Save most variable genes to a new dataframe

# In[18]:


# extract the genes index
var_genes = adata.uns["var_genes"]
var_genes


# In[19]:


list(_publish_dir.glob("*csv"))


# In[20]:


adata.to_df()[var_genes].to_csv(outfile.as_posix())


# In[21]:


list(_publish_dir.glob("*csv"))


# In[ ]: