In [2]:
import anndata as ad
import episcanpy.api as epi
import scanpy as sc
import numpy as np
import time
/Users/anna.danese/anaconda3/lib/python3.7/site-packages/scanpy/api/__init__.py:6: FutureWarning: 

In a future version of Scanpy, `scanpy.api` will be removed.
Simply use `import scanpy as sc` and `import scanpy.external as sce` instead.

  FutureWarning
In [3]:
import episcanpy
print(episcanpy.__version__)
0.1.7+23.g2bd1a9e

Load data

In [4]:
# define data directory
DATADIR = './data/'
In [6]:
# To load the peak matrix
adata = ad.read(DATADIR+'GSM3034622_BoneMarrow_62016_GSM3034623_BoneMarrow_62216_peakmatrix.h5ad')
adata
Out[6]:
AnnData object with n_obs × n_vars = 12906 × 436206 
    obs: 'batch', 'cell', 'cell_label\n', 'cluster', 'id', 'subset_cluster', 'subset_tsne1', 'subset_tsne2', 'tissue', 'tissue.replicate', 'tsne_1', 'tsne_2'
In [7]:
# format cell label metadata
adata.obs['cell_label'] =[n.rstrip('\n') for n in adata.obs['cell_label\n']]
del adata.obs['cell_label\n']
In [8]:
# settings for the plots
sc.set_figure_params(scanpy=True, dpi=80, dpi_save=250,
                     frameon=True, vector_friendly=True,
                     color_map="YlGnBu", format='pdf', transparent=False,
                     ipython_format='png2x')


result_file = 'GSM3034622_BoneMarrow_62016_GSM3034623_BoneMarrow_62216_peakmatrix_processed.h5ad'

Quality controls

In [9]:
# preliminary filtering to remove cell barcode containing no open features
epi.pp.filter_cells(adata, min_features=1)
# preliminary filtering to remove features that are not sequenced in any cell
epi.pp.filter_features(adata, min_cells=1)

plot cell coverage

In [10]:
# plot cell coverage
epi.pp.coverage_cells(adata, binary=True, log=False, bins=50,
               threshold=1000, save='coverage_cells.png')
epi.pp.coverage_cells(adata, binary=True, log=10, bins=50,
               threshold=1000, save='coverage_cells.png')

plot feature coverage

In [11]:
# plot feature coverage
epi.pp.coverage_features(adata, binary=True, log=False, bins=50,
               threshold=40, save='coverage_cells.png')
epi.pp.coverage_features(adata, binary=True, log=10, bins=50,
               threshold=40, save='coverage_cells.png')
In [13]:
epi.pp.density_features(adata, hist=True)

Actually filtering the cells

In [14]:
# remove cells containing less than 100 features open
epi.pp.filter_cells(adata, min_features=1000)
In [15]:
# remove features sequenced in less than 40 cells
epi.pp.filter_features(adata, min_cells=40)

Looking at the QC plots after filtering

In [16]:
epi.pp.coverage_cells(adata, binary=True, log='log10', bins=50, threshold=1000)
epi.pp.coverage_features(adata, binary=True, log='log10', bins=50, threshold=40)

Identifying the most variable features

In [17]:
adata
Out[17]:
AnnData object with n_obs × n_vars = 7563 × 246595 
    obs: 'batch', 'cell', 'cluster', 'id', 'subset_cluster', 'subset_tsne1', 'subset_tsne2', 'tissue', 'tissue.replicate', 'tsne_1', 'tsne_2', 'cell_label', 'nb_features'
    var: 'n_cells', 'commonness'
In [18]:
epi.pp.cal_var(adata)
In [19]:
min_score_value = 0.517
nb_feature_selected = 100000
epi.pl.variability_features(adata,log=None,
                     min_score=min_score_value, nb_features=nb_feature_selected,
                     save='variability_features_plot_bonemarrow_peakmatrix.png')

epi.pl.variability_features(adata,log='log10',
                     min_score=min_score_value, nb_features=nb_feature_selected,
                     save='variability_features_plot_bonemarrow_peakmatrix_log10.png')
In [20]:
# save the current matrix in the raw layer
adata.raw = adata
In [21]:
# create a new AnnData w=containing only the most variable features
adata2 = epi.pp.select_var_feature(adata,
                            nb_features=nb_feature_selected,
                            show=False,
                            copy=True)
In [22]:
adata2
Out[22]:
View of AnnData object with n_obs × n_vars = 7563 × 100172 
    obs: 'batch', 'cell', 'cluster', 'id', 'subset_cluster', 'subset_tsne1', 'subset_tsne2', 'tissue', 'tissue.replicate', 'tsne_1', 'tsne_2', 'cell_label', 'nb_features'
    var: 'n_cells', 'commonness', 'prop_shared_cells', 'variability_score'

Normalisation (correct library size)

In [23]:
epi.pp.normalize_per_cell(adata2)
epi.pp.log1p(adata2)
Trying to set attribute `.obs` of view, making a copy.
In [24]:
# save intermediary file
adata2.write(result_file)
... storing 'cell_label' as categorical
In [26]:
# Principal Component Analysis - Overview
epi.pp.pca(adata2)
epi.pl.pca_overview(adata2)