import anndata as ad
import episcanpy.api as epi
import scanpy as sc
import numpy as np
import time
import episcanpy
print(episcanpy.__version__)
# define data directory
DATADIR = './data/'
# To load the peak matrix
adata = ad.read(DATADIR+'GSM3034622_BoneMarrow_62016_GSM3034623_BoneMarrow_62216_peakmatrix.h5ad')
adata
# format cell label metadata
adata.obs['cell_label'] =[n.rstrip('\n') for n in adata.obs['cell_label\n']]
del adata.obs['cell_label\n']
# settings for the plots
sc.set_figure_params(scanpy=True, dpi=80, dpi_save=250,
frameon=True, vector_friendly=True,
color_map="YlGnBu", format='pdf', transparent=False,
ipython_format='png2x')
result_file = 'GSM3034622_BoneMarrow_62016_GSM3034623_BoneMarrow_62216_peakmatrix_processed.h5ad'
# preliminary filtering to remove cell barcode containing no open features
epi.pp.filter_cells(adata, min_features=1)
# preliminary filtering to remove features that are not sequenced in any cell
epi.pp.filter_features(adata, min_cells=1)
# plot cell coverage
epi.pp.coverage_cells(adata, binary=True, log=False, bins=50,
threshold=1000, save='coverage_cells.png')
epi.pp.coverage_cells(adata, binary=True, log=10, bins=50,
threshold=1000, save='coverage_cells.png')
# plot feature coverage
epi.pp.coverage_features(adata, binary=True, log=False, bins=50,
threshold=40, save='coverage_cells.png')
epi.pp.coverage_features(adata, binary=True, log=10, bins=50,
threshold=40, save='coverage_cells.png')
epi.pp.density_features(adata, hist=True)
# remove cells containing less than 100 features open
epi.pp.filter_cells(adata, min_features=1000)
# remove features sequenced in less than 40 cells
epi.pp.filter_features(adata, min_cells=40)
epi.pp.coverage_cells(adata, binary=True, log='log10', bins=50, threshold=1000)
epi.pp.coverage_features(adata, binary=True, log='log10', bins=50, threshold=40)
adata
epi.pp.cal_var(adata)
min_score_value = 0.517
nb_feature_selected = 100000
epi.pl.variability_features(adata,log=None,
min_score=min_score_value, nb_features=nb_feature_selected,
save='variability_features_plot_bonemarrow_peakmatrix.png')
epi.pl.variability_features(adata,log='log10',
min_score=min_score_value, nb_features=nb_feature_selected,
save='variability_features_plot_bonemarrow_peakmatrix_log10.png')
# save the current matrix in the raw layer
adata.raw = adata
# create a new AnnData w=containing only the most variable features
adata2 = epi.pp.select_var_feature(adata,
nb_features=nb_feature_selected,
show=False,
copy=True)
adata2
epi.pp.normalize_per_cell(adata2)
epi.pp.log1p(adata2)
# save intermediary file
adata2.write(result_file)
# Principal Component Analysis - Overview
epi.pp.pca(adata2)
epi.pl.pca_overview(adata2)
#Check if the coverage is a covariate to correct. Here is it is not the case anymore (post library size normalisation)
epi.pp.correlation_pc(adata2, 'nb_features')
epi.pp.pca(adata2, n_comps=100)
epi.pl.pca_overview(adata2)
sc.pl.pca(adata2, color=['nb_features', 'batch'], wspace=0.5)
epi.pp.neighbors(adata2, n_neighbors=15, n_pcs=100, method='gauss')
epi.tl.umap(adata2, min_dist=0.5,spread=1, a=1,b=1)
epi.tl.louvain(adata2)
sc.pl.umap(adata2, color=['nb_features', 'batch', 'louvain'], wspace=0.5)
# calculate a second louvain clustering with a lower resolution
epi.tl.louvain(adata2, resolution=0.5, key_added='louvain_lowres')
sc.pl.umap(adata2, color=['nb_features', 'louvain', 'batch'], wspace=0.5)
sc.pl.umap(adata2, color=['louvain', 'louvain_lowres', 'cell_label'], wspace=0.5)
epi.tl.diffmap(adata2)
sc.pl.diffmap(adata2, color=['nb_features', 'louvain', 'batch'], wspace=0.5)
sc.pl.diffmap(adata2, color=['louvain', 'louvain_lowres', 'cell_label'], wspace=0.5)
# Root for the dpt
adata2.uns['iroot'] = np.flatnonzero(adata2.obs['louvain'] == '2')[0]
# calculate the dpt
epi.tl.dpt(adata2)
adata2
sc.pl.umap(adata2, color=['louvain','nb_features', 'dpt_pseudotime', 'cell_label'], wspace=0.5)
sc.pl.diffmap(adata2, color=['louvain','nb_features', 'dpt_pseudotime', 'cell_label'], wspace=0.5)
adata2
# removing erythroblasts:
adata3 = adata2[adata2.obs['cell_label'] != 'Erythroblasts',:].copy()
adata3
epi.pp.lazy(adata3)
epi.tl.louvain(adata3, key_added='louvain_noerythro')
epi.tl.diffmap(adata3)
sc.pl.umap(adata3, color=['louvain','louvain_noerythro', 'cell_label'], wspace=0.5)
sc.pl.diffmap(adata3, color=['louvain','louvain_noerythro', 'cell_label'], wspace=0.5)
adata3.uns['iroot'] = np.flatnonzero(adata3.obs['louvain_noerythro'] == '1')[0]
sc.tl.dpt(adata3)
sc.pl.umap(adata3, color=['louvain_noerythro','nb_features', 'dpt_pseudotime', 'cell_label'], wspace=0.5)
sc.pl.diffmap(adata3, color=['louvain_noerythro','nb_features', 'dpt_pseudotime', 'cell_label'], wspace=0.5)