import anndata as ad
import episcanpy.api as epi
import scanpy as sc
import numpy as np
import time
import episcanpy
print(episcanpy.__version__)
# define data directory
DATADIR = './data/'
# To load the peak matrix
adata = ad.read(DATADIR+'GSM3034622_BoneMarrow_62016_GSM3034623_BoneMarrow_62216_peakmatrix.h5ad')
adata
# format cell label metadata
adata.obs['cell_label'] =[n.rstrip('\n') for n in adata.obs['cell_label\n']]
del adata.obs['cell_label\n']
# settings for the plots
sc.set_figure_params(scanpy=True, dpi=80, dpi_save=250,
frameon=True, vector_friendly=True,
color_map="YlGnBu", format='pdf', transparent=False,
ipython_format='png2x')
result_file = 'GSM3034622_BoneMarrow_62016_GSM3034623_BoneMarrow_62216_peakmatrix_processed.h5ad'
# preliminary filtering to remove cell barcode containing no open features
epi.pp.filter_cells(adata, min_features=1)
# preliminary filtering to remove features that are not sequenced in any cell
epi.pp.filter_features(adata, min_cells=1)
# plot cell coverage
epi.pp.coverage_cells(adata, binary=True, log=False, bins=50,
threshold=1000, save='coverage_cells.png')
epi.pp.coverage_cells(adata, binary=True, log=10, bins=50,
threshold=1000, save='coverage_cells.png')
# plot feature coverage
epi.pp.coverage_features(adata, binary=True, log=False, bins=50,
threshold=40, save='coverage_cells.png')
epi.pp.coverage_features(adata, binary=True, log=10, bins=50,
threshold=40, save='coverage_cells.png')
epi.pp.density_features(adata, hist=True)
# remove cells containing less than 100 features open
epi.pp.filter_cells(adata, min_features=1000)
# remove features sequenced in less than 40 cells
epi.pp.filter_features(adata, min_cells=40)
epi.pp.coverage_cells(adata, binary=True, log='log10', bins=50, threshold=1000)
epi.pp.coverage_features(adata, binary=True, log='log10', bins=50, threshold=40)
adata
epi.pp.cal_var(adata)
min_score_value = 0.517
nb_feature_selected = 100000
epi.pl.variability_features(adata,log=None,
min_score=min_score_value, nb_features=nb_feature_selected,
save='variability_features_plot_bonemarrow_peakmatrix.png')
epi.pl.variability_features(adata,log='log10',
min_score=min_score_value, nb_features=nb_feature_selected,
save='variability_features_plot_bonemarrow_peakmatrix_log10.png')
# save the current matrix in the raw layer
adata.raw = adata
# create a new AnnData w=containing only the most variable features
adata2 = epi.pp.select_var_feature(adata,
nb_features=nb_feature_selected,
show=False,
copy=True)
adata2
epi.pp.normalize_per_cell(adata2)
epi.pp.log1p(adata2)
# save intermediary file
adata2.write(result_file)
# Principal Component Analysis - Overview
epi.pp.pca(adata2)
epi.pl.pca_overview(adata2)