The data consist in ~3000 cells of human PBMCs. Cells were FAC sorted thus providing a ground truth for cell type identification. The data was produces by Buenrostro et al. 2018 The original count matrix (using bulk peaks as features) from Buenrostro et al. is available here: GSE96772
On a unix system, you can uncomment and run the following to download the count matri in its anndata format.
# !mkdir data
# !wget https://www.dropbox.com/s/cwlaaxl70t27tb2/data_tutorial_buenrostro.tar.gz?dl=0 -O data/data_tutorial_buenrostro.tar.gz
# !cd data; tar -xzf data_tutorial_buenrostro.tar.gz
# !mkdir write
If episcanpy isn't installed already. Or if some of the functions aren't working. Install the following version:
# !pip install git+https://github.com/colomemaria/epiScanpy
import scanpy as sc
import anndata as ad
import numpy as np
import pandas as pd
import episcanpy.api as epi
sc.settings.set_figure_params(dpi=80, color_map='gist_earth')
results_file = 'write/buenrostro_pbmc.h5ad' # the file that will store the analysis results
Read in the count matrix into an AnnData object, which holds many slots for annotations and different representations of the data. It also comes with its own HDF5 file format: .h5ad.
adata = ad.read('data/data_tutorial_buenrostro/all_buenrostro_bulk_peaks.h5ad')
adata
# display information stored in adata.obs
adata.obs
# checking the variable names (bulk peaks coordinates)
print(adata.var_names)
adata.obs['facs_label'] = ['MEP' if 'MEP' in line else line.split('.bam')[0].lstrip('singles-').split('BM')[-1].split('-')[1] for line in adata.obs['cell_name'].tolist()]
adata
!head 'data/data_tutorial_buenrostro/metadata.tsv'
# format the cell names to match the metadata file
adata.obs_names = [x.split('/')[-1].split('.dedup.st.bam')[0] for x in adata.obs_names.tolist()]
adata.obs_names
epi.pp.load_metadata(adata,
'data/data_tutorial_buenrostro/metadata.tsv',
separator='\t')
adata
adata.obs['cell_type'] = [x.rstrip('\n') for x in adata.obs['cell_type\n']]
del adata.obs['cell_type\n']
adata
# check the 2 different annotation obtained
pd.crosstab(adata.obs['cell_type'], adata.obs['facs_label'])
Download annotation file (the data are aligned on hg19)
# !wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_19/gencode.v19.annotation.gtf.gz -O data/data_tutorial_buenrostro/gencode.v19.annotation.gtf.gz
# !cd data/data_tutorial_buenrostro; gunzip gencode.v19.annotation.gtf
epi.tl.find_genes(adata,
gtf_file='data/data_tutorial_buenrostro/gencode.v19.annotation.gtf',
key_added='transcript_annotation',
upstream=2000,
feature_type='transcript',
annotation='HAVANA',
raw=False)
adata
print(np.max(adata.X))
# Here, the data is already binary. If not, run the following commands
# epi.pp.binarize(adata)
# print(np.max(adata.X))
Proceed to basic filtering.
adata
# remove any potential empty features or barcodes
epi.pp.filter_cells(adata, min_features=1)
epi.pp.filter_features(adata, min_cells=1)
# filtered out 24210 peaks
adata
adata.obs['log_nb_features'] = [np.log10(x) for x in adata.obs['nb_features']]
adata
epi.pl.violin(adata, ['nb_features'])
epi.pl.violin(adata, ['log_nb_features'])
# set a minimum number of cells to keep
min_features = 1000
epi.pp.coverage_cells(adata, binary=True, log=False, bins=50,
threshold=min_features, save='Buenrostro_bulk_peaks_coverage_cells.png')
epi.pp.coverage_cells(adata, binary=True, log=10, bins=50,
threshold=min_features, save='Buenrostro_bulk_peaks_coverage_cells_log10.png')
# minimum number of cells sharing a feature
min_cells = 5
epi.pp.coverage_features(adata, binary=True, log=False,
threshold=min_cells, save='Buenrostro_bulk_peaks_coverage_peaks.png')
epi.pp.coverage_features(adata, binary=True, log=True,
threshold=min_cells, save='Buenrostro_bulk_peaks_coverage_peaks_log10.png')
min_features = 1000
epi.pp.filter_cells(adata, min_features=min_features)
adata
min_cells = 5
epi.pp.filter_features(adata, min_cells=min_cells)
adata
epi.pp.coverage_cells(adata, binary=True, log='log10', bins=50, threshold=min_features)
epi.pp.coverage_features(adata, binary=True, log='log10', bins=50, threshold=min_cells)
We aim to select a cuttof after the elbow.
epi.pp.cal_var(adata)
min_score_value = 0.515
nb_feature_selected = 120000
epi.pl.variability_features(adata,log=None,
min_score=min_score_value, nb_features=nb_feature_selected,
save='variability_features_plot_bonemarrow_peakmatrix.png')
epi.pl.variability_features(adata,log='log10',
min_score=min_score_value, nb_features=nb_feature_selected,
save='variability_features_plot_bonemarrow_peakmatrix_log10.png')
# save the current matrix in the raw layer
adata.raw = adata
# create a new AnnData containing only the most variable features
adata = epi.pp.select_var_feature(adata,
nb_features=nb_feature_selected,
show=False,
copy=True)
adata
epi.pl.violin(adata, ['nb_features'])
epi.pl.violin(adata, ['log_nb_features'])