import numpy as np
import pandas as pd
import scanpy as sc
sc.settings.verbosity = 3
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')
scanpy==1.6.1 anndata==0.7.5 umap==0.4.6 numpy==1.19.5 scipy==1.5.3 pandas==1.1.5 scikit-learn==0.24.0 statsmodels==0.12.1 python-igraph==0.8.3 louvain==0.7.0 leidenalg==0.8.3
save_dir='./' ## need to change the directory
sc.settings.figdir = save_dir
The data could be downloaded from https://drive.google.com/drive/folders/1MZEaM28PHW4FTSKzy-h_OqPnSlRKiYKW?usp=sharing.
%%time
adata=sc.read('./Hochgerner_dentate_gyrus_QC.h5ad') ## need to change the directory
CPU times: user 174 ms, sys: 237 ms, total: 412 ms Wall time: 410 ms
np.unique(adata.obs['CellTypes'])
array(['Astro-adult', 'Astro-juv', 'CA3-Pyr', 'Cajal-Retzius', 'Endothelial', 'Ependymal', 'GABA', 'GC-adult', 'GC-juv', 'Immature-Astro', 'Immature-GABA', 'Immature-GC', 'Immature-Pyr', 'MOL', 'MiCajal-Retziusoglia', 'NFOL', 'Neuroblast', 'OPC', 'PVM', 'RGL', 'RGL_young', 'VLMC', 'nIPC', 'nIPC-perin'], dtype=object)
len(np.unique(adata.obs['CellTypes']))
24
adata
AnnData object with n_obs × n_vars = 23025 × 19444 obs: 'source name', 'organism', 'characteristics: strain', 'characteristics: age', 'characteristics: sex of pooled animals', 'characteristics: cell cluster', 'molecule', 'SRR run accession', 'raw file (original file name)', 'UMI_CellularBarcode', 'CellTypes', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt' var: 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts' uns: 'CellTypes_colors'
Note: the data is already filtered after proper quality control.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
normalizing counts per cell finished (0:00:00)
sc.pp.highly_variable_genes(adata,
n_top_genes=3000
)
print(sum(adata.var.highly_variable))
sc.pl.highly_variable_genes(adata)
If you pass `n_top_genes`, all cutoffs are ignored. extracting highly variable genes finished (0:00:01) --> added 'highly_variable', boolean vector (adata.var) 'means', float vector (adata.var) 'dispersions', float vector (adata.var) 'dispersions_norm', float vector (adata.var) 3000
use_highly_variable=True
from sklearn.preprocessing import StandardScaler
expr = adata[:, adata.var['highly_variable']].X if use_highly_variable else adata.X
expr=StandardScaler(with_mean=False).fit_transform(expr)
expr[expr > 10] = 10
from sklearn.decomposition import TruncatedSVD
transformer = TruncatedSVD(n_components=50, random_state=42)
adata.obsm['X_pca']= transformer.fit_transform(expr)
sc.pl.pca(adata,color='CellTypes')
%%time
sc.tl.dendrogram(adata,groupby='CellTypes',use_rep='X_pca')
Storing dendrogram info using `.uns["dendrogram_['CellTypes']"]` CPU times: user 11.5 ms, sys: 3.96 ms, total: 15.4 ms Wall time: 13.6 ms
celltype=adata.obs['CellTypes'].values.copy()
new_order=adata.uns["dendrogram_['CellTypes']"]['categories_ordered']
celltype=celltype.reorder_categories(new_order)
adata.obs['CellTypes']=celltype
%%time
sc.pp.neighbors(adata,
n_neighbors=15,random_state=10,knn=True,
method="umap")
computing neighbors using 'X_pca' with n_pcs = 50 finished: added to `.uns['neighbors']` `.obsp['distances']`, distances for each pair of neighbors `.obsp['connectivities']`, weighted adjacency matrix (0:00:20) CPU times: user 33.4 s, sys: 16 s, total: 49.4 s Wall time: 20.9 s
%%time
sc.tl.umap(adata)
computing UMAP finished: added 'X_umap', UMAP coordinates (adata.obsm) (0:00:14) CPU times: user 57.5 s, sys: 2min 26s, total: 3min 23s Wall time: 14.7 s
%%time
sc.pl.umap(adata,
color=['CellTypes'],
palette=sc.pl.palettes.default_102,
size=10,
frameon=True)