#!/usr/bin/env python
# coding: utf-8

# ### Code used for single cell data analysis and generation of anndata object, part 1
# _Qianjiang Hu; 02-22-2022_

# In[1]:

## Load Libraries
import numpy as np
import matplotlib.pyplot as plt
import scanpy as sc
import pandas as pd
import seaborn as sb
import scrublet as scr
from scipy import sparse

import warnings
warnings.filterwarnings("ignore")

## Plotting Parameters
sc.set_figure_params(vector_friendly = True)
plt.rcParams["figure.figsize"] = (6, 5)
plt.rcParams.update({'font.size': 14})
sb.set_style("ticks")

## Set Filepaths
dge_path = "./ipf/data/"
out_folder = "./ipf/output/"
object_folder = "./ipf/object/"

sc.logging.print_version_and_date()

# In[2]:

from rpy2.robjects import pandas2ri
pandas2ri.activate()
get_ipython().run_line_magic('load_ext', 'rpy2.ipython')

# In[3]:

get_ipython().run_cell_magic('R', '', '\nsuppressPackageStartupMessages(library(scran))\nsuppressPackageStartupMessages(library(SoupX))\nsuppressPackageStartupMessages(library(Matrix))\n')

# ### Read in count matrices

# In[4]:

samples = ["LTC22", "LTC34", "LTC52", "LTC46", "LTC55", "LTC50"]
files = ["%s.h5" % s for s in samples]

grouping = pd.read_csv(out_folder + "ipf_samples_info.txt", sep = '\t', header = 0, index_col = 0)
grouping

# In[5]:

def read_dges_10x(files):
    """Read 10x h5 files and attach sample-level metadata from the grouping table."""
    adatas = []
    for file in files:
        sample = file.replace(".h5", "")
        print("Reading DGE for sample %s" % (sample))
        a = sc.read_10x_h5(dge_path + file)
        a.var_names_make_unique()
        ## Drop the 10x var metadata columns (gene ids, feature types) before concatenation
        a.var.drop(a.var.columns, axis = 1, inplace = True)
        ## Add all columns from Grouping Table as Metainfo
        for col in np.setdiff1d(grouping.columns, "cells"):
            a.obs[col] = grouping.loc[sample, col]
        a.obs_names = ["%s_%s" % (sample, cell.replace("-1", "")) for cell in a.obs_names.values]
        adatas.append(a)
    return adatas

# In[6]:

adatas = read_dges_10x(files)
adata = adatas[0].concatenate(adatas[1:], batch_key = "identifier", join = "outer", index_unique = None)
## Rename the numeric batch categories ("0".."5") to the sample ids
adata.obs["identifier"] = adata.obs["identifier"].cat.rename_categories(samples)
## adata.X = np.nan_to_num(adata.X)
adata.X = adata.X.toarray()
adata

# ### Preprocessing and Quality Control

# In[7]:

# Quality control - calculate QC Metrics
mt_genes = [gene.startswith("MT-") for gene in adata.var_names]
adata.obs["n_counts"] = adata.X.sum(1)
adata.obs["n_genes"] = (adata.X > 0).sum(1)
adata.obs["percent_mito"] = adata.X[:, mt_genes].sum(1) / adata.obs["n_counts"]
adata.X = sparse.csr_matrix(adata.X)
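# _A minimal cross-check (sketch, not part of the original analysis):_ scanpy's built-in helper `sc.pp.calculate_qc_metrics` computes equivalent per-cell QC metrics from a boolean `var` column; note it reports mitochondrial content as a percentage (`pct_counts_mt`, 0-100) rather than the fraction stored above.

# In[ ]:

## Reference-only sketch: compute the same QC metrics with scanpy's built-in helper
adata.var["mt"] = adata.var_names.str.startswith("MT-")
sc.pp.calculate_qc_metrics(adata, qc_vars = ["mt"], percent_top = None, log1p = False, inplace = True)
## pct_counts_mt should equal percent_mito * 100
print(adata.obs[["percent_mito", "pct_counts_mt"]].head())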
# ### Quality control - plot QC Metrics

# In[8]:

group = "internal_id"

fig, axs = plt.subplots(nrows = 1, ncols = 2, figsize = (12, 4))
axs = axs.ravel()
sc.pl.violin(adata, "n_counts", groupby = group, size = 1.5, rotation = 90, show = False, ax = axs[0])
sc.pl.violin(adata, "percent_mito", groupby = group, size = 1.5, rotation = 90, show = False, ax = axs[1])
axs[0].axhline(20000, color = "blue")
axs[1].axhline(0.15, color = "red")
plt.show()

sc.pl.scatter(adata, "n_counts", "n_genes", color = group, size = 10)

# In[9]:

fig, axs = plt.subplots(nrows = 2, ncols = 2, figsize = (12, 10))
axs = axs.ravel()
sb.distplot(adata.obs['n_counts'], kde = False, ax = axs[0])
sb.distplot(adata.obs['n_counts'][adata.obs['n_counts'] < 5000], kde = False, bins = 60, ax = axs[1])
sb.distplot(adata.obs['n_genes'], kde = False, ax = axs[2])
sb.distplot(adata.obs['n_genes'][adata.obs['n_genes'] < 1500], kde = False, bins = 60, ax = axs[3])
axs[1].axvline(1000, color = "blue")
axs[3].axvline(400, color = "blue")
plt.show()

# #### Actually do the filtering

# In[10]:

print('Total number of cells: {:d}'.format(adata.n_obs))
sc.pp.filter_cells(adata, min_counts = 1000)
print('Number of cells after min count filter: {:d}'.format(adata.n_obs))
sc.pp.filter_cells(adata, max_counts = 20000)
print('Number of cells after max count filter: {:d}'.format(adata.n_obs))
sc.pp.filter_cells(adata, min_genes = 400)
print('Number of cells after min gene filter: {:d}'.format(adata.n_obs))
sc.pp.filter_genes(adata, min_cells = 3)
print('Number of genes after cell filter: {:d}'.format(adata.n_vars))
adata = adata[adata.obs["percent_mito"] < 0.15]
print('Number of cells after MT filter: {:d}'.format(adata.n_obs))

# ### Summary Statistics after Filtering

# In[11]:

group = "internal_id"
info = pd.DataFrame(data = adata.obs.identifier.cat.categories, columns = ["identifier"])
info[group] = grouping.loc[info.identifier.values].loc[:, group].values
info["n_counts"] = adata.obs.groupby(["identifier"])["n_counts"].median().values
info["n_genes"] = adata.obs.groupby(["identifier"])["n_genes"].median().values
info["percent_mito"] = adata.obs.groupby(["identifier"])["percent_mito"].mean().values
info["n_cells"] = adata.obs.groupby(["identifier"]).size().values
info

# ### Doublet Detection with Scrublet (sample-wise)

# In[12]:

ids = adata.obs.identifier.cat.categories
adata.obs["doublet_scores"] = np.nan
for i, cur_id in enumerate(ids):
    sub = adata[adata.obs.identifier == cur_id].copy()
    ## Scrublet expects a raw (unnormalized) UMI count matrix with cells as rows and genes as columns
    print("%s\tCalculating doublet score for %s" % (i, cur_id))
    scrub = scr.Scrublet(sub.X)
    doublet_scores, predicted_doublets = scrub.scrub_doublets(verbose = False)
    doublet_scores = pd.DataFrame(doublet_scores, index = sub.obs_names, columns = ["doublet_scores"])
    adata.obs["doublet_scores"].update(doublet_scores.doublet_scores)
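# _A minimal sketch of how the stored scores could be used (not part of the original analysis):_ the loop above keeps per-cell doublet scores but removes no cells. One option is to flag putative doublets with a manual cutoff; the 0.2 threshold below is purely illustrative, and Scrublet's own `predicted_doublets` (from its automatic threshold) could be used instead.

# In[ ]:

## Sketch: flag (but do not remove) putative doublets with an assumed cutoff
doublet_cutoff = 0.2  ## illustrative value, not taken from this study
adata.obs["predicted_doublet"] = adata.obs["doublet_scores"] > doublet_cutoff
print("Putative doublets flagged: %d of %d cells" % (adata.obs["predicted_doublet"].sum(), adata.n_obs))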
# ### Ambient gene removal with SoupX and normalization with Scran

# In[13]:

get_ipython().run_cell_magic('R', '', '\nsoup_counts <- function(data, cells, genes, soupx_groups){\n print(dim(data))\n rownames(data) = genes\n colnames(data) = cells\n data <- as(data, "sparseMatrix")\n \n ## Generate SoupChannel Object for SoupX\n sc = SoupChannel(data, data, calcSoupProfile = FALSE)\n soupProf = data.frame(row.names = rownames(data), est = rowSums(data)/sum(data), counts = rowSums(data))\n sc = setSoupProfile(sc, soupProf)\n sc = setClusters(sc, soupx_groups)\n\n ## Set the Contamination Fraction manually\n print(paste("Started SoupX", Sys.time()))\n sc = setContaminationFraction(sc, 0.3)\n out = adjustCounts(sc, roundToInt = TRUE)\n print(paste("Finished SoupX", Sys.time()))\n return(out)\n}\n\nget_size_factors <- function(out, scran_groups){\n ## Input souped count matrix directly to Scran\n print(paste("Started Size Factor Calculation", Sys.time()))\n size_factors = computeSumFactors(out, clusters = scran_groups, min.mean = 0.1)\n print(paste("Finished Size Factor Calculation", Sys.time()))\n return(size_factors)\n}\n')

# In[14]:

## Perform a clustering for SoupX and scran normalization in clusters
adata_pp = adata.copy()
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after = 1e6)
sc.pp.log1p(adata_pp)
sc.pp.pca(adata_pp, n_comps = 15)
sc.pp.neighbors(adata_pp)
sc.tl.louvain(adata_pp, key_added = "soupx_groups", resolution = 2)
sc.tl.louvain(adata_pp, key_added = "scran_groups", resolution = 1)

## Preprocessing variables for SoupX and scran normalization
soupx_groups = adata_pp.obs["soupx_groups"]
scran_groups = adata_pp.obs["scran_groups"]
cells = adata.obs_names
genes = adata.var_names
data = adata.X.T.todense()

# In[15]:

get_ipython().run_line_magic('R', '-i data -i cells -i genes -i soupx_groups out <- soup_counts(data, cells, genes, soupx_groups)')
get_ipython().run_line_magic('R', '-i scran_groups -o size_factors size_factors <- get_size_factors(out, scran_groups)')
get_ipython().run_line_magic('R', '-o data data <- t(as.matrix(out))')
print(data.shape)

# In[16]:

## Add results to adata
adata.obs["size_factors"] = np.nan
size_factors = pd.DataFrame(size_factors, index = adata.obs_names, columns = ["size_factors"])
adata.obs["size_factors"].update(size_factors.size_factors)

## Add souped count layer
adata.layers["unsouped_counts"] = adata.X.copy()
adata.layers["counts"] = sparse.csr_matrix(data)
adata.X = sparse.csr_matrix(data)

fig, axs = plt.subplots(nrows = 1, ncols = 2, figsize = (12, 5))
axs = axs.ravel()
sb.distplot(adata.obs["doublet_scores"], kde = False, ax = axs[0])
sb.distplot(adata.obs["size_factors"], kde = False, ax = axs[1])
plt.show()

# In[17]:

del(adata_pp)
del(data)

## Normalize and Log Transform the ambient corrected counts
adata.X /= adata.obs['size_factors'].values[:, None]
sc.pp.log1p(adata)
adata.X = sparse.csr_matrix(adata.X)

# In[18]:

print("unsouped counts\n", type(adata.layers["unsouped_counts"]), "\n", adata.layers["unsouped_counts"][40000:40004, 110:120].todense())
print("souped counts\n", type(adata.layers["counts"]), "\n", adata.layers["counts"][40000:40004, 110:120].todense())
print("Counts in X\n", type(adata.X), "\n", adata.X[40000:40004, 110:120].todense())

# ### Selection of highly variable genes (considering in how many samples they are variable)

# In[19]:

batch = "internal_id"
sc.pp.highly_variable_genes(adata, min_disp = None, max_disp = None, min_mean = None, max_mean = None,
                            batch_key = batch, n_top_genes = 4000, n_bins = 20,
                            flavor = "cell_ranger", subset = False)
vartab = pd.DataFrame(adata.var["highly_variable_nbatches"], index = adata.var_names)
sb.distplot(vartab, kde = False, bins = len(np.unique(adata.obs[batch])))

# In[20]:

thresh = 3
hvgs = vartab[vartab.highly_variable_nbatches.values >= thresh].index
print("%s genes kept, variable in at least %s samples" % (len(hvgs), thresh))

# ### Remove Cell cycle Genes from list of variable genes

# In[22]:

cc_genes = pd.read_table(out_folder + "Macosko_cell_cycle_genes_human.txt", delimiter = "\t")
s_genes = cc_genes['S'].dropna()
g2m_genes = cc_genes['G2.M'].dropna()
cc_genes = np.concatenate(cc_genes.values)
cc_genes = np.unique([g for g in cc_genes if isinstance(g, str)])

## Calculate cell cycle Score
sc.tl.score_genes_cell_cycle(adata, s_genes = s_genes, g2m_genes = g2m_genes)

## Remove cell cycle genes from list
hvgs = np.setdiff1d(hvgs, cc_genes)
adata.var["highly_variable"] = adata.var_names.isin(hvgs)
sum(adata.var["highly_variable"])

# In[24]:

## Remove mitochondrial and ribosomal genes from list
## (note: the "RP" prefix also matches some non-ribosomal RP* gene names)
rbmt_genes = [gene.startswith("MT-") | gene.startswith("RP") for gene in adata.var_names]
adata.var.loc[rbmt_genes, "highly_variable"] = False
sum(adata.var["highly_variable"])

# ### Save the pre-processed object

# In[25]:

adata.write(object_folder + "GPR87IPF_preprocessed.h5ad")
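# _Optional sanity check (sketch, not part of the original analysis):_ re-load the saved object and confirm that the count layers and the highly variable gene flags survived the round trip.

# In[ ]:

## Sketch: verify the written h5ad file is readable and complete
check = sc.read_h5ad(object_folder + "GPR87IPF_preprocessed.h5ad")
print(check)
print("Layers:", list(check.layers.keys()))
print("Highly variable genes:", int(check.var["highly_variable"].sum()))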